# JasonWang / fastText: mirror of https://github.com/facebookresearch/fastText


			
#!/usr/bin/env python

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division, absolute_import, print_function

from fastText import train_unsupervised
import numpy as np
import os
from scipy import stats


# fastText can build a vector for any word, including unseen ones, from its
# subword n-grams, so out-of-vocabulary (OOV) words need no special handling.
def compute_similarity(data_path, model=None):
    """Evaluate word vectors against a word-similarity dataset.

    Every non-blank line of the file at ``data_path`` is split on whitespace
    and read as ``word1 word2 gold_score``; any further columns are ignored.
    Both words are lower-cased, their vectors are fetched with
    ``model.get_word_vector`` and compared by cosine similarity. The
    predicted similarities are then rank-correlated (Spearman) with the gold
    scores.

    Args:
        data_path: Path of the similarity dataset. The file is read in binary
            mode, so words are handed to the model as byte strings.
        model: Object exposing ``get_word_vector(word)``. When omitted, the
            module-level ``model`` (set by the script entry point) is used,
            which keeps existing one-argument calls working.

    Returns:
        A ``(dataset, correlation, oov)`` tuple: the base name of
        ``data_path``, the Spearman correlation multiplied by 100, and the
        constant ``0`` as the out-of-vocabulary figure.

    Raises:
        NameError: If no ``model`` is passed and none exists at module level.
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field is not a valid float.
    """
    if model is None:
        # Backward-compatible fallback to the global the original relied on.
        model = globals().get("model")
        if model is None:
            raise NameError("name 'model' is not defined")

    def similarity(v1, v2):
        # Cosine similarity: dot product divided by both Euclidean norms.
        n1 = np.linalg.norm(v1)
        n2 = np.linalg.norm(v2)
        return np.dot(v1, v2) / n1 / n2

    mysim = []
    gold = []

    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = line.split()
            if not tline:
                # Blank or whitespace-only line (e.g. a trailing newline):
                # nothing to score, and indexing it would raise IndexError.
                continue
            word1 = tline[0].lower()
            word2 = tline[1].lower()

            v1 = model.get_word_vector(word1)
            v2 = model.get_word_vector(word2)
            mysim.append(similarity(v1, v2))
            gold.append(float(tline[2]))

    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    # corr[0] is the correlation coefficient; report it as a percentage.
    correlation = corr[0] * 100
    return dataset, correlation, 0


if __name__ == "__main__":
    # Train skip-gram vectors on the 'fil9' corpus found under $DATADIR
    # (a relative path when DATADIR is unset), then persist the model.
    # NOTE: `model` must stay a module-level name here, because
    # compute_similarity() looks it up as a global.
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    model.save_model("fil9.bin")

    # Score the trained vectors on 'rw.txt' and report the correlation
    # together with the OOV figure returned by the evaluation (previously a
    # hard-coded 0 was printed and the returned value was ignored).
    dataset, corr, oov = compute_similarity('rw.txt')
    print("{0:20s}: {1:2.0f}  (OOV: {2:2.0f}%)".format(dataset, corr, oov))