| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- #!/usr/bin/env python
- # Copyright (c) 2017-present, Facebook, Inc.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree. An additional grant
- # of patent rights can be found in the PATENTS file in the same directory.
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- from __future__ import unicode_literals
- from __future__ import division, absolute_import, print_function
- from fastText import train_unsupervised
- import numpy as np
- import os
- from scipy import stats
# fastText composes word vectors from subword n-grams, so out-of-vocabulary
# (OOV) words still get a vector and need no special handling here.
def compute_similarity(data_path, ft_model=None):
    """Score word vectors against a word-similarity dataset.

    Each non-blank line of ``data_path`` must hold at least three
    whitespace-separated fields: two words followed by a gold similarity
    score. Both words are lower-cased, their vectors are fetched with
    ``get_word_vector``, and the cosine similarities of the pairs are
    compared to the gold scores with Spearman's rank correlation.

    Args:
        data_path: Path to the dataset file. It is opened in binary mode,
            so the words handed to ``get_word_vector`` are byte strings.
        ft_model: Object exposing ``get_word_vector(word)``. When omitted,
            the module-level ``model`` is used.

    Returns:
        A ``(dataset, correlation, oov)`` tuple: the base name of
        ``data_path``, the Spearman correlation multiplied by 100, and an
        out-of-vocabulary figure that is always 0.
    """
    # Only touch the global when no model was passed in, so the function can
    # be imported and called without the script's __main__ section running.
    vectors = model if ft_model is None else ft_model

    def similarity(v1, v2):
        """Return the cosine similarity of v1 and v2 (0.0 for a zero vector)."""
        n1 = np.linalg.norm(v1)
        n2 = np.linalg.norm(v2)
        if n1 == 0 or n2 == 0:
            # The cosine is undefined here; dividing would give NaN, and a
            # single NaN makes the whole Spearman correlation NaN.
            return 0.0
        return np.dot(v1, v2) / n1 / n2

    mysim = []
    gold = []
    with open(data_path, 'rb') as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no word pair.
                continue
            word1 = fields[0].lower()
            word2 = fields[1].lower()
            v1 = vectors.get_word_vector(word1)
            v2 = vectors.get_word_vector(word2)
            mysim.append(similarity(v1, v2))
            gold.append(float(fields[2]))

    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    # corr[0] is the correlation coefficient; report it scaled by 100.
    correlation = corr[0] * 100
    return dataset, correlation, 0
if __name__ == "__main__":
    # Train skipgram vectors on the 'fil9' corpus, looked up under $DATADIR
    # (or relative to the current directory when DATADIR is unset).
    # `model` is kept as a module-level name on purpose: compute_similarity
    # reads it as a global.
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    model.save_model("fil9.bin")

    # Evaluate on the word-similarity file 'rw.txt' and report the result.
    dataset, corr, oov = compute_similarity('rw.txt')
    # Print the OOV figure that was actually returned, not a literal 0.
    print("{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)".format(dataset, corr, oov))
|