| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- #!/usr/bin/env python
- # Copyright (c) 2017-present, Facebook, Inc.
- # All rights reserved.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- from __future__ import unicode_literals
- from __future__ import division, absolute_import, print_function
- from fastText import train_unsupervised
- import numpy as np
- import os
- from scipy import stats
# fastText composes word vectors from subword n-grams, so out-of-vocabulary
# words still receive a vector and need no special handling here.
def compute_similarity(data_path, model=None):
    """Score word vectors against a word-similarity dataset.

    Every non-blank line of the dataset must contain at least three
    whitespace-separated fields: two words followed by a gold similarity
    score. Any further fields are ignored. Both words are lower-cased, their
    vectors are fetched from the model, and the cosine similarity of each
    pair is compared with the gold scores using Spearman rank correlation.

    Args:
        data_path: Path to the dataset file; it is decoded as UTF-8.
        model: Object exposing ``get_word_vector(word)``. When omitted, the
            module-level ``model`` global is used, which preserves the
            behaviour of the original single-argument call.

    Returns:
        A ``(dataset, correlation, oov)`` tuple: the basename of
        ``data_path``, the Spearman correlation multiplied by 100, and an
        out-of-vocabulary figure that is always 0 because no word is ever
        skipped.

    Raises:
        NameError: If no model is passed and no global ``model`` exists.
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field of a line is not a number.
    """
    if model is None:
        # Fall back to the global that the script entry point binds, so
        # existing callers that pass only the path keep working.
        model = globals().get("model")
        if model is None:
            raise NameError(
                "compute_similarity needs a model: pass one explicitly or "
                "define a module-level 'model'"
            )

    def similarity(v1, v2):
        """Return the cosine similarity of two vectors."""
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    mysim = []
    gold = []
    with open(data_path, 'rb') as fin:
        for raw_line in fin:
            # Decode explicitly so the model receives text rather than bytes
            # on Python 3, independent of the platform's default encoding.
            fields = raw_line.decode('utf-8').split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            word1 = fields[0].lower()
            word2 = fields[1].lower()
            mysim.append(
                similarity(
                    model.get_word_vector(word1),
                    model.get_word_vector(word2),
                )
            )
            gold.append(float(fields[2]))

    rho, _pvalue = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    correlation = rho * 100
    return dataset, correlation, 0
if __name__ == "__main__":
    # Train skip-gram vectors on the 'fil9' corpus located under $DATADIR
    # (the current directory when DATADIR is unset) and save them to disk.
    corpus_path = os.path.join(os.environ.get("DATADIR", ''), 'fil9')
    model = train_unsupervised(input=corpus_path, model='skipgram')
    model.save_model("fil9.bin")

    # compute_similarity picks up the module-level `model` bound above.
    name, score, _oov = compute_similarity('rw.txt')
    report = "{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)"
    print(report.format(name, score, 0))
|