
move tests to OSS / util / update package name

Summary: See title.

Differential Revision: D6477912

fbshipit-source-id: 6f9f0f4d6e1c8a4b20f117f2ad2d12211d09ac5a
Christian Puhrsch 8 years ago
parent
commit
bcd5250fd4

+ 3 - 0
CONTRIBUTING.md

@@ -19,6 +19,9 @@ To create a pull request:
 5. Make sure your code lints.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").
 
+## Tests
+First, make sure you have the required data: have a look at the fetch_test_data.sh script under tests. Next, run the tests using the runtests.py script, passing it the path to the directory containing the datasets (see the sketch below).
+
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need
 to do this once to work on any of Facebook's open source projects.
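
As a sketch of the new test workflow described above, the suite can also be driven directly from Python once fetch_test_data.sh has populated a data directory; the `data` path below is hypothetical:

```
# A minimal sketch, assuming tests/fetch_test_data.sh has already
# downloaded the datasets into a local "data" directory.
import unittest

from fastText.tests import gen_small_tests

suite = unittest.TestLoader().loadTestsFromTestCase(gen_small_tests("data"))
unittest.TextTestRunner(verbosity=2).run(suite)
```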

+ 0 - 2
python/fastText/FastText.py

@@ -280,7 +280,6 @@ def train_supervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train a supervised model and return a model object.
@@ -322,7 +321,6 @@ def train_unsupervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train an unsupervised model and return a model object.
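
With saveOutput removed from both signatures, saving is done explicitly through the model object. A minimal sketch, assuming a labeled training file in fastText format (the path is hypothetical; the parameters mirror the supervised configurations used by the test suite):

```
from fastText import train_supervised

model = train_supervised(
    input="data/dbpedia.train", lr=0.5, dim=10, wordNgrams=2,
    minCount=1, bucket=10000000, epoch=5
)
# Explicit save replaces the old saveOutput flag.
model.save_model("dbpedia.bin")
```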

+ 0 - 7
python/fastText/test/README.md

@@ -1,7 +0,0 @@
-To run this test script you need to provide a path to the fasttext binary built in debug mode and a folder with the datasets downloaded by classification-results.sh and word-vector-example.sh.
-
-Example run:
-
-```
-FASTTEXT_BIN=fasttext_bin FASTTEXT_DATA=data python test_script.py
-```

+ 0 - 607
python/fastText/test/test_script.py

@@ -1,607 +0,0 @@
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree. An additional grant
-# of patent rights can be found in the PATENTS file in the same directory.
-
-from __future__ import absolute_import
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-from fastText import train_supervised
-from fastText import train_unsupervised
-from fastText import load_model
-from fastText import tokenize
-import random
-import sys
-import os
-import subprocess
-import multiprocessing
-import numpy as np
-import unittest
-import tempfile
-import math
-from scipy import stats
-
-
-def compat_splitting(line):
-    return line.decode('utf8').split()
-
-
-def similarity(v1, v2):
-    n1 = np.linalg.norm(v1)
-    n2 = np.linalg.norm(v2)
-    return np.dot(v1, v2) / n1 / n2
-
-
-def read_vectors(model_path):
-    vectors = {}
-    with open(model_path, 'rb') as fin:
-        for _, line in enumerate(fin):
-            try:
-                tab = compat_splitting(line)
-                vec = np.array(tab[1:], dtype=float)
-                word = tab[0]
-                if np.linalg.norm(vec) == 0:
-                    continue
-                if word not in vectors:
-                    vectors[word] = vec
-            except ValueError:
-                continue
-            except UnicodeDecodeError:
-                continue
-    return vectors
-
-
-def compute_similarity(model_path, data_path, vectors=None):
-    if not vectors:
-        vectors = read_vectors(model_path)
-
-    mysim = []
-    gold = []
-    drop = 0.0
-    nwords = 0.0
-
-    with open(data_path, 'rb') as fin:
-        for line in fin:
-            tline = compat_splitting(line)
-            word1 = tline[0].lower()
-            word2 = tline[1].lower()
-            nwords = nwords + 1.0
-
-            if (word1 in vectors) and (word2 in vectors):
-                v1 = vectors[word1]
-                v2 = vectors[word2]
-                d = similarity(v1, v2)
-                mysim.append(d)
-                gold.append(float(tline[2]))
-            else:
-                drop = drop + 1.0
-
-    corr = stats.spearmanr(mysim, gold)
-    dataset = os.path.basename(data_path)
-    correlation = corr[0] * 100
-    oov = math.ceil(drop / nwords * 100.0)
-    return dataset, correlation, oov
-
-
-def get_random_unicode(length):
-    # See: https://stackoverflow.com/questions/1477294/generate-random-utf-8-string-in-python
-
-    try:
-        get_char = unichr
-    except NameError:
-        get_char = chr
-
-    # Update this to include code point ranges to be sampled
-    include_ranges = [
-        (0x0021, 0x0021),
-        (0x0023, 0x0026),
-        (0x0028, 0x007E),
-        (0x00A1, 0x00AC),
-        (0x00AE, 0x00FF),
-        (0x0100, 0x017F),
-        (0x0180, 0x024F),
-        (0x2C60, 0x2C7F),
-        (0x16A0, 0x16F0),
-        (0x0370, 0x0377),
-        (0x037A, 0x037E),
-        (0x0384, 0x038A),
-        (0x038C, 0x038C),
-    ]
-
-    alphabet = [
-        get_char(code_point)
-        for current_range in include_ranges
-        for code_point in range(current_range[0], current_range[1] + 1)
-    ]
-    return ''.join(random.choice(alphabet) for i in range(length))
-
-
-def get_random_words(N, a, b):
-    words = []
-    for _ in range(N):
-        length = random.randint(a, b)
-        words.append(get_random_unicode(length))
-    return words
-
-
-class TestFastTextPy(unittest.TestCase):
-    @classmethod
-    def eprint(cls, *args, **kwargs):
-        print(*args, file=sys.stderr, **kwargs)
-
-    @classmethod
-    def num_thread(cls):
-        return multiprocessing.cpu_count() - 1
-
-    @classmethod
-    def build_paths(cls, train, test, output):
-        train = os.path.join(cls.data_dir, train)
-        test = os.path.join(cls.data_dir, test)
-        output = os.path.join(cls.result_dir, output)
-        return train, test, output
-
-    @classmethod
-    def build_train_args(cls, params, mode, train, output):
-        args = [cls.bin, mode, "-input", train, "-output", output]
-        return args + params.split(' ')
-
-    @classmethod
-    def get_train_output(cls, train_args):
-        cls.eprint("Executing: " + ' '.join(train_args))
-        return subprocess.check_output(train_args).decode('utf-8')
-
-    @classmethod
-    def get_path_size(cls, path):
-        path_size = subprocess.check_output(["stat", "-c", "%s",
-                                             path]).decode('utf-8')
-        path_size = int(path_size)
-        return path_size
-
-    @classmethod
-    def default_test_args(cls, model, test, quantize=False):
-        return [cls.bin, "test", model, test]
-
-    @classmethod
-    def get_test_output(cls, test_args):
-        cls.eprint("Executing: " + ' '.join(test_args))
-        test_output = subprocess.check_output(test_args)
-        test_output = test_output.decode('utf-8')
-        cls.eprint("Test output:\n" + test_output)
-        return list(
-            map(lambda x: x.split('\t')[1], test_output.split('\n')[:-1])
-        )
-
-    @classmethod
-    def train_generic_classifier(cls, train, output):
-        thread = cls.num_thread()
-        cls.eprint("Using {} threads".format(thread))
-        sup_params = (
-            "-dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 "
-            "-epoch 5 -thread {}".format(thread)
-        )
-        mode = 'supervised'
-        cls.get_train_output(
-            cls.build_train_args(sup_params, mode, train, output)
-        )
-
-    @classmethod
-    def train_generic_embeddings(cls, train, output):
-        thread = cls.num_thread()
-        cls.eprint("Using {} threads".format(thread))
-        unsup_params = (
-            "-thread {} -lr 0.025 -dim 100 -ws 5 -epoch 1 -minCount 5 "
-            "-neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 -t 1e-4 "
-            "-lrUpdateRate 100".format(thread)
-        )
-        mode = 'cbow'
-        cls.get_train_output(
-            cls.build_train_args(unsup_params, mode, train, output)
-        )
-
-    def get_predictions_from_list(self, output, words, k):
-        args = [self.bin, "predict-prob", output + '.bin', '-', str(k)]
-        self.eprint("Executing: " + ' '.join(args))
-        p = subprocess.Popen(
-            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
-        )
-        test_text = ""
-        if words:
-            test_text = '\n'.join(words) + '\n'
-        test_text = test_text.encode('utf-8')
-        stdout, stderr = p.communicate(test_text)
-        stdout = stdout.decode('utf-8')
-        return stdout, stderr, p.returncode
-
-    def get_word_vectors_from_list(self, output, words):
-        args = [self.bin, "print-word-vectors", output + '.bin']
-        self.eprint("Executing: " + ' '.join(args))
-        p = subprocess.Popen(
-            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
-        )
-        test_text = '\n'.join(words).encode('utf-8')
-        stdout, stderr = p.communicate(test_text)
-        return stdout
-
-
-class TestFastTextPyUnit(TestFastTextPy):
-    @classmethod
-    def setUpClass(cls):
-        cls.bin = os.environ['FASTTEXT_BIN']
-        cls.data_dir = os.environ['FASTTEXT_DATA']
-        cls.result_dir = tempfile.mkdtemp()
-        train, _, output = cls.build_paths("fil9", "rw/rw.txt", "fil9")
-        cls.train_generic_embeddings(train, output)
-        cls.output = output
-        train, _, output_sup = cls.build_paths(
-            "dbpedia.train", "dbpedia.test", "dbpedia"
-        )
-        cls.train_generic_classifier(train, output_sup)
-        cls.output_sup = output_sup
-
-    @classmethod
-    def tearDownClass(cls):
-        pass
-        # shutil.rmtree(cls.result_dir)
-
-    # Check if get_word_vector aligns with vectors from stdin
-    def test_getvector(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(100, 1, 100)
-        ftbin_vectors = self.get_word_vectors_from_list(self.output, words)
-        ftbin_vectors = ftbin_vectors.decode('utf-8').split('\n')[:-1]
-        for v in ftbin_vectors:
-            word = v.split(' ')[0]
-            vector = v.split(' ')[1:-1]
-            vector = np.array(list(map(float, vector)))
-            pvec = f.get_word_vector(word)
-            # The fasttext cli returns floats with 5 digits,
-            # but we use the full 6 digits.
-            self.assertTrue(np.allclose(vector, pvec, rtol=1e-04))
-
-    def test_predict(self):
-        # TODO: I went a little crazy here as an exercise for
-        # a rigorous test case. This could be turned into
-        # a few utility functions.
-        f = load_model(self.output_sup + '.bin')
-
-        def _test(N, min_length, max_length, k, add_vocab=0):
-            words = get_random_words(N, min_length, max_length)
-            if add_vocab > 0:
-                vocab, _ = f.get_words(include_freq=True)
-                for _ in range(add_vocab):
-                    ind = random.randint(0, len(vocab))
-                    words += [vocab[ind]]
-            all_labels = []
-            all_probs = []
-            ii = 0
-            gotError = False
-            for w in words:
-                try:
-                    labels, probs = f.predict(w, k)
-                except ValueError:
-                    gotError = True
-                    continue
-                all_labels.append(labels)
-                all_probs.append(probs)
-                ii += 1
-            preds, _, retcode = self.get_predictions_from_list(
-                self.output_sup, words, k
-            )
-            if gotError and retcode == 0:
-                self.eprint(
-                    "Didn't get error. Make sure your compiled "
-                    "binary kept the assert statements"
-                )
-                self.assertTrue(False)
-            else:
-                return
-            preds = preds.split('\n')[:-1]
-            self.assertEqual(len(preds), len(all_labels))
-            for i in range(len(preds)):
-                labels = preds[i].split()
-                probs = np.array(list(map(float, labels[1::2])))
-                labels = np.array(labels[::2])
-                self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04))
-                self.assertTrue(np.array_equal(labels, all_labels[i]))
-
-        _test(0, 0, 0, 0)
-        _test(1, 0, 0, 0)
-        _test(10, 0, 0, 0)
-        _test(1, 1, 1, 0)
-        _test(1, 1, 1, 1)
-        _test(1, 2, 3, 0)
-        _test(1, 2, 3, 1)
-        _test(10, 1, 1, 1)
-        _test(1, 1, 1, 0, add_vocab=10)
-        _test(1, 1, 1, 1, add_vocab=10)
-        _test(1, 2, 3, 0, add_vocab=10)
-        _test(1, 2, 3, 1, add_vocab=10)
-        reach = 10
-        for _ in range(10):
-            N = random.randint(0, reach)
-            init = random.randint(0, reach)
-            offset = random.randint(0, reach)
-            k = random.randint(0, reach)
-            _test(N, init, init + offset, k)
-
-    def test_vocab(self):
-        f = load_model(self.output + '.bin')
-        words, freq = f.get_words(include_freq=True)
-        self.eprint(
-            "There is no way to access words from the cli yet. "
-            "Therefore there can be no rigorous test."
-        )
-
-    def test_subwords(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(10, 1, 10)
-        for w in words:
-            f.get_subwords(w)
-        self.eprint(
-            "There is no way to access words from the cli yet. "
-            "Therefore there can be no test."
-        )
-
-    def test_tokenize(self):
-        train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9")
-        with open(train, 'r') as f:
-            _ = tokenize(f.read())
-
-    def test_dimension(self):
-        f = load_model(self.output + '.bin')
-        f.get_dimension()
-
-    def test_subword_vector(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(10000, 1, 200)
-        input_matrix = f.get_input_matrix()
-        for word in words:
-
-            # Universal api to get word vector
-            vec1 = f.get_word_vector(word)
-
-            # Build word vector from subwords
-            subwords, subinds = f.get_subwords(word)
-            subvectors = list(map(lambda x: f.get_input_vector(x), subinds))
-            subvectors = np.stack(subvectors)
-            vec2 = np.sum((subvectors / len(subwords)), 0)
-
-            # Build word vector from subinds
-            vec3 = np.sum(input_matrix[subinds] / len(subinds), 0)
-
-            # Build word vectors from word and subword ids
-            wid = f.get_word_id(word)
-            if wid >= 0:
-                swids = list(map(lambda x: f.get_subword_id(x), subwords[1:]))
-                swids.append(wid)
-            else:
-                swids = list(map(lambda x: f.get_subword_id(x), subwords))
-            swids = np.array(swids)
-            vec4 = np.sum(input_matrix[swids] / len(swids), 0)
-
-            self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all())
-
-    # TODO: Compare with .vec file
-    def test_get_words(self):
-        f = load_model(self.output + '.bin')
-        words1, freq1 = f.get_words(include_freq=True)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(words1), len(words2))
-        self.assertEqual(len(words1), len(freq1))
-        f = load_model(self.output_sup + '.bin')
-        words1, freq1 = f.get_words(include_freq=True)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(words1), len(words2))
-        self.assertEqual(len(words1), len(freq1))
-
-    # TODO: Compare with .vec file for unsup
-    def test_get_labels(self):
-        f = load_model(self.output + '.bin')
-        labels1, freq1 = f.get_labels(include_freq=True)
-        labels2 = f.get_labels(include_freq=False)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(labels1), len(labels2))
-        self.assertEqual(len(labels1), len(freq1))
-        self.assertEqual(len(labels1), len(words2))
-        for w1, w2 in zip(labels2, words2):
-            self.assertEqual(w1, w2)
-        f = load_model(self.output_sup + '.bin')
-        labels1, freq1 = f.get_labels(include_freq=True)
-        labels2 = f.get_labels(include_freq=False)
-        self.assertEqual(len(labels1), len(labels2))
-        self.assertEqual(len(labels1), len(freq1))
-
-    def test_exercise_is_quant(self):
-        f = load_model(self.output + '.bin')
-        gotError = False
-        try:
-            f.quantize()
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-        f = load_model(self.output_sup + '.bin')
-        self.assertTrue(not f.is_quantized())
-        f.quantize()
-        self.assertTrue(f.is_quantized())
-
-    def test_newline_predict_sentence(self):
-        f = load_model(self.output_sup + '.bin')
-        sentence = get_random_words(1, 1000, 2000)[0]
-        f.predict(sentence, k=5)
-        sentence += "\n"
-        gotError = False
-        try:
-            f.predict(sentence, k=5)
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-
-        f = load_model(self.output + '.bin')
-        sentence = get_random_words(1, 1000, 2000)[0]
-        f.get_sentence_vector(sentence)
-        sentence += "\n"
-        gotError = False
-        try:
-            f.get_sentence_vector(sentence)
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-
-
-class TestFastTextPyIntegration(TestFastTextPy):
-    @classmethod
-    def setUpClass(cls):
-        cls.bin = os.environ['FASTTEXT_BIN']
-        cls.data_dir = os.environ['FASTTEXT_DATA']
-        cls.result_dir = tempfile.mkdtemp()
-
-    def test_unsup1(self):
-        train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9")
-
-        model = train_unsupervised(
-            input=train,
-            model="skipgram",
-            lr=0.025,
-            dim=100,
-            ws=5,
-            epoch=1,
-            minCount=5,
-            neg=5,
-            loss="ns",
-            bucket=2000000,
-            minn=3,
-            maxn=6,
-            t=1e-4,
-            lrUpdateRate=100,
-            thread=self.num_thread(),
-        )
-        model.save_model(output)
-
-        path_size = self.get_path_size(output)
-        vectors = {}
-        with open(test, 'r') as test_f:
-            for line in test_f:
-                query0 = line.split()[0].strip()
-                query1 = line.split()[1].strip()
-                vector0 = model.get_word_vector(query0)
-                vector1 = model.get_word_vector(query1)
-                vectors[query0] = vector0
-                vectors[query1] = vector1
-        dataset, correlation, oov = compute_similarity(None, test, vectors)
-        correlation = np.around(correlation)
-
-        self.assertTrue(
-            correlation >= 41, "Correlation: Want: 41 Is: " + str(correlation)
-        )
-        self.assertEqual(oov, 0.0, "Oov: Want: 0 Is: " + str(oov))
-        self.assertEqual(
-            path_size, 978480868, "Size: Want: 978480868 Is: " + str(path_size)
-        )
-
-
-def gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size):
-    def sup_test(self):
-        def check(
-            output_local, test_local, n_local, p1_local, r1_local, size_local,
-            lessthan
-        ):
-            test_args = self.default_test_args(output_local, test_local)
-            test_output = self.get_test_output(test_args)
-            self.assertEqual(
-                str(test_output[0]),
-                str(n_local),
-                "N: Want: " + str(n_local) + " Is: " + str(test_output[0])
-            )
-            self.assertTrue(
-                float(test_output[1]) >= float(p1_local),
-                "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1])
-            )
-            self.assertTrue(
-                float(test_output[2]) >= float(r1_local),
-                "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2])
-            )
-            path_size = self.get_path_size(output_local)
-            if lessthan:
-                self.assertTrue(
-                    path_size <= size_local, "Size: Want at most: " +
-                    str(size_local) + " Is: " + str(path_size)
-                )
-            else:
-                self.assertTrue(
-                    path_size == size_local,
-                    "Size: Want: " + str(size_local) + " Is: " + str(path_size)
-                )
-
-        train, test, output = self.build_paths(
-            dataset + ".train", dataset + ".test", dataset
-        )
-        model = train_supervised(
-            input=train,
-            dim=10,
-            lr=lr,
-            wordNgrams=2,
-            minCount=1,
-            bucket=10000000,
-            epoch=5,
-            thread=self.num_thread()
-        )
-        model.save_model(output)
-        check(output, test, n, p1, r1, size, False)
-        # Exercising
-        model.predict("hello world")
-        model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
-        model.save_model(output + ".ftz")
-        # Exercising
-        model.predict("hello world")
-        check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True)
-
-    return sup_test
-
-
-if __name__ == "__main__":
-    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
-    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
-    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
-    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
-    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
-    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
-    sup_job_size = [
-        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
-        483742593, 493604598
-    ]
-    sup_job_quant_size = [
-        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
-        483742593, 493604598
-    ]
-    sup_job_quant_size = [
-        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575010
-    ]
-    # Yelp_review_full can be a bit flaky
-    sup_job_dataset = [
-        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
-        "yelp_review_full", "yahoo_answers", "amazon_review_full",
-        "amazon_review_polarity"
-    ]
-    sup_job_args = [
-        sup_job_lr, sup_job_dataset, sup_job_n, sup_job_p1, sup_job_r1,
-        sup_job_quant_p1, sup_job_quant_r1, sup_job_size, sup_job_quant_size
-    ]
-    for lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size in zip(
-        *sup_job_args
-    ):
-        setattr(
-            TestFastTextPyIntegration, "test_" + dataset,
-            gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size)
-        )
-    unittest.main()
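
One property the removed test_subword_vector exercised survives in the new API: a word's vector is the average of its subword input vectors. A minimal sketch, assuming a previously trained model file (the "fil9.bin" path is hypothetical):

```
import numpy as np

from fastText import load_model

f = load_model("fil9.bin")
subwords, subinds = f.get_subwords("example")
# Average the input vectors of all subwords (and the word itself, if in vocab).
vec = np.mean([f.get_input_vector(i) for i in subinds], axis=0)
assert np.allclose(vec, f.get_word_vector("example"), atol=1e-5)
```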

+ 15 - 0
python/fastText/tests/__init__.py

@@ -0,0 +1,15 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .test_configurations import get_supervised_models
+from .test_script import gen_tests
+from .test_script import gen_small_tests

+ 104 - 0
python/fastText/tests/test_configurations.py

@@ -0,0 +1,104 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import multiprocessing
+import os
+
+# This file defines a collection of integration tests.
+# Each integration test comes with a full set of parameters,
+# a dataset, and expected metrics.
+# These configurations can be used by the various fastText APIs
+# to confirm a baseline level of correctness.
+
+# Supervised models
+# See https://fasttext.cc/docs/en/supervised-models.html
+
+
+def max_thread():
+    return multiprocessing.cpu_count() - 1
+
+
+def get_supervised_models(data_dir=""):
+    sup_job_dataset = [
+        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
+        "yelp_review_full", "yahoo_answers", "amazon_review_full",
+        "amazon_review_polarity"
+    ]
+
+    sup_params = {
+        "dim": 10,
+        "wordNgrams": 2,
+        "minCount": 1,
+        "bucket": 10000000,
+        "epoch": 5,
+        "thread": max_thread(),
+        "verbose": 1,
+    }
+    quant_params = {
+        "retrain": True,
+        "cutoff": 100000,
+        "qnorm": True,
+        "verbose": 1,
+    }
+    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
+
+    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
+
+    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_size = [
+        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
+        483742593, 493604598
+    ]
+
+    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
+    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
+    sup_job_quant_size = [
+        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575000
+    ]
+
+    configurations = []
+    for i in range(len(sup_job_dataset)):
+        configuration = {}
+        configuration["dataset"] = sup_job_dataset[i]
+        args = sup_params.copy()
+        quant_args = quant_params.copy()
+        args["lr"] = sup_job_lr[i]
+        args["input"] = sup_job_dataset[i] + ".train"
+        quant_args["lr"] = sup_job_lr[i]
+        quant_args["input"] = sup_job_dataset[i] + ".train"
+        if data_dir:
+            args["input"] = os.path.join(data_dir, args["input"])
+            quant_args["input"] = os.path.join(data_dir, quant_args["input"])
+        configuration["train_args"] = args
+        configuration["quant_args"] = quant_args
+        test = {
+            "n": sup_job_n[i],
+            "p1": sup_job_p1[i],
+            "r1": sup_job_r1[i],
+            "size": sup_job_size[i],
+            "data": sup_job_dataset[i] + ".test",
+        }
+        quant_test = {
+            "n": sup_job_n[i],
+            "p1": sup_job_quant_p1[i],
+            "r1": sup_job_quant_r1[i],
+            "size": sup_job_quant_size[i],
+            "data": sup_job_dataset[i] + ".test",
+        }
+        if data_dir:
+            test["data"] = os.path.join(data_dir, test["data"])
+            quant_test["data"] = os.path.join(data_dir, quant_test["data"])
+        configuration["test"] = test
+        configuration["quant_test"] = quant_test
+        configurations.append(configuration)
+    return configurations
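
A brief sketch of how these configurations can be consumed; the `data` directory is assumed to have been populated by fetch_test_data.sh:

```
from fastText.tests.test_configurations import get_supervised_models

for cfg in get_supervised_models(data_dir="data"):
    # Each entry bundles train/quantization args with expected metrics.
    print(cfg["dataset"], cfg["train_args"]["lr"], cfg["test"]["p1"])
```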

+ 124 - 0
python/fastText/tests/test_script.py

@@ -0,0 +1,124 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from fastText import train_supervised
+from fastText import util
+import os
+import subprocess
+import unittest
+import tempfile
+try:
+    import unicode
+except ImportError:
+    pass
+from fastText.tests.test_configurations import get_supervised_models
+
+
+def read_labels(data_file):
+    labels = []
+    lines = []
+    with open(data_file, 'r') as f:
+        for line in f:
+            labels_line = []
+            words_line = []
+            try:
+                line = unicode(line, "UTF-8").split()
+            except NameError:
+                line = line.split()
+            for word in line:
+                if word.startswith("__label__"):
+                    labels_line.append(word)
+                else:
+                    words_line.append(word)
+            labels.append(labels_line)
+            lines.append(" ".join(words_line))
+    return lines, labels
+
+
+# Generate a supervised test case
+# The returned function will be set as an attribute to a test class
+def gen_sup_test(configuration):
+    def sup_test(self):
+        def get_path_size(path):
+            path_size = subprocess.check_output(["stat", "-c", "%s",
+                                                 path]).decode('utf-8')
+            path_size = int(path_size)
+            return path_size
+
+        def check(model, model_filename, test, lessthan, msg_prefix=""):
+            lines, labels = read_labels(test["data"])
+            predictions = []
+            for line in lines:
+                pred_label, _ = model.predict(line)
+                predictions.append(pred_label)
+            p1_local_out, r1_local_out = util.test(predictions, labels)
+            self.assertEqual(
+                len(predictions), test["n"], msg_prefix + "N: Want: " +
+                str(test["n"]) + " Is: " + str(len(predictions))
+            )
+            self.assertTrue(
+                p1_local_out >= test["p1"], msg_prefix + "p1: Want: " +
+                str(test["p1"]) + " Is: " + str(p1_local_out)
+            )
+            self.assertTrue(
+                r1_local_out >= test["r1"], msg_prefix + "r1: Want: " +
+                str(test["r1"]) + " Is: " + str(r1_local_out)
+            )
+            path_size = get_path_size(model_filename)
+            size_msg = str(test["size"]) + " Is: " + str(path_size)
+            if lessthan:
+                self.assertTrue(
+                    path_size <= test["size"],
+                    msg_prefix + "Size: Want at most: " + size_msg
+                )
+            else:
+                self.assertTrue(
+                    path_size == test["size"],
+                    msg_prefix + "Size: Want: " + size_msg
+                )
+
+        output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
+        model = train_supervised(**configuration["train_args"])
+        model.save_model(output + ".bin")
+        check(model, output + ".bin", configuration["test"], False)
+        model.quantize(**configuration["quant_args"])
+        model.save_model(output + ".ftz")
+        check(
+            model, output + ".ftz", configuration["quant_test"], True, "Quant: "
+        )
+
+    return sup_test
+
+
+def gen_small_tests(data_dir):
+    class TestFastTextSmallPy(unittest.TestCase):
+        pass
+
+    for configuration in get_supervised_models(data_dir=data_dir):
+        if configuration["dataset"] == "dbpedia":
+            setattr(
+                TestFastTextSmallPy, "test_small_" + configuration["dataset"],
+                gen_sup_test(configuration)
+            )
+    return TestFastTextSmallPy
+
+
+def gen_tests(data_dir):
+    class TestFastTextPy(unittest.TestCase):
+        pass
+
+    for configuration in get_supervised_models(data_dir=data_dir):
+        setattr(
+            TestFastTextPy, "test_" + configuration["dataset"],
+            gen_sup_test(configuration)
+        )
+    return TestFastTextPy
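
read_labels splits each line of a fastText-format file into its text and its __label__ tokens. A minimal usage sketch (the path is hypothetical):

```
from fastText.tests.test_script import read_labels

lines, labels = read_labels("data/dbpedia.test")
print(lines[0])   # the example text, labels stripped
print(labels[0])  # e.g. ["__label__1"]
```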

+ 14 - 0
python/fastText/util/__init__.py

@@ -0,0 +1,14 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .util import test
+from .util import find_nearest_neighbor

+ 60 - 0
python/fastText/util/util.py

@@ -0,0 +1,60 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+# NOTE: The purpose of this file is not to accumulate all useful utility
+# functions; it should contain only very commonly used and requested functions
+# (such as test). If you think you have a function at that level, please create
+# an issue and we will happily review your suggestion. This file is also not
+# supposed to pull in dependencies outside of numpy/scipy without very good
+# reasons. For example, it should not use sklearn and matplotlib to produce
+# a t-SNE plot of word embeddings.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+
+
+def test(predictions, labels, k=1):
+    """
+    Return precision and recall modeled after fasttext's test
+    """
+    precision = 0.0
+    nexamples = 0
+    nlabels = 0
+    for prediction, gold in zip(predictions, labels):
+        for p in prediction:
+            if p in gold:
+                precision += 1
+        nexamples += 1
+        nlabels += len(gold)
+    return (precision / (k * nexamples), precision / nlabels)
+
+
+def find_nearest_neighbor(query, vectors, ban_set, cossims=None):
+    """
+    query is a 1d numpy array corresponding to the vector to which you want to
+    find the closest vector
+    vectors is a 2d numpy array corresponding to the vectors you want to consider
+    ban_set is a set of indices within vectors you want to ignore for nearest match
+    cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
+
+    returns the index of the closest match to query within vectors
+
+    """
+    if cossims is None:
+        cossims = np.matmul(vectors, query)
+    else:
+        np.matmul(vectors, query, out=cossims)
+    rank = len(cossims) - 1
+    result_i = np.argpartition(cossims, rank)[rank]
+    while result_i in ban_set:
+        rank -= 1
+        result_i = np.argpartition(cossims, rank)[rank]
+    return result_i
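
A short sketch exercising both utilities. Note that find_nearest_neighbor ranks by raw dot product, so the rows of `vectors` are assumed to be L2-normalized if cosine similarity is intended:

```
import numpy as np

from fastText import util

# Precision/recall at k=1 on toy predictions: both come out to 0.5 here.
predictions = [["__label__A"], ["__label__B"]]
labels = [["__label__A"], ["__label__C"]]
p1, r1 = util.test(predictions, labels)

# Nearest neighbor to vectors[0], ignoring the query itself.
vectors = np.random.randn(100, 10)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)
nn = util.find_nearest_neighbor(vectors[0], vectors, ban_set={0})
```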

+ 31 - 0
runtests.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+#
+
+# To run these tests you must first fetch all the required test data;
+# have a look at tests/fetch_test_data.sh.
+# Then point this script at the resulting data directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import unittest
+import argparse
+from fastText.tests import gen_tests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("data_dir", help="Full path to data directory")
+    args = parser.parse_args()
+    tests = gen_tests(args.data_dir)
+    suite = unittest.TestLoader().loadTestsFromTestCase(tests)
+    unittest.TextTestRunner(verbosity=3).run(suite)

+ 13 - 7
setup.py

@@ -18,7 +18,7 @@ import sys
 import setuptools
 import os
 
-__version__ = '0.0.3'
+__version__ = '0.0.6'
 FASTTEXT_SRC = "src"
 
 # Based on https://github.com/pybind/python_example
@@ -81,16 +81,18 @@ def has_flag(compiler, flagname):
 
 
 def cpp_flag(compiler):
-    """Return the -std=c++[11/14] compiler flag.
-    The c++14 is preferred over c++11 (when it is available).
+    """Return the -std=c++[0x/11/14] compiler flag.
+    c++14 is preferred over c++0x/11 (when it is available).
     """
     if has_flag(compiler, '-std=c++14'):
         return '-std=c++14'
     elif has_flag(compiler, '-std=c++11'):
         return '-std=c++11'
+    elif has_flag(compiler, '-std=c++0x'):
+        return '-std=c++0x'
     else:
         raise RuntimeError(
-            'Unsupported compiler -- at least C++11 support '
+            'Unsupported compiler -- at least C++0x support '
             'is needed!'
         )
 
@@ -124,7 +126,7 @@ class BuildExt(build_ext):
 
 
 setup(
-    name='fastTextpy',
+    name='fasttext',
     version=__version__,
     author='Christian Puhrsch',
     author_email='[email protected]',
@@ -135,7 +137,11 @@ setup(
     license='BSD',
     install_requires=['pybind11>=2.2', "setuptools >= 0.7.0"],
     cmdclass={'build_ext': BuildExt},
-    packages=[str('fastText')],
+    packages=[
+        str('fastText'),
+        str('fastText.util'),
+        str('fastText.tests'),
+    ],
     package_dir={str(''): str('python')},
-    zip_safe=False
+    zip_safe=False,
 )

+ 129 - 0
tests/fetch_test_data.sh

@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+#
+
+DATADIR=data
+
+report_error() {
+   echo "Error on line $1 of $0"
+}
+
+myshuf() {
+  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
+}
+
+normalize_text() {
+  tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
+    sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
+        -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
+        -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
+}
+
+set -e
+trap 'report_error $LINENO' ERR
+
+mkdir -p "${DATADIR}"
+
+data_result="${DATADIR}/dbpedia_csv.tar.gz"
+if [ ! -f "$data_result" ] || \
+   [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ]
+then
+  wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result"
+  tar -xzvf "$data_result" -C "${DATADIR}"
+fi
+
+data_result="${DATADIR}/dbpedia.train"
+if [ ! -f "$data_result" ]
+then
+  cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/dbpedia.test"
+if [ ! -f "$data_result" ]
+then
+  cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/rw_queries.txt"
+if [ ! -f "$data_result" ]
+then
+  cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/enwik9.zip"
+if [ ! -f "$data_result" ] || \
+   [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ]
+then
+  wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result"
+  unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/fil9"
+if [ ! -f "$data_result" ]
+then
+  perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/rw/rw.txt"
+if [ ! -f "$data_result" ]
+then
+  wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
+  unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result"
+fi
+
+DATASET=(
+  ag_news
+  sogou_news
+  dbpedia
+  yelp_review_polarity
+  yelp_review_full
+  yahoo_answers
+  amazon_review_full
+  amazon_review_polarity
+)
+
+ID=(
+  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
+  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
+  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
+  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
+  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
+  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
+  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
+  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
+)
+
+# Small datasets first
+
+for i in {0..0}
+do
+  echo "Downloading dataset ${DATASET[i]}"
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
+  then
+    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
+    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
+    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
+  fi
+done
+
+# Large datasets require a bit more work due to the extra request page
+
+for i in {1..7}
+do
+  echo "Downloading dataset ${DATASET[i]}"
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
+  then
+    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
+    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
+    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
+    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
+  fi
+done
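
For reference, a rough Python equivalent of the normalize_text pipeline above; this is a sketch only, not guaranteed to match the sed/tr output byte for byte:

```
import random
import re

def normalize_text(lines):
    out = []
    for line in lines:
        line = "__label__" + line.lower()
        line = line.replace("'", " ' ").replace('"', '')
        line = line.replace('<br />', ' ')
        for ch in '.,()!?':
            line = line.replace(ch, ' ' + ch + ' ')
        line = re.sub(r'[;:]', ' ', line)
        out.append(re.sub(r' +', ' ', line))
    random.shuffle(out)  # stands in for myshuf
    return out
```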