
move tests to OSS / util / update package name

Summary: See title.

Differential Revision: D6477912

fbshipit-source-id: 6f9f0f4d6e1c8a4b20f117f2ad2d12211d09ac5a
Christian Puhrsch 8 years ago
parent
commit
bcd5250fd4

+ 3 - 0
CONTRIBUTING.md

@@ -19,6 +19,9 @@ To create a pull request:
 5. Make sure your code lints.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").
 
+## Tests
+First, make sure you have the required data: have a look at the fetch_test_data.sh script under tests. Next, run the tests using the runtests.py script, passing it the path to the directory containing the datasets (see the sketch below).
+
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need
 to do this once to work on any of Facebook's open source projects.
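
As a sketch of the new test workflow described above, the suite can also be driven directly from Python once fetch_test_data.sh has populated a data directory; the `data` path below is hypothetical:

```
# A minimal sketch, assuming tests/fetch_test_data.sh has already
# downloaded the datasets into a local "data" directory.
import unittest

from fastText.tests import gen_small_tests

suite = unittest.TestLoader().loadTestsFromTestCase(gen_small_tests("data"))
unittest.TextTestRunner(verbosity=2).run(suite)
```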

+ 0 - 2
python/fastText/FastText.py

@@ -280,7 +280,6 @@ def train_supervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train a supervised model and return a model object.
@@ -322,7 +321,6 @@ def train_unsupervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train an unsupervised model and return a model object.
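
With saveOutput removed from both signatures, saving is done explicitly through the model object. A minimal sketch, assuming a labeled training file in fastText format (the path is hypothetical; the parameters mirror the supervised configurations used by the test suite):

```
from fastText import train_supervised

model = train_supervised(
    input="data/dbpedia.train", lr=0.5, dim=10, wordNgrams=2,
    minCount=1, bucket=10000000, epoch=5
)
# Explicit save replaces the old saveOutput flag.
model.save_model("dbpedia.bin")
```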

+ 0 - 7
python/fastText/test/README.md

@@ -1,7 +0,0 @@
-To run this test script you need to provide a path to the fasttext binary built in debug mode and a folder with the datasets downloaded by classification-results.sh and word-vector-example.sh.
-
-Example run:
-
-```
-FASTTEXT_BIN=fasttext_bin FASTTEXT_DATA=data python test_script.py
-```

+ 0 - 607
python/fastText/test/test_script.py

@@ -1,607 +0,0 @@
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree. An additional grant
-# of patent rights can be found in the PATENTS file in the same directory.
-
-from __future__ import absolute_import
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-from fastText import train_supervised
-from fastText import train_unsupervised
-from fastText import load_model
-from fastText import tokenize
-import random
-import sys
-import os
-import subprocess
-import multiprocessing
-import numpy as np
-import unittest
-import tempfile
-import math
-from scipy import stats
-
-
-def compat_splitting(line):
-    return line.decode('utf8').split()
-
-
-def similarity(v1, v2):
-    n1 = np.linalg.norm(v1)
-    n2 = np.linalg.norm(v2)
-    return np.dot(v1, v2) / n1 / n2
-
-
-def read_vectors(model_path):
-    vectors = {}
-    with open(model_path, 'rb') as fin:
-        for _, line in enumerate(fin):
-            try:
-                tab = compat_splitting(line)
-                vec = np.array(tab[1:], dtype=float)
-                word = tab[0]
-                if np.linalg.norm(vec) == 0:
-                    continue
-                if word not in vectors:
-                    vectors[word] = vec
-            except ValueError:
-                continue
-            except UnicodeDecodeError:
-                continue
-    return vectors
-
-
-def compute_similarity(model_path, data_path, vectors=None):
-    if not vectors:
-        vectors = read_vectors(model_path)
-
-    mysim = []
-    gold = []
-    drop = 0.0
-    nwords = 0.0
-
-    with open(data_path, 'rb') as fin:
-        for line in fin:
-            tline = compat_splitting(line)
-            word1 = tline[0].lower()
-            word2 = tline[1].lower()
-            nwords = nwords + 1.0
-
-            if (word1 in vectors) and (word2 in vectors):
-                v1 = vectors[word1]
-                v2 = vectors[word2]
-                d = similarity(v1, v2)
-                mysim.append(d)
-                gold.append(float(tline[2]))
-            else:
-                drop = drop + 1.0
-
-    corr = stats.spearmanr(mysim, gold)
-    dataset = os.path.basename(data_path)
-    correlation = corr[0] * 100
-    oov = math.ceil(drop / nwords * 100.0)
-    return dataset, correlation, oov
-
-
-def get_random_unicode(length):
-    # See: https://stackoverflow.com/questions/1477294/generate-random-utf-8-string-in-python
-
-    try:
-        get_char = unichr
-    except NameError:
-        get_char = chr
-
-    # Update this to include code point ranges to be sampled
-    include_ranges = [
-        (0x0021, 0x0021),
-        (0x0023, 0x0026),
-        (0x0028, 0x007E),
-        (0x00A1, 0x00AC),
-        (0x00AE, 0x00FF),
-        (0x0100, 0x017F),
-        (0x0180, 0x024F),
-        (0x2C60, 0x2C7F),
-        (0x16A0, 0x16F0),
-        (0x0370, 0x0377),
-        (0x037A, 0x037E),
-        (0x0384, 0x038A),
-        (0x038C, 0x038C),
-    ]
-
-    alphabet = [
-        get_char(code_point)
-        for current_range in include_ranges
-        for code_point in range(current_range[0], current_range[1] + 1)
-    ]
-    return ''.join(random.choice(alphabet) for i in range(length))
-
-
-def get_random_words(N, a, b):
-    words = []
-    for _ in range(N):
-        length = random.randint(a, b)
-        words.append(get_random_unicode(length))
-    return words
-
-
-class TestFastTextPy(unittest.TestCase):
-    @classmethod
-    def eprint(cls, *args, **kwargs):
-        print(*args, file=sys.stderr, **kwargs)
-
-    @classmethod
-    def num_thread(cls):
-        return multiprocessing.cpu_count() - 1
-
-    @classmethod
-    def build_paths(cls, train, test, output):
-        train = os.path.join(cls.data_dir, train)
-        test = os.path.join(cls.data_dir, test)
-        output = os.path.join(cls.result_dir, output)
-        return train, test, output
-
-    @classmethod
-    def build_train_args(cls, params, mode, train, output):
-        args = [cls.bin, mode, "-input", train, "-output", output]
-        return args + params.split(' ')
-
-    @classmethod
-    def get_train_output(cls, train_args):
-        cls.eprint("Executing: " + ' '.join(train_args))
-        return subprocess.check_output(train_args).decode('utf-8')
-
-    @classmethod
-    def get_path_size(cls, path):
-        path_size = subprocess.check_output(["stat", "-c", "%s",
-                                             path]).decode('utf-8')
-        path_size = int(path_size)
-        return path_size
-
-    @classmethod
-    def default_test_args(cls, model, test, quantize=False):
-        return [cls.bin, "test", model, test]
-
-    @classmethod
-    def get_test_output(cls, test_args):
-        cls.eprint("Executing: " + ' '.join(test_args))
-        test_output = subprocess.check_output(test_args)
-        test_output = test_output.decode('utf-8')
-        cls.eprint("Test output:\n" + test_output)
-        return list(
-            map(lambda x: x.split('\t')[1], test_output.split('\n')[:-1])
-        )
-
-    @classmethod
-    def train_generic_classifier(cls, train, output):
-        thread = cls.num_thread()
-        cls.eprint("Using {} threads".format(thread))
-        sup_params = (
-            "-dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 "
-            "-epoch 5 -thread {}".format(thread)
-        )
-        mode = 'supervised'
-        cls.get_train_output(
-            cls.build_train_args(sup_params, mode, train, output)
-        )
-
-    @classmethod
-    def train_generic_embeddings(cls, train, output):
-        thread = cls.num_thread()
-        cls.eprint("Using {} threads".format(thread))
-        unsup_params = (
-            "-thread {} -lr 0.025 -dim 100 -ws 5 -epoch 1 -minCount 5 "
-            "-neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 -t 1e-4 "
-            "-lrUpdateRate 100".format(thread)
-        )
-        mode = 'cbow'
-        cls.get_train_output(
-            cls.build_train_args(unsup_params, mode, train, output)
-        )
-
-    def get_predictions_from_list(self, output, words, k):
-        args = [self.bin, "predict-prob", output + '.bin', '-', str(k)]
-        self.eprint("Executing: " + ' '.join(args))
-        p = subprocess.Popen(
-            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
-        )
-        test_text = ""
-        if words:
-            test_text = '\n'.join(words) + '\n'
-        test_text = test_text.encode('utf-8')
-        stdout, stderr = p.communicate(test_text)
-        stdout = stdout.decode('utf-8')
-        return stdout, stderr, p.returncode
-
-    def get_word_vectors_from_list(self, output, words):
-        args = [self.bin, "print-word-vectors", output + '.bin']
-        self.eprint("Executing: " + ' '.join(args))
-        p = subprocess.Popen(
-            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
-        )
-        test_text = '\n'.join(words).encode('utf-8')
-        stdout, stderr = p.communicate(test_text)
-        return stdout
-
-
-class TestFastTextPyUnit(TestFastTextPy):
-    @classmethod
-    def setUpClass(cls):
-        cls.bin = os.environ['FASTTEXT_BIN']
-        cls.data_dir = os.environ['FASTTEXT_DATA']
-        cls.result_dir = tempfile.mkdtemp()
-        train, _, output = cls.build_paths("fil9", "rw/rw.txt", "fil9")
-        cls.train_generic_embeddings(train, output)
-        cls.output = output
-        train, _, output_sup = cls.build_paths(
-            "dbpedia.train", "dbpedia.test", "dbpedia"
-        )
-        cls.train_generic_classifier(train, output_sup)
-        cls.output_sup = output_sup
-
-    @classmethod
-    def tearDownClass(cls):
-        pass
-        # shutil.rmtree(cls.result_dir)
-
-    # Check if get_word_vector aligns with vectors from stdin
-    def test_getvector(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(100, 1, 100)
-        ftbin_vectors = self.get_word_vectors_from_list(self.output, words)
-        ftbin_vectors = ftbin_vectors.decode('utf-8').split('\n')[:-1]
-        for v in ftbin_vectors:
-            word = v.split(' ')[0]
-            vector = v.split(' ')[1:-1]
-            vector = np.array(list(map(float, vector)))
-            pvec = f.get_word_vector(word)
-            # The fasttext cli returns floats with 5 digits,
-            # but we use the full 6 digits.
-            self.assertTrue(np.allclose(vector, pvec, rtol=1e-04))
-
-    def test_predict(self):
-        # TODO: I went a little crazy here as an exercise for
-        # a rigorous test case. This could be turned into
-        # a few utility functions.
-        f = load_model(self.output_sup + '.bin')
-
-        def _test(N, min_length, max_length, k, add_vocab=0):
-            words = get_random_words(N, min_length, max_length)
-            if add_vocab > 0:
-                vocab, _ = f.get_words(include_freq=True)
-                for _ in range(add_vocab):
-                    ind = random.randint(0, len(vocab))
-                    words += [vocab[ind]]
-            all_labels = []
-            all_probs = []
-            ii = 0
-            gotError = False
-            for w in words:
-                try:
-                    labels, probs = f.predict(w, k)
-                except ValueError:
-                    gotError = True
-                    continue
-                all_labels.append(labels)
-                all_probs.append(probs)
-                ii += 1
-            preds, _, retcode = self.get_predictions_from_list(
-                self.output_sup, words, k
-            )
-            if gotError and retcode == 0:
-                self.eprint(
-                    "Didn't get error. Make sure your compiled "
-                    "binary kept the assert statements"
-                )
-                self.assertTrue(False)
-            else:
-                return
-            preds = preds.split('\n')[:-1]
-            self.assertEqual(len(preds), len(all_labels))
-            for i in range(len(preds)):
-                labels = preds[i].split()
-                probs = np.array(list(map(float, labels[1::2])))
-                labels = np.array(labels[::2])
-                self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04))
-                self.assertTrue(np.array_equal(labels, all_labels[i]))
-
-        _test(0, 0, 0, 0)
-        _test(1, 0, 0, 0)
-        _test(10, 0, 0, 0)
-        _test(1, 1, 1, 0)
-        _test(1, 1, 1, 1)
-        _test(1, 2, 3, 0)
-        _test(1, 2, 3, 1)
-        _test(10, 1, 1, 1)
-        _test(1, 1, 1, 0, add_vocab=10)
-        _test(1, 1, 1, 1, add_vocab=10)
-        _test(1, 2, 3, 0, add_vocab=10)
-        _test(1, 2, 3, 1, add_vocab=10)
-        reach = 10
-        for _ in range(10):
-            N = random.randint(0, reach)
-            init = random.randint(0, reach)
-            offset = random.randint(0, reach)
-            k = random.randint(0, reach)
-            _test(N, init, init + offset, k)
-
-    def test_vocab(self):
-        f = load_model(self.output + '.bin')
-        words, freq = f.get_words(include_freq=True)
-        self.eprint(
-            "There is no way to access words from the cli yet. "
-            "Therefore there can be no rigorous test."
-        )
-
-    def test_subwords(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(10, 1, 10)
-        for w in words:
-            f.get_subwords(w)
-        self.eprint(
-            "There is no way to access words from the cli yet. "
-            "Therefore there can be no test."
-        )
-
-    def test_tokenize(self):
-        train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9")
-        with open(train, 'r') as f:
-            _ = tokenize(f.read())
-
-    def test_dimension(self):
-        f = load_model(self.output + '.bin')
-        f.get_dimension()
-
-    def test_subword_vector(self):
-        f = load_model(self.output + '.bin')
-        words, _ = f.get_words(include_freq=True)
-        words += get_random_words(10000, 1, 200)
-        input_matrix = f.get_input_matrix()
-        for word in words:
-
-            # Universal api to get word vector
-            vec1 = f.get_word_vector(word)
-
-            # Build word vector from subwords
-            subwords, subinds = f.get_subwords(word)
-            subvectors = list(map(lambda x: f.get_input_vector(x), subinds))
-            subvectors = np.stack(subvectors)
-            vec2 = np.sum((subvectors / len(subwords)), 0)
-
-            # Build word vector from subinds
-            vec3 = np.sum(input_matrix[subinds] / len(subinds), 0)
-
-            # Build word vectors from word and subword ids
-            wid = f.get_word_id(word)
-            if wid >= 0:
-                swids = list(map(lambda x: f.get_subword_id(x), subwords[1:]))
-                swids.append(wid)
-            else:
-                swids = list(map(lambda x: f.get_subword_id(x), subwords))
-            swids = np.array(swids)
-            vec4 = np.sum(input_matrix[swids] / len(swids), 0)
-
-            self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all())
-            self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all())
-
-    # TODO: Compare with .vec file
-    def test_get_words(self):
-        f = load_model(self.output + '.bin')
-        words1, freq1 = f.get_words(include_freq=True)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(words1), len(words2))
-        self.assertEqual(len(words1), len(freq1))
-        f = load_model(self.output_sup + '.bin')
-        words1, freq1 = f.get_words(include_freq=True)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(words1), len(words2))
-        self.assertEqual(len(words1), len(freq1))
-
-    # TODO: Compare with .vec file for unsup
-    def test_get_labels(self):
-        f = load_model(self.output + '.bin')
-        labels1, freq1 = f.get_labels(include_freq=True)
-        labels2 = f.get_labels(include_freq=False)
-        words2 = f.get_words(include_freq=False)
-        self.assertEqual(len(labels1), len(labels2))
-        self.assertEqual(len(labels1), len(freq1))
-        self.assertEqual(len(labels1), len(words2))
-        for w1, w2 in zip(labels2, words2):
-            self.assertEqual(w1, w2)
-        f = load_model(self.output_sup + '.bin')
-        labels1, freq1 = f.get_labels(include_freq=True)
-        labels2 = f.get_labels(include_freq=False)
-        self.assertEqual(len(labels1), len(labels2))
-        self.assertEqual(len(labels1), len(freq1))
-
-    def test_exercise_is_quant(self):
-        f = load_model(self.output + '.bin')
-        gotError = False
-        try:
-            f.quantize()
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-        f = load_model(self.output_sup + '.bin')
-        self.assertTrue(not f.is_quantized())
-        f.quantize()
-        self.assertTrue(f.is_quantized())
-
-    def test_newline_predict_sentence(self):
-        f = load_model(self.output_sup + '.bin')
-        sentence = get_random_words(1, 1000, 2000)[0]
-        f.predict(sentence, k=5)
-        sentence += "\n"
-        gotError = False
-        try:
-            f.predict(sentence, k=5)
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-
-        f = load_model(self.output + '.bin')
-        sentence = get_random_words(1, 1000, 2000)[0]
-        f.get_sentence_vector(sentence)
-        sentence += "\n"
-        gotError = False
-        try:
-            f.get_sentence_vector(sentence)
-        except ValueError:
-            gotError = True
-        self.assertTrue(gotError)
-
-
-class TestFastTextPyIntegration(TestFastTextPy):
-    @classmethod
-    def setUpClass(cls):
-        cls.bin = os.environ['FASTTEXT_BIN']
-        cls.data_dir = os.environ['FASTTEXT_DATA']
-        cls.result_dir = tempfile.mkdtemp()
-
-    def test_unsup1(self):
-        train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9")
-
-        model = train_unsupervised(
-            input=train,
-            model="skipgram",
-            lr=0.025,
-            dim=100,
-            ws=5,
-            epoch=1,
-            minCount=5,
-            neg=5,
-            loss="ns",
-            bucket=2000000,
-            minn=3,
-            maxn=6,
-            t=1e-4,
-            lrUpdateRate=100,
-            thread=self.num_thread(),
-        )
-        model.save_model(output)
-
-        path_size = self.get_path_size(output)
-        vectors = {}
-        with open(test, 'r') as test_f:
-            for line in test_f:
-                query0 = line.split()[0].strip()
-                query1 = line.split()[1].strip()
-                vector0 = model.get_word_vector(query0)
-                vector1 = model.get_word_vector(query1)
-                vectors[query0] = vector0
-                vectors[query1] = vector1
-        dataset, correlation, oov = compute_similarity(None, test, vectors)
-        correlation = np.around(correlation)
-
-        self.assertTrue(
-            correlation >= 41, "Correlation: Want: 41 Is: " + str(correlation)
-        )
-        self.assertEqual(oov, 0.0, "Oov: Want: 0 Is: " + str(oov))
-        self.assertEqual(
-            path_size, 978480868, "Size: Want: 978480868 Is: " + str(path_size)
-        )
-
-
-def gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size):
-    def sup_test(self):
-        def check(
-            output_local, test_local, n_local, p1_local, r1_local, size_local,
-            lessthan
-        ):
-            test_args = self.default_test_args(output_local, test_local)
-            test_output = self.get_test_output(test_args)
-            self.assertEqual(
-                str(test_output[0]),
-                str(n_local),
-                "N: Want: " + str(n_local) + " Is: " + str(test_output[0])
-            )
-            self.assertTrue(
-                float(test_output[1]) >= float(p1_local),
-                "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1])
-            )
-            self.assertTrue(
-                float(test_output[2]) >= float(r1_local),
-                "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2])
-            )
-            path_size = self.get_path_size(output_local)
-            if lessthan:
-                self.assertTrue(
-                    path_size <= size_local, "Size: Want at most: " +
-                    str(size_local) + " Is: " + str(path_size)
-                )
-            else:
-                self.assertTrue(
-                    path_size == size_local,
-                    "Size: Want: " + str(size_local) + " Is: " + str(path_size)
-                )
-
-        train, test, output = self.build_paths(
-            dataset + ".train", dataset + ".test", dataset
-        )
-        model = train_supervised(
-            input=train,
-            dim=10,
-            lr=lr,
-            wordNgrams=2,
-            minCount=1,
-            bucket=10000000,
-            epoch=5,
-            thread=self.num_thread()
-        )
-        model.save_model(output)
-        check(output, test, n, p1, r1, size, False)
-        # Exercising
-        model.predict("hello world")
-        model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
-        model.save_model(output + ".ftz")
-        # Exercising
-        model.predict("hello world")
-        check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True)
-
-    return sup_test
-
-
-if __name__ == "__main__":
-    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
-    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
-    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
-    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
-    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
-    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
-    sup_job_size = [
-        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
-        483742593, 493604598
-    ]
-    sup_job_quant_size = [
-        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
-        483742593, 493604598
-    ]
-    sup_job_quant_size = [
-        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575010
-    ]
-    # Yelp_review_full can be a bit flaky
-    sup_job_dataset = [
-        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
-        "yelp_review_full", "yahoo_answers", "amazon_review_full",
-        "amazon_review_polarity"
-    ]
-    sup_job_args = [
-        sup_job_lr, sup_job_dataset, sup_job_n, sup_job_p1, sup_job_r1,
-        sup_job_quant_p1, sup_job_quant_r1, sup_job_size, sup_job_quant_size
-    ]
-    for lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size in zip(
-        *sup_job_args
-    ):
-        setattr(
-            TestFastTextPyIntegration, "test_" + dataset,
-            gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size)
-        )
-    unittest.main()
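
One property the removed test_subword_vector exercised survives in the new API: a word's vector is the average of its subword input vectors. A minimal sketch, assuming a previously trained model file (the "fil9.bin" path is hypothetical):

```
import numpy as np

from fastText import load_model

f = load_model("fil9.bin")
subwords, subinds = f.get_subwords("example")
# Average the input vectors of all subwords (and the word itself, if in vocab).
vec = np.mean([f.get_input_vector(i) for i in subinds], axis=0)
assert np.allclose(vec, f.get_word_vector("example"), atol=1e-5)
```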

+ 15 - 0
python/fastText/tests/__init__.py

@@ -0,0 +1,15 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .test_configurations import get_supervised_models
+from .test_script import gen_tests
+from .test_script import gen_small_tests

+ 104 - 0
python/fastText/tests/test_configurations.py

@@ -0,0 +1,104 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import multiprocessing
+import os
+
+# This file defines a collection of integration tests.
+# Each integration test comes with a full set of parameters,
+# a dataset, and expected metrics.
+# These configurations can be used by the various fastText APIs
+# to confirm a baseline level of correctness.
+
+# Supervised models
+# See https://fasttext.cc/docs/en/supervised-models.html
+
+
+def max_thread():
+    return multiprocessing.cpu_count() - 1
+
+
+def get_supervised_models(data_dir=""):
+    sup_job_dataset = [
+        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
+        "yelp_review_full", "yahoo_answers", "amazon_review_full",
+        "amazon_review_polarity"
+    ]
+
+    sup_params = {
+        "dim": 10,
+        "wordNgrams": 2,
+        "minCount": 1,
+        "bucket": 10000000,
+        "epoch": 5,
+        "thread": max_thread(),
+        "verbose": 1,
+    }
+    quant_params = {
+        "retrain": True,
+        "cutoff": 100000,
+        "qnorm": True,
+        "verbose": 1,
+    }
+    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
+
+    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
+
+    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_size = [
+        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
+        483742593, 493604598
+    ]
+
+    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
+    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
+    sup_job_quant_size = [
+        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575000
+    ]
+
+    configurations = []
+    for i in range(len(sup_job_dataset)):
+        configuration = {}
+        configuration["dataset"] = sup_job_dataset[i]
+        args = sup_params.copy()
+        quant_args = quant_params.copy()
+        args["lr"] = sup_job_lr[i]
+        args["input"] = sup_job_dataset[i] + ".train"
+        quant_args["lr"] = sup_job_lr[i]
+        quant_args["input"] = sup_job_dataset[i] + ".train"
+        if data_dir:
+            args["input"] = os.path.join(data_dir, args["input"])
+            quant_args["input"] = os.path.join(data_dir, quant_args["input"])
+        configuration["train_args"] = args
+        configuration["quant_args"] = quant_args
+        test = {
+            "n": sup_job_n[i],
+            "p1": sup_job_p1[i],
+            "r1": sup_job_r1[i],
+            "size": sup_job_size[i],
+            "data": sup_job_dataset[i] + ".test",
+        }
+        quant_test = {
+            "n": sup_job_n[i],
+            "p1": sup_job_quant_p1[i],
+            "r1": sup_job_quant_r1[i],
+            "size": sup_job_quant_size[i],
+            "data": sup_job_dataset[i] + ".test",
+        }
+        if data_dir:
+            test["data"] = os.path.join(data_dir, test["data"])
+            quant_test["data"] = os.path.join(data_dir, quant_test["data"])
+        configuration["test"] = test
+        configuration["quant_test"] = quant_test
+        configurations.append(configuration)
+    return configurations
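
A brief sketch of how these configurations can be consumed; the `data` directory is assumed to have been populated by fetch_test_data.sh:

```
from fastText.tests.test_configurations import get_supervised_models

for cfg in get_supervised_models(data_dir="data"):
    # Each entry bundles train/quantization args with expected metrics.
    print(cfg["dataset"], cfg["train_args"]["lr"], cfg["test"]["p1"])
```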

+ 124 - 0
python/fastText/tests/test_script.py

@@ -0,0 +1,124 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from fastText import train_supervised
+from fastText import util
+import os
+import subprocess
+import unittest
+import tempfile
+try:
+    import unicode
+except ImportError:
+    pass
+from fastText.tests.test_configurations import get_supervised_models
+
+
+def read_labels(data_file):
+    labels = []
+    lines = []
+    with open(data_file, 'r') as f:
+        for line in f:
+            labels_line = []
+            words_line = []
+            try:
+                line = unicode(line, "UTF-8").split()
+            except NameError:
+                line = line.split()
+            for word in line:
+                if word.startswith("__label__"):
+                    labels_line.append(word)
+                else:
+                    words_line.append(word)
+            labels.append(labels_line)
+            lines.append(" ".join(words_line))
+    return lines, labels
+
+
+# Generate a supervised test case
+# The returned function will be set as an attribute to a test class
+def gen_sup_test(configuration):
+    def sup_test(self):
+        def get_path_size(path):
+            path_size = subprocess.check_output(["stat", "-c", "%s",
+                                                 path]).decode('utf-8')
+            path_size = int(path_size)
+            return path_size
+
+        def check(model, model_filename, test, lessthan, msg_prefix=""):
+            lines, labels = read_labels(test["data"])
+            predictions = []
+            for line in lines:
+                pred_label, _ = model.predict(line)
+                predictions.append(pred_label)
+            p1_local_out, r1_local_out = util.test(predictions, labels)
+            self.assertEqual(
+                len(predictions), test["n"], msg_prefix + "N: Want: " +
+                str(test["n"]) + " Is: " + str(len(predictions))
+            )
+            self.assertTrue(
+                p1_local_out >= test["p1"], msg_prefix + "p1: Want: " +
+                str(test["p1"]) + " Is: " + str(p1_local_out)
+            )
+            self.assertTrue(
+                r1_local_out >= test["r1"], msg_prefix + "r1: Want: " +
+                str(test["r1"]) + " Is: " + str(r1_local_out)
+            )
+            path_size = get_path_size(model_filename)
+            size_msg = str(test["size"]) + " Is: " + str(path_size)
+            if lessthan:
+                self.assertTrue(
+                    path_size <= test["size"],
+                    msg_prefix + "Size: Want at most: " + size_msg
+                )
+            else:
+                self.assertTrue(
+                    path_size == test["size"],
+                    msg_prefix + "Size: Want: " + size_msg
+                )
+
+        output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
+        model = train_supervised(**configuration["train_args"])
+        model.save_model(output + ".bin")
+        check(model, output + ".bin", configuration["test"], False)
+        model.quantize(**configuration["quant_args"])
+        model.save_model(output + ".ftz")
+        check(
+            model, output + ".ftz", configuration["quant_test"], True, "Quant: "
+        )
+
+    return sup_test
+
+
+def gen_small_tests(data_dir):
+    class TestFastTextSmallPy(unittest.TestCase):
+        pass
+
+    for configuration in get_supervised_models(data_dir=data_dir):
+        if configuration["dataset"] == "dbpedia":
+            setattr(
+                TestFastTextSmallPy, "test_small_" + configuration["dataset"],
+                gen_sup_test(configuration)
+            )
+    return TestFastTextSmallPy
+
+
+def gen_tests(data_dir):
+    class TestFastTextPy(unittest.TestCase):
+        pass
+
+    for configuration in get_supervised_models(data_dir=data_dir):
+        setattr(
+            TestFastTextPy, "test_" + configuration["dataset"],
+            gen_sup_test(configuration)
+        )
+    return TestFastTextPy
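
read_labels splits each line of a fastText-format file into its text and its __label__ tokens. A minimal usage sketch (the path is hypothetical):

```
from fastText.tests.test_script import read_labels

lines, labels = read_labels("data/dbpedia.test")
print(lines[0])   # the example text, labels stripped
print(labels[0])  # e.g. ["__label__1"]
```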

+ 14 - 0
python/fastText/util/__init__.py

@@ -0,0 +1,14 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .util import test
+from .util import find_nearest_neighbor

+ 60 - 0
python/fastText/util/util.py

@@ -0,0 +1,60 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+# NOTE: The purpose of this file is not to accumulate all useful utility
+# functions; it should contain only very commonly used and requested functions
+# (such as test). If you think you have a function at that level, please create
+# an issue and we will happily review your suggestion. This file is also not
+# supposed to pull in dependencies outside of numpy/scipy without very good
+# reasons. For example, it should not use sklearn and matplotlib to produce
+# a t-SNE plot of word embeddings.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+
+
+def test(predictions, labels, k=1):
+    """
+    Return precision and recall modeled after fasttext's test
+    """
+    precision = 0.0
+    nexamples = 0
+    nlabels = 0
+    for prediction, gold in zip(predictions, labels):
+        for p in prediction:
+            if p in gold:
+                precision += 1
+        nexamples += 1
+        nlabels += len(gold)
+    return (precision / (k * nexamples), precision / nlabels)
+
+
+def find_nearest_neighbor(query, vectors, ban_set, cossims=None):
+    """
+    query is a 1d numpy array corresponding to the vector to which you want to
+    find the closest vector
+    vectors is a 2d numpy array corresponding to the vectors you want to consider
+    ban_set is a set of indices within vectors you want to ignore for nearest match
+    cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
+
+    returns the index of the closest match to query within vectors
+
+    """
+    if cossims is None:
+        cossims = np.matmul(vectors, query)
+    else:
+        np.matmul(vectors, query, out=cossims)
+    rank = len(cossims) - 1
+    result_i = np.argpartition(cossims, rank)[rank]
+    while result_i in ban_set:
+        rank -= 1
+        result_i = np.argpartition(cossims, rank)[rank]
+    return result_i
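
A short sketch exercising both utilities. Note that find_nearest_neighbor ranks by raw dot product, so the rows of `vectors` are assumed to be L2-normalized if cosine similarity is intended:

```
import numpy as np

from fastText import util

# Precision/recall at k=1 on toy predictions: both come out to 0.5 here.
predictions = [["__label__A"], ["__label__B"]]
labels = [["__label__A"], ["__label__C"]]
p1, r1 = util.test(predictions, labels)

# Nearest neighbor to vectors[0], ignoring the query itself.
vectors = np.random.randn(100, 10)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)
nn = util.find_nearest_neighbor(vectors[0], vectors, ban_set={0})
```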

+ 31 - 0
runtests.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+#
+
+# To run these tests you must first fetch all the required test data;
+# have a look at tests/fetch_test_data.sh.
+# Then point this script at the resulting data directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import unittest
+import argparse
+from fastText.tests import gen_tests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("data_dir", help="Full path to data directory")
+    args = parser.parse_args()
+    tests = gen_tests(args.data_dir)
+    suite = unittest.TestLoader().loadTestsFromTestCase(tests)
+    unittest.TextTestRunner(verbosity=3).run(suite)

+ 13 - 7
setup.py

@@ -18,7 +18,7 @@ import sys
 import setuptools
 import os
 
-__version__ = '0.0.3'
+__version__ = '0.0.6'
 FASTTEXT_SRC = "src"
 
 # Based on https://github.com/pybind/python_example
@@ -81,16 +81,18 @@ def has_flag(compiler, flagname):
 
 
 def cpp_flag(compiler):
-    """Return the -std=c++[11/14] compiler flag.
-    The c++14 is preferred over c++11 (when it is available).
+    """Return the -std=c++[0x/11/14] compiler flag.
+    c++14 is preferred over c++0x/11 (when it is available).
     """
     if has_flag(compiler, '-std=c++14'):
         return '-std=c++14'
     elif has_flag(compiler, '-std=c++11'):
         return '-std=c++11'
+    elif has_flag(compiler, '-std=c++0x'):
+        return '-std=c++0x'
     else:
         raise RuntimeError(
-            'Unsupported compiler -- at least C++11 support '
+            'Unsupported compiler -- at least C++0x support '
             'is needed!'
         )
 
@@ -124,7 +126,7 @@ class BuildExt(build_ext):
 
 
 setup(
-    name='fastTextpy',
+    name='fasttext',
     version=__version__,
     author='Christian Puhrsch',
     author_email='[email protected]',
@@ -135,7 +137,11 @@ setup(
     license='BSD',
     install_requires=['pybind11>=2.2', "setuptools >= 0.7.0"],
     cmdclass={'build_ext': BuildExt},
-    packages=[str('fastText')],
+    packages=[
+        str('fastText'),
+        str('fastText.util'),
+        str('fastText.tests'),
+    ],
     package_dir={str(''): str('python')},
-    zip_safe=False
+    zip_safe=False,
 )

+ 129 - 0
tests/fetch_test_data.sh

@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+#
+
+DATADIR=data
+
+report_error() {
+   echo "Error on line $1 of $0"
+}
+
+myshuf() {
+  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
+}
+
+normalize_text() {
+  tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
+    sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
+        -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
+        -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
+}
+
+set -e
+trap 'report_error $LINENO' ERR
+
+mkdir -p "${DATADIR}"
+
+data_result="${DATADIR}/dbpedia_csv.tar.gz"
+if [ ! -f "$data_result" ] || \
+   [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ]
+then
+  wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result"
+  tar -xzvf "$data_result" -C "${DATADIR}"
+fi
+
+data_result="${DATADIR}/dbpedia.train"
+if [ ! -f "$data_result" ]
+then
+  cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/dbpedia.test"
+if [ ! -f "$data_result" ]
+then
+  cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/rw_queries.txt"
+if [ ! -f "$data_result" ]
+then
+  cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/enwik9.zip"
+if [ ! -f "$data_result" ] || \
+   [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ]
+then
+  wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result"
+  unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/fil9"
+if [ ! -f "$data_result" ]
+then
+  perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result"
+fi
+
+data_result="${DATADIR}/rw/rw.txt"
+if [ ! -f "$data_result" ]
+then
+  wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
+  unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result"
+fi
+
+DATASET=(
+  ag_news
+  sogou_news
+  dbpedia
+  yelp_review_polarity
+  yelp_review_full
+  yahoo_answers
+  amazon_review_full
+  amazon_review_polarity
+)
+
+ID=(
+  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
+  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
+  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
+  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
+  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
+  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
+  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
+  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
+)
+
+# Small datasets first
+
+for i in {0..0}
+do
+  echo "Downloading dataset ${DATASET[i]}"
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
+  then
+    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
+    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
+    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
+  fi
+done
+
+# Large datasets require a bit more work due to the extra request page
+
+for i in {1..7}
+do
+  echo "Downloading dataset ${DATASET[i]}"
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
+  then
+    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
+    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
+    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
+    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
+  fi
+done
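
For reference, a rough Python equivalent of the normalize_text pipeline above; this is a sketch only, not guaranteed to match the sed/tr output byte for byte:

```
import random
import re

def normalize_text(lines):
    out = []
    for line in lines:
        line = "__label__" + line.lower()
        line = line.replace("'", " ' ").replace('"', '')
        line = line.replace('<br />', ' ')
        for ch in '.,()!?':
            line = line.replace(ch, ' ' + ch + ' ')
        line = re.sub(r'[;:]', ' ', line)
        out.append(re.sub(r' +', ' ', line))
    random.shuffle(out)  # stands in for myshuf
    return out
```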