# JasonWang / fastText
# Mirror of https://github.com/facebookresearch/fastText
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.

from __future__ import absolute_import
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from fastText import train_supervised
from fastText import train_unsupervised
from fastText import load_model
from fastText import tokenize
import random
import sys
import os
import subprocess
import multiprocessing
import numpy as np
import unittest
import tempfile
import math
from scipy import stats


def compat_splitting(line):
    """Decode a UTF-8 byte string and split it on whitespace.

    Returns the list of decoded tokens; raises UnicodeDecodeError when
    ``line`` is not valid UTF-8.
    """
    decoded = line.decode('utf8')
    return decoded.split()


def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    The dot product is divided by the Euclidean norm of ``v1`` and then by
    the norm of ``v2``.
    """
    inner = np.dot(v1, v2)
    return inner / np.linalg.norm(v1) / np.linalg.norm(v2)


def read_vectors(model_path):
    """Load word vectors from a whitespace-separated text file.

    Every line is read as ``word c1 c2 ...`` where the components are parsed
    as floats. Parsing is best-effort; the following lines are skipped:

    * lines that are not valid UTF-8,
    * blank lines,
    * lines whose components cannot be converted to float,
    * lines whose vector has zero norm (including a word with no components).

    When a word occurs more than once, the first vector is kept.

    NOTE(review): a two-token header line such as ``"<count> <dim>"`` is not
    treated specially and is parsed like any other line.

    Args:
        model_path: path of the vector file to read.

    Returns:
        dict mapping each word to a 1-D numpy float array.
    """
    vectors = {}
    with open(model_path, 'rb') as fin:
        for line in fin:
            try:
                tab = line.decode('utf8').split()
            except UnicodeDecodeError:
                continue
            # A blank line has no word token; skip it instead of failing on
            # tab[0] with an IndexError.
            if not tab:
                continue
            try:
                vec = np.array(tab[1:], dtype=float)
            except ValueError:
                continue
            if np.linalg.norm(vec) == 0:
                continue
            # setdefault keeps the first vector seen for a word.
            vectors.setdefault(tab[0], vec)
    return vectors


def compute_similarity(model_path, data_path, vectors=None):
    """Score word vectors against a word-pair similarity dataset.

    Each line of ``data_path`` is split into tokens; the first two are the
    (lower-cased) words and the third is the reference score. Pairs for which
    either word has no vector are counted as out-of-vocabulary and excluded
    from the correlation.

    Args:
        model_path: vector file handed to ``read_vectors`` when ``vectors``
            is empty or None.
        data_path: path of the word-pair dataset.
        vectors: optional mapping from word to vector.

    Returns:
        Tuple ``(dataset, correlation, oov)``: the basename of ``data_path``,
        the Spearman correlation multiplied by 100, and the percentage of
        dropped pairs rounded up.
    """
    if not vectors:
        vectors = read_vectors(model_path)

    predicted, expected = [], []
    total = 0.0
    missing = 0.0

    with open(data_path, 'rb') as fin:
        for raw in fin:
            fields = compat_splitting(raw)
            first = fields[0].lower()
            second = fields[1].lower()
            total += 1.0

            if first not in vectors or second not in vectors:
                missing += 1.0
                continue

            predicted.append(similarity(vectors[first], vectors[second]))
            expected.append(float(fields[2]))

    rho = stats.spearmanr(predicted, expected)[0]
    oov = math.ceil(missing / total * 100.0)
    return os.path.basename(data_path), rho * 100, oov


def get_random_unicode(length):
    """Return a random string of ``length`` characters.

    Characters are drawn uniformly, with replacement, from the code point
    ranges listed below (bounds inclusive).
    """
    # See: https://stackoverflow.com/questions/1477294/generate-random-utf-8-string-in-python

    # Python 2 needs unichr for code points above 255; Python 3 only has chr.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Update this to include code point ranges to be sampled
    code_point_ranges = [
        (0x0021, 0x0021),
        (0x0023, 0x0026),
        (0x0028, 0x007E),
        (0x00A1, 0x00AC),
        (0x00AE, 0x00FF),
        (0x0100, 0x017F),
        (0x0180, 0x024F),
        (0x2C60, 0x2C7F),
        (0x16A0, 0x16F0),
        (0x0370, 0x0377),
        (0x037A, 0x037E),
        (0x0384, 0x038A),
        (0x038C, 0x038C),
    ]

    alphabet = []
    for first, last in code_point_ranges:
        alphabet.extend(to_char(point) for point in range(first, last + 1))

    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)


def get_random_words(N, a, b):
    """Return ``N`` random strings whose lengths are drawn from ``[a, b]``.

    Each length comes from ``random.randint(a, b)`` (both bounds inclusive)
    and each string from ``get_random_unicode``.
    """
    return [get_random_unicode(random.randint(a, b)) for _ in range(N)]


class TestFastTextPy(unittest.TestCase):
    """Shared helpers for the fastText Python binding tests.

    The helpers read the class attributes ``bin`` (command line binary),
    ``data_dir`` and ``result_dir``; this class does not set them itself.
    """

    @classmethod
    def eprint(cls, *args, **kwargs):
        """Forward the arguments to ``print``, writing to stderr."""
        print(*args, file=sys.stderr, **kwargs)

    @classmethod
    def num_thread(cls):
        """Return the thread count to train with: all CPUs but one.

        The result is clamped to at least 1 so that a single-CPU host does
        not request zero threads.
        """
        return max(1, multiprocessing.cpu_count() - 1)

    @classmethod
    def build_paths(cls, train, test, output):
        """Resolve file names against ``data_dir`` and ``result_dir``.

        Returns:
            Tuple ``(train, test, output)`` where ``train`` and ``test`` are
            joined onto ``cls.data_dir`` and ``output`` onto
            ``cls.result_dir``.
        """
        train = os.path.join(cls.data_dir, train)
        test = os.path.join(cls.data_dir, test)
        output = os.path.join(cls.result_dir, output)
        return train, test, output

    @classmethod
    def build_train_args(cls, params, mode, train, output):
        """Build the command line for a training run.

        Args:
            params: extra options as a single string, split on single
                spaces.
            mode: sub-command passed to the binary (e.g. ``'supervised'``).
            train: path given to ``-input``.
            output: path given to ``-output``.
        """
        args = [cls.bin, mode, "-input", train, "-output", output]
        return args + params.split(' ')

    @classmethod
    def get_train_output(cls, train_args):
        """Run a training command and return its stdout decoded as UTF-8."""
        cls.eprint("Executing: " + ' '.join(train_args))
        return subprocess.check_output(train_args).decode('utf-8')

    @classmethod
    def get_path_size(cls, path):
        """Return the size of ``path`` in bytes.

        Uses ``os.path.getsize`` rather than spawning ``stat -c``, whose
        ``-c`` option is not available on every platform.
        """
        return int(os.path.getsize(path))

    @classmethod
    def default_test_args(cls, model, test, quantize=False):
        """Build the command line for the ``test`` sub-command.

        ``quantize`` is accepted but not used.
        """
        return [cls.bin, "test", model, test]

    @classmethod
    def get_test_output(cls, test_args):
        """Run a command and return the second tab-separated field per line.

        The stdout is decoded as UTF-8 and split on newlines; the final
        piece (what follows the last newline) is discarded.
        """
        cls.eprint("Executing: " + ' '.join(test_args))
        test_output = subprocess.check_output(test_args)
        test_output = test_output.decode('utf-8')
        cls.eprint("Test output:\n" + test_output)
        return [
            line.split('\t')[1] for line in test_output.split('\n')[:-1]
        ]

    @classmethod
    def train_generic_classifier(cls, train, output):
        """Train a supervised model from ``train`` with the binary."""
        thread = cls.num_thread()
        cls.eprint("Using {} threads".format(thread))
        sup_params = (
            "-dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 "
            "-epoch 5 -thread {}".format(thread)
        )
        mode = 'supervised'
        cls.get_train_output(
            cls.build_train_args(sup_params, mode, train, output)
        )

    @classmethod
    def train_generic_embeddings(cls, train, output):
        """Train a cbow embedding model from ``train`` with the binary."""
        thread = cls.num_thread()
        cls.eprint("Using {} threads".format(thread))
        unsup_params = (
            "-thread {} -lr 0.025 -dim 100 -ws 5 -epoch 1 -minCount 5 "
            "-neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 -t 1e-4 "
            "-lrUpdateRate 100".format(thread)
        )
        mode = 'cbow'
        cls.get_train_output(
            cls.build_train_args(unsup_params, mode, train, output)
        )

    def get_predictions_from_list(self, output, words, k):
        """Run ``predict-prob`` on ``words`` through the binary's stdin.

        Args:
            output: model path without the ``.bin`` suffix.
            words: inputs, sent one per line; an empty list sends nothing.
            k: number of predictions requested per input.

        Returns:
            Tuple ``(stdout, stderr, returncode)``. ``stdout`` is decoded as
            UTF-8; ``stderr`` is None because it is not piped.
        """
        args = [self.bin, "predict-prob", output + '.bin', '-', str(k)]
        self.eprint("Executing: " + ' '.join(args))
        p = subprocess.Popen(
            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
        test_text = ""
        if words:
            test_text = '\n'.join(words) + '\n'
        test_text = test_text.encode('utf-8')
        stdout, stderr = p.communicate(test_text)
        stdout = stdout.decode('utf-8')
        return stdout, stderr, p.returncode

    def get_word_vectors_from_list(self, output, words):
        """Run ``print-word-vectors`` on ``words`` and return raw stdout.

        The words are joined with newlines (no trailing newline), encoded as
        UTF-8 and written to the binary's stdin. The returned value is the
        undecoded stdout bytes.
        """
        args = [self.bin, "print-word-vectors", output + '.bin']
        self.eprint("Executing: " + ' '.join(args))
        p = subprocess.Popen(
            args, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
        test_text = '\n'.join(words).encode('utf-8')
        stdout, stderr = p.communicate(test_text)
        return stdout


class TestFastTextPyUnit(TestFastTextPy):
    """Unit tests for the Python bindings, checked against the CLI binary.

    ``setUpClass`` trains one embedding model (on ``fil9``) and one
    supervised model (on ``dbpedia.train``) with the command line binary;
    the tests load the resulting ``.bin`` files through ``load_model``.
    """

    @classmethod
    def setUpClass(cls):
        """Read FASTTEXT_BIN / FASTTEXT_DATA and train both shared models."""
        cls.bin = os.environ['FASTTEXT_BIN']
        cls.data_dir = os.environ['FASTTEXT_DATA']
        cls.result_dir = tempfile.mkdtemp()
        train, _, output = cls.build_paths("fil9", "rw/rw.txt", "fil9")
        cls.train_generic_embeddings(train, output)
        cls.output = output
        train, _, output_sup = cls.build_paths(
            "dbpedia.train", "dbpedia.test", "dbpedia"
        )
        cls.train_generic_classifier(train, output_sup)
        cls.output_sup = output_sup

    @classmethod
    def tearDownClass(cls):
        """Currently a no-op: ``result_dir`` is not removed."""
        pass
        # shutil.rmtree(cls.result_dir)

    # Check if get_word_vector aligns with vectors from stdin
    def test_getvector(self):
        """Compare ``get_word_vector`` with the CLI's print-word-vectors."""
        f = load_model(self.output + '.bin')
        words, _ = f.get_words(include_freq=True)
        words += get_random_words(100, 1, 100)
        ftbin_vectors = self.get_word_vectors_from_list(self.output, words)
        ftbin_vectors = ftbin_vectors.decode('utf-8').split('\n')[:-1]
        for v in ftbin_vectors:
            fields = v.split(' ')
            word = fields[0]
            # The last piece after the final space is dropped.
            vector = np.array(list(map(float, fields[1:-1])))
            pvec = f.get_word_vector(word)
            # The fasttext cli returns floats with 5 digits,
            # but we use the full 6 digits.
            self.assertTrue(np.allclose(vector, pvec, rtol=1e-04))

    def test_predict(self):
        """Exercise ``predict`` on random inputs alongside the CLI."""
        # TODO: I went a little crazy here as an exercise for
        # a rigorous test case. This could be turned into
        # a few utility functions.
        f = load_model(self.output_sup + '.bin')

        def _test(N, min_length, max_length, k, add_vocab=0):
            words = get_random_words(N, min_length, max_length)
            if add_vocab > 0:
                vocab, _ = f.get_words(include_freq=True)
                for _ in range(add_vocab):
                    # randint includes both bounds, so the upper bound must
                    # be the last valid index, not len(vocab).
                    ind = random.randint(0, len(vocab) - 1)
                    words.append(vocab[ind])
            all_labels = []
            all_probs = []
            gotError = False
            for w in words:
                try:
                    labels, probs = f.predict(w, k)
                except ValueError:
                    gotError = True
                    continue
                all_labels.append(labels)
                all_probs.append(probs)
            preds, _, retcode = self.get_predictions_from_list(
                self.output_sup, words, k
            )
            if gotError and retcode == 0:
                self.eprint(
                    "Didn't get error. Make sure your compiled "
                    "binary kept the assert statements"
                )
                self.assertTrue(False)
            else:
                # NOTE(review): this branch returns unconditionally, so the
                # comparison below never runs. It is left untouched because
                # enabling it would change what this test asserts; confirm
                # the intended control flow before removing the return.
                return
            preds = preds.split('\n')[:-1]
            self.assertEqual(len(preds), len(all_labels))
            for i in range(len(preds)):
                labels = preds[i].split()
                probs = np.array(list(map(float, labels[1::2])))
                labels = np.array(labels[::2])
                self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04))
                self.assertTrue(np.array_equal(labels, all_labels[i]))

        _test(0, 0, 0, 0)
        _test(1, 0, 0, 0)
        _test(10, 0, 0, 0)
        _test(1, 1, 1, 0)
        _test(1, 1, 1, 1)
        _test(1, 2, 3, 0)
        _test(1, 2, 3, 1)
        _test(10, 1, 1, 1)
        _test(1, 1, 1, 0, add_vocab=10)
        _test(1, 1, 1, 1, add_vocab=10)
        _test(1, 2, 3, 0, add_vocab=10)
        _test(1, 2, 3, 1, add_vocab=10)
        reach = 10
        for _ in range(10):
            N = random.randint(0, reach)
            init = random.randint(0, reach)
            offset = random.randint(0, reach)
            k = random.randint(0, reach)
            _test(N, init, init + offset, k)

    def test_vocab(self):
        """Smoke test: ``get_words(include_freq=True)`` returns a pair."""
        f = load_model(self.output + '.bin')
        words, freq = f.get_words(include_freq=True)
        self.eprint(
            "There is no way to access words from the cli yet. "
            "Therefore there can be no rigorous test."
        )

    def test_subwords(self):
        """Smoke test: ``get_subwords`` runs on vocabulary and random words."""
        f = load_model(self.output + '.bin')
        words, _ = f.get_words(include_freq=True)
        words += get_random_words(10, 1, 10)
        for w in words:
            f.get_subwords(w)
        self.eprint(
            "There is no way to access words from the cli yet. "
            "Therefore there can be no test."
        )

    def test_tokenize(self):
        """Smoke test: ``tokenize`` runs on the whole training file."""
        train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9")
        with open(train, 'r') as f:
            _ = tokenize(f.read())

    def test_dimension(self):
        """Smoke test: ``get_dimension`` can be called."""
        f = load_model(self.output + '.bin')
        f.get_dimension()

    def test_subword_vector(self):
        """Check four ways of building a word vector agree within 1e-5."""
        f = load_model(self.output + '.bin')
        words, _ = f.get_words(include_freq=True)
        words += get_random_words(10000, 1, 200)
        input_matrix = f.get_input_matrix()
        for word in words:

            # Universal api to get word vector
            vec1 = f.get_word_vector(word)

            # Build word vector from subwords
            subwords, subinds = f.get_subwords(word)
            subvectors = np.stack([f.get_input_vector(i) for i in subinds])
            vec2 = np.sum((subvectors / len(subwords)), 0)

            # Build word vector from subinds
            vec3 = np.sum(input_matrix[subinds] / len(subinds), 0)

            # Build word vectors from word and subword ids
            wid = f.get_word_id(word)
            if wid >= 0:
                # The first subword entry is skipped and the word's own id
                # is appended in its place.
                swids = [f.get_subword_id(s) for s in subwords[1:]]
                swids.append(wid)
            else:
                swids = [f.get_subword_id(s) for s in subwords]
            swids = np.array(swids)
            vec4 = np.sum(input_matrix[swids] / len(swids), 0)

            self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all())

    # TODO: Compare with .vec file
    def test_get_words(self):
        """Check ``get_words`` lengths agree with and without frequencies."""
        f = load_model(self.output + '.bin')
        words1, freq1 = f.get_words(include_freq=True)
        words2 = f.get_words(include_freq=False)
        self.assertEqual(len(words1), len(words2))
        self.assertEqual(len(words1), len(freq1))
        f = load_model(self.output_sup + '.bin')
        words1, freq1 = f.get_words(include_freq=True)
        words2 = f.get_words(include_freq=False)
        self.assertEqual(len(words1), len(words2))
        self.assertEqual(len(words1), len(freq1))

    # TODO: Compare with .vec file for unsup
    def test_get_labels(self):
        """Check ``get_labels`` on both models.

        For the embedding model the labels are additionally required to
        equal the words, element by element.
        """
        f = load_model(self.output + '.bin')
        labels1, freq1 = f.get_labels(include_freq=True)
        labels2 = f.get_labels(include_freq=False)
        words2 = f.get_words(include_freq=False)
        self.assertEqual(len(labels1), len(labels2))
        self.assertEqual(len(labels1), len(freq1))
        self.assertEqual(len(labels1), len(words2))
        for w1, w2 in zip(labels2, words2):
            self.assertEqual(w1, w2)
        f = load_model(self.output_sup + '.bin')
        labels1, freq1 = f.get_labels(include_freq=True)
        labels2 = f.get_labels(include_freq=False)
        self.assertEqual(len(labels1), len(labels2))
        self.assertEqual(len(labels1), len(freq1))

    def test_exercise_is_quant(self):
        """Check ``quantize`` / ``is_quantized`` on both models.

        Quantizing the embedding model must raise ValueError; the supervised
        model must report quantized only after ``quantize`` is called.
        """
        f = load_model(self.output + '.bin')
        with self.assertRaises(ValueError):
            f.quantize()
        f = load_model(self.output_sup + '.bin')
        self.assertTrue(not f.is_quantized())
        f.quantize()
        self.assertTrue(f.is_quantized())

    def test_newline_predict_sentence(self):
        """Check that input ending in a newline raises ValueError.

        Applies to ``predict`` on the supervised model and to
        ``get_sentence_vector`` on the embedding model; the same input
        without the newline must be accepted.
        """
        f = load_model(self.output_sup + '.bin')
        sentence = get_random_words(1, 1000, 2000)[0]
        f.predict(sentence, k=5)
        sentence += "\n"
        with self.assertRaises(ValueError):
            f.predict(sentence, k=5)

        f = load_model(self.output + '.bin')
        sentence = get_random_words(1, 1000, 2000)[0]
        f.get_sentence_vector(sentence)
        sentence += "\n"
        with self.assertRaises(ValueError):
            f.get_sentence_vector(sentence)


class TestFastTextPyIntegration(TestFastTextPy):
    """Integration tests that train models through the Python API."""

    @classmethod
    def setUpClass(cls):
        """Read FASTTEXT_BIN / FASTTEXT_DATA and create a result directory."""
        environment = os.environ
        cls.bin = environment['FASTTEXT_BIN']
        cls.data_dir = environment['FASTTEXT_DATA']
        cls.result_dir = tempfile.mkdtemp()

    def test_unsup1(self):
        """Train skipgram on fil9 and check correlation, OOV and model size."""
        train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9")

        hyper_params = dict(
            model="skipgram",
            lr=0.025,
            dim=100,
            ws=5,
            epoch=1,
            minCount=5,
            neg=5,
            loss="ns",
            bucket=2000000,
            minn=3,
            maxn=6,
            t=1e-4,
            lrUpdateRate=100,
        )
        model = train_unsupervised(
            input=train, thread=self.num_thread(), **hyper_params
        )
        model.save_model(output)
        path_size = self.get_path_size(output)

        # Collect a vector for both words of every pair in the test file.
        pair_vectors = {}
        with open(test, 'r') as pairs:
            for row in pairs:
                tokens = row.split()
                first = tokens[0].strip()
                second = tokens[1].strip()
                first_vec = model.get_word_vector(first)
                second_vec = model.get_word_vector(second)
                pair_vectors[first] = first_vec
                pair_vectors[second] = second_vec

        _, correlation, oov = compute_similarity(None, test, pair_vectors)
        correlation = np.around(correlation)

        correlation_msg = "Correlation: Want: 41 Is: " + str(correlation)
        self.assertTrue(correlation >= 41, correlation_msg)

        oov_msg = "Oov: Want: 0 Is: " + str(oov)
        self.assertEqual(oov, 0.0, oov_msg)

        size_msg = "Size: Want: 978480868 Is: " + str(path_size)
        self.assertEqual(path_size, 978480868, size_msg)


def gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size):
    """Build a supervised-training test method for one dataset.

    The returned function trains on ``<dataset>.train``, saves the model and
    checks it against ``<dataset>.test``, then quantizes the model, saves it
    with an ``.ftz`` suffix and checks again.

    Args:
        lr: learning rate passed to ``train_supervised``.
        dataset: base name of the train/test files and of the output model.
        n: expected first value of the CLI test output (compared as text).
        p1, r1: lower bounds for the second and third test output values.
        p1_q, r1_q: the same lower bounds for the quantized model.
        size: exact expected size in bytes of the saved model.
        quant_size: upper bound in bytes for the quantized model.

    Returns:
        A function taking the test case instance as its only argument.
    """

    def sup_test(self):
        def check(model_path, test_path, want_n, want_p1, want_r1, want_size,
                  lessthan):
            # Evaluate the saved model with the CLI and verify its metrics.
            metrics = self.get_test_output(
                self.default_test_args(model_path, test_path)
            )
            self.assertEqual(
                str(metrics[0]), str(want_n),
                "N: Want: " + str(want_n) + " Is: " + str(metrics[0])
            )
            self.assertTrue(
                float(metrics[1]) >= float(want_p1),
                "p1: Want: " + str(want_p1) + " Is: " + str(metrics[1])
            )
            self.assertTrue(
                float(metrics[2]) >= float(want_r1),
                "r1: Want: " + str(want_r1) + " Is: " + str(metrics[2])
            )

            # The size is either an upper bound or an exact match.
            path_size = self.get_path_size(model_path)
            if lessthan:
                size_ok = path_size <= want_size
                size_msg = (
                    "Size: Want at most: " + str(want_size) + " Is: " +
                    str(path_size)
                )
            else:
                size_ok = path_size == want_size
                size_msg = (
                    "Size: Want: " + str(want_size) + " Is: " + str(path_size)
                )
            self.assertTrue(size_ok, size_msg)

        train, test, output = self.build_paths(
            dataset + ".train", dataset + ".test", dataset
        )
        model = train_supervised(
            input=train,
            dim=10,
            lr=lr,
            wordNgrams=2,
            minCount=1,
            bucket=10000000,
            epoch=5,
            thread=self.num_thread()
        )
        model.save_model(output)
        check(output, test, n, p1, r1, size, False)

        # Exercising
        model.predict("hello world")
        model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
        quantized_output = output + ".ftz"
        model.save_model(quantized_output)

        # Exercising
        model.predict("hello world")
        check(quantized_output, test, n, p1_q, r1_q, quant_size, True)

    return sup_test


if __name__ == "__main__":
    # Per-dataset settings and expectations for the generated supervised
    # tests. The lists are parallel: index i of every list belongs to
    # sup_job_dataset[i].
    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940]
    # Exact size in bytes expected for each saved model.
    sup_job_size = [
        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
        483742593, 493604598
    ]
    # Upper bound in bytes for each quantized model.
    sup_job_quant_size = [
        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575010
    ]
    # Yelp_review_full can be a bit flaky
    sup_job_dataset = [
        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
        "yelp_review_full", "yahoo_answers", "amazon_review_full",
        "amazon_review_polarity"
    ]
    sup_job_args = [
        sup_job_lr, sup_job_dataset, sup_job_n, sup_job_p1, sup_job_r1,
        sup_job_quant_p1, sup_job_quant_r1, sup_job_size, sup_job_quant_size
    ]
    # Attach one generated test method per dataset before unittest collects
    # the tests of TestFastTextPyIntegration.
    for lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size in zip(
        *sup_job_args
    ):
        setattr(
            TestFastTextPyIntegration, "test_" + dataset,
            gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size)
        )
    unittest.main()