Jelajahi Sumber

Merge branch 'master' of github.com:facebookresearch/fastText

Armand Joulin 9 tahun lalu
induk
melakukan
8be24fe929
19 mengubah file dengan 81 tambahan dan 41 penghapusan
  1. 17 8
      README.md
  2. 1 1
      classification-example.sh
  3. 1 1
      classification-results.sh
  4. 8 2
      eval.py
  5. 3 1
      src/args.cc
  6. 2 2
      src/args.h
  7. 4 0
      src/dictionary.cc
  8. 4 3
      src/dictionary.h
  9. 12 10
      src/fasttext.cc
  10. 3 0
      src/matrix.cc
  11. 3 2
      src/matrix.h
  12. 3 0
      src/model.cc
  13. 5 4
      src/model.h
  14. 2 2
      src/real.h
  15. 3 0
      src/utils.cc
  16. 3 2
      src/utils.h
  17. 3 0
      src/vector.cc
  18. 3 2
      src/vector.h
  19. 1 1
      word-vector-example.sh

+ 17 - 8
README.md

@@ -4,7 +4,8 @@ fastText is a library for efficient learning of word representations and sentenc
 
 ## Requirements
 
-fastText uses C++11 features and therefore it requires a compiler with good C++11 support.
+**fastText** builds on modern Mac OS and Linux distributions.
+Since it uses C++11 features, it requires a compiler with good C++11 support.
 These include:
 
 * (gcc-4.6.3 or newer) or (clang-3.3 or newer)
@@ -31,11 +32,11 @@ If you do not plan on using the default system-wide compiler, update the two mac
 ## Example use cases
 
 This library has two main use cases: word representation learning and text classification.
-These were described in the two papers [1] and [2].
+These were described in the two papers [1](#enriching-word-vectors-with-subword-information) and [2](#bag-of-tricks-for-efficient-text-classification).
 
 ### Word representation learning
 
-In order to learn word vectors, as described in [1], do:
+In order to learn word vectors, as described in [1](#enriching-word-vectors-with-subword-information), do:
 
 ```
 $ ./fasttext skipgram -input data.txt -output model
@@ -75,7 +76,7 @@ will compile the code, download data, compute word vectors and evaluate them on
 ### Text classification
 
 This library can also be used to train supervised text classifiers, for instance for sentiment analysis.
-In order to train a text classifier using the method described in [2], use:
+In order to train a text classifier using the method described in [2](#bag-of-tricks-for-efficient-text-classification), use:
 
 ```
 $ ./fasttext supervised -input train.txt -output model
@@ -99,7 +100,7 @@ $ ./fasttext predict model.bin test.txt
 where `test.txt` contains a piece of text to classify per line.
 Doing so will output to the standard output the most likely label per line.
 See `classification-example.sh` for an example use case.
-In order to reproduce results from the paper [2], run `classification-results.sh`, this will download all the datasets and reproduce the results from Table 1.
+In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`; this will download all the datasets and reproduce the results from Table 1.
 
 ## Full documentation
 
@@ -128,9 +129,11 @@ The following arguments are optional:
 
 ## References
 
-Please cite [1] if using this code for learning word representations or [2] if using for text classification.
+Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification.
 
-[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, *Enriching Word Vectors with Subword Information*
+### Enriching Word Vectors with Subword Information
+
+[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/pdf/1607.04606v1.pdf)
 
 ```
 @article{bojanowski2016enriching,
@@ -141,7 +144,9 @@ Please cite [1] if using this code for learning word representations or [2] if u
 }
 ```
 
-[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, *Bag of Tricks for Efficient Text Classification*
+### Bag of Tricks for Efficient Text Classification
+
+[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/pdf/1607.01759v2.pdf)
 
 ```
 @article{joulin2016bag,
@@ -154,6 +159,10 @@ Please cite [1] if using this code for learning word representations or [2] if u
 
 (\* These authors contributed equally.)
 
+### Discussion on HN
+
+[3] Some valuable comments on [Hacker News](https://news.ycombinator.com/item?id=12226988).
+
 ## Join the fastText community
 
 * Facebook page: https://www.facebook.com/groups/1174547215919768

+ 1 - 1
classification-example.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env bash
 #
 # Copyright (c) 2016-present, Facebook, Inc.
 # All rights reserved.

+ 1 - 1
classification-results.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env bash
 #
 # Copyright (c) 2016-present, Facebook, Inc.
 # All rights reserved.

+ 8 - 2
eval.py

@@ -20,6 +20,12 @@ import os
 import math
 import argparse
 
+def compat_splitting(line):
+    if sys.version > "3":
+        return line.split()
+    else: # if version is 2
+        return line.decode('utf8').split()
+
 def similarity(v1, v2):
     n1 = np.linalg.norm(v1)
     n2 = np.linalg.norm(v2)
@@ -34,7 +40,7 @@ vectors = {}
 fin = open(args.modelPath, 'r')
 for i, line in enumerate(fin):
     try:
-        tab = line.decode('utf8').split()
+        tab = compat_splitting(line)
         vec = np.array(tab[1:], dtype=float)
         word = tab[0]
         if not word in vectors:
@@ -52,7 +58,7 @@ nwords = 0.0
 
 fin = open(args.dataPath, 'r')
 for line in fin:
-    tline = line.decode('utf8').split()
+    tline = compat_splitting(line)
     word1 = tline[0].lower()
     word2 = tline[1].lower()
     nwords = nwords + 1.0

+ 3 - 1
src/args.cc

@@ -8,8 +8,10 @@
  */
 
 #include "args.h"
-#include "stdlib.h"
+
+#include <stdlib.h>
 #include <string.h>
+
 #include <iostream>
 #include <fstream>
 

+ 2 - 2
src/args.h

@@ -7,8 +7,8 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef ARGS_H
-#define ARGS_H
+#ifndef FASTTEXT_ARGS_H
+#define FASTTEXT_ARGS_H
 
 #include <string>
 

+ 4 - 0
src/dictionary.cc

@@ -8,11 +8,14 @@
  */
 
 #include "dictionary.h"
+
 #include <assert.h>
+
 #include <iostream>
 #include <algorithm>
 #include <iterator>
 #include <unordered_map>
+
 #include "args.h"
 
 extern Args args;
@@ -258,6 +261,7 @@ int32_t Dictionary::getLine(std::ifstream& ifs,
 }
 
 std::string Dictionary::getLabel(int32_t lid) {
+  assert(lid >= 0 && lid < nlabels_);
   return words_[lid + nwords_].word;
 }
 

+ 4 - 3
src/dictionary.h

@@ -7,15 +7,16 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef DICTIONARY_H
-#define DICTIONARY_H
+#ifndef FASTTEXT_DICTIONARY_H
+#define FASTTEXT_DICTIONARY_H
 
-#include "real.h"
 #include <vector>
 #include <string>
 #include <fstream>
 #include <random>
 
+#include "real.h"
+
 typedef int32_t id_type;
 enum class entry_type : int8_t {word=0, label=1};
 

+ 12 - 10
src/fasttext.cc

@@ -7,23 +7,25 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#include "matrix.h"
-#include "vector.h"
-#include "dictionary.h"
-#include "model.h"
-#include "utils.h"
-#include "real.h"
-#include "args.h"
+#include <fenv.h>
+#include <time.h>
+#include <math.h>
+
 #include <iostream>
 #include <iomanip>
 #include <thread>
-#include <time.h>
 #include <string>
-#include <math.h>
 #include <vector>
 #include <atomic>
 #include <algorithm>
-#include <fenv.h>
+
+#include "matrix.h"
+#include "vector.h"
+#include "dictionary.h"
+#include "model.h"
+#include "utils.h"
+#include "real.h"
+#include "args.h"
 
 Args args;
 

+ 3 - 0
src/matrix.cc

@@ -8,8 +8,11 @@
  */
 
 #include "matrix.h"
+
 #include <assert.h>
+
 #include <random>
+
 #include "utils.h"
 #include "vector.h"
 

+ 3 - 2
src/matrix.h

@@ -7,11 +7,12 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef MATRIX_H
-#define MATRIX_H
+#ifndef FASTTEXT_MATRIX_H
+#define FASTTEXT_MATRIX_H
 
 #include <cstdint>
 #include <fstream>
+
 #include "real.h"
 
 class Vector;

+ 3 - 0
src/model.cc

@@ -8,8 +8,11 @@
  */
 
 #include "model.h"
+
 #include <assert.h>
+
 #include <algorithm>
+
 #include "args.h"
 #include "utils.h"
 

+ 5 - 4
src/model.h

@@ -7,14 +7,15 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef MODEL_H
-#define MODEL_H
+#ifndef FASTTEXT_MODEL_H
+#define FASTTEXT_MODEL_H
+
+#include <vector>
+#include <random>
 
 #include "matrix.h"
 #include "vector.h"
 #include "real.h"
-#include <vector>
-#include <random>
 
 struct Node {
   int32_t parent;

+ 2 - 2
src/real.h

@@ -7,8 +7,8 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef REAL_H
-#define REAL_H
+#ifndef FASTTEXT_REAL_H
+#define FASTTEXT_REAL_H
 
 typedef float real;
 

+ 3 - 0
src/utils.cc

@@ -8,6 +8,7 @@
  */
 
 #include "utils.h"
+
 #include <cmath>
 #include <ios>
 
@@ -58,6 +59,8 @@ namespace utils {
   void freeTables() {
     delete[] t_sigmoid;
     delete[] t_log;
+    t_sigmoid = nullptr;
+    t_log = nullptr;
   }
 
   int64_t size(std::ifstream& ifs) {

+ 3 - 2
src/utils.h

@@ -7,10 +7,11 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef UTILS_H
-#define UTILS_H
+#ifndef FASTTEXT_UTILS_H
+#define FASTTEXT_UTILS_H
 
 #include <fstream>
+
 #include "real.h"
 
 #define SIGMOID_TABLE_SIZE 512

+ 3 - 0
src/vector.cc

@@ -8,9 +8,12 @@
  */
 
 #include "vector.h"
+
 #include <assert.h>
+
 #include <iostream>
 #include <iomanip>
+
 #include "matrix.h"
 #include "utils.h"
 

+ 3 - 2
src/vector.h

@@ -7,11 +7,12 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-#ifndef VECTOR_H
-#define VECTOR_H
+#ifndef FASTTEXT_VECTOR_H
+#define FASTTEXT_VECTOR_H
 
 #include <cstdint>
 #include <fstream>
+
 #include "real.h"
 
 class Matrix;

+ 1 - 1
word-vector-example.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env bash
 #
 # Copyright (c) 2016-present, Facebook, Inc.
 # All rights reserved.