
update the readme

Summary:
Add the options and how to use the quantization.

(Note: this ignores all push blocking failures!)

Reviewed By: piotr-bojanowski

Differential Revision: D4985621

fbshipit-source-id: 6877fee6ee0005e93bfc848b1007a3ac9a89a528
Armand Joulin 8 years ago
parent
commit
80a49bd724
3 changed files with 53 additions and 15 deletions
  1. README.md (+49 -11)
  2. quantization-example.sh (+3 -3)
  3. word-vector-example.sh (+1 -1)

+ 49 - 11
README.md

@@ -114,6 +114,19 @@ $ ./fasttext print-sentence-vectors model.bin < text.txt
 This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for.
 The program will output one vector representation per line in the file.
 
+You can also quantize a supervised model to reduce its memory usage with the following command:
+
+```
+$ ./fasttext quantize -output model
+```
+This will create a `.ftz` file with a smaller memory footprint. All the standard functionality, such as `test` or `predict`, works the same way on quantized models:
+```
+$ ./fasttext test model.ftz test.txt
+```
+The quantization procedure follows the steps described in [3](#fasttextzip-compressing-text-classification-models). You can
+run the script `quantization-example.sh` for an example.
+
+
 ## Full documentation
 
 Invoke a command without arguments to list available arguments and their default values:
@@ -126,25 +139,37 @@ The following arguments are mandatory:
   -input              training file path
   -output             output file path
 
-The following arguments are optional:
-  -lr                 learning rate [0.1]
+  The following arguments are optional:
+  -verbose            verbosity level [2]
+
+  The following arguments for the dictionary are optional:
+  -minCount           minimal number of word occurrences [5]
+  -minCountLabel      minimal number of label occurrences [0]
+  -wordNgrams         max length of word ngram [1]
+  -bucket             number of buckets [2000000]
+  -minn               min length of char ngram [3]
+  -maxn               max length of char ngram [6]
+  -t                  sampling threshold [0.0001]
+  -label              labels prefix [__label__]
+
+  The following arguments for training are optional:
+  -lr                 learning rate [0.05]
   -lrUpdateRate       change the rate of updates for the learning rate [100]
   -dim                size of word vectors [100]
   -ws                 size of the context window [5]
   -epoch              number of epochs [5]
-  -minCount           minimal number of word occurences [1]
-  -minCountLabel      minimal number of label occurences [0]
   -neg                number of negatives sampled [5]
-  -wordNgrams         max length of word ngram [1]
   -loss               loss function {ns, hs, softmax} [ns]
-  -bucket             number of buckets [2000000]
-  -minn               min length of char ngram [0]
-  -maxn               max length of char ngram [0]
   -thread             number of threads [12]
-  -t                  sampling threshold [0.0001]
-  -label              labels prefix [__label__]
-  -verbose            verbosity level [2]
   -pretrainedVectors  pretrained word vectors for supervised learning []
+  -saveOutput         whether output params should be saved [0]
+
+  The following arguments for quantization are optional:
+  -cutoff             number of words and ngrams to retain [0]
+  -retrain            finetune embeddings if a cutoff is applied [0]
+  -qnorm              quantizing the norm separately [0]
+  -qout               quantizing the classifier [0]
+  -dsub               size of each sub-vector [2]
 ```
 
 Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
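Taken together with the training options, the quantization flags above are used in a second `quantize` pass over the same training data. Below is a minimal dry-run sketch of that pipeline (the paths and option values are illustrative, mirroring `quantization-example.sh`; it prints the commands rather than running them, so it works before the binary is built):

```shell
#!/bin/sh
# Dry-run sketch: print each step of a train-then-quantize pipeline
# without requiring a built fasttext binary. Paths are illustrative.
run() { echo "+ $*"; }  # replace the echo with "$@" to actually execute

run ./fasttext supervised -input data/train.txt -output model \
    -dim 10 -lr 0.1 -wordNgrams 2 -epoch 5
# -cutoff/-retrain/-qnorm are the quantization options listed above;
# -retrain needs -input so the embeddings can be finetuned.
run ./fasttext quantize -output model -input data/train.txt \
    -qnorm -retrain -epoch 1 -cutoff 100000
run ./fasttext test model.ftz data/test.txt
```

Swapping the `echo` in `run` for `"$@"` turns the same script into the real pipeline once `./fasttext` exists.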
@@ -179,6 +204,19 @@ Please cite [1](#enriching-word-vectors-with-subword-information) if using this
 }
 ```
 
+### FastText.zip: Compressing text classification models
+
+[3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
+
+```
+@article{joulin2016fasttext,
+  title={FastText.zip: Compressing text classification models},
+  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
+  journal={arXiv preprint arXiv:1612.03651},
+  year={2016}
+}
+```
+
 (\* These authors contributed equally.)
 
 ## Resources
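Since `quantize -output model` writes `model.ftz` alongside the original `model.bin`, scripts can derive one path from the other with plain parameter expansion. A small sketch (the path is illustrative):

```shell
#!/bin/sh
# Derive the quantized model path from the original one, mirroring
# the naming convention used by `./fasttext quantize -output <prefix>`.
model_bin="result/dbpedia.bin"
model_ftz="${model_bin%.bin}.ftz"   # strip the .bin suffix, add .ftz
echo "${model_ftz}"
```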

+ 3 - 3
quantization-example.sh

@@ -34,7 +34,7 @@ echo "Quantizing..."
 echo "Testing original model..."
 ./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
 echo "Testing quantized model..."
-./fasttext test "${RESULTDIR}/dbpedia.ftz" "${DATADIR}/dbpedia.test" 1 1
+./fasttext test "${RESULTDIR}/dbpedia.ftz" "${DATADIR}/dbpedia.test"
 
-ls -lrh "${RESULTDIR}/dbpedia.bin" | awk  '{print "Size of the original model:\t",$5;}'
-ls -lrh "${RESULTDIR}/dbpedia.ftz" | awk  '{print "Size of the quantized model:\t",$5;}'
+wc -c < "${RESULTDIR}/dbpedia.bin" | awk '{print "Size of the original model:\t",$1;}'
+wc -c < "${RESULTDIR}/dbpedia.ftz" | awk '{print "Size of the quantized model:\t",$1;}'
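The switch from `ls -lrh` to `wc -c` makes the size report portable: the column layout of `ls` output differs across platforms, while `wc -c` on redirected input prints nothing but the byte count. A self-contained sketch with dummy files (the filenames are illustrative stand-ins for the model files):

```shell
#!/bin/sh
# Create two dummy "model" files of known size to stand in for
# dbpedia.bin and dbpedia.ftz.
printf 'abcdefghij' > original.bin   # 10 bytes
printf 'abcde'      > quantized.ftz  # 5 bytes

# Redirecting the file into wc -c avoids the filename column,
# so only the byte count is printed.
orig_size=$(wc -c < original.bin)
quant_size=$(wc -c < quantized.ftz)

echo "Size of the original model: ${orig_size}"
echo "Size of the quantized model: ${quant_size}"
```

Note that `wc -c < file` reads from stdin, so no filename appears in the output that would need stripping with `awk` or `cut`.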

+ 1 - 1
word-vector-example.sh

@@ -35,6 +35,6 @@ make
 
 cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "${DATADIR}"/queries.txt
 
-cat "${DATADIR}"/queries.txt | ./fasttext print-vectors "${RESULTDIR}"/fil9.bin > "${RESULTDIR}"/vectors.txt
+cat "${DATADIR}"/queries.txt | ./fasttext print-word-vectors "${RESULTDIR}"/fil9.bin > "${RESULTDIR}"/vectors.txt
 
 python eval.py -m "${RESULTDIR}"/vectors.txt -d "${DATADIR}"/rw/rw.txt