
update the readme

Summary:
Add the quantization options and explain how to use quantization.

(Note: this ignores all push blocking failures!)

Reviewed By: piotr-bojanowski

Differential Revision: D4985621

fbshipit-source-id: 6877fee6ee0005e93bfc848b1007a3ac9a89a528
Armand Joulin, 8 years ago
parent commit 80a49bd724
3 changed files with 53 additions and 15 deletions
  1. README.md (+49 -11)
  2. quantization-example.sh (+3 -3)
  3. word-vector-example.sh (+1 -1)

README.md (+49 -11)

@@ -114,6 +114,19 @@ $ ./fasttext print-sentence-vectors model.bin < text.txt
 This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for.
 The program will output one vector representation per line in the file.
 
+You can also quantize a supervised model to reduce its memory usage with the following command:
+
+```
+$ ./fasttext quantize -output model
+```
+This will create a `.ftz` file with a smaller memory footprint. All the standard functionality, like `test` or `predict`, works the same way on quantized models:
+```
+$ ./fasttext test model.ftz test.txt
+```
+The quantization procedure follows the steps described in [3](#fastext-zip). You can
+run the script `quantization-example.sh` for an example.
+
+
 ## Full documentation
 
 Invoke a command without arguments to list available arguments and their default values:
@@ -126,25 +139,37 @@ The following arguments are mandatory:
   -input              training file path
   -output             output file path
 
-The following arguments are optional:
-  -lr                 learning rate [0.1]
+  The following arguments are optional:
+  -verbose            verbosity level [2]
+
+  The following arguments for the dictionary are optional:
+  -minCount           minimal number of word occurrences [5]
+  -minCountLabel      minimal number of label occurrences [0]
+  -wordNgrams         max length of word ngram [1]
+  -bucket             number of buckets [2000000]
+  -minn               min length of char ngram [3]
+  -maxn               max length of char ngram [6]
+  -t                  sampling threshold [0.0001]
+  -label              labels prefix [__label__]
+
+  The following arguments for training are optional:
+  -lr                 learning rate [0.05]
   -lrUpdateRate       change the rate of updates for the learning rate [100]
   -dim                size of word vectors [100]
   -ws                 size of the context window [5]
   -epoch              number of epochs [5]
-  -minCount           minimal number of word occurences [1]
-  -minCountLabel      minimal number of label occurences [0]
   -neg                number of negatives sampled [5]
-  -wordNgrams         max length of word ngram [1]
   -loss               loss function {ns, hs, softmax} [ns]
-  -bucket             number of buckets [2000000]
-  -minn               min length of char ngram [0]
-  -maxn               max length of char ngram [0]
   -thread             number of threads [12]
-  -t                  sampling threshold [0.0001]
-  -label              labels prefix [__label__]
-  -verbose            verbosity level [2]
   -pretrainedVectors  pretrained word vectors for supervised learning []
+  -saveOutput         whether output params should be saved [0]
+
+  The following arguments for quantization are optional:
+  -cutoff             number of words and ngrams to retain [0]
+  -retrain            finetune embeddings if a cutoff is applied [0]
+  -qnorm              quantizing the norm separately [0]
+  -qout               quantizing the classifier [0]
+  -dsub               size of each sub-vector [2]
 ```
 
 Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
@@ -179,6 +204,19 @@ Please cite [1](#enriching-word-vectors-with-subword-information) if using this
 }
 ```
 
+### FastText.zip: Compressing text classification models
+
+[3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
+
+```
+@article{joulin2016fasttext,
+  title={FastText.zip: Compressing text classification models},
+  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
+  journal={arXiv preprint arXiv:1612.03651},
+  year={2016}
+}
+```
+
 (\* These authors contributed equally.)
 
 ## Resources

quantization-example.sh (+3 -3)

@@ -34,7 +34,7 @@ echo "Quantizing..."
 echo "Testing original model..."
 ./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
 echo "Testing quantized model..."
-./fasttext test "${RESULTDIR}/dbpedia.ftz" "${DATADIR}/dbpedia.test" 1 1
+./fasttext test "${RESULTDIR}/dbpedia.ftz" "${DATADIR}/dbpedia.test"
 
-ls -lrh "${RESULTDIR}/dbpedia.bin" | awk  '{print "Size of the original model:\t",$5;}'
-ls -lrh "${RESULTDIR}/dbpedia.ftz" | awk  '{print "Size of the quantized model:\t",$5;}'
+wc -c < "${RESULTDIR}/dbpedia.bin" | awk '{print "Size of the original model:\t",$1;}'
+wc -c < "${RESULTDIR}/dbpedia.ftz" | awk '{print "Size of the quantized model:\t",$1;}'
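The switch from `ls -lrh` to `wc -c <` makes the size report deterministic: `ls` column layout varies across platforms and locales, while `wc -c` reading stdin prints only a byte count. A minimal sketch of the same reporting pattern, using hypothetical stand-in files instead of the real `dbpedia.bin`/`dbpedia.ftz` models:

```shell
# Hypothetical stand-ins for the original and quantized models.
head -c 4096 /dev/zero > original.model
head -c 1024 /dev/zero > quantized.model

# Reading via `<` suppresses the filename in wc's output, so $1 is
# the byte count regardless of how ls would have formatted a listing.
wc -c < original.model  | awk '{print "Size of the original model:\t",$1;}'
wc -c < quantized.model | awk '{print "Size of the quantized model:\t",$1;}'
```

Unlike `ls -lrh`, this reports exact bytes rather than a human-readable approximation, which also makes the two sizes directly comparable.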

word-vector-example.sh (+1 -1)

@@ -35,6 +35,6 @@ make
 
 cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "${DATADIR}"/queries.txt
 
-cat "${DATADIR}"/queries.txt | ./fasttext print-vectors "${RESULTDIR}"/fil9.bin > "${RESULTDIR}"/vectors.txt
+cat "${DATADIR}"/queries.txt | ./fasttext print-word-vectors "${RESULTDIR}"/fil9.bin > "${RESULTDIR}"/vectors.txt
 
 python eval.py -m "${RESULTDIR}"/vectors.txt -d "${DATADIR}"/rw/rw.txt
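The `cut | awk | tr` line in this script reshapes the Rare Words TSV into one lowercase word per line before piping it into `print-word-vectors`. A small self-contained sketch of that preprocessing step, with a hypothetical two-line sample standing in for `rw.txt`:

```shell
# Hypothetical sample in the rw.txt layout: word1<TAB>word2<TAB>ratings...
printf 'Squishing\tSquirt\t5.88\nUndated\tUndatable\t5.83\n' > rw_sample.txt

# Keep the two word columns, lowercase everything, then split the
# tab-separated pairs so each word lands on its own line.
cut -f 1,2 rw_sample.txt | awk '{print tolower($0)}' | tr '\t' '\n' > queries_sample.txt

cat queries_sample.txt
```

fastText then reads these queries on stdin and emits one vector per word, which `eval.py` compares against the human similarity ratings.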