Browse code

WebAssembly

Summary: This commit introduces WebAssembly module for fastText.

Reviewed By: EdouardGrave

Differential Revision: D19021740

fbshipit-source-id: e378f0bb70c0e1f4d6382e1e45af03d1e6ddb4f1
Onur Çelebi 5 years ago
parent
commit
13096686fc

+ 4 - 1
.gitignore

@@ -2,8 +2,11 @@
 *.o
 *.bin
 *.vec
+*.bc
+.DS_Store
 data
 fasttext
 result
 website/node_modules/
-
+package-lock.json
+node_modules/

+ 60 - 1
Makefile

@@ -20,6 +20,12 @@ coverage: fasttext
 debug: CXXFLAGS += -g -O0 -fno-inline
 debug: fasttext
 
+wasm: webassembly/fasttext_wasm.js
+
+wasmdebug: export EMCC_DEBUG=1
+wasmdebug: webassembly/fasttext_wasm.js
+
+
 args.o: src/args.cc src/args.h
 	$(CXX) $(CXXFLAGS) -c src/args.cc
 
@@ -63,4 +69,57 @@ fasttext: $(OBJS) src/fasttext.cc
 	$(CXX) $(CXXFLAGS) $(OBJS) src/main.cc -o fasttext
 
 clean:
-	rm -rf *.o *.gcno *.gcda fasttext
+	rm -rf *.o *.gcno *.gcda fasttext *.bc webassembly/fasttext_wasm.js webassembly/fasttext_wasm.wasm
+
+
+EMCXX = em++
+EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
+EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
+
+
+main.bc: webassembly/fasttext_wasm.cc
+	$(EMCXX) $(EMCXXFLAGS)  webassembly/fasttext_wasm.cc -o main.bc
+
+args.bc: src/args.cc src/args.h
+	$(EMCXX) $(EMCXXFLAGS)  src/args.cc -o args.bc
+
+autotune.bc: src/autotune.cc src/autotune.h
+	$(EMCXX) $(EMCXXFLAGS)  src/autotune.cc -o autotune.bc
+
+matrix.bc: src/matrix.cc src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/matrix.cc -o matrix.bc
+
+dictionary.bc: src/dictionary.cc src/dictionary.h src/args.h
+	$(EMCXX) $(EMCXXFLAGS)  src/dictionary.cc -o dictionary.bc
+
+loss.bc: src/loss.cc src/loss.h src/matrix.h src/real.h
+	$(EMCXX) $(EMCXXFLAGS) src/loss.cc -o loss.bc
+
+productquantizer.bc: src/productquantizer.cc src/productquantizer.h src/utils.h
+	$(EMCXX) $(EMCXXFLAGS)  src/productquantizer.cc -o productquantizer.bc
+
+densematrix.bc: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/densematrix.cc -o densematrix.bc
+
+quantmatrix.bc: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/quantmatrix.cc -o quantmatrix.bc
+
+vector.bc: src/vector.cc src/vector.h src/utils.h
+	$(EMCXX) $(EMCXXFLAGS)  src/vector.cc -o vector.bc
+
+model.bc: src/model.cc src/model.h src/args.h
+	$(EMCXX) $(EMCXXFLAGS)  src/model.cc -o model.bc
+
+utils.bc: src/utils.cc src/utils.h
+	$(EMCXX) $(EMCXXFLAGS)  src/utils.cc -o utils.bc
+
+meter.bc: src/meter.cc src/meter.h
+	$(EMCXX) $(EMCXXFLAGS)  src/meter.cc -o meter.bc
+
+fasttext.bc: src/fasttext.cc src/*.h
+	$(EMCXX) $(EMCXXFLAGS)  src/fasttext.cc -o fasttext.bc
+
+webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
+	$(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
+
+

+ 21 - 15
src/args.cc

@@ -262,7 +262,8 @@ void Args::printTrainingHelp() {
   std::cerr
       << "\nThe following arguments for training are optional:\n"
       << "  -lr                 learning rate [" << lr << "]\n"
-      << "  -lrUpdateRate       change the rate of updates for the learning rate ["
+      << "  -lrUpdateRate       change the rate of updates for the learning "
+         "rate ["
       << lrUpdateRate << "]\n"
       << "  -dim                size of word vectors [" << dim << "]\n"
       << "  -ws                 size of the context window [" << ws << "]\n"
@@ -270,9 +271,11 @@ void Args::printTrainingHelp() {
       << "  -neg                number of negatives sampled [" << neg << "]\n"
       << "  -loss               loss function {ns, hs, softmax, one-vs-all} ["
       << lossToString(loss) << "]\n"
-      << "  -thread             number of threads (set to 1 to ensure reproducible results) ["
+      << "  -thread             number of threads (set to 1 to ensure "
+         "reproducible results) ["
       << thread << "]\n"
-      << "  -pretrainedVectors  pretrained word vectors for supervised learning ["
+      << "  -pretrainedVectors  pretrained word vectors for supervised "
+         "learning ["
       << pretrainedVectors << "]\n"
       << "  -saveOutput         whether output params should be saved ["
       << boolToString(saveOutput) << "]\n"
@@ -280,17 +283,19 @@ void Args::printTrainingHelp() {
 }
 
 void Args::printAutotuneHelp() {
-  std::cerr
-      << "\nThe following arguments are for autotune:\n"
-      << "  -autotune-validation            validation file to be used for evaluation\n"
-      << "  -autotune-metric                metric objective {f1, f1:labelname} ["
-      << autotuneMetric << "]\n"
-      << "  -autotune-predictions           number of predictions used for evaluation  ["
-      << autotunePredictions << "]\n"
-      << "  -autotune-duration              maximum duration in seconds ["
-      << autotuneDuration << "]\n"
-      << "  -autotune-modelsize             constraint model file size ["
-      << autotuneModelSize << "] (empty = do not quantize)\n";
+  std::cerr << "\nThe following arguments are for autotune:\n"
+            << "  -autotune-validation            validation file to be used "
+               "for evaluation\n"
+            << "  -autotune-metric                metric objective {f1, "
+               "f1:labelname} ["
+            << autotuneMetric << "]\n"
+            << "  -autotune-predictions           number of predictions used "
+               "for evaluation  ["
+            << autotunePredictions << "]\n"
+            << "  -autotune-duration              maximum duration in seconds ["
+            << autotuneDuration << "]\n"
+            << "  -autotune-modelsize             constraint model file size ["
+            << autotuneModelSize << "] (empty = do not quantize)\n";
 }
 
 void Args::printQuantizationHelp() {
@@ -298,7 +303,8 @@ void Args::printQuantizationHelp() {
       << "\nThe following arguments for quantization are optional:\n"
       << "  -cutoff             number of words and ngrams to retain ["
       << cutoff << "]\n"
-      << "  -retrain            whether embeddings are finetuned if a cutoff is applied ["
+      << "  -retrain            whether embeddings are finetuned if a cutoff "
+         "is applied ["
       << boolToString(retrain) << "]\n"
       << "  -qnorm              whether the norm is quantized separately ["
       << boolToString(qnorm) << "]\n"

+ 8 - 6
src/autotune.cc

@@ -416,10 +416,10 @@ void Autotune::train(const Args& autotuneArgs) {
         if (!sizeConstraintWarning && trials_ > 10 &&
             sizeConstraintFailed_ > (trials_ / 2)) {
           sizeConstraintWarning = true;
-          std::cerr
-              << std::endl
-              << "Warning : requested model size is probably too small. You may want to increase `autotune-modelsize`."
-              << std::endl;
+          std::cerr << std::endl
+                    << "Warning : requested model size is probably too small. "
+                       "You may want to increase `autotune-modelsize`."
+                    << std::endl;
         }
       }
     } catch (DenseMatrix::EncounteredNaNError&) {
@@ -442,10 +442,12 @@ void Autotune::train(const Args& autotuneArgs) {
     std::string errorMessage;
     if (sizeConstraintWarning) {
       errorMessage =
-          "Couldn't fulfil model size constraint: please increase `autotune-modelsize`.";
+          "Couldn't fulfil model size constraint: please increase "
+          "`autotune-modelsize`.";
     } else {
       errorMessage =
-          "Didn't have enough time to train once: please increase `autotune-duration`.";
+          "Didn't have enough time to train once: please increase "
+          "`autotune-duration`.";
     }
     throw std::runtime_error(errorMessage);
   } else {

+ 11 - 6
src/densematrix.cc

@@ -43,12 +43,17 @@ void DenseMatrix::uniformThread(real a, int block, int32_t seed) {
 }
 
 void DenseMatrix::uniform(real a, unsigned int thread, int32_t seed) {
-  std::vector<std::thread> threads;
-  for (int i = 0; i < thread; i++) {
-    threads.push_back(std::thread([=]() { uniformThread(a, i, seed); }));
-  }
-  for (int32_t i = 0; i < threads.size(); i++) {
-    threads[i].join();
+  if (thread > 1) {
+    std::vector<std::thread> threads;
+    for (int i = 0; i < thread; i++) {
+      threads.push_back(std::thread([=]() { uniformThread(a, i, seed); }));
+    }
+    for (int32_t i = 0; i < threads.size(); i++) {
+      threads[i].join();
+    }
+  } else {
+    // webassembly can't instantiate `std::thread`
+    uniformThread(a, 0, seed);
   }
 }
 

+ 35 - 15
src/fasttext.cc

@@ -263,7 +263,7 @@ void FastText::loadModel(std::istream& in) {
   buildModel();
 }
 
-void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
+std::tuple<int64_t, double, double> FastText::progressInfo(real progress) {
   double t = utils::getDuration(start_, std::chrono::steady_clock::now());
   double lr = args_->lr * (1.0 - progress);
   double wst = 0;
@@ -271,14 +271,22 @@ void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
   int64_t eta = 2592000; // Default to one month in seconds (720 * 3600)
 
   if (progress > 0 && t >= 0) {
-    progress = progress * 100;
-    eta = t * (100 - progress) / progress;
+    eta = t * (1 - progress) / progress;
     wst = double(tokenCount_) / t / args_->thread;
   }
 
+  return std::tuple<double, double, int64_t>(wst, lr, eta);
+}
+
+void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
+  double wst;
+  double lr;
+  int64_t eta;
+  std::tie<double, double, int64_t>(wst, lr, eta) = progressInfo(progress);
+
   log_stream << std::fixed;
   log_stream << "Progress: ";
-  log_stream << std::setprecision(1) << std::setw(5) << progress << "%";
+  log_stream << std::setprecision(1) << std::setw(5) << (progress * 100) << "%";
   log_stream << " words/sec/thread: " << std::setw(7) << int64_t(wst);
   log_stream << " lr: " << std::setw(9) << std::setprecision(6) << lr;
   log_stream << " avg.loss: " << std::setw(9) << std::setprecision(6) << loss;
@@ -304,7 +312,7 @@ std::vector<int32_t> FastText::selectEmbeddings(int32_t cutoff) const {
   return idx;
 }
 
-void FastText::quantize(const Args& qargs) {
+void FastText::quantize(const Args& qargs, const TrainCallback& callback) {
   if (args_->model != model_name::sup) {
     throw std::invalid_argument(
         "For now we only support quantization of supervised models");
@@ -336,10 +344,9 @@ void FastText::quantize(const Args& qargs) {
       args_->verbose = qargs.verbose;
       auto loss = createLoss(output_);
       model_ = std::make_shared<Model>(input, output, loss, normalizeGradient);
-      startThreads();
+      startThreads(callback);
     }
   }
-
   input_ = std::make_shared<QuantMatrix>(
       std::move(*(input.get())), qargs.dsub, qargs.qnorm);
 
@@ -347,7 +354,6 @@ void FastText::quantize(const Args& qargs) {
     output_ = std::make_shared<QuantMatrix>(
         std::move(*(output.get())), 2, qargs.qnorm);
   }
-
   quant_ = true;
   auto loss = createLoss(output_);
   model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
@@ -615,7 +621,7 @@ bool FastText::keepTraining(const int64_t ntokens) const {
   return tokenCount_ < args_->epoch * ntokens && !trainException_;
 }
 
-void FastText::trainThread(int32_t threadId) {
+void FastText::trainThread(int32_t threadId, const TrainCallback& callback) {
   std::ifstream ifs(args_->input);
   utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);
 
@@ -624,9 +630,18 @@ void FastText::trainThread(int32_t threadId) {
   const int64_t ntokens = dict_->ntokens();
   int64_t localTokenCount = 0;
   std::vector<int32_t> line, labels;
+  uint64_t callbackCounter = 0;
   try {
     while (keepTraining(ntokens)) {
       real progress = real(tokenCount_) / (args_->epoch * ntokens);
+      if (callback && ((callbackCounter++ % 64) == 0)) {
+        double wst;
+        double lr;
+        int64_t eta;
+        std::tie<double, double, int64_t>(wst, lr, eta) =
+            progressInfo(progress);
+        callback(progress, loss_, wst, lr, eta);
+      }
       real lr = args_->lr * (1.0 - progress);
       if (args_->model == model_name::sup) {
         localTokenCount += dict_->getLine(ifs, line, labels);
@@ -717,7 +732,7 @@ std::shared_ptr<Matrix> FastText::createTrainOutputMatrix() const {
   return output;
 }
 
-void FastText::train(const Args& args) {
+void FastText::train(const Args& args, const TrainCallback& callback) {
   args_ = std::make_shared<Args>(args);
   dict_ = std::make_shared<Dictionary>(args_);
   if (args_->input == "-") {
@@ -742,7 +757,7 @@ void FastText::train(const Args& args) {
   auto loss = createLoss(output_);
   bool normalizeGradient = (args_->model == model_name::sup);
   model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
-  startThreads();
+  startThreads(callback);
 }
 
 void FastText::abort() {
@@ -753,14 +768,19 @@ void FastText::abort() {
   }
 }
 
-void FastText::startThreads() {
+void FastText::startThreads(const TrainCallback& callback) {
   start_ = std::chrono::steady_clock::now();
   tokenCount_ = 0;
   loss_ = -1;
   trainException_ = nullptr;
   std::vector<std::thread> threads;
-  for (int32_t i = 0; i < args_->thread; i++) {
-    threads.push_back(std::thread([=]() { trainThread(i); }));
+  if (args_->thread > 1) {
+    for (int32_t i = 0; i < args_->thread; i++) {
+      threads.push_back(std::thread([=]() { trainThread(i, callback); }));
+    }
+  } else {
+    // webassembly can't instantiate `std::thread`
+    trainThread(0, callback);
   }
   const int64_t ntokens = dict_->ntokens();
   // Same condition as trainThread
@@ -772,7 +792,7 @@ void FastText::startThreads() {
       printInfo(progress, loss_, std::cerr);
     }
   }
-  for (int32_t i = 0; i < args_->thread; i++) {
+  for (int32_t i = 0; i < threads.size(); i++) {
     threads[i].join();
   }
   if (trainException_) {

+ 10 - 4
src/fasttext.h

@@ -12,6 +12,7 @@
 
 #include <atomic>
 #include <chrono>
+#include <functional>
 #include <iostream>
 #include <memory>
 #include <queue>
@@ -31,6 +32,10 @@
 namespace fasttext {
 
 class FastText {
+ public:
+  using TrainCallback =
+      std::function<void(float, float, double, double, int64_t)>;
+
  protected:
   std::shared_ptr<Args> args_;
   std::shared_ptr<Dictionary> dict_;
@@ -47,9 +52,9 @@ class FastText {
 
   void signModel(std::ostream&);
   bool checkModel(std::istream&);
-  void startThreads();
+  void startThreads(const TrainCallback& callback = {});
   void addInputVector(Vector&, int32_t) const;
-  void trainThread(int32_t);
+  void trainThread(int32_t, const TrainCallback& callback);
   std::vector<std::pair<real, std::string>> getNN(
       const DenseMatrix& wordVectors,
       const Vector& queryVec,
@@ -73,6 +78,7 @@ class FastText {
   void precomputeWordVectors(DenseMatrix& wordVectors);
   bool keepTraining(const int64_t ntokens) const;
   void buildModel();
+  std::tuple<int64_t, double, double> progressInfo(real progress);
 
  public:
   FastText();
@@ -114,7 +120,7 @@ class FastText {
 
   void getSentenceVector(std::istream& in, Vector& vec);
 
-  void quantize(const Args& qargs);
+  void quantize(const Args& qargs, const TrainCallback& callback = {});
 
   std::tuple<int64_t, double, double>
   test(std::istream& in, int32_t k, real threshold = 0.0);
@@ -146,7 +152,7 @@ class FastText {
       const std::string& wordB,
       const std::string& wordC);
 
-  void train(const Args& args);
+  void train(const Args& args, const TrainCallback& callback = {});
 
   void abort();
 

+ 12 - 6
src/main.cc

@@ -21,19 +21,25 @@ void printUsage() {
       << "usage: fasttext <command> <args>\n\n"
       << "The commands supported by fasttext are:\n\n"
       << "  supervised              train a supervised classifier\n"
-      << "  quantize                quantize a model to reduce the memory usage\n"
+      << "  quantize                quantize a model to reduce the memory "
+         "usage\n"
       << "  test                    evaluate a supervised classifier\n"
-      << "  test-label              print labels with precision and recall scores\n"
+      << "  test-label              print labels with precision and recall "
+         "scores\n"
       << "  predict                 predict most likely labels\n"
-      << "  predict-prob            predict most likely labels with probabilities\n"
+      << "  predict-prob            predict most likely labels with "
+         "probabilities\n"
       << "  skipgram                train a skipgram model\n"
       << "  cbow                    train a cbow model\n"
       << "  print-word-vectors      print word vectors given a trained model\n"
-      << "  print-sentence-vectors  print sentence vectors given a trained model\n"
-      << "  print-ngrams            print ngrams given a trained model and word\n"
+      << "  print-sentence-vectors  print sentence vectors given a trained "
+         "model\n"
+      << "  print-ngrams            print ngrams given a trained model and "
+         "word\n"
       << "  nn                      query for nearest neighbors\n"
       << "  analogies               query for analogies\n"
-      << "  dump                    dump arguments,dictionary,input/output vectors\n"
+      << "  dump                    dump arguments,dictionary,input/output "
+         "vectors\n"
       << std::endl;
 }
 

+ 0 - 1
src/real.h

@@ -11,5 +11,4 @@
 namespace fasttext {
 
 typedef float real;
-
 }

+ 37 - 0
webassembly/README.md

@@ -0,0 +1,37 @@
+# fastText [![CircleCI](https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg)](https://circleci.com/gh/facebookresearch/fastText/tree/master)
+
+[fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence classification.
+
+In this document we present how to use fastText in a browser with WebAssembly.
+
+
+# Requirements
+
+[fastText](https://fasttext.cc/) builds on modern Mac OS and Linux distributions.
+Since it uses C\++11 features, it requires a compiler with good C++11 support.
+You will need [emscripten](https://emscripten.org/) and a [browser that supports WebAssembly](https://caniuse.com/#feat=wasm).
+
+
+# Building WebAssembly binaries
+
+First, download and install emscripten sdk as [described here](https://emscripten.org/docs/getting_started/downloads.html#installation-instructions)
+
+
+Make sure you activated the PATH for emscripten:
+```bash
+$ source /path/to/emsdk/emsdk_env.sh
+```
+
+Clone our [repository](https://github.com/facebookresearch/fastText/).
+
+```bash
+$ git clone git@github.com:facebookresearch/fastText.git
+```
+
+Build WebAssembly binaries:
+```bash
+$ cd fastText
+$ make wasm
+```
+
+

+ 62 - 0
webassembly/doc/examples/misc.html

@@ -0,0 +1,62 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
+</head>
+<body>
+    <script type="module">
+        const printVector = function(predictions, limit) {
+            limit = limit || Infinity;
+
+            for (let i=0; i<predictions.size() && i<limit; i++){
+                let prediction = predictions.get(i);
+                console.log(predictions.get(i));
+            }
+        }
+
+        import {FastText, addOnPostRun} from "./fasttext.js";
+
+        addOnPostRun(() => {
+            let ft = new FastText();
+
+            const url = "lid.176.ftz";
+            ft.loadModel(url).then(model => {
+                /* isQuant */
+                console.log(model.isQuant());
+
+                /* getDimension */
+                console.log(model.getDimension());
+
+                /* getWordVector */
+                let v = model.getWordVector("Hello");
+                console.log(v);
+
+                /* getSentenceVector */
+                let v1 = model.getSentenceVector("Hello");
+                console.log(v1);
+                let v2 = model.getSentenceVector("Hello this is a sentence");
+                console.log(v2);
+
+                /* getNearestNeighbors */
+                printVector(model.getNearestNeighbors("Hello", 10));
+
+                /* getAnalogies */
+                printVector(model.getAnalogies("paris", "france", "london", 10));
+
+                /* getWordId */
+                console.log(model.getWordId("Hello"));
+
+                /* getSubwords */
+                let subWordInformation = model.getSubwords("désinstitutionnalisation");
+                printVector(subWordInformation[0]);
+
+                /* getInputVector */
+                console.log(model.getInputVector(832));
+            });
+        });
+
+    </script>
+</body>
+
+</html>

+ 42 - 0
webassembly/doc/examples/predict.html

@@ -0,0 +1,42 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
+</head>
+<body>
+    <script type="module">
+        const printVector = function(predictions, limit) {
+            limit = limit || Infinity;
+
+            for (let i=0; i<predictions.size() && i<limit; i++){
+                let prediction = predictions.get(i);
+                console.log(predictions.get(i));
+            }
+        }
+
+        import {FastText, addOnPostRun} from "./fasttext.js";
+
+        addOnPostRun(() => {
+            let ft = new FastText();
+
+            const url = "lid.176.ftz";
+            ft.loadModel(url).then(model => {
+                let text = "Bonjour à tous. Ceci est du français";
+                console.log(text);
+                printVector(model.predict(text, 5, 0.0));
+
+                text = "Hello, world. This is english";
+                console.log(text);
+                printVector(model.predict(text, 5, 0.0));
+
+                text = "Merhaba dünya. Bu da türkçe"
+                console.log(text);
+                printVector(model.predict(text, 5, 0.0));
+            });
+        });
+
+    </script>
+</body>
+
+</html>

+ 66 - 0
webassembly/doc/examples/train_supervised.html

@@ -0,0 +1,66 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
+</head>
+<body>
+    <script type="module">
+        const printVector = function(predictions, limit) {
+            limit = limit || Infinity;
+
+            for (let i=0; i<predictions.size() && i<limit; i++){
+                let prediction = predictions.get(i);
+                console.log(predictions.get(i));
+            }
+        }
+
+        const trainCallback = (progress, loss, wst, lr, eta) => {
+            console.log([progress, loss, wst, lr, eta]);
+        };
+
+        import {FastText, addOnPostRun} from "./fasttext.js";
+
+        addOnPostRun(() => {
+            let ft = new FastText();
+
+            ft.trainSupervised("cooking.train", {
+                'lr':1.0,
+                'epoch':10,
+                'loss':'hs',
+                'wordNgrams':2,
+                'dim':50,
+                'bucket':200000
+            }, trainCallback).then(model => {
+                console.log('Trained.');
+
+                printVector(model.predict("Which baking dish is best to bake a banana bread ?", 5, 0.0));
+
+                /* getInputMatrix */
+                let inputMatrix = model.getInputMatrix();
+                console.log(inputMatrix.cols());
+                console.log(inputMatrix.rows());
+                console.log(inputMatrix.at(1, 2));
+
+                /* getOutputMatrix */
+                let outputMatrix = model.getOutputMatrix();
+                console.log(outputMatrix.cols());
+                console.log(outputMatrix.rows());
+                console.log(outputMatrix.at(1, 2));
+
+                /* getWords */
+                let wordsInformation = model.getWords();
+                printVector(wordsInformation[0], 30);   // words
+                printVector(wordsInformation[1], 30);   // frequencies
+
+                /* getLabels */
+                let labelsInformation = model.getLabels();
+                printVector(labelsInformation[0], 30);  // labels
+                printVector(labelsInformation[1], 30);  // frequencies
+            });
+        });
+
+    </script>
+</body>
+
+</html>

+ 44 - 0
webassembly/doc/examples/train_unsupervised.html

@@ -0,0 +1,44 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
+</head>
+<body>
+    <script type="module">
+        const printVector = function(predictions, limit) {
+            limit = limit || Infinity;
+
+            for (let i=0; i<predictions.size() && i<limit; i++){
+                let prediction = predictions.get(i);
+                console.log(predictions.get(i));
+            }
+        }
+
+        const trainCallback = (progress, loss, wst, lr, eta) => {
+            console.log([progress, loss, wst, lr, eta]);
+        };
+
+        import {FastText, addOnPostRun} from "./fasttext.js";
+
+        addOnPostRun(() => {
+            let ft = new FastText();
+
+            ft.trainUnsupervised("fil9", 'skipgram', {
+                'lr':0.1,
+                'epoch':1,
+                'loss':'ns',
+                'wordNgrams':2,
+                'dim':50,
+                'bucket':200000
+            }, trainCallback).then(model => {
+                let wordsInformation = model.getWords();
+                printVector(wordsInformation[0], 30);   // words
+                printVector(wordsInformation[1], 30);   // frequencies
+            });
+        });
+
+    </script>
+</body>
+
+</html>

+ 520 - 0
webassembly/fasttext.js

@@ -0,0 +1,520 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import fastTextModularized from './fasttext_wasm.js';
+const fastTextModule = fastTextModularized();
+
+// Single user callback to run once the wasm runtime is initialized.
+// NOTE(review): only one callback is stored — a second call to
+// `addOnPostRun` silently replaces the first; confirm that is intended.
+let postRunFunc = null;
+const addOnPostRun = function(func) {
+  postRunFunc = func;
+};
+
+fastTextModule.addOnPostRun(() => {
+  if (postRunFunc) {
+    postRunFunc();
+  }
+});
+
+// NOTE(review): at ES-module top level `this` is undefined, so the
+// `(thisModule && thisModule.fetch) || fetch` guards below always fall back
+// to the global `fetch` — verify this was meant as an environment shim.
+const thisModule = this;
+// Fixed file paths inside the Emscripten in-memory filesystem.
+const trainFileInWasmFs = 'train.txt';
+const testFileInWasmFs = 'test.txt';
+const modelFileInWasmFs = 'model.bin';
+
+// Allocate a `len`-element float32 buffer on the wasm heap and describe it
+// in the {ptr, size} shape consumed by the C++ Float32ArrayBridge.
+// NOTE(review): the buffer is _malloc'd but never freed — each call leaks
+// len * 4 bytes of wasm heap; confirm whether that is acceptable here.
+const getFloat32ArrayFromHeap = (len) => {
+  const dataBytes = len * Float32Array.BYTES_PER_ELEMENT;
+  const dataPtr = fastTextModule._malloc(dataBytes);
+  const dataHeap = new Uint8Array(fastTextModule.HEAPU8.buffer,
+    dataPtr,
+    dataBytes);
+  return {
+    'ptr':dataHeap.byteOffset,
+    'size':len,
+    'buffer':dataHeap.buffer
+  };
+};
+
+// View a heap descriptor from `getFloat32ArrayFromHeap` as a Float32Array.
+const heapToFloat32 = (r) => new Float32Array(r.buffer, r.ptr, r.size);
+
+class FastText {
+  constructor() {
+    this.f = new fastTextModule.FastText();
+  }
+
+  /**
+   * loadModel
+   *
+   * Loads the model file from the specified url, and returns the
+   * corresponding `FastTextModel` object.
+   *
+   * @param {string}     url
+   *     the url of the model file.
+   *
+   * @return {Promise}   promise object that resolves to a `FastTextModel`
+   *
+   */
+  loadModel(url) {
+    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
+
+    const fastTextNative = this.f;
+    return new Promise(function(resolve, reject) {
+      fetchFunc(url).then(response => {
+        return response.arrayBuffer();
+      }).then(bytes => {
+        const byteArray = new Uint8Array(bytes);
+        const FS = fastTextModule.FS;
+        FS.writeFile(modelFileInWasmFs, byteArray);
+      }).then(() =>  {
+        fastTextNative.loadModel(modelFileInWasmFs);
+        resolve(new FastTextModel(fastTextNative));
+      }).catch(error => {
+        reject(error);
+      });
+    });
+  }
+
+  _train(url, modelName, kwargs = {}, callback = null) {
+    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
+    const fastTextNative = this.f;
+
+    return new Promise(function(resolve, reject) {
+      fetchFunc(url).then(response => {
+        return response.arrayBuffer();
+      }).then(bytes => {
+        const byteArray = new Uint8Array(bytes);
+        const FS = fastTextModule.FS;
+        FS.writeFile(trainFileInWasmFs, byteArray);
+      }).then(() =>  {
+        const argsList = ['lr', 'lrUpdateRate', 'dim', 'ws', 'epoch',
+          'minCount', 'minCountLabel', 'neg', 'wordNgrams', 'loss',
+          'model', 'bucket', 'minn', 'maxn', 't', 'label', 'verbose',
+          'pretrainedVectors', 'saveOutput', 'seed', 'qout', 'retrain',
+          'qnorm', 'cutoff', 'dsub', 'qnorm', 'autotuneValidationFile',
+          'autotuneMetric', 'autotunePredictions', 'autotuneDuration',
+          'autotuneModelSize'];
+        const args = new fastTextModule.Args();
+        argsList.forEach(k => {
+          if (k in kwargs) {
+            args[k] = kwargs[k];
+          }
+        });
+        args.model = fastTextModule.ModelName[modelName];
+        args.loss = ('loss' in kwargs) ?
+          fastTextModule.LossName[kwargs['loss']] : 'hs';
+        args.thread = 1;
+        args.input = trainFileInWasmFs;
+
+        fastTextNative.train(args, callback);
+
+        resolve(new FastTextModel(fastTextNative));
+      }).catch(error => {
+        reject(error);
+      });
+    });
+  }
+
+  /**
+   * trainSupervised
+   *
+   * Downloads the input file from the specified url, trains a supervised
+   * model and returns a `FastTextModel` object.
+   *
+   * @param {string}     url
+   *     the url of the input file.
+   *     The input file must must contain at least one label per line. For an
+   *     example consult the example datasets which are part of the fastText
+   *     repository such as the dataset pulled by classification-example.sh.
+   *
+   * @param {dict}       kwargs
+   *     train parameters.
+   *     For example {'lr': 0.5, 'epoch': 5}
+   *
+   * @param {function}   callback
+   *     train callback function
+   *     `callback` function is called regularly from the train loop:
+   *     `callback(progress, loss, wordsPerSec, learningRate, eta)`
+   *
+   * @return {Promise}   promise object that resolves to a `FastTextModel`
+   *
+   */
+  trainSupervised(url, kwargs = {}, callback) {
+    const self = this;
+    return new Promise(function(resolve, reject) {
+      self._train(url, 'supervised', kwargs, callback).then(model => {
+        resolve(model);
+      }).catch(error => {
+        reject(error);
+      });
+    });
+  }
+
+  /**
+   * trainUnsupervised
+   *
+   * Downloads the input file from the specified url, trains an unsupervised
+   * model and returns a `FastTextModel` object.
+   *
+   * @param {string}     url
+   *     the url of the input file.
+   *     The input file must not contain any labels or use the specified label
+   *     prefixunless it is ok for those words to be ignored. For an example
+   *     consult the dataset pulled by the example script word-vector-example.sh
+   *     which is part of the fastText repository.
+   *
+   * @param {string}     modelName
+   *     Model to be used for unsupervised learning. `cbow` or `skipgram`.
+   *
+   * @param {dict}       kwargs
+   *     train parameters.
+   *     For example {'lr': 0.5, 'epoch': 5}
+   *
+   * @param {function}   callback
+   *     train callback function
+   *     `callback` function is called regularly from the train loop:
+   *     `callback(progress, loss, wordsPerSec, learningRate, eta)`
+   *
+   * @return {Promise}   promise object that resolves to a `FastTextModel`
+   *
+   */
+  trainUnsupervised(url, modelName, kwargs = {}, callback) {
+    const self = this;
+    return new Promise(function(resolve, reject) {
+      self._train(url, modelName, kwargs, callback).then(model => {
+        resolve(model);
+      }).catch(error => {
+        reject(error);
+      });
+    });
+  }
+
+}
+
+
+class FastTextModel {
+  /**
+     * `FastTextModel` represents a trained model.
+     *
+     * @constructor
+     *
+     * @param {object}       fastTextNative
+     *     webassembly object that makes the bridge between js and C++
+     */
+  constructor(fastTextNative) {
+    this.f = fastTextNative;
+  }
+
+  /**
+     * isQuant
+     *
+     * @return {bool}   true if the model is quantized
+     *
+     */
+  isQuant() {
+    return this.f.isQuant;
+  }
+
+  /**
+     * getDimension
+     *
+     * @return {int}    the dimension (size) of a lookup vector (hidden layer)
+     *
+     */
+  getDimension() {
+    return this.f.args.dim;
+  }
+
+  /**
+     * getWordVector
+     *
+     * @param {string}          word
+     *
+     * @return {Float32Array}   the vector representation of `word`.
+     *
+     */
+  getWordVector(word) {
+    const b = getFloat32ArrayFromHeap(this.getDimension());
+    this.f.getWordVector(b, word);
+
+    return heapToFloat32(b);
+  }
+
+  /**
+     * getSentenceVector
+     *
+     * @param {string}          text
+     *
+     * @return {Float32Array}   the vector representation of `text`.
+     *
+     */
+  getSentenceVector(text) {
+    if (text.indexOf('\n') != -1) {
+      "sentence vector processes one line at a time (remove '\\n')";
+    }
+    text += '\n';
+    const b = getFloat32ArrayFromHeap(this.getDimension());
+    this.f.getSentenceVector(b, text);
+
+    return heapToFloat32(b);
+  }
+
+  /**
+     * getNearestNeighbors
+     *
+     * returns the nearest `k` neighbors of `word`.
+     *
+     * @param {string}          word
+     * @param {int}             k
+     *
+     * @return {Array.<Pair.<number, string>>}
+     *     words and their corresponding cosine similarities.
+     *
+     */
+  getNearestNeighbors(word, k = 10) {
+    return this.f.getNN(word, k);
+  }
+
+  /**
+     * getAnalogies
+     *
+     * returns the nearest `k` neighbors of the operation
+     * `wordA - wordB + wordC`.
+     *
+     * @param {string}          wordA
+     * @param {string}          wordB
+     * @param {string}          wordC
+     * @param {int}             k
+     *
+     * @return {Array.<Pair.<number, string>>}
+     *     words and their corresponding cosine similarities
+     *
+     */
+  getAnalogies(wordA, wordB, wordC, k) {
+    return this.f.getAnalogies(k, wordA, wordB, wordC);
+  }
+
+  /**
+     * getWordId
+     *
+     * Given a word, get the word id within the dictionary.
+     * Returns -1 if word is not in the dictionary.
+     *
+     * @return {int}    word id
+     *
+     */
+  getWordId(word) {
+    return this.f.getWordId(word);
+  }
+
+  /**
+     * getSubwordId
+     *
+     * Given a subword, return the index (within input matrix) it hashes to.
+     *
+     * @return {int}    subword id
+     *
+     */
+  getSubwordId(subword) {
+    return this.f.getSubwordId(subword);
+  }
+
+  /**
+     * getSubwords
+     *
+     * returns the subwords and their indicies.
+     *
+     * @param {string}          word
+     *
+     * @return {Pair.<Array.<string>, Array.<int>>}
+     *     words and their corresponding indicies
+     *
+     */
+  getSubwords(word) {
+    return this.f.getSubwords(word);
+  }
+
+  /**
+     * getInputVector
+     *
+     * Given an index, get the corresponding vector of the Input Matrix.
+     *
+     * @param {int}             ind
+     *
+     * @return {Float32Array}   the vector of the `ind`'th index
+     *
+     */
+  getInputVector(ind) {
+    const b = getFloat32ArrayFromHeap(this.getDimension());
+    this.f.getInputVector(b, ind);
+
+    return heapToFloat32(b);
+  }
+
+  /**
+     * predict
+     *
+     * Given a string, get a list of labels and a list of corresponding
+     * probabilities. k controls the number of returned labels.
+     *
+     * @param {string}          text
+     * @param {int}             k, the number of predictions to be returned
+     * @param {number}          probability threshold
+     *
+     * @return {Array.<Pair.<number, string>>}
+     *     labels and their probabilities
+     *
+     */
+  predict(text, k = 1, threshold = 0.0) {
+    return this.f.predict(text, k, threshold);
+  }
+
+  /**
+     * getInputMatrix
+     *
+     * Get a reference to the full input matrix of a Model. This only
+     * works if the model is not quantized.
+     *
+     * @return {DenseMatrix}
+     *     densematrix with functions: `rows`, `cols`, `at(i,j)`
+     *
+     * example:
+     *     let inputMatrix = model.getInputMatrix();
+     *     let value = inputMatrix.at(1, 2);
+     */
+  getInputMatrix() {
+    if (this.isQuant()) {
+      throw new Error("Can't get quantized Matrix");
+    }
+    return this.f.getInputMatrix();
+  }
+
+  /**
+     * getOutputMatrix
+     *
+     * Get a reference to the full input matrix of a Model. This only
+     * works if the model is not quantized.
+     *
+     * @return {DenseMatrix}
+     *     densematrix with functions: `rows`, `cols`, `at(i,j)`
+     *
+     * example:
+     *     let outputMatrix = model.getOutputMatrix();
+     *     let value = outputMatrix.at(1, 2);
+     */
+  getOutputMatrix() {
+    if (this.isQuant()) {
+      throw new Error("Can't get quantized Matrix");
+    }
+    return this.f.getOutputMatrix();
+  }
+
+  /**
+     * getWords
+     *
+     * Get the entire list of words of the dictionary including the frequency
+     * of the individual words. This does not include any subwords. For that
+     * please consult the function get_subwords.
+     *
+     * @return {Pair.<Array.<string>, Array.<int>>}
+     *     words and their corresponding frequencies
+     *
+     */
+  getWords() {
+    return this.f.getWords();
+  }
+
+  /**
+     * getLabels
+     *
+     * Get the entire list of labels of the dictionary including the frequency
+     * of the individual labels.
+     *
+     * @return {Pair.<Array.<string>, Array.<int>>}
+     *     labels and their corresponding frequencies
+     *
+     */
+  getLabels() {
+    return this.f.getLabels();
+  }
+
+  /**
+     * getLine
+     *
+     * Split a line of text into words and labels. Labels must start with
+     * the prefix used to create the model (__label__ by default).
+     *
+     * @param {string}          text
+     *
+     * @return {Pair.<Array.<string>, Array.<string>>}
+     *     words and labels
+     *
+     */
+  getLine(text) {
+    return this.f.getLine(text);
+  }
+
+  /**
+     * saveModel
+     *
+     * Saves the model file in web assembly in-memory FS and returns a blob
+     *
+     * @return {Blob}           blob data of the file saved in web assembly FS
+     *
+     */
+  saveModel() {
+    this.f.saveModel(modelFileInWasmFs);
+    const content = fastTextModule.FS.readFile(modelFileInWasmFs,
+      { encoding: 'binary' });
+    return new Blob(
+      [new Uint8Array(content, content.byteOffset, content.length)],
+      { type: ' application/octet-stream' }
+    );
+  }
+
+  /**
+     * test
+     *
+     * Downloads the test file from the specified url, evaluates the supervised
+     * model with it.
+     *
+     * @param {string}          url
+     * @param {int}             k, the number of predictions to be returned
+     * @param {number}          probability threshold
+     *
+     * @return {Promise}   promise object that resolves to a `Meter` object
+     *
+     * example:
+     * model.test("/absolute/url/to/test.txt", 1, 0.0).then((meter) => {
+     *     console.log(meter.precision);
+     *     console.log(meter.recall);
+     *     console.log(meter.f1Score);
+     *     console.log(meter.nexamples());
+     * });
+     *
+     */
+  test(url, k, threshold) {
+    const fetchFunc = (thisModule && thisModule.fetch) || fetch;
+    const fastTextNative = this.f;
+
+    return new Promise(function(resolve, reject) {
+      fetchFunc(url).then(response => {
+        return response.arrayBuffer();
+      }).then(bytes => {
+        const byteArray = new Uint8Array(bytes);
+        const FS = fastTextModule.FS;
+        FS.writeFile(testFileInWasmFs, byteArray);
+      }).then(() =>  {
+        const meter = fastTextNative.test(testFileInWasmFs, k, threshold);
+        resolve(meter);
+      }).catch(error => {
+        reject(error);
+      });
+    });
+  }
+}
+
+
+export {FastText, addOnPostRun};

+ 328 - 0
webassembly/fasttext_wasm.cc

@@ -0,0 +1,328 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+#include <fasttext.h>
+#include <cassert>
+#include <cstdint>
+#include <fstream>
+#include <functional>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace emscripten;
+using namespace fasttext;
+
+// Descriptor of a float32 buffer that lives on the wasm heap and is owned
+// by the JS side (see getFloat32ArrayFromHeap in fasttext.js).
+struct Float32ArrayBridge {
+  uintptr_t ptr; // byte offset into the wasm heap where the floats start
+  int size;      // number of float elements at `ptr`
+};
+
+// Copy the contents of `v` into the JS-owned buffer described by `vecFloat`.
+// The buffer must hold exactly v.size() elements (asserted in debug builds).
+void fillFloat32ArrayFromVector(
+    const Float32ArrayBridge& vecFloat,
+    const Vector& v) {
+  float* buffer = reinterpret_cast<float*>(vecFloat.ptr);
+  assert(vecFloat.size == v.size());
+  for (int i = 0; i < v.size(); i++) {
+    buffer[i] = v[i];
+  }
+}
+
+// Predict labels for one line of text, returning up to `k`
+// (probability, label) pairs whose probability is >= `threshold`.
+std::vector<std::pair<float, std::string>>
+predict(FastText* fasttext, std::string text, int k, double threshold) {
+  std::vector<std::pair<float, std::string>> result;
+  // predictLine consumes a single newline-terminated line from the stream.
+  std::stringstream lineStream(text + "\n");
+  fasttext->predictLine(lineStream, result, k, threshold);
+  return result;
+}
+
+// Look up the embedding of `word` and copy it into the JS-side buffer.
+void getWordVector(
+    FastText* fasttext,
+    const Float32ArrayBridge& vecFloat,
+    std::string word) {
+  assert(fasttext);
+  Vector embedding(fasttext->getDimension());
+  fasttext->getWordVector(embedding, word);
+  fillFloat32ArrayFromVector(vecFloat, embedding);
+}
+
+// Compute the sentence vector of `text` and copy it into the JS-side
+// buffer. The JS wrapper appends the terminating '\n' before calling in.
+void getSentenceVector(
+    FastText* fasttext,
+    const Float32ArrayBridge& vecFloat,
+    std::string text) {
+  assert(fasttext);
+  std::stringstream sentenceStream(text);
+  Vector sentenceVec(fasttext->getDimension());
+  fasttext->getSentenceVector(sentenceStream, sentenceVec);
+  fillFloat32ArrayFromVector(vecFloat, sentenceVec);
+}
+
+// Return the subwords of `word` together with their indices into the
+// input matrix.
+std::pair<std::vector<std::string>, std::vector<int32_t>> getSubwords(
+    FastText* fasttext,
+    std::string word) {
+  assert(fasttext);
+  std::vector<std::string> subwords;
+  std::vector<int32_t> ngramIds;
+  fasttext->getDictionary()->getSubwords(word, ngramIds, subwords);
+  return {subwords, ngramIds};
+}
+
+// Copy the `ind`'th input vector of the model into the JS-side buffer.
+void getInputVector(
+    FastText* fasttext,
+    const Float32ArrayBridge& vecFloat,
+    int32_t ind) {
+  assert(fasttext);
+  Vector inputVec(fasttext->getDimension());
+  fasttext->getInputVector(inputVec, ind);
+  fillFloat32ArrayFromVector(vecFloat, inputVec);
+}
+
+// Run training with the given args; blocks until training completes.
+// `jsCallback` is a JS function captured by value into the progress lambda
+// and invoked with scalar metrics. `eta` is narrowed to int32 because JS
+// numbers cannot represent all int64 values through embind.
+void train(FastText* fasttext, Args* args, emscripten::val jsCallback) {
+  assert(args);
+  assert(fasttext);
+  fasttext->train(
+      *args,
+      [=](float progress, float loss, double wst, double lr, int64_t eta) {
+        jsCallback(progress, loss, wst, lr, static_cast<int32_t>(eta));
+      });
+}
+
+// Expose the model's input matrix to JS as a raw pointer.
+// NOTE(review): the local shared_ptr dies at return — the pointer appears to
+// stay valid only because the FastText instance keeps its own reference to
+// the matrix; the JS side must not use the DenseMatrix after the model is
+// destroyed. Confirm this ownership assumption.
+const DenseMatrix* getInputMatrix(FastText* fasttext) {
+  assert(fasttext);
+  std::shared_ptr<const DenseMatrix> mm = fasttext->getInputMatrix();
+  return mm.get();
+}
+
+// Expose the model's output matrix to JS; same lifetime caveat as above.
+const DenseMatrix* getOutputMatrix(FastText* fasttext) {
+  assert(fasttext);
+  std::shared_ptr<const DenseMatrix> mm = fasttext->getOutputMatrix();
+  return mm.get();
+}
+
+// Collect every token of the given entry type together with its frequency.
+// `getter` maps a dictionary index to its token string (Dictionary::getWord
+// or Dictionary::getLabel). Frequencies are narrowed from int64 to int32
+// because embind cannot expose 64-bit integers to JS.
+std::pair<std::vector<std::string>, std::vector<int32_t>> getTokens(
+    const FastText& fasttext,
+    const std::function<std::string(const Dictionary&, int32_t)> getter,
+    entry_type entryType) {
+  std::vector<std::string> tokens;
+  std::vector<int32_t> retVocabFrequencies;
+  std::shared_ptr<const Dictionary> d = fasttext.getDictionary();
+  std::vector<int64_t> vocabFrequencies = d->getCounts(entryType);
+  // Reserve up front, and use a size_t index to avoid the signed/unsigned
+  // comparison the original int32 loop counter produced.
+  tokens.reserve(vocabFrequencies.size());
+  retVocabFrequencies.reserve(vocabFrequencies.size());
+  for (size_t i = 0; i < vocabFrequencies.size(); i++) {
+    tokens.push_back(getter(*d, static_cast<int32_t>(i)));
+    retVocabFrequencies.push_back(static_cast<int32_t>(vocabFrequencies[i]));
+  }
+  return std::pair<std::vector<std::string>, std::vector<int32_t>>(
+      tokens, retVocabFrequencies);
+}
+
+// Return all dictionary words with their frequencies (no subwords).
+std::pair<std::vector<std::string>, std::vector<int32_t>> getWords(
+    FastText* fasttext) {
+  assert(fasttext);
+  return getTokens(*fasttext, &Dictionary::getWord, entry_type::word);
+}
+
+// Return all dictionary labels with their frequencies.
+std::pair<std::vector<std::string>, std::vector<int32_t>> getLabels(
+    FastText* fasttext) {
+  assert(fasttext);
+  return getTokens(*fasttext, &Dictionary::getLabel, entry_type::label);
+}
+
+// Tokenize one line of `text` into (words, labels) using the model's
+// dictionary. Unknown words are kept; labels are kept only when they exist
+// in the dictionary (wid >= 0). Reading stops at the EOS token.
+std::pair<std::vector<std::string>, std::vector<std::string>> getLine(
+    FastText* fasttext,
+    const std::string text) {
+  assert(fasttext);
+  std::shared_ptr<const Dictionary> d = fasttext->getDictionary();
+  std::stringstream ioss(text);
+  std::string token;
+  std::vector<std::string> words;
+  std::vector<std::string> labels;
+  while (d->readWord(ioss, token)) {
+    uint32_t h = d->hash(token);
+    int32_t wid = d->getId(token, h);
+    // Tokens not in the dictionary are typed from the token string itself;
+    // known tokens from their dictionary entry.
+    entry_type type = wid < 0 ? d->getType(token) : d->getType(wid);
+
+    if (type == entry_type::word) {
+      words.push_back(token);
+    } else if (type == entry_type::label && wid >= 0) {
+      labels.push_back(token);
+    }
+    if (token == Dictionary::EOS)
+      break;
+  }
+  return std::pair<std::vector<std::string>, std::vector<std::string>>(
+      words, labels);
+}
+
+// Evaluate the model on a labeled test file (already staged in the wasm FS)
+// and return the resulting Meter (precision/recall/f1/nexamples).
+Meter test(
+    FastText* fasttext,
+    const std::string& filename,
+    int32_t k,
+    float threshold) {
+  assert(fasttext);
+  std::ifstream input(filename);
+  if (!input.is_open()) {
+    throw std::invalid_argument("Test file cannot be opened!");
+  }
+  Meter meter;
+  fasttext->test(input, k, threshold, meter);
+  input.close();
+  return meter;
+}
+
+// Embind registrations exposing fastText types, enums and helper functions
+// to JavaScript.
+EMSCRIPTEN_BINDINGS(fasttext) {
+  class_<Args>("Args")
+      .constructor<>()
+      .property("input", &Args::input)
+      .property("output", &Args::output)
+      .property("lr", &Args::lr)
+      .property("lrUpdateRate", &Args::lrUpdateRate)
+      .property("dim", &Args::dim)
+      .property("ws", &Args::ws)
+      .property("epoch", &Args::epoch)
+      .property("minCount", &Args::minCount)
+      .property("minCountLabel", &Args::minCountLabel)
+      .property("neg", &Args::neg)
+      .property("wordNgrams", &Args::wordNgrams)
+      .property("loss", &Args::loss)
+      .property("model", &Args::model)
+      .property("bucket", &Args::bucket)
+      .property("minn", &Args::minn)
+      .property("maxn", &Args::maxn)
+      .property("thread", &Args::thread)
+      .property("t", &Args::t)
+      .property("label", &Args::label)
+      .property("verbose", &Args::verbose)
+      .property("pretrainedVectors", &Args::pretrainedVectors)
+      .property("saveOutput", &Args::saveOutput)
+      .property("seed", &Args::seed)
+      .property("qout", &Args::qout)
+      .property("retrain", &Args::retrain)
+      // Fix: "qnorm" was registered twice; the duplicate is removed.
+      .property("qnorm", &Args::qnorm)
+      .property("cutoff", &Args::cutoff)
+      .property("dsub", &Args::dsub)
+      .property("autotuneValidationFile", &Args::autotuneValidationFile)
+      .property("autotuneMetric", &Args::autotuneMetric)
+      .property("autotunePredictions", &Args::autotunePredictions)
+      .property("autotuneDuration", &Args::autotuneDuration)
+      .property("autotuneModelSize", &Args::autotuneModelSize);
+
+  class_<FastText>("FastText")
+      .constructor<>()
+      .function(
+          "loadModel",
+          select_overload<void(const std::string&)>(&FastText::loadModel))
+      .function(
+          "getNN",
+          select_overload<std::vector<std::pair<real, std::string>>(
+              const std::string& word, int32_t k)>(&FastText::getNN))
+      .function("getAnalogies", &FastText::getAnalogies)
+      .function("getWordId", &FastText::getWordId)
+      .function("getSubwordId", &FastText::getSubwordId)
+      .function("getInputMatrix", &getInputMatrix, allow_raw_pointers())
+      .function("getOutputMatrix", &getOutputMatrix, allow_raw_pointers())
+      .function("getWords", &getWords, allow_raw_pointers())
+      .function("getLabels", &getLabels, allow_raw_pointers())
+      .function("getLine", &getLine, allow_raw_pointers())
+      .function("test", &test, allow_raw_pointers())
+      .function("predict", &predict, allow_raw_pointers())
+      .function("getWordVector", &getWordVector, allow_raw_pointers())
+      .function("getSentenceVector", &getSentenceVector, allow_raw_pointers())
+      .function("getSubwords", &getSubwords, allow_raw_pointers())
+      .function("getInputVector", &getInputVector, allow_raw_pointers())
+      .function("train", &train, allow_raw_pointers())
+      .function("saveModel", &FastText::saveModel)
+      .property("isQuant", &FastText::isQuant)
+      .property("args", &FastText::getArgs);
+
+  class_<DenseMatrix>("DenseMatrix")
+      .constructor<>()
+      // we return int32_t because "JS can't represent int64s"
+      .function(
+          "rows",
+          optional_override(
+              [](const DenseMatrix* self) -> int32_t { return self->rows(); }),
+          allow_raw_pointers())
+      .function(
+          "cols",
+          optional_override(
+              [](const DenseMatrix* self) -> int32_t { return self->cols(); }),
+          allow_raw_pointers())
+      .function(
+          "at",
+          optional_override(
+              [](const DenseMatrix* self, int32_t i, int32_t j) -> const float {
+                return self->at(i, j);
+              }),
+          allow_raw_pointers());
+
+  class_<Meter>("Meter")
+      .constructor<>()
+      .property(
+          "precision", select_overload<double(void) const>(&Meter::precision))
+      .property("recall", select_overload<double(void) const>(&Meter::recall))
+      .property("f1Score", select_overload<double(void) const>(&Meter::f1Score))
+      .function(
+          "nexamples",
+          optional_override(
+              [](const Meter* self) -> int32_t { return self->nexamples(); }),
+          allow_raw_pointers());
+
+  enum_<model_name>("ModelName")
+      .value("cbow", model_name::cbow)
+      .value("skipgram", model_name::sg)
+      .value("supervised", model_name::sup);
+
+  enum_<loss_name>("LossName")
+      .value("hs", loss_name::hs)
+      .value("ns", loss_name::ns)
+      .value("softmax", loss_name::softmax)
+      .value("ova", loss_name::ova);
+
+  emscripten::value_object<Float32ArrayBridge>("Float32ArrayBridge")
+      .field("ptr", &Float32ArrayBridge::ptr)
+      .field("size", &Float32ArrayBridge::size);
+
+  emscripten::value_array<std::pair<float, std::string>>(
+      "std::pair<float, std::string>")
+      .element(&std::pair<float, std::string>::first)
+      .element(&std::pair<float, std::string>::second);
+
+  emscripten::register_vector<std::pair<float, std::string>>(
+      "std::vector<std::pair<float, std::string>>");
+
+  emscripten::value_array<
+      std::pair<std::vector<std::string>, std::vector<int32_t>>>(
+      "std::pair<std::vector<std::string>, std::vector<int32_t>>")
+      .element(
+          &std::pair<std::vector<std::string>, std::vector<int32_t>>::first)
+      .element(
+          &std::pair<std::vector<std::string>, std::vector<int32_t>>::second);
+
+  emscripten::value_array<
+      std::pair<std::vector<std::string>, std::vector<std::string>>>(
+      "std::pair<std::vector<std::string>, std::vector<std::string>>")
+      .element(
+          &std::pair<std::vector<std::string>, std::vector<std::string>>::first)
+      .element(&std::pair<std::vector<std::string>, std::vector<std::string>>::
+                   second);
+
+  emscripten::register_vector<float>("std::vector<float>");
+
+  emscripten::register_vector<int32_t>("std::vector<int32_t>");
+
+  emscripten::register_vector<std::string>("std::vector<std::string>");
+}