Преглед изворни кода

Separating loss and gradient computation from model

Summary:
This commit splits the computation of the loss and the subsequent gradient into different classes. Each Loss class implements its own logic and contains the underlying data needed for the computation.

There is a behavioural change:
- `NegativeSampling` now uses the sigmoid output instead of the softmax output for prediction as well. Before this commit, it used sigmoid for training and softmax for prediction.

We are passing a lot of information to the `Loss` classes. There are two things we should think about next:
- `State` class
- `Model` classes instead of `Loss` classes

Reviewed By: EdouardGrave

Differential Revision: D13359871

fbshipit-source-id: 2f53eaafb800a9a2742817aa113af5f6bd7e282e
Onur Çelebi пре 7 година
родитељ
комит
9ddcabd04f
13 измењених фајлова са 627 додато и 407 уклоњено
  1. 2 0
      CMakeLists.txt
  2. 4 1
      Makefile
  3. 32 11
      src/fasttext.cc
  4. 2 1
      src/fasttext.h
  5. 361 0
      src/loss.cc
  6. 176 0
      src/loss.h
  7. 1 1
      src/meter.cc
  8. 2 3
      src/meter.h
  9. 24 311
      src/model.cc
  10. 15 68
      src/model.h
  11. 4 0
      src/utils.h
  12. 0 7
      src/vector.cc
  13. 4 4
      src/vector.h

+ 2 - 0
CMakeLists.txt

@@ -22,6 +22,7 @@ set(HEADER_FILES
     src/densematrix.h
     src/dictionary.h
     src/fasttext.h
+    src/loss.h
     src/matrix.h
     src/meter.h
     src/model.h
@@ -36,6 +37,7 @@ set(SOURCE_FILES
     src/densematrix.cc
     src/dictionary.cc
     src/fasttext.cc
+    src/loss.cc
     src/main.cc
     src/matrix.cc
     src/meter.cc

+ 4 - 1
Makefile

@@ -8,7 +8,7 @@
 
 CXX = c++
 CXXFLAGS = -pthread -std=c++0x -march=native
-OBJS = args.o matrix.o dictionary.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
+OBJS = args.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
 INCLUDES = -I.
 
 opt: CXXFLAGS += -O3 -funroll-loops
@@ -29,6 +29,9 @@ matrix.o: src/matrix.cc src/matrix.h
 dictionary.o: src/dictionary.cc src/dictionary.h src/args.h
 	$(CXX) $(CXXFLAGS) -c src/dictionary.cc
 
+loss.o: src/loss.cc src/loss.h src/basematrix.h src/real.h
+	$(CXX) $(CXXFLAGS) -c src/loss.cc
+
 productquantizer.o: src/productquantizer.cc src/productquantizer.h src/utils.h
 	$(CXX) $(CXXFLAGS) -c src/productquantizer.cc
 

+ 32 - 11
src/fasttext.cc

@@ -7,6 +7,7 @@
  */
 
 #include "fasttext.h"
+#include "loss.h"
 #include "quantmatrix.h"
 
 #include <algorithm>
@@ -28,6 +29,24 @@ bool comparePairs(
     const std::pair<real, std::string>& l,
     const std::pair<real, std::string>& r);
 
+std::shared_ptr<Loss> FastText::createLoss(std::shared_ptr<Matrix>& output) {
+  loss_name lossName = args_->loss;
+  switch (lossName) {
+    case loss_name::hs:
+      return std::make_shared<HierarchicalSoftmaxLoss>(
+          output, getTargetCounts());
+    case loss_name::ns:
+      return std::make_shared<NegativeSamplingLoss>(
+          output, args_->neg, getTargetCounts());
+    case loss_name::softmax:
+      return std::make_shared<SoftmaxLoss>(output);
+    case loss_name::ova:
+      return std::make_shared<OneVsAllLoss>(output);
+    default:
+      throw std::runtime_error("Unknown loss");
+  }
+}
+
 FastText::FastText() : quant_(false), wordVectors_(nullptr) {}
 
 void FastText::addInputVector(Vector& vec, int32_t ind) const {
@@ -237,8 +256,8 @@ void FastText::loadModel(std::istream& in) {
   }
   output_->load(in);
 
-  model_ =
-      std::make_shared<Model>(input_, output_, args_, getTargetCounts(), 0);
+  auto loss = createLoss(output_);
+  model_ = std::make_shared<Model>(input_, output_, args_, loss, 0);
 }
 
 void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
@@ -297,7 +316,6 @@ void FastText::quantize(const Args& qargs) {
       std::dynamic_pointer_cast<DenseMatrix>(input_);
   std::shared_ptr<DenseMatrix> output =
       std::dynamic_pointer_cast<DenseMatrix>(output_);
-
   if (qargs.cutoff > 0 && qargs.cutoff < input->size(0)) {
     auto idx = selectEmbeddings(qargs.cutoff);
     dict_->prune(idx);
@@ -314,6 +332,8 @@ void FastText::quantize(const Args& qargs) {
       args_->lr = qargs.lr;
       args_->thread = qargs.thread;
       args_->verbose = qargs.verbose;
+      auto loss = createLoss(output_);
+      model_ = std::make_shared<Model>(input, output, args_, loss, 0);
       startThreads();
     }
   }
@@ -327,8 +347,8 @@ void FastText::quantize(const Args& qargs) {
   }
 
   quant_ = true;
-  model_ =
-      std::make_shared<Model>(input_, output_, args_, getTargetCounts(), 0);
+  auto loss = createLoss(output_);
+  model_ = std::make_shared<Model>(input_, output_, args_, loss, 0);
 }
 
 void FastText::supervised(
@@ -393,7 +413,7 @@ void FastText::test(std::istream& in, int32_t k, real threshold, Meter& meter)
     const {
   std::vector<int32_t> line;
   std::vector<int32_t> labels;
-  std::vector<std::pair<real, int32_t>> predictions;
+  Predictions predictions;
 
   while (in.peek() != EOF) {
     line.clear();
@@ -411,7 +431,7 @@ void FastText::test(std::istream& in, int32_t k, real threshold, Meter& meter)
 void FastText::predict(
     int32_t k,
     const std::vector<int32_t>& words,
-    std::vector<std::pair<real, int32_t>>& predictions,
+    Predictions& predictions,
     real threshold) const {
   if (words.empty()) {
     return;
@@ -433,7 +453,7 @@ bool FastText::predictLine(
 
   std::vector<int32_t> words, labels;
   dict_->getLine(in, words, labels);
-  std::vector<std::pair<real, int32_t>> linePredictions;
+  Predictions linePredictions;
   predict(k, words, linePredictions, threshold);
   for (const auto& p : linePredictions) {
     predictions.push_back(
@@ -624,7 +644,8 @@ void FastText::trainThread(int32_t threadId) {
   std::ifstream ifs(args_->input);
   utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);
 
-  Model model(input_, output_, args_, getTargetCounts(), threadId);
+  assert(model_);
+  Model model(*model_, threadId);
 
   const int64_t ntokens = dict_->ntokens();
   int64_t localTokenCount = 0;
@@ -742,9 +763,9 @@ void FastText::train(const Args& args) {
     input_ = createRandomMatrix();
   }
   output_ = createTrainOutputMatrix();
+  auto loss = createLoss(output_);
+  model_ = std::make_shared<Model>(input_, output_, args_, loss, 0);
   startThreads();
-  model_ =
-      std::make_shared<Model>(input_, output_, args_, getTargetCounts(), 0);
 }
 
 void FastText::startThreads() {

+ 2 - 1
src/fasttext.h

@@ -60,6 +60,7 @@ class FastText {
   std::shared_ptr<Matrix> createRandomMatrix() const;
   std::shared_ptr<Matrix> createTrainOutputMatrix() const;
   std::vector<int64_t> getTargetCounts() const;
+  std::shared_ptr<Loss> createLoss(std::shared_ptr<Matrix>& output);
 
   bool quant_;
   int32_t version;
@@ -111,7 +112,7 @@ class FastText {
   void predict(
       int32_t k,
       const std::vector<int32_t>& words,
-      std::vector<std::pair<real, int32_t>>& predictions,
+      Predictions& predictions,
       real threshold = 0.0) const;
 
   bool predictLine(

+ 361 - 0
src/loss.cc

@@ -0,0 +1,361 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "loss.h"
+#include "utils.h"
+
+#include <cmath>
+
+namespace fasttext {
+
+constexpr int64_t SIGMOID_TABLE_SIZE = 512;
+constexpr int64_t MAX_SIGMOID = 8;
+constexpr int64_t LOG_TABLE_SIZE = 512;
+
+bool comparePairs(
+    const std::pair<real, int32_t>& l,
+    const std::pair<real, int32_t>& r) {
+  return l.first > r.first;
+}
+
+real std_log(real x) {
+  return std::log(x + 1e-5);
+}
+
+Loss::Loss(std::shared_ptr<Matrix>& wo) : wo_(wo) {
+  t_sigmoid_.reserve(SIGMOID_TABLE_SIZE + 1);
+  for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
+    real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
+    t_sigmoid_.push_back(1.0 / (1.0 + std::exp(-x)));
+  }
+
+  t_log_.reserve(LOG_TABLE_SIZE + 1);
+  for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
+    real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
+    t_log_.push_back(std::log(x));
+  }
+}
+
+real Loss::log(real x) const {
+  if (x > 1.0) {
+    return 0.0;
+  }
+  int64_t i = int64_t(x * LOG_TABLE_SIZE);
+  return t_log_[i];
+}
+
+real Loss::sigmoid(real x) const {
+  if (x < -MAX_SIGMOID) {
+    return 0.0;
+  } else if (x > MAX_SIGMOID) {
+    return 1.0;
+  } else {
+    int64_t i =
+        int64_t((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
+    return t_sigmoid_[i];
+  }
+}
+
+void Loss::predict(
+    int32_t k,
+    real threshold,
+    Predictions& heap,
+    const Vector& hidden,
+    Vector& output) const {
+  computeOutput(hidden, output);
+  findKBest(k, threshold, heap, output);
+  std::sort_heap(heap.begin(), heap.end(), comparePairs);
+}
+
+void Loss::findKBest(
+    int32_t k,
+    real threshold,
+    Predictions& heap,
+    Vector& output) const {
+  for (int32_t i = 0; i < output.size(); i++) {
+    if (output[i] < threshold) {
+      continue;
+    }
+    if (heap.size() == k && std_log(output[i]) < heap.front().first) {
+      continue;
+    }
+    heap.push_back(std::make_pair(std_log(output[i]), i));
+    std::push_heap(heap.begin(), heap.end(), comparePairs);
+    if (heap.size() > k) {
+      std::pop_heap(heap.begin(), heap.end(), comparePairs);
+      heap.pop_back();
+    }
+  }
+}
+
+BinaryLogisticLoss::BinaryLogisticLoss(std::shared_ptr<Matrix>& wo)
+    : Loss(wo) {}
+
+real BinaryLogisticLoss::binaryLogistic(
+    int32_t target,
+    const Vector& hidden,
+    Vector& grad,
+    bool labelIsPositive,
+    real lr,
+    bool backprop) const {
+  real score = sigmoid(wo_->dotRow(hidden, target));
+  if (backprop) {
+    real alpha = lr * (real(labelIsPositive) - score);
+    grad.addRow(*wo_, target, alpha);
+    wo_->addVectorToRow(hidden, target, alpha);
+  }
+  if (labelIsPositive) {
+    return -log(score);
+  } else {
+    return -log(1.0 - score);
+  }
+}
+
+void BinaryLogisticLoss::computeOutput(const Vector& hidden, Vector& output)
+    const {
+  output.mul(*wo_, hidden);
+  int32_t osz = output.size();
+  for (int32_t i = 0; i < osz; i++) {
+    output[i] = sigmoid(output[i]);
+  }
+}
+
+OneVsAllLoss::OneVsAllLoss(std::shared_ptr<Matrix>& wo)
+    : BinaryLogisticLoss(wo) {}
+
+real OneVsAllLoss::forward(
+    const std::vector<int32_t>& targets,
+    int32_t /* we take all targets here */,
+    const Vector& hidden,
+    Vector& output,
+    Vector& grad,
+    real lr,
+    std::minstd_rand& /*rng*/,
+    bool backprop) {
+  real loss = 0.0;
+  int32_t osz = output.size();
+  for (int32_t i = 0; i < osz; i++) {
+    bool isMatch = utils::contains(targets, i);
+    loss += binaryLogistic(i, hidden, grad, isMatch, lr, backprop);
+  }
+
+  return loss;
+}
+
+NegativeSamplingLoss::NegativeSamplingLoss(
+    std::shared_ptr<Matrix>& wo,
+    int neg,
+    const std::vector<int64_t>& targetCounts)
+    : BinaryLogisticLoss(wo), neg_(neg), negatives_(), uniform_() {
+  real z = 0.0;
+  for (size_t i = 0; i < targetCounts.size(); i++) {
+    z += pow(targetCounts[i], 0.5);
+  }
+  for (size_t i = 0; i < targetCounts.size(); i++) {
+    real c = pow(targetCounts[i], 0.5);
+    for (size_t j = 0; j < c * NegativeSamplingLoss::NEGATIVE_TABLE_SIZE / z;
+         j++) {
+      negatives_.push_back(i);
+    }
+  }
+  uniform_ = std::uniform_int_distribution<size_t>(0, negatives_.size() - 1);
+}
+
+real NegativeSamplingLoss::forward(
+    const std::vector<int32_t>& targets,
+    int32_t targetIndex,
+    const Vector& hidden,
+    Vector& /* output */,
+    Vector& grad,
+    real lr,
+    std::minstd_rand& rng,
+    bool backprop) {
+  assert(targetIndex >= 0);
+  assert(targetIndex < targets.size());
+  int32_t target = targets[targetIndex];
+  real loss = binaryLogistic(target, hidden, grad, true, lr, backprop);
+
+  for (int32_t n = 0; n < neg_; n++) {
+    auto negativeTarget = getNegative(target, rng);
+    loss += binaryLogistic(negativeTarget, hidden, grad, false, lr, backprop);
+  }
+  return loss;
+}
+
+int32_t NegativeSamplingLoss::getNegative(
+    int32_t target,
+    std::minstd_rand& rng) {
+  int32_t negative;
+  do {
+    negative = negatives_[uniform_(rng)];
+  } while (target == negative);
+  return negative;
+}
+
+HierarchicalSoftmaxLoss::HierarchicalSoftmaxLoss(
+    std::shared_ptr<Matrix>& wo,
+    const std::vector<int64_t>& targetCounts)
+    : BinaryLogisticLoss(wo),
+      paths_(),
+      codes_(),
+      tree_(),
+      osz_(targetCounts.size()) {
+  buildTree(targetCounts);
+}
+
+void HierarchicalSoftmaxLoss::buildTree(const std::vector<int64_t>& counts) {
+  tree_.resize(2 * osz_ - 1);
+  for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
+    tree_[i].parent = -1;
+    tree_[i].left = -1;
+    tree_[i].right = -1;
+    tree_[i].count = 1e15;
+    tree_[i].binary = false;
+  }
+  for (int32_t i = 0; i < osz_; i++) {
+    tree_[i].count = counts[i];
+  }
+  int32_t leaf = osz_ - 1;
+  int32_t node = osz_;
+  for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
+    int32_t mini[2] = {0};
+    for (int32_t j = 0; j < 2; j++) {
+      if (leaf >= 0 && tree_[leaf].count < tree_[node].count) {
+        mini[j] = leaf--;
+      } else {
+        mini[j] = node++;
+      }
+    }
+    tree_[i].left = mini[0];
+    tree_[i].right = mini[1];
+    tree_[i].count = tree_[mini[0]].count + tree_[mini[1]].count;
+    tree_[mini[0]].parent = i;
+    tree_[mini[1]].parent = i;
+    tree_[mini[1]].binary = true;
+  }
+  for (int32_t i = 0; i < osz_; i++) {
+    std::vector<int32_t> path;
+    std::vector<bool> code;
+    int32_t j = i;
+    while (tree_[j].parent != -1) {
+      path.push_back(tree_[j].parent - osz_);
+      code.push_back(tree_[j].binary);
+      j = tree_[j].parent;
+    }
+    paths_.push_back(path);
+    codes_.push_back(code);
+  }
+}
+
+real HierarchicalSoftmaxLoss::forward(
+    const std::vector<int32_t>& targets,
+    int32_t targetIndex,
+    const Vector& hidden,
+    Vector& /* the output is not an explicit Vector here */,
+    Vector& grad,
+    real lr,
+    std::minstd_rand& /*rng*/,
+    bool backprop) {
+  real loss = 0.0;
+  int32_t target = targets[targetIndex];
+  const std::vector<bool>& binaryCode = codes_[target];
+  const std::vector<int32_t>& pathToRoot = paths_[target];
+  for (int32_t i = 0; i < pathToRoot.size(); i++) {
+    loss += binaryLogistic(
+        pathToRoot[i], hidden, grad, binaryCode[i], lr, backprop);
+  }
+  return loss;
+}
+
+void HierarchicalSoftmaxLoss::predict(
+    int32_t k,
+    real threshold,
+    Predictions& heap,
+    const Vector& hidden,
+    Vector& /*output*/) const {
+  dfs(k, threshold, 2 * osz_ - 2, 0.0, heap, hidden);
+  std::sort_heap(heap.begin(), heap.end(), comparePairs);
+}
+
+void HierarchicalSoftmaxLoss::dfs(
+    int32_t k,
+    real threshold,
+    int32_t node,
+    real score,
+    Predictions& heap,
+    const Vector& hidden) const {
+  if (score < std_log(threshold)) {
+    return;
+  }
+  if (heap.size() == k && score < heap.front().first) {
+    return;
+  }
+
+  if (tree_[node].left == -1 && tree_[node].right == -1) {
+    heap.push_back(std::make_pair(score, node));
+    std::push_heap(heap.begin(), heap.end(), comparePairs);
+    if (heap.size() > k) {
+      std::pop_heap(heap.begin(), heap.end(), comparePairs);
+      heap.pop_back();
+    }
+    return;
+  }
+
+  real f = wo_->dotRow(hidden, node - osz_);
+  f = 1. / (1 + std::exp(-f));
+
+  dfs(k, threshold, tree_[node].left, score + std_log(1.0 - f), heap, hidden);
+  dfs(k, threshold, tree_[node].right, score + std_log(f), heap, hidden);
+}
+
+SoftmaxLoss::SoftmaxLoss(std::shared_ptr<Matrix>& wo) : Loss(wo) {}
+
+void SoftmaxLoss::computeOutput(const Vector& hidden, Vector& output) const {
+  output.mul(*wo_, hidden);
+  real max = output[0], z = 0.0;
+  int32_t osz = output.size();
+  for (int32_t i = 0; i < osz; i++) {
+    max = std::max(output[i], max);
+  }
+  for (int32_t i = 0; i < osz; i++) {
+    output[i] = exp(output[i] - max);
+    z += output[i];
+  }
+  for (int32_t i = 0; i < osz; i++) {
+    output[i] /= z;
+  }
+}
+
+real SoftmaxLoss::forward(
+    const std::vector<int32_t>& targets,
+    int32_t targetIndex,
+    const Vector& hidden,
+    Vector& output,
+    Vector& grad,
+    real lr,
+    std::minstd_rand& /*rng*/,
+    bool backprop) {
+  computeOutput(hidden, output);
+
+  assert(targetIndex >= 0);
+  assert(targetIndex < targets.size());
+  int32_t target = targets[targetIndex];
+
+  if (backprop) {
+    int32_t osz = wo_->size(0);
+    for (int32_t i = 0; i < osz; i++) {
+      real label = (i == target) ? 1.0 : 0.0;
+      real alpha = lr * (label - output[i]);
+      grad.addRow(*wo_, i, alpha);
+      wo_->addVectorToRow(hidden, i, alpha);
+    }
+  }
+  return -log(output[target]);
+};
+
+} // namespace fasttext

+ 176 - 0
src/loss.h

@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "matrix.h"
+#include "real.h"
+#include "utils.h"
+#include "vector.h"
+
+namespace fasttext {
+
+class Loss {
+ private:
+  void findKBest(int32_t, real, Predictions&, Vector&) const;
+
+ protected:
+  std::vector<real> t_sigmoid_;
+  std::vector<real> t_log_;
+  std::shared_ptr<Matrix>& wo_;
+
+  real log(real x) const;
+  real sigmoid(real x) const;
+
+ public:
+  explicit Loss(std::shared_ptr<Matrix>& wo);
+  virtual ~Loss() = default;
+
+  virtual real forward(
+      const std::vector<int32_t>& targets,
+      int32_t targetIndex,
+      const Vector& hidden,
+      Vector& output,
+      Vector& grad,
+      real lr,
+      std::minstd_rand& rng,
+      bool backprop) = 0;
+  virtual void computeOutput(const Vector& hidden, Vector& output) const = 0;
+
+  virtual void predict(
+      int32_t k,
+      real threshold,
+      Predictions& heap,
+      const Vector& hidden,
+      Vector& output) const;
+};
+
+class BinaryLogisticLoss : public Loss {
+ protected:
+  real binaryLogistic(
+      int32_t target,
+      const Vector& hidden,
+      Vector& grad,
+      bool labelIsPositive,
+      real lr,
+      bool backprop) const;
+  void computeOutput(const Vector& hidden, Vector& output) const override;
+
+ public:
+  explicit BinaryLogisticLoss(std::shared_ptr<Matrix>& wo);
+  virtual ~BinaryLogisticLoss() override = default;
+};
+
+class OneVsAllLoss : public BinaryLogisticLoss {
+ public:
+  explicit OneVsAllLoss(std::shared_ptr<Matrix>& wo);
+  ~OneVsAllLoss() = default;
+  real forward(
+      const std::vector<int32_t>& targets,
+      int32_t targetIndex,
+      const Vector& hidden,
+      Vector& output,
+      Vector& grad,
+      real lr,
+      std::minstd_rand& rng,
+      bool backprop) override;
+};
+
+class NegativeSamplingLoss : public BinaryLogisticLoss {
+ protected:
+  static const int32_t NEGATIVE_TABLE_SIZE = 10000000;
+
+  int neg_;
+  std::vector<int32_t> negatives_;
+  std::uniform_int_distribution<size_t> uniform_;
+  int32_t getNegative(int32_t target, std::minstd_rand& rng);
+
+ public:
+  explicit NegativeSamplingLoss(
+      std::shared_ptr<Matrix>& wo,
+      int neg,
+      const std::vector<int64_t>& targetCounts);
+  ~NegativeSamplingLoss() override = default;
+
+  real forward(
+      const std::vector<int32_t>& targets,
+      int32_t targetIndex,
+      const Vector& hidden,
+      Vector& output,
+      Vector& grad,
+      real lr,
+      std::minstd_rand& rng,
+      bool backprop) override;
+};
+
+class HierarchicalSoftmaxLoss : public BinaryLogisticLoss {
+ protected:
+  struct Node {
+    int32_t parent;
+    int32_t left;
+    int32_t right;
+    int64_t count;
+    bool binary;
+  };
+
+  std::vector<std::vector<int32_t>> paths_;
+  std::vector<std::vector<bool>> codes_;
+  std::vector<Node> tree_;
+  int32_t osz_;
+  void buildTree(const std::vector<int64_t>& counts);
+  void dfs(
+      int32_t k,
+      real threshold,
+      int32_t node,
+      real score,
+      Predictions& heap,
+      const Vector& hidden) const;
+
+ public:
+  explicit HierarchicalSoftmaxLoss(
+      std::shared_ptr<Matrix>& wo,
+      const std::vector<int64_t>& counts);
+  ~HierarchicalSoftmaxLoss() override = default;
+  real forward(
+      const std::vector<int32_t>& targets,
+      int32_t targetIndex,
+      const Vector& hidden,
+      Vector& output,
+      Vector& grad,
+      real lr,
+      std::minstd_rand& rng,
+      bool backprop) override;
+  void predict(
+      int32_t k,
+      real threshold,
+      Predictions& heap,
+      const Vector& hidden,
+      Vector& output) const override;
+};
+
+class SoftmaxLoss : public Loss {
+ public:
+  explicit SoftmaxLoss(std::shared_ptr<Matrix>& wo);
+  ~SoftmaxLoss() override = default;
+  real forward(
+      const std::vector<int32_t>& targets,
+      int32_t targetIndex,
+      const Vector& hidden,
+      Vector& output,
+      Vector& grad,
+      real lr,
+      std::minstd_rand& rng,
+      bool backprop) override;
+  void computeOutput(const Vector& hidden, Vector& output) const override;
+};
+
+} // namespace fasttext

+ 1 - 1
src/meter.cc

@@ -18,7 +18,7 @@ namespace fasttext {
 
 void Meter::log(
     const std::vector<int32_t>& labels,
-    const std::vector<std::pair<real, int32_t>>& predictions) {
+    const Predictions& predictions) {
   nexamples_++;
   metrics_.gold += labels.size();
   metrics_.predicted += predictions.size();

+ 2 - 3
src/meter.h

@@ -13,6 +13,7 @@
 
 #include "dictionary.h"
 #include "real.h"
+#include "utils.h"
 
 namespace fasttext {
 
@@ -38,9 +39,7 @@ class Meter {
  public:
   Meter() : metrics_(), nexamples_(0), labelMetrics_() {}
 
-  void log(
-      const std::vector<int32_t>& labels,
-      const std::vector<std::pair<real, int32_t>>& predictions);
+  void log(const std::vector<int32_t>& labels, const Predictions& predictions);
 
   double precision(int32_t);
   double recall(int32_t);

+ 24 - 311
src/model.cc

@@ -15,119 +15,36 @@
 
 namespace fasttext {
 
-constexpr int64_t SIGMOID_TABLE_SIZE = 512;
-constexpr int64_t MAX_SIGMOID = 8;
-constexpr int64_t LOG_TABLE_SIZE = 512;
-
 Model::Model(
     std::shared_ptr<Matrix> wi,
     std::shared_ptr<Matrix> wo,
     std::shared_ptr<Args> args,
-    const std::vector<int64_t>& targetCounts,
+    std::shared_ptr<Loss> loss,
     int32_t seed)
     : hidden_(args->dim), output_(wo->size(0)), grad_(args->dim), rng(seed) {
   wi_ = wi;
   wo_ = wo;
   args_ = args;
+  loss_ = loss;
   osz_ = wo->size(0);
   hsz_ = args->dim;
-  negpos = 0;
-  loss_ = 0.0;
+  lossValue_ = 0.0;
   nexamples_ = 1;
-  t_sigmoid_.reserve(SIGMOID_TABLE_SIZE + 1);
-  t_log_.reserve(LOG_TABLE_SIZE + 1);
-  initSigmoid();
-  initLog();
-  setTargetCounts(targetCounts);
-}
-
-real Model::binaryLogistic(int32_t target, bool label, real lr) {
-  real score = sigmoid(wo_->dotRow(hidden_, target));
-  real alpha = lr * (real(label) - score);
-  grad_.addRow(*wo_, target, alpha);
-  wo_->addVectorToRow(hidden_, target, alpha);
-  if (label) {
-    return -log(score);
-  } else {
-    return -log(1.0 - score);
-  }
-}
-
-real Model::negativeSampling(int32_t target, real lr) {
-  real loss = 0.0;
-  grad_.zero();
-  for (int32_t n = 0; n <= args_->neg; n++) {
-    if (n == 0) {
-      loss += binaryLogistic(target, true, lr);
-    } else {
-      loss += binaryLogistic(getNegative(target), false, lr);
-    }
-  }
-  return loss;
-}
-
-real Model::hierarchicalSoftmax(int32_t target, real lr) {
-  real loss = 0.0;
-  grad_.zero();
-  const std::vector<bool>& binaryCode = codes[target];
-  const std::vector<int32_t>& pathToRoot = paths[target];
-  for (int32_t i = 0; i < pathToRoot.size(); i++) {
-    loss += binaryLogistic(pathToRoot[i], binaryCode[i], lr);
-  }
-  return loss;
 }
 
-void Model::computeOutput(Vector& hidden, Vector& output) const {
-  output.mul(*wo_, hidden);
-}
-
-void Model::computeOutputSigmoid(Vector& hidden, Vector& output) const {
-  computeOutput(hidden, output);
-  for (int32_t i = 0; i < osz_; i++) {
-    output[i] = sigmoid(output[i]);
-  }
-}
-
-void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const {
-  computeOutput(hidden, output);
-  real max = output[0], z = 0.0;
-  for (int32_t i = 0; i < osz_; i++) {
-    max = std::max(output[i], max);
-  }
-  for (int32_t i = 0; i < osz_; i++) {
-    output[i] = exp(output[i] - max);
-    z += output[i];
-  }
-  for (int32_t i = 0; i < osz_; i++) {
-    output[i] /= z;
-  }
-}
-
-void Model::computeOutputSoftmax() {
-  computeOutputSoftmax(hidden_, output_);
-}
-
-real Model::softmax(int32_t target, real lr) {
-  grad_.zero();
-  computeOutputSoftmax();
-  for (int32_t i = 0; i < osz_; i++) {
-    real label = (i == target) ? 1.0 : 0.0;
-    real alpha = lr * (label - output_[i]);
-    grad_.addRow(*wo_, i, alpha);
-    wo_->addVectorToRow(hidden_, i, alpha);
-  }
-  return -log(output_[target]);
-}
-
-real Model::oneVsAll(const std::vector<int32_t>& targets, real lr) {
-  real loss = 0.0;
-  for (int32_t i = 0; i < osz_; i++) {
-    bool isMatch = utils::contains(targets, i);
-    loss += binaryLogistic(i, isMatch, lr);
-  }
-
-  return loss;
-}
+Model::Model(const Model& other, int32_t seed)
+    : wi_(other.wi_),
+      wo_(other.wo_),
+      args_(other.args_),
+      loss_(other.loss_),
+      hidden_(other.hidden_),
+      output_(other.output_),
+      grad_(other.grad_),
+      hsz_(other.hsz_),
+      osz_(other.osz_),
+      lossValue_(other.lossValue_),
+      nexamples_(other.nexamples_),
+      rng(seed) {}
 
 void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden)
     const {
@@ -139,17 +56,11 @@ void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden)
   hidden.mul(1.0 / input.size());
 }
 
-bool Model::comparePairs(
-    const std::pair<real, int32_t>& l,
-    const std::pair<real, int32_t>& r) {
-  return l.first > r.first;
-}
-
 void Model::predict(
     const std::vector<int32_t>& input,
     int32_t k,
     real threshold,
-    std::vector<std::pair<real, int32_t>>& heap,
+    Predictions& heap,
     Vector& hidden,
     Vector& output) const {
   if (k == Model::kUnlimitedPredictions) {
@@ -162,101 +73,18 @@ void Model::predict(
   }
   heap.reserve(k + 1);
   computeHidden(input, hidden);
-  if (args_->loss == loss_name::hs) {
-    dfs(k, threshold, 2 * osz_ - 2, 0.0, heap, hidden);
-  } else {
-    findKBest(k, threshold, heap, hidden, output);
-  }
-  std::sort_heap(heap.begin(), heap.end(), comparePairs);
+
+  loss_->predict(k, threshold, heap, hidden, output);
 }
 
 void Model::predict(
     const std::vector<int32_t>& input,
     int32_t k,
     real threshold,
-    std::vector<std::pair<real, int32_t>>& heap) {
+    Predictions& heap) {
   predict(input, k, threshold, heap, hidden_, output_);
 }
 
-void Model::findKBest(
-    int32_t k,
-    real threshold,
-    std::vector<std::pair<real, int32_t>>& heap,
-    Vector& hidden,
-    Vector& output) const {
-  if (args_->loss == loss_name::ova) {
-    computeOutputSigmoid(hidden, output);
-  } else {
-    computeOutputSoftmax(hidden, output);
-  }
-  for (int32_t i = 0; i < osz_; i++) {
-    if (output[i] < threshold) {
-      continue;
-    }
-    if (heap.size() == k && std_log(output[i]) < heap.front().first) {
-      continue;
-    }
-    heap.push_back(std::make_pair(std_log(output[i]), i));
-    std::push_heap(heap.begin(), heap.end(), comparePairs);
-    if (heap.size() > k) {
-      std::pop_heap(heap.begin(), heap.end(), comparePairs);
-      heap.pop_back();
-    }
-  }
-}
-
-void Model::dfs(
-    int32_t k,
-    real threshold,
-    int32_t node,
-    real score,
-    std::vector<std::pair<real, int32_t>>& heap,
-    Vector& hidden) const {
-  if (score < std_log(threshold)) {
-    return;
-  }
-  if (heap.size() == k && score < heap.front().first) {
-    return;
-  }
-
-  if (tree[node].left == -1 && tree[node].right == -1) {
-    heap.push_back(std::make_pair(score, node));
-    std::push_heap(heap.begin(), heap.end(), comparePairs);
-    if (heap.size() > k) {
-      std::pop_heap(heap.begin(), heap.end(), comparePairs);
-      heap.pop_back();
-    }
-    return;
-  }
-
-  real f = wo_->dotRow(hidden, node - osz_);
-  f = 1. / (1 + std::exp(-f));
-
-  dfs(k, threshold, tree[node].left, score + std_log(1.0 - f), heap, hidden);
-  dfs(k, threshold, tree[node].right, score + std_log(f), heap, hidden);
-}
-
-real Model::computeLoss(
-    const std::vector<int32_t>& targets,
-    int32_t targetIndex,
-    real lr) {
-  real loss = 0.0;
-
-  if (args_->loss == loss_name::ns) {
-    loss = negativeSampling(targets[targetIndex], lr);
-  } else if (args_->loss == loss_name::hs) {
-    loss = hierarchicalSoftmax(targets[targetIndex], lr);
-  } else if (args_->loss == loss_name::softmax) {
-    loss = softmax(targets[targetIndex], lr);
-  } else if (args_->loss == loss_name::ova) {
-    loss = oneVsAll(targets, lr);
-  } else {
-    throw std::invalid_argument("Unhandled loss function for this model.");
-  }
-
-  return loss;
-}
-
 void Model::update(
     const std::vector<int32_t>& input,
     const std::vector<int32_t>& targets,
@@ -267,13 +95,9 @@ void Model::update(
   }
   computeHidden(input, hidden_);
 
-  if (targetIndex == kAllLabelsAsTarget) {
-    loss_ += computeLoss(targets, -1, lr);
-  } else {
-    assert(targetIndex >= 0);
-    assert(targetIndex < targets.size());
-    loss_ += computeLoss(targets, targetIndex, lr);
-  }
+  grad_.zero();
+  lossValue_ += loss_->forward(
+      targets, targetIndex, hidden_, output_, grad_, lr, rng, true);
 
   nexamples_ += 1;
 
@@ -285,123 +109,12 @@ void Model::update(
   }
 }
 
-void Model::setTargetCounts(const std::vector<int64_t>& counts) {
-  assert(counts.size() == osz_);
-  if (args_->loss == loss_name::ns) {
-    initTableNegatives(counts);
-  }
-  if (args_->loss == loss_name::hs) {
-    buildTree(counts);
-  }
-}
-
-void Model::initTableNegatives(const std::vector<int64_t>& counts) {
-  real z = 0.0;
-  for (size_t i = 0; i < counts.size(); i++) {
-    z += pow(counts[i], 0.5);
-  }
-  for (size_t i = 0; i < counts.size(); i++) {
-    real c = pow(counts[i], 0.5);
-    for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) {
-      negatives_.push_back(i);
-    }
-  }
-  std::shuffle(negatives_.begin(), negatives_.end(), rng);
-}
-
-int32_t Model::getNegative(int32_t target) {
-  int32_t negative;
-  do {
-    negative = negatives_[negpos];
-    negpos = (negpos + 1) % negatives_.size();
-  } while (target == negative);
-  return negative;
-}
-
-void Model::buildTree(const std::vector<int64_t>& counts) {
-  tree.resize(2 * osz_ - 1);
-  for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
-    tree[i].parent = -1;
-    tree[i].left = -1;
-    tree[i].right = -1;
-    tree[i].count = 1e15;
-    tree[i].binary = false;
-  }
-  for (int32_t i = 0; i < osz_; i++) {
-    tree[i].count = counts[i];
-  }
-  int32_t leaf = osz_ - 1;
-  int32_t node = osz_;
-  for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
-    int32_t mini[2];
-    for (int32_t j = 0; j < 2; j++) {
-      if (leaf >= 0 && tree[leaf].count < tree[node].count) {
-        mini[j] = leaf--;
-      } else {
-        mini[j] = node++;
-      }
-    }
-    tree[i].left = mini[0];
-    tree[i].right = mini[1];
-    tree[i].count = tree[mini[0]].count + tree[mini[1]].count;
-    tree[mini[0]].parent = i;
-    tree[mini[1]].parent = i;
-    tree[mini[1]].binary = true;
-  }
-  for (int32_t i = 0; i < osz_; i++) {
-    std::vector<int32_t> path;
-    std::vector<bool> code;
-    int32_t j = i;
-    while (tree[j].parent != -1) {
-      path.push_back(tree[j].parent - osz_);
-      code.push_back(tree[j].binary);
-      j = tree[j].parent;
-    }
-    paths.push_back(path);
-    codes.push_back(code);
-  }
-}
-
 real Model::getLoss() const {
-  return loss_ / nexamples_;
-}
-
-void Model::initSigmoid() {
-  for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
-    real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
-    t_sigmoid_.push_back(1.0 / (1.0 + std::exp(-x)));
-  }
-}
-
-void Model::initLog() {
-  for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
-    real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
-    t_log_.push_back(std::log(x));
-  }
-}
-
-real Model::log(real x) const {
-  if (x > 1.0) {
-    return 0.0;
-  }
-  int64_t i = int64_t(x * LOG_TABLE_SIZE);
-  return t_log_[i];
+  return lossValue_ / nexamples_;
 }
 
 real Model::std_log(real x) const {
   return std::log(x + 1e-5);
 }
 
-real Model::sigmoid(real x) const {
-  if (x < -MAX_SIGMOID) {
-    return 0.0;
-  } else if (x > MAX_SIGMOID) {
-    return 1.0;
-  } else {
-    int64_t i =
-        int64_t((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
-    return t_sigmoid_[i];
-  }
-}
-
 } // namespace fasttext

+ 15 - 68
src/model.h

@@ -14,109 +14,56 @@
 #include <vector>
 
 #include "args.h"
+#include "loss.h"
 #include "matrix.h"
 #include "real.h"
 #include "vector.h"
 
 namespace fasttext {
 
-struct Node {
-  int32_t parent;
-  int32_t left;
-  int32_t right;
-  int64_t count;
-  bool binary;
-};
-
 class Model {
  protected:
   std::shared_ptr<Matrix> wi_;
   std::shared_ptr<Matrix> wo_;
   std::shared_ptr<Args> args_;
+  std::shared_ptr<Loss> loss_;
   Vector hidden_;
   Vector output_;
   Vector grad_;
   int32_t hsz_;
   int32_t osz_;
-  real loss_;
+  real lossValue_;
   int64_t nexamples_;
-  std::vector<real> t_sigmoid_;
-  std::vector<real> t_log_;
-  // used for negative sampling:
-  std::vector<int32_t> negatives_;
-  size_t negpos;
-  // used for hierarchical softmax:
-  std::vector<std::vector<int32_t>> paths;
-  std::vector<std::vector<bool>> codes;
-  std::vector<Node> tree;
-
-  static bool comparePairs(
-      const std::pair<real, int32_t>&,
-      const std::pair<real, int32_t>&);
-
-  int32_t getNegative(int32_t target);
-  void initSigmoid();
-  void initLog();
-  void computeOutput(Vector&, Vector&) const;
-  void setTargetCounts(const std::vector<int64_t>&);
-
-  static const int32_t NEGATIVE_TABLE_SIZE = 10000000;
 
  public:
   Model(
-      std::shared_ptr<Matrix>,
-      std::shared_ptr<Matrix>,
-      std::shared_ptr<Args>,
-      const std::vector<int64_t>&,
-      int32_t);
-
-  real binaryLogistic(int32_t, bool, real);
-  real negativeSampling(int32_t, real);
-  real hierarchicalSoftmax(int32_t, real);
-  real softmax(int32_t, real);
-  real oneVsAll(const std::vector<int32_t>&, real);
+      std::shared_ptr<Matrix> wi,
+      std::shared_ptr<Matrix> wo,
+      std::shared_ptr<Args> args,
+      std::shared_ptr<Loss> loss,
+      int32_t seed);
+  Model(const Model& model, int32_t seed);
+  Model(const Model& model) = delete;
+  Model(Model&& model) = delete;
+  Model& operator=(const Model& other) = delete;
+  Model& operator=(Model&& other) = delete;
 
   void predict(
       const std::vector<int32_t>&,
       int32_t,
       real,
-      std::vector<std::pair<real, int32_t>>&,
-      Vector&,
-      Vector&) const;
-  void predict(
-      const std::vector<int32_t>&,
-      int32_t,
-      real,
-      std::vector<std::pair<real, int32_t>>&);
-  void dfs(
-      int32_t,
-      real,
-      int32_t,
-      real,
-      std::vector<std::pair<real, int32_t>>&,
-      Vector&) const;
-  void findKBest(
-      int32_t,
-      real,
-      std::vector<std::pair<real, int32_t>>&,
+      Predictions&,
       Vector&,
       Vector&) const;
+  void predict(const std::vector<int32_t>&, int32_t, real, Predictions&);
   void update(
       const std::vector<int32_t>&,
       const std::vector<int32_t>&,
       int32_t,
       real);
-  real computeLoss(const std::vector<int32_t>&, int32_t, real);
   void computeHidden(const std::vector<int32_t>&, Vector&) const;
-  void computeOutputSigmoid(Vector&, Vector&) const;
-  void computeOutputSoftmax(Vector&, Vector&) const;
-  void computeOutputSoftmax();
 
-  void initTableNegatives(const std::vector<int64_t>&);
-  void buildTree(const std::vector<int64_t>&);
   real getLoss() const;
-  real sigmoid(real) const;
-  real log(real) const;
   real std_log(real) const;
 
   std::minstd_rand rng;

+ 4 - 0
src/utils.h

@@ -8,6 +8,8 @@
 
 #pragma once
 
+#include "real.h"
+
 #include <algorithm>
 #include <fstream>
 #include <vector>
@@ -22,6 +24,8 @@
 
 namespace fasttext {
 
+using Predictions = std::vector<std::pair<real, int32_t>>;
+
 namespace utils {
 
 int64_t size(std::ifstream&);

+ 0 - 7
src/vector.cc

@@ -20,13 +20,6 @@ namespace fasttext {
 
 Vector::Vector(int64_t m) : data_(m) {}
 
-Vector::Vector(Vector&& other) noexcept : data_(std::move(other.data_)) {}
-
-Vector& Vector::operator=(Vector&& other) {
-  data_ = std::move(other.data_);
-  return *this;
-}
-
 void Vector::zero() {
   std::fill(data_.begin(), data_.end(), 0.0);
 }

+ 4 - 4
src/vector.h

@@ -24,10 +24,10 @@ class Vector {
 
  public:
   explicit Vector(int64_t);
-  Vector(const Vector&) = delete;
-  Vector(Vector&&) noexcept;
-  Vector& operator=(const Vector&) = delete;
-  Vector& operator=(Vector&&);
+  Vector(const Vector&) = default;
+  Vector(Vector&&) noexcept = default;
+  Vector& operator=(const Vector&) = default;
+  Vector& operator=(Vector&&) = default;
 
   inline real* data() {
     return data_.data();