2 lat temu · b733943e84
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,13 +9,15 @@
 
				 cmake_minimum_required(VERSION 2.8.9)
			
 
				 project(fasttext)
			
 
				 
			
 
				+set(CMAKE_CXX_STANDARD 17)
			
 
				+
			
 
				 # The version number.
			
 
				 set (fasttext_VERSION_MAJOR 0)
			
 
				 set (fasttext_VERSION_MINOR 1)
			
 
				 
			
 
				 include_directories(fasttext)
			
 
				 
			
 
				-set(CMAKE_CXX_FLAGS " -pthread -std=c++11 -funroll-loops -O3 -march=native")
			
 
				+set(CMAKE_CXX_FLAGS " -pthread -std=c++17 -funroll-loops -O3 -march=native")
			
 
				 
			
 
				 set(HEADER_FILES
			
 
				     src/args.h
			
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@
 
				 #
			
 
				 
			
 
				 CXX = c++
			
 
				-CXXFLAGS = -pthread -std=c++11 -march=native
			
 
				+CXXFLAGS = -pthread -std=c++17 -march=native
			
 
				 OBJS = args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
			
 
				 INCLUDES = -I.
			
 
				 
			
--- a/setup.py
+++ b/setup.py
@@ -98,15 +98,14 @@ def has_flag(compiler, flags):
 
				 
			
 
				 
			
 
				 def cpp_flag(compiler):
			
 
				-    """Return the -std=c++[11/14] compiler flag.
			
 
				-    The c++14 is preferred over c++11 (when it is available).
			
 
				+    """Return the -std=c++17 compiler flag.
			
 
				     """
			
 
				-    standards = ['-std=c++11']
			
 
				+    standards = ['-std=c++17']
			
 
				     for standard in standards:
			
 
				         if has_flag(compiler, [standard]):
			
 
				             return standard
			
 
				     raise RuntimeError(
			
 
				-        'Unsupported compiler -- at least C++11 support '
			
 
				+        'Unsupported compiler -- at least C++17 support '
			
 
				         'is needed!'
			
 
				     )
			
 
				 
			
--- a/src/aligned.h
+++ b/src/aligned.h
@@ -0,0 +1,98 @@
 
				+#pragma once
			
 
				+#include <cstdlib>
			
 
				+#include <new>
			
 
				+#ifdef _MSC_VER
			
 
				+// Ensure _HAS_EXCEPTIONS is defined
			
 
				+#include <vcruntime.h>
			
 
				+#include <malloc.h>
			
 
				+#endif
			
 
				+
			
 
				+#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
			
 
				+#include <cstdlib>
			
 
				+#endif
			
 
				+
			
 
				+// Aligned simple vector.
			
 
				+
			
 
				+namespace intgemm {
			
 
				+
			
 
				+template <class T> class AlignedVector {
			
 
				+  public:
			
 
				+    AlignedVector() : mem_(nullptr), size_(0) {}
			
 
				+
			
 
				+    explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
			
 
				+      : size_(size) {
			
 
				+#ifdef _MSC_VER
			
 
				+      mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
			
 
				+      if (!mem_) {
			
 
				+#  if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
			
 
				+        throw std::bad_alloc();
			
 
				+#  else
			
 
				+        std::abort();
			
 
				+#  endif
			
 
				+      }
			
 
				+#else
			
 
				+      if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
			
 
				+#  if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
			
 
				+        throw std::bad_alloc();
			
 
				+#  else
			
 
				+        std::abort();
			
 
				+#  endif
			
 
				+      }
			
 
				+#endif
			
 
				+    }
			
 
				+
			
 
				+    template <class InputIt> AlignedVector(InputIt first, InputIt last) 
			
 
				+      : AlignedVector(last - first) {
			
 
				+      std::copy(first, last, begin());
			
 
				+    }
			
 
				+
			
 
				+    AlignedVector(AlignedVector &&from) noexcept : mem_(from.mem_), size_(from.size_) {
			
 
				+      from.mem_ = nullptr;
			
 
				+      from.size_ = 0;
			
 
				+    }
			
 
				+
			
 
				+    AlignedVector &operator=(AlignedVector &&from) {
			
 
				+      if (this == &from) return *this;
			
 
				+      release();
			
 
				+      mem_ = from.mem_;
			
 
				+      size_ = from.size_;
			
 
				+      from.mem_ = nullptr;
			
 
				+      from.size_ = 0;
			
 
				+      return *this;
			
 
				+    }
			
 
				+
			
 
				+    AlignedVector(const AlignedVector&) = delete;
			
 
				+    AlignedVector& operator=(const AlignedVector&) = delete;
			
 
				+
			
 
				+    ~AlignedVector() { release(); }
			
 
				+
			
 
				+    std::size_t size() const { return size_; }
			
 
				+
			
 
				+    T &operator[](std::size_t offset) { return mem_[offset]; }
			
 
				+    const T &operator[](std::size_t offset) const { return mem_[offset]; }
			
 
				+
			
 
				+    T *begin() { return mem_; }
			
 
				+    const T *begin() const { return mem_; }
			
 
				+    T *end() { return mem_ + size_; }
			
 
				+    const T *end() const { return mem_ + size_; }
			
 
				+
			
 
				+    T *data() { return mem_; }
			
 
				+    const T *data() const { return mem_; }
			
 
				+
			
 
				+    template <typename ReturnType>
			
 
				+    ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }
			
 
				+
			
 
				+  private:
			
 
				+    T *mem_;
			
 
				+    std::size_t size_;
			
 
				+
			
 
				+    void release() {
			
 
				+#ifdef _MSC_VER
			
 
				+      _aligned_free(mem_);
			
 
				+#else
			
 
				+      std::free(mem_);
			
 
				+#endif
			
 
				+    }
			
 
				+};
			
 
				+
			
 
				+} // namespace intgemm
			
--- a/src/densematrix.cc
+++ b/src/densematrix.cc
@@ -15,6 +15,10 @@
 
				 #include "utils.h"
			
 
				 #include "vector.h"
			
 
				 
			
 
				+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
			
 
				+#include <immintrin.h>
			
 
				+#endif
			
 
				+
			
 
				 namespace fasttext {
			
 
				 
			
 
				 DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {}
			
@@ -146,6 +150,92 @@ void DenseMatrix::addRowToVector(Vector& x, int32_t i, real a) const {
 
				   }
			
 
				 }
			
 
				 
			
 
				+/* Abstract over AVX512F, AVX, and SSE intrinsics, using the one available on this machine. */
			
 
				+#if defined(__AVX512F__)
			
 
				+using Register = __m512;
			
 
				+inline Register Add(Register first, Register second) { return _mm512_add_ps(first, second); }
			
 
				+inline Register Set1(float to) { return _mm512_set1_ps(to); }
			
 
				+inline Register Multiply(Register first, Register second) { return _mm512_mul_ps(first, second); }
			
 
				+#elif defined(__AVX__)
			
 
				+using Register = __m256;
			
 
				+inline Register Add(Register first, Register second) { return _mm256_add_ps(first, second); }
			
 
				+inline Register Set1(float to) { return _mm256_set1_ps(to); }
			
 
				+inline Register Multiply(Register first, Register second) { return _mm256_mul_ps(first, second); }
			
 
				+#elif defined(__SSE__)
			
 
				+using Register = __m128;
			
 
				+inline Register Add(Register first, Register second) { return _mm_add_ps(first, second); }
			
 
				+inline Register Set1(float to) { return _mm_set1_ps(to); }
			
 
				+inline Register Multiply(Register first, Register second) { return _mm_mul_ps(first, second); }
			
 
				+#endif
			
 
				+
			
 
				+/* Faster routine for averaging rows of a matrix on x86.
			
 
				+ * The idea here is to keep the accumulators in registers if possible. */
			
 
				+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
			
 
				+template <unsigned Cols> void averageRowsFast(Vector& x, const std::vector<int32_t>& rows, const DenseMatrix &matrix) {
			
 
				+  // Columns must be a multiple of how many floats fit in a register.
			
 
				+  static_assert(Cols % (sizeof(Register) / 4) == 0);
			
 
				+  constexpr unsigned RegisterCount = Cols / (sizeof(Register) / 4);
			
 
				+  // These should be aligned by aligned.h
			
 
				+  assert(reinterpret_cast<uintptr_t>(x.data()) % sizeof(Register) == 0);
			
 
				+  assert(reinterpret_cast<uintptr_t>(matrix.data()) % sizeof(Register) == 0);
			
 
				+
			
 
				+  // Guard against empty list of rows with default NaN behavior.
			
 
				+  if (rows.empty()) {
			
 
				+    x.zero();
			
 
				+    x.mul(1.0 / rows.size());
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // Copy the first row to accumulation registers.
			
 
				+  Register accum[RegisterCount];
			
 
				+  auto row = rows.cbegin();
			
 
				+  const Register *base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
			
 
				+  for (unsigned i = 0; i < RegisterCount; ++i) {
			
 
				+    accum[i] = base[i];
			
 
				+  }
			
 
				+  // Add the rows after the first.
			
 
				+  for (++row; row != rows.cend(); ++row) {
			
 
				+    base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
			
 
				+    for (unsigned i = 0; i < RegisterCount; ++i) {
			
 
				+      accum[i] = Add(accum[i], base[i]);
			
 
				+    }
			
 
				+  }
			
 
				+  // Multiply by (1.0 / rows.size()) and write to x.
			
 
				+  Register mul = Set1(1.0 / rows.size());
			
 
				+  for (unsigned i = 0; i < RegisterCount; ++i) {
			
 
				+    reinterpret_cast<Register*>(x.data())[i] = Multiply(accum[i], mul);
			
 
				+  }
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+void DenseMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
			
 
				+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
			
 
				+  switch (cols()) {
			
 
				+    case 512:
			
 
				+      // Maximum number that can fit all in registers on AVX512F.
			
 
				+      averageRowsFast<512>(x, rows, *this);
			
 
				+      return;
			
 
				+    case 256:
			
 
				+      averageRowsFast<256>(x, rows, *this);
			
 
				+      return;
			
 
				+    case 64:
			
 
				+      averageRowsFast<64>(x, rows, *this);
			
 
				+      return;
			
 
				+    case 32:
			
 
				+      averageRowsFast<32>(x, rows, *this);
			
 
				+      return;
			
 
				+    case 16:
			
 
				+      averageRowsFast<16>(x, rows, *this);
			
 
				+      return;
			
 
				+  }
			
 
				+#endif
			
 
				+  x.zero();
			
 
				+  for (auto it = rows.cbegin(); it != rows.cend(); ++it) {
			
 
				+    addRowToVector(x, *it);
			
 
				+  }
			
 
				+  x.mul(1.0 / rows.size());
			
 
				+}
			
 
				+
			
 
				 void DenseMatrix::save(std::ostream& out) const {
			
 
				   out.write((char*)&m_, sizeof(int64_t));
			
 
				   out.write((char*)&n_, sizeof(int64_t));
			
@@ -155,7 +245,7 @@ void DenseMatrix::save(std::ostream& out) const {
 
				 void DenseMatrix::load(std::istream& in) {
			
 
				   in.read((char*)&m_, sizeof(int64_t));
			
 
				   in.read((char*)&n_, sizeof(int64_t));
			
 
				-  data_ = std::vector<real>(m_ * n_);
			
 
				+  data_ = intgemm::AlignedVector<real>(m_ * n_);
			
 
				   in.read((char*)data_.data(), m_ * n_ * sizeof(real));
			
 
				 }
			
 
				 
			
--- a/src/densematrix.h
+++ b/src/densematrix.h
@@ -15,6 +15,7 @@
 
				 #include <stdexcept>
			
 
				 #include <vector>
			
 
				 
			
 
				+#include "aligned.h"
			
 
				 #include "matrix.h"
			
 
				 #include "real.h"
			
 
				 
			
@@ -24,7 +25,7 @@ class Vector;
 
				 
			
 
				 class DenseMatrix : public Matrix {
			
 
				  protected:
			
 
				-  std::vector<real> data_;
			
 
				+  intgemm::AlignedVector<real> data_;
			
 
				   void uniformThread(real, int, int32_t);
			
 
				 
			
 
				  public:
			
@@ -71,6 +72,7 @@ class DenseMatrix : public Matrix {
 
				   void addVectorToRow(const Vector&, int64_t, real) override;
			
 
				   void addRowToVector(Vector& x, int32_t i) const override;
			
 
				   void addRowToVector(Vector& x, int32_t i, real a) const override;
			
 
				+  void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
			
 
				   void save(std::ostream&) const override;
			
 
				   void load(std::istream&) override;
			
 
				   void dump(std::ostream&) const override;
			
--- a/src/dictionary.cc
+++ b/src/dictionary.cc
@@ -42,11 +42,11 @@ Dictionary::Dictionary(std::shared_ptr<Args> args, std::istream& in)
 
				   load(in);
			
 
				 }
			
 
				 
			
 
				-int32_t Dictionary::find(const std::string& w) const {
			
 
				+int32_t Dictionary::find(const std::string_view w) const {
			
 
				   return find(w, hash(w));
			
 
				 }
			
 
				 
			
 
				-int32_t Dictionary::find(const std::string& w, uint32_t h) const {
			
 
				+int32_t Dictionary::find(const std::string_view w, uint32_t h) const {
			
 
				   int32_t word2intsize = word2int_.size();
			
 
				   int32_t id = h % word2intsize;
			
 
				   while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
			
@@ -126,12 +126,12 @@ bool Dictionary::discard(int32_t id, real rand) const {
 
				   return rand > pdiscard_[id];
			
 
				 }
			
 
				 
			
 
				-int32_t Dictionary::getId(const std::string& w, uint32_t h) const {
			
 
				+int32_t Dictionary::getId(const std::string_view w, uint32_t h) const {
			
 
				   int32_t id = find(w, h);
			
 
				   return word2int_[id];
			
 
				 }
			
 
				 
			
 
				-int32_t Dictionary::getId(const std::string& w) const {
			
 
				+int32_t Dictionary::getId(const std::string_view w) const {
			
 
				   int32_t h = find(w);
			
 
				   return word2int_[h];
			
 
				 }
			
@@ -142,7 +142,7 @@ entry_type Dictionary::getType(int32_t id) const {
 
				   return words_[id].type;
			
 
				 }
			
 
				 
			
 
				-entry_type Dictionary::getType(const std::string& w) const {
			
 
				+entry_type Dictionary::getType(const std::string_view w) const {
			
 
				   return (w.find(args_->label) == 0) ? entry_type::label : entry_type::word;
			
 
				 }
			
 
				 
			
@@ -160,7 +160,7 @@ std::string Dictionary::getWord(int32_t id) const {
 
				 // Since all fasttext models that were already released were trained
			
 
				 // using signed char, we fixed the hash function to make models
			
 
				 // compatible whatever compiler is used.
			
 
				-uint32_t Dictionary::hash(const std::string& str) const {
			
 
				+uint32_t Dictionary::hash(const std::string_view str) const {
			
 
				   uint32_t h = 2166136261;
			
 
				   for (size_t i = 0; i < str.size(); i++) {
			
 
				     h = h ^ uint32_t(int8_t(str[i]));
			
@@ -324,11 +324,16 @@ void Dictionary::addWordNgrams(
 
				 
			
 
				 void Dictionary::addSubwords(
			
 
				     std::vector<int32_t>& line,
			
 
				-    const std::string& token,
			
 
				+    const std::string_view token,
			
 
				     int32_t wid) const {
			
 
				   if (wid < 0) { // out of vocab
			
 
				     if (token != EOS) {
			
 
				-      computeSubwords(BOW + token + EOW, line);
			
 
				+      std::string concat;
			
 
				+      concat.reserve(BOW.size() + token.size() + EOW.size());
			
 
				+      concat += BOW;
			
 
				+      concat.append(token.data(), token.size());
			
 
				+      concat += EOW;
			
 
				+      computeSubwords(concat, line);
			
 
				     }
			
 
				   } else {
			
 
				     if (args_->maxn <= 0) { // in vocab w/o subwords
			
@@ -406,6 +411,51 @@ int32_t Dictionary::getLine(
 
				   return ntokens;
			
 
				 }
			
 
				 
			
 
				+namespace {
			
 
				+bool readWordNoNewline(std::string_view& in, std::string_view& word) {
			
 
				+  const std::string_view spaces(" \n\r\t\v\f\0");
			
 
				+  std::string_view::size_type begin = in.find_first_not_of(spaces);
			
 
				+  if (begin == std::string_view::npos) {
			
 
				+    in.remove_prefix(in.size());
			
 
				+    return false;
			
 
				+  }
			
 
				+  in.remove_prefix(begin);
			
 
				+  word = in.substr(0, in.find_first_of(spaces));
			
 
				+  in.remove_prefix(word.size());
			
 
				+  return true;
			
 
				+}
			
 
				+} // namespace
			
 
				+
			
 
				+int32_t Dictionary::getStringNoNewline(
			
 
				+    std::string_view in,
			
 
				+    std::vector<int32_t>& words,
			
 
				+    std::vector<int32_t>& labels) const {
			
 
				+  std::vector<int32_t> word_hashes;
			
 
				+  std::string_view token;
			
 
				+  int32_t ntokens = 0;
			
 
				+
			
 
				+  words.clear();
			
 
				+  labels.clear();
			
 
				+  while (readWordNoNewline(in, token)) {
			
 
				+    uint32_t h = hash(token);
			
 
				+    int32_t wid = getId(token, h);
			
 
				+    entry_type type = wid < 0 ? getType(token) : getType(wid);
			
 
				+
			
 
				+    ntokens++;
			
 
				+    if (type == entry_type::word) {
			
 
				+      addSubwords(words, token, wid);
			
 
				+      word_hashes.push_back(h);
			
 
				+    } else if (type == entry_type::label && wid >= 0) {
			
 
				+      labels.push_back(wid - nwords_);
			
 
				+    }
			
 
				+    if (token == EOS) {
			
 
				+      break;
			
 
				+    }
			
 
				+  }
			
 
				+  addWordNgrams(words, word_hashes, args_->wordNgrams);
			
 
				+  return ntokens;
			
 
				+}
			
 
				+
			
 
				 void Dictionary::pushHash(std::vector<int32_t>& hashes, int32_t id) const {
			
 
				   if (pruneidx_size_ == 0 || id < 0) {
			
 
				     return;
			
--- a/src/dictionary.h
+++ b/src/dictionary.h
@@ -13,6 +13,7 @@
 
				 #include <ostream>
			
 
				 #include <random>
			
 
				 #include <string>
			
 
				+#include <string_view>
			
 
				 #include <unordered_map>
			
 
				 #include <vector>
			
 
				 
			
@@ -36,13 +37,13 @@ class Dictionary {
 
				   static const int32_t MAX_VOCAB_SIZE = 30000000;
			
 
				   static const int32_t MAX_LINE_SIZE = 1024;
			
 
				 
			
 
				-  int32_t find(const std::string&) const;
			
 
				-  int32_t find(const std::string&, uint32_t h) const;
			
 
				+  int32_t find(const std::string_view) const;
			
 
				+  int32_t find(const std::string_view, uint32_t h) const;
			
 
				   void initTableDiscard();
			
 
				   void initNgrams();
			
 
				   void reset(std::istream&) const;
			
 
				   void pushHash(std::vector<int32_t>&, int32_t) const;
			
 
				-  void addSubwords(std::vector<int32_t>&, const std::string&, int32_t) const;
			
 
				+  void addSubwords(std::vector<int32_t>&, const std::string_view, int32_t) const;
			
 
				 
			
 
				   std::shared_ptr<Args> args_;
			
 
				   std::vector<int32_t> word2int_;
			
@@ -71,10 +72,10 @@ class Dictionary {
 
				   int32_t nwords() const;
			
 
				   int32_t nlabels() const;
			
 
				   int64_t ntokens() const;
			
 
				-  int32_t getId(const std::string&) const;
			
 
				-  int32_t getId(const std::string&, uint32_t h) const;
			
 
				+  int32_t getId(const std::string_view) const;
			
 
				+  int32_t getId(const std::string_view, uint32_t h) const;
			
 
				   entry_type getType(int32_t) const;
			
 
				-  entry_type getType(const std::string&) const;
			
 
				+  entry_type getType(const std::string_view) const;
			
 
				   bool discard(int32_t, real) const;
			
 
				   std::string getWord(int32_t) const;
			
 
				   const std::vector<int32_t>& getSubwords(int32_t) const;
			
@@ -87,7 +88,7 @@ class Dictionary {
 
				       const std::string&,
			
 
				       std::vector<int32_t>&,
			
 
				       std::vector<std::string>* substrings = nullptr) const;
			
 
				-  uint32_t hash(const std::string& str) const;
			
 
				+  uint32_t hash(const std::string_view str) const;
			
 
				   void add(const std::string&);
			
 
				   bool readWord(std::istream&, std::string&) const;
			
 
				   void readFromFile(std::istream&);
			
@@ -99,6 +100,8 @@ class Dictionary {
 
				       const;
			
 
				   int32_t getLine(std::istream&, std::vector<int32_t>&, std::minstd_rand&)
			
 
				       const;
			
 
				+  int32_t getStringNoNewline(std::string_view, std::vector<int32_t>&,
			
 
				+      std::vector<int32_t>&) const;
			
 
				   void threshold(int64_t, int64_t);
			
 
				   void prune(std::vector<int32_t>&);
			
 
				   bool isPruned() {
			
--- a/src/fasttext.cc
+++ b/src/fasttext.cc
@@ -532,7 +532,7 @@ std::vector<std::pair<std::string, Vector>> FastText::getNgramVectors(
 
				     if (ngrams[i] >= 0) {
			
 
				       vec.addRow(*input_, ngrams[i]);
			
 
				     }
			
 
				-    result.push_back(std::make_pair(substrings[i], std::move(vec)));
			
 
				+    result.emplace_back(substrings[i], std::move(vec));
			
 
				   }
			
 
				   return result;
			
 
				 }
			
@@ -609,7 +609,7 @@ std::vector<std::pair<real, std::string>> FastText::getAnalogies(
 
				     const std::string& wordA,
			
 
				     const std::string& wordB,
			
 
				     const std::string& wordC) {
			
 
				-  Vector query = Vector(args_->dim);
			
 
				+  Vector query(args_->dim);
			
 
				   query.zero();
			
 
				 
			
 
				   Vector buffer(args_->dim);
			
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -36,6 +36,7 @@ class Matrix {
 
				   virtual void addVectorToRow(const Vector&, int64_t, real) = 0;
			
 
				   virtual void addRowToVector(Vector& x, int32_t i) const = 0;
			
 
				   virtual void addRowToVector(Vector& x, int32_t i, real a) const = 0;
			
 
				+  virtual void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const = 0;
			
 
				   virtual void save(std::ostream&) const = 0;
			
 
				   virtual void load(std::istream&) = 0;
			
 
				   virtual void dump(std::ostream&) const = 0;
			
--- a/src/model.cc
+++ b/src/model.cc
@@ -42,11 +42,7 @@ Model::Model(
 
				 void Model::computeHidden(const std::vector<int32_t>& input, State& state)
			
 
				     const {
			
 
				   Vector& hidden = state.hidden;
			
 
				-  hidden.zero();
			
 
				-  for (auto it = input.cbegin(); it != input.cend(); ++it) {
			
 
				-    hidden.addRow(*wi_, *it);
			
 
				-  }
			
 
				-  hidden.mul(1.0 / input.size());
			
 
				+  wi_->averageRowsToVector(hidden, input);
			
 
				 }
			
 
				 
			
 
				 void Model::predict(
			
--- a/src/quantmatrix.cc
+++ b/src/quantmatrix.cc
@@ -80,6 +80,14 @@ void QuantMatrix::addRowToVector(Vector& x, int32_t i) const {
 
				   pq_->addcode(x, codes_.data(), i, norm);
			
 
				 }
			
 
				 
			
 
				+void QuantMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
			
 
				+  x.zero();
			
 
				+  for (auto it = rows.cbegin(); it != rows.cend(); ++it) {
			
 
				+    addRowToVector(x, *it);
			
 
				+  }
			
 
				+  x.mul(1.0 / rows.size());
			
 
				+}
			
 
				+
			
 
				 void QuantMatrix::save(std::ostream& out) const {
			
 
				   out.write((char*)&qnorm_, sizeof(qnorm_));
			
 
				   out.write((char*)&m_, sizeof(m_));
			
--- a/src/quantmatrix.h
+++ b/src/quantmatrix.h
@@ -52,6 +52,7 @@ class QuantMatrix : public Matrix {
 
				   void addVectorToRow(const Vector&, int64_t, real) override;
			
 
				   void addRowToVector(Vector& x, int32_t i) const override;
			
 
				   void addRowToVector(Vector& x, int32_t i, real a) const override;
			
 
				+  void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
			
 
				   void save(std::ostream&) const override;
			
 
				   void load(std::istream&) override;
			
 
				   void dump(std::ostream&) const override;
			
--- a/src/vector.h
+++ b/src/vector.h
@@ -12,6 +12,7 @@
 
				 #include <ostream>
			
 
				 #include <vector>
			
 
				 
			
 
				+#include "aligned.h"
			
 
				 #include "real.h"
			
 
				 
			
 
				 namespace fasttext {
			
@@ -20,12 +21,12 @@ class Matrix;
 
				 
			
 
				 class Vector {
			
 
				  protected:
			
 
				-  std::vector<real> data_;
			
 
				+  intgemm::AlignedVector<real> data_;
			
 
				 
			
 
				  public:
			
 
				   explicit Vector(int64_t);
			
 
				   Vector(const Vector&) = default;
			
 
				-  Vector(Vector&&) noexcept = default;
			
 
				+  Vector(Vector&&) = default;
			
 
				   Vector& operator=(const Vector&) = default;
			
 
				   Vector& operator=(Vector&&) = default;