Przeglądaj źródła

Predict 1.9-4.2x faster (#1341)

Summary:
I made prediction 1.9x to 4.2x faster than before.

# Motivation
I want to use https://tinyurl.com/nllblid218e and similarly parametrized models to run language classification on petabytes of web data.

# Methodology
The costliest operation is summing the rows for each model input.  I've optimized this in three ways:
1. `addRowToVector` was a virtual function call for each row.  I've replaced this with one virtual function call per prediction by adding an `averageRowsToVector` method to the `Matrix` interface.
2. `Vector` and `DenseMatrix` were not 64-byte aligned so the CPU was doing a lot of unaligned memory access.  I've brought in my own `vector` replacement that does 64-byte alignment.
3. I've written `averageRowsToVector` with intrinsics for common vector sizes.  This works on SSE, AVX, and AVX512F.

See the commit history for a breakdown of speed improvement from each change.

# Experiments
Test set: [docs1000.txt.gz](https://github.com/facebookresearch/fastText/files/11832996/docs1000.txt.gz), a collection of random documents from https://data.statmt.org/heafield/classified-fasttext/
CPU: AMD Ryzen 9 7950X 16-Core

Model https://tinyurl.com/nllblid218e with 256-dimensional vectors
Before
real    0m8.757s
user    0m8.434s
sys     0m0.327s

After
real    0m2.046s
user    0m1.717s
sys     0m0.334s

Model https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin with 16-dimensional vectors
Before
real    0m0.926s
user    0m0.889s
sys     0m0.037s

After
real    0m0.477s
user    0m0.436s
sys     0m0.040s

Pull Request resolved: https://github.com/facebookresearch/fastText/pull/1341

Reviewed By: graemenail

Differential Revision: D52134736

Pulled By: kpuatfb

fbshipit-source-id: 42067161f4c968c34612934b48a562399a267f3b
Kenneth Heafield 2 lat temu
rodzic
commit
b733943e84
14 zmienionych plików z 283 dodań i 32 usunięć
  1. 3 1
      CMakeLists.txt
  2. 1 1
      Makefile
  3. 3 4
      setup.py
  4. 98 0
      src/aligned.h
  5. 91 1
      src/densematrix.cc
  6. 3 1
      src/densematrix.h
  7. 58 8
      src/dictionary.cc
  8. 10 7
      src/dictionary.h
  9. 2 2
      src/fasttext.cc
  10. 1 0
      src/matrix.h
  11. 1 5
      src/model.cc
  12. 8 0
      src/quantmatrix.cc
  13. 1 0
      src/quantmatrix.h
  14. 3 2
      src/vector.h

+ 3 - 1
CMakeLists.txt

@@ -9,13 +9,15 @@
 cmake_minimum_required(VERSION 2.8.9)
 project(fasttext)
 
+set(CMAKE_CXX_STANDARD 17)
+
 # The version number.
 set (fasttext_VERSION_MAJOR 0)
 set (fasttext_VERSION_MINOR 1)
 
 include_directories(fasttext)
 
-set(CMAKE_CXX_FLAGS " -pthread -std=c++11 -funroll-loops -O3 -march=native")
+set(CMAKE_CXX_FLAGS " -pthread -std=c++17 -funroll-loops -O3 -march=native")
 
 set(HEADER_FILES
     src/args.h

+ 1 - 1
Makefile

@@ -7,7 +7,7 @@
 #
 
 CXX = c++
-CXXFLAGS = -pthread -std=c++11 -march=native
+CXXFLAGS = -pthread -std=c++17 -march=native
 OBJS = args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
 INCLUDES = -I.
 

+ 3 - 4
setup.py

@@ -98,15 +98,14 @@ def has_flag(compiler, flags):
 
 
 def cpp_flag(compiler):
-    """Return the -std=c++[11/14] compiler flag.
-    The c++14 is preferred over c++11 (when it is available).
+    """Return the -std=c++17 compiler flag.
     """
-    standards = ['-std=c++11']
+    standards = ['-std=c++17']
     for standard in standards:
         if has_flag(compiler, [standard]):
             return standard
     raise RuntimeError(
-        'Unsupported compiler -- at least C++11 support '
+        'Unsupported compiler -- at least C++17 support '
         'is needed!'
     )
 

+ 98 - 0
src/aligned.h

@@ -0,0 +1,98 @@
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <limits>
+#include <new>
+#ifdef _MSC_VER
+// Ensure _HAS_EXCEPTIONS is defined
+#include <vcruntime.h>
+#include <malloc.h>
+#endif
+
+#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
+#include <cstdlib>
+#endif
+
+// Aligned simple vector.
+
+namespace intgemm {
+
+/**
+ * Fixed-size, move-only array of T whose storage starts on an aligned
+ * boundary (64 bytes unless the caller asks for something else).
+ *
+ * The buffer is obtained with posix_memalign (or _aligned_malloc under MSVC)
+ * and handed back with free / _aligned_free.  Elements are never constructed
+ * or destroyed by this class, so the contents are uninitialized until written.
+ */
+template <class T> class AlignedVector {
+  public:
+    /** Creates an empty vector that owns no memory. */
+    AlignedVector() : mem_(nullptr), size_(0) {}
+
+    /**
+     * Allocates room for `size` elements aligned to `alignment` bytes.
+     * If the request cannot be satisfied this throws std::bad_alloc, or calls
+     * std::abort() when the build has exceptions disabled.
+     */
+    explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
+      : mem_(nullptr), size_(size) {
+      // Reject element counts whose byte size would wrap around; otherwise a
+      // tiny buffer would be allocated while size() still reports `size`.
+      if (size > (std::numeric_limits<std::size_t>::max)() / sizeof(T)) {
+        allocationFailed();
+      }
+#ifdef _MSC_VER
+      mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
+      if (!mem_) {
+        allocationFailed();
+      }
+#else
+      // Go through a real void* instead of casting &mem_ to void**.
+      void *raw = nullptr;
+      if (posix_memalign(&raw, alignment, size * sizeof(T))) {
+        allocationFailed();
+      }
+      mem_ = static_cast<T*>(raw);
+#endif
+    }
+
+    /** Allocates last - first elements and copies [first, last) into them. */
+    template <class InputIt> AlignedVector(InputIt first, InputIt last)
+      : AlignedVector(static_cast<std::size_t>(last - first)) {
+      std::copy(first, last, begin());
+    }
+
+    /** Takes over the buffer of `from`, leaving it empty. */
+    AlignedVector(AlignedVector &&from) noexcept : mem_(from.mem_), size_(from.size_) {
+      from.mem_ = nullptr;
+      from.size_ = 0;
+    }
+
+    /** Frees the current buffer, then takes over the buffer of `from`. */
+    AlignedVector &operator=(AlignedVector &&from) noexcept {
+      if (this == &from) return *this;
+      release();
+      mem_ = from.mem_;
+      size_ = from.size_;
+      from.mem_ = nullptr;
+      from.size_ = 0;
+      return *this;
+    }
+
+    AlignedVector(const AlignedVector&) = delete;
+    AlignedVector& operator=(const AlignedVector&) = delete;
+
+    ~AlignedVector() { release(); }
+
+    /** Number of elements (not bytes). */
+    std::size_t size() const { return size_; }
+
+    // Unchecked element access.
+    T &operator[](std::size_t offset) { return mem_[offset]; }
+    const T &operator[](std::size_t offset) const { return mem_[offset]; }
+
+    T *begin() { return mem_; }
+    const T *begin() const { return mem_; }
+    T *end() { return mem_ + size_; }
+    const T *end() const { return mem_ + size_; }
+
+    T *data() { return mem_; }
+    const T *data() const { return mem_; }
+
+    /** Reinterprets the start of the buffer as a pointer to ReturnType. */
+    template <typename ReturnType>
+    ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }
+
+  private:
+    T *mem_;
+    std::size_t size_;
+
+    /** Single place that reports an allocation failure: throw, or abort without exceptions. */
+    [[noreturn]] static void allocationFailed() {
+#if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
+      throw std::bad_alloc();
+#else
+      std::abort();
+#endif
+    }
+
+    /** Returns the buffer to the allocator; a null mem_ is passed straight through. */
+    void release() {
+#ifdef _MSC_VER
+      _aligned_free(mem_);
+#else
+      std::free(mem_);
+#endif
+    }
+};
+
+} // namespace intgemm

+ 91 - 1
src/densematrix.cc

@@ -15,6 +15,10 @@
 #include "utils.h"
 #include "vector.h"
 
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+#include <immintrin.h>
+#endif
+
 namespace fasttext {
 
 DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {}
@@ -146,6 +150,92 @@ void DenseMatrix::addRowToVector(Vector& x, int32_t i, real a) const {
   }
 }
 
+/* Thin wrappers that give Add / Set1 / Multiply a single spelling on top of the
+ * widest float SIMD instruction set enabled for this build: AVX512F first,
+ * then AVX, then SSE. */
+#if defined(__AVX512F__)
+using Register = __m512;
+inline Register Add(Register lhs, Register rhs) {
+  return _mm512_add_ps(lhs, rhs);
+}
+inline Register Set1(float value) {
+  return _mm512_set1_ps(value);
+}
+inline Register Multiply(Register lhs, Register rhs) {
+  return _mm512_mul_ps(lhs, rhs);
+}
+#elif defined(__AVX__)
+using Register = __m256;
+inline Register Add(Register lhs, Register rhs) {
+  return _mm256_add_ps(lhs, rhs);
+}
+inline Register Set1(float value) {
+  return _mm256_set1_ps(value);
+}
+inline Register Multiply(Register lhs, Register rhs) {
+  return _mm256_mul_ps(lhs, rhs);
+}
+#elif defined(__SSE__)
+using Register = __m128;
+inline Register Add(Register lhs, Register rhs) {
+  return _mm_add_ps(lhs, rhs);
+}
+inline Register Set1(float value) {
+  return _mm_set1_ps(value);
+}
+inline Register Multiply(Register lhs, Register rhs) {
+  return _mm_mul_ps(lhs, rhs);
+}
+#endif
+
+/* x86 fast path for averaging rows of a matrix.
+ * Cols is a compile-time constant so the running sums sit in a fixed-size
+ * array of SIMD registers instead of being written back after every row. */
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+template <unsigned Cols> void averageRowsFast(Vector& x, const std::vector<int32_t>& rows, const DenseMatrix &matrix) {
+  // Floats held by one Register; Cols has to split evenly into registers.
+  constexpr unsigned FloatsPerRegister = sizeof(Register) / 4;
+  static_assert(Cols % FloatsPerRegister == 0);
+  constexpr unsigned RegisterCount = Cols / FloatsPerRegister;
+  // Both buffers are expected to be aligned by aligned.h.
+  assert(reinterpret_cast<uintptr_t>(x.data()) % sizeof(Register) == 0);
+  assert(reinterpret_cast<uintptr_t>(matrix.data()) % sizeof(Register) == 0);
+
+  // Empty input: run the same zero-then-scale steps as the generic path so
+  // the default NaN behavior is kept.
+  if (rows.empty()) {
+    x.zero();
+    x.mul(1.0 / rows.size());
+    return;
+  }
+
+  // Views the matrix row with the given index as an array of registers.
+  const auto rowRegisters = [&matrix](int32_t index) {
+    return reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * index);
+  };
+
+  // Seed the accumulators with the first row.
+  Register sums[RegisterCount];
+  auto current = rows.cbegin();
+  const Register *source = rowRegisters(*current);
+  for (unsigned r = 0; r < RegisterCount; ++r) {
+    sums[r] = source[r];
+  }
+  // Fold in every remaining row.
+  while (++current != rows.cend()) {
+    source = rowRegisters(*current);
+    for (unsigned r = 0; r < RegisterCount; ++r) {
+      sums[r] = Add(sums[r], source[r]);
+    }
+  }
+  // Scale by (1.0 / rows.size()) and store the result in x.
+  const Register scale = Set1(1.0 / rows.size());
+  Register *out = reinterpret_cast<Register*>(x.data());
+  for (unsigned r = 0; r < RegisterCount; ++r) {
+    out[r] = Multiply(sums[r], scale);
+  }
+}
+#endif
+
+/* Averages the listed rows of this matrix into x.  Column counts that have a
+ * compiled-in SIMD kernel are routed to averageRowsFast; every other width
+ * uses the generic zero / add-each-row / scale sequence. */
+void DenseMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
+  switch (cols()) {
+    case 16:
+      averageRowsFast<16>(x, rows, *this);
+      return;
+    case 32:
+      averageRowsFast<32>(x, rows, *this);
+      return;
+    case 64:
+      averageRowsFast<64>(x, rows, *this);
+      return;
+    case 256:
+      averageRowsFast<256>(x, rows, *this);
+      return;
+    case 512:
+      // Maximum number that can fit all in registers on AVX512F.
+      averageRowsFast<512>(x, rows, *this);
+      return;
+    default:
+      // No specialized kernel for this width; use the generic loop below.
+      break;
+  }
+#endif
+  x.zero();
+  for (const int32_t row : rows) {
+    addRowToVector(x, row);
+  }
+  x.mul(1.0 / rows.size());
+}
+
 void DenseMatrix::save(std::ostream& out) const {
   out.write((char*)&m_, sizeof(int64_t));
   out.write((char*)&n_, sizeof(int64_t));
@@ -155,7 +245,7 @@ void DenseMatrix::save(std::ostream& out) const {
 void DenseMatrix::load(std::istream& in) {
   in.read((char*)&m_, sizeof(int64_t));
   in.read((char*)&n_, sizeof(int64_t));
-  data_ = std::vector<real>(m_ * n_);
+  data_ = intgemm::AlignedVector<real>(m_ * n_);
   in.read((char*)data_.data(), m_ * n_ * sizeof(real));
 }
 

+ 3 - 1
src/densematrix.h

@@ -15,6 +15,7 @@
 #include <stdexcept>
 #include <vector>
 
+#include "aligned.h"
 #include "matrix.h"
 #include "real.h"
 
@@ -24,7 +25,7 @@ class Vector;
 
 class DenseMatrix : public Matrix {
  protected:
-  std::vector<real> data_;
+  intgemm::AlignedVector<real> data_;
   void uniformThread(real, int, int32_t);
 
  public:
@@ -71,6 +72,7 @@ class DenseMatrix : public Matrix {
   void addVectorToRow(const Vector&, int64_t, real) override;
   void addRowToVector(Vector& x, int32_t i) const override;
   void addRowToVector(Vector& x, int32_t i, real a) const override;
+  void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
   void save(std::ostream&) const override;
   void load(std::istream&) override;
   void dump(std::ostream&) const override;

+ 58 - 8
src/dictionary.cc

@@ -42,11 +42,11 @@ Dictionary::Dictionary(std::shared_ptr<Args> args, std::istream& in)
   load(in);
 }
 
-int32_t Dictionary::find(const std::string& w) const {
+int32_t Dictionary::find(const std::string_view w) const {
   return find(w, hash(w));
 }
 
-int32_t Dictionary::find(const std::string& w, uint32_t h) const {
+int32_t Dictionary::find(const std::string_view w, uint32_t h) const {
   int32_t word2intsize = word2int_.size();
   int32_t id = h % word2intsize;
   while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
@@ -126,12 +126,12 @@ bool Dictionary::discard(int32_t id, real rand) const {
   return rand > pdiscard_[id];
 }
 
-int32_t Dictionary::getId(const std::string& w, uint32_t h) const {
+int32_t Dictionary::getId(const std::string_view w, uint32_t h) const {
   int32_t id = find(w, h);
   return word2int_[id];
 }
 
-int32_t Dictionary::getId(const std::string& w) const {
+int32_t Dictionary::getId(const std::string_view w) const {
   int32_t h = find(w);
   return word2int_[h];
 }
@@ -142,7 +142,7 @@ entry_type Dictionary::getType(int32_t id) const {
   return words_[id].type;
 }
 
-entry_type Dictionary::getType(const std::string& w) const {
+entry_type Dictionary::getType(const std::string_view w) const {
   return (w.find(args_->label) == 0) ? entry_type::label : entry_type::word;
 }
 
@@ -160,7 +160,7 @@ std::string Dictionary::getWord(int32_t id) const {
 // Since all fasttext models that were already released were trained
 // using signed char, we fixed the hash function to make models
 // compatible whatever compiler is used.
-uint32_t Dictionary::hash(const std::string& str) const {
+uint32_t Dictionary::hash(const std::string_view str) const {
   uint32_t h = 2166136261;
   for (size_t i = 0; i < str.size(); i++) {
     h = h ^ uint32_t(int8_t(str[i]));
@@ -324,11 +324,16 @@ void Dictionary::addWordNgrams(
 
 void Dictionary::addSubwords(
     std::vector<int32_t>& line,
-    const std::string& token,
+    const std::string_view token,
     int32_t wid) const {
   if (wid < 0) { // out of vocab
     if (token != EOS) {
-      computeSubwords(BOW + token + EOW, line);
+      std::string concat;
+      concat.reserve(BOW.size() + token.size() + EOW.size());
+      concat += BOW;
+      concat.append(token.data(), token.size());
+      concat += EOW;
+      computeSubwords(concat, line);
     }
   } else {
     if (args_->maxn <= 0) { // in vocab w/o subwords
@@ -406,6 +411,51 @@ int32_t Dictionary::getLine(
   return ntokens;
 }
 
+namespace {
+// Pulls the next whitespace-delimited token out of `in`.
+//
+// Leading delimiters are skipped, `word` is set to a view of the token (it
+// points into the same buffer as `in`), and `in` is advanced to just past the
+// token.  Returns false, with `in` emptied and `word` untouched, when only
+// delimiters remain.
+bool readWordNoNewline(std::string_view& in, std::string_view& word) {
+  // The final "\0" is a real delimiter, so the length must come from the
+  // array size: building the view from a bare const char* stops at the first
+  // NUL and would silently drop it from the set.
+  static constexpr char kSpaces[] = " \n\r\t\v\f\0";
+  const std::string_view spaces(kSpaces, sizeof(kSpaces) - 1);
+  const std::string_view::size_type begin = in.find_first_not_of(spaces);
+  if (begin == std::string_view::npos) {
+    in.remove_prefix(in.size());
+    return false;
+  }
+  in.remove_prefix(begin);
+  // find_first_of may return npos, in which case substr keeps the whole rest.
+  word = in.substr(0, in.find_first_of(spaces));
+  in.remove_prefix(word.size());
+  return true;
+}
+} // namespace
+
+// Tokenizes `in` with readWordNoNewline and fills `words` and `labels` (both
+// cleared first).  Word tokens go through addSubwords and have their hash
+// recorded for addWordNgrams; label tokens with a non-negative id are stored
+// as (id - nwords_).  Reading stops after an EOS token or when the input is
+// exhausted.  Returns the number of tokens read.
+int32_t Dictionary::getStringNoNewline(
+    std::string_view in,
+    std::vector<int32_t>& words,
+    std::vector<int32_t>& labels) const {
+  words.clear();
+  labels.clear();
+
+  std::vector<int32_t> word_hashes;
+  int32_t ntokens = 0;
+  for (std::string_view token; readWordNoNewline(in, token);) {
+    const uint32_t tokenHash = hash(token);
+    const int32_t wid = getId(token, tokenHash);
+    const bool known = wid >= 0;
+    // Tokens with an id use the stored type; the rest are typed from their text.
+    const entry_type type = known ? getType(wid) : getType(token);
+    ++ntokens;
+
+    if (type == entry_type::word) {
+      addSubwords(words, token, wid);
+      word_hashes.push_back(tokenHash);
+    } else if (known && type == entry_type::label) {
+      labels.push_back(wid - nwords_);
+    }
+
+    if (token == EOS) {
+      break;
+    }
+  }
+  addWordNgrams(words, word_hashes, args_->wordNgrams);
+  return ntokens;
+}
+
 void Dictionary::pushHash(std::vector<int32_t>& hashes, int32_t id) const {
   if (pruneidx_size_ == 0 || id < 0) {
     return;

+ 10 - 7
src/dictionary.h

@@ -13,6 +13,7 @@
 #include <ostream>
 #include <random>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
@@ -36,13 +37,13 @@ class Dictionary {
   static const int32_t MAX_VOCAB_SIZE = 30000000;
   static const int32_t MAX_LINE_SIZE = 1024;
 
-  int32_t find(const std::string&) const;
-  int32_t find(const std::string&, uint32_t h) const;
+  int32_t find(const std::string_view) const;
+  int32_t find(const std::string_view, uint32_t h) const;
   void initTableDiscard();
   void initNgrams();
   void reset(std::istream&) const;
   void pushHash(std::vector<int32_t>&, int32_t) const;
-  void addSubwords(std::vector<int32_t>&, const std::string&, int32_t) const;
+  void addSubwords(std::vector<int32_t>&, const std::string_view, int32_t) const;
 
   std::shared_ptr<Args> args_;
   std::vector<int32_t> word2int_;
@@ -71,10 +72,10 @@ class Dictionary {
   int32_t nwords() const;
   int32_t nlabels() const;
   int64_t ntokens() const;
-  int32_t getId(const std::string&) const;
-  int32_t getId(const std::string&, uint32_t h) const;
+  int32_t getId(const std::string_view) const;
+  int32_t getId(const std::string_view, uint32_t h) const;
   entry_type getType(int32_t) const;
-  entry_type getType(const std::string&) const;
+  entry_type getType(const std::string_view) const;
   bool discard(int32_t, real) const;
   std::string getWord(int32_t) const;
   const std::vector<int32_t>& getSubwords(int32_t) const;
@@ -87,7 +88,7 @@ class Dictionary {
       const std::string&,
       std::vector<int32_t>&,
       std::vector<std::string>* substrings = nullptr) const;
-  uint32_t hash(const std::string& str) const;
+  uint32_t hash(const std::string_view str) const;
   void add(const std::string&);
   bool readWord(std::istream&, std::string&) const;
   void readFromFile(std::istream&);
@@ -99,6 +100,8 @@ class Dictionary {
       const;
   int32_t getLine(std::istream&, std::vector<int32_t>&, std::minstd_rand&)
       const;
+  int32_t getStringNoNewline(std::string_view, std::vector<int32_t>&,
+      std::vector<int32_t>&) const;
   void threshold(int64_t, int64_t);
   void prune(std::vector<int32_t>&);
   bool isPruned() {

+ 2 - 2
src/fasttext.cc

@@ -532,7 +532,7 @@ std::vector<std::pair<std::string, Vector>> FastText::getNgramVectors(
     if (ngrams[i] >= 0) {
       vec.addRow(*input_, ngrams[i]);
     }
-    result.push_back(std::make_pair(substrings[i], std::move(vec)));
+    result.emplace_back(substrings[i], std::move(vec));
   }
   return result;
 }
@@ -609,7 +609,7 @@ std::vector<std::pair<real, std::string>> FastText::getAnalogies(
     const std::string& wordA,
     const std::string& wordB,
     const std::string& wordC) {
-  Vector query = Vector(args_->dim);
+  Vector query(args_->dim);
   query.zero();
 
   Vector buffer(args_->dim);

+ 1 - 0
src/matrix.h

@@ -36,6 +36,7 @@ class Matrix {
   virtual void addVectorToRow(const Vector&, int64_t, real) = 0;
   virtual void addRowToVector(Vector& x, int32_t i) const = 0;
   virtual void addRowToVector(Vector& x, int32_t i, real a) const = 0;
+  virtual void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const = 0;
   virtual void save(std::ostream&) const = 0;
   virtual void load(std::istream&) = 0;
   virtual void dump(std::ostream&) const = 0;

+ 1 - 5
src/model.cc

@@ -42,11 +42,7 @@ Model::Model(
 void Model::computeHidden(const std::vector<int32_t>& input, State& state)
     const {
   Vector& hidden = state.hidden;
-  hidden.zero();
-  for (auto it = input.cbegin(); it != input.cend(); ++it) {
-    hidden.addRow(*wi_, *it);
-  }
-  hidden.mul(1.0 / input.size());
+  wi_->averageRowsToVector(hidden, input);
 }
 
 void Model::predict(

+ 8 - 0
src/quantmatrix.cc

@@ -80,6 +80,14 @@ void QuantMatrix::addRowToVector(Vector& x, int32_t i) const {
   pq_->addcode(x, codes_.data(), i, norm);
 }
 
+// Generic row average for the quantized matrix: clear x, add each listed row
+// through addRowToVector, then scale by 1.0 / rows.size().
+void QuantMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
+  x.zero();
+  for (const int32_t row : rows) {
+    addRowToVector(x, row);
+  }
+  x.mul(1.0 / rows.size());
+}
+
 void QuantMatrix::save(std::ostream& out) const {
   out.write((char*)&qnorm_, sizeof(qnorm_));
   out.write((char*)&m_, sizeof(m_));

+ 1 - 0
src/quantmatrix.h

@@ -52,6 +52,7 @@ class QuantMatrix : public Matrix {
   void addVectorToRow(const Vector&, int64_t, real) override;
   void addRowToVector(Vector& x, int32_t i) const override;
   void addRowToVector(Vector& x, int32_t i, real a) const override;
+  void averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const override;
   void save(std::ostream&) const override;
   void load(std::istream&) override;
   void dump(std::ostream&) const override;

+ 3 - 2
src/vector.h

@@ -12,6 +12,7 @@
 #include <ostream>
 #include <vector>
 
+#include "aligned.h"
 #include "real.h"
 
 namespace fasttext {
@@ -20,12 +21,12 @@ class Matrix;
 
 class Vector {
  protected:
-  std::vector<real> data_;
+  intgemm::AlignedVector<real> data_;
 
  public:
   explicit Vector(int64_t);
   Vector(const Vector&) = default;
-  Vector(Vector&&) noexcept = default;
+  Vector(Vector&&) = default;
   Vector& operator=(const Vector&) = default;
   Vector& operator=(Vector&&) = default;