|
|
@@ -15,6 +15,10 @@
|
|
|
#include "utils.h"
|
|
|
#include "vector.h"
|
|
|
|
|
|
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
|
|
|
+#include <immintrin.h>
|
|
|
+#endif
|
|
|
+
|
|
|
namespace fasttext {
|
|
|
|
|
|
// Default constructor: delegates to the two-argument constructor with (0, 0).
DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {}
|
|
|
@@ -146,6 +150,92 @@ void DenseMatrix::addRowToVector(Vector& x, int32_t i, real a) const {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/* Abstract over AVX512F, AVX, and SSE intrinsics, using the one available on this machine. */
|
|
|
+#if defined(__AVX512F__)
|
|
|
+using Register = __m512;
|
|
|
+inline Register Add(Register first, Register second) { return _mm512_add_ps(first, second); }
|
|
|
+inline Register Set1(float to) { return _mm512_set1_ps(to); }
|
|
|
+inline Register Multiply(Register first, Register second) { return _mm512_mul_ps(first, second); }
|
|
|
+#elif defined(__AVX__)
|
|
|
+using Register = __m256;
|
|
|
+inline Register Add(Register first, Register second) { return _mm256_add_ps(first, second); }
|
|
|
+inline Register Set1(float to) { return _mm256_set1_ps(to); }
|
|
|
+inline Register Multiply(Register first, Register second) { return _mm256_mul_ps(first, second); }
|
|
|
+#elif defined(__SSE__)
|
|
|
+using Register = __m128;
|
|
|
+inline Register Add(Register first, Register second) { return _mm_add_ps(first, second); }
|
|
|
+inline Register Set1(float to) { return _mm_set1_ps(to); }
|
|
|
+inline Register Multiply(Register first, Register second) { return _mm_mul_ps(first, second); }
|
|
|
+#endif
|
|
|
+
|
|
|
+/* Faster routine for averaging rows of a matrix on x86.
|
|
|
+ * The idea here is to keep the accumulators in registers if possible. */
|
|
|
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
|
|
|
+template <unsigned Cols> void averageRowsFast(Vector& x, const std::vector<int32_t>& rows, const DenseMatrix &matrix) {
|
|
|
+ // Columns must be a multiple of how many floats fit in a register.
|
|
|
+ static_assert(Cols % (sizeof(Register) / 4) == 0);
|
|
|
+ constexpr unsigned RegisterCount = Cols / (sizeof(Register) / 4);
|
|
|
+ // These should be aligned by aligned.h
|
|
|
+ assert(reinterpret_cast<uintptr_t>(x.data()) % sizeof(Register) == 0);
|
|
|
+ assert(reinterpret_cast<uintptr_t>(matrix.data()) % sizeof(Register) == 0);
|
|
|
+
|
|
|
+ // Guard against empty list of rows with default NaN behavior.
|
|
|
+ if (rows.empty()) {
|
|
|
+ x.zero();
|
|
|
+ x.mul(1.0 / rows.size());
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Copy the first row to accumulation registers.
|
|
|
+ Register accum[RegisterCount];
|
|
|
+ auto row = rows.cbegin();
|
|
|
+ const Register *base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
|
|
|
+ for (unsigned i = 0; i < RegisterCount; ++i) {
|
|
|
+ accum[i] = base[i];
|
|
|
+ }
|
|
|
+ // Add the rows after the first.
|
|
|
+ for (++row; row != rows.cend(); ++row) {
|
|
|
+ base = reinterpret_cast<const Register*>(matrix.data() + matrix.cols() * *row);
|
|
|
+ for (unsigned i = 0; i < RegisterCount; ++i) {
|
|
|
+ accum[i] = Add(accum[i], base[i]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Multiply by (1.0 / rows.size()) and write to x.
|
|
|
+ Register mul = Set1(1.0 / rows.size());
|
|
|
+ for (unsigned i = 0; i < RegisterCount; ++i) {
|
|
|
+ reinterpret_cast<Register*>(x.data())[i] = Multiply(accum[i], mul);
|
|
|
+ }
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+void DenseMatrix::averageRowsToVector(Vector& x, const std::vector<int32_t>& rows) const {
|
|
|
+#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
|
|
|
+ switch (cols()) {
|
|
|
+ case 512:
|
|
|
+ // Maximum number that can fit all in registers on AVX512F.
|
|
|
+ averageRowsFast<512>(x, rows, *this);
|
|
|
+ return;
|
|
|
+ case 256:
|
|
|
+ averageRowsFast<256>(x, rows, *this);
|
|
|
+ return;
|
|
|
+ case 64:
|
|
|
+ averageRowsFast<64>(x, rows, *this);
|
|
|
+ return;
|
|
|
+ case 32:
|
|
|
+ averageRowsFast<32>(x, rows, *this);
|
|
|
+ return;
|
|
|
+ case 16:
|
|
|
+ averageRowsFast<16>(x, rows, *this);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+#endif
|
|
|
+ x.zero();
|
|
|
+ for (auto it = rows.cbegin(); it != rows.cend(); ++it) {
|
|
|
+ addRowToVector(x, *it);
|
|
|
+ }
|
|
|
+ x.mul(1.0 / rows.size());
|
|
|
+}
|
|
|
+
|
|
|
void DenseMatrix::save(std::ostream& out) const {
|
|
|
out.write((char*)&m_, sizeof(int64_t));
|
|
|
out.write((char*)&n_, sizeof(int64_t));
|
|
|
@@ -155,7 +245,7 @@ void DenseMatrix::save(std::ostream& out) const {
|
|
|
// Deserializes the matrix from `in`: reads m_ and then n_ as raw
// sizeof(int64_t)-byte values, replaces data_ with storage for m_ * n_
// elements, and fills that storage with m_ * n_ * sizeof(real) raw bytes.
// The removed/added line pair below switches the storage type from
// std::vector<real> to intgemm::AlignedVector<real>.
// NOTE(review): neither the stream state nor the dimensions read from the
// stream are checked here.
void DenseMatrix::load(std::istream& in) {
|
|
|
in.read((char*)&m_, sizeof(int64_t));
|
|
|
in.read((char*)&n_, sizeof(int64_t));
|
|
|
- data_ = std::vector<real>(m_ * n_);
|
|
|
+ data_ = intgemm::AlignedVector<real>(m_ * n_);
|
|
|
in.read((char*)data_.data(), m_ * n_ * sizeof(real));
|
|
|
}
|
|
|
|