|
@@ -15,119 +15,36 @@
|
|
|
|
|
|
|
|
namespace fasttext {
|
|
namespace fasttext {
|
|
|
|
|
|
|
|
-constexpr int64_t SIGMOID_TABLE_SIZE = 512;
|
|
|
|
|
-constexpr int64_t MAX_SIGMOID = 8;
|
|
|
|
|
-constexpr int64_t LOG_TABLE_SIZE = 512;
|
|
|
|
|
-
|
|
|
|
|
Model::Model(
|
|
Model::Model(
|
|
|
std::shared_ptr<Matrix> wi,
|
|
std::shared_ptr<Matrix> wi,
|
|
|
std::shared_ptr<Matrix> wo,
|
|
std::shared_ptr<Matrix> wo,
|
|
|
std::shared_ptr<Args> args,
|
|
std::shared_ptr<Args> args,
|
|
|
- const std::vector<int64_t>& targetCounts,
|
|
|
|
|
|
|
+ std::shared_ptr<Loss> loss,
|
|
|
int32_t seed)
|
|
int32_t seed)
|
|
|
: hidden_(args->dim), output_(wo->size(0)), grad_(args->dim), rng(seed) {
|
|
: hidden_(args->dim), output_(wo->size(0)), grad_(args->dim), rng(seed) {
|
|
|
wi_ = wi;
|
|
wi_ = wi;
|
|
|
wo_ = wo;
|
|
wo_ = wo;
|
|
|
args_ = args;
|
|
args_ = args;
|
|
|
|
|
+ loss_ = loss;
|
|
|
osz_ = wo->size(0);
|
|
osz_ = wo->size(0);
|
|
|
hsz_ = args->dim;
|
|
hsz_ = args->dim;
|
|
|
- negpos = 0;
|
|
|
|
|
- loss_ = 0.0;
|
|
|
|
|
|
|
+ lossValue_ = 0.0;
|
|
|
nexamples_ = 1;
|
|
nexamples_ = 1;
|
|
|
- t_sigmoid_.reserve(SIGMOID_TABLE_SIZE + 1);
|
|
|
|
|
- t_log_.reserve(LOG_TABLE_SIZE + 1);
|
|
|
|
|
- initSigmoid();
|
|
|
|
|
- initLog();
|
|
|
|
|
- setTargetCounts(targetCounts);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::binaryLogistic(int32_t target, bool label, real lr) {
|
|
|
|
|
- real score = sigmoid(wo_->dotRow(hidden_, target));
|
|
|
|
|
- real alpha = lr * (real(label) - score);
|
|
|
|
|
- grad_.addRow(*wo_, target, alpha);
|
|
|
|
|
- wo_->addVectorToRow(hidden_, target, alpha);
|
|
|
|
|
- if (label) {
|
|
|
|
|
- return -log(score);
|
|
|
|
|
- } else {
|
|
|
|
|
- return -log(1.0 - score);
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::negativeSampling(int32_t target, real lr) {
|
|
|
|
|
- real loss = 0.0;
|
|
|
|
|
- grad_.zero();
|
|
|
|
|
- for (int32_t n = 0; n <= args_->neg; n++) {
|
|
|
|
|
- if (n == 0) {
|
|
|
|
|
- loss += binaryLogistic(target, true, lr);
|
|
|
|
|
- } else {
|
|
|
|
|
- loss += binaryLogistic(getNegative(target), false, lr);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return loss;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::hierarchicalSoftmax(int32_t target, real lr) {
|
|
|
|
|
- real loss = 0.0;
|
|
|
|
|
- grad_.zero();
|
|
|
|
|
- const std::vector<bool>& binaryCode = codes[target];
|
|
|
|
|
- const std::vector<int32_t>& pathToRoot = paths[target];
|
|
|
|
|
- for (int32_t i = 0; i < pathToRoot.size(); i++) {
|
|
|
|
|
- loss += binaryLogistic(pathToRoot[i], binaryCode[i], lr);
|
|
|
|
|
- }
|
|
|
|
|
- return loss;
|
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-void Model::computeOutput(Vector& hidden, Vector& output) const {
|
|
|
|
|
- output.mul(*wo_, hidden);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::computeOutputSigmoid(Vector& hidden, Vector& output) const {
|
|
|
|
|
- computeOutput(hidden, output);
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- output[i] = sigmoid(output[i]);
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const {
|
|
|
|
|
- computeOutput(hidden, output);
|
|
|
|
|
- real max = output[0], z = 0.0;
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- max = std::max(output[i], max);
|
|
|
|
|
- }
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- output[i] = exp(output[i] - max);
|
|
|
|
|
- z += output[i];
|
|
|
|
|
- }
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- output[i] /= z;
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::computeOutputSoftmax() {
|
|
|
|
|
- computeOutputSoftmax(hidden_, output_);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::softmax(int32_t target, real lr) {
|
|
|
|
|
- grad_.zero();
|
|
|
|
|
- computeOutputSoftmax();
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- real label = (i == target) ? 1.0 : 0.0;
|
|
|
|
|
- real alpha = lr * (label - output_[i]);
|
|
|
|
|
- grad_.addRow(*wo_, i, alpha);
|
|
|
|
|
- wo_->addVectorToRow(hidden_, i, alpha);
|
|
|
|
|
- }
|
|
|
|
|
- return -log(output_[target]);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::oneVsAll(const std::vector<int32_t>& targets, real lr) {
|
|
|
|
|
- real loss = 0.0;
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- bool isMatch = utils::contains(targets, i);
|
|
|
|
|
- loss += binaryLogistic(i, isMatch, lr);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- return loss;
|
|
|
|
|
-}
|
|
|
|
|
|
|
+Model::Model(const Model& other, int32_t seed)
|
|
|
|
|
+ : wi_(other.wi_),
|
|
|
|
|
+ wo_(other.wo_),
|
|
|
|
|
+ args_(other.args_),
|
|
|
|
|
+ loss_(other.loss_),
|
|
|
|
|
+ hidden_(other.hidden_),
|
|
|
|
|
+ output_(other.output_),
|
|
|
|
|
+ grad_(other.grad_),
|
|
|
|
|
+ hsz_(other.hsz_),
|
|
|
|
|
+ osz_(other.osz_),
|
|
|
|
|
+ lossValue_(other.lossValue_),
|
|
|
|
|
+ nexamples_(other.nexamples_),
|
|
|
|
|
+ rng(seed) {}
|
|
|
|
|
|
|
|
void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden)
|
|
void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden)
|
|
|
const {
|
|
const {
|
|
@@ -139,17 +56,11 @@ void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden)
|
|
|
hidden.mul(1.0 / input.size());
|
|
hidden.mul(1.0 / input.size());
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-bool Model::comparePairs(
|
|
|
|
|
- const std::pair<real, int32_t>& l,
|
|
|
|
|
- const std::pair<real, int32_t>& r) {
|
|
|
|
|
- return l.first > r.first;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
void Model::predict(
|
|
void Model::predict(
|
|
|
const std::vector<int32_t>& input,
|
|
const std::vector<int32_t>& input,
|
|
|
int32_t k,
|
|
int32_t k,
|
|
|
real threshold,
|
|
real threshold,
|
|
|
- std::vector<std::pair<real, int32_t>>& heap,
|
|
|
|
|
|
|
+ Predictions& heap,
|
|
|
Vector& hidden,
|
|
Vector& hidden,
|
|
|
Vector& output) const {
|
|
Vector& output) const {
|
|
|
if (k == Model::kUnlimitedPredictions) {
|
|
if (k == Model::kUnlimitedPredictions) {
|
|
@@ -162,101 +73,18 @@ void Model::predict(
|
|
|
}
|
|
}
|
|
|
heap.reserve(k + 1);
|
|
heap.reserve(k + 1);
|
|
|
computeHidden(input, hidden);
|
|
computeHidden(input, hidden);
|
|
|
- if (args_->loss == loss_name::hs) {
|
|
|
|
|
- dfs(k, threshold, 2 * osz_ - 2, 0.0, heap, hidden);
|
|
|
|
|
- } else {
|
|
|
|
|
- findKBest(k, threshold, heap, hidden, output);
|
|
|
|
|
- }
|
|
|
|
|
- std::sort_heap(heap.begin(), heap.end(), comparePairs);
|
|
|
|
|
|
|
+
|
|
|
|
|
+ loss_->predict(k, threshold, heap, hidden, output);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void Model::predict(
|
|
void Model::predict(
|
|
|
const std::vector<int32_t>& input,
|
|
const std::vector<int32_t>& input,
|
|
|
int32_t k,
|
|
int32_t k,
|
|
|
real threshold,
|
|
real threshold,
|
|
|
- std::vector<std::pair<real, int32_t>>& heap) {
|
|
|
|
|
|
|
+ Predictions& heap) {
|
|
|
predict(input, k, threshold, heap, hidden_, output_);
|
|
predict(input, k, threshold, heap, hidden_, output_);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-void Model::findKBest(
|
|
|
|
|
- int32_t k,
|
|
|
|
|
- real threshold,
|
|
|
|
|
- std::vector<std::pair<real, int32_t>>& heap,
|
|
|
|
|
- Vector& hidden,
|
|
|
|
|
- Vector& output) const {
|
|
|
|
|
- if (args_->loss == loss_name::ova) {
|
|
|
|
|
- computeOutputSigmoid(hidden, output);
|
|
|
|
|
- } else {
|
|
|
|
|
- computeOutputSoftmax(hidden, output);
|
|
|
|
|
- }
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- if (output[i] < threshold) {
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
- if (heap.size() == k && std_log(output[i]) < heap.front().first) {
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
- heap.push_back(std::make_pair(std_log(output[i]), i));
|
|
|
|
|
- std::push_heap(heap.begin(), heap.end(), comparePairs);
|
|
|
|
|
- if (heap.size() > k) {
|
|
|
|
|
- std::pop_heap(heap.begin(), heap.end(), comparePairs);
|
|
|
|
|
- heap.pop_back();
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::dfs(
|
|
|
|
|
- int32_t k,
|
|
|
|
|
- real threshold,
|
|
|
|
|
- int32_t node,
|
|
|
|
|
- real score,
|
|
|
|
|
- std::vector<std::pair<real, int32_t>>& heap,
|
|
|
|
|
- Vector& hidden) const {
|
|
|
|
|
- if (score < std_log(threshold)) {
|
|
|
|
|
- return;
|
|
|
|
|
- }
|
|
|
|
|
- if (heap.size() == k && score < heap.front().first) {
|
|
|
|
|
- return;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- if (tree[node].left == -1 && tree[node].right == -1) {
|
|
|
|
|
- heap.push_back(std::make_pair(score, node));
|
|
|
|
|
- std::push_heap(heap.begin(), heap.end(), comparePairs);
|
|
|
|
|
- if (heap.size() > k) {
|
|
|
|
|
- std::pop_heap(heap.begin(), heap.end(), comparePairs);
|
|
|
|
|
- heap.pop_back();
|
|
|
|
|
- }
|
|
|
|
|
- return;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- real f = wo_->dotRow(hidden, node - osz_);
|
|
|
|
|
- f = 1. / (1 + std::exp(-f));
|
|
|
|
|
-
|
|
|
|
|
- dfs(k, threshold, tree[node].left, score + std_log(1.0 - f), heap, hidden);
|
|
|
|
|
- dfs(k, threshold, tree[node].right, score + std_log(f), heap, hidden);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::computeLoss(
|
|
|
|
|
- const std::vector<int32_t>& targets,
|
|
|
|
|
- int32_t targetIndex,
|
|
|
|
|
- real lr) {
|
|
|
|
|
- real loss = 0.0;
|
|
|
|
|
-
|
|
|
|
|
- if (args_->loss == loss_name::ns) {
|
|
|
|
|
- loss = negativeSampling(targets[targetIndex], lr);
|
|
|
|
|
- } else if (args_->loss == loss_name::hs) {
|
|
|
|
|
- loss = hierarchicalSoftmax(targets[targetIndex], lr);
|
|
|
|
|
- } else if (args_->loss == loss_name::softmax) {
|
|
|
|
|
- loss = softmax(targets[targetIndex], lr);
|
|
|
|
|
- } else if (args_->loss == loss_name::ova) {
|
|
|
|
|
- loss = oneVsAll(targets, lr);
|
|
|
|
|
- } else {
|
|
|
|
|
- throw std::invalid_argument("Unhandled loss function for this model.");
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- return loss;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
void Model::update(
|
|
void Model::update(
|
|
|
const std::vector<int32_t>& input,
|
|
const std::vector<int32_t>& input,
|
|
|
const std::vector<int32_t>& targets,
|
|
const std::vector<int32_t>& targets,
|
|
@@ -267,13 +95,9 @@ void Model::update(
|
|
|
}
|
|
}
|
|
|
computeHidden(input, hidden_);
|
|
computeHidden(input, hidden_);
|
|
|
|
|
|
|
|
- if (targetIndex == kAllLabelsAsTarget) {
|
|
|
|
|
- loss_ += computeLoss(targets, -1, lr);
|
|
|
|
|
- } else {
|
|
|
|
|
- assert(targetIndex >= 0);
|
|
|
|
|
- assert(targetIndex < targets.size());
|
|
|
|
|
- loss_ += computeLoss(targets, targetIndex, lr);
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ grad_.zero();
|
|
|
|
|
+ lossValue_ += loss_->forward(
|
|
|
|
|
+ targets, targetIndex, hidden_, output_, grad_, lr, rng, true);
|
|
|
|
|
|
|
|
nexamples_ += 1;
|
|
nexamples_ += 1;
|
|
|
|
|
|
|
@@ -285,123 +109,12 @@ void Model::update(
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-void Model::setTargetCounts(const std::vector<int64_t>& counts) {
|
|
|
|
|
- assert(counts.size() == osz_);
|
|
|
|
|
- if (args_->loss == loss_name::ns) {
|
|
|
|
|
- initTableNegatives(counts);
|
|
|
|
|
- }
|
|
|
|
|
- if (args_->loss == loss_name::hs) {
|
|
|
|
|
- buildTree(counts);
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::initTableNegatives(const std::vector<int64_t>& counts) {
|
|
|
|
|
- real z = 0.0;
|
|
|
|
|
- for (size_t i = 0; i < counts.size(); i++) {
|
|
|
|
|
- z += pow(counts[i], 0.5);
|
|
|
|
|
- }
|
|
|
|
|
- for (size_t i = 0; i < counts.size(); i++) {
|
|
|
|
|
- real c = pow(counts[i], 0.5);
|
|
|
|
|
- for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) {
|
|
|
|
|
- negatives_.push_back(i);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- std::shuffle(negatives_.begin(), negatives_.end(), rng);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-int32_t Model::getNegative(int32_t target) {
|
|
|
|
|
- int32_t negative;
|
|
|
|
|
- do {
|
|
|
|
|
- negative = negatives_[negpos];
|
|
|
|
|
- negpos = (negpos + 1) % negatives_.size();
|
|
|
|
|
- } while (target == negative);
|
|
|
|
|
- return negative;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::buildTree(const std::vector<int64_t>& counts) {
|
|
|
|
|
- tree.resize(2 * osz_ - 1);
|
|
|
|
|
- for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
|
|
|
|
|
- tree[i].parent = -1;
|
|
|
|
|
- tree[i].left = -1;
|
|
|
|
|
- tree[i].right = -1;
|
|
|
|
|
- tree[i].count = 1e15;
|
|
|
|
|
- tree[i].binary = false;
|
|
|
|
|
- }
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- tree[i].count = counts[i];
|
|
|
|
|
- }
|
|
|
|
|
- int32_t leaf = osz_ - 1;
|
|
|
|
|
- int32_t node = osz_;
|
|
|
|
|
- for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
|
|
|
|
|
- int32_t mini[2];
|
|
|
|
|
- for (int32_t j = 0; j < 2; j++) {
|
|
|
|
|
- if (leaf >= 0 && tree[leaf].count < tree[node].count) {
|
|
|
|
|
- mini[j] = leaf--;
|
|
|
|
|
- } else {
|
|
|
|
|
- mini[j] = node++;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- tree[i].left = mini[0];
|
|
|
|
|
- tree[i].right = mini[1];
|
|
|
|
|
- tree[i].count = tree[mini[0]].count + tree[mini[1]].count;
|
|
|
|
|
- tree[mini[0]].parent = i;
|
|
|
|
|
- tree[mini[1]].parent = i;
|
|
|
|
|
- tree[mini[1]].binary = true;
|
|
|
|
|
- }
|
|
|
|
|
- for (int32_t i = 0; i < osz_; i++) {
|
|
|
|
|
- std::vector<int32_t> path;
|
|
|
|
|
- std::vector<bool> code;
|
|
|
|
|
- int32_t j = i;
|
|
|
|
|
- while (tree[j].parent != -1) {
|
|
|
|
|
- path.push_back(tree[j].parent - osz_);
|
|
|
|
|
- code.push_back(tree[j].binary);
|
|
|
|
|
- j = tree[j].parent;
|
|
|
|
|
- }
|
|
|
|
|
- paths.push_back(path);
|
|
|
|
|
- codes.push_back(code);
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
real Model::getLoss() const {
|
|
real Model::getLoss() const {
|
|
|
- return loss_ / nexamples_;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::initSigmoid() {
|
|
|
|
|
- for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
|
|
|
|
|
- real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
|
|
|
|
|
- t_sigmoid_.push_back(1.0 / (1.0 + std::exp(-x)));
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-void Model::initLog() {
|
|
|
|
|
- for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
|
|
|
|
|
- real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
|
|
|
|
|
- t_log_.push_back(std::log(x));
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-real Model::log(real x) const {
|
|
|
|
|
- if (x > 1.0) {
|
|
|
|
|
- return 0.0;
|
|
|
|
|
- }
|
|
|
|
|
- int64_t i = int64_t(x * LOG_TABLE_SIZE);
|
|
|
|
|
- return t_log_[i];
|
|
|
|
|
|
|
+ return lossValue_ / nexamples_;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
real Model::std_log(real x) const {
|
|
real Model::std_log(real x) const {
|
|
|
return std::log(x + 1e-5);
|
|
return std::log(x + 1e-5);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-real Model::sigmoid(real x) const {
|
|
|
|
|
- if (x < -MAX_SIGMOID) {
|
|
|
|
|
- return 0.0;
|
|
|
|
|
- } else if (x > MAX_SIGMOID) {
|
|
|
|
|
- return 1.0;
|
|
|
|
|
- } else {
|
|
|
|
|
- int64_t i =
|
|
|
|
|
- int64_t((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
|
|
|
|
|
- return t_sigmoid_[i];
|
|
|
|
|
- }
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
} // namespace fasttext
|
|
} // namespace fasttext
|