|
|
@@ -103,7 +103,7 @@ void Dictionary::getSubwords(const std::string& word,
|
|
|
substrings.push_back(words_[i].word);
|
|
|
}
|
|
|
if (word != EOS) {
|
|
|
- computeSubwords(BOW + word + EOW, ngrams, substrings);
|
|
|
+ computeSubwords(BOW + word + EOW, ngrams, &substrings);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -159,26 +159,7 @@ uint32_t Dictionary::hash(const std::string& str) const {
|
|
|
|
|
|
void Dictionary::computeSubwords(const std::string& word,
|
|
|
std::vector<int32_t>& ngrams,
|
|
|
- std::vector<std::string>& substrings) const {
|
|
|
- for (size_t i = 0; i < word.size(); i++) {
|
|
|
- std::string ngram;
|
|
|
- if ((word[i] & 0xC0) == 0x80) continue;
|
|
|
- for (size_t j = i, n = 1; j < word.size() && n <= args_->maxn; n++) {
|
|
|
- ngram.push_back(word[j++]);
|
|
|
- while (j < word.size() && (word[j] & 0xC0) == 0x80) {
|
|
|
- ngram.push_back(word[j++]);
|
|
|
- }
|
|
|
- if (n >= args_->minn && !(n == 1 && (i == 0 || j == word.size()))) {
|
|
|
- int32_t h = hash(ngram) % args_->bucket;
|
|
|
- ngrams.push_back(nwords_ + h);
|
|
|
- substrings.push_back(ngram);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-void Dictionary::computeSubwords(const std::string& word,
|
|
|
- std::vector<int32_t>& ngrams) const {
|
|
|
+ std::vector<std::string>* substrings) const {
|
|
|
for (size_t i = 0; i < word.size(); i++) {
|
|
|
std::string ngram;
|
|
|
if ((word[i] & 0xC0) == 0x80) continue;
|
|
|
@@ -190,6 +171,9 @@ void Dictionary::computeSubwords(const std::string& word,
|
|
|
if (n >= args_->minn && !(n == 1 && (i == 0 || j == word.size()))) {
|
|
|
int32_t h = hash(ngram) % args_->bucket;
|
|
|
pushHash(ngrams, h);
|
|
|
+ if (substrings) {
|
|
|
+ substrings->push_back(ngram);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|