diff --git a/src/libime/pinyin/pinyincontext.cpp b/src/libime/pinyin/pinyincontext.cpp index 4a73704..b9b333f 100644 --- a/src/libime/pinyin/pinyincontext.cpp +++ b/src/libime/pinyin/pinyincontext.cpp @@ -17,9 +17,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -42,6 +44,7 @@ namespace libime { +namespace { enum class LearnWordResult { Normal, /// word is consisted all from regular word from dict. Custom, /// word is consisted with custom word (e.g. symbol replacement). @@ -65,6 +68,32 @@ struct SelectedPinyin { SelectedPinyinType type_; }; +struct CandidateDedupKey { + std::string text_; + size_t end_ = 0; + + bool operator==(const CandidateDedupKey &other) const { + return text_ == other.text_ && end_ == other.end_; + } +}; + +struct CandidateDedupKeyHash { + size_t operator()(const CandidateDedupKey &key) const { + size_t seed = std::hash()(key.text_); + boost::hash_combine(seed, key.end_); + return seed; + } +}; + +CandidateDedupKey candidateDedupKey(const SentenceResult &candidate) { + return {.text_ = candidate.toString(), + .end_ = candidate.sentence().empty() + ? 0 + : candidate.sentence().back()->to()->index()}; +} + +} // namespace + class PinyinContextPrivate : public fcitx::QPtrHolder { public: PinyinContextPrivate(PinyinContext *q, PinyinIME *ime) @@ -126,14 +155,31 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { candidatesToCursor_.clear(); candidatesToCursorSet_.clear(); + std::unordered_map + duplicateCandidates; + auto insertCandidate = [this, &duplicateCandidates]( + SentenceResult candidate) { + auto key = candidateDedupKey(candidate); + auto iter = duplicateCandidates.find(key); + if (iter != duplicateCandidates.end()) { + auto &oldCandidate = candidatesToCursor_[iter->second]; + if (candidate.score() > oldCandidate.score()) { + oldCandidate = std::move(candidate); + } + return; + } + + candidatesToCursor_.push_back(std::move(candidate)); + duplicateCandidates.emplace(key, candidatesToCursor_.size() - 1); + candidatesToCursorSet_.insert(std::move(key.text_)); + }; + auto start = q->selectedLength(); auto currentCursor = alignCursorToNextSegment(); // Poke best sentence from lattice, ignore nbest option for now. auto nodeRange = lattice_.nodes(&segs_.node(currentCursor - start)); if (!nodeRange.empty()) { - candidatesToCursor_.push_back(nodeRange.front().toSentenceResult()); - candidatesToCursorSet_.insert( - candidatesToCursor_.back().toString()); + insertCandidate(nodeRange.front().toSentenceResult()); } for (const auto &candidate : candidates_) { const auto &sentence = candidate.sentence(); @@ -141,12 +187,7 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { if (sentence.back()->to()->index() + start > currentCursor) { continue; } - auto text = candidate.toString(); - if (candidatesToCursorSet_.contains(text)) { - continue; - } - candidatesToCursor_.push_back(candidate); - candidatesToCursorSet_.insert(std::move(text)); + insertCandidate(candidate); } else if (sentence.size() > 1) { auto newSentence = sentence; while (!newSentence.empty() && @@ -157,12 +198,7 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { if (!newSentence.empty()) { SentenceResult partial(newSentence, newSentence.back()->score()); - auto text = partial.toString(); - if (candidatesToCursorSet_.contains(text)) { - continue; - } - candidatesToCursor_.push_back(partial); - candidatesToCursorSet_.insert(std::move(text)); + insertCandidate(std::move(partial)); } } } @@ -606,7 +642,6 @@ void PinyinContext::update() { // Add n-best result. for (size_t i = 0, e = d->lattice_.sentenceSize(); i < e; i++) { d->candidates_.push_back(d->lattice_.sentence(i)); - d->candidatesSet_.insert(d->candidates_.back().toString()); } const auto *bos = &graph.start(); @@ -632,13 +667,8 @@ void PinyinContext::update() { min = std::min(latticeNode.score(), min); max = std::max(latticeNode.score(), max); } - // Deduplcate. - if (d->candidatesSet_.contains(latticeNode.word())) { - continue; - } d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(latticeNode.word()); } } } @@ -651,9 +681,6 @@ void PinyinContext::update() { if (latticeNode.from() == bos && static_cast(latticeNode) .isCorrection()) { - if (d->candidatesSet_.contains(latticeNode.word())) { - continue; - } if ((latticeNode.score() > min && latticeNode.score() + d->ime_->maxDistance() > max) || @@ -662,7 +689,6 @@ void PinyinContext::update() { .size() <= 2) { d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(latticeNode.word()); } } } @@ -678,44 +704,51 @@ void PinyinContext::update() { latticeNode.score() + d->ime_->maxDistance() > max && !static_cast(latticeNode) .anyCorrectionOnPath()) { - auto fullWord = latticeNode.fullWord(); - if (d->candidatesSet_.contains(fullWord)) { - continue; - } d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(fullWord); } } } } std::sort(d->candidates_.begin() + beginSize, d->candidates_.end(), std::greater<>()); - if (const auto limit = d->ime_->wordCandidateLimit()) { + { + size_t index = 0; size_t count = 0; + const auto limit = d->ime_->wordCandidateLimit(); + std::unordered_set + duplicateCandidates; auto &candidatesSet = d->candidatesSet_; - d->candidates_.erase( - std::remove_if( - d->candidates_.begin() + beginSize, d->candidates_.end(), - [&count, limit, - &candidatesSet](const SentenceResult &candidate) { - const bool isSinglePinyinWord = - candidate.sentence().size() == 1 && - candidate.sentence() - .front() - ->as() - .encodedPinyin() - .size() == 2; - if (!isSinglePinyinWord) { - if (count >= limit) { - candidatesSet.erase(candidate.toString()); - return true; - } - count++; - } - return false; - }), - d->candidates_.end()); + candidatesSet.clear(); + std::erase_if(d->candidates_, + [&candidatesSet, &duplicateCandidates, &index, &count, + beginSize, limit](const SentenceResult &candidate) { + bool beforeBeginSize = index++ < beginSize; + auto key = candidateDedupKey(candidate); + if (duplicateCandidates.contains(key)) { + return true; + } + + if (!beforeBeginSize && limit) { + const bool isSinglePinyinWord = + candidate.sentence().size() == 1 && + candidate.sentence() + .front() + ->as() + .encodedPinyin() + .size() == 2; + if (!isSinglePinyinWord) { + if (count >= limit) { + return true; + } + count++; + } + } + + candidatesSet.insert(key.text_); + duplicateCandidates.insert(std::move(key)); + return false; + }); } d->candidatesToCursorNeedUpdate_ = true; diff --git a/test/testpinyincontext.cpp b/test/testpinyincontext.cpp index 7d36e9a..c8d9591 100644 --- a/test/testpinyincontext.cpp +++ b/test/testpinyincontext.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "libime/core/historybigram.h" @@ -26,6 +27,28 @@ using namespace libime; +namespace { + +void checkCandidateSet(const PinyinContext &context) { + std::unordered_set candidates; + for (const auto &candidate : context.candidates()) { + candidates.insert(candidate.toString()); + } + + FCITX_ASSERT(candidates == context.candidateSet()); +} + +void checkCandidatesToCursorSet(const PinyinContext &context) { + std::unordered_set candidates; + for (const auto &candidate : context.candidatesToCursor()) { + candidates.insert(candidate.toString()); + } + + FCITX_ASSERT(candidates == context.candidatesToCursorSet()); +} + +} // namespace + int main() { PinyinIME ime( std::make_unique(), @@ -41,6 +64,20 @@ int main() { ime.dict()->addWord(1, "zi'ji'ge'zi", "自机各自"); ime.setFuzzyFlags(PinyinFuzzyFlag::Inner); PinyinContext c(&ime); + + { + c.type("xian"); + std::unordered_set xiEndIndexes; + for (const auto &candidate : c.candidates()) { + if (candidate.toString() == "洗") { + xiEndIndexes.insert(candidate.sentence().back()->to()->index()); + } + } + FCITX_ASSERT(xiEndIndexes.size() == 2) << xiEndIndexes.size(); + FCITX_ASSERT(c.candidateSet().count("洗") == 1); + c.clear(); + } + c.type("xianshi"); std::cout << c.sentence() << std::endl; @@ -176,6 +213,7 @@ int main() { { c.clear(); c.type("nianglanghang"); + checkCandidatesToCursorSet(c); size_t i = 0; for (const auto &candidate : c.candidatesToCursor()) { if (candidate.toString() == "娘") { @@ -189,7 +227,7 @@ int main() { while (i > 0) { --i; c.setCursor(i); - c.candidatesToCursor(); + checkCandidatesToCursorSet(c); } } @@ -288,8 +326,23 @@ int main() { { c.clear(); c.clearContextWords(); + auto wordCandidateLimit = ime.wordCandidateLimit(); + ime.setWordCandidateLimit(1); + c.type("ziran"); + checkCandidateSet(c); + ime.setWordCandidateLimit(wordCandidateLimit); + } + + { + c.clear(); + c.clearContextWords(); + c.type("ziran"); + checkCandidateSet(c); + c.clear(); FCITX_ASSERT(!ime.model()->history().containsBigram("他", "爱")); c.type("taai"); + checkCandidateSet(c); + checkCandidatesToCursorSet(c); size_t i = 0; for (const auto &candidate : c.candidatesToCursor()) { if (candidate.toString() == "他爱") {