From 658c52631c487b2f08d83ee9caa125cf77fb3efc Mon Sep 17 00:00:00 2001 From: Weng Xuetian Date: Sun, 31 May 2026 20:18:30 -0700 Subject: [PATCH 1/2] When de-duplicating candidates, consider the candidate score. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For "xina", which is corrected to "xian", both "xian" and "xi" may match "洗". --- src/libime/pinyin/pinyincontext.cpp | 71 ++++++++++++++--------------- test/testpinyincontext.cpp | 41 ++++++++++++++++- 2 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/libime/pinyin/pinyincontext.cpp b/src/libime/pinyin/pinyincontext.cpp index 4a73704..d880cf7 100644 --- a/src/libime/pinyin/pinyincontext.cpp +++ b/src/libime/pinyin/pinyincontext.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -606,7 +607,6 @@ void PinyinContext::update() { // Add n-best result. for (size_t i = 0, e = d->lattice_.sentenceSize(); i < e; i++) { d->candidates_.push_back(d->lattice_.sentence(i)); - d->candidatesSet_.insert(d->candidates_.back().toString()); } const auto *bos = &graph.start(); @@ -632,13 +632,8 @@ void PinyinContext::update() { min = std::min(latticeNode.score(), min); max = std::max(latticeNode.score(), max); } - // Deduplcate. 
- if (d->candidatesSet_.contains(latticeNode.word())) { - continue; - } d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(latticeNode.word()); } } } @@ -651,9 +646,6 @@ void PinyinContext::update() { if (latticeNode.from() == bos && static_cast(latticeNode) .isCorrection()) { - if (d->candidatesSet_.contains(latticeNode.word())) { - continue; - } if ((latticeNode.score() > min && latticeNode.score() + d->ime_->maxDistance() > max) || @@ -662,7 +654,6 @@ void PinyinContext::update() { .size() <= 2) { d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(latticeNode.word()); } } } @@ -678,44 +669,48 @@ void PinyinContext::update() { latticeNode.score() + d->ime_->maxDistance() > max && !static_cast(latticeNode) .anyCorrectionOnPath()) { - auto fullWord = latticeNode.fullWord(); - if (d->candidatesSet_.contains(fullWord)) { - continue; - } d->candidates_.push_back( latticeNode.toSentenceResult(adjust)); - d->candidatesSet_.insert(fullWord); } } } } std::sort(d->candidates_.begin() + beginSize, d->candidates_.end(), std::greater<>()); - if (const auto limit = d->ime_->wordCandidateLimit()) { + { + size_t index = 0; size_t count = 0; + const auto limit = d->ime_->wordCandidateLimit(); auto &candidatesSet = d->candidatesSet_; - d->candidates_.erase( - std::remove_if( - d->candidates_.begin() + beginSize, d->candidates_.end(), - [&count, limit, - &candidatesSet](const SentenceResult &candidate) { - const bool isSinglePinyinWord = - candidate.sentence().size() == 1 && - candidate.sentence() - .front() - ->as() - .encodedPinyin() - .size() == 2; - if (!isSinglePinyinWord) { - if (count >= limit) { - candidatesSet.erase(candidate.toString()); - return true; - } - count++; - } - return false; - }), - d->candidates_.end()); + candidatesSet.clear(); + std::erase_if(d->candidates_, + [&candidatesSet, &index, &count, beginSize, + limit](const SentenceResult &candidate) { + bool beforeBeginSize = 
index++ < beginSize; + auto candidateString = candidate.toString(); + if (candidatesSet.contains(candidateString)) { + return true; + } + + if (!beforeBeginSize && limit) { + const bool isSinglePinyinWord = + candidate.sentence().size() == 1 && + candidate.sentence() + .front() + ->as() + .encodedPinyin() + .size() == 2; + if (!isSinglePinyinWord) { + if (count >= limit) { + return true; + } + count++; + } + } + + candidatesSet.insert(std::move(candidateString)); + return false; + }); } d->candidatesToCursorNeedUpdate_ = true; diff --git a/test/testpinyincontext.cpp b/test/testpinyincontext.cpp index 7d36e9a..08308af 100644 --- a/test/testpinyincontext.cpp +++ b/test/testpinyincontext.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "libime/core/historybigram.h" @@ -26,6 +27,28 @@ using namespace libime; +void checkCandidateSet(const PinyinContext &context) { + std::unordered_set candidates; + for (const auto &candidate : context.candidates()) { + auto candidateString = candidate.toString(); + FCITX_ASSERT(candidates.insert(candidateString).second) + << "Duplicate candidate: " << candidateString; + } + + FCITX_ASSERT(candidates == context.candidateSet()); +} + +void checkCandidatesToCursorSet(const PinyinContext &context) { + std::unordered_set candidates; + for (const auto &candidate : context.candidatesToCursor()) { + auto candidateString = candidate.toString(); + FCITX_ASSERT(candidates.insert(candidateString).second) + << "Duplicate candidate to cursor: " << candidateString; + } + + FCITX_ASSERT(candidates == context.candidatesToCursorSet()); +} + int main() { PinyinIME ime( std::make_unique(), @@ -176,6 +199,7 @@ int main() { { c.clear(); c.type("nianglanghang"); + checkCandidatesToCursorSet(c); size_t i = 0; for (const auto &candidate : c.candidatesToCursor()) { if (candidate.toString() == "娘") { @@ -189,7 +213,7 @@ int main() { while (i > 0) { --i; c.setCursor(i); - c.candidatesToCursor(); + checkCandidatesToCursorSet(c); 
} } @@ -288,8 +312,23 @@ int main() { { c.clear(); c.clearContextWords(); + auto wordCandidateLimit = ime.wordCandidateLimit(); + ime.setWordCandidateLimit(1); + c.type("ziran"); + checkCandidateSet(c); + ime.setWordCandidateLimit(wordCandidateLimit); + } + + { + c.clear(); + c.clearContextWords(); + c.type("ziran"); + checkCandidateSet(c); + c.clear(); FCITX_ASSERT(!ime.model()->history().containsBigram("他", "爱")); c.type("taai"); + checkCandidateSet(c); + checkCandidatesToCursorSet(c); size_t i = 0; for (const auto &candidate : c.candidatesToCursor()) { if (candidate.toString() == "他爱") { From fd07e6304dea1f6a9a6a2776bc0d1d4511d898f4 Mon Sep 17 00:00:00 2001 From: Weng Xuetian Date: Sun, 31 May 2026 21:24:50 -0700 Subject: [PATCH 2/2] Dedup based on selection range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows candidates with the same text but different selection ranges, such as "洗" from "xian" and "洗" from "xi", to both be kept instead of being deduplicated. The engine may show the full pinyin to distinguish them when there is a duplicate. --- src/libime/pinyin/pinyincontext.cpp | 78 +++++++++++++++++++++-------- test/testpinyincontext.cpp | 26 +++++++--- 2 files changed, 78 insertions(+), 26 deletions(-) diff --git a/src/libime/pinyin/pinyincontext.cpp b/src/libime/pinyin/pinyincontext.cpp index d880cf7..b9b333f 100644 --- a/src/libime/pinyin/pinyincontext.cpp +++ b/src/libime/pinyin/pinyincontext.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ namespace libime { +namespace { enum class LearnWordResult { Normal, /// word is consisted all from regular word from dict. Custom, /// word is consisted with custom word (e.g. symbol replacement). 
@@ -66,6 +68,32 @@ struct SelectedPinyin { SelectedPinyinType type_; }; +struct CandidateDedupKey { + std::string text_; + size_t end_ = 0; + + bool operator==(const CandidateDedupKey &other) const { + return text_ == other.text_ && end_ == other.end_; + } +}; + +struct CandidateDedupKeyHash { + size_t operator()(const CandidateDedupKey &key) const { + size_t seed = std::hash()(key.text_); + boost::hash_combine(seed, key.end_); + return seed; + } +}; + +CandidateDedupKey candidateDedupKey(const SentenceResult &candidate) { + return {.text_ = candidate.toString(), + .end_ = candidate.sentence().empty() + ? 0 + : candidate.sentence().back()->to()->index()}; +} + +} // namespace + class PinyinContextPrivate : public fcitx::QPtrHolder { public: PinyinContextPrivate(PinyinContext *q, PinyinIME *ime) @@ -127,14 +155,31 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { candidatesToCursor_.clear(); candidatesToCursorSet_.clear(); + std::unordered_map + duplicateCandidates; + auto insertCandidate = [this, &duplicateCandidates]( + SentenceResult candidate) { + auto key = candidateDedupKey(candidate); + auto iter = duplicateCandidates.find(key); + if (iter != duplicateCandidates.end()) { + auto &oldCandidate = candidatesToCursor_[iter->second]; + if (candidate.score() > oldCandidate.score()) { + oldCandidate = std::move(candidate); + } + return; + } + + candidatesToCursor_.push_back(std::move(candidate)); + duplicateCandidates.emplace(key, candidatesToCursor_.size() - 1); + candidatesToCursorSet_.insert(std::move(key.text_)); + }; + auto start = q->selectedLength(); auto currentCursor = alignCursorToNextSegment(); // Poke best sentence from lattice, ignore nbest option for now. 
auto nodeRange = lattice_.nodes(&segs_.node(currentCursor - start)); if (!nodeRange.empty()) { - candidatesToCursor_.push_back(nodeRange.front().toSentenceResult()); - candidatesToCursorSet_.insert( - candidatesToCursor_.back().toString()); + insertCandidate(nodeRange.front().toSentenceResult()); } for (const auto &candidate : candidates_) { const auto &sentence = candidate.sentence(); @@ -142,12 +187,7 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { if (sentence.back()->to()->index() + start > currentCursor) { continue; } - auto text = candidate.toString(); - if (candidatesToCursorSet_.contains(text)) { - continue; - } - candidatesToCursor_.push_back(candidate); - candidatesToCursorSet_.insert(std::move(text)); + insertCandidate(candidate); } else if (sentence.size() > 1) { auto newSentence = sentence; while (!newSentence.empty() && @@ -158,12 +198,7 @@ class PinyinContextPrivate : public fcitx::QPtrHolder { if (!newSentence.empty()) { SentenceResult partial(newSentence, newSentence.back()->score()); - auto text = partial.toString(); - if (candidatesToCursorSet_.contains(text)) { - continue; - } - candidatesToCursor_.push_back(partial); - candidatesToCursorSet_.insert(std::move(text)); + insertCandidate(std::move(partial)); } } } @@ -681,14 +716,16 @@ void PinyinContext::update() { size_t index = 0; size_t count = 0; const auto limit = d->ime_->wordCandidateLimit(); + std::unordered_set + duplicateCandidates; auto &candidatesSet = d->candidatesSet_; candidatesSet.clear(); std::erase_if(d->candidates_, - [&candidatesSet, &index, &count, beginSize, - limit](const SentenceResult &candidate) { + [&candidatesSet, &duplicateCandidates, &index, &count, + beginSize, limit](const SentenceResult &candidate) { bool beforeBeginSize = index++ < beginSize; - auto candidateString = candidate.toString(); - if (candidatesSet.contains(candidateString)) { + auto key = candidateDedupKey(candidate); + if (duplicateCandidates.contains(key)) { return true; } @@ -708,7 +745,8 
@@ void PinyinContext::update() { } } - candidatesSet.insert(std::move(candidateString)); + candidatesSet.insert(key.text_); + duplicateCandidates.insert(std::move(key)); return false; }); } diff --git a/test/testpinyincontext.cpp b/test/testpinyincontext.cpp index 08308af..c8d9591 100644 --- a/test/testpinyincontext.cpp +++ b/test/testpinyincontext.cpp @@ -27,12 +27,12 @@ using namespace libime; +namespace { + void checkCandidateSet(const PinyinContext &context) { std::unordered_set candidates; for (const auto &candidate : context.candidates()) { - auto candidateString = candidate.toString(); - FCITX_ASSERT(candidates.insert(candidateString).second) - << "Duplicate candidate: " << candidateString; + candidates.insert(candidate.toString()); } FCITX_ASSERT(candidates == context.candidateSet()); @@ -41,14 +41,14 @@ void checkCandidateSet(const PinyinContext &context) { void checkCandidatesToCursorSet(const PinyinContext &context) { std::unordered_set candidates; for (const auto &candidate : context.candidatesToCursor()) { - auto candidateString = candidate.toString(); - FCITX_ASSERT(candidates.insert(candidateString).second) - << "Duplicate candidate to cursor: " << candidateString; + candidates.insert(candidate.toString()); } FCITX_ASSERT(candidates == context.candidatesToCursorSet()); } +} // namespace + int main() { PinyinIME ime( std::make_unique(), @@ -64,6 +64,20 @@ int main() { ime.dict()->addWord(1, "zi'ji'ge'zi", "自机各自"); ime.setFuzzyFlags(PinyinFuzzyFlag::Inner); PinyinContext c(&ime); + + { + c.type("xian"); + std::unordered_set xiEndIndexes; + for (const auto &candidate : c.candidates()) { + if (candidate.toString() == "洗") { + xiEndIndexes.insert(candidate.sentence().back()->to()->index()); + } + } + FCITX_ASSERT(xiEndIndexes.size() == 2) << xiEndIndexes.size(); + FCITX_ASSERT(c.candidateSet().count("洗") == 1); + c.clear(); + } + c.type("xianshi"); std::cout << c.sentence() << std::endl;