Skip to content
Merged

dedup #126

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 86 additions & 53 deletions src/libime/pinyin/pinyincontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/container_hash/hash.hpp>
#include <fcitx-utils/charutils.h>
#include <fcitx-utils/inputbuffer.h>
#include <fcitx-utils/keysym.h>
Expand All @@ -42,6 +44,7 @@

namespace libime {

namespace {
enum class LearnWordResult {
Normal, /// word is consisted all from regular word from dict.
Custom, /// word is consisted with custom word (e.g. symbol replacement).
Expand All @@ -65,6 +68,32 @@ struct SelectedPinyin {
SelectedPinyinType type_;
};

/// Identity of a candidate for de-duplication purposes: the text it renders
/// to, paired with the index at which it ends. Two candidates count as
/// duplicates only when both parts agree.
struct CandidateDedupKey {
    std::string text_; ///< Text of the candidate.
    size_t end_{0};    ///< End index of the candidate; 0 when unset.

    /// Keys are equal when the end index and the text both match.
    bool operator==(const CandidateDedupKey &other) const {
        if (end_ != other.end_) {
            return false;
        }
        return text_ == other.text_;
    }
};

/// Hash functor for CandidateDedupKey, suitable for unordered containers.
struct CandidateDedupKeyHash {
    /// Hashes the text first, then mixes the end index into the result.
    size_t operator()(const CandidateDedupKey &key) const {
        const std::hash<std::string> textHasher;
        size_t combined = textHasher(key.text_);
        boost::hash_combine(combined, key.end_);
        return combined;
    }
};

/// Builds the de-duplication key for a candidate: its string form plus the
/// index of the node where its last sentence element ends (0 when the
/// sentence is empty).
CandidateDedupKey candidateDedupKey(const SentenceResult &candidate) {
    auto text = candidate.toString();
    size_t endIndex = 0;
    if (!candidate.sentence().empty()) {
        endIndex = candidate.sentence().back()->to()->index();
    }
    return {.text_ = std::move(text), .end_ = endIndex};
}

} // namespace

class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
public:
PinyinContextPrivate(PinyinContext *q, PinyinIME *ime)
Expand Down Expand Up @@ -126,27 +155,39 @@ class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
candidatesToCursor_.clear();
candidatesToCursorSet_.clear();

// Maps each dedup key seen so far to the position of its candidate inside
// candidatesToCursor_.
std::unordered_map<CandidateDedupKey, size_t, CandidateDedupKeyHash>
    duplicateCandidates;
// Appends a candidate to candidatesToCursor_ and records its text in
// candidatesToCursorSet_. If a candidate with the same dedup key is
// already stored, nothing is appended; the stored one is overwritten only
// when the new candidate has a strictly higher score.
auto insertCandidate = [this, &duplicateCandidates](
                           SentenceResult candidate) {
    auto key = candidateDedupKey(candidate);
    const auto known = duplicateCandidates.find(key);
    if (known == duplicateCandidates.end()) {
        // First time this key is seen: remember the slot it will occupy.
        const size_t slot = candidatesToCursor_.size();
        candidatesToCursor_.push_back(std::move(candidate));
        duplicateCandidates.emplace(key, slot);
        // key is not used again, so its text can be moved into the set.
        candidatesToCursorSet_.insert(std::move(key.text_));
        return;
    }

    auto &existing = candidatesToCursor_[known->second];
    if (candidate.score() > existing.score()) {
        existing = std::move(candidate);
    }
};

auto start = q->selectedLength();
auto currentCursor = alignCursorToNextSegment();
// Poke best sentence from lattice, ignore nbest option for now.
auto nodeRange = lattice_.nodes(&segs_.node(currentCursor - start));
if (!nodeRange.empty()) {
candidatesToCursor_.push_back(nodeRange.front().toSentenceResult());
candidatesToCursorSet_.insert(
candidatesToCursor_.back().toString());
insertCandidate(nodeRange.front().toSentenceResult());
}
for (const auto &candidate : candidates_) {
const auto &sentence = candidate.sentence();
if (sentence.size() == 1) {
if (sentence.back()->to()->index() + start > currentCursor) {
continue;
}
auto text = candidate.toString();
if (candidatesToCursorSet_.contains(text)) {
continue;
}
candidatesToCursor_.push_back(candidate);
candidatesToCursorSet_.insert(std::move(text));
insertCandidate(candidate);
} else if (sentence.size() > 1) {
auto newSentence = sentence;
while (!newSentence.empty() &&
Expand All @@ -157,12 +198,7 @@ class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
if (!newSentence.empty()) {
SentenceResult partial(newSentence,
newSentence.back()->score());
auto text = partial.toString();
if (candidatesToCursorSet_.contains(text)) {
continue;
}
candidatesToCursor_.push_back(partial);
candidatesToCursorSet_.insert(std::move(text));
insertCandidate(std::move(partial));
}
}
}
Expand Down Expand Up @@ -606,7 +642,6 @@ void PinyinContext::update() {
// Add n-best result.
for (size_t i = 0, e = d->lattice_.sentenceSize(); i < e; i++) {
d->candidates_.push_back(d->lattice_.sentence(i));
d->candidatesSet_.insert(d->candidates_.back().toString());
}

const auto *bos = &graph.start();
Expand All @@ -632,13 +667,8 @@ void PinyinContext::update() {
min = std::min(latticeNode.score(), min);
max = std::max(latticeNode.score(), max);
}
// Deduplicate.
if (d->candidatesSet_.contains(latticeNode.word())) {
continue;
}
d->candidates_.push_back(
latticeNode.toSentenceResult(adjust));
d->candidatesSet_.insert(latticeNode.word());
}
}
}
Expand All @@ -651,9 +681,6 @@ void PinyinContext::update() {
if (latticeNode.from() == bos &&
static_cast<const PinyinLatticeNode &>(latticeNode)
.isCorrection()) {
if (d->candidatesSet_.contains(latticeNode.word())) {
continue;
}
if ((latticeNode.score() > min &&
latticeNode.score() + d->ime_->maxDistance() >
max) ||
Expand All @@ -662,7 +689,6 @@ void PinyinContext::update() {
.size() <= 2) {
d->candidates_.push_back(
latticeNode.toSentenceResult(adjust));
d->candidatesSet_.insert(latticeNode.word());
}
}
}
Expand All @@ -678,44 +704,51 @@ void PinyinContext::update() {
latticeNode.score() + d->ime_->maxDistance() > max &&
!static_cast<const PinyinLatticeNode &>(latticeNode)
.anyCorrectionOnPath()) {
auto fullWord = latticeNode.fullWord();
if (d->candidatesSet_.contains(fullWord)) {
continue;
}
d->candidates_.push_back(
latticeNode.toSentenceResult(adjust));
d->candidatesSet_.insert(fullWord);
}
}
}
}
std::sort(d->candidates_.begin() + beginSize, d->candidates_.end(),
std::greater<>());
if (const auto limit = d->ime_->wordCandidateLimit()) {
{
size_t index = 0;
size_t count = 0;
const auto limit = d->ime_->wordCandidateLimit();
std::unordered_set<CandidateDedupKey, CandidateDedupKeyHash>
duplicateCandidates;
auto &candidatesSet = d->candidatesSet_;
d->candidates_.erase(
std::remove_if(
d->candidates_.begin() + beginSize, d->candidates_.end(),
[&count, limit,
&candidatesSet](const SentenceResult &candidate) {
const bool isSinglePinyinWord =
candidate.sentence().size() == 1 &&
candidate.sentence()
.front()
->as<PinyinLatticeNode>()
.encodedPinyin()
.size() == 2;
if (!isSinglePinyinWord) {
if (count >= limit) {
candidatesSet.erase(candidate.toString());
return true;
}
count++;
}
return false;
}),
d->candidates_.end());
candidatesSet.clear();
std::erase_if(d->candidates_,
[&candidatesSet, &duplicateCandidates, &index, &count,
beginSize, limit](const SentenceResult &candidate) {
bool beforeBeginSize = index++ < beginSize;
auto key = candidateDedupKey(candidate);
if (duplicateCandidates.contains(key)) {
return true;
}

if (!beforeBeginSize && limit) {
const bool isSinglePinyinWord =
candidate.sentence().size() == 1 &&
candidate.sentence()
.front()
->as<PinyinLatticeNode>()
.encodedPinyin()
.size() == 2;
if (!isSinglePinyinWord) {
if (count >= limit) {
return true;
}
count++;
}
}

candidatesSet.insert(key.text_);
duplicateCandidates.insert(std::move(key));
return false;
});
}

d->candidatesToCursorNeedUpdate_ = true;
Expand Down
55 changes: 54 additions & 1 deletion test/testpinyincontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>
#include <fcitx-utils/log.h>
#include "libime/core/historybigram.h"
Expand All @@ -26,6 +27,28 @@

using namespace libime;

namespace {

// Asserts that context.candidateSet() equals the set of distinct strings
// obtained by calling toString() on every entry of context.candidates().
void checkCandidateSet(const PinyinContext &context) {
    std::unordered_set<std::string> candidates;
    for (const auto &entry : context.candidates()) {
        candidates.emplace(entry.toString());
    }

    FCITX_ASSERT(candidates == context.candidateSet());
}

// Asserts that context.candidatesToCursorSet() equals the set of distinct
// strings obtained by calling toString() on every entry of
// context.candidatesToCursor().
void checkCandidatesToCursorSet(const PinyinContext &context) {
    std::unordered_set<std::string> candidates;
    for (const auto &entry : context.candidatesToCursor()) {
        candidates.emplace(entry.toString());
    }

    FCITX_ASSERT(candidates == context.candidatesToCursorSet());
}

} // namespace

int main() {
PinyinIME ime(
std::make_unique<PinyinDictionary>(),
Expand All @@ -41,6 +64,20 @@ int main() {
ime.dict()->addWord(1, "zi'ji'ge'zi", "自机各自");
ime.setFuzzyFlags(PinyinFuzzyFlag::Inner);
PinyinContext c(&ime);

{
c.type("xian");
std::unordered_set<size_t> xiEndIndexes;
for (const auto &candidate : c.candidates()) {
if (candidate.toString() == "洗") {
xiEndIndexes.insert(candidate.sentence().back()->to()->index());
}
}
FCITX_ASSERT(xiEndIndexes.size() == 2) << xiEndIndexes.size();
FCITX_ASSERT(c.candidateSet().count("洗") == 1);
c.clear();
}

c.type("xianshi");

std::cout << c.sentence() << std::endl;
Expand Down Expand Up @@ -176,6 +213,7 @@ int main() {
{
c.clear();
c.type("nianglanghang");
checkCandidatesToCursorSet(c);
size_t i = 0;
for (const auto &candidate : c.candidatesToCursor()) {
if (candidate.toString() == "娘") {
Expand All @@ -189,7 +227,7 @@ int main() {
while (i > 0) {
--i;
c.setCursor(i);
c.candidatesToCursor();
checkCandidatesToCursorSet(c);
}
}

Expand Down Expand Up @@ -288,8 +326,23 @@ int main() {
{
c.clear();
c.clearContextWords();
auto wordCandidateLimit = ime.wordCandidateLimit();
ime.setWordCandidateLimit(1);
c.type("ziran");
checkCandidateSet(c);
ime.setWordCandidateLimit(wordCandidateLimit);
}

{
c.clear();
c.clearContextWords();
c.type("ziran");
checkCandidateSet(c);
c.clear();
FCITX_ASSERT(!ime.model()->history().containsBigram("他", "爱"));
c.type("taai");
checkCandidateSet(c);
checkCandidatesToCursorSet(c);
size_t i = 0;
for (const auto &candidate : c.candidatesToCursor()) {
if (candidate.toString() == "他爱") {
Expand Down
Loading