Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/libime/core/historybigram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,4 +571,14 @@ void HistoryBigram::fillPredict(std::unordered_set<std::string> &words,
pool.fillPredict(words, lookup, maxSize);
});
}

bool HistoryBigram::containsBigram(std::string_view prev,
std::string_view cur) const {
FCITX_D();
return std::ranges::any_of(d->pools_,
[&prev, &cur](const HistoryBigramPool &pool) {
return pool.bigramFreq(prev, cur) > 0;
});
}

} // namespace libime
2 changes: 2 additions & 0 deletions src/libime/core/historybigram.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class LIBIMECORE_EXPORT HistoryBigram {
const std::vector<std::string> &sentence,
size_t maxSize) const;

/// Check whether the bigram (prev, cur) has a non-zero frequency in any of
/// the history pools.
bool containsBigram(std::string_view prev, std::string_view cur) const;

private:
std::unique_ptr<HistoryBigramPrivate> d_ptr;
FCITX_DECLARE_PRIVATE(HistoryBigram);
Expand Down
31 changes: 31 additions & 0 deletions src/libime/core/languagemodel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

#include "languagemodel.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
Expand All @@ -27,9 +28,11 @@
#include "lm/config.hh"
#include "lm/lm_exception.hh"
#include "lm/model.hh"
#include "lm/return.hh"
#include "lm/state.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"
#include "utils.h"

namespace libime {

Expand Down Expand Up @@ -72,6 +75,8 @@ const DATrie<float> &StaticLanguageModelFile::predictionTrie() const {

static_assert(sizeof(void *) + sizeof(lm::ngram::State) <= StateSize, "Size");

// Out-of-line definition of the virtual destructor; nothing to release here.
LanguageModelBase::~LanguageModelBase() = default;

// Convenience wrapper: forwards the node's index and word to isUnknown().
bool LanguageModelBase::isNodeUnknown(const LatticeNode &node) const {
    const auto wordIndex = node.idx();
    const auto &text = node.word();
    return isUnknown(wordIndex, text);
}
Expand Down Expand Up @@ -217,6 +222,32 @@ bool LanguageModel::isUnknown(WordIndex idx, std::string_view /*word*/) const {
return idx == unknown();
}

unsigned int
LanguageModel::maxNgramLength(const std::vector<std::string> &words) const {
FCITX_D();
if (!d->model()) {
return 0;
}
State state = nullState();
State outState;

unsigned int maxNgramLength = 0;
std::vector<WordNode> nodes;
for (const auto &word : words) {
const auto idx = index(word);
lm::FullScoreReturn full =
d->model()->FullScore(lmState(state), idx, lmState(outState));
unsigned int ngramLength = full.ngram_length;
if (ngramLength == 1 && idx == unknown()) {
ngramLength = 0;
}

maxNgramLength = std::max(maxNgramLength, ngramLength);
state = outState;
}
return maxNgramLength;
}

void LanguageModel::setUnknownPenalty(float unknown) {
FCITX_D();
d->unknown_ = unknown;
Expand Down
4 changes: 3 additions & 1 deletion src/libime/core/languagemodel.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class LanguageModelResolverPrivate;

class LIBIMECORE_EXPORT LanguageModelBase {
public:
virtual ~LanguageModelBase() {}
virtual ~LanguageModelBase();

virtual WordIndex beginSentence() const = 0;
virtual WordIndex endSentence() const = 0;
Expand Down Expand Up @@ -89,6 +89,8 @@ class LIBIMECORE_EXPORT LanguageModel : public LanguageModelBase {
void setUnknownPenalty(float unknown);
float unknownPenalty() const;

/// Score \p words in order against the static model and return the longest
/// n-gram length matched for any word; returns 0 if no model is loaded.
unsigned int maxNgramLength(const std::vector<std::string> &words) const;

private:
std::unique_ptr<LanguageModelPrivate> d_ptr;
FCITX_DECLARE_PRIVATE(LanguageModel);
Expand Down
20 changes: 20 additions & 0 deletions src/libime/core/userlanguagemodel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
#include <cassert>
#include <cmath>
#include <istream>
#include <iterator>
#include <memory>
#include <ostream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include <fcitx-utils/macros.h>
#include "constants.h"
#include "historybigram.h"
Expand Down Expand Up @@ -150,4 +153,21 @@ bool UserLanguageModel::useOnlyUnigram() const {
FCITX_D();
return d->useOnlyUnigram_;
}

/// Tell whether \p words is already covered by something longer than a
/// unigram: either an adjacent pair is recorded in the user history, or the
/// base language model matches an n-gram of length greater than 1.
///
/// Always false when only unigrams are in use or fewer than two words are
/// given.
bool UserLanguageModel::containsNonUnigram(
    const std::vector<std::string> &words) const {
    FCITX_D();
    if (d->useOnlyUnigram_ || words.size() < 2) {
        return false;
    }

    // Check every adjacent (previous, current) pair against the history.
    for (size_t i = 1; i < words.size(); ++i) {
        if (d->history_.containsBigram(words[i - 1], words[i])) {
            return true;
        }
    }

    // Nothing in the history; fall back to the static model.
    return LanguageModel::maxNgramLength(words) > 1;
}

} // namespace libime
4 changes: 4 additions & 0 deletions src/libime/core/userlanguagemodel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
#include <istream>
#include <memory>
#include <ostream>
#include <string>
#include <string_view>
#include <vector>
#include <fcitx-utils/macros.h>
#include <libime/core/languagemodel.h>
#include <libime/core/libimecore_export.h>
Expand Down Expand Up @@ -44,6 +46,8 @@ class LIBIMECORE_EXPORT UserLanguageModel : public LanguageModel {
State &out) const override;
bool isUnknown(WordIndex idx, std::string_view view) const override;

/// Return true if any adjacent pair in \p words is a recorded history
/// bigram, or the base model matches an n-gram longer than 1 for \p words.
/// Always false when only unigrams are used or fewer than two words are
/// given.
bool containsNonUnigram(const std::vector<std::string> &words) const;

private:
std::unique_ptr<UserLanguageModelPrivate> d_ptr;
FCITX_DECLARE_PRIVATE(UserLanguageModel);
Expand Down
12 changes: 10 additions & 2 deletions src/libime/pinyin/pinyincontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,17 @@ class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
totalPinyinLength += item.encodedPinyin_.size() / 2;
}
}
if (!isAllSingleWord && !hasCustom && totalPinyinLength > 4) {
return LearnWordResult::Ignored;

FCITX_Q();
if (!hasCustom) {
if ((!isAllSingleWord && totalPinyinLength > 4)) {
return LearnWordResult::Ignored;
}
if (ime_->model()->containsNonUnigram(q->selectedWords())) {
return LearnWordResult::Ignored;
}
}

for (auto &s : selected_) {
for (auto &item : s) {
if (item.type_ == SelectedPinyinType::Separator) {
Expand Down
22 changes: 22 additions & 0 deletions test/testpinyincontext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,5 +285,27 @@ int main() {
}
}

// Learning a selected two-word sentence must record its bigram in history.
{
c.clear();
c.clearContextWords();
// Precondition: the bigram is not in the history yet.
FCITX_ASSERT(!ime.model()->history().containsBigram("他", "爱"));
c.type("taai");
// Locate the index of the "他爱" candidate; i ends up equal to the
// candidate count when it is not found.
size_t i = 0;
for (const auto &candidate : c.candidatesToCursor()) {
if (candidate.toString() == "他爱") {
break;
}
i++;
}
FCITX_ASSERT(i < c.candidatesToCursor().size());
c.selectCandidatesToCursor(i);

FCITX_ASSERT(c.selected());
FCITX_ASSERT(c.selectedSentence() == "他爱");
c.learn();
c.clear();
// After learning, the bigram must be present in the history.
FCITX_ASSERT(ime.model()->history().containsBigram("他", "爱"));
}

return 0;
}
5 changes: 3 additions & 2 deletions test/testpinyinime_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <iterator>
#include <memory>
#include <fcitx-utils/log.h>
#include "libime/core/historybigram.h"
#include "libime/core/userlanguagemodel.h"
#include "libime/pinyin/pinyincontext.h"
#include "libime/pinyin/pinyincorrectionprofile.h"
Expand Down Expand Up @@ -55,8 +56,8 @@ int main() {
"ni'hao'zhong'guo", "你好中国"));
c.select(std::distance(c.candidates().begin(), iter));
c.learn();
FCITX_ASSERT(ime.dict()->lookupWord(PinyinDictionary::UserDict,
"ni'hao'zhong'guo", "你好中国"));
FCITX_ASSERT(ime.model()->history().containsBigram("你", "好"));
FCITX_ASSERT(ime.model()->history().containsBigram("好", "中国"));

c.setUseShuangpin(true);

Expand Down
Loading