summaryrefslogtreecommitdiff
path: root/libs/hunspell/src
diff options
context:
space:
mode:
authorGeorge Hazan <ghazan@miranda.im>2022-08-30 17:13:21 +0300
committerGeorge Hazan <ghazan@miranda.im>2022-08-30 17:13:21 +0300
commit3ad2f2b7c2bfb3166363239d67a6645692ffb2b6 (patch)
tree0201fd31d0c0e5c193752f7b80cdc69096b563cf /libs/hunspell/src
parentd82b809f6af58a1d10fa503138b912d336dca75e (diff)
fixes #3183 (Update hunspell to 1.7.1)
Diffstat (limited to 'libs/hunspell/src')
-rw-r--r--libs/hunspell/src/affentry.c++18
-rw-r--r--libs/hunspell/src/affentry.hxx6
-rw-r--r--libs/hunspell/src/affixmgr.c++331
-rw-r--r--libs/hunspell/src/affixmgr.hxx7
-rw-r--r--libs/hunspell/src/atypes.hxx12
-rw-r--r--libs/hunspell/src/baseaffix.hxx2
-rw-r--r--libs/hunspell/src/csutil.c++98
-rw-r--r--libs/hunspell/src/csutil.hxx29
-rw-r--r--libs/hunspell/src/filemgr.c++6
-rw-r--r--libs/hunspell/src/filemgr.hxx2
-rw-r--r--libs/hunspell/src/hashmgr.c++393
-rw-r--r--libs/hunspell/src/hashmgr.hxx17
-rw-r--r--libs/hunspell/src/htypes.hxx17
-rw-r--r--libs/hunspell/src/hunspell.c++1040
-rw-r--r--libs/hunspell/src/hunspell.hxx6
-rw-r--r--libs/hunspell/src/hunvisapi.h2
-rw-r--r--libs/hunspell/src/hunzip.c++4
-rw-r--r--libs/hunspell/src/hunzip.hxx2
-rw-r--r--libs/hunspell/src/langnum.hxx3
-rw-r--r--libs/hunspell/src/replist.c++2
-rw-r--r--libs/hunspell/src/replist.hxx2
-rw-r--r--libs/hunspell/src/suggestmgr.c++294
-rw-r--r--libs/hunspell/src/suggestmgr.hxx15
-rw-r--r--libs/hunspell/src/utf_info.hxx12
-rw-r--r--libs/hunspell/src/w_char.hxx2
25 files changed, 1438 insertions, 884 deletions
diff --git a/libs/hunspell/src/affentry.c++ b/libs/hunspell/src/affentry.c++
index 4ef0c00d9b..2cf4f4671f 100644
--- a/libs/hunspell/src/affentry.c++
+++ b/libs/hunspell/src/affentry.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -146,7 +146,7 @@ inline int PfxEntry::test_condition(const char* st) {
break;
}
case ']': {
- if ((neg && ingroup) || (!neg && !ingroup))
+ if (bool(neg) == bool(ingroup))
return 0;
pos = NULL;
p = nextchar(p);
@@ -224,7 +224,7 @@ struct hentry* PfxEntry::checkword(const char* word,
// back any characters that would have been stripped
std::string tmpword(strip);
- tmpword.append(word + appnd.size());
+ tmpword.append(word + appnd.size(), tmpl);
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -399,28 +399,28 @@ std::string PfxEntry::check_morph(const char* word,
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
(contclass && TESTAFF(contclass, needflag, contclasslen)))) {
if (morphcode) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(morphcode);
} else
result.append(getKey());
if (!HENTRY_FIND(he, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(he));
}
// store the pointer of the hash entry
if (HENTRY_DATA(he)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(he));
} else {
// return with debug information
char* flag = pmyMgr->encode_flag(getFlag());
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_FLAG);
result.append(flag);
free(flag);
}
- result.append("\n");
+ result.push_back(MSEP_REC);
}
he = he->next_homonym;
} while (he);
@@ -804,7 +804,7 @@ std::string SfxEntry::check_twosfx_morph(const char* word,
if (!st.empty()) {
if (ppfx->getMorph()) {
result.append(ppfx->getMorph());
- result.append(" ");
+ result.push_back(MSEP_FLD);
}
result.append(st);
mychomp(result);
diff --git a/libs/hunspell/src/affentry.hxx b/libs/hunspell/src/affentry.hxx
index 535a96bc42..b736bf0350 100644
--- a/libs/hunspell/src/affentry.hxx
+++ b/libs/hunspell/src/affentry.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -118,7 +118,7 @@ class PfxEntry : public AffEntry {
const char* getKey() { return appnd.c_str(); }
std::string add(const char* word, size_t len);
- inline short getKeyLen() { return (short)appnd.size(); }
+ inline short getKeyLen() { return appnd.size(); }
inline const char* getMorph() { return morphcode; }
@@ -199,7 +199,7 @@ class SfxEntry : public AffEntry {
inline short getContLen() { return contclasslen; }
inline const char* getAffix() { return appnd.c_str(); }
- inline short getKeyLen() { return (short)appnd.size(); }
+ inline short getKeyLen() { return appnd.size(); }
inline SfxEntry* getNext() { return next; }
inline SfxEntry* getNextNE() { return nextne; }
diff --git a/libs/hunspell/src/affixmgr.c++ b/libs/hunspell/src/affixmgr.c++
index 90c7eaff33..adb750dba1 100644
--- a/libs/hunspell/src/affixmgr.c++
+++ b/libs/hunspell/src/affixmgr.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -72,6 +72,7 @@
#include <string.h>
#include <stdio.h>
#include <ctype.h>
+#include <time.h>
#include <algorithm>
#include <limits>
@@ -96,7 +97,6 @@ AffixMgr::AffixMgr(const char* affpath,
complexprefixes = 0;
parsedmaptable = false;
parsedbreaktable = false;
- parsedrep = false;
iconvtable = NULL;
oconvtable = NULL;
// allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
@@ -113,7 +113,7 @@ AffixMgr::AffixMgr(const char* affpath,
compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
compoundmoresuffixes = 0; // allow more suffixes within compound words
checkcompounddup = 0; // forbid double words in compounds
- checkcompoundrep = 0; // forbid bad compounds (may be non compound word with
+ checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with
// a REP substitution)
checkcompoundcase =
0; // forbid upper and lowercase combinations at word bounds
@@ -439,7 +439,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
}
}
- /* parse in the flag used by forbidden words */
+ /* parse in the LEMMA_PRESENT flag (deprecated) */
if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
if (!parse_flag(line, &lemma_present, afflst)) {
finishFileMgr(afflst);
@@ -463,7 +463,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
}
}
- /* parse in the flag used by `needaffixs' */
+ /* parse in the `needaffix' flag (PSEUDOROOT keyword is deprecated) */
if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
if (!parse_flag(line, &needaffix, afflst)) {
finishFileMgr(afflst);
@@ -529,14 +529,6 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
}
}
- /* parse in the typical fault correcting table */
- if (line.compare(0, 3, "REP", 3) == 0) {
- if (!parse_reptable(line, afflst)) {
- finishFileMgr(afflst);
- return 1;
- }
- }
-
/* parse in the input conversion table */
if (line.compare(0, 5, "ICONV", 5) == 0) {
if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
@@ -545,7 +537,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
}
}
- /* parse in the input conversion table */
+ /* parse in the output conversion table */
if (line.compare(0, 5, "OCONV", 5) == 0) {
if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
finishFileMgr(afflst);
@@ -1023,7 +1015,7 @@ int AffixMgr::process_sfx_order() {
// add flags to the result for dictionary debugging
std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
char* st = encode_flag(flag);
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_FLAG);
if (st) {
result.append(st);
@@ -1060,7 +1052,7 @@ int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
} else if (cs[MAXCONDLEN]) {
//there is more conditions than fit in fixed space, so its
//a long condition
- entry.opts += aeLONGCOND;
+ entry.opts |= aeLONGCOND;
entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
if (!entry.c.l.conds2)
return 1;
@@ -1146,7 +1138,7 @@ struct hentry* AffixMgr::prefix_check(const char* word,
return NULL;
}
-// check word for prefixes
+// check word for prefixes and two-level suffixes
struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
int len,
char in_compound,
@@ -1187,7 +1179,7 @@ struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
return NULL;
}
-// check word for prefixes
+// check word for prefixes and morph
std::string AffixMgr::prefix_check_morph(const char* word,
int len,
char in_compound,
@@ -1234,7 +1226,7 @@ std::string AffixMgr::prefix_check_morph(const char* word,
return result;
}
-// check word for prefixes
+// check word for prefixes and morph and two-level suffixes
std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
int len,
char in_compound,
@@ -1275,25 +1267,44 @@ std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
return result;
}
-// Is word a non compound with a REP substitution (see checkcompoundrep)?
+// Is word a non-compound with a REP substitution (see checkcompoundrep)?
int AffixMgr::cpdrep_check(const char* word, int wl) {
- if ((wl < 2) || reptable.empty())
+ if ((wl < 2) || get_reptable().empty())
return 0;
- for (size_t i = 0; i < reptable.size(); ++i) {
- const char* r = word;
- const size_t lenp = reptable[i].pattern.size();
- // search every occurence of the pattern in the word
- while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
- std::string candidate(word);
- size_t type = r == word && langnum != LANG_hu ? 1 : 0;
- if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
- type += 2;
- candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
+ for (size_t i = 0; i < get_reptable().size(); ++i) {
+ // use only available mid patterns
+ if (!get_reptable()[i].outstrings[0].empty()) {
+ const char* r = word;
+ const size_t lenp = get_reptable()[i].pattern.size();
+ // search every occurrence of the pattern in the word
+ while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
+ std::string candidate(word);
+ candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
+ if (candidate_check(candidate.c_str(), candidate.size()))
+ return 1;
+ ++r; // search for the next letter
+ }
+ }
+ }
+
+ return 0;
+}
+
+// forbid compound words, if they are in the dictionary as a
+// word pair separated by space
+int AffixMgr::cpdwordpair_check(const char * word, int wl) {
+ if (wl > 2) {
+ std::string candidate(word);
+ for (size_t i = 1; i < candidate.size(); i++) {
+ // go to end of the UTF-8 character
+ if (utf8 && ((word[i] & 0xc0) == 0x80))
+ continue;
+ candidate.insert(i, 1, ' ');
if (candidate_check(candidate.c_str(), candidate.size()))
return 1;
- ++r; // search for the next letter
+ candidate.erase(i, 1);
}
}
@@ -1584,6 +1595,21 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
int checked_prefix;
+ // add a time limit to handle possible
+ // combinatorial explosion of the overlapping words
+
+ HUNSPELL_THREAD_LOCAL clock_t timelimit;
+
+ if (wordnum == 0) {
+ // get the start time, seeing as we're reusing this set to 0
+ // to flag timeout, use clock() + 1 to avoid start clock()
+ // of 0 as being a timeout
+ timelimit = clock() + 1;
+ }
+ else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
+ timelimit = 0;
+ }
+
setcminmax(&cmin, &cmax, word.c_str(), len);
st.assign(word);
@@ -1608,6 +1634,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
do { // simplified checkcompoundpattern loop
+ if (timelimit == 0)
+ return 0;
+
if (scpd > 0) {
for (; scpd <= checkcpdtable.size() &&
(checkcpdtable[scpd - 1].pattern3.empty() ||
@@ -1647,6 +1676,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
affixed = 1;
rv = lookup(st.c_str()); // perhaps without prefix
+ // forbid dictionary stems with COMPOUNDFORBIDFLAG in
+ // compound words, overriding the effect of COMPOUNDPERMITFLAG
+ if ((rv) && compoundforbidflag &&
+ TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
+ continue;
+
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
@@ -1854,7 +1889,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
}
// check FORCEUCASE
- if (rv && forceucase &&
+ if (rv && forceucase && (rv) &&
(TESTAFF(rv->astr, forceucase, rv->alen)) &&
!(info && *info & SPELL_ORIGCAP))
rv = NULL;
@@ -1909,9 +1944,10 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
&&
(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
- // forbid compound word, if it is a non compound word with typical
+ // forbid compound word, if it is a non-compound word with typical
// fault
- if (checkcompoundrep && cpdrep_check(word.c_str(), len))
+ if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
+ cpdwordpair_check(word.c_str(), len))
return NULL;
return rv_first;
}
@@ -1962,7 +1998,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
}
// check FORCEUCASE
- if (rv && forceucase &&
+ if (rv && forceucase && (rv) &&
(TESTAFF(rv->astr, forceucase, rv->alen)) &&
!(info && *info & SPELL_ORIGCAP))
rv = NULL;
@@ -1989,7 +2025,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
if (sfxappnd) {
std::string tmp(sfxappnd);
reverseword(tmp);
- numsyllable -= get_syllable(tmp) + sfxextra;
+ numsyllable -= short(get_syllable(tmp) + sfxextra);
+ } else {
+ numsyllable -= short(sfxextra);
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
@@ -2024,7 +2062,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
(TESTAFF(rv->astr, compoundroot, rv->alen))) {
wordnum++;
}
-
// second word is acceptable, as a word with prefix or/and suffix?
// hungarian conventions: compounding is acceptable,
// when compound forms consist 2 word, otherwise
@@ -2033,9 +2070,10 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
((!checkcompounddup || (rv != rv_first)))) {
- // forbid compound word, if it is a non compound word with typical
+ // forbid compound word, if it is a non-compound word with typical
// fault
- if (checkcompoundrep && cpdrep_check(word.c_str(), len))
+ if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
+ cpdwordpair_check(word.c_str(), len))
return NULL;
return rv_first;
}
@@ -2059,8 +2097,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
rv = NULL;
}
if (rv) {
- // forbid compound word, if it is a non compound word with typical
- // fault
+ // forbid compound word, if it is a non-compound word with typical
+ // fault, or a dictionary word pair
+
+ if (cpdwordpair_check(word.c_str(), len))
+ return NULL;
+
if (checkcompoundrep || forbiddenword) {
if (checkcompoundrep && cpdrep_check(word.c_str(), len))
@@ -2071,7 +2113,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
char r = st[i + rv->blen];
st[i + rv->blen] = '\0';
- if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
+ if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
+ cpdwordpair_check(st.c_str(), i + rv->blen)) {
st[ + i + rv->blen] = r;
continue;
}
@@ -2162,6 +2205,21 @@ int AffixMgr::compound_check_morph(const char* word,
char affixed = 0;
hentry** oldwords = words;
+ // add a time limit to handle possible
+ // combinatorial explosion of the overlapping words
+
+ HUNSPELL_THREAD_LOCAL clock_t timelimit;
+
+ if (wordnum == 0) {
+ // get the start time, seeing as we're reusing this set to 0
+ // to flag timeout, use clock() + 1 to avoid start clock()
+ // of 0 as being a timeout
+ timelimit = clock() + 1;
+ }
+ else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
+ timelimit = 0;
+ }
+
setcminmax(&cmin, &cmax, word, len);
st.assign(word);
@@ -2180,6 +2238,9 @@ int AffixMgr::compound_check_morph(const char* word,
do { // onlycpdrule loop
+ if (timelimit == 0)
+ return 0;
+
oldnumsyllable = numsyllable;
oldwordnum = wordnum;
checked_prefix = 0;
@@ -2198,6 +2259,12 @@ int AffixMgr::compound_check_morph(const char* word,
rv = lookup(st.c_str()); // perhaps without prefix
+ // forbid dictionary stems with COMPOUNDFORBIDFLAG in
+ // compound words, overriding the effect of COMPOUNDPERMITFLAG
+ if ((rv) && compoundforbidflag &&
+ TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
+ continue;
+
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
@@ -2215,6 +2282,9 @@ int AffixMgr::compound_check_morph(const char* word,
rv = rv->next_homonym;
}
+ if (timelimit == 0)
+ return 0;
+
if (rv)
affixed = 0;
@@ -2405,22 +2475,22 @@ int AffixMgr::compound_check_morph(const char* word,
if (rv && words && words[wnum + 1]) {
result.append(presult);
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
if (complexprefixes && HENTRY_DATA(rv))
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
// store the pointer of the hash entry
if (!complexprefixes && HENTRY_DATA(rv)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
- result.append("\n");
+ result.push_back(MSEP_REC);
return 0;
}
@@ -2462,7 +2532,7 @@ int AffixMgr::compound_check_morph(const char* word,
((!checkcompounddup || (rv != rv_first)))) {
// bad compound word
result.append(presult);
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
@@ -2470,17 +2540,17 @@ int AffixMgr::compound_check_morph(const char* word,
if (complexprefixes)
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
// store the pointer of the hash entry
if (!complexprefixes) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
}
- result.append("\n");
+ result.push_back(MSEP_REC);
ok = 1;
}
@@ -2519,7 +2589,7 @@ int AffixMgr::compound_check_morph(const char* word,
line_uniq_app(m, MSEP_REC);
result.append(m);
}
- result.append("\n");
+ result.push_back(MSEP_REC);
ok = 1;
}
}
@@ -2552,7 +2622,9 @@ int AffixMgr::compound_check_morph(const char* word,
if (sfxappnd) {
std::string tmp(sfxappnd);
reverseword(tmp);
- numsyllable -= get_syllable(tmp) + sfxextra;
+ numsyllable -= short(get_syllable(tmp) + sfxextra);
+ } else {
+ numsyllable -= short(sfxextra);
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
@@ -2605,8 +2677,9 @@ int AffixMgr::compound_check_morph(const char* word,
if (!m.empty()) {
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
- result.append(word + 1);
+ result.append(word + i);
line_uniq_app(m, MSEP_REC);
+ result.push_back(MSEP_FLD);
result.append(m);
}
result.push_back(MSEP_REC);
@@ -2769,7 +2842,6 @@ struct hentry* AffixMgr::suffix_check(const char* word,
}
// check word for two-level suffixes
-
struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
int len,
int sfxopts,
@@ -2814,6 +2886,7 @@ struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
return NULL;
}
+// check word for two-level suffixes and morph
std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
int len,
int sfxopts,
@@ -2832,17 +2905,17 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
if (ppfx) {
if (ppfx->getMorph()) {
result.append(ppfx->getMorph());
- result.append(" ");
+ result.push_back(MSEP_FLD);
} else
debugflag(result, ppfx->getFlag());
}
result.append(st);
if (se->getMorph()) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(se->getMorph());
} else
debugflag(result, se->getFlag());
- result.append("\n");
+ result.push_back(MSEP_REC);
}
}
se = se->getNext();
@@ -2867,12 +2940,12 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
result3.clear();
if (sptr->getMorph()) {
- result3.append(" ");
+ result3.push_back(MSEP_FLD);
result3.append(sptr->getMorph());
} else
debugflag(result3, sptr->getFlag());
strlinecat(result2, result3);
- result2.append("\n");
+ result2.push_back(MSEP_REC);
result.append(result2);
}
}
@@ -2935,28 +3008,28 @@ std::string AffixMgr::suffix_check_morph(const char* word,
if (ppfx) {
if (ppfx->getMorph()) {
result.append(ppfx->getMorph());
- result.append(" ");
+ result.push_back(MSEP_FLD);
} else
debugflag(result, ppfx->getFlag());
}
if (complexprefixes && HENTRY_DATA(rv))
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
if (!complexprefixes && HENTRY_DATA(rv)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
if (se->getMorph()) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(se->getMorph());
} else
debugflag(result, se->getFlag());
- result.append("\n");
+ result.push_back(MSEP_REC);
rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
}
}
@@ -3002,29 +3075,29 @@ std::string AffixMgr::suffix_check_morph(const char* word,
if (ppfx) {
if (ppfx->getMorph()) {
result.append(ppfx->getMorph());
- result.append(" ");
+ result.push_back(MSEP_FLD);
} else
debugflag(result, ppfx->getFlag());
}
if (complexprefixes && HENTRY_DATA(rv))
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
if (!complexprefixes && HENTRY_DATA(rv)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
if (sptr->getMorph()) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(sptr->getMorph());
} else
debugflag(result, sptr->getFlag());
- result.append("\n");
+ result.push_back(MSEP_REC);
rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
}
sptr = sptr->getNextEQ();
@@ -3213,7 +3286,7 @@ std::string AffixMgr::morphgen(const char* ts,
// use input suffix fields, if exist
if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
mymorph.assign(morph);
- mymorph.append(" ");
+ mymorph.push_back(MSEP_FLD);
stemmorphcatpos = mymorph.size();
} else {
stemmorphcatpos = std::string::npos;
@@ -3414,7 +3487,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst,
// return replacing table
const std::vector<replentry>& AffixMgr::get_reptable() const {
- return reptable;
+ return pHMgr->get_reptable();
}
// return iconv table
@@ -3554,6 +3627,11 @@ FLAG AffixMgr::get_nongramsuggest() const {
return nongramsuggest;
}
+// return the substandard root/affix control flag
+FLAG AffixMgr::get_substandard() const {
+ return substandard;
+}
+
// return the forbidden words flag modify flag
FLAG AffixMgr::get_needaffix() const {
return needaffix;
@@ -3692,103 +3770,6 @@ bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
return true;
}
-/* parse in the typical fault correcting table */
-bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) {
- if (parsedrep) {
- HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
- af->getlinenum());
- return false;
- }
- parsedrep = true;
- int numrep = -1;
- int i = 0;
- int np = 0;
- std::string::const_iterator iter = line.begin();
- std::string::const_iterator start_piece = mystrsep(line, iter);
- while (start_piece != line.end()) {
- switch (i) {
- case 0: {
- np++;
- break;
- }
- case 1: {
- numrep = atoi(std::string(start_piece, iter).c_str());
- if (numrep < 1) {
- HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
- af->getlinenum());
- return false;
- }
- reptable.reserve(numrep);
- np++;
- break;
- }
- default:
- break;
- }
- ++i;
- start_piece = mystrsep(line, iter);
- }
- if (np != 2) {
- HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
- af->getlinenum());
- return false;
- }
-
- /* now parse the numrep lines to read in the remainder of the table */
- for (int j = 0; j < numrep; ++j) {
- std::string nl;
- if (!af->getline(nl))
- return false;
- mychomp(nl);
- reptable.push_back(replentry());
- iter = nl.begin();
- i = 0;
- int type = 0;
- start_piece = mystrsep(nl, iter);
- while (start_piece != nl.end()) {
- switch (i) {
- case 0: {
- if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- reptable.clear();
- return false;
- }
- break;
- }
- case 1: {
- if (*start_piece == '^')
- type = 1;
- reptable.back().pattern.assign(start_piece + type, iter);
- mystrrep(reptable.back().pattern, "_", " ");
- if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
- type += 2;
- reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
- }
- break;
- }
- case 2: {
- reptable.back().outstrings[type].assign(start_piece, iter);
- mystrrep(reptable.back().outstrings[type], "_", " ");
- break;
- }
- default:
- break;
- }
- ++i;
- start_piece = mystrsep(nl, iter);
- }
- if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- reptable.clear();
- return false;
- }
- }
- return true;
-}
-
-/* parse in the typical fault correcting table */
bool AffixMgr::parse_convtable(const std::string& line,
FileMgr* af,
RepList** rl,
@@ -4386,7 +4367,7 @@ void AffixMgr::reverse_condition(std::string& piece) {
case '^': {
if (*(k - 1) == ']')
neg = 1;
- else
+ else if (neg)
*(k - 1) = *k;
break;
}
@@ -4519,11 +4500,11 @@ bool AffixMgr::parse_affix(const std::string& line,
char opts = ff;
if (utf8)
- opts += aeUTF8;
+ opts |= aeUTF8;
if (pHMgr->is_aliasf())
- opts += aeALIASF;
+ opts |= aeALIASF;
if (pHMgr->is_aliasm())
- opts += aeALIASM;
+ opts |= aeALIASM;
affentries.initialize(numents, opts, aflag);
}
@@ -4617,7 +4598,7 @@ bool AffixMgr::parse_affix(const std::string& line,
entry->appnd = std::string(start_piece, dash);
std::string dash_str(dash + 1, iter);
- if (!ignorechars.empty()) {
+ if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
if (utf8) {
remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
@@ -4653,7 +4634,7 @@ bool AffixMgr::parse_affix(const std::string& line,
} else {
entry->appnd = std::string(start_piece, iter);
- if (!ignorechars.empty()) {
+ if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
if (utf8) {
remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
diff --git a/libs/hunspell/src/affixmgr.hxx b/libs/hunspell/src/affixmgr.hxx
index d41e69cfd2..450f50a65c 100644
--- a/libs/hunspell/src/affixmgr.hxx
+++ b/libs/hunspell/src/affixmgr.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -120,8 +120,6 @@ class AffixMgr {
FLAG nongramsuggest;
FLAG needaffix;
int cpdmin;
- bool parsedrep;
- std::vector<replentry> reptable;
RepList* iconvtable;
RepList* oconvtable;
bool parsedmaptable;
@@ -251,6 +249,7 @@ class AffixMgr {
short get_syllable(const std::string& word);
int cpdrep_check(const char* word, int len);
+ int cpdwordpair_check(const char * word, int len);
int cpdpat_check(const char* word,
int len,
hentry* r1,
@@ -311,6 +310,7 @@ class AffixMgr {
FLAG get_forbiddenword() const;
FLAG get_nosuggest() const;
FLAG get_nongramsuggest() const;
+ FLAG get_substandard() const;
FLAG get_needaffix() const;
FLAG get_onlyincompound() const;
const char* get_derived() const;
@@ -338,7 +338,6 @@ class AffixMgr {
bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af);
bool parse_num(const std::string& line, int* out, FileMgr* af);
bool parse_cpdsyllable(const std::string& line, FileMgr* af);
- bool parse_reptable(const std::string& line, FileMgr* af);
bool parse_convtable(const std::string& line,
FileMgr* af,
RepList** rl,
diff --git a/libs/hunspell/src/atypes.hxx b/libs/hunspell/src/atypes.hxx
index f841523189..1b78d4724b 100644
--- a/libs/hunspell/src/atypes.hxx
+++ b/libs/hunspell/src/atypes.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -95,6 +95,16 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {}
#define TESTAFF(a, b, c) (std::binary_search(a, a + c, b))
+// timelimit: max. ~1/4 sec (process time on Linux)
+// for a suggestion, including max. ~1/10 sec for a case
+// sensitive plain or compound word suggestion, within
+// ~1/20 sec long time consuming suggestion functions
+#define TIMELIMIT_GLOBAL (CLOCKS_PER_SEC / 4)
+#define TIMELIMIT_SUGGESTION (CLOCKS_PER_SEC / 10)
+#define TIMELIMIT (CLOCKS_PER_SEC / 20)
+#define MINTIMER 100
+#define MAXPLUSTIMER 100
+
struct guessword {
char* word;
bool allow;
diff --git a/libs/hunspell/src/baseaffix.hxx b/libs/hunspell/src/baseaffix.hxx
index 9191dba475..52cd60e028 100644
--- a/libs/hunspell/src/baseaffix.hxx
+++ b/libs/hunspell/src/baseaffix.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.c++
index 59a9d28353..fbaa768b40 100644
--- a/libs/hunspell/src/csutil.c++
+++ b/libs/hunspell/src/csutil.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -69,6 +69,7 @@
*/
#include <algorithm>
+#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
@@ -79,13 +80,6 @@
#include "atypes.hxx"
#include "langnum.hxx"
-// Unicode character encoding information
-struct unicode_info {
- unsigned short c;
- unsigned short cupper;
- unsigned short clower;
-};
-
#ifdef _WIN32
#include <windows.h>
#include <wchar.h>
@@ -102,12 +96,10 @@ struct unicode_info {
#ifdef MOZILLA_CLIENT
#include "nsCOMPtr.h"
-#include "nsIUnicodeEncoder.h"
-#include "nsIUnicodeDecoder.h"
#include "nsUnicharUtils.h"
-#include "mozilla/dom/EncodingUtils.h"
+#include "mozilla/Encoding.h"
-using mozilla::dom::EncodingUtils;
+using namespace mozilla;
#endif
struct unicode_info2 {
@@ -495,20 +487,17 @@ void uniqlist(std::vector<std::string>& list) {
namespace {
unsigned char cupper(const struct cs_info* csconv, int nIndex) {
- if (nIndex < 0 || nIndex > 255)
- return nIndex;
+ assert(nIndex >= 0 && nIndex <= 255);
return csconv[nIndex].cupper;
}
unsigned char clower(const struct cs_info* csconv, int nIndex) {
- if (nIndex < 0 || nIndex > 255)
- return nIndex;
+ assert(nIndex >= 0 && nIndex <= 255);
return csconv[nIndex].clower;
}
unsigned char ccase(const struct cs_info* csconv, int nIndex) {
- if (nIndex < 0 || nIndex > 255)
- return nIndex;
+ assert(nIndex >= 0 && nIndex <= 255);
return csconv[nIndex].ccase;
}
}
@@ -2306,20 +2295,12 @@ struct cs_info* get_current_cs(const std::string& es) {
ccs[i].cupper = i;
}
- nsCOMPtr<nsIUnicodeEncoder> encoder;
- nsCOMPtr<nsIUnicodeDecoder> decoder;
-
- nsresult rv;
-
- nsAutoCString label(es.c_str());
- nsAutoCString encoding;
- if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) {
+ auto encoding = Encoding::ForLabelNoReplacement(es);
+ if (!encoding) {
return ccs;
}
- encoder = EncodingUtils::EncoderForEncoding(encoding);
- decoder = EncodingUtils::DecoderForEncoding(encoding);
- encoder->SetOutputErrorBehavior(encoder->kOnError_Signal, nullptr, '?');
- decoder->SetInputErrorBehavior(decoder->kOnError_Signal);
+ auto encoder = encoding->NewEncoder();
+ auto decoder = encoding->NewDecoderWithoutBOMHandling();
for (unsigned int i = 0; i <= 0xff; ++i) {
bool success = false;
@@ -2327,36 +2308,50 @@ struct cs_info* get_current_cs(const std::string& es) {
// in this 1-byte character encoding. Call our encoding/decoding
// APIs separately for each byte since they may reject some of the
// bytes, and we want to handle errors separately for each byte.
- char lower, upper;
+ uint8_t lower, upper;
do {
if (i == 0)
break;
- const char source = char(i);
- char16_t uni, uniCased;
- int32_t charLength = 1, uniLength = 1;
-
- rv = decoder->Convert(&source, &charLength, &uni, &uniLength);
- // Explicitly check NS_OK because we don't want to allow
- // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
- if (rv != NS_OK || charLength != 1 || uniLength != 1)
+ uint8_t source = uint8_t(i);
+ char16_t uni[2];
+ char16_t uniCased;
+ uint8_t destination[4];
+ auto src1 = MakeSpan(&source, 1);
+ auto dst1 = MakeSpan(uni);
+ auto src2 = MakeSpan(&uniCased, 1);
+ auto dst2 = MakeSpan(destination);
+
+ uint32_t result;
+ size_t read;
+ size_t written;
+ Tie(result, read, written) =
+ decoder->DecodeToUTF16WithoutReplacement(src1, dst1, true);
+ if (result != kInputEmpty || read != 1 || written != 1) {
break;
- uniCased = ToLowerCase(uni);
- rv = encoder->Convert(&uniCased, &uniLength, &lower, &charLength);
- // Explicitly check NS_OK because we don't want to allow
- // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
- if (rv != NS_OK || charLength != 1 || uniLength != 1)
+ }
+
+ uniCased = ToLowerCase(uni[0]);
+ Tie(result, read, written) =
+ encoder->EncodeFromUTF16WithoutReplacement(src2, dst2, true);
+ if (result != kInputEmpty || read != 1 || written != 1) {
break;
+ }
+ lower = destination[0];
- uniCased = ToUpperCase(uni);
- rv = encoder->Convert(&uniCased, &uniLength, &upper, &charLength);
- // Explicitly check NS_OK because we don't want to allow
- // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT.
- if (rv != NS_OK || charLength != 1 || uniLength != 1)
+ uniCased = ToUpperCase(uni[0]);
+ Tie(result, read, written) =
+ encoder->EncodeFromUTF16WithoutReplacement(src2, dst2, true);
+ if (result != kInputEmpty || read != 1 || written != 1) {
break;
+ }
+ upper = destination[0];
success = true;
} while (0);
+ encoding->NewEncoderInto(*encoder);
+ encoding->NewDecoderWithoutBOMHandlingInto(*decoder);
+
if (success) {
ccs[i].cupper = upper;
ccs[i].clower = lower;
@@ -2401,6 +2396,7 @@ static struct lang_map lang2enc[] =
{{"ar", LANG_ar}, {"az", LANG_az},
{"az_AZ", LANG_az}, // for back-compatibility
{"bg", LANG_bg}, {"ca", LANG_ca},
+ {"crh", LANG_crh},
{"cs", LANG_cs}, {"da", LANG_da},
{"de", LANG_de}, {"el", LANG_el},
{"en", LANG_en}, {"es", LANG_es},
@@ -2458,7 +2454,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) {
// In Azeri and Turkish, I and i dictinct letters:
// There are a dotless lower case i pair of upper `I',
// and an upper I with dot pair of lower `i'.
- if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+ if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
return 0x0130;
#ifdef OPENOFFICEORG
return static_cast<unsigned short>(u_toupper(c));
@@ -2475,7 +2471,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) {
// In Azeri and Turkish, I and i dictinct letters:
// There are a dotless lower case i pair of upper `I',
// and an upper I with dot pair of lower `i'.
- if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+ if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
return 0x0131;
#ifdef OPENOFFICEORG
return static_cast<unsigned short>(u_tolower(c));
diff --git a/libs/hunspell/src/csutil.hxx b/libs/hunspell/src/csutil.hxx
index 5d83f80970..c6f03d8f76 100644
--- a/libs/hunspell/src/csutil.hxx
+++ b/libs/hunspell/src/csutil.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -269,10 +269,23 @@ LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
// conversion function for protected memory
LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
+
+// to avoid unnecessary string copies and Unicode conversions
+// we simply check the ignored_chars characters in the word
+// (in the case of UTF-8 encoded strings, "false" means
+// "likely false", if ignored_chars characters are not ASCII)
+inline bool has_no_ignored_chars(const std::string& word,
+ const std::string& ignored_chars) {
+ for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it)
+ if (word.find(*it) != std::string::npos)
+ return false;
+ return true;
+}
+
// hash entry macros
-LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
+inline char* HENTRY_DATA(struct hentry* h) {
char* ret;
- if (!h->var)
+ if (!(h->var & H_OPT))
ret = NULL;
else if (h->var & H_OPT_ALIASM)
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
@@ -281,10 +294,10 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
return ret;
}
-LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
+inline const char* HENTRY_DATA(
const struct hentry* h) {
const char* ret;
- if (!h->var)
+ if (!(h->var & H_OPT))
ret = NULL;
else if (h->var & H_OPT_ALIASM)
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
@@ -294,10 +307,10 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
}
// NULL-free version for warning-free OOo build
-LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
+inline const char* HENTRY_DATA2(
const struct hentry* h) {
const char* ret;
- if (!h->var)
+ if (!(h->var & H_OPT))
ret = "";
else if (h->var & H_OPT_ALIASM)
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
@@ -306,7 +319,7 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
return ret;
}
-LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
+inline char* HENTRY_FIND(struct hentry* h,
const char* p) {
return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
}
diff --git a/libs/hunspell/src/filemgr.c++ b/libs/hunspell/src/filemgr.c++
index 4a14de8762..4a754e52a8 100644
--- a/libs/hunspell/src/filemgr.c++
+++ b/libs/hunspell/src/filemgr.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -83,6 +83,8 @@ int FileMgr::fail(const char* err, const char* par) {
FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) {
in[0] = '\0';
+ if (!file || !strlen(file))
+ return;
myopen(fin, file, std::ios_base::in);
if (!fin.is_open()) {
// check hzipped file
@@ -103,7 +105,7 @@ bool FileMgr::getline(std::string& dest) {
++linenum;
if (fin.is_open()) {
ret = static_cast<bool>(std::getline(fin, dest));
- } else if (hin->is_open()) {
+ } else if (hin && hin->is_open()) {
ret = hin->getline(dest);
}
if (!ret) {
diff --git a/libs/hunspell/src/filemgr.hxx b/libs/hunspell/src/filemgr.hxx
index 62433aeefe..88fe88388a 100644
--- a/libs/hunspell/src/filemgr.hxx
+++ b/libs/hunspell/src/filemgr.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.c++
index 23421b567a..3ec263de1d 100644
--- a/libs/hunspell/src/hashmgr.c++
+++ b/libs/hunspell/src/hashmgr.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -78,6 +78,7 @@
#include "hashmgr.hxx"
#include "csutil.hxx"
#include "atypes.hxx"
+#include "langnum.hxx"
// build a hash table from a munched word list
@@ -182,13 +183,14 @@ int HashMgr::add_word(const std::string& in_word,
unsigned short* aff,
int al,
const std::string* in_desc,
- bool onlyupcase) {
+ bool onlyupcase,
+ int captype) {
const std::string* word = &in_word;
const std::string* desc = in_desc;
std::string *word_copy = NULL;
std::string *desc_copy = NULL;
- if (!ignorechars.empty() || complexprefixes) {
+ if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) {
word_copy = new std::string(in_word);
if (!ignorechars.empty()) {
@@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word,
hp->astr = aff;
hp->next = NULL;
hp->next_homonym = NULL;
+ hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;
// store the description string or its pointer
if (desc) {
- hp->var = H_OPT;
+ hp->var |= H_OPT;
if (aliasm) {
- hp->var += H_OPT_ALIASM;
+ hp->var |= H_OPT_ALIASM;
store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
} else {
strcpy(hpw + word->size() + 1, desc->c_str());
}
- if (strstr(HENTRY_DATA(hp), MORPH_PHON))
- hp->var += H_OPT_PHON;
- } else
- hp->var = 0;
+ if (strstr(HENTRY_DATA(hp), MORPH_PHON)) {
+ hp->var |= H_OPT_PHON;
+ // store ph: fields (pronounciation, misspellings, old orthography etc.)
+ // of a morphological description in reptable to use in REP replacements.
+ if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO))
+ reptable.reserve(tablesize/MORPH_PHON_RATIO);
+ std::string fields = HENTRY_DATA(hp);
+ std::string::const_iterator iter = fields.begin();
+ std::string::const_iterator start_piece = mystrsep(fields, iter);
+ while (start_piece != fields.end()) {
+ if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
+ std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
+ if (ph.size() > 0) {
+ std::vector<w_char> w;
+ size_t strippatt;
+ std::string wordpart;
+ // dictionary based REP replacement, separated by "->"
+ // for example "pretty ph:prity ph:priti->pretti" to handle
+ // both prity -> pretty and pritier -> prettiest suggestions.
+ if (((strippatt = ph.find("->")) != std::string::npos) &&
+ (strippatt > 0) && (strippatt < ph.size() - 2)) {
+ wordpart = ph.substr(strippatt + 2);
+ ph.erase(ph.begin() + strippatt, ph.end());
+ } else
+ wordpart = in_word;
+ // when the ph: field ends with the character *,
+ // strip last character of the pattern and the replacement
+ // to match in REP suggestions also at character changes,
+ // for example, "pretty ph:prity*" results "prit->prett"
+ // REP replacement instead of "prity->pretty", to get
+ // prity->pretty and pritiest->prettiest suggestions.
+ if (ph.at(ph.size()-1) == '*') {
+ strippatt = 1;
+ size_t stripword = 0;
+ if (utf8) {
+ while ((strippatt < ph.size()) &&
+ ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
+ ++strippatt;
+ while ((stripword < wordpart.size()) &&
+ ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
+ ++stripword;
+ }
+ ++strippatt;
+ ++stripword;
+ if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
+ ph.erase(ph.size()-strippatt, strippatt);
+ wordpart.erase(in_word.size()-stripword, stripword);
+ }
+ }
+ // capitalize lowercase pattern for capitalized words to support
+ // good suggestions also for capitalized misspellings, eg.
+ // Wednesday ph:wendsay
+ // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
+ if (captype==INITCAP) {
+ std::string ph_capitalized;
+ if (utf8) {
+ u8_u16(w, ph);
+ if (get_captype_utf8(w, langnum) == NOCAP) {
+ mkinitcap_utf(w, langnum);
+ u16_u8(ph_capitalized, w);
+ }
+ } else if (get_captype(ph, csconv) == NOCAP)
+ mkinitcap(ph_capitalized, csconv);
+
+ if (ph_capitalized.size() > 0) {
+ // add also lowercase word in the case of German or
+ // Hungarian to support lowercase suggestions lowercased by
+ // compound word generation or derivational suffixes
+ // (for example by adjectival suffix "-i" of geographical
+ // names in Hungarian:
+ // Massachusetts ph:messzecsuzec
+ // messzecsuzeci -> massachusettsi (adjective)
+ // For lowercasing by conditional PFX rules, see
+ // tests/germancompounding test example or the
+ // Hungarian dictionary.)
+ if (langnum == LANG_de || langnum == LANG_hu) {
+ std::string wordpart_lower(wordpart);
+ if (utf8) {
+ u8_u16(w, wordpart_lower);
+ mkallsmall_utf(w, langnum);
+ u16_u8(wordpart_lower, w);
+ } else {
+ mkallsmall(wordpart_lower, csconv);
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph);
+ reptable.back().outstrings[0].assign(wordpart_lower);
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph_capitalized);
+ reptable.back().outstrings[0].assign(wordpart);
+ }
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph);
+ reptable.back().outstrings[0].assign(wordpart);
+ }
+ }
+ start_piece = mystrsep(fields, iter);
+ }
+ }
+ }
struct hentry* dp = tableptr[i];
if (!dp) {
@@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word,
mkallsmall_utf(w, langnum);
mkinitcap_utf(w, langnum);
u16_u8(st, w);
- return add_word(st, wcl, flags2, flagslen + 1, dp, true);
+ return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
} else {
std::string new_word(word);
mkallsmall(new_word, csconv);
mkinitcap(new_word, csconv);
- int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
+ int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
return ret;
}
}
@@ -405,24 +506,8 @@ int HashMgr::remove_forbidden_flag(const std::string& word) {
if (!dp)
return 1;
while (dp) {
- if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
- if (dp->alen == 1)
- dp->alen = 0; // XXX forbidden words of personal dic.
- else {
- unsigned short* flags2 =
- (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
- if (!flags2)
- return 1;
- int i, j = 0;
- for (i = 0; i < dp->alen; i++) {
- if (dp->astr[i] != forbiddenword)
- flags2[j++] = dp->astr[i];
- }
- dp->alen--;
- free(dp->astr);
- dp->astr = flags2; // XXX allowed forbidden words
- }
- }
+ if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen))
+ dp->alen = 0; // XXX forbidden words of personal dic.
dp = dp->next_homonym;
}
return 0;
@@ -435,7 +520,7 @@ int HashMgr::add(const std::string& word) {
int al = 0;
unsigned short* flags = NULL;
int wcl = get_clen_and_captype(word, &captype);
- add_word(word, wcl, flags, al, NULL, false);
+ add_word(word, wcl, flags, al, NULL, false, captype);
return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
captype);
}
@@ -450,14 +535,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example)
int captype;
int wcl = get_clen_and_captype(word, &captype);
if (aliasf) {
- add_word(word, wcl, dp->astr, dp->alen, NULL, false);
+ add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
} else {
unsigned short* flags =
(unsigned short*)malloc(dp->alen * sizeof(unsigned short));
if (flags) {
memcpy((void*)flags, (void*)dp->astr,
dp->alen * sizeof(unsigned short));
- add_word(word, wcl, flags, dp->alen, NULL, false);
+ add_word(word, wcl, flags, dp->alen, NULL, false, captype);
} else
return 1;
}
@@ -605,7 +690,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
int wcl = get_clen_and_captype(ts, &captype, workbuf);
const std::string *dp_str = dp.empty() ? NULL : &dp;
// add the word and its index plus its capitalized form optionally
- if (add_word(ts, wcl, flags, al, dp_str, false) ||
+ if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
delete dict;
return 5;
@@ -697,7 +782,7 @@ int HashMgr::decode_flags(unsigned short** result, const std::string& flags, Fil
*result = (unsigned short*)malloc(len * sizeof(unsigned short));
if (!*result)
return -1;
- memcpy(*result, &w[0], len * sizeof(short));
+ memcpy(*result, w.data(), len * sizeof(short));
break;
}
default: { // Ispell's one-character flags (erfg -> e r f g)
@@ -768,7 +853,7 @@ bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::strin
size_t len = w.size();
size_t origsize = result.size();
result.resize(origsize + len);
- memcpy(&result[origsize], &w[0], len * sizeof(short));
+ memcpy(result.data() + origsize, w.data(), len * sizeof(short));
break;
}
default: { // Ispell's one-character flags (erfg -> e r f g)
@@ -799,7 +884,7 @@ unsigned short HashMgr::decode_flag(const char* f) const {
std::vector<w_char> w;
u8_u16(w, f);
if (!w.empty())
- memcpy(&s, &w[0], 1 * sizeof(short));
+ memcpy(&s, w.data(), 1 * sizeof(short));
break;
}
default:
@@ -940,8 +1025,19 @@ int HashMgr::load_config(const char* affpath, const char* key) {
if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
complexprefixes = 1;
+ /* parse in the typical fault correcting table */
+ if (line.compare(0, 3, "REP", 3) == 0) {
+ if (!parse_reptable(line, afflst)) {
+ delete afflst;
+ return 1;
+ }
+ }
+
+ // don't check the full affix file, yet
if (((line.compare(0, 3, "SFX", 3) == 0) ||
- (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
+ (line.compare(0, 3, "PFX", 3) == 0)) &&
+ line.size() > 3 && isspace(line[3]) &&
+ !reptable.empty()) // (REP table is in the end of Afrikaans aff file)
break;
}
@@ -1015,43 +1111,41 @@ bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
/* now parse the numaliasf lines to read in the remainder of the table */
for (int j = 0; j < numaliasf; j++) {
std::string nl;
- if (!af->getline(nl))
- return false;
- mychomp(nl);
- i = 0;
aliasf[j] = NULL;
aliasflen[j] = 0;
- iter = nl.begin();
- start_piece = mystrsep(nl, iter);
- while (start_piece != nl.end()) {
- switch (i) {
- case 0: {
- if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
- numaliasf = 0;
- free(aliasf);
- free(aliasflen);
- aliasf = NULL;
- aliasflen = NULL;
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- return false;
+ i = 0;
+ if (af->getline(nl)) {
+ mychomp(nl);
+ iter = nl.begin();
+ start_piece = mystrsep(nl, iter);
+ bool errored = false;
+ while (!errored && start_piece != nl.end()) {
+ switch (i) {
+ case 0: {
+ if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
+ errored = true;
+ break;
+ }
+ break;
}
- break;
- }
- case 1: {
- std::string piece(start_piece, iter);
- aliasflen[j] =
- (unsigned short)decode_flags(&(aliasf[j]), piece, af);
- std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
- break;
+ case 1: {
+ std::string piece(start_piece, iter);
+ aliasflen[j] =
+ (unsigned short)decode_flags(&(aliasf[j]), piece, af);
+ std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
+ break;
+ }
+ default:
+ break;
}
- default:
- break;
+ ++i;
+ start_piece = mystrsep(nl, iter);
}
- ++i;
- start_piece = mystrsep(nl, iter);
}
if (!aliasf[j]) {
+ for (int k = 0; k < j; ++k) {
+ free(aliasf[k]);
+ }
free(aliasf);
free(aliasflen);
aliasf = NULL;
@@ -1130,47 +1224,47 @@ bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
/* now parse the numaliasm lines to read in the remainder of the table */
for (int j = 0; j < numaliasm; j++) {
std::string nl;
- if (!af->getline(nl))
- return false;
- mychomp(nl);
aliasm[j] = NULL;
- iter = nl.begin();
- i = 0;
- start_piece = mystrsep(nl, iter);
- while (start_piece != nl.end()) {
- switch (i) {
- case 0: {
- if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- numaliasm = 0;
- free(aliasm);
- aliasm = NULL;
- return false;
+ if (af->getline(nl)) {
+ mychomp(nl);
+ iter = nl.begin();
+ i = 0;
+ start_piece = mystrsep(nl, iter);
+ bool errored = false;
+ while (!errored && start_piece != nl.end()) {
+ switch (i) {
+ case 0: {
+ if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
+ errored = true;
+ break;
+ }
+ break;
}
- break;
- }
- case 1: {
- // add the remaining of the line
- std::string::const_iterator end = nl.end();
- std::string chunk(start_piece, end);
- if (complexprefixes) {
- if (utf8)
- reverseword_utf(chunk);
- else
- reverseword(chunk);
+ case 1: {
+ // add the remaining of the line
+ std::string::const_iterator end = nl.end();
+ std::string chunk(start_piece, end);
+ if (complexprefixes) {
+ if (utf8)
+ reverseword_utf(chunk);
+ else
+ reverseword(chunk);
+ }
+ aliasm[j] = mystrdup(chunk.c_str());
+ break;
}
- aliasm[j] = mystrdup(chunk.c_str());
- break;
+ default:
+ break;
}
- default:
- break;
+ ++i;
+ start_piece = mystrsep(nl, iter);
}
- ++i;
- start_piece = mystrsep(nl, iter);
}
if (!aliasm[j]) {
numaliasm = 0;
+ for (int k = 0; k < j; ++k) {
+ free(aliasm[k]);
+ }
free(aliasm);
aliasm = NULL;
HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
@@ -1191,3 +1285,102 @@ char* HashMgr::get_aliasm(int index) const {
HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
return NULL;
}
+
+/* parse in the typical fault correcting table */
+bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
+ if (!reptable.empty()) {
+ HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
+ af->getlinenum());
+ return false;
+ }
+ int numrep = -1;
+ int i = 0;
+ int np = 0;
+ std::string::const_iterator iter = line.begin();
+ std::string::const_iterator start_piece = mystrsep(line, iter);
+ while (start_piece != line.end()) {
+ switch (i) {
+ case 0: {
+ np++;
+ break;
+ }
+ case 1: {
+ numrep = atoi(std::string(start_piece, iter).c_str());
+ if (numrep < 1) {
+ HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
+ af->getlinenum());
+ return false;
+ }
+ reptable.reserve(numrep);
+ np++;
+ break;
+ }
+ default:
+ break;
+ }
+ ++i;
+ start_piece = mystrsep(line, iter);
+ }
+ if (np != 2) {
+ HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
+ af->getlinenum());
+ return false;
+ }
+
+ /* now parse the numrep lines to read in the remainder of the table */
+ for (int j = 0; j < numrep; ++j) {
+ std::string nl;
+ reptable.push_back(replentry());
+ int type = 0;
+ if (af->getline(nl)) {
+ mychomp(nl);
+ iter = nl.begin();
+ i = 0;
+ start_piece = mystrsep(nl, iter);
+ bool errored = false;
+ while (!errored && start_piece != nl.end()) {
+ switch (i) {
+ case 0: {
+ if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
+ errored = true;
+ break;
+ }
+ break;
+ }
+ case 1: {
+ if (*start_piece == '^')
+ type = 1;
+ reptable.back().pattern.assign(start_piece + type, iter);
+ mystrrep(reptable.back().pattern, "_", " ");
+ if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
+ type += 2;
+ reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
+ }
+ break;
+ }
+ case 2: {
+ reptable.back().outstrings[type].assign(start_piece, iter);
+ mystrrep(reptable.back().outstrings[type], "_", " ");
+ break;
+ }
+ default:
+ break;
+ }
+ ++i;
+ start_piece = mystrsep(nl, iter);
+ }
+ }
+ if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
+ HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
+ af->getlinenum());
+ reptable.clear();
+ return false;
+ }
+ }
+ return true;
+}
+
+// return replacing table
+const std::vector<replentry>& HashMgr::get_reptable() const {
+ return reptable;
+}
diff --git a/libs/hunspell/src/hashmgr.hxx b/libs/hunspell/src/hashmgr.hxx
index da485d7afa..98b09e2569 100644
--- a/libs/hunspell/src/hashmgr.hxx
+++ b/libs/hunspell/src/hashmgr.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -81,6 +81,12 @@
enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
+// morphological description of a dictionary item can contain
+// arbitrary number "ph:" (MORPH_PHON) fields to store typical
+// phonetic or other misspellings of that word.
+// ratio of lines/lines with "ph:" in the dic file: 1/MORPH_PHON_RATIO
+#define MORPH_PHON_RATIO 500
+
class HashMgr {
int tablesize;
struct hentry** tableptr;
@@ -99,6 +105,10 @@ class HashMgr {
unsigned short* aliasflen;
int numaliasm; // morphological desciption `compression' with aliases
char** aliasm;
+ // reptable created from REP table of aff file and from "ph:" fields
+ // of the dic file. It contains phonetic and other common misspellings
+ // (letters, letter groups and words) for better suggestions
+ std::vector<replentry> reptable;
public:
HashMgr(const char* tpath, const char* apath, const char* key = NULL);
@@ -119,6 +129,7 @@ class HashMgr {
int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const;
int is_aliasm() const;
char* get_aliasm(int index) const;
+ const std::vector<replentry>& get_reptable() const;
private:
int get_clen_and_captype(const std::string& word, int* captype);
@@ -129,7 +140,8 @@ class HashMgr {
unsigned short* ap,
int al,
const std::string* desc,
- bool onlyupcase);
+ bool onlyupcase,
+ int captype);
int load_config(const char* affpath, const char* key);
bool parse_aliasf(const std::string& line, FileMgr* af);
int add_hidden_capitalized_word(const std::string& word,
@@ -139,6 +151,7 @@ class HashMgr {
const std::string* dp,
int captype);
bool parse_aliasm(const std::string& line, FileMgr* af);
+ bool parse_reptable(const std::string& line, FileMgr* af);
int remove_forbidden_flag(const std::string& word);
};
diff --git a/libs/hunspell/src/htypes.hxx b/libs/hunspell/src/htypes.hxx
index 8f66a0080e..44366b1d68 100644
--- a/libs/hunspell/src/htypes.hxx
+++ b/libs/hunspell/src/htypes.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -44,9 +44,10 @@
(v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q)) - 1));
// hentry options
-#define H_OPT (1 << 0)
-#define H_OPT_ALIASM (1 << 1)
-#define H_OPT_PHON (1 << 2)
+#define H_OPT (1 << 0) // is there optional morphological data?
+#define H_OPT_ALIASM (1 << 1) // using alias compression?
+#define H_OPT_PHON (1 << 2) // is there ph: field in the morphological data?
+#define H_OPT_INITCAP (1 << 3) // is dictionary word capitalized?
// see also csutil.hxx
#define HENTRY_WORD(h) &(h->word[0])
@@ -54,6 +55,12 @@
// approx. number of user defined words
#define USERWORD 1000
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
+# define HUNSPELL_THREAD_LOCAL thread_local
+#else
+# define HUNSPELL_THREAD_LOCAL static
+#endif
+
struct hentry {
unsigned char blen; // word length in bytes
unsigned char clen; // word length in characters (different for UTF-8 enc.)
@@ -61,7 +68,7 @@ struct hentry {
unsigned short* astr; // affix flag vector
struct hentry* next; // next word with same hash code
struct hentry* next_homonym; // next homonym word (with same hash code)
- char var; // variable fields (only for special pronounciation yet)
+ char var; // bit vector of H_OPT hentry options
char word[1]; // variable-length word (8-bit or UTF-8 encoding)
};
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++
index b1535013fe..4afafdadc1 100644
--- a/libs/hunspell/src/hunspell.c++
+++ b/libs/hunspell/src/hunspell.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -71,6 +71,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <time.h>
#include "affixmgr.hxx"
#include "hunspell.hxx"
@@ -86,30 +87,41 @@
class HunspellImpl
{
public:
- HunspellImpl(const char* affpath, const char* dpath, const char* key);
+ HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL);
~HunspellImpl();
- int add_dic(const char* dpath, const char* key);
+ int add_dic(const char* dpath, const char* key = NULL);
std::vector<std::string> suffix_suggest(const std::string& root_word);
std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl);
std::vector<std::string> generate(const std::string& word, const std::string& pattern);
std::vector<std::string> stem(const std::string& word);
std::vector<std::string> stem(const std::vector<std::string>& morph);
std::vector<std::string> analyze(const std::string& word);
+ int get_langnum() const;
bool input_conv(const std::string& word, std::string& dest);
bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
std::vector<std::string> suggest(const std::string& word);
- const std::string& get_wordchars() const;
+ const std::string& get_wordchars_cpp() const;
const std::vector<w_char>& get_wordchars_utf16() const;
const std::string& get_dict_encoding() const;
int add(const std::string& word);
int add_with_affix(const std::string& word, const std::string& example);
int remove(const std::string& word);
+ const std::string& get_version_cpp() const;
struct cs_info* get_csconv();
- std::vector<char> dic_encoding_vec;
- int get_langnum() const { return langnum; }
- const char* get_try_string() const { return pAMgr->get_try_string(); }
- const std::string& get_version() const { return pAMgr->get_version(); }
+ int spell(const char* word, int* info = NULL, char** root = NULL);
+ int suggest(char*** slst, const char* word);
+ int suffix_suggest(char*** slst, const char* root_word);
+ void free_list(char*** slst, int n);
+ char* get_dic_encoding();
+ int analyze(char*** slst, const char* word);
+ int stem(char*** slst, const char* word);
+ int stem(char*** slst, char** morph, int n);
+ int generate(char*** slst, const char* word, const char* word2);
+ int generate(char*** slst, const char* word, char** desc, int n);
+ const char* get_wordchars() const;
+ const char* get_version() const;
+ int input_conv(const char* word, char* dest, size_t destsize);
private:
AffixMgr* pAMgr;
@@ -124,12 +136,17 @@ private:
std::vector<std::string> wordbreak;
private:
+ std::vector<std::string> analyze_internal(const std::string& word);
+ bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL);
+ std::vector<std::string> suggest_internal(const std::string& word,
+ bool& capitalized, size_t& abbreviated, int& captype);
void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev);
size_t cleanword2(std::string& dest,
std::vector<w_char>& dest_u,
const std::string& src,
int* pcaptype,
size_t* pabbrev);
+ void clean_ignore(std::string& dest, const std::string& src);
void mkinitcap(std::string& u8);
int mkinitcap2(std::string& u8, std::vector<w_char>& u16);
int mkinitsmall2(std::string& u8, std::vector<w_char>& u16);
@@ -143,19 +160,15 @@ private:
void insert_sug(std::vector<std::string>& slst, const std::string& word);
void cat_result(std::string& result, const std::string& st);
std::vector<std::string> spellml(const std::string& word);
- std::string get_xml_par(const char* par);
- const char* get_xml_pos(const char* s, const char* attr);
- std::vector<std::string> get_xml_list(const char* list, const char* tag);
- int check_xml_par(const char* q, const char* attr, const char* value);
+ std::string get_xml_par(const std::string& par, std::string::size_type pos);
+ std::string::size_type get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr);
+ std::vector<std::string> get_xml_list(const std::string& list, std::string::size_type pos, const char* tag);
+ int check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value);
private:
HunspellImpl(const HunspellImpl&);
HunspellImpl& operator=(const HunspellImpl&);
};
-Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key)
- : m_Impl(new HunspellImpl(affpath, dpath, key)) {
-}
-
HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) {
csconv = NULL;
utf8 = 0;
@@ -180,19 +193,12 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k
complexprefixes = pAMgr->get_complexprefixes();
wordbreak = pAMgr->get_breaktable();
- dic_encoding_vec.resize(encoding.size()+1);
- strcpy(&dic_encoding_vec[0], encoding.c_str());
-
/* and finally set up the suggestion manager */
pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
if (try_string)
free(try_string);
}
-Hunspell::~Hunspell() {
- delete m_Impl;
-}
-
HunspellImpl::~HunspellImpl() {
delete pSMgr;
delete pAMgr;
@@ -210,11 +216,6 @@ HunspellImpl::~HunspellImpl() {
}
// load extra dictionaries
-int Hunspell::add_dic(const char* dpath, const char* key) {
- return m_Impl->add_dic(dpath, key);
-}
-
-// load extra dictionaries
int HunspellImpl::add_dic(const char* dpath, const char* key) {
if (!affixpath)
return 1;
@@ -222,6 +223,26 @@ int HunspellImpl::add_dic(const char* dpath, const char* key) {
return 0;
}
+
+// Copy src into dest, then remove from dest every character listed
+// in the affix manager's IGNORE rule. dest is left as a plain copy of
+// src when there is no affix manager or no IGNORE rule.
+void HunspellImpl::clean_ignore(std::string& dest,
+                                const std::string& src) {
+  // assign() alone replaces the old contents; a preceding clear() would
+  // also empty src whenever a caller passes the same string for both
+  dest.assign(src);
+  const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
+  if (ignoredchars != NULL) {
+    if (utf8) {
+      // UTF-8 mode uses the UTF-16 form of the ignore list
+      const std::vector<w_char>& ignoredchars_utf16 =
+          pAMgr->get_ignore_utf16();
+      remove_ignored_chars_utf(dest, ignoredchars_utf16);
+    } else {
+      remove_ignored_chars(dest, ignoredchars);
+    }
+  }
+}
+
+
// make a copy of src at destination while removing all leading
// blanks and removing any trailing periods after recording
// their presence with the abbreviation flag
@@ -237,7 +258,11 @@ size_t HunspellImpl::cleanword2(std::string& dest,
dest.clear();
dest_utf.clear();
- const char* q = src.c_str();
+ // remove IGNORE characters from the string
+ std::string w2;
+ clean_ignore(w2, src);
+
+ const char* q = w2.c_str();
// first skip over any leading blanks
while (*q == ' ')
@@ -409,11 +434,22 @@ void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string&
slst.insert(slst.begin(), word);
}
-bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
- return m_Impl->spell(word, info, root);
+bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
+  // Delegate the check to spell_internal; the reported root is only
+  // post-processed when the word was accepted and a root was requested.
+  const bool accepted = spell_internal(word, info, root);
+  if (!accepted || !root)
+    return accepted;
+
+  // output conversion
+  RepList* oconv = pAMgr ? pAMgr->get_oconvtable() : NULL;
+  if (oconv) {
+    std::string converted;
+    if (oconv->conv(*root, converted))
+      *root = converted;
+  }
+  return accepted;
}
-bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
+bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) {
struct hentry* rv = NULL;
int info2 = 0;
@@ -485,7 +521,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
case HUHCAP:
/* FALLTHROUGH */
case HUHINITCAP:
- *info += SPELL_ORIGCAP;
+ *info |= SPELL_ORIGCAP;
/* FALLTHROUGH */
case NOCAP:
rv = checkword(scw, info, root);
@@ -496,7 +532,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
break;
case ALLCAP: {
- *info += SPELL_ORIGCAP;
+ *info |= SPELL_ORIGCAP;
rv = checkword(scw, info, root);
if (rv)
break;
@@ -563,17 +599,22 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
break;
}
}
+ /* FALLTHROUGH */
case INITCAP: {
-
- *info += SPELL_ORIGCAP;
- mkallsmall2(scw, sunicw);
- std::string u8buffer(scw);
- mkinitcap2(scw, sunicw);
+ // handle special capitalization of dotted I
+ bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
+ *info |= SPELL_ORIGCAP;
+ if (captype == ALLCAP) {
+ mkallsmall2(scw, sunicw);
+ mkinitcap2(scw, sunicw);
+ if (Idot)
+ scw.replace(0, 1, "\xc4\xb0");
+ }
if (captype == INITCAP)
- *info += SPELL_INITCAP;
+ *info |= SPELL_INITCAP;
rv = checkword(scw, info, root);
if (captype == INITCAP)
- *info -= SPELL_INITCAP;
+ *info &= ~SPELL_INITCAP;
// forbid bad capitalization
// (for example, ijs -> Ijs instead of IJs in Dutch)
// use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
@@ -583,9 +624,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
if (rv && is_keepcase(rv) && (captype == ALLCAP))
rv = NULL;
- if (rv)
+ if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
break;
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
+
rv = checkword(u8buffer, info, root);
if (abbv && !rv) {
u8buffer.push_back('.');
@@ -594,10 +639,10 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
u8buffer = scw;
u8buffer.push_back('.');
if (captype == INITCAP)
- *info += SPELL_INITCAP;
+ *info |= SPELL_INITCAP;
rv = checkword(u8buffer, info, root);
if (captype == INITCAP)
- *info -= SPELL_INITCAP;
+ *info &= ~SPELL_INITCAP;
if (rv && is_keepcase(rv) && (captype == ALLCAP))
rv = NULL;
break;
@@ -618,7 +663,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
if (rv) {
if (pAMgr && pAMgr->get_warn() && rv->astr &&
TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
- *info += SPELL_WARN;
+ *info |= SPELL_WARN;
if (pAMgr->get_forbidwarn())
return false;
return true;
@@ -627,7 +672,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
// recursive breaking at break points
- if (!wordbreak.empty()) {
+ if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {
int nbr = 0;
wl = scw.size();
@@ -668,6 +713,37 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
size_t plen = wordbreak[j].size();
size_t found = scw.find(wordbreak[j]);
if ((found > 0) && (found < wl - plen)) {
+ size_t found2 = scw.find(wordbreak[j], found + 1);
+ // try to break at the second occurrence
+ // to recognize dictionary words with wordbreak
+ if (found2 > 0 && (found2 < wl - plen))
+ found = found2;
+ if (!spell(scw.substr(found + plen)))
+ continue;
+ std::string suffix(scw.substr(found));
+ scw.resize(found);
+ // examine 2 sides of the break point
+ if (spell(scw))
+ return true;
+ scw.append(suffix);
+
+ // LANG_hu: spec. dash rule
+ if (langnum == LANG_hu && wordbreak[j] == "-") {
+ suffix = scw.substr(found + 1);
+ scw.resize(found + 1);
+ if (spell(scw))
+ return true; // check the first part with dash
+ scw.append(suffix);
+ }
+ // end of LANG specific region
+ }
+ }
+
+ // other patterns (break at first break point)
+ for (size_t j = 0; j < wordbreak.size(); ++j) {
+ size_t plen = wordbreak[j].size();
+ size_t found = scw.find(wordbreak[j]);
+ if ((found > 0) && (found < wl - plen)) {
if (!spell(scw.substr(found + plen)))
continue;
std::string suffix(scw.substr(found));
@@ -694,47 +770,28 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
}
struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) {
- bool usebuffer = false;
std::string w2;
const char* word;
int len;
- const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
- if (ignoredchars != NULL) {
- w2.assign(w);
- if (utf8) {
- const std::vector<w_char>& ignoredchars_utf16 =
- pAMgr->get_ignore_utf16();
- remove_ignored_chars_utf(w2, ignoredchars_utf16);
- } else {
- remove_ignored_chars(w2, ignoredchars);
- }
- word = w2.c_str();
- len = w2.size();
- usebuffer = true;
- } else {
- word = w.c_str();
- len = w.size();
- }
+ // remove IGNORE characters from the string
+ clean_ignore(w2, w);
+
+ word = w2.c_str();
+ len = w2.size();
if (!len)
return NULL;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
- if (!usebuffer) {
- w2.assign(word);
- usebuffer = true;
- }
if (utf8)
reverseword_utf(w2);
else
reverseword(w2);
}
- if (usebuffer) {
- word = w2.c_str();
- }
+ word = w2.c_str();
// look word in hash table
struct hentry* he = NULL;
@@ -745,13 +802,13 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
if ((he) && (he->astr) && (pAMgr) &&
TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info)
- *info += SPELL_FORBIDDEN;
+ *info |= SPELL_FORBIDDEN;
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
if (pAMgr->get_compoundflag() &&
TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
if (info)
- *info += SPELL_COMPOUND;
+ *info |= SPELL_COMPOUND;
}
}
return NULL;
@@ -786,7 +843,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
if ((he->astr) && (pAMgr) &&
TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info)
- *info += SPELL_FORBIDDEN;
+ *info |= SPELL_FORBIDDEN;
return NULL;
}
if (root) {
@@ -819,7 +876,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
}
}
if (info)
- *info += SPELL_COMPOUND;
+ *info |= SPELL_COMPOUND;
}
}
}
@@ -827,11 +884,103 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
return he;
}
-std::vector<std::string> Hunspell::suggest(const std::string& word) {
- return m_Impl->suggest(word);
+std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+ bool capwords; // set by suggest_internal: capitalize every suggestion
+ size_t abbv; // set by suggest_internal: number of trailing characters of word to re-append
+ int captype; // set by suggest_internal: capitalization type detected for word
+ std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype);
+ // word reversing wrapper for complex prefixes: reverse each suggestion
+ // (reverseword_utf when utf8 is set, reverseword otherwise)
+ if (complexprefixes) {
+ for (size_t j = 0; j < slst.size(); ++j) {
+ if (utf8)
+ reverseword_utf(slst[j]);
+ else
+ reverseword(slst[j]);
+ }
+ }
+
+ // capitalize every suggestion when suggest_internal requested it
+ if (capwords)
+ for (size_t j = 0; j < slst.size(); ++j) {
+ mkinitcap(slst[j]);
+ }
+
+ // expand suggestions with dot(s): re-append the last abbv characters of
+ // word (presumably its trailing dots) when SUGSWITHDOTS is enabled
+ if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
+ for (size_t j = 0; j < slst.size(); ++j) {
+ slst[j].append(word.substr(word.size() - abbv));
+ }
+ }
+
+ // remove bad capitalized and forbidden forms: for INITCAP/ALLCAP input,
+ // a space-free suggestion that fails spell() is retried after mkallsmall2
+ // and then after mkinitcap2; it is dropped if both retries fail
+ if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
+ switch (captype) {
+ case INITCAP:
+ case ALLCAP: {
+ size_t l = 0; // write index: slst[0..l) holds the suggestions kept so far
+ for (size_t j = 0; j < slst.size(); ++j) {
+ if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
+ std::string s;
+ std::vector<w_char> w;
+ if (utf8) {
+ u8_u16(w, slst[j]);
+ } else {
+ s = slst[j];
+ }
+ mkallsmall2(s, w);
+ if (spell(s)) {
+ slst[l] = s;
+ ++l;
+ } else {
+ mkinitcap2(s, w);
+ if (spell(s)) {
+ slst[l] = s;
+ ++l;
+ }
+ }
+ } else {
+ slst[l] = slst[j];
+ ++l;
+ }
+ }
+ slst.resize(l);
+ }
+ }
+ }
+
+ // remove duplications in place, keeping the first occurrence of each
+ // suggestion and the original order
+ size_t l = 0;
+ for (size_t j = 0; j < slst.size(); ++j) {
+ slst[l] = slst[j];
+ for (size_t k = 0; k < l; ++k) {
+ if (slst[k] == slst[j]) {
+ --l; // duplicate: undo the slot claimed above
+ break;
+ }
+ }
+ ++l;
+ }
+ slst.resize(l);
+
+ // output conversion: run each suggestion through pAMgr's oconv table, if any
+ RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
+ if (rl) {
+ for (size_t i = 0; rl && i < slst.size(); ++i) {
+ std::string wspace;
+ if (rl->conv(slst[i], wspace)) {
+ slst[i] = wspace;
+ }
+ }
+ }
+ return slst;
}
-std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
+ bool& capwords, size_t& abbv, int& captype) {
+ captype = NOCAP;
+ abbv = 0;
+ capwords = false;
+
std::vector<std::string> slst;
int onlycmpdsug = 0;
@@ -849,8 +998,6 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
if (word.size() >= MAXWORDLEN)
return slst;
}
- int captype = NOCAP;
- size_t abbv = 0;
size_t wl = 0;
std::string scw;
@@ -871,7 +1018,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
return slst;
}
- int capwords = 0;
+ bool good = false;
+
+ clock_t timelimit;
+ // initialize in every suggestion call
+ timelimit = clock();
// check capitalized form for FORCEUCASE
if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
@@ -886,22 +1037,38 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
switch (captype) {
case NOCAP: {
- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
+ if (abbv) {
+ std::string wspace(scw);
+ wspace.push_back('.');
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
+ }
break;
}
case INITCAP: {
- capwords = 1;
- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ capwords = true;
+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
break;
}
case HUHINITCAP:
- capwords = 1;
+ capwords = true;
+ /* FALLTHROUGH */
case HUHCAP: {
- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
// something.The -> something. The
size_t dot_pos = scw.find('.');
if (dot_pos != std::string::npos) {
@@ -927,19 +1094,25 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
// TheOpenOffice.org -> The OpenOffice.org
wspace = scw;
mkinitsmall2(wspace, sunicw);
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
}
wspace = scw;
mkallsmall2(wspace, sunicw);
if (spell(wspace.c_str()))
insert_sug(slst, wspace);
size_t prevns = slst.size();
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
if (captype == HUHINITCAP) {
mkinitcap2(wspace, sunicw);
if (spell(wspace.c_str()))
insert_sug(slst, wspace);
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
}
// aNew -> "a New" (instead of "a new")
for (size_t j = prevns; j < slst.size(); ++j) {
@@ -966,11 +1139,15 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
insert_sug(slst, wspace);
mkinitcap2(wspace, sunicw);
- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
for (size_t j = 0; j < slst.size(); ++j) {
mkallcap(slst[j]);
if (pAMgr && pAMgr->get_checksharps()) {
@@ -1002,34 +1179,43 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
}
}
// END OF LANG_hu section
-
- // try ngram approach since found nothing or only compound words
- if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
+ // try ngram approach since no good suggestion was found
+ if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
switch (captype) {
case NOCAP: {
- pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs);
+ pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
break;
}
+ /* FALLTHROUGH */
case HUHINITCAP:
- capwords = 1;
+ capwords = true;
+ /* FALLTHROUGH */
case HUHCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
break;
}
case INITCAP: {
- capwords = 1;
+ capwords = true;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
break;
}
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
size_t oldns = slst.size();
- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP);
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
for (size_t j = oldns; j < slst.size(); ++j) {
mkallcap(slst[j]);
}
@@ -1039,6 +1225,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
}
// try dash suggestion (Afo-American -> Afro-American)
+ // Note: LibreOffice was modified to treat dashes as word
+ // characters to check "scot-free" etc. word forms, but
+ // we need to handle suggestions for "Afo-American", etc.,
+ // while "Afro-American" is missing from the dictionary.
+ // TODO avoid possible overgeneration
size_t dash_pos = scw.find('-');
if (dash_pos != std::string::npos) {
int nodashsug = 1;
@@ -1050,12 +1241,14 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
size_t prev_pos = 0;
bool last = false;
- while (nodashsug && !last) {
+ while (!good && nodashsug && !last) {
if (dash_pos == scw.size())
last = 1;
std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
if (!spell(chunk.c_str())) {
std::vector<std::string> nlst = suggest(chunk.c_str());
+ if (clock() > timelimit + TIMELIMIT_GLOBAL)
+ return slst;
for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
std::string wspace = scw.substr(0, prev_pos);
wspace.append(*j);
@@ -1063,7 +1256,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
wspace.append("-");
wspace.append(scw.substr(dash_pos + 1));
}
- insert_sug(slst, wspace);
+ int info = 0;
+ if (pAMgr && pAMgr->get_forbiddenword())
+ checkword(wspace, &info, NULL);
+ if (!(info & SPELL_FORBIDDEN))
+ insert_sug(slst, wspace);
}
nodashsug = 0;
}
@@ -1075,104 +1272,13 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
dash_pos = scw.size();
}
}
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- for (size_t j = 0; j < slst.size(); ++j) {
- if (utf8)
- reverseword_utf(slst[j]);
- else
- reverseword(slst[j]);
- }
- }
-
- // capitalize
- if (capwords)
- for (size_t j = 0; j < slst.size(); ++j) {
- mkinitcap(slst[j]);
- }
-
- // expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (size_t j = 0; j < slst.size(); ++j) {
- slst[j].append(word.substr(word.size() - abbv));
- }
- }
-
- // remove bad capitalized and forbidden forms
- if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
- switch (captype) {
- case INITCAP:
- case ALLCAP: {
- size_t l = 0;
- for (size_t j = 0; j < slst.size(); ++j) {
- if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
- std::string s;
- std::vector<w_char> w;
- if (utf8) {
- u8_u16(w, slst[j]);
- } else {
- s = slst[j];
- }
- mkallsmall2(s, w);
- if (spell(s)) {
- slst[l] = s;
- ++l;
- } else {
- mkinitcap2(s, w);
- if (spell(s)) {
- slst[l] = s;
- ++l;
- }
- }
- } else {
- slst[l] = slst[j];
- ++l;
- }
- }
- slst.resize(l);
- }
- }
- }
-
- // remove duplications
- size_t l = 0;
- for (size_t j = 0; j < slst.size(); ++j) {
- slst[l] = slst[j];
- for (size_t k = 0; k < l; ++k) {
- if (slst[k] == slst[j]) {
- --l;
- break;
- }
- }
- ++l;
- }
- slst.resize(l);
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (size_t j = 0; rl && j < slst.size(); ++j) {
- std::string wspace;
- if (rl->conv(slst[j], wspace)) {
- slst[j] = wspace;
- }
- }
-
return slst;
}
-const std::string& Hunspell::get_dict_encoding() const {
- return m_Impl->get_dict_encoding();
-}
-
const std::string& HunspellImpl::get_dict_encoding() const {
return encoding;
}
-std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) {
- return m_Impl->stem(desc);
-}
-
std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) {
std::vector<std::string> slst;
@@ -1241,30 +1347,14 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc
return slst;
}
-std::vector<std::string> Hunspell::stem(const std::string& word) {
- return m_Impl->stem(word);
-}
-
std::vector<std::string> HunspellImpl::stem(const std::string& word) {
return stem(analyze(word));
}
-const char* Hunspell::get_wordchars() const {
- return m_Impl->get_wordchars().c_str();
-}
-
-const std::string& Hunspell::get_wordchars_cpp() const {
- return m_Impl->get_wordchars();
-}
-
-const std::string& HunspellImpl::get_wordchars() const {
+const std::string& HunspellImpl::get_wordchars_cpp() const {
return pAMgr->get_wordchars();
}
-const std::vector<w_char>& Hunspell::get_wordchars_utf16() const {
- return m_Impl->get_wordchars_utf16();
-}
-
const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const {
return pAMgr->get_wordchars_utf16();
}
@@ -1300,56 +1390,32 @@ int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) {
return u8.size();
}
-int Hunspell::add(const std::string& word) {
- return m_Impl->add(word);
-}
-
int HunspellImpl::add(const std::string& word) {
if (!m_HMgrs.empty())
return m_HMgrs[0]->add(word);
return 0;
}
-int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
- return m_Impl->add_with_affix(word, example);
-}
-
int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) {
if (!m_HMgrs.empty())
return m_HMgrs[0]->add_with_affix(word, example);
return 0;
}
-int Hunspell::remove(const std::string& word) {
- return m_Impl->remove(word);
-}
-
int HunspellImpl::remove(const std::string& word) {
if (!m_HMgrs.empty())
return m_HMgrs[0]->remove(word);
return 0;
}
-const char* Hunspell::get_version() const {
- return m_Impl->get_version().c_str();
-}
-
-const std::string& Hunspell::get_version_cpp() const {
- return m_Impl->get_version();
-}
-
-const char* Hunspell::get_try_string() const {
- return m_Impl->get_try_string();
+const std::string& HunspellImpl::get_version_cpp() const {
+ return pAMgr->get_version();
}
struct cs_info* HunspellImpl::get_csconv() {
return csconv;
}
-struct cs_info* Hunspell::get_csconv() {
- return m_Impl->get_csconv();
-}
-
void HunspellImpl::cat_result(std::string& result, const std::string& st) {
if (!st.empty()) {
if (!result.empty())
@@ -1358,11 +1424,22 @@ void HunspellImpl::cat_result(std::string& result, const std::string& st) {
}
}
-std::vector<std::string> Hunspell::analyze(const std::string& word) {
- return m_Impl->analyze(word);
+std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
+  // Run analyze_internal on word, then pass every result through the
+  // output-conversion table when the affix manager provides one.
+  std::vector<std::string> results = analyze_internal(word);
+
+  // output conversion
+  RepList* oconv = pAMgr ? pAMgr->get_oconvtable() : NULL;
+  if (!oconv)
+    return results;
+  for (std::vector<std::string>::iterator it = results.begin(); it != results.end(); ++it) {
+    std::string converted;
+    if (oconv->conv(*it, converted))
+      *it = converted;
+  }
+  return results;
}
-std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
+std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) {
std::vector<std::string> slst;
if (!pSMgr || m_HMgrs.empty())
return slst;
@@ -1595,10 +1672,6 @@ std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
return slst;
}
-std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) {
- return m_Impl->generate(word, pl);
-}
-
std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) {
std::vector<std::string> slst;
if (!pSMgr || pl.empty())
@@ -1643,10 +1716,6 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
return slst;
}
-std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) {
- return m_Impl->generate(word, pattern);
-}
-
std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) {
std::vector<std::string> pl = analyze(pattern);
std::vector<std::string> slst = generate(word, pl);
@@ -1655,10 +1724,11 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
}
// minimal XML parser functions
-std::string HunspellImpl::get_xml_par(const char* par) {
+std::string HunspellImpl::get_xml_par(const std::string& in_par, std::string::size_type pos) {
std::string dest;
- if (!par)
+ if (pos == std::string::npos)
return dest;
+ const char* par = in_par.c_str() + pos;
char end = *par;
if (end == '>')
end = '<';
@@ -1672,22 +1742,8 @@ std::string HunspellImpl::get_xml_par(const char* par) {
return dest;
}
-int Hunspell::get_langnum() const {
- return m_Impl->get_langnum();
-}
-
-bool Hunspell::input_conv(const std::string& word, std::string& dest) {
- return m_Impl->input_conv(word, dest);
-}
-
-int Hunspell::input_conv(const char* word, char* dest, size_t destsize) {
- std::string d;
- bool ret = input_conv(word, d);
- if (ret && d.size() < destsize) {
- strncpy(dest, d.c_str(), destsize);
- return 1;
- }
- return 0;
+int HunspellImpl::get_langnum() const {
+ return langnum;
}
bool HunspellImpl::input_conv(const std::string& word, std::string& dest) {
@@ -1700,42 +1756,47 @@ bool HunspellImpl::input_conv(const std::string& word, std::string& dest) {
}
// return the beginning of the element (attr == NULL) or the attribute
-const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) {
- const char* end = strchr(s, '>');
+std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr) {
+ // Scans s starting at pos. With attr == NULL the index of the next '>'
+ // is returned (npos if there is none). Otherwise the result is the index
+ // just past the first occurrence of attr that starts before that '>'
+ // (if any) and is preceded by a space or a newline; npos when no such
+ // occurrence exists or when pos is already npos.
+ if (pos == std::string::npos)
+ return std::string::npos;
+
+ std::string::size_type endpos = s.find('>', pos);
if (attr == NULL)
- return end;
- const char* p = s;
- while (1) {
- p = strstr(p, attr);
- if (!p || p >= end)
- return 0;
- if (*(p - 1) == ' ' || *(p - 1) == '\n')
+ return endpos;
+ while (true) {
+ pos = s.find(attr, pos);
+ if (pos == std::string::npos || pos >= endpos)
+ return std::string::npos;
+ // a match at index 0 has no preceding character: do not read s[pos - 1]
+ if (pos > 0 && (s[pos - 1] == ' ' || s[pos - 1] == '\n'))
break;
- p += strlen(attr);
+ pos += strlen(attr);
}
- return p + strlen(attr);
+ return pos + strlen(attr);
}
-int HunspellImpl::check_xml_par(const char* q,
- const char* attr,
- const char* value) {
- std::string cw = get_xml_par(get_xml_pos(q, attr));
+int HunspellImpl::check_xml_par(const std::string& q, std::string::size_type pos,
+ const char* attr,
+ const char* value) {
+ std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr));
if (cw == value)
return 1;
return 0;
}
-std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) {
+std::vector<std::string> HunspellImpl::get_xml_list(const std::string& list, std::string::size_type pos, const char* tag) {
std::vector<std::string> slst;
- if (!list)
+ if (pos == std::string::npos)
return slst;
- const char* p = list;
- for (size_t n = 0; ((p = strstr(p, tag)) != NULL); ++p, ++n) {
- std::string cw = get_xml_par(p + strlen(tag) - 1);
+ while (true) {
+ pos = list.find(tag, pos);
+ if (pos == std::string::npos)
+ break;
+ std::string cw = get_xml_par(list, pos + strlen(tag) - 1);
if (cw.empty()) {
break;
}
slst.push_back(cw);
+ ++pos;
}
return slst;
}
@@ -1743,19 +1804,20 @@ std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char
std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
std::vector<std::string> slst;
- const char* word = in_word.c_str();
-
- const char* q = strstr(word, "<query");
- if (!q)
+ std::string::size_type qpos = in_word.find("<query");
+ if (qpos == std::string::npos)
return slst; // bad XML input
- const char* q2 = strchr(q, '>');
- if (!q2)
+
+ std::string::size_type q2pos = in_word.find('>', qpos);
+ if (q2pos == std::string::npos)
return slst; // bad XML input
- q2 = strstr(q2, "<word");
- if (!q2)
+
+ q2pos = in_word.find("<word", q2pos);
+ if (q2pos == std::string::npos)
return slst; // bad XML input
- if (check_xml_par(q, "type=", "analyze")) {
- std::string cw = get_xml_par(strchr(q2, '>'));
+
+ if (check_xml_par(in_word, qpos, "type=", "analyze")) {
+ std::string cw = get_xml_par(in_word, in_word.find('>', q2pos));
if (!cw.empty())
slst = analyze(cw);
if (slst.empty())
@@ -1778,23 +1840,24 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
slst.clear();
slst.push_back(r);
return slst;
- } else if (check_xml_par(q, "type=", "stem")) {
- std::string cw = get_xml_par(strchr(q2, '>'));
+ } else if (check_xml_par(in_word, qpos, "type=", "stem")) {
+ std::string cw = get_xml_par(in_word, in_word.find('>', q2pos));
if (!cw.empty())
return stem(cw);
- } else if (check_xml_par(q, "type=", "generate")) {
- std::string cw = get_xml_par(strchr(q2, '>'));
+ } else if (check_xml_par(in_word, qpos, "type=", "generate")) {
+ std::string cw = get_xml_par(in_word, in_word.find('>', q2pos));
if (cw.empty())
return slst;
- const char* q3 = strstr(q2 + 1, "<word");
- if (q3) {
- std::string cw2 = get_xml_par(strchr(q3, '>'));
+ std::string::size_type q3pos = in_word.find("<word", q2pos + 1);
+ if (q3pos != std::string::npos) {
+ std::string cw2 = get_xml_par(in_word, in_word.find('>', q3pos));
if (!cw2.empty()) {
return generate(cw, cw2);
}
} else {
- if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
- std::vector<std::string> slst2 = get_xml_list(strchr(q2, '>'), "<a>");
+ q2pos = in_word.find("<code", q2pos + 1);
+ if (q2pos != std::string::npos) {
+ std::vector<std::string> slst2 = get_xml_list(in_word, in_word.find('>', q2pos), "<a>");
if (!slst2.empty()) {
slst = generate(cw, slst2);
uniqlist(slst);
@@ -1802,21 +1865,57 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
}
}
}
+ } else if (check_xml_par(in_word, qpos, "type=", "add")) {
+ std::string cw = get_xml_par(in_word, in_word.find('>', q2pos));
+ if (cw.empty())
+ return slst;
+ std::string::size_type q3pos = in_word.find("<word", q2pos + 1);
+ if (q3pos != std::string::npos) {
+ std::string cw2 = get_xml_par(in_word, in_word.find('>', q3pos));
+ if (!cw2.empty()) {
+ add_with_affix(cw, cw2);
+ } else {
+ add(cw);
+ }
+ } else {
+ add(cw);
+ }
}
return slst;
}
-int Hunspell::spell(const char* word, int* info, char** root) {
- std::string sroot;
- bool ret = m_Impl->spell(word, info, root ? &sroot : NULL);
- if (root) {
- if (sroot.empty()) {
- *root = NULL;
+std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) {
+ std::vector<std::string> slst;
+ struct hentry* he = NULL;
+ int len;
+ std::string w2;
+ const char* word;
+ const char* ignoredchars = pAMgr->get_ignore();
+ if (ignoredchars != NULL) {
+ w2.assign(root_word);
+ if (utf8) {
+ const std::vector<w_char>& ignoredchars_utf16 =
+ pAMgr->get_ignore_utf16();
+ remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
- *root = mystrdup(sroot.c_str());
+ remove_ignored_chars(w2, ignoredchars);
}
+ word = w2.c_str();
+ } else
+ word = root_word.c_str();
+
+ len = strlen(word);
+
+ if (!len)
+ return slst;
+
+ for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
+ he = m_HMgrs[i]->lookup(word);
}
- return ret;
+ if (he) {
+ slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
+ }
+ return slst;
}
namespace {
@@ -1835,113 +1934,289 @@ namespace {
}
}
-void Hunspell::free_list(char*** slst, int n) {
- Hunspell_free_list((Hunhandle*)(this), slst, n);
+int HunspellImpl::spell(const char* word, int* info, char** root) {
+ std::string sroot;
+ bool ret = spell(word, info, root ? &sroot : NULL);
+ if (root) {
+ if (sroot.empty()) {
+ *root = NULL;
+ } else {
+ *root = mystrdup(sroot.c_str());
+ }
+ }
+ return ret;
+}
+
+int HunspellImpl::suggest(char*** slst, const char* word) {
+ std::vector<std::string> suggests = suggest(word);
+ return munge_vector(slst, suggests);
+}
+
+int HunspellImpl::suffix_suggest(char*** slst, const char* root_word) {
+ std::vector<std::string> stems = suffix_suggest(root_word);
+ return munge_vector(slst, stems);
+}
+
+void HunspellImpl::free_list(char*** slst, int n) {
+ if (slst && *slst) {
+ for (int i = 0; i < n; i++)
+ free((*slst)[i]);
+ free(*slst);
+ *slst = NULL;
+ }
+}
+
+char* HunspellImpl::get_dic_encoding() {
+ return &encoding[0];
+}
+
+int HunspellImpl::analyze(char*** slst, const char* word) {
+ std::vector<std::string> stems = analyze(word);
+ return munge_vector(slst, stems);
+}
+
+int HunspellImpl::stem(char*** slst, const char* word) {
+ std::vector<std::string> stems = stem(word);
+ return munge_vector(slst, stems);
+}
+
+int HunspellImpl::stem(char*** slst, char** desc, int n) {
+ std::vector<std::string> morph;
+ morph.reserve(n);
+ for (int i = 0; i < n; ++i)
+ morph.push_back(desc[i]);
+
+ std::vector<std::string> stems = stem(morph);
+ return munge_vector(slst, stems);
+}
+
+int HunspellImpl::generate(char*** slst, const char* word, const char* pattern) {
+ std::vector<std::string> stems = generate(word, pattern);
+ return munge_vector(slst, stems);
+}
+
+int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) {
+ std::vector<std::string> morph;
+ morph.reserve(pln);
+ for (int i = 0; i < pln; ++i)
+ morph.push_back(pl[i]);
+
+ std::vector<std::string> stems = generate(word, morph);
+ return munge_vector(slst, stems);
+}
+
+const char* HunspellImpl::get_wordchars() const {
+ return get_wordchars_cpp().c_str();
+}
+
+const char* HunspellImpl::get_version() const {
+ return get_version_cpp().c_str();
+}
+
+int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) {
+ std::string d;
+ bool ret = input_conv(word, d);
+ if (ret && d.size() < destsize) {
+ strncpy(dest, d.c_str(), destsize);
+ return 1;
+ }
+ return 0;
+}
+
+Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key)
+ : m_Impl(new HunspellImpl(affpath, dpath, key)) {
+}
+
+Hunspell::~Hunspell() {
+ delete m_Impl;
+}
+
+// load extra dictionaries
+int Hunspell::add_dic(const char* dpath, const char* key) {
+ return m_Impl->add_dic(dpath, key);
+}
+
+bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
+ return m_Impl->spell(word, info, root);
+}
+
+std::vector<std::string> Hunspell::suggest(const std::string& word) {
+ return m_Impl->suggest(word);
+}
+
+std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) {
+ return m_Impl->suffix_suggest(root_word);
+}
+
+const std::string& Hunspell::get_dict_encoding() const {
+ return m_Impl->get_dict_encoding();
+}
+
+std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) {
+ return m_Impl->stem(desc);
+}
+
+std::vector<std::string> Hunspell::stem(const std::string& word) {
+ return m_Impl->stem(word);
+}
+
+const std::string& Hunspell::get_wordchars_cpp() const {
+ return m_Impl->get_wordchars_cpp();
+}
+
+const std::vector<w_char>& Hunspell::get_wordchars_utf16() const {
+ return m_Impl->get_wordchars_utf16();
+}
+
+int Hunspell::add(const std::string& word) {
+ return m_Impl->add(word);
+}
+
+int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
+ return m_Impl->add_with_affix(word, example);
+}
+
+int Hunspell::remove(const std::string& word) {
+ return m_Impl->remove(word);
+}
+
+const std::string& Hunspell::get_version_cpp() const {
+ return m_Impl->get_version_cpp();
+}
+
+struct cs_info* Hunspell::get_csconv() {
+ return m_Impl->get_csconv();
+}
+
+std::vector<std::string> Hunspell::analyze(const std::string& word) {
+ return m_Impl->analyze(word);
+}
+
+std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) {
+ return m_Impl->generate(word, pl);
+}
+
+std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) {
+ return m_Impl->generate(word, pattern);
+}
+
+int Hunspell::get_langnum() const {
+ return m_Impl->get_langnum();
+}
+
+bool Hunspell::input_conv(const std::string& word, std::string& dest) {
+ return m_Impl->input_conv(word, dest);
+}
+
+int Hunspell::spell(const char* word, int* info, char** root) {
+ return m_Impl->spell(word, info, root);
}
int Hunspell::suggest(char*** slst, const char* word) {
- return Hunspell_suggest((Hunhandle*)(this), slst, word);
+ return m_Impl->suggest(slst, word);
}
int Hunspell::suffix_suggest(char*** slst, const char* root_word) {
- std::vector<std::string> stems = m_Impl->suffix_suggest(root_word);
- return munge_vector(slst, stems);
+ return m_Impl->suffix_suggest(slst, root_word);
+}
+
+void Hunspell::free_list(char*** slst, int n) {
+ m_Impl->free_list(slst, n);
}
char* Hunspell::get_dic_encoding() {
- return &(m_Impl->dic_encoding_vec[0]);
+ return m_Impl->get_dic_encoding();
}
-int Hunspell::stem(char*** slst, char** desc, int n) {
- return Hunspell_stem2((Hunhandle*)(this), slst, desc, n);
+int Hunspell::analyze(char*** slst, const char* word) {
+ return m_Impl->analyze(slst, word);
}
int Hunspell::stem(char*** slst, const char* word) {
- return Hunspell_stem((Hunhandle*)(this), slst, word);
+ return m_Impl->stem(slst, word);
}
-int Hunspell::analyze(char*** slst, const char* word) {
- return Hunspell_analyze((Hunhandle*)(this), slst, word);
+int Hunspell::stem(char*** slst, char** desc, int n) {
+ return m_Impl->stem(slst, desc, n);
+}
+
+int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
+ return m_Impl->generate(slst, word, pattern);
}
int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) {
- return Hunspell_generate2((Hunhandle*)(this), slst, word, pl, pln);
+ return m_Impl->generate(slst, word, pl, pln);
}
-int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
- return Hunspell_generate((Hunhandle*)(this), slst, word, pattern);
+const char* Hunspell::get_wordchars() const {
+ return m_Impl->get_wordchars();
+}
+
+const char* Hunspell::get_version() const {
+ return m_Impl->get_version();
+}
+
+int Hunspell::input_conv(const char* word, char* dest, size_t destsize) {
+ return m_Impl->input_conv(word, dest, destsize);
}
Hunhandle* Hunspell_create(const char* affpath, const char* dpath) {
- return (Hunhandle*)(new Hunspell(affpath, dpath));
+ return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath));
}
Hunhandle* Hunspell_create_key(const char* affpath,
const char* dpath,
const char* key) {
- return reinterpret_cast<Hunhandle*>(new Hunspell(affpath, dpath, key));
+ return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath, key));
}
void Hunspell_destroy(Hunhandle* pHunspell) {
- delete reinterpret_cast<Hunspell*>(pHunspell);
+ delete reinterpret_cast<HunspellImpl*>(pHunspell);
}
int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) {
- return reinterpret_cast<Hunspell*>(pHunspell)->add_dic(dpath);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->add_dic(dpath);
}
int Hunspell_spell(Hunhandle* pHunspell, const char* word) {
- return reinterpret_cast<Hunspell*>(pHunspell)->spell(std::string(word));
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->spell(word);
}
char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) {
- return reinterpret_cast<Hunspell*>(pHunspell)->get_dic_encoding();
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->get_dic_encoding();
}
int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) {
- std::vector<std::string> suggests = reinterpret_cast<Hunspell*>(pHunspell)->suggest(word);
- return munge_vector(slst, suggests);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word);
}
int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) {
- std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->analyze(word);
- return munge_vector(slst, stems);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word);
}
int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) {
-
- std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(word);
- return munge_vector(slst, stems);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, word);
}
int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) {
- std::vector<std::string> morph;
- for (int i = 0; i < n; ++i)
- morph.push_back(desc[i]);
-
- std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(morph);
- return munge_vector(slst, stems);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, desc, n);
}
int Hunspell_generate(Hunhandle* pHunspell,
char*** slst,
const char* word,
- const char* pattern) {
- std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, pattern);
- return munge_vector(slst, stems);
+ const char* pattern)
+{
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, pattern);
}
int Hunspell_generate2(Hunhandle* pHunspell,
char*** slst,
const char* word,
char** desc,
- int n) {
- std::vector<std::string> morph;
- for (int i = 0; i < n; ++i)
- morph.push_back(desc[i]);
-
- std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, morph);
- return munge_vector(slst, stems);
+ int n)
+{
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, desc, n);
}
/* functions for run-time modification of the dictionary */
@@ -1949,7 +2224,7 @@ int Hunspell_generate2(Hunhandle* pHunspell,
/* add word to the run-time dictionary */
int Hunspell_add(Hunhandle* pHunspell, const char* word) {
- return reinterpret_cast<Hunspell*>(pHunspell)->add(word);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word);
}
/* add word to the run-time dictionary with affix flags of
@@ -1960,58 +2235,15 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) {
int Hunspell_add_with_affix(Hunhandle* pHunspell,
const char* word,
const char* example) {
- return reinterpret_cast<Hunspell*>(pHunspell)->add_with_affix(word, example);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_affix(word, example);
}
/* remove word from the run-time dictionary */
int Hunspell_remove(Hunhandle* pHunspell, const char* word) {
- return reinterpret_cast<Hunspell*>(pHunspell)->remove(word);
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->remove(word);
}
-void Hunspell_free_list(Hunhandle*, char*** list, int n) {
- if (list && *list) {
- for (int i = 0; i < n; i++)
- free((*list)[i]);
- free(*list);
- *list = NULL;
- }
-}
-
-std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) {
- return m_Impl->suffix_suggest(root_word);
-}
-
-std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) {
- std::vector<std::string> slst;
- struct hentry* he = NULL;
- int len;
- std::string w2;
- const char* word;
- const char* ignoredchars = pAMgr->get_ignore();
- if (ignoredchars != NULL) {
- w2.assign(root_word);
- if (utf8) {
- const std::vector<w_char>& ignoredchars_utf16 =
- pAMgr->get_ignore_utf16();
- remove_ignored_chars_utf(w2, ignoredchars_utf16);
- } else {
- remove_ignored_chars(w2, ignoredchars);
- }
- word = w2.c_str();
- } else
- word = root_word.c_str();
-
- len = strlen(word);
-
- if (!len)
- return slst;
-
- for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
- he = m_HMgrs[i]->lookup(word);
- }
- if (he) {
- slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
- }
- return slst;
+void Hunspell_free_list(Hunhandle* pHunspell, char*** list, int n) {
+ reinterpret_cast<HunspellImpl*>(pHunspell)->free_list(list, n);
}
diff --git a/libs/hunspell/src/hunspell.hxx b/libs/hunspell/src/hunspell.hxx
index f728f829c2..8640a35ca1 100644
--- a/libs/hunspell/src/hunspell.hxx
+++ b/libs/hunspell/src/hunspell.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -78,7 +78,10 @@
#define SPELL_XML "<?xml?>"
+#ifndef MAXSUGGESTION
#define MAXSUGGESTION 15
+#endif
+
#define MAXSHARPS 5
#ifndef MAXWORDLEN
@@ -216,7 +219,6 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell {
struct cs_info* get_csconv();
- const char* get_try_string() const;
const char* get_version() const;
const std::string& get_version_cpp() const;
diff --git a/libs/hunspell/src/hunvisapi.h b/libs/hunspell/src/hunvisapi.h
index eb2b348091..ed0a502ba2 100644
--- a/libs/hunspell/src/hunvisapi.h
+++ b/libs/hunspell/src/hunvisapi.h
@@ -3,7 +3,7 @@
#if defined(HUNSPELL_STATIC)
# define LIBHUNSPELL_DLL_EXPORTED
-#elif defined(_MSC_VER)
+#elif defined(_WIN32)
# if defined(BUILDING_LIBHUNSPELL)
# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport)
# else
diff --git a/libs/hunspell/src/hunzip.c++ b/libs/hunspell/src/hunzip.c++
index 8962b100b1..64a9169c4b 100644
--- a/libs/hunspell/src/hunzip.c++
+++ b/libs/hunspell/src/hunzip.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -178,7 +178,7 @@ int Hunzip::getbuf() {
do {
if (inc == 0) {
fin.read(in, BUFSIZE);
- inbits = fin.gcount() * 8;
+ inbits = int(fin.gcount() * 8);
}
for (; inc < inbits; inc++) {
int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0;
diff --git a/libs/hunspell/src/hunzip.hxx b/libs/hunspell/src/hunzip.hxx
index ea2bc58d26..f57ea41cc0 100644
--- a/libs/hunspell/src/hunzip.hxx
+++ b/libs/hunspell/src/hunzip.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
diff --git a/libs/hunspell/src/langnum.hxx b/libs/hunspell/src/langnum.hxx
index a64d3d7869..39e63efdaa 100644
--- a/libs/hunspell/src/langnum.hxx
+++ b/libs/hunspell/src/langnum.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -48,6 +48,7 @@ enum {
LANG_az = 100, // custom number
LANG_bg = 41,
LANG_ca = 37,
+ LANG_crh = 102, // custom number
LANG_cs = 42,
LANG_da = 45,
LANG_de = 49,
diff --git a/libs/hunspell/src/replist.c++ b/libs/hunspell/src/replist.c++
index cabe382bfd..1395ade607 100644
--- a/libs/hunspell/src/replist.c++
+++ b/libs/hunspell/src/replist.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
diff --git a/libs/hunspell/src/replist.hxx b/libs/hunspell/src/replist.hxx
index 1e3efa4131..08daeb4488 100644
--- a/libs/hunspell/src/replist.hxx
+++ b/libs/hunspell/src/replist.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
diff --git a/libs/hunspell/src/suggestmgr.c++ b/libs/hunspell/src/suggestmgr.c++
index 73ea91e3a3..6b363debd5 100644
--- a/libs/hunspell/src/suggestmgr.c++
+++ b/libs/hunspell/src/suggestmgr.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -72,6 +72,7 @@
#include <string.h>
#include <stdio.h>
#include <ctype.h>
+#include <time.h>
#include "suggestmgr.hxx"
#include "htypes.hxx"
@@ -79,6 +80,8 @@
const w_char W_VLINE = {'\0', '|'};
+#define MAX_CHAR_DISTANCE 4
+
SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
// register affix manager and check in string of chars to
// try when building candidate suggestions
@@ -132,6 +135,11 @@ SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
ctryl = u8_u16(ctry_utf, tryme);
}
}
+
+ // language with possible dash usage
+ // (latin letters or dash in TRY characters)
+ lang_with_dash_usage = (ctry &&
+ ((strchr(ctry, '-') != NULL) || (strchr(ctry, 'a') != NULL)));
}
SuggestMgr::~SuggestMgr() {
@@ -169,10 +177,13 @@ void SuggestMgr::testsug(std::vector<std::string>& wlst,
}
}
-// generate suggestions for a misspelled word
-// pass in address of array of char * pointers
-// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
-void SuggestMgr::suggest(std::vector<std::string>& slst,
+/* generate suggestions for a misspelled word
+ * pass in address of array of char * pointers
+ * onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
+ * return value: true, if there is a good suggestion
+ * (REP, ph: or a dictionary word pair)
+ */
+bool SuggestMgr::suggest(std::vector<std::string>& slst,
const char* w,
int* onlycompoundsug) {
int nocompoundtwowords = 0;
@@ -182,6 +193,7 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
std::string w2;
const char* word = w;
size_t oldSug = 0;
+ bool good_suggestion = false;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
@@ -196,34 +208,49 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
if (utf8) {
wl = u8_u16(word_utf, word);
if (wl == -1) {
- return;
+ return false;
}
}
- for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0);
+ for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion;
cpdsuggest++) {
+
+ clock_t timelimit;
+ // initialize both in non-compound and compound cycles
+ timelimit = clock();
+
// limit compound suggestion
if (cpdsuggest > 0)
oldSug = slst.size();
// suggestions for an uppercase word (html -> HTML)
if (slst.size() < maxSug) {
+ size_t i = slst.size();
if (utf8)
- capchars_utf(slst, &word_utf[0], wl, cpdsuggest);
+ capchars_utf(slst, word_utf.data(), wl, cpdsuggest);
else
capchars(slst, word, cpdsuggest);
+ if (slst.size() > i)
+ good_suggestion = true;
}
// perhaps we made a typical fault of spelling
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
+ size_t i = slst.size();
replchars(slst, word, cpdsuggest);
+ if (slst.size() > i)
+ good_suggestion = true;
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// perhaps we made chose the wrong char from a related set
if ((slst.size() < maxSug) &&
(!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
mapchars(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// only suggest compound words when no other suggestion
if ((cpdsuggest == 0) && (slst.size() > nsugorig))
@@ -232,77 +259,99 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
// did we swap the order of chars by mistake
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- swapchar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ swapchar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
swapchar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we swap the order of non adjacent chars by mistake
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- longswapchar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ longswapchar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
longswapchar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we just hit the wrong key in place of a good char (case and keyboard)
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- badcharkey_utf(slst, &word_utf[0], wl, cpdsuggest);
+ badcharkey_utf(slst, word_utf.data(), wl, cpdsuggest);
else
badcharkey(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we add a char that should not be there
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- extrachar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ extrachar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
extrachar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we forgot a char
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- forgotchar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ forgotchar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
forgotchar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we move a char
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- movechar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ movechar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
movechar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we just hit the wrong key in place of a good char
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- badchar_utf(slst, &word_utf[0], wl, cpdsuggest);
+ badchar_utf(slst, word_utf.data(), wl, cpdsuggest);
else
badchar(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// did we double two characters
if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
if (utf8)
- doubletwochars_utf(slst, &word_utf[0], wl, cpdsuggest);
+ doubletwochars_utf(slst, word_utf.data(), wl, cpdsuggest);
else
doubletwochars(slst, word, cpdsuggest);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
// perhaps we forgot to hit space and two words ran together
- if (!nosplitsugs && (slst.size() < maxSug) &&
- (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
- twowords(slst, word, cpdsuggest);
+ // (dictionary word pairs have top priority here, so
+ // we always suggest them, in despite of nosplitsugs, and
+ // drop compound word and other suggestions)
+ if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) {
+ good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion);
}
+ if (clock() > timelimit + TIMELIMIT_SUGGESTION)
+ return good_suggestion;
} // repeating ``for'' statement compounding support
if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug)
*onlycompoundsug = 1;
+
+ return good_suggestion;
}
// suggestions for an uppercase word (html -> HTML)
@@ -450,8 +499,11 @@ int SuggestMgr::replchars(std::vector<std::string>& wlst,
return wlst.size();
}
-// perhaps we doubled two characters (pattern aba -> ababa, for example vacation
-// -> vacacation)
+// perhaps we doubled two characters
+// (for example vacation -> vacacation)
+// The recognized pattern with regex back-references:
+// "(.)(.)\1\2\1" or "..(.)(.)\1\2"
+
int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
const char* word,
int cpdsuggest) {
@@ -462,7 +514,7 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
for (int i = 2; i < wl; i++) {
if (word[i] == word[i - 2]) {
state++;
- if (state == 3) {
+ if (state == 3 || (state == 2 && i >= 4)) {
std::string candidate(word, word + i - 1);
candidate.insert(candidate.end(), word + i + 1, word + wl);
testsug(wlst, candidate, cpdsuggest, NULL, NULL);
@@ -475,8 +527,11 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
return wlst.size();
}
-// perhaps we doubled two characters (pattern aba -> ababa, for example vacation
-// -> vacacation)
+// perhaps we doubled two characters
+// (for example vacation -> vacacation)
+// The recognized pattern with regex back-references:
+// "(.)(.)\1\2\1" or "..(.)(.)\1\2"
+
int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst,
const w_char* word,
int wl,
@@ -487,7 +542,7 @@ int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst,
for (int i = 2; i < wl; i++) {
if (word[i] == word[i - 2]) {
state++;
- if (state == 3) {
+ if (state == 3 || (state == 2 && i >= 4)) {
std::vector<w_char> candidate_utf(word, word + i - 1);
candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl);
std::string candidate;
@@ -721,17 +776,22 @@ int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst,
return wlst.size();
}
-/* error is should have been two words */
-int SuggestMgr::twowords(std::vector<std::string>& wlst,
+/* error is should have been two words
+ * return value is true, if there is a dictionary word pair,
+ * or there was already a good suggestion before calling
+ * this function.
+ */
+bool SuggestMgr::twowords(std::vector<std::string>& wlst,
const char* word,
- int cpdsuggest) {
+ int cpdsuggest,
+ bool good) {
int c2;
int forbidden = 0;
int cwrd;
int wl = strlen(word);
if (wl < 3)
- return wlst.size();
+ return false;
if (langnum == LANG_hu)
forbidden = check_forbidden(word, wl);
@@ -750,63 +810,87 @@ int SuggestMgr::twowords(std::vector<std::string>& wlst,
}
if (utf8 && p[1] == '\0')
break; // last UTF-8 character
- *p = '\0';
- int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
- if (c1) {
- c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
- if (c2) {
- *p = ' ';
-
- // spec. Hungarian code (need a better compound word support)
- if ((langnum == LANG_hu) && !forbidden &&
- // if 3 repeating letter, use - instead of space
- (((p[-1] == p[1]) &&
- (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
- // or multiple compounding, with more, than 6 syllables
- ((c1 == 3) && (c2 >= 2))))
- *p = '-';
-
- cwrd = 1;
- for (size_t k = 0; k < wlst.size(); ++k) {
- if (wlst[k] == candidate) {
- cwrd = 0;
- break;
- }
- }
- if (wlst.size() < maxSug) {
- if (cwrd) {
- wlst.push_back(candidate);
- }
- } else {
- free(candidate);
- return wlst.size();
+
+ // Suggest only word pairs, if they are listed in the dictionary.
+ // For example, adding "a lot" to the English dic file will
+ // result only "alot" -> "a lot" suggestion instead of
+ // "alto, slot, alt, lot, allot, aloft, aloe, clot, plot, blot, a lot".
+ // Note: using "ph:alot" keeps the other suggestions:
+ // a lot ph:alot
+ // alot -> a lot, alto, slot...
+ *p = ' ';
+ if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
+ // remove not word pair suggestions
+ if (!good) {
+ good = true;
+ wlst.clear();
+ }
+ wlst.insert(wlst.begin(), candidate);
+ }
+
+ // word pairs with dash?
+ if (lang_with_dash_usage) {
+ *p = '-';
+
+ if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
+ // remove not word pair suggestions
+ if (!good) {
+ good = true;
+ wlst.clear();
}
- // add two word suggestion with dash, if TRY string contains
- // "a" or "-"
- // NOTE: cwrd doesn't modified for REP twoword sugg.
- if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
- mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
- *p = '-';
+ wlst.insert(wlst.begin(), candidate);
+ }
+ }
+
+ if (wlst.size() < maxSug && !nosplitsugs && !good) {
+ *p = '\0';
+ int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
+ if (c1) {
+ c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
+ if (c2) {
+ // spec. Hungarian code (TODO need a better compound word support)
+ if ((langnum == LANG_hu) && !forbidden &&
+ // if 3 repeating letter, use - instead of space
+ (((p[-1] == p[1]) &&
+ (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
+ // or multiple compounding, with more, than 6 syllables
+ ((c1 == 3) && (c2 >= 2))))
+ *p = '-';
+ else
+ *p = ' ';
+
+ cwrd = 1;
for (size_t k = 0; k < wlst.size(); ++k) {
if (wlst[k] == candidate) {
cwrd = 0;
break;
}
}
- if (wlst.size() < maxSug) {
- if (cwrd) {
+
+ if (cwrd && (wlst.size() < maxSug))
wlst.push_back(candidate);
+
+ // add two word suggestion with dash, depending on the language
+ // Note that cwrd doesn't modified for REP twoword sugg.
+ if ( !nosplitsugs && lang_with_dash_usage &&
+ mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
+ *p = '-';
+ for (size_t k = 0; k < wlst.size(); ++k) {
+ if (wlst[k] == candidate) {
+ cwrd = 0;
+ break;
+ }
}
- } else {
- free(candidate);
- return wlst.size();
+
+ if ((wlst.size() < maxSug) && cwrd)
+ wlst.push_back(candidate);
}
}
}
}
}
free(candidate);
- return wlst.size();
+ return good;
}
// error is adjacent letter were swapped
@@ -891,7 +975,8 @@ int SuggestMgr::longswapchar(std::vector<std::string>& wlst,
// try swapping not adjacent chars one by one
for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) {
- if (std::abs(std::distance(q, p)) > 1) {
+ size_t distance = std::abs(std::distance(q, p));
+ if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
std::swap(*p, *q);
testsug(wlst, candidate, cpdsuggest, NULL, NULL);
std::swap(*p, *q);
@@ -910,7 +995,8 @@ int SuggestMgr::longswapchar_utf(std::vector<std::string>& wlst,
// try swapping not adjacent chars
for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) {
- if (std::abs(std::distance(q, p)) > 1) {
+ size_t distance = std::abs(std::distance(q, p));
+ if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
std::swap(*p, *q);
std::string candidate;
u16_u8(candidate, candidate_utf);
@@ -932,7 +1018,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst,
// try moving a char
for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
- for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) {
+ for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
std::swap(*q, *(q - 1));
if (std::distance(p, q) < 2)
continue; // omit swap char
@@ -942,7 +1028,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst,
}
for (std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) {
- for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) {
+ for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
std::swap(*q, *(q - 1));
if (std::distance(p, q) < 2)
continue; // omit swap char
@@ -965,7 +1051,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
// try moving a char
for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
- for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) {
+ for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
std::swap(*q, *(q - 1));
if (std::distance(p, q) < 2)
continue; // omit swap char
@@ -977,7 +1063,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
}
for (std::vector<w_char>::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) {
- for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) < 10; ++q) {
+ for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
std::swap(*q, *(q - 1));
if (std::distance(p, q) < 2)
continue; // omit swap char
@@ -994,7 +1080,8 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
// generate a set of suggestions for very poorly spelled words
void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
const char* w,
- const std::vector<HashMgr*>& rHMgr) {
+ const std::vector<HashMgr*>& rHMgr,
+ int captype) {
int lval;
int sc;
int lp, lpphon;
@@ -1071,18 +1158,34 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
u8_u16(w_word, word);
u8_u16(w_target, target);
}
-
+
std::string f;
std::vector<w_char> w_f;
-
+
for (size_t i = 0; i < rHMgr.size(); ++i) {
while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
- if ((hp->astr) && (pAMgr) &&
- (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
- TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
- TESTAFF(hp->astr, nosuggest, hp->alen) ||
- TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
- TESTAFF(hp->astr, onlyincompound, hp->alen)))
+ // skip exceptions
+ if (
+ // skip it if the word length differs by 5 or
+ // more characters (to avoid strange suggestions)
+ // (except Unicode characters over BMP)
+ (((abs(n - hp->clen) > 4) && !nonbmp)) ||
+ // don't suggest capitalized dictionary words for
+ // lower case misspellings in ngram suggestions, except
+ // - PHONE usage, or
+ // - in the case of German, where not only proper
+ // nouns are capitalized, or
+ // - the capitalized word has special pronunciation
+ ((captype == NOCAP) && (hp->var & H_OPT_INITCAP) &&
+ !ph && (langnum != LANG_de) && !(hp->var & H_OPT_PHON)) ||
+ // or it has one of the following special flags
+ ((hp->astr) && (pAMgr) &&
+ (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
+ TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
+ TESTAFF(hp->astr, nosuggest, hp->alen) ||
+ TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
+ TESTAFF(hp->astr, onlyincompound, hp->alen)))
+ )
continue;
if (utf8) {
@@ -1105,7 +1208,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
}
- // check special pronounciation
+ // check special pronunciation
f.clear();
if ((hp->var & H_OPT_PHON) &&
copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
@@ -1559,7 +1662,8 @@ int SuggestMgr::checkword(const std::string& word,
if (rv) {
if ((rv->astr) &&
(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
- TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen)))
+ TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
+ TESTAFF(rv->astr, pAMgr->get_substandard(), rv->alen)))
return 0;
while (rv) {
if (rv->astr &&
@@ -1584,7 +1688,7 @@ int SuggestMgr::checkword(const std::string& word,
if (!rv && pAMgr->have_contclass()) {
rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL);
if (!rv)
- rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 1, FLAG_NULL);
+ rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 0, FLAG_NULL);
}
// check forbidden words
@@ -1649,15 +1753,15 @@ std::string SuggestMgr::suggest_morph(const std::string& in_w) {
TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) {
if (!HENTRY_FIND(rv, MORPH_STEM)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(w);
}
if (HENTRY_DATA(rv)) {
- result.append(" ");
+ result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
- result.append("\n");
+ result.push_back(MSEP_REC);
}
rv = rv->next_homonym;
}
@@ -1713,7 +1817,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
HENTRY_DATA(rv), pattern, 0);
if (!aff.empty()) {
result.append(aff);
- result.append("\n");
+ result.push_back(MSEP_REC);
}
}
@@ -1737,7 +1841,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
rv2->alen, HENTRY_DATA(rv2), pattern, 0);
if (!aff.empty()) {
result.append(aff);
- result.append("\n");
+ result.push_back(MSEP_REC);
}
}
}
@@ -1936,7 +2040,7 @@ int SuggestMgr::leftcommonsubstring(
int l2 = su2.size();
// decapitalize dictionary word
if (complexprefixes) {
- if (su1[l1 - 1] == su2[l2 - 1])
+ if (l1 && l2 && su1[l1 - 1] == su2[l2 - 1])
return 1;
} else {
unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
diff --git a/libs/hunspell/src/suggestmgr.hxx b/libs/hunspell/src/suggestmgr.hxx
index 19ffc03a84..4c2fb69032 100644
--- a/libs/hunspell/src/suggestmgr.hxx
+++ b/libs/hunspell/src/suggestmgr.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -78,11 +78,6 @@
#define MAXPHONSUGS 2
#define MAXCOMPOUNDSUGS 3
-// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function
-#define TIMELIMIT (CLOCKS_PER_SEC >> 2)
-#define MINTIMER 100
-#define MAXPLUSTIMER 100
-
#define NGRAM_LONGER_WORSE (1 << 0)
#define NGRAM_ANY_MISMATCH (1 << 1)
#define NGRAM_LOWERING (1 << 2)
@@ -92,7 +87,6 @@
#include "affixmgr.hxx"
#include "hashmgr.hxx"
#include "langnum.hxx"
-#include <time.h>
enum { LCS_UP, LCS_LEFT, LCS_UPLEFT };
@@ -109,6 +103,7 @@ class SuggestMgr {
char* ctry;
size_t ctryl;
std::vector<w_char> ctry_utf;
+ bool lang_with_dash_usage;
AffixMgr* pAMgr;
unsigned int maxSug;
@@ -124,8 +119,8 @@ class SuggestMgr {
SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr);
~SuggestMgr();
- void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
- void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr);
+ bool suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
+ void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr, int captype);
std::string suggest_morph(const std::string& word);
std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern);
@@ -149,7 +144,7 @@ class SuggestMgr {
int extrachar(std::vector<std::string>&, const char*, int);
int badcharkey(std::vector<std::string>&, const char*, int);
int badchar(std::vector<std::string>&, const char*, int);
- int twowords(std::vector<std::string>&, const char*, int);
+ bool twowords(std::vector<std::string>&, const char*, int, bool);
void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int);
int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int);
diff --git a/libs/hunspell/src/utf_info.hxx b/libs/hunspell/src/utf_info.hxx
index 6bb847f2a6..9ab9f7a5fe 100644
--- a/libs/hunspell/src/utf_info.hxx
+++ b/libs/hunspell/src/utf_info.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -35,9 +35,15 @@
*
* ***** END LICENSE BLOCK ***** */
-#include "csutil.hxx"
+// Unicode character encoding information: one record per letter (letter, toupper, tolower)
+struct unicode_info {
+ unsigned short c;       // the Unicode letter this record describes
+ unsigned short cupper;  // toupper mapping of c (equals c when c is already upper case, e.g. 0x0041)
+ unsigned short clower;  // tolower mapping of c (e.g. 0x0061 for 0x0041)
+};
+
/* fields: Unicode letter, toupper, tolower */
-static struct unicode_info utf_lst[] = {
+static const struct unicode_info utf_lst[] = {
{0x0041, 0x0041, 0x0061}, {0x0042, 0x0042, 0x0062},
{0x0043, 0x0043, 0x0063}, {0x0044, 0x0044, 0x0064},
{0x0045, 0x0045, 0x0065}, {0x0046, 0x0046, 0x0066},
diff --git a/libs/hunspell/src/w_char.hxx b/libs/hunspell/src/w_char.hxx
index 5accb7568f..7e71d04680 100644
--- a/libs/hunspell/src/w_char.hxx
+++ b/libs/hunspell/src/w_char.hxx
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with