summaryrefslogtreecommitdiff
path: root/libs/hunspell/src/hunspell.c++
diff options
context:
space:
mode:
authorGeorge Hazan <george.hazan@gmail.com>2023-10-01 13:24:29 +0300
committerGeorge Hazan <george.hazan@gmail.com>2023-10-01 13:24:29 +0300
commit93babaf1665bd2b706a096057fdd06572c159622 (patch)
treeefa9da647497149233facb88a422d9d31564bb4b /libs/hunspell/src/hunspell.c++
parent13627600541c19643b96bd4c7b3a3d9fd92040f9 (diff)
fixes #3696 (Update hunspell to 1.7.2)
Diffstat (limited to 'libs/hunspell/src/hunspell.c++')
-rw-r--r--libs/hunspell/src/hunspell.c++464
1 files changed, 289 insertions, 175 deletions
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++
index 779f8ec241..f340c0ad2c 100644
--- a/libs/hunspell/src/hunspell.c++
+++ b/libs/hunspell/src/hunspell.c++
@@ -68,10 +68,10 @@
* SUCH DAMAGE.
*/
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <time.h>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <ctime>
#include "affixmgr.hxx"
#include "hunspell.hxx"
@@ -88,6 +88,8 @@ class HunspellImpl
{
public:
HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL);
+ HunspellImpl(const HunspellImpl&) = delete;
+ HunspellImpl& operator=(const HunspellImpl&) = delete;
~HunspellImpl();
int add_dic(const char* dpath, const char* key = NULL);
std::vector<std::string> suffix_suggest(const std::string& root_word);
@@ -98,12 +100,15 @@ public:
std::vector<std::string> analyze(const std::string& word);
int get_langnum() const;
bool input_conv(const std::string& word, std::string& dest);
- bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
+ bool spell(const std::string& word, std::vector<std::string>& candidate_stack,
+ int* info = NULL, std::string* root = NULL);
std::vector<std::string> suggest(const std::string& word);
+ std::vector<std::string> suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack);
const std::string& get_wordchars_cpp() const;
const std::vector<w_char>& get_wordchars_utf16() const;
const std::string& get_dict_encoding() const;
int add(const std::string& word);
+ int add_with_flags(const std::string& word, const std::string& flags, const std::string& desc = NULL);
int add_with_affix(const std::string& word, const std::string& example);
int remove(const std::string& word);
const std::string& get_version_cpp() const;
@@ -120,7 +125,7 @@ public:
int generate(char*** slst, const char* word, const char* word2);
int generate(char*** slst, const char* word, char** desc, int n);
const char* get_wordchars() const;
- const char* get_try_string() const { return pAMgr->get_try_string(); }
+ const char* get_try_string() const;
const char* get_version() const;
int input_conv(const char* word, char* dest, size_t destsize);
@@ -128,7 +133,7 @@ private:
AffixMgr* pAMgr;
std::vector<HashMgr*> m_HMgrs;
SuggestMgr* pSMgr;
- char* affixpath;
+ std::string affixpath;
std::string encoding;
struct cs_info* csconv;
int langnum;
@@ -138,9 +143,12 @@ private:
private:
std::vector<std::string> analyze_internal(const std::string& word);
- bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL);
+ bool spell_internal(const std::string& word, std::vector<std::string>& candidate_stack,
+ int* info = NULL, std::string* root = NULL);
std::vector<std::string> suggest_internal(const std::string& word,
- bool& capitalized, size_t& abbreviated, int& captype);
+ std::vector<std::string>& spell_candidate_stack,
+ std::vector<std::string>& suggest_candidate_stack,
+ bool& capitalized, size_t& abbreviated, int& captype);
void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev);
size_t cleanword2(std::string& dest,
std::vector<w_char>& dest_u,
@@ -165,16 +173,13 @@ private:
std::string::size_type get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr);
std::vector<std::string> get_xml_list(const std::string& list, std::string::size_type pos, const char* tag);
int check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value);
-private:
- HunspellImpl(const HunspellImpl&);
- HunspellImpl& operator=(const HunspellImpl&);
};
-HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) {
+HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key)
+ : affixpath(affpath) {
csconv = NULL;
utf8 = 0;
complexprefixes = 0;
- affixpath = mystrdup(affpath);
/* first set up the hash manager */
m_HMgrs.push_back(new HashMgr(dpath, affpath, key));
@@ -185,7 +190,7 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k
/* get the preferred try string and the dictionary */
/* encoding from the Affix Manager for that dictionary */
- char* try_string = pAMgr->get_try_string();
+ std::string try_string = pAMgr->get_try_string();
encoding = pAMgr->get_encoding();
langnum = pAMgr->get_langnum();
utf8 = pAMgr->get_utf8();
@@ -196,31 +201,24 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k
/* and finally set up the suggestion manager */
pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
- if (try_string)
- free(try_string);
}
HunspellImpl::~HunspellImpl() {
delete pSMgr;
delete pAMgr;
- for (size_t i = 0; i < m_HMgrs.size(); ++i)
- delete m_HMgrs[i];
+ for (auto& m_HMgr : m_HMgrs)
+ delete m_HMgr;
pSMgr = NULL;
pAMgr = NULL;
#ifdef MOZILLA_CLIENT
delete[] csconv;
#endif
csconv = NULL;
- if (affixpath)
- free(affixpath);
- affixpath = NULL;
}
// load extra dictionaries
int HunspellImpl::add_dic(const char* dpath, const char* key) {
- if (!affixpath)
- return 1;
- m_HMgrs.push_back(new HashMgr(dpath, affixpath, key));
+ m_HMgrs.push_back(new HashMgr(dpath, affixpath.c_str(), key));
return 0;
}
@@ -264,14 +262,17 @@ size_t HunspellImpl::cleanword2(std::string& dest,
clean_ignore(w2, src);
const char* q = w2.c_str();
+ int nl = (int)w2.size();
// first skip over any leading blanks
- while (*q == ' ')
+ while (*q == ' ') {
++q;
-
+ nl--;
+ }
+
// now strip off any trailing periods (recording their presence)
*pabbrev = 0;
- int nl = strlen(q);
+
while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
@@ -300,15 +301,17 @@ void HunspellImpl::cleanword(std::string& dest,
int* pabbrev) {
dest.clear();
const unsigned char* q = (const unsigned char*)src.c_str();
- int firstcap = 0;
+ int firstcap = 0, nl = (int)src.size();
// first skip over any leading blanks
- while (*q == ' ')
+ while (*q == ' ') {
++q;
-
+ nl--;
+ }
+
// now strip off any trailing periods (recording their presence)
*pabbrev = 0;
- int nl = strlen((const char*)q);
+
while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
@@ -340,9 +343,9 @@ void HunspellImpl::cleanword(std::string& dest,
} else {
std::vector<w_char> t;
u8_u16(t, src);
- for (size_t i = 0; i < t.size(); ++i) {
- unsigned short idx = (t[i].h << 8) + t[i].l;
- unsigned short low = unicodetolower(idx, langnum);
+ for (auto& wc : t) {
+ const auto idx = (unsigned short)wc;
+ const auto low = unicodetolower(idx, langnum);
if (idx != low)
ncap++;
if (unicodetoupper(idx, langnum) == low)
@@ -350,7 +353,7 @@ void HunspellImpl::cleanword(std::string& dest,
}
u16_u8(dest, t);
if (ncap) {
- unsigned short idx = (t[0].h << 8) + t[0].l;
+ const auto idx = (unsigned short)t[0];
firstcap = (idx != unicodetolower(idx, langnum));
}
}
@@ -435,8 +438,16 @@ void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string&
slst.insert(slst.begin(), word);
}
-bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
- bool r = spell_internal(word, info, root);
+bool HunspellImpl::spell(const std::string& word, std::vector<std::string>& candidate_stack,
+ int* info, std::string* root) {
+ // something very broken if spell ends up calling itself with the same word
+ if (std::find(candidate_stack.begin(), candidate_stack.end(), word) != candidate_stack.end())
+ return false;
+
+ candidate_stack.push_back(word);
+ bool r = spell_internal(word, candidate_stack, info, root);
+ candidate_stack.pop_back();
+
if (r && root) {
// output conversion
RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
@@ -450,7 +461,8 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
return r;
}
-bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) {
+bool HunspellImpl::spell_internal(const std::string& word, std::vector<std::string>& candidate_stack,
+ int* info, std::string* root) {
struct hentry* rv = NULL;
int info2 = 0;
@@ -488,6 +500,11 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
wl = cleanword2(scw, sunicw, word, &captype, &abbv);
}
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+ if (wl > 32768)
+ return false;
+#endif
+
#ifdef MOZILLA_CLIENT
// accept the abbreviated words without dots
// workaround for the incomplete tokenization of Mozilla
@@ -552,8 +569,7 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
//conversion may result in string with different len to pre-mkallsmall2
//so re-scan
if (apos != std::string::npos && apos < scw.size() - 1) {
- std::string part1 = scw.substr(0, apos+1);
- std::string part2 = scw.substr(apos+1);
+ std::string part1 = scw.substr(0, apos + 1), part2 = scw.substr(apos + 1);
if (utf8) {
std::vector<w_char> part1u, part2u;
u8_u16(part1u, part1);
@@ -679,61 +695,78 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
wl = scw.size();
// calculate break points for recursion limit
- for (size_t j = 0; j < wordbreak.size(); ++j) {
+ for (auto& j : wordbreak) {
size_t pos = 0;
- while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) {
+ while ((pos = scw.find(j, pos)) != std::string::npos) {
++nbr;
- pos += wordbreak[j].size();
+ pos += j.size();
}
}
if (nbr >= 10)
return false;
// check boundary patterns (^begin and end$)
- for (size_t j = 0; j < wordbreak.size(); ++j) {
- size_t plen = wordbreak[j].size();
+ for (auto& j : wordbreak) {
+ size_t plen = j.size();
if (plen == 1 || plen > wl)
continue;
- if (wordbreak[j][0] == '^' &&
- scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1)))
+ if (j[0] == '^' &&
+ scw.compare(0, plen - 1, j, 1, plen -1) == 0 && spell(scw.substr(plen - 1), candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true;
+ }
- if (wordbreak[j][plen - 1] == '$' &&
- scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) {
+ if (j[plen - 1] == '$' &&
+ scw.compare(wl - plen + 1, plen - 1, j, 0, plen - 1) == 0) {
std::string suffix(scw.substr(wl - plen + 1));
scw.resize(wl - plen + 1);
- if (spell(scw))
+ if (spell(scw, candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true;
+ }
scw.append(suffix);
}
}
// other patterns
- for (size_t j = 0; j < wordbreak.size(); ++j) {
- size_t plen = wordbreak[j].size();
- size_t found = scw.find(wordbreak[j]);
+ for (auto& j : wordbreak) {
+ size_t plen = j.size();
+ size_t found = scw.find(j);
if ((found > 0) && (found < wl - plen)) {
- size_t found2 = scw.find(wordbreak[j], found + 1);
+ size_t found2 = scw.find(j, found + 1);
// try to break at the second occurance
// to recognize dictionary words with wordbreak
if (found2 > 0 && (found2 < wl - plen))
found = found2;
- if (!spell(scw.substr(found + plen)))
+ std::string substring(scw.substr(found + plen));
+ if (!spell(substring, candidate_stack))
continue;
std::string suffix(scw.substr(found));
scw.resize(found);
// examine 2 sides of the break point
- if (spell(scw))
+ if (spell(scw, candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true;
+ }
scw.append(suffix);
// LANG_hu: spec. dash rule
- if (langnum == LANG_hu && wordbreak[j] == "-") {
+ if (langnum == LANG_hu && j == "-") {
suffix = scw.substr(found + 1);
scw.resize(found + 1);
- if (spell(scw))
+ if (spell(scw, candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true; // check the first part with dash
+ }
scw.append(suffix);
}
// end of LANG specific region
@@ -741,25 +774,32 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
}
// other patterns (break at first break point)
- for (size_t j = 0; j < wordbreak.size(); ++j) {
- size_t plen = wordbreak[j].size();
- size_t found = scw.find(wordbreak[j]);
+ for (auto& j : wordbreak) {
+ size_t plen = j.size(), found = scw.find(j);
if ((found > 0) && (found < wl - plen)) {
- if (!spell(scw.substr(found + plen)))
+ if (!spell(scw.substr(found + plen), candidate_stack))
continue;
std::string suffix(scw.substr(found));
scw.resize(found);
// examine 2 sides of the break point
- if (spell(scw))
+ if (spell(scw, candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true;
+ }
scw.append(suffix);
// LANG_hu: spec. dash rule
- if (langnum == LANG_hu && wordbreak[j] == "-") {
+ if (langnum == LANG_hu && j == "-") {
suffix = scw.substr(found + 1);
scw.resize(found + 1);
- if (spell(scw))
+ if (spell(scw, candidate_stack))
+ {
+ if (info)
+ *info |= SPELL_COMPOUND;
return true; // check the first part with dash
+ }
scw.append(suffix);
}
// end of LANG specific region
@@ -771,33 +811,28 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
}
struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) {
- std::string w2;
- const char* word;
- int len;
+ std::string word;
// remove IGNORE characters from the string
- clean_ignore(w2, w);
-
- word = w2.c_str();
- len = w2.size();
+ clean_ignore(word, w);
- if (!len)
+ if (word.empty())
return NULL;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (utf8)
- reverseword_utf(w2);
+ reverseword_utf(word);
else
- reverseword(w2);
+ reverseword(word);
}
- word = w2.c_str();
+ int len = word.size();
// look word in hash table
struct hentry* he = NULL;
for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
- he = m_HMgrs[i]->lookup(word);
+ he = m_HMgrs[i]->lookup(word.c_str(), word.size());
// check forbidden and onlyincompound words
if ((he) && (he->astr) && (pAMgr) &&
@@ -828,8 +863,8 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
// check with affixes
if (!he && pAMgr) {
- // try stripping off affixes */
- he = pAMgr->affix_check(word, len, 0);
+ // try stripping off affixes
+ he = pAMgr->affix_check(word, 0, len, 0);
// check compound restriction and onlyupcase
if (he && he->astr &&
@@ -858,11 +893,35 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
}
// try check compound word
} else if (pAMgr->get_compound()) {
- struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
- he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
+ struct hentry* rwords[100] = {}; // buffer for COMPOUND pattern checking
+
+ // first allow only 2 words in the compound
+ int setinfo = SPELL_COMPOUND_2;
+ if (info)
+ setinfo |= *info;
+ he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, &setinfo);
+ if (info)
+ *info = setinfo & ~SPELL_COMPOUND_2;
+ // if not 2-word compoud word, try with 3 or more words
+ // (only if original info didn't forbid it)
+ if (!he && info && !(*info & SPELL_COMPOUND_2)) {
+ *info &= ~SPELL_COMPOUND_2;
+ he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
+ // accept the compound with 3 or more words only if it is
+ // - not a dictionary word with a typo and
+ // - not two words written separately,
+ // - or if it's an arbitrary number accepted by compound rules (e.g. 999%)
+ if (he && !isdigit(word[0]))
+ {
+ std::vector<std::string> slst;
+ if (pSMgr->suggest(slst, word, NULL, /*test_simplesug=*/true))
+ he = NULL;
+ }
+ }
+
// LANG_hu section: `moving rule' with last dash
if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
- std::string dup(word, len - 1);
+ std::string dup(word, 0, len - 1);
he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info);
}
// end of LANG specific region
@@ -885,31 +944,48 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
return he;
}
-std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define MAX_CANDIDATE_STACK_DEPTH 512
+#else
+#define MAX_CANDIDATE_STACK_DEPTH 2048
+#endif
+
+std::vector<std::string> HunspellImpl::suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack) {
+
+ if (suggest_candidate_stack.size() > MAX_CANDIDATE_STACK_DEPTH || // apply a fairly arbitrary depth limit
+ // something very broken if suggest ends up calling itself with the same word
+ std::find(suggest_candidate_stack.begin(), suggest_candidate_stack.end(), word) != suggest_candidate_stack.end()) {
+ return { };
+ }
+
bool capwords;
size_t abbv;
int captype;
- std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype);
+ std::vector<std::string> spell_candidate_stack;
+ suggest_candidate_stack.push_back(word);
+ std::vector<std::string> slst = suggest_internal(word, spell_candidate_stack, suggest_candidate_stack,
+ capwords, abbv, captype);
+ suggest_candidate_stack.pop_back();
// word reversing wrapper for complex prefixes
if (complexprefixes) {
- for (size_t j = 0; j < slst.size(); ++j) {
+ for (auto& j : slst) {
if (utf8)
- reverseword_utf(slst[j]);
+ reverseword_utf(j);
else
- reverseword(slst[j]);
+ reverseword(j);
}
}
// capitalize
if (capwords)
- for (size_t j = 0; j < slst.size(); ++j) {
- mkinitcap(slst[j]);
+ for (auto& j : slst) {
+ mkinitcap(j);
}
// expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (size_t j = 0; j < slst.size(); ++j) {
- slst[j].append(word.substr(word.size() - abbv));
+ if (abbv && pAMgr && pAMgr->get_sugswithdots() && word.size() >= abbv) {
+ for (auto& j : slst) {
+ j.append(word.substr(word.size() - abbv));
}
}
@@ -920,7 +996,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
case ALLCAP: {
size_t l = 0;
for (size_t j = 0; j < slst.size(); ++j) {
- if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
+ if (slst[j].find(' ') == std::string::npos && !spell(slst[j], spell_candidate_stack)) {
std::string s;
std::vector<w_char> w;
if (utf8) {
@@ -929,12 +1005,12 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
s = slst[j];
}
mkallsmall2(s, w);
- if (spell(s)) {
+ if (spell(s, spell_candidate_stack)) {
slst[l] = s;
++l;
} else {
mkinitcap2(s, w);
- if (spell(s)) {
+ if (spell(s, spell_candidate_stack)) {
slst[l] = s;
++l;
}
@@ -976,7 +1052,14 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
return slst;
}
+std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+ std::vector<std::string> suggest_candidate_stack;
+ return suggest(word, suggest_candidate_stack);
+}
+
std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
+ std::vector<std::string>& spell_candidate_stack,
+ std::vector<std::string>& suggest_candidate_stack,
bool& capwords, size_t& abbv, int& captype) {
captype = NOCAP;
abbv = 0;
@@ -1017,6 +1100,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
if (wl == 0)
return slst;
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+ if (wl > 32768)
+ return slst;
+#endif
}
bool good = false;
@@ -1038,13 +1126,13 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
switch (captype) {
case NOCAP: {
- good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
if (abbv) {
std::string wspace(scw);
wspace.push_back('.');
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
}
@@ -1053,12 +1141,12 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
case INITCAP: {
capwords = true;
- good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
break;
@@ -1067,7 +1155,7 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
capwords = true;
/* FALLTHROUGH */
case HUHCAP: {
- good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
// something.The -> something. The
@@ -1095,23 +1183,23 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
// TheOpenOffice.org -> The OpenOffice.org
wspace = scw;
mkinitsmall2(wspace, sunicw);
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
}
wspace = scw;
mkallsmall2(wspace, sunicw);
- if (spell(wspace.c_str()))
+ if (spell(wspace, spell_candidate_stack))
insert_sug(slst, wspace);
size_t prevns = slst.size();
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
if (captype == HUHINITCAP) {
mkinitcap2(wspace, sunicw);
- if (spell(wspace.c_str()))
+ if (spell(wspace, spell_candidate_stack))
insert_sug(slst, wspace);
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
}
@@ -1140,22 +1228,22 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
- if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
+ if (pAMgr && pAMgr->get_keepcase() && spell(wspace, spell_candidate_stack))
insert_sug(slst, wspace);
mkinitcap2(wspace, sunicw);
- good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+ good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
- for (size_t j = 0; j < slst.size(); ++j) {
- mkallcap(slst[j]);
+ for (auto& j : slst) {
+ mkallcap(j);
if (pAMgr && pAMgr->get_checksharps()) {
if (utf8) {
- mystrrep(slst[j], "\xC3\x9F", "SS");
+ mystrrep(j, "\xC3\x9F", "SS");
} else {
- mystrrep(slst[j], "\xDF", "SS");
+ mystrrep(j, "\xDF", "SS");
}
}
}
@@ -1165,17 +1253,17 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
// LANG_hu section: replace '-' with ' ' in Hungarian
if (langnum == LANG_hu) {
- for (size_t j = 0; j < slst.size(); ++j) {
- size_t pos = slst[j].find('-');
+ for (auto& j : slst) {
+ size_t pos = j.find('-');
if (pos != std::string::npos) {
int info;
- std::string w(slst[j].substr(0, pos));
- w.append(slst[j].substr(pos + 1));
- (void)spell(w, &info, NULL);
+ std::string w(j.substr(0, pos));
+ w.append(j.substr(pos + 1));
+ (void)spell(w, spell_candidate_stack, &info, NULL);
if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
- slst[j][pos] = ' ';
+ j[pos] = ' ';
} else
- slst[j][pos] = '-';
+ j[pos] = '-';
}
}
}
@@ -1246,11 +1334,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
if (dash_pos == scw.size())
last = 1;
std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
- if (!spell(chunk.c_str())) {
- std::vector<std::string> nlst = suggest(chunk.c_str());
+ if (chunk != word && !spell(chunk, spell_candidate_stack)) {
+ std::vector<std::string> nlst = suggest(chunk, suggest_candidate_stack);
if (clock() > timelimit + TIMELIMIT_GLOBAL)
return slst;
- for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
+ for (auto j = nlst.rbegin(); j != nlst.rend(); ++j) {
std::string wspace = scw.substr(0, prev_pos);
wspace.append(*j);
if (!last) {
@@ -1286,12 +1374,11 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc
std::string result2;
if (desc.empty())
return slst;
- for (size_t i = 0; i < desc.size(); ++i) {
-
+ for (const auto& i : desc) {
std::string result;
// add compound word parts (except the last one)
- const char* s = desc[i].c_str();
+ const char* s = i.c_str();
const char* part = strstr(s, MORPH_PART);
if (part) {
const char* nextpart = strstr(part + 1, MORPH_PART);
@@ -1311,34 +1398,34 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc
tok[alt + 1] = MSEP_ALT;
}
std::vector<std::string> pl = line_tok(tok, MSEP_ALT);
- for (size_t k = 0; k < pl.size(); ++k) {
+ for (auto& k : pl) {
// add derivational suffixes
- if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) {
+ if (k.find(MORPH_DERI_SFX) != std::string::npos) {
// remove inflectional suffixes
- const size_t is = pl[k].find(MORPH_INFL_SFX);
+ const size_t is = k.find(MORPH_INFL_SFX);
if (is != std::string::npos)
- pl[k].resize(is);
+ k.resize(is);
std::vector<std::string> singlepl;
- singlepl.push_back(pl[k]);
- std::string sg = pSMgr->suggest_gen(singlepl, pl[k]);
+ singlepl.push_back(k);
+ std::string sg = pSMgr->suggest_gen(singlepl, k);
if (!sg.empty()) {
std::vector<std::string> gen = line_tok(sg, MSEP_REC);
- for (size_t j = 0; j < gen.size(); ++j) {
+ for (auto& j : gen) {
result2.push_back(MSEP_REC);
result2.append(result);
- result2.append(gen[j]);
+ result2.append(j);
}
}
} else {
result2.push_back(MSEP_REC);
result2.append(result);
- if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) {
+ if (k.find(MORPH_SURF_PFX) != std::string::npos) {
std::string field;
- copy_field(field, pl[k], MORPH_SURF_PFX);
+ copy_field(field, k, MORPH_SURF_PFX);
result2.append(field);
}
std::string field;
- copy_field(field, pl[k], MORPH_STEM);
+ copy_field(field, k, MORPH_STEM);
result2.append(field);
}
}
@@ -1397,6 +1484,12 @@ int HunspellImpl::add(const std::string& word) {
return 0;
}
+int HunspellImpl::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) {
+ if (!m_HMgrs.empty())
+ return m_HMgrs[0]->add_with_flags(word, flags, desc);
+ return 0;
+}
+
int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) {
if (!m_HMgrs.empty())
return m_HMgrs[0]->add_with_affix(word, example);
@@ -1441,7 +1534,7 @@ std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
}
std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) {
- std::vector<std::string> slst;
+ std::vector<std::string> candidate_stack, slst;
if (!pSMgr || m_HMgrs.empty())
return slst;
if (utf8) {
@@ -1593,12 +1686,11 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
if (dash_pos != std::string::npos) {
int nresult = 0;
- std::string part1 = scw.substr(0, dash_pos);
- std::string part2 = scw.substr(dash_pos+1);
+ std::string part1 = scw.substr(0, dash_pos), part2 = scw.substr(dash_pos + 1);
// examine 2 sides of the dash
if (part2.empty()) { // base word ending with dash
- if (spell(part1)) {
+ if (spell(part1, candidate_stack)) {
std::string p = pSMgr->suggest_morph(part1);
if (!p.empty()) {
slst = line_tok(p, MSEP_REC);
@@ -1606,7 +1698,7 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
}
}
} else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat.
- if (spell(part1) && (spell("-e"))) {
+ if (spell(part1, candidate_stack) && (spell("-e", candidate_stack))) {
std::string st = pSMgr->suggest_morph(part1);
if (!st.empty()) {
result.append(st);
@@ -1621,9 +1713,9 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
} else {
// first word ending with dash: word- XXX ???
part1.push_back(' ');
- nresult = spell(part1);
+ nresult = spell(part1, candidate_stack);
part1.erase(part1.size() - 1);
- if (nresult && spell(part2) &&
+ if (nresult && spell(part2, candidate_stack) &&
((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
std::string st = pSMgr->suggest_morph(part1);
if (!st.empty()) {
@@ -1678,14 +1770,13 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
if (!pSMgr || pl.empty())
return slst;
std::vector<std::string> pl2 = analyze(word);
- int captype = NOCAP;
- int abbv = 0;
+ int captype = NOCAP, abbv = 0;
std::string cw;
cleanword(cw, word, &captype, &abbv);
std::string result;
- for (size_t i = 0; i < pl.size(); ++i) {
- cat_result(result, pSMgr->suggest_gen(pl2, pl[i]));
+ for (const auto& i : pl) {
+ cat_result(result, pSMgr->suggest_gen(pl2, i));
}
if (!result.empty()) {
@@ -1698,16 +1789,17 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
// capitalize
if (captype == INITCAP || captype == HUHINITCAP) {
- for (size_t j = 0; j < slst.size(); ++j) {
- mkinitcap(slst[j]);
+ for (auto& str : slst) {
+ mkinitcap(str);
}
}
// temporary filtering of prefix related errors (eg.
// generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
- std::vector<std::string>::iterator it = slst.begin();
+ auto it = slst.begin();
while (it != slst.end()) {
- if (!spell(*it)) {
+ std::vector<std::string> candidate_stack;
+ if (!spell(*it, candidate_stack)) {
it = slst.erase(it);
} else {
++it;
@@ -1778,10 +1870,8 @@ std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::stri
int HunspellImpl::check_xml_par(const std::string& q, std::string::size_type pos,
const char* attr,
const char* value) {
- std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr));
- if (cw == value)
- return 1;
- return 0;
+ const std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr));
+ return cw == value ? 1 : 0;
}
std::vector<std::string> HunspellImpl::get_xml_list(const std::string& list, std::string::size_type pos, const char* tag) {
@@ -1826,10 +1916,9 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
// convert the result to <code><a>ana1</a><a>ana2</a></code> format
std::string r;
r.append("<code>");
- for (size_t i = 0; i < slst.size(); ++i) {
+ for (auto entry : slst) {
r.append("<a>");
- std::string entry(slst[i]);
mystrrep(entry, "\t", " ");
mystrrep(entry, "&", "&amp;");
mystrrep(entry, "<", "&lt;");
@@ -1902,34 +1991,43 @@ std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_wo
remove_ignored_chars(w2, ignoredchars);
}
word = w2.c_str();
- } else
+ len = (int)w2.size();
+ } else {
word = root_word.c_str();
-
- len = strlen(word);
+ len = (int)root_word.size();
+ }
if (!len)
return slst;
for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
- he = m_HMgrs[i]->lookup(word);
+ he = m_HMgrs[i]->lookup(word, len);
}
if (he) {
- slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
+ slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word);
}
return slst;
}
namespace {
+ // using malloc because this is for the c-api where the callers
+ // expect to be able to use free
+ char* stringdup(const std::string& s) {
+ size_t sl = s.size() + 1;
+ char* d = (char*)malloc(sl);
+ if (d)
+ memcpy(d, s.c_str(), sl);
+ return d;
+ }
+
int munge_vector(char*** slst, const std::vector<std::string>& items) {
if (items.empty()) {
*slst = NULL;
return 0;
} else {
- *slst = (char**)malloc(sizeof(char*) * items.size());
- if (!*slst)
- return 0;
+ *slst = new char*[items.size()];
for (size_t i = 0; i < items.size(); ++i)
- (*slst)[i] = mystrdup(items[i].c_str());
+ (*slst)[i] = stringdup(items[i]);
}
return items.size();
}
@@ -1937,12 +2035,13 @@ namespace {
int HunspellImpl::spell(const char* word, int* info, char** root) {
std::string sroot;
- bool ret = spell(word, info, root ? &sroot : NULL);
+ std::vector<std::string> candidate_stack;
+ bool ret = spell(word, candidate_stack, info, root ? &sroot : NULL);
if (root) {
if (sroot.empty()) {
*root = NULL;
} else {
- *root = mystrdup(sroot.c_str());
+ *root = stringdup(sroot);
}
}
return ret;
@@ -1962,7 +2061,7 @@ void HunspellImpl::free_list(char*** slst, int n) {
if (slst && *slst) {
for (int i = 0; i < n; i++)
free((*slst)[i]);
- free(*slst);
+ delete[] *slst;
*slst = NULL;
}
}
@@ -1984,8 +2083,7 @@ int HunspellImpl::stem(char*** slst, const char* word) {
int HunspellImpl::stem(char*** slst, char** desc, int n) {
std::vector<std::string> morph;
morph.reserve(n);
- for (int i = 0; i < n; ++i)
- morph.push_back(desc[i]);
+ for (int i = 0; i < n; ++i) morph.emplace_back(desc[i]);
std::vector<std::string> stems = stem(morph);
return munge_vector(slst, stems);
@@ -1999,8 +2097,7 @@ int HunspellImpl::generate(char*** slst, const char* word, const char* pattern)
int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) {
std::vector<std::string> morph;
morph.reserve(pln);
- for (int i = 0; i < pln; ++i)
- morph.push_back(pl[i]);
+ for (int i = 0; i < pln; ++i) morph.emplace_back(pl[i]);
std::vector<std::string> stems = generate(word, morph);
return munge_vector(slst, stems);
@@ -2014,6 +2111,10 @@ const char* HunspellImpl::get_version() const {
return get_version_cpp().c_str();
}
+const char* HunspellImpl::get_try_string() const {
+ return pAMgr->get_try_string().c_str();
+}
+
int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) {
std::string d;
bool ret = input_conv(word, d);
@@ -2038,7 +2139,8 @@ int Hunspell::add_dic(const char* dpath, const char* key) {
}
bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
- return m_Impl->spell(word, info, root);
+ std::vector<std::string> candidate_stack;
+ return m_Impl->spell(word, candidate_stack, info, root);
}
std::vector<std::string> Hunspell::suggest(const std::string& word) {
@@ -2073,6 +2175,10 @@ int Hunspell::add(const std::string& word) {
return m_Impl->add(word);
}
+int Hunspell::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) {
+ return m_Impl->add_with_flags(word, flags, desc);
+}
+
int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
return m_Impl->add_with_affix(word, example);
}
@@ -2195,6 +2301,10 @@ int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) {
return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word);
}
+int Hunspell_suffix_suggest(Hunhandle* pHunspell, char*** slst, const char* root_word) {
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->suffix_suggest(slst, root_word);
+}
+
int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) {
return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word);
}
@@ -2232,6 +2342,10 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) {
return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word);
}
+int Hunspell_add_with_flags(Hunhandle* pHunspell, const char* word, const char* flags, const char* desc) {
+ return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_flags(word, flags, desc);
+}
+
/* add word to the run-time dictionary with affix flags of
* the example (a dictionary word): Hunspell will recognize
* affixed forms of the new word, too.