diff options
author | George Hazan <george.hazan@gmail.com> | 2023-10-01 13:24:29 +0300 |
---|---|---|
committer | George Hazan <george.hazan@gmail.com> | 2023-10-01 13:24:29 +0300 |
commit | 93babaf1665bd2b706a096057fdd06572c159622 (patch) | |
tree | efa9da647497149233facb88a422d9d31564bb4b /libs/hunspell/src/hunspell.c++ | |
parent | 13627600541c19643b96bd4c7b3a3d9fd92040f9 (diff) |
fixes #3696 (Update hunspell to 1.7.2)
Diffstat (limited to 'libs/hunspell/src/hunspell.c++')
-rw-r--r-- | libs/hunspell/src/hunspell.c++ | 464 |
1 files changed, 289 insertions, 175 deletions
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++ index 779f8ec241..f340c0ad2c 100644 --- a/libs/hunspell/src/hunspell.c++ +++ b/libs/hunspell/src/hunspell.c++ @@ -68,10 +68,10 @@ * SUCH DAMAGE. */ -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#include <time.h> +#include <cstdlib> +#include <cstring> +#include <cstdio> +#include <ctime> #include "affixmgr.hxx" #include "hunspell.hxx" @@ -88,6 +88,8 @@ class HunspellImpl { public: HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL); + HunspellImpl(const HunspellImpl&) = delete; + HunspellImpl& operator=(const HunspellImpl&) = delete; ~HunspellImpl(); int add_dic(const char* dpath, const char* key = NULL); std::vector<std::string> suffix_suggest(const std::string& root_word); @@ -98,12 +100,15 @@ public: std::vector<std::string> analyze(const std::string& word); int get_langnum() const; bool input_conv(const std::string& word, std::string& dest); - bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); + bool spell(const std::string& word, std::vector<std::string>& candidate_stack, + int* info = NULL, std::string* root = NULL); std::vector<std::string> suggest(const std::string& word); + std::vector<std::string> suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack); const std::string& get_wordchars_cpp() const; const std::vector<w_char>& get_wordchars_utf16() const; const std::string& get_dict_encoding() const; int add(const std::string& word); + int add_with_flags(const std::string& word, const std::string& flags, const std::string& desc = NULL); int add_with_affix(const std::string& word, const std::string& example); int remove(const std::string& word); const std::string& get_version_cpp() const; @@ -120,7 +125,7 @@ public: int generate(char*** slst, const char* word, const char* word2); int generate(char*** slst, const char* word, char** desc, int n); const char* get_wordchars() const; - const char* get_try_string() const { return pAMgr->get_try_string(); } + const char* get_try_string() const; const char* get_version() const; int input_conv(const char* word, char* dest, size_t destsize); @@ -128,7 +133,7 @@ private: AffixMgr* pAMgr; std::vector<HashMgr*> m_HMgrs; SuggestMgr* pSMgr; - char* affixpath; + std::string affixpath; std::string encoding; struct cs_info* csconv; int langnum; @@ -138,9 +143,12 @@ private: private: std::vector<std::string> analyze_internal(const std::string& word); - bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL); + bool spell_internal(const std::string& word, std::vector<std::string>& candidate_stack, + int* info = NULL, std::string* root = NULL); std::vector<std::string> suggest_internal(const std::string& word, - bool& capitalized, size_t& abbreviated, int& captype); + std::vector<std::string>& spell_candidate_stack, + std::vector<std::string>& suggest_candidate_stack, + bool& capitalized, size_t& abbreviated, int& captype); void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev); size_t cleanword2(std::string& dest, std::vector<w_char>& dest_u, @@ -165,16 +173,13 @@ private: std::string::size_type get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr); std::vector<std::string> get_xml_list(const std::string& list, std::string::size_type pos, const char* tag); int check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value); -private: - HunspellImpl(const HunspellImpl&); - HunspellImpl& operator=(const HunspellImpl&); }; -HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) { +HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) + : affixpath(affpath) { csconv = NULL; utf8 = 0; complexprefixes = 0; - affixpath = mystrdup(affpath); /* first set up the hash manager */ m_HMgrs.push_back(new HashMgr(dpath, affpath, key)); @@ -185,7 +190,7 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k /* get the preferred try string and the dictionary */ /* encoding from the Affix Manager for that dictionary */ - char* try_string = pAMgr->get_try_string(); + std::string try_string = pAMgr->get_try_string(); encoding = pAMgr->get_encoding(); langnum = pAMgr->get_langnum(); utf8 = pAMgr->get_utf8(); @@ -196,31 +201,24 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k /* and finally set up the suggestion manager */ pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); - if (try_string) - free(try_string); } HunspellImpl::~HunspellImpl() { delete pSMgr; delete pAMgr; - for (size_t i = 0; i < m_HMgrs.size(); ++i) - delete m_HMgrs[i]; + for (auto& m_HMgr : m_HMgrs) + delete m_HMgr; pSMgr = NULL; pAMgr = NULL; #ifdef MOZILLA_CLIENT delete[] csconv; #endif csconv = NULL; - if (affixpath) - free(affixpath); - affixpath = NULL; } // load extra dictionaries int HunspellImpl::add_dic(const char* dpath, const char* key) { - if (!affixpath) - return 1; - m_HMgrs.push_back(new HashMgr(dpath, affixpath, key)); + m_HMgrs.push_back(new HashMgr(dpath, affixpath.c_str(), key)); return 0; } @@ -264,14 +262,17 @@ size_t HunspellImpl::cleanword2(std::string& dest, clean_ignore(w2, src); const char* q = w2.c_str(); + int nl = (int)w2.size(); // first skip over any leading blanks - while (*q == ' ') + while (*q == ' ') { ++q; - + nl--; + } + // now strip off any trailing periods (recording their presence) *pabbrev = 0; - int nl = strlen(q); + while ((nl > 0) && (*(q + nl - 1) == '.')) { nl--; (*pabbrev)++; @@ -300,15 +301,17 @@ void HunspellImpl::cleanword(std::string& dest, int* pabbrev) { dest.clear(); const unsigned char* q = (const unsigned char*)src.c_str(); - int firstcap = 0; + int firstcap = 0, nl = (int)src.size(); // first skip over any leading blanks - while (*q == ' ') + while (*q == ' ') { ++q; - + nl--; + } + // now strip off any trailing periods (recording their presence) *pabbrev = 0; - int nl = strlen((const char*)q); + while ((nl > 0) && (*(q + nl - 1) == '.')) { nl--; (*pabbrev)++; @@ -340,9 +343,9 @@ void HunspellImpl::cleanword(std::string& dest, } else { std::vector<w_char> t; u8_u16(t, src); - for (size_t i = 0; i < t.size(); ++i) { - unsigned short idx = (t[i].h << 8) + t[i].l; - unsigned short low = unicodetolower(idx, langnum); + for (auto& wc : t) { + const auto idx = (unsigned short)wc; + const auto low = unicodetolower(idx, langnum); if (idx != low) ncap++; if (unicodetoupper(idx, langnum) == low) @@ -350,7 +353,7 @@ void HunspellImpl::cleanword(std::string& dest, } u16_u8(dest, t); if (ncap) { - unsigned short idx = (t[0].h << 8) + t[0].l; + const auto idx = (unsigned short)t[0]; firstcap = (idx != unicodetolower(idx, langnum)); } } @@ -435,8 +438,16 @@ void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& slst.insert(slst.begin(), word); } -bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { - bool r = spell_internal(word, info, root); +bool HunspellImpl::spell(const std::string& word, std::vector<std::string>& candidate_stack, + int* info, std::string* root) { + // something very broken if spell ends up calling itself with the same word + if (std::find(candidate_stack.begin(), candidate_stack.end(), word) != candidate_stack.end()) + return false; + + candidate_stack.push_back(word); + bool r = spell_internal(word, candidate_stack, info, root); + candidate_stack.pop_back(); + if (r && root) { // output conversion RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; @@ -450,7 +461,8 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) return r; } -bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) { +bool HunspellImpl::spell_internal(const std::string& word, std::vector<std::string>& candidate_stack, + int* info, std::string* root) { struct hentry* rv = NULL; int info2 = 0; @@ -488,6 +500,11 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin wl = cleanword2(scw, sunicw, word, &captype, &abbv); } +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + if (wl > 32768) + return false; +#endif + #ifdef MOZILLA_CLIENT // accept the abbreviated words without dots // workaround for the incomplete tokenization of Mozilla @@ -552,8 +569,7 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin //conversion may result in string with different len to pre-mkallsmall2 //so re-scan if (apos != std::string::npos && apos < scw.size() - 1) { - std::string part1 = scw.substr(0, apos+1); - std::string part2 = scw.substr(apos+1); + std::string part1 = scw.substr(0, apos + 1), part2 = scw.substr(apos + 1); if (utf8) { std::vector<w_char> part1u, part2u; u8_u16(part1u, part1); @@ -679,61 +695,78 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin wl = scw.size(); // calculate break points for recursion limit - for (size_t j = 0; j < wordbreak.size(); ++j) { + for (auto& j : wordbreak) { size_t pos = 0; - while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) { + while ((pos = scw.find(j, pos)) != std::string::npos) { ++nbr; - pos += wordbreak[j].size(); + pos += j.size(); } } if (nbr >= 10) return false; // check boundary patterns (^begin and end$) - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t plen = wordbreak[j].size(); + for (auto& j : wordbreak) { + size_t plen = j.size(); if (plen == 1 || plen > wl) continue; - if (wordbreak[j][0] == '^' && - scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1))) + if (j[0] == '^' && + scw.compare(0, plen - 1, j, 1, plen -1) == 0 && spell(scw.substr(plen - 1), candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; + } - if (wordbreak[j][plen - 1] == '$' && - scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) { + if (j[plen - 1] == '$' && + scw.compare(wl - plen + 1, plen - 1, j, 0, plen - 1) == 0) { std::string suffix(scw.substr(wl - plen + 1)); scw.resize(wl - plen + 1); - if (spell(scw)) + if (spell(scw, candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; + } scw.append(suffix); } } // other patterns - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t plen = wordbreak[j].size(); - size_t found = scw.find(wordbreak[j]); + for (auto& j : wordbreak) { + size_t plen = j.size(); + size_t found = scw.find(j); if ((found > 0) && (found < wl - plen)) { - size_t found2 = scw.find(wordbreak[j], found + 1); + size_t found2 = scw.find(j, found + 1); // try to break at the second occurance // to recognize dictionary words with wordbreak if (found2 > 0 && (found2 < wl - plen)) found = found2; - if (!spell(scw.substr(found + plen))) + std::string substring(scw.substr(found + plen)); + if (!spell(substring, candidate_stack)) continue; std::string suffix(scw.substr(found)); scw.resize(found); // examine 2 sides of the break point - if (spell(scw)) + if (spell(scw, candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; + } scw.append(suffix); // LANG_hu: spec. dash rule - if (langnum == LANG_hu && wordbreak[j] == "-") { + if (langnum == LANG_hu && j == "-") { suffix = scw.substr(found + 1); scw.resize(found + 1); - if (spell(scw)) + if (spell(scw, candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; // check the first part with dash + } scw.append(suffix); } // end of LANG specific region @@ -741,25 +774,32 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin } // other patterns (break at first break point) - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t plen = wordbreak[j].size(); - size_t found = scw.find(wordbreak[j]); + for (auto& j : wordbreak) { + size_t plen = j.size(), found = scw.find(j); if ((found > 0) && (found < wl - plen)) { - if (!spell(scw.substr(found + plen))) + if (!spell(scw.substr(found + plen), candidate_stack)) continue; std::string suffix(scw.substr(found)); scw.resize(found); // examine 2 sides of the break point - if (spell(scw)) + if (spell(scw, candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; + } scw.append(suffix); // LANG_hu: spec. dash rule - if (langnum == LANG_hu && wordbreak[j] == "-") { + if (langnum == LANG_hu && j == "-") { suffix = scw.substr(found + 1); scw.resize(found + 1); - if (spell(scw)) + if (spell(scw, candidate_stack)) + { + if (info) + *info |= SPELL_COMPOUND; return true; // check the first part with dash + } scw.append(suffix); } // end of LANG specific region @@ -771,33 +811,28 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin } struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) { - std::string w2; - const char* word; - int len; + std::string word; // remove IGNORE characters from the string - clean_ignore(w2, w); - - word = w2.c_str(); - len = w2.size(); + clean_ignore(word, w); - if (!len) + if (word.empty()) return NULL; // word reversing wrapper for complex prefixes if (complexprefixes) { if (utf8) - reverseword_utf(w2); + reverseword_utf(word); else - reverseword(w2); + reverseword(word); } - word = w2.c_str(); + int len = word.size(); // look word in hash table struct hentry* he = NULL; for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { - he = m_HMgrs[i]->lookup(word); + he = m_HMgrs[i]->lookup(word.c_str(), word.size()); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && @@ -828,8 +863,8 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str // check with affixes if (!he && pAMgr) { - // try stripping off affixes */ - he = pAMgr->affix_check(word, len, 0); + // try stripping off affixes + he = pAMgr->affix_check(word, 0, len, 0); // check compound restriction and onlyupcase if (he && he->astr && @@ -858,11 +893,35 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str } // try check compound word } else if (pAMgr->get_compound()) { - struct hentry* rwords[100]; // buffer for COMPOUND pattern checking - he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info); + struct hentry* rwords[100] = {}; // buffer for COMPOUND pattern checking + + // first allow only 2 words in the compound + int setinfo = SPELL_COMPOUND_2; + if (info) + setinfo |= *info; + he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, &setinfo); + if (info) + *info = setinfo & ~SPELL_COMPOUND_2; + // if not 2-word compoud word, try with 3 or more words + // (only if original info didn't forbid it) + if (!he && info && !(*info & SPELL_COMPOUND_2)) { + *info &= ~SPELL_COMPOUND_2; + he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info); + // accept the compound with 3 or more words only if it is + // - not a dictionary word with a typo and + // - not two words written separately, + // - or if it's an arbitrary number accepted by compound rules (e.g. 999%) + if (he && !isdigit(word[0])) + { + std::vector<std::string> slst; + if (pSMgr->suggest(slst, word, NULL, /*test_simplesug=*/true)) + he = NULL; + } + } + // LANG_hu section: `moving rule' with last dash if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) { - std::string dup(word, len - 1); + std::string dup(word, 0, len - 1); he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info); } // end of LANG specific region @@ -885,31 +944,48 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str return he; } -std::vector<std::string> HunspellImpl::suggest(const std::string& word) { +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +#define MAX_CANDIDATE_STACK_DEPTH 512 +#else +#define MAX_CANDIDATE_STACK_DEPTH 2048 +#endif + +std::vector<std::string> HunspellImpl::suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack) { + + if (suggest_candidate_stack.size() > MAX_CANDIDATE_STACK_DEPTH || // apply a fairly arbitrary depth limit + // something very broken if suggest ends up calling itself with the same word + std::find(suggest_candidate_stack.begin(), suggest_candidate_stack.end(), word) != suggest_candidate_stack.end()) { + return { }; + } + bool capwords; size_t abbv; int captype; - std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype); + std::vector<std::string> spell_candidate_stack; + suggest_candidate_stack.push_back(word); + std::vector<std::string> slst = suggest_internal(word, spell_candidate_stack, suggest_candidate_stack, + capwords, abbv, captype); + suggest_candidate_stack.pop_back(); // word reversing wrapper for complex prefixes if (complexprefixes) { - for (size_t j = 0; j < slst.size(); ++j) { + for (auto& j : slst) { if (utf8) - reverseword_utf(slst[j]); + reverseword_utf(j); else - reverseword(slst[j]); + reverseword(j); } } // capitalize if (capwords) - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); + for (auto& j : slst) { + mkinitcap(j); } // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (size_t j = 0; j < slst.size(); ++j) { - slst[j].append(word.substr(word.size() - abbv)); + if (abbv && pAMgr && pAMgr->get_sugswithdots() && word.size() >= abbv) { + for (auto& j : slst) { + j.append(word.substr(word.size() - abbv)); } } @@ -920,7 +996,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { case ALLCAP: { size_t l = 0; for (size_t j = 0; j < slst.size(); ++j) { - if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { + if (slst[j].find(' ') == std::string::npos && !spell(slst[j], spell_candidate_stack)) { std::string s; std::vector<w_char> w; if (utf8) { @@ -929,12 +1005,12 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { s = slst[j]; } mkallsmall2(s, w); - if (spell(s)) { + if (spell(s, spell_candidate_stack)) { slst[l] = s; ++l; } else { mkinitcap2(s, w); - if (spell(s)) { + if (spell(s, spell_candidate_stack)) { slst[l] = s; ++l; } @@ -976,7 +1052,14 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { return slst; } +std::vector<std::string> HunspellImpl::suggest(const std::string& word) { + std::vector<std::string> suggest_candidate_stack; + return suggest(word, suggest_candidate_stack); +} + std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, + std::vector<std::string>& spell_candidate_stack, + std::vector<std::string>& suggest_candidate_stack, bool& capwords, size_t& abbv, int& captype) { captype = NOCAP; abbv = 0; @@ -1017,6 +1100,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, if (wl == 0) return slst; + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + if (wl > 32768) + return slst; +#endif } bool good = false; @@ -1038,13 +1126,13 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, switch (captype) { case NOCAP: { - good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, scw, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; if (abbv) { std::string wspace(scw); wspace.push_back('.'); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; } @@ -1053,12 +1141,12 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, case INITCAP: { capwords = true; - good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, scw, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; std::string wspace(scw); mkallsmall2(wspace, sunicw); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; break; @@ -1067,7 +1155,7 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, capwords = true; /* FALLTHROUGH */ case HUHCAP: { - good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, scw, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; // something.The -> something. The @@ -1095,23 +1183,23 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, // TheOpenOffice.org -> The OpenOffice.org wspace = scw; mkinitsmall2(wspace, sunicw); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; } wspace = scw; mkallsmall2(wspace, sunicw); - if (spell(wspace.c_str())) + if (spell(wspace, spell_candidate_stack)) insert_sug(slst, wspace); size_t prevns = slst.size(); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; if (captype == HUHINITCAP) { mkinitcap2(wspace, sunicw); - if (spell(wspace.c_str())) + if (spell(wspace, spell_candidate_stack)) insert_sug(slst, wspace); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; } @@ -1140,22 +1228,22 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, case ALLCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; - if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) + if (pAMgr && pAMgr->get_keepcase() && spell(wspace, spell_candidate_stack)) insert_sug(slst, wspace); mkinitcap2(wspace, sunicw); - good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace, &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; - for (size_t j = 0; j < slst.size(); ++j) { - mkallcap(slst[j]); + for (auto& j : slst) { + mkallcap(j); if (pAMgr && pAMgr->get_checksharps()) { if (utf8) { - mystrrep(slst[j], "\xC3\x9F", "SS"); + mystrrep(j, "\xC3\x9F", "SS"); } else { - mystrrep(slst[j], "\xDF", "SS"); + mystrrep(j, "\xDF", "SS"); } } } @@ -1165,17 +1253,17 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, // LANG_hu section: replace '-' with ' ' in Hungarian if (langnum == LANG_hu) { - for (size_t j = 0; j < slst.size(); ++j) { - size_t pos = slst[j].find('-'); + for (auto& j : slst) { + size_t pos = j.find('-'); if (pos != std::string::npos) { int info; - std::string w(slst[j].substr(0, pos)); - w.append(slst[j].substr(pos + 1)); - (void)spell(w, &info, NULL); + std::string w(j.substr(0, pos)); + w.append(j.substr(pos + 1)); + (void)spell(w, spell_candidate_stack, &info, NULL); if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { - slst[j][pos] = ' '; + j[pos] = ' '; } else - slst[j][pos] = '-'; + j[pos] = '-'; } } } @@ -1246,11 +1334,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, if (dash_pos == scw.size()) last = 1; std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); - if (!spell(chunk.c_str())) { - std::vector<std::string> nlst = suggest(chunk.c_str()); + if (chunk != word && !spell(chunk, spell_candidate_stack)) { + std::vector<std::string> nlst = suggest(chunk, suggest_candidate_stack); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; - for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) { + for (auto j = nlst.rbegin(); j != nlst.rend(); ++j) { std::string wspace = scw.substr(0, prev_pos); wspace.append(*j); if (!last) { @@ -1286,12 +1374,11 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc std::string result2; if (desc.empty()) return slst; - for (size_t i = 0; i < desc.size(); ++i) { - + for (const auto& i : desc) { std::string result; // add compound word parts (except the last one) - const char* s = desc[i].c_str(); + const char* s = i.c_str(); const char* part = strstr(s, MORPH_PART); if (part) { const char* nextpart = strstr(part + 1, MORPH_PART); @@ -1311,34 +1398,34 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc tok[alt + 1] = MSEP_ALT; } std::vector<std::string> pl = line_tok(tok, MSEP_ALT); - for (size_t k = 0; k < pl.size(); ++k) { + for (auto& k : pl) { // add derivational suffixes - if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) { + if (k.find(MORPH_DERI_SFX) != std::string::npos) { // remove inflectional suffixes - const size_t is = pl[k].find(MORPH_INFL_SFX); + const size_t is = k.find(MORPH_INFL_SFX); if (is != std::string::npos) - pl[k].resize(is); + k.resize(is); std::vector<std::string> singlepl; - singlepl.push_back(pl[k]); - std::string sg = pSMgr->suggest_gen(singlepl, pl[k]); + singlepl.push_back(k); + std::string sg = pSMgr->suggest_gen(singlepl, k); if (!sg.empty()) { std::vector<std::string> gen = line_tok(sg, MSEP_REC); - for (size_t j = 0; j < gen.size(); ++j) { + for (auto& j : gen) { result2.push_back(MSEP_REC); result2.append(result); - result2.append(gen[j]); + result2.append(j); } } } else { result2.push_back(MSEP_REC); result2.append(result); - if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) { + if (k.find(MORPH_SURF_PFX) != std::string::npos) { std::string field; - copy_field(field, pl[k], MORPH_SURF_PFX); + copy_field(field, k, MORPH_SURF_PFX); result2.append(field); } std::string field; - copy_field(field, pl[k], MORPH_STEM); + copy_field(field, k, MORPH_STEM); result2.append(field); } } @@ -1397,6 +1484,12 @@ int HunspellImpl::add(const std::string& word) { return 0; } +int HunspellImpl::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) { + if (!m_HMgrs.empty()) + return m_HMgrs[0]->add_with_flags(word, flags, desc); + return 0; +} + int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) { if (!m_HMgrs.empty()) return m_HMgrs[0]->add_with_affix(word, example); @@ -1441,7 +1534,7 @@ std::vector<std::string> HunspellImpl::analyze(const std::string& word) { } std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) { - std::vector<std::string> slst; + std::vector<std::string> candidate_stack, slst; if (!pSMgr || m_HMgrs.empty()) return slst; if (utf8) { @@ -1593,12 +1686,11 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) if (dash_pos != std::string::npos) { int nresult = 0; - std::string part1 = scw.substr(0, dash_pos); - std::string part2 = scw.substr(dash_pos+1); + std::string part1 = scw.substr(0, dash_pos), part2 = scw.substr(dash_pos + 1); // examine 2 sides of the dash if (part2.empty()) { // base word ending with dash - if (spell(part1)) { + if (spell(part1, candidate_stack)) { std::string p = pSMgr->suggest_morph(part1); if (!p.empty()) { slst = line_tok(p, MSEP_REC); @@ -1606,7 +1698,7 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) } } } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat. - if (spell(part1) && (spell("-e"))) { + if (spell(part1, candidate_stack) && (spell("-e", candidate_stack))) { std::string st = pSMgr->suggest_morph(part1); if (!st.empty()) { result.append(st); @@ -1621,9 +1713,9 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) } else { // first word ending with dash: word- XXX ??? part1.push_back(' '); - nresult = spell(part1); + nresult = spell(part1, candidate_stack); part1.erase(part1.size() - 1); - if (nresult && spell(part2) && + if (nresult && spell(part2, candidate_stack) && ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) { std::string st = pSMgr->suggest_morph(part1); if (!st.empty()) { @@ -1678,14 +1770,13 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s if (!pSMgr || pl.empty()) return slst; std::vector<std::string> pl2 = analyze(word); - int captype = NOCAP; - int abbv = 0; + int captype = NOCAP, abbv = 0; std::string cw; cleanword(cw, word, &captype, &abbv); std::string result; - for (size_t i = 0; i < pl.size(); ++i) { - cat_result(result, pSMgr->suggest_gen(pl2, pl[i])); + for (const auto& i : pl) { + cat_result(result, pSMgr->suggest_gen(pl2, i)); } if (!result.empty()) { @@ -1698,16 +1789,17 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s // capitalize if (captype == INITCAP || captype == HUHINITCAP) { - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); + for (auto& str : slst) { + mkinitcap(str); } } // temporary filtering of prefix related errors (eg. // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") - std::vector<std::string>::iterator it = slst.begin(); + auto it = slst.begin(); while (it != slst.end()) { - if (!spell(*it)) { + std::vector<std::string> candidate_stack; + if (!spell(*it, candidate_stack)) { it = slst.erase(it); } else { ++it; @@ -1778,10 +1870,8 @@ std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::stri int HunspellImpl::check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value) { - std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr)); - if (cw == value) - return 1; - return 0; + const std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr)); + return cw == value ? 1 : 0; } std::vector<std::string> HunspellImpl::get_xml_list(const std::string& list, std::string::size_type pos, const char* tag) { @@ -1826,10 +1916,9 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { // convert the result to <code><a>ana1</a><a>ana2</a></code> format std::string r; r.append("<code>"); - for (size_t i = 0; i < slst.size(); ++i) { + for (auto entry : slst) { r.append("<a>"); - std::string entry(slst[i]); mystrrep(entry, "\t", " "); mystrrep(entry, "&", "&"); mystrrep(entry, "<", "<"); @@ -1902,34 +1991,43 @@ std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_wo remove_ignored_chars(w2, ignoredchars); } word = w2.c_str(); - } else + len = (int)w2.size(); + } else { word = root_word.c_str(); - - len = strlen(word); + len = (int)root_word.size(); + } if (!len) return slst; for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { - he = m_HMgrs[i]->lookup(word); + he = m_HMgrs[i]->lookup(word, len); } if (he) { - slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str()); + slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word); } return slst; } namespace { + // using malloc because this is for the c-api where the callers + // expect to be able to use free + char* stringdup(const std::string& s) { + size_t sl = s.size() + 1; + char* d = (char*)malloc(sl); + if (d) + memcpy(d, s.c_str(), sl); + return d; + } + int munge_vector(char*** slst, const std::vector<std::string>& items) { if (items.empty()) { *slst = NULL; return 0; } else { - *slst = (char**)malloc(sizeof(char*) * items.size()); - if (!*slst) - return 0; + *slst = new char*[items.size()]; for (size_t i = 0; i < items.size(); ++i) - (*slst)[i] = mystrdup(items[i].c_str()); + (*slst)[i] = stringdup(items[i]); } return items.size(); } @@ -1937,12 +2035,13 @@ namespace { int HunspellImpl::spell(const char* word, int* info, char** root) { std::string sroot; - bool ret = spell(word, info, root ? &sroot : NULL); + std::vector<std::string> candidate_stack; + bool ret = spell(word, candidate_stack, info, root ? &sroot : NULL); if (root) { if (sroot.empty()) { *root = NULL; } else { - *root = mystrdup(sroot.c_str()); + *root = stringdup(sroot); } } return ret; @@ -1962,7 +2061,7 @@ void HunspellImpl::free_list(char*** slst, int n) { if (slst && *slst) { for (int i = 0; i < n; i++) free((*slst)[i]); - free(*slst); + delete[] *slst; *slst = NULL; } } @@ -1984,8 +2083,7 @@ int HunspellImpl::stem(char*** slst, const char* word) { int HunspellImpl::stem(char*** slst, char** desc, int n) { std::vector<std::string> morph; morph.reserve(n); - for (int i = 0; i < n; ++i) - morph.push_back(desc[i]); + for (int i = 0; i < n; ++i) morph.emplace_back(desc[i]); std::vector<std::string> stems = stem(morph); return munge_vector(slst, stems); @@ -1999,8 +2097,7 @@ int HunspellImpl::generate(char*** slst, const char* word, const char* pattern) int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) { std::vector<std::string> morph; morph.reserve(pln); - for (int i = 0; i < pln; ++i) - morph.push_back(pl[i]); + for (int i = 0; i < pln; ++i) morph.emplace_back(pl[i]); std::vector<std::string> stems = generate(word, morph); return munge_vector(slst, stems); @@ -2014,6 +2111,10 @@ const char* HunspellImpl::get_version() const { return get_version_cpp().c_str(); } +const char* HunspellImpl::get_try_string() const { + return pAMgr->get_try_string().c_str(); +} + int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) { std::string d; bool ret = input_conv(word, d); @@ -2038,7 +2139,8 @@ int Hunspell::add_dic(const char* dpath, const char* key) { } bool Hunspell::spell(const std::string& word, int* info, std::string* root) { - return m_Impl->spell(word, info, root); + std::vector<std::string> candidate_stack; + return m_Impl->spell(word, candidate_stack, info, root); } std::vector<std::string> Hunspell::suggest(const std::string& word) { @@ -2073,6 +2175,10 @@ int Hunspell::add(const std::string& word) { return m_Impl->add(word); } +int Hunspell::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) { + return m_Impl->add_with_flags(word, flags, desc); +} + int Hunspell::add_with_affix(const std::string& word, const std::string& example) { return m_Impl->add_with_affix(word, example); } @@ -2195,6 +2301,10 @@ int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word); } +int Hunspell_suffix_suggest(Hunhandle* pHunspell, char*** slst, const char* root_word) { + return reinterpret_cast<HunspellImpl*>(pHunspell)->suffix_suggest(slst, root_word); +} + int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word); } @@ -2232,6 +2342,10 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) { return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word); } +int Hunspell_add_with_flags(Hunhandle* pHunspell, const char* word, const char* flags, const char* desc) { + return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_flags(word, flags, desc); +} + /* add word to the run-time dictionary with affix flags of * the example (a dictionary word): Hunspell will recognize * affixed forms of the new word, too. |