fixes #3696 (Update hunspell to 1.7.2)

author: George Hazan <george.hazan@gmail.com> 2023-10-01 13:24:29 +0300
committer: George Hazan <george.hazan@gmail.com> 2023-10-01 13:24:29 +0300
commit: 93babaf1665bd2b706a096057fdd06572c159622 (patch)
tree: efa9da647497149233facb88a422d9d31564bb4b /libs/hunspell/src/hunspell.c++
parent: 13627600541c19643b96bd4c7b3a3d9fd92040f9 (diff)
1 files changed, 289 insertions, 175 deletions
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++
index 779f8ec241..f340c0ad2c 100644
--- a/libs/hunspell/src/hunspell.c++
+++ b/libs/hunspell/src/hunspell.c++
@@ -68,10 +68,10 @@
  * SUCH DAMAGE.
  */
 
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <time.h>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <ctime>
 
 #include "affixmgr.hxx"
 #include "hunspell.hxx"
@@ -88,6 +88,8 @@ class HunspellImpl
 {
 public:
   HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL);
+  HunspellImpl(const HunspellImpl&) = delete;
+  HunspellImpl& operator=(const HunspellImpl&) = delete;
   ~HunspellImpl();
   int add_dic(const char* dpath, const char* key = NULL);
   std::vector<std::string> suffix_suggest(const std::string& root_word);
@@ -98,12 +100,15 @@ public:
   std::vector<std::string> analyze(const std::string& word);
   int get_langnum() const;
   bool input_conv(const std::string& word, std::string& dest);
-  bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
+  bool spell(const std::string& word, std::vector<std::string>& candidate_stack,
+             int* info = NULL, std::string* root = NULL);
   std::vector<std::string> suggest(const std::string& word);
+  std::vector<std::string> suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack);
   const std::string& get_wordchars_cpp() const;
   const std::vector<w_char>& get_wordchars_utf16() const;
   const std::string& get_dict_encoding() const;
   int add(const std::string& word);
+  int add_with_flags(const std::string& word, const std::string& flags, const std::string& desc = NULL);
   int add_with_affix(const std::string& word, const std::string& example);
   int remove(const std::string& word);
   const std::string& get_version_cpp() const;
@@ -120,7 +125,7 @@ public:
   int generate(char*** slst, const char* word, const char* word2);
   int generate(char*** slst, const char* word, char** desc, int n);
   const char* get_wordchars() const;
-  const char* get_try_string() const { return pAMgr->get_try_string(); }
+  const char* get_try_string() const;
   const char* get_version() const;
   int input_conv(const char* word, char* dest, size_t destsize);
 
@@ -128,7 +133,7 @@ private:
   AffixMgr* pAMgr;
   std::vector<HashMgr*> m_HMgrs;
   SuggestMgr* pSMgr;
-  char* affixpath;
+  std::string affixpath;
   std::string encoding;
   struct cs_info* csconv;
   int langnum;
@@ -138,9 +143,12 @@ private:
 
 private:
   std::vector<std::string> analyze_internal(const std::string& word);
-  bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL);
+  bool spell_internal(const std::string& word, std::vector<std::string>& candidate_stack,
+                      int* info = NULL, std::string* root = NULL);
   std::vector<std::string> suggest_internal(const std::string& word,
-                    bool& capitalized, size_t& abbreviated, int& captype);
+                                            std::vector<std::string>& spell_candidate_stack,
+                                            std::vector<std::string>& suggest_candidate_stack,
+                                            bool& capitalized, size_t& abbreviated, int& captype);
   void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev);
   size_t cleanword2(std::string& dest,
                     std::vector<w_char>& dest_u,
@@ -165,16 +173,13 @@ private:
   std::string::size_type get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr);
   std::vector<std::string> get_xml_list(const std::string& list, std::string::size_type pos, const char* tag);
   int check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value);
-private:
-  HunspellImpl(const HunspellImpl&);
-  HunspellImpl& operator=(const HunspellImpl&);
 };
 
-HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) {
+HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key)
+  : affixpath(affpath) {
   csconv = NULL;
   utf8 = 0;
   complexprefixes = 0;
-  affixpath = mystrdup(affpath);
 
   /* first set up the hash manager */
   m_HMgrs.push_back(new HashMgr(dpath, affpath, key));
@@ -185,7 +190,7 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k
 
   /* get the preferred try string and the dictionary */
   /* encoding from the Affix Manager for that dictionary */
-  char* try_string = pAMgr->get_try_string();
+  std::string try_string = pAMgr->get_try_string();
   encoding = pAMgr->get_encoding();
   langnum = pAMgr->get_langnum();
   utf8 = pAMgr->get_utf8();
@@ -196,31 +201,24 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k
 
   /* and finally set up the suggestion manager */
   pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
-  if (try_string)
-    free(try_string);
 }
 
 HunspellImpl::~HunspellImpl() {
   delete pSMgr;
   delete pAMgr;
-  for (size_t i = 0; i < m_HMgrs.size(); ++i)
-    delete m_HMgrs[i];
+  for (auto& m_HMgr : m_HMgrs)
+    delete m_HMgr;
   pSMgr = NULL;
   pAMgr = NULL;
 #ifdef MOZILLA_CLIENT
   delete[] csconv;
 #endif
   csconv = NULL;
-  if (affixpath)
-    free(affixpath);
-  affixpath = NULL;
 }
 
 // load extra dictionaries
 int HunspellImpl::add_dic(const char* dpath, const char* key) {
-  if (!affixpath)
-    return 1;
-  m_HMgrs.push_back(new HashMgr(dpath, affixpath, key));
+  m_HMgrs.push_back(new HashMgr(dpath, affixpath.c_str(), key));
   return 0;
 }
 
@@ -264,14 +262,17 @@ size_t HunspellImpl::cleanword2(std::string& dest,
   clean_ignore(w2, src);
 
   const char* q = w2.c_str();
+  int nl = (int)w2.size();
 
   // first skip over any leading blanks
-  while (*q == ' ')
+  while (*q == ' ') {
     ++q;
-
+    nl--;
+  }
+	
   // now strip off any trailing periods (recording their presence)
   *pabbrev = 0;
-  int nl = strlen(q);
+  
   while ((nl > 0) && (*(q + nl - 1) == '.')) {
     nl--;
     (*pabbrev)++;
@@ -300,15 +301,17 @@ void HunspellImpl::cleanword(std::string& dest,
                         int* pabbrev) {
   dest.clear();
   const unsigned char* q = (const unsigned char*)src.c_str();
-  int firstcap = 0;
+  int firstcap = 0, nl = (int)src.size();
 
   // first skip over any leading blanks
-  while (*q == ' ')
+  while (*q == ' ') {
     ++q;
-
+    nl--;
+  }
+  
   // now strip off any trailing periods (recording their presence)
   *pabbrev = 0;
-  int nl = strlen((const char*)q);
+  
   while ((nl > 0) && (*(q + nl - 1) == '.')) {
     nl--;
     (*pabbrev)++;
@@ -340,9 +343,9 @@ void HunspellImpl::cleanword(std::string& dest,
   } else {
     std::vector<w_char> t;
     u8_u16(t, src);
-    for (size_t i = 0; i < t.size(); ++i) {
-      unsigned short idx = (t[i].h << 8) + t[i].l;
-      unsigned short low = unicodetolower(idx, langnum);
+    for (auto& wc : t) {
+      const auto idx = (unsigned short)wc;
+      const auto low = unicodetolower(idx, langnum);
       if (idx != low)
         ncap++;
       if (unicodetoupper(idx, langnum) == low)
@@ -350,7 +353,7 @@ void HunspellImpl::cleanword(std::string& dest,
     }
     u16_u8(dest, t);
     if (ncap) {
-      unsigned short idx = (t[0].h << 8) + t[0].l;
+      const auto idx = (unsigned short)t[0];
       firstcap = (idx != unicodetolower(idx, langnum));
     }
   }
@@ -435,8 +438,16 @@ void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string&
   slst.insert(slst.begin(), word);
 }
 
-bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
-  bool r = spell_internal(word, info, root);
+bool HunspellImpl::spell(const std::string& word, std::vector<std::string>& candidate_stack,
+                         int* info, std::string* root) {
+  // something very broken if spell ends up calling itself with the same word
+  if (std::find(candidate_stack.begin(), candidate_stack.end(), word) != candidate_stack.end())
+    return false;
+
+  candidate_stack.push_back(word);
+  bool r = spell_internal(word, candidate_stack, info, root);
+  candidate_stack.pop_back();
+
   if (r && root) {
     // output conversion
     RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
@@ -450,7 +461,8 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
   return r;
 }
 
-bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) {
+bool HunspellImpl::spell_internal(const std::string& word, std::vector<std::string>& candidate_stack,
+                                  int* info, std::string* root) {
   struct hentry* rv = NULL;
 
   int info2 = 0;
@@ -488,6 +500,11 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
       wl = cleanword2(scw, sunicw, word, &captype, &abbv);
   }
 
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+    if (wl > 32768)
+      return false;
+#endif
+
 #ifdef MOZILLA_CLIENT
   // accept the abbreviated words without dots
   // workaround for the incomplete tokenization of Mozilla
@@ -552,8 +569,7 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
         //conversion may result in string with different len to pre-mkallsmall2
         //so re-scan
         if (apos != std::string::npos && apos < scw.size() - 1) {
-          std::string part1 = scw.substr(0, apos+1);
-          std::string part2 = scw.substr(apos+1);
+          std::string part1 = scw.substr(0, apos + 1), part2 = scw.substr(apos + 1);
           if (utf8) {
             std::vector<w_char> part1u, part2u;
             u8_u16(part1u, part1);
@@ -679,61 +695,78 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
     wl = scw.size();
 
     // calculate break points for recursion limit
-    for (size_t j = 0; j < wordbreak.size(); ++j) {
+    for (auto& j : wordbreak) {
       size_t pos = 0;
-      while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) {
+      while ((pos = scw.find(j, pos)) != std::string::npos) {
         ++nbr;
-        pos += wordbreak[j].size();
+        pos += j.size();
       }
     }
     if (nbr >= 10)
       return false;
 
     // check boundary patterns (^begin and end$)
-    for (size_t j = 0; j < wordbreak.size(); ++j) {
-      size_t plen = wordbreak[j].size();
+    for (auto& j : wordbreak) {
+      size_t plen = j.size();
       if (plen == 1 || plen > wl)
         continue;
 
-      if (wordbreak[j][0] == '^' &&
-          scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1)))
+      if (j[0] == '^' &&
+          scw.compare(0, plen - 1, j, 1, plen -1) == 0 && spell(scw.substr(plen - 1), candidate_stack))
+      {
+        if (info)
+          *info |= SPELL_COMPOUND;
         return true;
+      }
 
-      if (wordbreak[j][plen - 1] == '$' &&
-          scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) {
+      if (j[plen - 1] == '$' &&
+          scw.compare(wl - plen + 1, plen - 1, j, 0, plen - 1) == 0) {
         std::string suffix(scw.substr(wl - plen + 1));
         scw.resize(wl - plen + 1);
-        if (spell(scw))
+        if (spell(scw, candidate_stack))
+        {
+          if (info)
+            *info |= SPELL_COMPOUND;
           return true;
+        }
         scw.append(suffix);
       }
     }
 
     // other patterns
-    for (size_t j = 0; j < wordbreak.size(); ++j) {
-      size_t plen = wordbreak[j].size();
-      size_t found = scw.find(wordbreak[j]);
+    for (auto& j : wordbreak) {
+      size_t plen = j.size();
+      size_t found = scw.find(j);
       if ((found > 0) && (found < wl - plen)) {
-        size_t found2 = scw.find(wordbreak[j], found + 1);
+        size_t found2 = scw.find(j, found + 1);
         // try to break at the second occurance
         // to recognize dictionary words with wordbreak
         if (found2 > 0 && (found2 < wl - plen))
             found = found2;
-        if (!spell(scw.substr(found + plen)))
+        std::string substring(scw.substr(found + plen));
+        if (!spell(substring, candidate_stack))
           continue;
         std::string suffix(scw.substr(found));
         scw.resize(found);
         // examine 2 sides of the break point
-        if (spell(scw))
+        if (spell(scw, candidate_stack))
+        {
+          if (info)
+            *info |= SPELL_COMPOUND;
           return true;
+        }
         scw.append(suffix);
 
         // LANG_hu: spec. dash rule
-        if (langnum == LANG_hu && wordbreak[j] == "-") {
+        if (langnum == LANG_hu && j == "-") {
           suffix = scw.substr(found + 1);
           scw.resize(found + 1);
-          if (spell(scw))
+          if (spell(scw, candidate_stack))
+          {
+            if (info)
+              *info |= SPELL_COMPOUND;
             return true;  // check the first part with dash
+          }
           scw.append(suffix);
         }
         // end of LANG specific region
@@ -741,25 +774,32 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
     }
 
     // other patterns (break at first break point)
-    for (size_t j = 0; j < wordbreak.size(); ++j) {
-      size_t plen = wordbreak[j].size();
-      size_t found = scw.find(wordbreak[j]);
+    for (auto& j : wordbreak) {
+      size_t plen = j.size(), found = scw.find(j);
       if ((found > 0) && (found < wl - plen)) {
-        if (!spell(scw.substr(found + plen)))
+        if (!spell(scw.substr(found + plen), candidate_stack))
           continue;
         std::string suffix(scw.substr(found));
         scw.resize(found);
         // examine 2 sides of the break point
-        if (spell(scw))
+        if (spell(scw, candidate_stack))
+        {
+          if (info)
+            *info |= SPELL_COMPOUND;
           return true;
+        }
         scw.append(suffix);
 
         // LANG_hu: spec. dash rule
-        if (langnum == LANG_hu && wordbreak[j] == "-") {
+        if (langnum == LANG_hu && j == "-") {
           suffix = scw.substr(found + 1);
           scw.resize(found + 1);
-          if (spell(scw))
+          if (spell(scw, candidate_stack))
+          {
+            if (info)
+              *info |= SPELL_COMPOUND;
             return true;  // check the first part with dash
+          }
           scw.append(suffix);
         }
         // end of LANG specific region
@@ -771,33 +811,28 @@ bool HunspellImpl::spell_internal(const std::string& word, int* info, std::strin
 }
 
 struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) {
-  std::string w2;
-  const char* word;
-  int len;
+  std::string word;
 
   // remove IGNORE characters from the string
-  clean_ignore(w2, w);
-
-  word = w2.c_str();
-  len = w2.size();
+  clean_ignore(word, w);
 
-  if (!len)
+  if (word.empty())
     return NULL;
 
   // word reversing wrapper for complex prefixes
   if (complexprefixes) {
     if (utf8)
-      reverseword_utf(w2);
+      reverseword_utf(word);
     else
-      reverseword(w2);
+      reverseword(word);
   }
 
-  word = w2.c_str();
+  int len = word.size();
 
   // look word in hash table
   struct hentry* he = NULL;
   for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
-    he = m_HMgrs[i]->lookup(word);
+    he = m_HMgrs[i]->lookup(word.c_str(), word.size());
 
     // check forbidden and onlyincompound words
     if ((he) && (he->astr) && (pAMgr) &&
@@ -828,8 +863,8 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
 
   // check with affixes
   if (!he && pAMgr) {
-    // try stripping off affixes */
-    he = pAMgr->affix_check(word, len, 0);
+    // try stripping off affixes
+    he = pAMgr->affix_check(word, 0, len, 0);
 
     // check compound restriction and onlyupcase
     if (he && he->astr &&
@@ -858,11 +893,35 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
       }
       // try check compound word
     } else if (pAMgr->get_compound()) {
-      struct hentry* rwords[100];  // buffer for COMPOUND pattern checking
-      he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
+      struct hentry* rwords[100] = {};  // buffer for COMPOUND pattern checking
+
+      // first allow only 2 words in the compound
+      int setinfo = SPELL_COMPOUND_2;
+      if (info)
+        setinfo |= *info;
+      he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, &setinfo);
+      if (info)
+        *info = setinfo & ~SPELL_COMPOUND_2;
+      // if not 2-word compoud word, try with 3 or more words
+      // (only if original info didn't forbid it)
+      if (!he && info && !(*info & SPELL_COMPOUND_2)) {
+        *info &= ~SPELL_COMPOUND_2;
+        he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
+        // accept the compound with 3 or more words only if it is
+        // - not a dictionary word with a typo and
+        // - not two words written separately,
+        // - or if it's an arbitrary number accepted by compound rules (e.g. 999%)
+        if (he && !isdigit(word[0]))
+        {
+          std::vector<std::string> slst;
+          if (pSMgr->suggest(slst, word, NULL, /*test_simplesug=*/true))
+            he = NULL;
+        }
+      }
+
       // LANG_hu section: `moving rule' with last dash
       if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
-        std::string dup(word, len - 1);
+        std::string dup(word, 0, len - 1);
         he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info);
       }
       // end of LANG specific region
@@ -885,31 +944,48 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str
   return he;
 }
 
-std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define MAX_CANDIDATE_STACK_DEPTH 512
+#else
+#define MAX_CANDIDATE_STACK_DEPTH 2048
+#endif
+
+std::vector<std::string> HunspellImpl::suggest(const std::string& word, std::vector<std::string>& suggest_candidate_stack) {
+
+  if (suggest_candidate_stack.size() > MAX_CANDIDATE_STACK_DEPTH || // apply a fairly arbitrary depth limit
+      // something very broken if suggest ends up calling itself with the same word
+      std::find(suggest_candidate_stack.begin(), suggest_candidate_stack.end(), word) != suggest_candidate_stack.end()) {
+    return { };
+  }
+
   bool capwords;
   size_t abbv;
   int captype;
-  std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype);
+  std::vector<std::string> spell_candidate_stack;
+  suggest_candidate_stack.push_back(word);
+  std::vector<std::string> slst = suggest_internal(word, spell_candidate_stack, suggest_candidate_stack,
+		                                   capwords, abbv, captype);
+  suggest_candidate_stack.pop_back();
   // word reversing wrapper for complex prefixes
   if (complexprefixes) {
-    for (size_t j = 0; j < slst.size(); ++j) {
+    for (auto& j : slst) {
       if (utf8)
-        reverseword_utf(slst[j]);
+        reverseword_utf(j);
       else
-        reverseword(slst[j]);
+        reverseword(j);
     }
   }
 
   // capitalize
   if (capwords)
-    for (size_t j = 0; j < slst.size(); ++j) {
-      mkinitcap(slst[j]);
+    for (auto& j : slst) {
+      mkinitcap(j);
     }
 
   // expand suggestions with dot(s)
-  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
-    for (size_t j = 0; j < slst.size(); ++j) {
-      slst[j].append(word.substr(word.size() - abbv));
+  if (abbv && pAMgr && pAMgr->get_sugswithdots() && word.size() >= abbv) {
+    for (auto& j : slst) {
+	    j.append(word.substr(word.size() - abbv));
     }
   }
 
@@ -920,7 +996,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
       case ALLCAP: {
         size_t l = 0;
         for (size_t j = 0; j < slst.size(); ++j) {
-          if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
+          if (slst[j].find(' ') == std::string::npos && !spell(slst[j], spell_candidate_stack)) {
             std::string s;
             std::vector<w_char> w;
             if (utf8) {
@@ -929,12 +1005,12 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
               s = slst[j];
             }
             mkallsmall2(s, w);
-            if (spell(s)) {
+            if (spell(s, spell_candidate_stack)) {
               slst[l] = s;
               ++l;
             } else {
               mkinitcap2(s, w);
-              if (spell(s)) {
+              if (spell(s, spell_candidate_stack)) {
                 slst[l] = s;
                 ++l;
               }
@@ -976,7 +1052,14 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
   return slst;
 }
 
+std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+  std::vector<std::string> suggest_candidate_stack;
+  return suggest(word, suggest_candidate_stack);
+}
+
 std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
+        std::vector<std::string>& spell_candidate_stack,
+        std::vector<std::string>& suggest_candidate_stack,
         bool& capwords, size_t& abbv, int& captype) {
   captype = NOCAP;
   abbv = 0;
@@ -1017,6 +1100,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
 
     if (wl == 0)
       return slst;
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+    if (wl > 32768)
+      return slst;
+#endif
   }
 
   bool good = false;
@@ -1038,13 +1126,13 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
 
   switch (captype) {
     case NOCAP: {
-      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
       if (abbv) {
         std::string wspace(scw);
         wspace.push_back('.');
-        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+        good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
         if (clock() > timelimit + TIMELIMIT_GLOBAL)
             return slst;
       }
@@ -1053,12 +1141,12 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
 
     case INITCAP: {
       capwords = true;
-      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
       std::string wspace(scw);
       mkallsmall2(wspace, sunicw);
-      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
       break;
@@ -1067,7 +1155,7 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
       capwords = true;
       /* FALLTHROUGH */
     case HUHCAP: {
-      good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, scw, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
       // something.The -> something. The
@@ -1095,23 +1183,23 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
         // TheOpenOffice.org -> The OpenOffice.org
         wspace = scw;
         mkinitsmall2(wspace, sunicw);
-        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+        good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
         if (clock() > timelimit + TIMELIMIT_GLOBAL)
             return slst;
       }
       wspace = scw;
       mkallsmall2(wspace, sunicw);
-      if (spell(wspace.c_str()))
+      if (spell(wspace, spell_candidate_stack))
         insert_sug(slst, wspace);
       size_t prevns = slst.size();
-      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
       if (captype == HUHINITCAP) {
         mkinitcap2(wspace, sunicw);
-        if (spell(wspace.c_str()))
+        if (spell(wspace, spell_candidate_stack))
           insert_sug(slst, wspace);
-        good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+        good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
         if (clock() > timelimit + TIMELIMIT_GLOBAL)
             return slst;
       }
@@ -1140,22 +1228,22 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
     case ALLCAP: {
       std::string wspace(scw);
       mkallsmall2(wspace, sunicw);
-      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
-      if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
+      if (pAMgr && pAMgr->get_keepcase() && spell(wspace, spell_candidate_stack))
         insert_sug(slst, wspace);
       mkinitcap2(wspace, sunicw);
-      good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
+      good |= pSMgr->suggest(slst, wspace, &onlycmpdsug);
       if (clock() > timelimit + TIMELIMIT_GLOBAL)
           return slst;
-      for (size_t j = 0; j < slst.size(); ++j) {
-        mkallcap(slst[j]);
+      for (auto& j : slst) {
+        mkallcap(j);
         if (pAMgr && pAMgr->get_checksharps()) {
           if (utf8) {
-            mystrrep(slst[j], "\xC3\x9F", "SS");
+            mystrrep(j, "\xC3\x9F", "SS");
           } else {
-            mystrrep(slst[j], "\xDF", "SS");
+            mystrrep(j, "\xDF", "SS");
           }
         }
       }
@@ -1165,17 +1253,17 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
 
   // LANG_hu section: replace '-' with ' ' in Hungarian
   if (langnum == LANG_hu) {
-    for (size_t j = 0; j < slst.size(); ++j) {
-      size_t pos = slst[j].find('-');
+    for (auto& j : slst) {
+      size_t pos = j.find('-');
       if (pos != std::string::npos) {
         int info;
-        std::string w(slst[j].substr(0, pos));
-        w.append(slst[j].substr(pos + 1));
-        (void)spell(w, &info, NULL);
+        std::string w(j.substr(0, pos));
+        w.append(j.substr(pos + 1));
+        (void)spell(w, spell_candidate_stack, &info, NULL);
         if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
-          slst[j][pos] = ' ';
+	        j[pos] = ' ';
         } else
-          slst[j][pos] = '-';
+	        j[pos] = '-';
       }
     }
   }
@@ -1246,11 +1334,11 @@ std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
       if (dash_pos == scw.size())
         last = 1;
       std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
-      if (!spell(chunk.c_str())) {
-        std::vector<std::string> nlst = suggest(chunk.c_str());
+      if (chunk != word && !spell(chunk, spell_candidate_stack)) {
+        std::vector<std::string> nlst = suggest(chunk, suggest_candidate_stack);
         if (clock() > timelimit + TIMELIMIT_GLOBAL)
             return slst;
-        for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
+        for (auto j = nlst.rbegin(); j != nlst.rend(); ++j) {
           std::string wspace = scw.substr(0, prev_pos);
           wspace.append(*j);
           if (!last) {
@@ -1286,12 +1374,11 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc
   std::string result2;
   if (desc.empty())
     return slst;
-  for (size_t i = 0; i < desc.size(); ++i) {
-
+  for (const auto& i : desc) {
     std::string result;
 
     // add compound word parts (except the last one)
-    const char* s = desc[i].c_str();
+    const char* s = i.c_str();
     const char* part = strstr(s, MORPH_PART);
     if (part) {
       const char* nextpart = strstr(part + 1, MORPH_PART);
@@ -1311,34 +1398,34 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc
       tok[alt + 1] = MSEP_ALT;
     }
     std::vector<std::string> pl = line_tok(tok, MSEP_ALT);
-    for (size_t k = 0; k < pl.size(); ++k) {
+    for (auto& k : pl) {
       // add derivational suffixes
-      if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) {
+      if (k.find(MORPH_DERI_SFX) != std::string::npos) {
         // remove inflectional suffixes
-        const size_t is = pl[k].find(MORPH_INFL_SFX);
+        const size_t is = k.find(MORPH_INFL_SFX);
         if (is != std::string::npos)
-          pl[k].resize(is);
+	        k.resize(is);
         std::vector<std::string> singlepl;
-        singlepl.push_back(pl[k]);
-        std::string sg = pSMgr->suggest_gen(singlepl, pl[k]);
+        singlepl.push_back(k);
+        std::string sg = pSMgr->suggest_gen(singlepl, k);
         if (!sg.empty()) {
           std::vector<std::string> gen = line_tok(sg, MSEP_REC);
-          for (size_t j = 0; j < gen.size(); ++j) {
+          for (auto& j : gen) {
             result2.push_back(MSEP_REC);
             result2.append(result);
-            result2.append(gen[j]);
+            result2.append(j);
           }
         }
       } else {
         result2.push_back(MSEP_REC);
         result2.append(result);
-        if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) {
+        if (k.find(MORPH_SURF_PFX) != std::string::npos) {
           std::string field;
-          copy_field(field, pl[k], MORPH_SURF_PFX);
+          copy_field(field, k, MORPH_SURF_PFX);
           result2.append(field);
         }
         std::string field;
-        copy_field(field, pl[k], MORPH_STEM);
+        copy_field(field, k, MORPH_STEM);
         result2.append(field);
       }
     }
@@ -1397,6 +1484,12 @@ int HunspellImpl::add(const std::string& word) {
   return 0;
 }
 
+int HunspellImpl::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) {
+  if (!m_HMgrs.empty())
+    return m_HMgrs[0]->add_with_flags(word, flags, desc);
+  return 0;
+}
+
 int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) {
   if (!m_HMgrs.empty())
     return m_HMgrs[0]->add_with_affix(word, example);
@@ -1441,7 +1534,7 @@ std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
 }
 
 std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) {
-  std::vector<std::string> slst;
+  std::vector<std::string> candidate_stack, slst;
   if (!pSMgr || m_HMgrs.empty())
     return slst;
   if (utf8) {
@@ -1593,12 +1686,11 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
   if (dash_pos != std::string::npos) {
     int nresult = 0;
 
-    std::string part1 = scw.substr(0, dash_pos);
-    std::string part2 = scw.substr(dash_pos+1);
+    std::string part1 = scw.substr(0, dash_pos), part2 = scw.substr(dash_pos + 1);
 
     // examine 2 sides of the dash
     if (part2.empty()) {  // base word ending with dash
-      if (spell(part1)) {
+      if (spell(part1, candidate_stack)) {
         std::string p = pSMgr->suggest_morph(part1);
         if (!p.empty()) {
           slst = line_tok(p, MSEP_REC);
@@ -1606,7 +1698,7 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
         }
       }
     } else if (part2.size() == 1 && part2[0] == 'e') {  // XXX (HU) -e hat.
-      if (spell(part1) && (spell("-e"))) {
+      if (spell(part1, candidate_stack) && (spell("-e", candidate_stack))) {
         std::string st = pSMgr->suggest_morph(part1);
         if (!st.empty()) {
           result.append(st);
@@ -1621,9 +1713,9 @@ std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word)
     } else {
       // first word ending with dash: word- XXX ???
       part1.push_back(' ');
-      nresult = spell(part1);
+      nresult = spell(part1, candidate_stack);
       part1.erase(part1.size() - 1);
-      if (nresult && spell(part2) &&
+      if (nresult && spell(part2, candidate_stack) &&
           ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
         std::string st = pSMgr->suggest_morph(part1);
         if (!st.empty()) {
@@ -1678,14 +1770,13 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
   if (!pSMgr || pl.empty())
     return slst;
   std::vector<std::string> pl2 = analyze(word);
-  int captype = NOCAP;
-  int abbv = 0;
+  int captype = NOCAP, abbv = 0;
   std::string cw;
   cleanword(cw, word, &captype, &abbv);
   std::string result;
 
-  for (size_t i = 0; i < pl.size(); ++i) {
-    cat_result(result, pSMgr->suggest_gen(pl2, pl[i]));
+  for (const auto& i : pl) {
+    cat_result(result, pSMgr->suggest_gen(pl2, i));
   }
 
   if (!result.empty()) {
@@ -1698,16 +1789,17 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s
 
     // capitalize
     if (captype == INITCAP || captype == HUHINITCAP) {
-      for (size_t j = 0; j < slst.size(); ++j) {
-        mkinitcap(slst[j]);
+      for (auto& str : slst) {
+        mkinitcap(str);
       }
     }
 
     // temporary filtering of prefix related errors (eg.
     // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
-    std::vector<std::string>::iterator it = slst.begin();
+    auto it = slst.begin();
     while (it != slst.end()) {
-      if (!spell(*it)) {
+      std::vector<std::string> candidate_stack;
+      if (!spell(*it, candidate_stack)) {
         it = slst.erase(it);
       } else  {
         ++it;
@@ -1778,10 +1870,8 @@ std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::stri
 int HunspellImpl::check_xml_par(const std::string& q, std::string::size_type pos,
                                 const char* attr,
                                 const char* value) {
-  std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr));
-  if (cw == value)
-    return 1;
-  return 0;
+  const std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr));
+  return cw == value ? 1 : 0;
 }
 
 std::vector<std::string> HunspellImpl::get_xml_list(const std::string& list, std::string::size_type pos, const char* tag) {
@@ -1826,10 +1916,9 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
     // convert the result to <code><a>ana1</a><a>ana2</a></code> format
     std::string r;
     r.append("<code>");
-    for (size_t i = 0; i < slst.size(); ++i) {
+    for (auto entry : slst) {
       r.append("<a>");
 
-      std::string entry(slst[i]);
       mystrrep(entry, "\t", " ");
       mystrrep(entry, "&", "&amp;");
       mystrrep(entry, "<", "&lt;");
@@ -1902,34 +1991,43 @@ std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_wo
       remove_ignored_chars(w2, ignoredchars);
     }
     word = w2.c_str();
-  } else
+    len = (int)w2.size();
+  } else {
     word = root_word.c_str();
-
-  len = strlen(word);
+    len = (int)root_word.size();
+  }
 
   if (!len)
     return slst;
 
   for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
-    he = m_HMgrs[i]->lookup(word);
+    he = m_HMgrs[i]->lookup(word, len);
   }
   if (he) {
-    slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
+    slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word);
   }
   return slst;
 }
 
 namespace {
+  // using malloc because this is for the c-api where the callers
+  // expect to be able to use free
+  char* stringdup(const std::string& s) {
+    size_t sl = s.size() + 1;
+    char* d = (char*)malloc(sl);
+    if (d)
+      memcpy(d, s.c_str(), sl);
+    return d;
+  }
+
   int munge_vector(char*** slst, const std::vector<std::string>& items) {
     if (items.empty()) {
       *slst = NULL;
       return 0;
     } else {
-      *slst = (char**)malloc(sizeof(char*) * items.size());
-      if (!*slst)
-        return 0;
+      *slst = new char*[items.size()];
       for (size_t i = 0; i < items.size(); ++i)
-        (*slst)[i] = mystrdup(items[i].c_str());
+        (*slst)[i] = stringdup(items[i]);
     }
     return items.size();
   }
@@ -1937,12 +2035,13 @@ namespace {
 
 int HunspellImpl::spell(const char* word, int* info, char** root) {
   std::string sroot;
-  bool ret = spell(word, info, root ? &sroot : NULL);
+  std::vector<std::string> candidate_stack;
+  bool ret = spell(word, candidate_stack, info, root ? &sroot : NULL);
   if (root) {
     if (sroot.empty()) {
       *root = NULL;
     } else {
-      *root = mystrdup(sroot.c_str());
+      *root = stringdup(sroot);
     }
   }
   return ret;
@@ -1962,7 +2061,7 @@ void HunspellImpl::free_list(char*** slst, int n) {
   if (slst && *slst) {
     for (int i = 0; i < n; i++)
       free((*slst)[i]);
-    free(*slst);
+    delete[] *slst;
     *slst = NULL;
   }
 }
@@ -1984,8 +2083,7 @@ int HunspellImpl::stem(char*** slst, const char* word) {
 int HunspellImpl::stem(char*** slst, char** desc, int n) {
   std::vector<std::string> morph;
   morph.reserve(n);
-  for (int i = 0; i < n; ++i)
-    morph.push_back(desc[i]);
+  for (int i = 0; i < n; ++i) morph.emplace_back(desc[i]);
 
   std::vector<std::string> stems = stem(morph);
   return munge_vector(slst, stems);
@@ -1999,8 +2097,7 @@ int HunspellImpl::generate(char*** slst, const char* word, const char* pattern)
 int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) {
   std::vector<std::string> morph;
   morph.reserve(pln);
-  for (int i = 0; i < pln; ++i)
-    morph.push_back(pl[i]);
+  for (int i = 0; i < pln; ++i) morph.emplace_back(pl[i]);
 
   std::vector<std::string> stems = generate(word, morph);
   return munge_vector(slst, stems);
@@ -2014,6 +2111,10 @@ const char* HunspellImpl::get_version() const {
   return get_version_cpp().c_str();
 }
 
+const char* HunspellImpl::get_try_string() const {
+   return pAMgr->get_try_string().c_str();
+}
+
 int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) {
   std::string d;
   bool ret = input_conv(word, d);
@@ -2038,7 +2139,8 @@ int Hunspell::add_dic(const char* dpath, const char* key) {
 }
 
 bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
-  return m_Impl->spell(word, info, root);
+  std::vector<std::string> candidate_stack;
+  return m_Impl->spell(word, candidate_stack, info, root);
 }
 
 std::vector<std::string> Hunspell::suggest(const std::string& word) {
@@ -2073,6 +2175,10 @@ int Hunspell::add(const std::string& word) {
   return m_Impl->add(word);
 }
 
+int Hunspell::add_with_flags(const std::string& word, const std::string& flags, const std::string& desc) {
+  return m_Impl->add_with_flags(word, flags, desc);
+}
+
 int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
   return m_Impl->add_with_affix(word, example);
 }
@@ -2195,6 +2301,10 @@ int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) {
   return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word);
 }
 
+int Hunspell_suffix_suggest(Hunhandle* pHunspell, char*** slst, const char* root_word) {
+  return reinterpret_cast<HunspellImpl*>(pHunspell)->suffix_suggest(slst, root_word);
+}
+
 int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) {
   return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word);
 }
@@ -2232,6 +2342,10 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) {
   return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word);
 }
 
+int Hunspell_add_with_flags(Hunhandle* pHunspell, const char* word, const char* flags, const char* desc) {
+  return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_flags(word, flags, desc);
+}
+
 /* add word to the run-time dictionary with affix flags of
  * the example (a dictionary word): Hunspell will recognize
  * affixed forms of the new word, too.
author	George Hazan <george.hazan@gmail.com>	2023-10-01 13:24:29 +0300
committer	George Hazan <george.hazan@gmail.com>	2023-10-01 13:24:29 +0300
commit	93babaf1665bd2b706a096057fdd06572c159622 (patch)
tree	efa9da647497149233facb88a422d9d31564bb4b /libs/hunspell/src/hunspell.c++
parent	13627600541c19643b96bd4c7b3a3d9fd92040f9 (diff)