diff options
author | Kirill Volinsky <mataes2007@gmail.com> | 2017-12-03 15:00:42 +0300 |
---|---|---|
committer | Kirill Volinsky <mataes2007@gmail.com> | 2017-12-03 15:01:25 +0300 |
commit | 97e2d186da4024c7ac62f7549f3243bd15204118 (patch) | |
tree | a0fdb451333c952b3eb773094380d88d3464ac30 /libs/hunspell/src | |
parent | d1f75ef5d26e7071fd1f6071e6c9a306fd19c33d (diff) |
Hunspell: lib updated to 1.6.2
Diffstat (limited to 'libs/hunspell/src')
33 files changed, 3788 insertions, 4669 deletions
diff --git a/libs/hunspell/src/affentry.c++ b/libs/hunspell/src/affentry.cxx index bd28274368..4ef0c00d9b 100644 --- a/libs/hunspell/src/affentry.c++ +++ b/libs/hunspell/src/affentry.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -79,33 +76,7 @@ #include "affentry.hxx" #include "csutil.hxx" -PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) - // register affix manager - : pmyMgr(pmgr), - next(NULL), - nexteq(NULL), - nextne(NULL), - flgnxt(NULL) { - // set up its initial values - aflag = dp->aflag; // flag - strip = dp->strip; // string to strip - appnd = dp->appnd; // string to append - numconds = dp->numconds; // length of the condition - opts = dp->opts; // cross product flag - // then copy over all of the conditions - if (opts & aeLONGCOND) { - memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); - c.l.conds2 = dp->c.l.conds2; - } else - memcpy(c.conds, dp->c.conds, MAXCONDLEN); - morphcode = dp->morphcode; - contclass = dp->contclass; - contclasslen = dp->contclasslen; -} - -PfxEntry::~PfxEntry() { - aflag = 0; - pmyMgr = NULL; +AffEntry::~AffEntry() { if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) @@ -114,17 +85,26 @@ PfxEntry::~PfxEntry() { free(contclass); } +PfxEntry::PfxEntry(AffixMgr* pmgr) + // register affix manager + : pmyMgr(pmgr), + next(NULL), + nexteq(NULL), + nextne(NULL), + flgnxt(NULL) { +} + // add prefix to this word assuming conditions hold -char* PfxEntry::add(const char* word, size_t len) { +std::string PfxEntry::add(const char* word, size_t len) { + std::string result; if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && (len >= numconds) && test_condition(word) && (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) { /* we have a match so add prefix */ - std::string tword(appnd); - tword.append(word + strip.size()); - return mystrdup(tword.c_str()); + result.assign(appnd); + result.append(word + strip.size()); } - return NULL; + return result; } inline char* PfxEntry::nextchar(char* p) { @@ -276,8 +256,7 @@ struct hentry* PfxEntry::checkword(const char* word, // if ((opts & 
aeXPRODUCT) && in_compound) { if ((opts & aeXPRODUCT)) { he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this, - NULL, 0, NULL, FLAG_NULL, needflag, - in_compound); + FLAG_NULL, needflag, in_compound); if (he) return he; } @@ -291,8 +270,6 @@ struct hentry* PfxEntry::check_twosfx(const char* word, int len, char in_compound, const FLAG needflag) { - struct hentry* he; // hash entry of root word or NULL - // on entry prefix is 0 length or already matches the beginning of the word. // So if the remaining root word has positive length // and if there are enough chars in root word and added back strip chars @@ -324,8 +301,9 @@ struct hentry* PfxEntry::check_twosfx(const char* word, // cross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this, - needflag); + // hash entry of root word or NULL + struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this, + needflag); if (he) return he; } @@ -335,15 +313,15 @@ struct hentry* PfxEntry::check_twosfx(const char* word, } // check if this prefix entry matches -char* PfxEntry::check_twosfx_morph(const char* word, - int len, - char in_compound, - const FLAG needflag) { +std::string PfxEntry::check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + std::string result; // on entry prefix is 0 length or already matches the beginning of the word. 
// So if the remaining root word has positive length // and if there are enough chars in root word and added back strip chars // to meet the number of characters conditions, then test it - int tmpl = len - appnd.size(); // length of tmpword if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && @@ -370,22 +348,21 @@ char* PfxEntry::check_twosfx_morph(const char* word, // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - return pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl, - aeXPRODUCT, - this, needflag); + result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl, + aeXPRODUCT, + this, needflag); } } } - return NULL; + return result; } // check if this prefix entry matches -char* PfxEntry::check_morph(const char* word, - int len, - char in_compound, - const FLAG needflag) { - struct hentry* he; // hash entry of root word or NULL - char* st; +std::string PfxEntry::check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + std::string result; // on entry prefix is 0 length or already matches the beginning of the word. 
// So if the remaining root word has positive length @@ -411,9 +388,8 @@ char* PfxEntry::check_morph(const char* word, // root word in the dictionary if (test_condition(tmpword.c_str())) { - std::string result; - tmpl += strip.size(); + struct hentry* he; // hash entry of root word or NULL if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && @@ -455,23 +431,19 @@ char* PfxEntry::check_morph(const char* word, // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this, - FLAG_NULL, needflag); - if (st) { + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this, + FLAG_NULL, needflag); + if (!st.empty()) { result.append(st); - free(st); } } - - if (!result.empty()) - return mystrdup(result.c_str()); } } - return NULL; + return result; } -SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp) +SfxEntry::SfxEntry(AffixMgr* pmgr) : pmyMgr(pmgr) // register affix manager , next(NULL), @@ -481,50 +453,21 @@ SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp) l_morph(NULL), r_morph(NULL), eq_morph(NULL) { - // set up its initial values - aflag = dp->aflag; // char flag - strip = dp->strip; // string to strip - appnd = dp->appnd; // string to append - numconds = dp->numconds; // length of the condition - opts = dp->opts; // cross product flag - - // then copy over all of the conditions - if (opts & aeLONGCOND) { - memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); - c.l.conds2 = dp->c.l.conds2; - } else - memcpy(c.conds, dp->c.conds, MAXCONDLEN); - rappnd = appnd; - reverseword(rappnd); - morphcode = dp->morphcode; - contclass = dp->contclass; - contclasslen = dp->contclasslen; -} - -SfxEntry::~SfxEntry() { - aflag = 0; - pmyMgr = NULL; - if (opts & aeLONGCOND) - free(c.l.conds2); - if (morphcode && !(opts & aeALIASM)) - free(morphcode); - if (contclass && !(opts & aeALIASF)) - free(contclass); } 
// add suffix to this word assuming conditions hold -char* SfxEntry::add(const char* word, size_t len) { +std::string SfxEntry::add(const char* word, size_t len) { + std::string result; /* make sure all conditions match */ if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) && (len >= numconds) && test_condition(word + len, word) && (!strip.size() || (strcmp(word + len - strip.size(), strip.c_str()) == 0))) { - std::string tword(word); + result.assign(word); /* we have a match so add suffix */ - tword.replace(len - strip.size(), std::string::npos, appnd); - return mystrdup(tword.c_str()); + result.replace(len - strip.size(), std::string::npos, appnd); } - return NULL; + return result; } inline char* SfxEntry::nextchar(char* p) { @@ -669,9 +612,6 @@ struct hentry* SfxEntry::checkword(const char* word, int len, int optflags, PfxEntry* ppfx, - char** wlst, - int maxSug, - int* ns, const FLAG cclass, const FLAG needflag, const FLAG badflag) { @@ -742,27 +682,6 @@ struct hentry* SfxEntry::checkword(const char* word, return he; he = he->next_homonym; // check homonyms } while (he); - - // obsolote stemming code (used only by the - // experimental SuffixMgr:suggest_pos_stems) - // store resulting root in wlst - } else if (wlst && (*ns < maxSug)) { - int cwrd = 1; - for (int k = 0; k < *ns; k++) - if (strcmp(tmpword, wlst[k]) == 0) { - cwrd = 0; - break; - } - if (cwrd) { - wlst[*ns] = mystrdup(tmpword); - if (wlst[*ns] == NULL) { - for (int j = 0; j < *ns; j++) - free(wlst[j]); - *ns = -1; - return NULL; - } - (*ns)++; - } } } } @@ -775,7 +694,6 @@ struct hentry* SfxEntry::check_twosfx(const char* word, int optflags, PfxEntry* ppfx, const FLAG needflag) { - struct hentry* he; // hash entry pointer PfxEntry* ep = ppfx; // if this suffix is being cross checked with a prefix @@ -813,17 +731,18 @@ struct hentry* SfxEntry::check_twosfx(const char* word, // if all conditions are met then recall suffix_check if (test_condition(end, beg)) { + struct hentry* he; // 
hash entry pointer if (ppfx) { // handle conditional suffix if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) - he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL, - (FLAG)aflag, needflag); + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, + (FLAG)aflag, needflag, IN_CPD_NOT); else - he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, NULL, 0, - NULL, (FLAG)aflag, needflag); + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, + (FLAG)aflag, needflag, IN_CPD_NOT); } else { - he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL, - (FLAG)aflag, needflag); + he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, + (FLAG)aflag, needflag, IN_CPD_NOT); } if (he) return he; @@ -833,23 +752,20 @@ struct hentry* SfxEntry::check_twosfx(const char* word, } // see if two-level suffix is present in the word -char* SfxEntry::check_twosfx_morph(const char* word, - int len, - int optflags, - PfxEntry* ppfx, - const FLAG needflag) { +std::string SfxEntry::check_twosfx_morph(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag) { PfxEntry* ep = ppfx; - char* st; - - char result[MAXLNLEN]; - *result = '\0'; + std::string result; // if this suffix is being cross checked with a prefix // but it does not support cross products skip it if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) - return NULL; + return result; // upon entry suffix is 0 length or already matches the end of the word. 
// So if the remaining root word has positive length @@ -883,40 +799,34 @@ char* SfxEntry::check_twosfx_morph(const char* word, if (ppfx) { // handle conditional suffix if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { - st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, - needflag); - if (st) { + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, + needflag); + if (!st.empty()) { if (ppfx->getMorph()) { - mystrcat(result, ppfx->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); + result.append(ppfx->getMorph()); + result.append(" "); } - mystrcat(result, st, MAXLNLEN); - free(st); + result.append(st); mychomp(result); } } else { - st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag, - needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag, + needflag); + if (!st.empty()) { + result.append(st); mychomp(result); } } } else { - st = - pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag); + if (!st.empty()) { + result.append(st); mychomp(result); } } - if (*result) - return mystrdup(result); } } - return NULL; + return result; } // get next homonym with same affix @@ -948,6 +858,11 @@ struct hentry* SfxEntry::get_next_homonym(struct hentry* he, return NULL; } +void SfxEntry::initReverseWord() { + rappnd = appnd; + reverseword(rappnd); +} + #if 0 Appendix: Understanding Affix Code diff --git a/libs/hunspell/src/affentry.hxx b/libs/hunspell/src/affentry.hxx index 6311d83fff..4bafc043f4 100644 --- a/libs/hunspell/src/affentry.hxx +++ b/libs/hunspell/src/affentry.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh 
László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -71,10 +68,8 @@ * SUCH DAMAGE. */ -#ifndef _AFFIX_HXX_ -#define _AFFIX_HXX_ - -#include "hunvisapi.h" +#ifndef AFFIX_HXX_ +#define AFFIX_HXX_ #include "atypes.hxx" #include "baseaffix.hxx" @@ -82,7 +77,7 @@ /* A Prefix Entry */ -class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry { +class PfxEntry : public AffEntry { private: PfxEntry(const PfxEntry&); PfxEntry& operator=(const PfxEntry&); @@ -96,10 +91,9 @@ class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry { PfxEntry* flgnxt; public: - PfxEntry(AffixMgr* pmgr, affentry* dp); - ~PfxEntry(); + explicit PfxEntry(AffixMgr* pmgr); - inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); } + bool allowCross() const { return ((opts & aeXPRODUCT) != 0); } struct hentry* checkword(const char* word, int len, char in_compound, @@ -110,19 +104,19 @@ class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry { char in_compound, const FLAG needflag = FLAG_NULL); - char* check_morph(const char* word, - int len, - char in_compound, - const FLAG needflag = FLAG_NULL); + std::string check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); - char* 
check_twosfx_morph(const char* word, - int len, - char in_compound, - const FLAG needflag = FLAG_NULL); + std::string check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); - inline FLAG getFlag() { return aflag; } - inline const char* getKey() { return appnd.c_str(); } - char* add(const char* word, size_t len); + FLAG getFlag() { return aflag; } + const char* getKey() { return appnd.c_str(); } + std::string add(const char* word, size_t len); inline short getKeyLen() { return appnd.size(); } @@ -147,7 +141,7 @@ class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry { /* A Suffix Entry */ -class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { +class SfxEntry : public AffEntry { private: SfxEntry(const SfxEntry&); SfxEntry& operator=(const SfxEntry&); @@ -166,20 +160,16 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { SfxEntry* eq_morph; public: - SfxEntry(AffixMgr* pmgr, affentry* dp); - ~SfxEntry(); + explicit SfxEntry(AffixMgr* pmgr); - inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); } + bool allowCross() const { return ((opts & aeXPRODUCT) != 0); } struct hentry* checkword(const char* word, int len, int optflags, PfxEntry* ppfx, - char** wlst, - int maxSug, - int* ns, - const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, - const FLAG badflag = FLAG_NULL); + const FLAG cclass, + const FLAG needflag, + const FLAG badflag); struct hentry* check_twosfx(const char* word, int len, @@ -187,11 +177,11 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { PfxEntry* ppfx, const FLAG needflag = FLAG_NULL); - char* check_twosfx_morph(const char* word, - int len, - int optflags, - PfxEntry* ppfx, - const FLAG needflag = FLAG_NULL); + std::string check_twosfx_morph(const char* word, + int len, + int optflags, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); struct hentry* get_next_homonym(struct hentry* he); struct hentry* get_next_homonym(struct 
hentry* word, int optflags, @@ -199,9 +189,9 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { const FLAG cclass, const FLAG needflag); - inline FLAG getFlag() { return aflag; } - inline const char* getKey() { return rappnd.c_str(); } - char* add(const char* word, size_t len); + FLAG getFlag() { return aflag; } + const char* getKey() { return rappnd.c_str(); } + std::string add(const char* word, size_t len); inline const char* getMorph() { return morphcode; } @@ -224,6 +214,7 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry { inline void setNextNE(SfxEntry* ptr) { nextne = ptr; } inline void setNextEQ(SfxEntry* ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry* ptr) { flgnxt = ptr; } + void initReverseWord(); inline char* nextchar(char* p); inline int test_condition(const char* st, const char* begin); diff --git a/libs/hunspell/src/affixmgr.c++ b/libs/hunspell/src/affixmgr.cxx index d6bb677982..ffce7bb1bd 100644 --- a/libs/hunspell/src/affixmgr.c++ +++ b/libs/hunspell/src/affixmgr.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -88,33 +85,24 @@ #include "csutil.hxx" AffixMgr::AffixMgr(const char* affpath, - HashMgr** ptr, - int* md, - const char* key) { + const std::vector<HashMgr*>& ptr, + const char* key) + : alldic(ptr) + , pHMgr(ptr[0]) { + // register hash manager and load affix data from aff file - pHMgr = ptr[0]; - alldic = ptr; - maxdic = md; - keystring = NULL; - trystring = NULL; - encoding = NULL; csconv = NULL; utf8 = 0; complexprefixes = 0; - maptable = NULL; - nummap = 0; - breaktable = NULL; - numbreak = -1; - reptable = NULL; - numrep = 0; + parsedmaptable = false; + parsedbreaktable = false; + parsedrep = false; iconvtable = NULL; oconvtable = NULL; - checkcpdtable = NULL; // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) simplifiedcpd = 0; - numcheckcpd = 0; - defcpdtable = NULL; - numdefcpd = 0; + parsedcheckcpd = false; + parseddefcpd = false; phone = NULL; compoundflag = FLAG_NULL; // permits word in compound forms compoundbegin = FLAG_NULL; // may be first word in compound forms @@ -135,25 +123,15 @@ AffixMgr::AffixMgr(const char* affpath, forbiddenword = FORBIDDENWORD; // forbidden word signing flag nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag nongramsuggest = FLAG_NULL; - lang = NULL; // language langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes cpdwordmax = -1; // default: unlimited wordcount in compound words cpdmin = -1; // undefined cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words - cpdvowels = NULL; // vowels (for calculating of Hungarian compounding limit, - // O(n) search! 
XXX) - cpdvowels_utf16 = - NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search) - cpdvowels_utf16_len = 0; // vowels pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG sfxextra = 0; // modifier for syllable count of sfxappnd BUG - cpdsyllablenum = NULL; // syllable count incrementing flag checknum = 0; // checking numbers, and word with numbers - wordchars = NULL; // letters + spec. word characters - ignorechars = NULL; // letters + spec. word characters - version = NULL; // affix and dictionary file version string havecontclass = 0; // flags of possible continuing classes (double affix) // LEMMA_PRESENT: not put root into the morphological output. Lemma presents // in morhological description in dictionary file. It's often combined with @@ -225,83 +203,10 @@ AffixMgr::~AffixMgr() { sStart[j] = NULL; } - if (keystring) - free(keystring); - keystring = NULL; - if (trystring) - free(trystring); - trystring = NULL; - if (encoding) - free(encoding); - encoding = NULL; - if (maptable) { - for (int j = 0; j < nummap; j++) { - for (int k = 0; k < maptable[j].len; k++) { - if (maptable[j].set[k]) - free(maptable[j].set[k]); - } - free(maptable[j].set); - maptable[j].set = NULL; - maptable[j].len = 0; - } - free(maptable); - maptable = NULL; - } - nummap = 0; - if (breaktable) { - for (int j = 0; j < numbreak; j++) { - if (breaktable[j]) - free(breaktable[j]); - breaktable[j] = NULL; - } - free(breaktable); - breaktable = NULL; - } - numbreak = 0; - if (reptable) { - for (int j = 0; j < numrep; j++) { - free(reptable[j].pattern); - free(reptable[j].pattern2); - } - free(reptable); - reptable = NULL; - } - if (iconvtable) - delete iconvtable; - if (oconvtable) - delete oconvtable; - if (phone && phone->rules) { - for (int j = 0; j < phone->num + 1; j++) { - free(phone->rules[j * 2]); - free(phone->rules[j * 2 + 1]); - } - free(phone->rules); - free(phone); - 
phone = NULL; - } + delete iconvtable; + delete oconvtable; + delete phone; - if (defcpdtable) { - for (int j = 0; j < numdefcpd; j++) { - free(defcpdtable[j].def); - defcpdtable[j].def = NULL; - } - free(defcpdtable); - defcpdtable = NULL; - } - numrep = 0; - if (checkcpdtable) { - for (int j = 0; j < numcheckcpd; j++) { - free(checkcpdtable[j].pattern); - free(checkcpdtable[j].pattern2); - free(checkcpdtable[j].pattern3); - checkcpdtable[j].pattern = NULL; - checkcpdtable[j].pattern2 = NULL; - checkcpdtable[j].pattern3 = NULL; - } - free(checkcpdtable); - checkcpdtable = NULL; - } - numcheckcpd = 0; FREE_FLAG(compoundflag); FREE_FLAG(compoundbegin); FREE_FLAG(compoundmiddle); @@ -321,21 +226,7 @@ AffixMgr::~AffixMgr() { pHMgr = NULL; cpdmin = 0; cpdmaxsyllable = 0; - if (cpdvowels) - free(cpdvowels); - if (cpdvowels_utf16) - free(cpdvowels_utf16); - if (cpdsyllablenum) - free(cpdsyllablenum); free_utf_tbl(); - if (lang) - free(lang); - if (wordchars) - free(wordchars); - if (ignorechars) - free(ignorechars); - if (version) - free(version); checknum = 0; #ifdef MOZILLA_CLIENT delete[] csconv; @@ -352,8 +243,6 @@ void AffixMgr::finishFileMgr(FileMgr* afflst) { // read in aff file and build up prefix and suffix entry objects int AffixMgr::parse_file(const char* affpath, const char* key) { - char* line; // io buffers - char ft; // affix type // checking flag duplication char dupflags[CONTSIZE]; @@ -375,7 +264,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { // read in each line ignoring any that do not // start with a known line type indicator - while ((line = afflst->getline()) != NULL) { + std::string line; + while (afflst->getline(line)) { mychomp(line); /* remove byte order mark */ @@ -383,41 +273,38 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { firstline = 0; // Affix file begins with byte order mark: possible incompatibility with // old Hunspell versions - if (strncmp(line, "\xEF\xBB\xBF", 3) == 0) { - memmove(line, 
line + 3, strlen(line + 3) + 1); + if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { + line.erase(0, 3); } } /* parse in the keyboard string */ - if (strncmp(line, "KEY", 3) == 0) { - if (parse_string(line, &keystring, afflst->getlinenum())) { + if (line.compare(0, 3, "KEY", 3) == 0) { + if (!parse_string(line, keystring, afflst->getlinenum())) { finishFileMgr(afflst); return 1; } } /* parse in the try string */ - if (strncmp(line, "TRY", 3) == 0) { - if (parse_string(line, &trystring, afflst->getlinenum())) { + if (line.compare(0, 3, "TRY", 3) == 0) { + if (!parse_string(line, trystring, afflst->getlinenum())) { finishFileMgr(afflst); return 1; } } /* parse in the name of the character set used by the .dict and .aff */ - if (strncmp(line, "SET", 3) == 0) { - if (parse_string(line, &encoding, afflst->getlinenum())) { + if (line.compare(0, 3, "SET", 3) == 0) { + if (!parse_string(line, encoding, afflst->getlinenum())) { finishFileMgr(afflst); return 1; } - if (strcmp(encoding, "UTF-8") == 0) { + if (encoding == "UTF-8") { utf8 = 1; #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT - if (initialize_utf_tbl()) { - finishFileMgr(afflst); - return 1; - } + initialize_utf_tbl(); #endif #endif } @@ -425,26 +312,26 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left * writing system */ - if (strncmp(line, "COMPLEXPREFIXES", 15) == 0) + if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) complexprefixes = 1; /* parse in the flag used by the controlled compound words */ - if (strncmp(line, "COMPOUNDFLAG", 12) == 0) { - if (parse_flag(line, &compoundflag, afflst)) { + if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) { + if (!parse_flag(line, &compoundflag, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by compound words */ - if (strncmp(line, "COMPOUNDBEGIN", 13) == 0) { + if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) { if (complexprefixes) { - if 
(parse_flag(line, &compoundend, afflst)) { + if (!parse_flag(line, &compoundend, afflst)) { finishFileMgr(afflst); return 1; } } else { - if (parse_flag(line, &compoundbegin, afflst)) { + if (!parse_flag(line, &compoundbegin, afflst)) { finishFileMgr(afflst); return 1; } @@ -452,21 +339,22 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } /* parse in the flag used by compound words */ - if (strncmp(line, "COMPOUNDMIDDLE", 14) == 0) { - if (parse_flag(line, &compoundmiddle, afflst)) { + if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) { + if (!parse_flag(line, &compoundmiddle, afflst)) { finishFileMgr(afflst); return 1; } } + /* parse in the flag used by compound words */ - if (strncmp(line, "COMPOUNDEND", 11) == 0) { + if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) { if (complexprefixes) { - if (parse_flag(line, &compoundbegin, afflst)) { + if (!parse_flag(line, &compoundbegin, afflst)) { finishFileMgr(afflst); return 1; } } else { - if (parse_flag(line, &compoundend, afflst)) { + if (!parse_flag(line, &compoundend, afflst)) { finishFileMgr(afflst); return 1; } @@ -474,126 +362,126 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } /* parse in the data used by compound_check() method */ - if (strncmp(line, "COMPOUNDWORDMAX", 15) == 0) { - if (parse_num(line, &cpdwordmax, afflst)) { + if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) { + if (!parse_num(line, &cpdwordmax, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag sign compounds in dictionary */ - if (strncmp(line, "COMPOUNDROOT", 12) == 0) { - if (parse_flag(line, &compoundroot, afflst)) { + if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) { + if (!parse_flag(line, &compoundroot, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by compound_check() method */ - if (strncmp(line, "COMPOUNDPERMITFLAG", 18) == 0) { - if (parse_flag(line, &compoundpermitflag, afflst)) { + if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 
0) { + if (!parse_flag(line, &compoundpermitflag, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by compound_check() method */ - if (strncmp(line, "COMPOUNDFORBIDFLAG", 18) == 0) { - if (parse_flag(line, &compoundforbidflag, afflst)) { + if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) { + if (!parse_flag(line, &compoundforbidflag, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "COMPOUNDMORESUFFIXES", 20) == 0) { + if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) { compoundmoresuffixes = 1; } - if (strncmp(line, "CHECKCOMPOUNDDUP", 16) == 0) { + if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) { checkcompounddup = 1; } - if (strncmp(line, "CHECKCOMPOUNDREP", 16) == 0) { + if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) { checkcompoundrep = 1; } - if (strncmp(line, "CHECKCOMPOUNDTRIPLE", 19) == 0) { + if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) { checkcompoundtriple = 1; } - if (strncmp(line, "SIMPLIFIEDTRIPLE", 16) == 0) { + if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) { simplifiedtriple = 1; } - if (strncmp(line, "CHECKCOMPOUNDCASE", 17) == 0) { + if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) { checkcompoundcase = 1; } - if (strncmp(line, "NOSUGGEST", 9) == 0) { - if (parse_flag(line, &nosuggest, afflst)) { + if (line.compare(0, 9, "NOSUGGEST", 9) == 0) { + if (!parse_flag(line, &nosuggest, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "NONGRAMSUGGEST", 14) == 0) { - if (parse_flag(line, &nongramsuggest, afflst)) { + if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) { + if (!parse_flag(line, &nongramsuggest, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by forbidden words */ - if (strncmp(line, "FORBIDDENWORD", 13) == 0) { - if (parse_flag(line, &forbiddenword, afflst)) { + if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { + if (!parse_flag(line, &forbiddenword, afflst)) { finishFileMgr(afflst); return 1; 
} } /* parse in the flag used by forbidden words */ - if (strncmp(line, "LEMMA_PRESENT", 13) == 0) { - if (parse_flag(line, &lemma_present, afflst)) { + if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) { + if (!parse_flag(line, &lemma_present, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by circumfixes */ - if (strncmp(line, "CIRCUMFIX", 9) == 0) { - if (parse_flag(line, &circumfix, afflst)) { + if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) { + if (!parse_flag(line, &circumfix, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by fogemorphemes */ - if (strncmp(line, "ONLYINCOMPOUND", 14) == 0) { - if (parse_flag(line, &onlyincompound, afflst)) { + if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) { + if (!parse_flag(line, &onlyincompound, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by `needaffixs' */ - if (strncmp(line, "PSEUDOROOT", 10) == 0) { - if (parse_flag(line, &needaffix, afflst)) { + if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) { + if (!parse_flag(line, &needaffix, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by `needaffixs' */ - if (strncmp(line, "NEEDAFFIX", 9) == 0) { - if (parse_flag(line, &needaffix, afflst)) { + if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) { + if (!parse_flag(line, &needaffix, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the minimal length for words in compounds */ - if (strncmp(line, "COMPOUNDMIN", 11) == 0) { - if (parse_num(line, &cpdmin, afflst)) { + if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) { + if (!parse_num(line, &cpdmin, afflst)) { finishFileMgr(afflst); return 1; } @@ -602,29 +490,29 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } /* parse in the max. 
words and syllables in compounds */ - if (strncmp(line, "COMPOUNDSYLLABLE", 16) == 0) { - if (parse_cpdsyllable(line, afflst)) { + if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) { + if (!parse_cpdsyllable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by compound_check() method */ - if (strncmp(line, "SYLLABLENUM", 11) == 0) { - if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { + if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) { + if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by the controlled compound words */ - if (strncmp(line, "CHECKNUM", 8) == 0) { + if (line.compare(0, 8, "CHECKNUM", 8) == 0) { checknum = 1; } /* parse in the extra word characters */ - if (strncmp(line, "WORDCHARS", 9) == 0) { - if (!parse_array(line, &wordchars, wordchars_utf16, + if (line.compare(0, 9, "WORDCHARS", 9) == 0) { + if (!parse_array(line, wordchars, wordchars_utf16, utf8, afflst->getlinenum())) { finishFileMgr(afflst); return 1; @@ -633,8 +521,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { /* parse in the ignored characters (for example, Arabic optional diacretics * charachters */ - if (strncmp(line, "IGNORE", 6) == 0) { - if (!parse_array(line, &ignorechars, ignorechars_utf16, + if (line.compare(0, 6, "IGNORE", 6) == 0) { + if (!parse_array(line, ignorechars, ignorechars_utf16, utf8, afflst->getlinenum())) { finishFileMgr(afflst); return 1; @@ -642,172 +530,174 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } /* parse in the typical fault correcting table */ - if (strncmp(line, "REP", 3) == 0) { - if (parse_reptable(line, afflst)) { + if (line.compare(0, 3, "REP", 3) == 0) { + if (!parse_reptable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the input conversion table */ - if (strncmp(line, "ICONV", 5) == 0) { - if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { + if 
(line.compare(0, 5, "ICONV", 5) == 0) { + if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) { finishFileMgr(afflst); return 1; } } /* parse in the input conversion table */ - if (strncmp(line, "OCONV", 5) == 0) { - if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { + if (line.compare(0, 5, "OCONV", 5) == 0) { + if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) { finishFileMgr(afflst); return 1; } } /* parse in the phonetic translation table */ - if (strncmp(line, "PHONE", 5) == 0) { - if (parse_phonetable(line, afflst)) { + if (line.compare(0, 5, "PHONE", 5) == 0) { + if (!parse_phonetable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the checkcompoundpattern table */ - if (strncmp(line, "CHECKCOMPOUNDPATTERN", 20) == 0) { - if (parse_checkcpdtable(line, afflst)) { + if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) { + if (!parse_checkcpdtable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the defcompound table */ - if (strncmp(line, "COMPOUNDRULE", 12) == 0) { - if (parse_defcpdtable(line, afflst)) { + if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) { + if (!parse_defcpdtable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the related character map table */ - if (strncmp(line, "MAP", 3) == 0) { - if (parse_maptable(line, afflst)) { + if (line.compare(0, 3, "MAP", 3) == 0) { + if (!parse_maptable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the word breakpoints table */ - if (strncmp(line, "BREAK", 5) == 0) { - if (parse_breaktable(line, afflst)) { + if (line.compare(0, 5, "BREAK", 5) == 0) { + if (!parse_breaktable(line, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the language for language specific codes */ - if (strncmp(line, "LANG", 4) == 0) { - if (parse_string(line, &lang, afflst->getlinenum())) { + if (line.compare(0, 4, "LANG", 4) == 0) { + if (!parse_string(line, lang, afflst->getlinenum())) { finishFileMgr(afflst); return 1; } 
langnum = get_lang_num(lang); } - if (strncmp(line, "VERSION", 7) == 0) { - for (line = line + 7; *line == ' ' || *line == '\t'; line++) - ; - version = mystrdup(line); + if (line.compare(0, 7, "VERSION", 7) == 0) { + size_t startpos = line.find_first_not_of(" \t", 7); + if (startpos != std::string::npos) { + version = line.substr(startpos); + } } - if (strncmp(line, "MAXNGRAMSUGS", 12) == 0) { - if (parse_num(line, &maxngramsugs, afflst)) { + if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) { + if (!parse_num(line, &maxngramsugs, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "ONLYMAXDIFF", 11) == 0) + if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0) onlymaxdiff = 1; - if (strncmp(line, "MAXDIFF", 7) == 0) { - if (parse_num(line, &maxdiff, afflst)) { + if (line.compare(0, 7, "MAXDIFF", 7) == 0) { + if (!parse_num(line, &maxdiff, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "MAXCPDSUGS", 10) == 0) { - if (parse_num(line, &maxcpdsugs, afflst)) { + if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) { + if (!parse_num(line, &maxcpdsugs, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "NOSPLITSUGS", 11) == 0) { + if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) { nosplitsugs = 1; } - if (strncmp(line, "FULLSTRIP", 9) == 0) { + if (line.compare(0, 9, "FULLSTRIP", 9) == 0) { fullstrip = 1; } - if (strncmp(line, "SUGSWITHDOTS", 12) == 0) { + if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) { sugswithdots = 1; } /* parse in the flag used by forbidden words */ - if (strncmp(line, "KEEPCASE", 8) == 0) { - if (parse_flag(line, &keepcase, afflst)) { + if (line.compare(0, 8, "KEEPCASE", 8) == 0) { + if (!parse_flag(line, &keepcase, afflst)) { finishFileMgr(afflst); return 1; } } /* parse in the flag used by `forceucase' */ - if (strncmp(line, "FORCEUCASE", 10) == 0) { - if (parse_flag(line, &forceucase, afflst)) { + if (line.compare(0, 10, "FORCEUCASE", 10) == 0) { + if (!parse_flag(line, &forceucase, afflst)) { 
finishFileMgr(afflst); return 1; } } /* parse in the flag used by `warn' */ - if (strncmp(line, "WARN", 4) == 0) { - if (parse_flag(line, &warn, afflst)) { + if (line.compare(0, 4, "WARN", 4) == 0) { + if (!parse_flag(line, &warn, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "FORBIDWARN", 10) == 0) { + if (line.compare(0, 10, "FORBIDWARN", 10) == 0) { forbidwarn = 1; } /* parse in the flag used by the affix generator */ - if (strncmp(line, "SUBSTANDARD", 11) == 0) { - if (parse_flag(line, &substandard, afflst)) { + if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) { + if (!parse_flag(line, &substandard, afflst)) { finishFileMgr(afflst); return 1; } } - if (strncmp(line, "CHECKSHARPS", 11) == 0) { + if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) { checksharps = 1; } /* parse this affix: P - prefix, S - suffix */ - ft = ' '; - if (strncmp(line, "PFX", 3) == 0) + // affix type + char ft = ' '; + if (line.compare(0, 3, "PFX", 3) == 0) ft = complexprefixes ? 'S' : 'P'; - if (strncmp(line, "SFX", 3) == 0) + if (line.compare(0, 3, "SFX", 3) == 0) ft = complexprefixes ? 
'P' : 'S'; if (ft != ' ') { if (dupflags_ini) { memset(dupflags, 0, sizeof(dupflags)); dupflags_ini = 0; } - if (parse_affix(line, ft, afflst, dupflags)) { + if (!parse_affix(line, ft, afflst, dupflags)) { finishFileMgr(afflst); return 1; } @@ -848,37 +738,22 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { /* get encoding for CHECKCOMPOUNDCASE */ if (!utf8) { - char* enc = get_encoding(); - csconv = get_current_cs(enc); - free(enc); - enc = NULL; - - std::string expw; - if (wordchars) { - expw.assign(wordchars); - free(wordchars); - } - + csconv = get_current_cs(get_encoding()); for (int i = 0; i <= 255; i++) { if ((csconv[i].cupper != csconv[i].clower) && - (expw.find((char)i) == std::string::npos)) { - expw.push_back((char)i); + (wordchars.find((char)i) == std::string::npos)) { + wordchars.push_back((char)i); } } - wordchars = mystrdup(expw.c_str()); } // default BREAK definition - if (numbreak == -1) { - breaktable = (char**)malloc(sizeof(char*) * 3); - if (!breaktable) - return 1; - breaktable[0] = mystrdup("-"); - breaktable[1] = mystrdup("^-"); - breaktable[2] = mystrdup("-$"); - if (breaktable[0] && breaktable[1] && breaktable[2]) - numbreak = 3; + if (!parsedbreaktable) { + breaktable.push_back("-"); + breaktable.push_back("^-"); + breaktable.push_back("-$"); + parsedbreaktable = true; } return 0; } @@ -949,6 +824,9 @@ int AffixMgr::build_pfxtree(PfxEntry* pfxptr) { // both by suffix flag, and sorted by the reverse of the // suffix string itself; so we need to set up two indexes int AffixMgr::build_sfxtree(SfxEntry* sfxptr) { + + sfxptr->initReverseWord(); + SfxEntry* ptr; SfxEntry* pptr; SfxEntry* ep = sfxptr; @@ -1143,17 +1021,6 @@ int AffixMgr::process_sfx_order() { } // add flags to the result for dictionary debugging -void AffixMgr::debugflag(char* result, unsigned short flag) { - char* st = encode_flag(flag); - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_FLAG, MAXLNLEN); - if (st) { - mystrcat(result, st, MAXLNLEN); - 
free(st); - } -} - -// add flags to the result for dictionary debugging std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { char* st = encode_flag(flag); result.append(" "); @@ -1181,13 +1048,18 @@ int AffixMgr::condlen(const char* st) { return l; } -int AffixMgr::encodeit(affentry& entry, const char* cs) { +int AffixMgr::encodeit(AffEntry& entry, const char* cs) { if (strcmp(cs, ".") != 0) { entry.numconds = (char)condlen(cs); - // coverity[buffer_size_warning] - deliberate use of lack of end of conds - // padded by strncpy as long condition flag - strncpy(entry.c.conds, cs, MAXCONDLEN); - if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { + const size_t cslen = strlen(cs); + const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen); + memcpy(entry.c.conds, cs, short_part); + if (short_part < MAXCONDLEN) { + //blank out the remaining space + memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part); + } else if (cs[MAXCONDLEN]) { + //there is more conditions than fit in fixed space, so its + //a long condition entry.opts += aeLONGCOND; entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); if (!entry.c.l.conds2) @@ -1316,13 +1188,12 @@ struct hentry* AffixMgr::prefix_check_twosfx(const char* word, } // check word for prefixes -char* AffixMgr::prefix_check_morph(const char* word, - int len, - char in_compound, - const FLAG needflag) { +std::string AffixMgr::prefix_check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { - char result[MAXLNLEN]; - result[0] = '\0'; + std::string result; pfx = NULL; sfxappnd = NULL; @@ -1331,12 +1202,10 @@ char* AffixMgr::prefix_check_morph(const char* word, // first handle the special case of 0 length prefixes PfxEntry* pe = pStart[0]; while (pe) { - char* st = pe->check_morph(word, len, in_compound, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pe->check_morph(word, len, in_compound, needflag); + if (!st.empty()) { + 
result.append(st); } - // if (rv) return rv; pe = pe->getNext(); } @@ -1346,16 +1215,15 @@ char* AffixMgr::prefix_check_morph(const char* word, while (pptr) { if (isSubset(pptr->getKey(), word)) { - char* st = pptr->check_morph(word, len, in_compound, needflag); - if (st) { + std::string st = pptr->check_morph(word, len, in_compound, needflag); + if (!st.empty()) { // fogemorpheme if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { - mystrcat(result, st, MAXLNLEN); + result.append(st); pfx = pptr; } - free(st); } pptr = pptr->getNextEQ(); } else { @@ -1363,18 +1231,15 @@ char* AffixMgr::prefix_check_morph(const char* word, } } - if (*result) - return mystrdup(result); - return NULL; + return result; } // check word for prefixes -char* AffixMgr::prefix_check_twosfx_morph(const char* word, - int len, - char in_compound, - const FLAG needflag) { - char result[MAXLNLEN]; - result[0] = '\0'; +std::string AffixMgr::prefix_check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag) { + std::string result; pfx = NULL; sfxappnd = NULL; @@ -1383,10 +1248,9 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word, // first handle the special case of 0 length prefixes PfxEntry* pe = pStart[0]; while (pe) { - char* st = pe->check_twosfx_morph(word, len, in_compound, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag); + if (!st.empty()) { + result.append(st); } pe = pe->getNext(); } @@ -1397,10 +1261,9 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word, while (pptr) { if (isSubset(pptr->getKey(), word)) { - char* st = pptr->check_twosfx_morph(word, len, in_compound, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag); + if (!st.empty()) { + result.append(st); pfx = pptr; } 
pptr = pptr->getNextEQ(); @@ -1409,29 +1272,31 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word, } } - if (*result) - return mystrdup(result); - return NULL; + return result; } // Is word a non compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char* word, int wl) { - if ((wl < 2) || !numrep) + if ((wl < 2) || reptable.empty()) return 0; - for (int i = 0; i < numrep; i++) { + for (size_t i = 0; i < reptable.size(); ++i) { const char* r = word; - int lenp = strlen(reptable[i].pattern); + const size_t lenp = reptable[i].pattern.size(); // search every occurence of the pattern in the word - while ((r = strstr(r, reptable[i].pattern)) != NULL) { + while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) { std::string candidate(word); - candidate.replace(r - word, lenp, reptable[i].pattern2); + size_t type = r == word && langnum != LANG_hu ? 1 : 0; + if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu) + type += 2; + candidate.replace(r - word, lenp, reptable[i].outstrings[type]); if (candidate_check(candidate.c_str(), candidate.size())) return 1; - r++; // search for the next letter + ++r; // search for the next letter } } + return 0; } @@ -1441,21 +1306,21 @@ int AffixMgr::cpdpat_check(const char* word, hentry* r1, hentry* r2, const char /*affixed*/) { - int len; - for (int i = 0; i < numcheckcpd; i++) { - if (isSubset(checkcpdtable[i].pattern2, word + pos) && + for (size_t i = 0; i < checkcpdtable.size(); ++i) { + size_t len; + if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) && (!r1 || !checkcpdtable[i].cond || (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && (!r2 || !checkcpdtable[i].cond2 || (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && // zero length pattern => only TESTAFF // zero pattern (0/flag) => unmodified stem (zero affixes allowed) - (!*(checkcpdtable[i].pattern) || - ((*(checkcpdtable[i].pattern) == '0' && r1->blen <= pos && 
+ (checkcpdtable[i].pattern.empty() || + ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) || - (*(checkcpdtable[i].pattern) != '0' && - ((len = strlen(checkcpdtable[i].pattern)) != 0) && - strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) { + (checkcpdtable[i].pattern[0] != '0' && + ((len = checkcpdtable[i].pattern.size()) != 0) && + strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) { return 1; } } @@ -1513,7 +1378,6 @@ int AffixMgr::defcpd_check(hentry*** words, std::vector<metachar_data> btinfo(1); short bt = 0; - int i, j; (*words)[wnum] = rv; @@ -1525,10 +1389,10 @@ int AffixMgr::defcpd_check(hentry*** words, return 0; } int ok = 0; - for (i = 0; i < numdefcpd; i++) { - for (j = 0; j < defcpdtable[i].len; j++) { - if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && - TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) { + for (size_t i = 0; i < defcpdtable.size(); ++i) { + for (size_t j = 0; j < defcpdtable[i].size(); ++j) { + if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' && + TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) { ok = 1; break; } @@ -1541,25 +1405,25 @@ int AffixMgr::defcpd_check(hentry*** words, return 0; } - for (i = 0; i < numdefcpd; i++) { - signed short pp = 0; // pattern position + for (size_t i = 0; i < defcpdtable.size(); ++i) { + size_t pp = 0; // pattern position signed short wp = 0; // "words" position int ok2; ok = 1; ok2 = 1; do { - while ((pp < defcpdtable[i].len) && (wp <= wnum)) { - if (((pp + 1) < defcpdtable[i].len) && - ((defcpdtable[i].def[pp + 1] == '*') || - (defcpdtable[i].def[pp + 1] == '?'))) { - int wend = (defcpdtable[i].def[pp + 1] == '?') ? wp : wnum; + while ((pp < defcpdtable[i].size()) && (wp <= wnum)) { + if (((pp + 1) < defcpdtable[i].size()) && + ((defcpdtable[i][pp + 1] == '*') || + (defcpdtable[i][pp + 1] == '?'))) { + int wend = (defcpdtable[i][pp + 1] == '?') ? 
wp : wnum; ok2 = 1; pp += 2; btinfo[bt].btpp = pp; btinfo[bt].btwp = wp; while (wp <= wend) { if (!(*words)[wp]->alen || - !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp - 2], + !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2], (*words)[wp]->alen)) { ok2 = 0; break; @@ -1578,24 +1442,24 @@ int AffixMgr::defcpd_check(hentry*** words, } else { ok2 = 1; if (!(*words)[wp] || !(*words)[wp]->alen || - !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], + !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp], (*words)[wp]->alen)) { ok = 0; break; } pp++; wp++; - if ((defcpdtable[i].len == pp) && !(wp > wnum)) + if ((defcpdtable[i].size() == pp) && !(wp > wnum)) ok = 0; } } if (ok && ok2) { - int r = pp; - while ((defcpdtable[i].len > r) && ((r + 1) < defcpdtable[i].len) && - ((defcpdtable[i].def[r + 1] == '*') || - (defcpdtable[i].def[r + 1] == '?'))) + size_t r = pp; + while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) && + ((defcpdtable[i][r + 1] == '*') || + (defcpdtable[i][r + 1] == '?'))) r += 2; - if (defcpdtable[i].len <= r) + if (defcpdtable[i].size() <= r) return 1; } // backtrack @@ -1608,16 +1472,16 @@ int AffixMgr::defcpd_check(hentry*** words, } while ((btinfo[bt - 1].btnum < 0) && --bt); } while (bt); - if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) + if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp))) return 1; // check zero ending - while (ok && ok2 && (defcpdtable[i].len > pp) && - ((pp + 1) < defcpdtable[i].len) && - ((defcpdtable[i].def[pp + 1] == '*') || - (defcpdtable[i].def[pp + 1] == '?'))) + while (ok && ok2 && (defcpdtable[i].size() > pp) && + ((pp + 1) < defcpdtable[i].size()) && + ((defcpdtable[i][pp + 1] == '*') || + (defcpdtable[i][pp + 1] == '?'))) pp += 2; - if (ok && ok2 && (defcpdtable[i].len <= pp)) + if (ok && ok2 && (defcpdtable[i].size() <= pp)) return 1; } (*words)[wnum] = NULL; @@ -1627,9 +1491,8 @@ int AffixMgr::defcpd_check(hentry*** words, } inline int AffixMgr::candidate_check(const char* 
word, int len) { - struct hentry* rv = NULL; - rv = lookup(word); + struct hentry* rv = lookup(word); if (rv) return 1; @@ -1651,20 +1514,23 @@ short AffixMgr::get_syllable(const std::string& word) { if (!utf8) { for (size_t i = 0; i < word.size(); ++i) { - if (strchr(cpdvowels, word[i])) - num++; + if (std::binary_search(cpdvowels.begin(), cpdvowels.end(), + word[i])) { + ++num; + } } - } else if (cpdvowels_utf16) { + } else if (!cpdvowels_utf16.empty()) { std::vector<w_char> w; - int i = u8_u16(w, word); - for (; i > 0; i--) { - if (std::binary_search(cpdvowels_utf16, - cpdvowels_utf16 + cpdvowels_utf16_len, - w[i - 1])) { + u8_u16(w, word); + for (size_t i = 0; i < w.size(); ++i) { + if (std::binary_search(cpdvowels_utf16.begin(), + cpdvowels_utf16.end(), + w[i])) { ++num; } } } + return num; } @@ -1687,8 +1553,7 @@ void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) { // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) -struct hentry* AffixMgr::compound_check(const char* word, - int len, +struct hentry* AffixMgr::compound_check(const std::string& word, short wordnum, short numsyllable, short maxwordnum, @@ -1707,19 +1572,19 @@ struct hentry* AffixMgr::compound_check(const char* word, int cmin; int cmax; int striple = 0; - int scpd = 0; + size_t scpd = 0; int soldi = 0; int oldcmin = 0; int oldcmax = 0; int oldlen = 0; int checkedstriple = 0; - int onlycpdrule; char affixed = 0; hentry** oldwords = words; + size_t len = word.size(); int checked_prefix; - setcminmax(&cmin, &cmax, word, len); + setcminmax(&cmin, &cmax, word.c_str(), len); st.assign(word); @@ -1733,7 +1598,7 @@ struct hentry* AffixMgr::compound_check(const char* word, } words = oldwords; - onlycpdrule = (words) ? 1 : 0; + int onlycpdrule = (words) ? 
1 : 0; do { // onlycpdrule loop @@ -1744,26 +1609,26 @@ struct hentry* AffixMgr::compound_check(const char* word, do { // simplified checkcompoundpattern loop if (scpd > 0) { - for (; scpd <= numcheckcpd && - (!checkcpdtable[scpd - 1].pattern3 || - strncmp(word + i, checkcpdtable[scpd - 1].pattern3, - strlen(checkcpdtable[scpd - 1].pattern3)) != 0); + for (; scpd <= checkcpdtable.size() && + (checkcpdtable[scpd - 1].pattern3.empty() || + strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(), + checkcpdtable[scpd - 1].pattern3.size()) != 0); scpd++) ; - if (scpd > numcheckcpd) + if (scpd > checkcpdtable.size()) break; // break simplified checkcompoundpattern loop st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern); soldi = i; - i += strlen(checkcpdtable[scpd - 1].pattern); + i += checkcpdtable[scpd - 1].pattern.size(); st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2); - st.replace(i + strlen(checkcpdtable[scpd - 1].pattern2), std::string::npos, - word + soldi + strlen(checkcpdtable[scpd - 1].pattern3)); + st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos, + word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size())); oldlen = len; - len += strlen(checkcpdtable[scpd - 1].pattern) + - strlen(checkcpdtable[scpd - 1].pattern2) - - strlen(checkcpdtable[scpd - 1].pattern3); + len += checkcpdtable[scpd - 1].pattern.size() + + checkcpdtable[scpd - 1].pattern2.size() - + checkcpdtable[scpd - 1].pattern3.size(); oldcmin = cmin; oldcmax = cmax; setcminmax(&cmin, &cmax, st.c_str(), len); @@ -1791,7 +1656,7 @@ struct hentry* AffixMgr::compound_check(const char* word, TESTAFF(rv->astr, compoundbegin, rv->alen)) || (compoundmiddle && wordnum && !words && !onlycpdrule && TESTAFF(rv->astr, compoundmiddle, rv->alen)) || - (numdefcpd && onlycpdrule && + (!defcpdtable.empty() && onlycpdrule && ((!words && !wordnum && defcpd_check(&words, wnum, rv, rwords, 0)) || (words && @@ -1812,7 +1677,7 @@ struct hentry* 
AffixMgr::compound_check(const char* word, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { if (((rv = suffix_check( - st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag, + st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) && @@ -1829,7 +1694,7 @@ struct hentry* AffixMgr::compound_check(const char* word, if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check( - st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, + st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && (rv = suffix_check_twosfx( @@ -1840,7 +1705,7 @@ struct hentry* AffixMgr::compound_check(const char* word, compoundbegin)))) || ((wordnum > 0) && compoundmiddle && ((rv = suffix_check( - st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, + st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && (rv = suffix_check_twosfx( @@ -1911,8 +1776,7 @@ struct hentry* AffixMgr::compound_check(const char* word, ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || ((oldwordnum > 0) && compoundmiddle && - TESTAFF(rv->astr, compoundmiddle, rv->alen)) // || - // (numdefcpd && ) + TESTAFF(rv->astr, compoundmiddle, rv->alen)) // LANG_hu section: spec. Hungarian rule || ((langnum == LANG_hu) && hu_mov_rule && @@ -1934,7 +1798,7 @@ struct hentry* AffixMgr::compound_check(const char* word, ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0' )) || (checkcompoundcase && scpd == 0 && !words && - cpdcase_check(word, i)))) + cpdcase_check(word.c_str(), i)))) // LANG_hu section: spec. 
Hungarian rule || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st.c_str(), i)) && @@ -1949,7 +1813,7 @@ struct hentry* AffixMgr::compound_check(const char* word, // LANG_hu section: spec. Hungarian rule if (langnum == LANG_hu) { // calculate syllable number of the word - numsyllable += get_syllable(st.substr(i)); + numsyllable += get_syllable(st.substr(0, i)); // + 1 word, if syllable number of the prefix > 1 (hungarian // convention) if (pfx && (get_syllable(pfx->getKey()) > 1)) @@ -1968,7 +1832,7 @@ struct hentry* AffixMgr::compound_check(const char* word, if (striple) { checkedstriple = 1; i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" - } else if (i > 2 && *(word + i - 1) == *(word + i - 2)) + } else if (i > 2 && word[i - 1] == word[i - 2]) striple = 1; } @@ -1981,7 +1845,7 @@ struct hentry* AffixMgr::compound_check(const char* word, TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || - (numdefcpd && words && + (!defcpdtable.empty() && words && defcpd_check(&words, wnum + 1, rv, NULL, 1))) || (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL && !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, @@ -2034,12 +1898,12 @@ struct hentry* AffixMgr::compound_check(const char* word, (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) && (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || ((cpdmaxsyllable != 0) && - (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->clen)) <= + (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <= cpdmaxsyllable))) && ( // test CHECKCOMPOUNDPATTERN - !numcheckcpd || scpd != 0 || - !cpdpat_check(word, i, rv_first, rv, 0)) && + checkcpdtable.empty() || scpd != 0 || + !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) && ((!checkcompounddup || (rv != rv_first))) // test CHECKCOMPOUNDPATTERN conditions && @@ -2047,7 +1911,7 @@ struct hentry* AffixMgr::compound_check(const char* word, TESTAFF(rv->astr, 
checkcpdtable[scpd - 1].cond2, rv->alen))) { // forbid compound word, if it is a non compound word with typical // fault - if (checkcompoundrep && cpdrep_check(word, len)) + if (checkcompoundrep && cpdrep_check(word.c_str(), len)) return NULL; return rv_first; } @@ -2059,18 +1923,18 @@ struct hentry* AffixMgr::compound_check(const char* word, sfx = NULL; sfxflag = FLAG_NULL; rv = (compoundflag && !onlycpdrule) - ? affix_check((word + i), strlen(word + i), compoundflag, + ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag, IN_CPD_END) : NULL; if (!rv && compoundend && !onlycpdrule) { sfx = NULL; pfx = NULL; - rv = affix_check((word + i), strlen(word + i), compoundend, + rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend, IN_CPD_END); } - if (!rv && numdefcpd && words) { - rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END); + if (!rv && !defcpdtable.empty() && words) { + rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END); if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; rv = NULL; @@ -2083,8 +1947,8 @@ struct hentry* AffixMgr::compound_check(const char* word, rv = NULL; // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) - if (rv && numcheckcpd && scpd == 0 && - cpdpat_check(word, i, rv_first, rv, affixed)) + if (rv && !checkcpdtable.empty() && scpd == 0 && + cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) rv = NULL; // check non_compound flag in suffix and prefix @@ -2118,7 +1982,7 @@ struct hentry* AffixMgr::compound_check(const char* word, if (langnum == LANG_hu) { // calculate syllable number of the word - numsyllable += get_syllable(word + i); + numsyllable += get_syllable(word.c_str() + i); // - affix syllable num. 
// XXX only second suffix (inflections, not derivations) @@ -2136,7 +2000,7 @@ struct hentry* AffixMgr::compound_check(const char* word, // increment syllable num, if last word has a SYLLABLENUM flag // and the suffix is beginning `s' - if (cpdsyllablenum) { + if (!cpdsyllablenum.empty()) { switch (sfxflag) { case 'c': { numsyllable += 2; @@ -2171,7 +2035,7 @@ struct hentry* AffixMgr::compound_check(const char* word, ((!checkcompounddup || (rv != rv_first)))) { // forbid compound word, if it is a non compound word with typical // fault - if (checkcompoundrep && cpdrep_check(word, len)) + if (checkcompoundrep && cpdrep_check(word.c_str(), len)) return NULL; return rv_first; } @@ -2180,16 +2044,16 @@ struct hentry* AffixMgr::compound_check(const char* word, wordnum = oldwordnum2; // perhaps second word is a compound word (recursive call) - if (wordnum < maxwordnum) { - rv = compound_check(st.c_str() + i, strlen(st.c_str() + i), wordnum + 1, + if (wordnum + 2 < maxwordnum) { + rv = compound_check(st.substr(i), wordnum + 1, numsyllable, maxwordnum, wnum + 1, words, rwords, 0, is_sug, info); - if (rv && numcheckcpd && + if (rv && !checkcpdtable.empty() && ((scpd == 0 && - cpdpat_check(word, i, rv_first, rv, affixed)) || + cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) || (scpd != 0 && - !cpdpat_check(word, i, rv_first, rv, affixed)))) + !cpdpat_check(word.c_str(), i, rv_first, rv, affixed)))) rv = NULL; } else { rv = NULL; @@ -2198,13 +2062,12 @@ struct hentry* AffixMgr::compound_check(const char* word, // forbid compound word, if it is a non compound word with typical // fault if (checkcompoundrep || forbiddenword) { - struct hentry* rv2 = NULL; - if (checkcompoundrep && cpdrep_check(word, len)) + if (checkcompoundrep && cpdrep_check(word.c_str(), len)) return NULL; // check first part - if (strncmp(rv->word, word + i, rv->blen) == 0) { + if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) { char r = st[i + rv->blen]; st[i + rv->blen] = '\0'; @@ -2214,9 
+2077,9 @@ struct hentry* AffixMgr::compound_check(const char* word, } if (forbiddenword) { - rv2 = lookup(word); + struct hentry* rv2 = lookup(word.c_str()); if (!rv2) - rv2 = affix_check(word, len); + rv2 = affix_check(word.c_str(), len); if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) && (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) { @@ -2248,7 +2111,7 @@ struct hentry* AffixMgr::compound_check(const char* word, scpd++; } while (!onlycpdrule && simplifiedcpd && - scpd <= numcheckcpd); // end of simplifiedcpd loop + scpd <= checkcpdtable.size()); // end of simplifiedcpd loop scpd = 0; wordnum = oldwordnum; @@ -2261,7 +2124,7 @@ struct hentry* AffixMgr::compound_check(const char* word, } else st[i] = ch; - } while (numdefcpd && oldwordnum == 0 && + } while (!defcpdtable.empty() && oldwordnum == 0 && onlycpdrule++ < 1); // end of onlycpd loop } @@ -2278,9 +2141,9 @@ int AffixMgr::compound_check_morph(const char* word, short wnum, hentry** words, hentry** rwords, - char hu_mov_rule = 0, - char** result = NULL, - char* partresult = NULL) { + char hu_mov_rule, + std::string& result, + const std::string* partresult) { int i; short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; int ok = 0; @@ -2291,12 +2154,11 @@ int AffixMgr::compound_check_morph(const char* word, char ch; int checked_prefix; - char presult[MAXLNLEN]; + std::string presult; int cmin; int cmax; - int onlycpdrule; char affixed = 0; hentry** oldwords = words; @@ -2314,7 +2176,7 @@ int AffixMgr::compound_check_morph(const char* word, } words = oldwords; - onlycpdrule = (words) ? 1 : 0; + int onlycpdrule = (words) ? 
1 : 0; do { // onlycpdrule loop @@ -2330,9 +2192,9 @@ int AffixMgr::compound_check_morph(const char* word, affixed = 1; - *presult = '\0'; + presult.clear(); if (partresult) - mystrcat(presult, partresult, MAXLNLEN); + presult.append(*partresult); rv = lookup(st.c_str()); // perhaps without prefix @@ -2345,7 +2207,7 @@ int AffixMgr::compound_check_morph(const char* word, TESTAFF(rv->astr, compoundbegin, rv->alen)) || (compoundmiddle && wordnum && !words && !onlycpdrule && TESTAFF(rv->astr, compoundmiddle, rv->alen)) || - (numdefcpd && onlycpdrule && + (!defcpdtable.empty() && onlycpdrule && ((!words && !wordnum && defcpd_check(&words, wnum, rv, rwords, 0)) || (words && @@ -2357,28 +2219,26 @@ int AffixMgr::compound_check_morph(const char* word, affixed = 0; if (rv) { - sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st.c_str()); + presult.push_back(MSEP_FLD); + presult.append(MORPH_PART); + presult.append(st.c_str()); if (!HENTRY_FIND(rv, MORPH_STEM)) { - sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, - st.c_str()); + presult.push_back(MSEP_FLD); + presult.append(MORPH_STEM); + presult.append(st.c_str()); } - // store the pointer of the hash entry - // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, - // MORPH_HENTRY, rv); if (HENTRY_DATA(rv)) { - sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, - HENTRY_DATA2(rv)); + presult.push_back(MSEP_FLD); + presult.append(HENTRY_DATA2(rv)); } } if (!rv) { - if (onlycpdrule && strlen(*result) > MAXLNLEN / 10) - break; if (compoundflag && !(rv = prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { - if (((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag, hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && @@ -2395,7 +2255,7 @@ int AffixMgr::compound_check_morph(const char* word, if (rv || (((wordnum == 0) && compoundbegin && - ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && @@ -2406,7 +2266,7 @@ int AffixMgr::compound_check_morph(const char* word, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || ((wordnum > 0) && compoundmiddle && - ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, + ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (compoundmoresuffixes && @@ -2416,26 +2276,23 @@ int AffixMgr::compound_check_morph(const char* word, (rv = prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))) { - // char * p = prefix_check_morph(st, i, 0, compound); - char* p = NULL; + std::string p; if (compoundflag) p = affix_check_morph(st.c_str(), i, compoundflag); - if (!p || (*p == '\0')) { - if (p) - free(p); - p = NULL; + if (p.empty()) { if ((wordnum == 0) && compoundbegin) { p = affix_check_morph(st.c_str(), i, compoundbegin); } else if ((wordnum > 0) && compoundmiddle) { p = affix_check_morph(st.c_str(), i, compoundmiddle); } } - if (p && (*p != '\0')) { - sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, MORPH_PART, - st.c_str(), line_uniq_app(&p, MSEP_REC)); + if (!p.empty()) { + presult.push_back(MSEP_FLD); + presult.append(MORPH_PART); + presult.append(st.c_str()); + line_uniq_app(p, MSEP_REC); + presult.append(p); } - if (p) - free(p); checked_prefix = 1; } // else check forbiddenwords @@ -2507,7 +2364,7 @@ int AffixMgr::compound_check_morph(const char* word, )) || ( // test CHECKCOMPOUNDPATTERN - numcheckcpd && !words && + !checkcpdtable.empty() && !words && cpdpat_check(word, i, rv, NULL, affixed)) || 
(checkcompoundcase && !words && cpdcase_check(word, i)))) // LANG_hu section: spec. Hungarian rule @@ -2522,7 +2379,7 @@ int AffixMgr::compound_check_morph(const char* word, // LANG_hu section: spec. Hungarian rule if (langnum == LANG_hu) { // calculate syllable number of the word - numsyllable += get_syllable(st.substr(i)); + numsyllable += get_syllable(st.substr(0, i)); // + 1 word, if syllable number of the prefix > 1 (hungarian // convention) @@ -2541,31 +2398,29 @@ int AffixMgr::compound_check_morph(const char* word, TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || - (numdefcpd && words && + (!defcpdtable.empty() && words && defcpd_check(&words, wnum + 1, rv, NULL, 1))))) { rv = rv->next_homonym; } if (rv && words && words[wnum + 1]) { - mystrcat(*result, presult, MAXLNLEN); - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, MORPH_PART, MAXLNLEN); - mystrcat(*result, word + i, MAXLNLEN); + result.append(presult); + result.append(" "); + result.append(MORPH_PART); + result.append(word + i); if (complexprefixes && HENTRY_DATA(rv)) - mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, MORPH_STEM, MAXLNLEN); - mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); + result.append(" "); + result.append(MORPH_STEM); + result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry - // sprintf(*result + strlen(*result), " %s%p", - // MORPH_HENTRY, rv); if (!complexprefixes && HENTRY_DATA(rv)) { - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(" "); + result.append(HENTRY_DATA2(rv)); } - mystrcat(*result, "\n", MAXLNLEN); + result.append("\n"); return 0; } @@ -2606,28 +2461,26 @@ int AffixMgr::compound_check_morph(const char* word, cpdmaxsyllable))) && ((!checkcompounddup || (rv != rv_first)))) { // bad compound word - 
mystrcat(*result, presult, MAXLNLEN); - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, MORPH_PART, MAXLNLEN); - mystrcat(*result, word + i, MAXLNLEN); + result.append(presult); + result.append(" "); + result.append(MORPH_PART); + result.append(word + i); if (HENTRY_DATA(rv)) { if (complexprefixes) - mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, MORPH_STEM, MAXLNLEN); - mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); + result.append(" "); + result.append(MORPH_STEM); + result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry - // sprintf(*result + strlen(*result), " - // %s%p", MORPH_HENTRY, rv); if (!complexprefixes) { - mystrcat(*result, " ", MAXLNLEN); - mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(" "); + result.append(HENTRY_DATA2(rv)); } } - mystrcat(*result, "\n", MAXLNLEN); + result.append("\n"); ok = 1; } @@ -2649,27 +2502,24 @@ int AffixMgr::compound_check_morph(const char* word, rv = affix_check((word + i), strlen(word + i), compoundend); } - if (!rv && numdefcpd && words) { + if (!rv && !defcpdtable.empty() && words) { rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END); if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { - char* m = NULL; + std::string m; if (compoundflag) m = affix_check_morph((word + i), strlen(word + i), compoundflag); - if ((!m || *m == '\0') && compoundend) { - if (m) - free(m); + if (m.empty() && compoundend) { m = affix_check_morph((word + i), strlen(word + i), compoundend); } - mystrcat(*result, presult, MAXLNLEN); - if (m || (*m != '\0')) { - char m2[MAXLNLEN]; - sprintf(m2, "%c%s%s%s", MSEP_FLD, MORPH_PART, word + i, - line_uniq_app(&m, MSEP_REC)); - mystrcat(*result, m2, MAXLNLEN); + result.append(presult); + if (!m.empty()) { + result.push_back(MSEP_FLD); + result.append(MORPH_PART); + result.append(word + i); + line_uniq_app(m, 
MSEP_REC); + result.append(m); } - if (m) - free(m); - mystrcat(*result, "\n", MAXLNLEN); + result.append("\n"); ok = 1; } } @@ -2713,7 +2563,7 @@ int AffixMgr::compound_check_morph(const char* word, // increment syllable num, if last word has a SYLLABLENUM flag // and the suffix is beginning `s' - if (cpdsyllablenum) { + if (!cpdsyllablenum.empty()) { switch (sfxflag) { case 'c': { numsyllable += 2; @@ -2745,25 +2595,21 @@ int AffixMgr::compound_check_morph(const char* word, (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && ((!checkcompounddup || (rv != rv_first)))) { - char* m = NULL; + std::string m; if (compoundflag) m = affix_check_morph((word + i), strlen(word + i), compoundflag); - if ((!m || *m == '\0') && compoundend) { - if (m) - free(m); + if (m.empty() && compoundend) { m = affix_check_morph((word + i), strlen(word + i), compoundend); } - mystrcat(*result, presult, MAXLNLEN); - if (m && (*m != '\0')) { - char m2[MAXLNLEN]; - sprintf(m2, "%c%s%s%s", MSEP_FLD, MORPH_PART, word + i, - line_uniq_app(&m, MSEP_REC)); - mystrcat(*result, m2, MAXLNLEN); + result.append(presult); + if (!m.empty()) { + result.push_back(MSEP_FLD); + result.append(MORPH_PART); + result.append(word + 1); + line_uniq_app(m, MSEP_REC); + result.append(m); } - if (m) - free(m); - if (strlen(*result) + 1 < MAXLNLEN) - sprintf(*result + strlen(*result), "%c", MSEP_REC); + result.push_back(MSEP_REC); ok = 1; } @@ -2771,10 +2617,10 @@ int AffixMgr::compound_check_morph(const char* word, wordnum = oldwordnum2; // perhaps second word is a compound word (recursive call) - if ((wordnum < maxwordnum) && (ok == 0)) { + if ((wordnum + 2 < maxwordnum) && (ok == 0)) { compound_check_morph((word + i), strlen(word + i), wordnum + 1, numsyllable, maxwordnum, wnum + 1, words, rwords, 0, - result, presult); + result, &presult); } else { rv = NULL; } @@ -2783,26 +2629,13 @@ int AffixMgr::compound_check_morph(const char* word, wordnum = 
oldwordnum; numsyllable = oldnumsyllable; - } while (numdefcpd && oldwordnum == 0 && + } while (!defcpdtable.empty() && oldwordnum == 0 && onlycpdrule++ < 1); // end of onlycpd loop } return 0; } -// return 1 if s1 (reversed) is a leading subset of end of s2 -/* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int - len) - { - while ((len > 0) && *s1 && (*s1 == *end_of_s2)) { - s1++; - end_of_s2--; - len--; - } - return (*s1 == '\0'); - } - */ - inline int AffixMgr::isRevSubset(const char* s1, const char* end_of_s2, int len) { @@ -2815,14 +2648,10 @@ inline int AffixMgr::isRevSubset(const char* s1, } // check word for suffixes - struct hentry* AffixMgr::suffix_check(const char* word, int len, int sfxopts, PfxEntry* ppfx, - char** wlst, - int maxSug, - int* ns, const FLAG cclass, const FLAG needflag, char in_compound) { @@ -2861,7 +2690,7 @@ struct hentry* AffixMgr::suffix_check(const char* word, (ppfx && !((ep->getCont()) && TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) { - rv = se->checkword(word, len, sfxopts, ppfx, wlst, maxSug, ns, + rv = se->checkword(word, len, sfxopts, ppfx, (FLAG)cclass, needflag, (in_compound ? 0 : onlyincompound)); if (rv) { @@ -2912,7 +2741,7 @@ struct hentry* AffixMgr::suffix_check(const char* word, if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { - rv = sptr->checkword(word, len, sfxopts, ppfx, wlst, maxSug, ns, + rv = sptr->checkword(word, len, sfxopts, ppfx, cclass, needflag, (in_compound ? 
0 : onlyincompound)); if (rv) { @@ -2985,23 +2814,21 @@ struct hentry* AffixMgr::suffix_check_twosfx(const char* word, return NULL; } -char* AffixMgr::suffix_check_twosfx_morph(const char* word, - int len, - int sfxopts, - PfxEntry* ppfx, - const FLAG needflag) { +std::string AffixMgr::suffix_check_twosfx_morph(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG needflag) { std::string result; std::string result2; std::string result3; - char* st; - // first handle the special case of 0 length suffixes SfxEntry* se = sStart[0]; while (se) { if (contclasses[se->getFlag()]) { - st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); - if (st) { + std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); + if (!st.empty()) { if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); @@ -3010,7 +2837,6 @@ char* AffixMgr::suffix_check_twosfx_morph(const char* word, debugflag(result, ppfx->getFlag()); } result.append(st); - free(st); if (se->getMorph()) { result.append(" "); result.append(se->getMorph()); @@ -3024,20 +2850,19 @@ char* AffixMgr::suffix_check_twosfx_morph(const char* word, // now handle the general case if (len == 0) - return NULL; // FULLSTRIP + return std::string(); // FULLSTRIP unsigned char sp = *((const unsigned char*)(word + len - 1)); SfxEntry* sptr = sStart[sp]; while (sptr) { if (isRevSubset(sptr->getKey(), word + len - 1, len)) { if (contclasses[sptr->getFlag()]) { - st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); - if (st) { + std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag); + if (!st.empty()) { sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless if (!sptr->getCont()) sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless result2.assign(st); - free(st); result3.clear(); @@ -3057,25 +2882,20 @@ char* AffixMgr::suffix_check_twosfx_morph(const char* word, } } - if (!result.empty()) - return mystrdup(result.c_str()); - - return NULL; 
+ return result; } -char* AffixMgr::suffix_check_morph(const char* word, - int len, - int sfxopts, - PfxEntry* ppfx, - const FLAG cclass, - const FLAG needflag, - char in_compound) { - char result[MAXLNLEN]; +std::string AffixMgr::suffix_check_morph(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG cclass, + const FLAG needflag, + char in_compound) { + std::string result; struct hentry* rv = NULL; - result[0] = '\0'; - PfxEntry* ep = ppfx; // first handle the special case of 0 length suffixes @@ -3109,37 +2929,34 @@ char* AffixMgr::suffix_check_morph(const char* word, (ppfx && !((ep->getCont()) && TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))) - rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, - needflag); + rv = se->checkword(word, len, sfxopts, ppfx, cclass, + needflag, FLAG_NULL); while (rv) { if (ppfx) { if (ppfx->getMorph()) { - mystrcat(result, ppfx->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); + result.append(ppfx->getMorph()); + result.append(" "); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + result.append(" "); + result.append(MORPH_STEM); + result.append(HENTRY_WORD(rv)); } - // store the pointer of the hash entry - // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, - // rv); if (!complexprefixes && HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(" "); + result.append(HENTRY_DATA2(rv)); } if (se->getMorph()) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, se->getMorph(), MAXLNLEN); + result.append(" "); + result.append(se->getMorph()); } else debugflag(result, se->getFlag()); - mystrcat(result, "\n", MAXLNLEN); + 
result.append("\n"); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -3148,7 +2965,7 @@ char* AffixMgr::suffix_check_morph(const char* word, // now handle the general case if (len == 0) - return NULL; // FULLSTRIP + return std::string(); // FULLSTRIP unsigned char sp = *((const unsigned char*)(word + len - 1)); SfxEntry* sptr = sStart[sp]; @@ -3179,38 +2996,35 @@ char* AffixMgr::suffix_check_morph(const char* word, (cclass || !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))))) - rv = sptr->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, - needflag); + rv = sptr->checkword(word, len, sfxopts, ppfx, cclass, + needflag, FLAG_NULL); while (rv) { if (ppfx) { if (ppfx->getMorph()) { - mystrcat(result, ppfx->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); + result.append(ppfx->getMorph()); + result.append(" "); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + result.append(" "); + result.append(MORPH_STEM); + result.append(HENTRY_WORD(rv)); } - // store the pointer of the hash entry - // sprintf(result + strlen(result), " %s%p", - // MORPH_HENTRY, rv); if (!complexprefixes && HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(" "); + result.append(HENTRY_DATA2(rv)); } if (sptr->getMorph()) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, sptr->getMorph(), MAXLNLEN); + result.append(" "); + result.append(sptr->getMorph()); } else debugflag(result, sptr->getFlag()); - mystrcat(result, "\n", MAXLNLEN); + result.append("\n"); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -3219,9 +3033,7 @@ char* 
AffixMgr::suffix_check_morph(const char* word, } } - if (*result) - return mystrdup(result); - return NULL; + return result; } // check if word with affixes is correctly spelled @@ -3229,16 +3041,14 @@ struct hentry* AffixMgr::affix_check(const char* word, int len, const FLAG needflag, char in_compound) { - struct hentry* rv = NULL; // check all prefixes (also crossed with suffixes if allowed) - rv = prefix_check(word, len, in_compound, needflag); + struct hentry* rv = prefix_check(word, len, in_compound, needflag); if (rv) return rv; // if still not found check all suffixes - rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, - in_compound); + rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound); if (havecontclass) { sfx = NULL; @@ -3259,27 +3069,22 @@ struct hentry* AffixMgr::affix_check(const char* word, } // check if word with affixes is correctly spelled -char* AffixMgr::affix_check_morph(const char* word, +std::string AffixMgr::affix_check_morph(const char* word, int len, const FLAG needflag, char in_compound) { - char result[MAXLNLEN]; - char* st = NULL; - - *result = '\0'; + std::string result; // check all prefixes (also crossed with suffixes if allowed) - st = prefix_check_morph(word, len, in_compound); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = prefix_check_morph(word, len, in_compound); + if (!st.empty()) { + result.append(st); } // if still not found check all suffixes st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + if (!st.empty()) { + result.append(st); } if (havecontclass) { @@ -3287,39 +3092,120 @@ char* AffixMgr::affix_check_morph(const char* word, pfx = NULL; // if still not found check all two-level suffixes st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + if (!st.empty()) { + result.append(st); } // if still not 
found check all two-level suffixes st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + if (!st.empty()) { + result.append(st); } } - return mystrdup(result); + return result; +} + +// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields +// in the first line of the inputs +// return 0, if inputs equal +// return 1, if inputs may equal with a secondary suffix +// otherwise return -1 +static int morphcmp(const char* s, const char* t) { + int se = 0; + int te = 0; + const char* sl; + const char* tl; + const char* olds; + const char* oldt; + if (!s || !t) + return 1; + olds = s; + sl = strchr(s, '\n'); + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) + s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + tl = strchr(t, '\n'); + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) + t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + while (s && t && (!sl || sl > s) && (!tl || tl > t)) { + s += MORPH_TAG_LEN; + t += MORPH_TAG_LEN; + se = 0; + te = 0; + while ((*s == *t) && !se && !te) { + s++; + t++; + switch (*s) { + case ' ': + case '\n': + case '\t': + case '\0': + se = 1; + } + switch (*t) { + case ' ': + case '\n': + case '\t': + case '\0': + te = 1; + } + } + if (!se || !te) { + // not terminal suffix difference + if (olds) + return -1; + return 1; + } + olds = s; + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) + s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) + t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + } + if (!s && !t && se && te) + return 0; + return 1; } -char* 
AffixMgr::morphgen(const char* ts, - int wl, - const unsigned short* ap, - unsigned short al, - const char* morph, - const char* targetmorph, +std::string AffixMgr::morphgen(const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* morph, + const char* targetmorph, int level) { // handle suffixes if (!morph) - return NULL; + return std::string(); // check substandard flag if (TESTAFF(ap, substandard, al)) - return NULL; + return std::string(); if (morphcmp(morph, targetmorph) == 0) - return mystrdup(ts); + return ts; size_t stemmorphcatpos; std::string mymorph; @@ -3352,41 +3238,36 @@ char* AffixMgr::morphgen(const char* ts, int cmp = morphcmp(stemmorph, targetmorph); if (cmp == 0) { - char* newword = sptr->add(ts, wl); - if (newword) { - hentry* check = pHMgr->lookup(newword); // XXX extra dic + std::string newword = sptr->add(ts, wl); + if (!newword.empty()) { + hentry* check = pHMgr->lookup(newword.c_str()); // XXX extra dic if (!check || !check->astr || !(TESTAFF(check->astr, forbiddenword, check->alen) || TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { return newword; } - free(newword); } } // recursive call for secondary suffixes if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && - // (get_sfxcount(stemmorph) < targetcount) && !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { - char* newword = sptr->add(ts, wl); - if (newword) { - char* newword2 = - morphgen(newword, strlen(newword), sptr->getCont(), + std::string newword = sptr->add(ts, wl); + if (!newword.empty()) { + std::string newword2 = + morphgen(newword.c_str(), newword.size(), sptr->getCont(), sptr->getContLen(), stemmorph, targetmorph, 1); - if (newword2) { - free(newword); + if (!newword2.empty()) { return newword2; } - free(newword); - newword = NULL; } } } sptr = sptr->getFlgNxt(); } } - return NULL; + return std::string(); } int AffixMgr::expand_rootword(struct guessword* wlst, @@ -3406,7 +3287,7 @@ int AffixMgr::expand_rootword(struct 
guessword* wlst, wlst[nh].word = mystrdup(ts); if (!wlst[nh].word) return 0; - wlst[nh].allow = (1 == 0); + wlst[nh].allow = false; wlst[nh].orig = NULL; nh++; // add special phonetic version @@ -3414,7 +3295,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst, wlst[nh].word = mystrdup(phon); if (!wlst[nh].word) return nh - 1; - wlst[nh].allow = (1 == 0); + wlst[nh].allow = false; wlst[nh].orig = mystrdup(ts); if (!wlst[nh].orig) return nh - 1; @@ -3439,10 +3320,10 @@ int AffixMgr::expand_rootword(struct guessword* wlst, TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || (onlyincompound && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) { - char* newword = sptr->add(ts, wl); - if (newword) { + std::string newword = sptr->add(ts, wl); + if (!newword.empty()) { if (nh < maxn) { - wlst[nh].word = newword; + wlst[nh].word = mystrdup(newword.c_str()); wlst[nh].allow = sptr->allowCross(); wlst[nh].orig = NULL; nh++; @@ -3455,14 +3336,12 @@ int AffixMgr::expand_rootword(struct guessword* wlst, wlst[nh].word = mystrdup(prefix.c_str()); if (!wlst[nh].word) return nh - 1; - wlst[nh].allow = (1 == 0); - wlst[nh].orig = mystrdup(newword); + wlst[nh].allow = false; + wlst[nh].orig = mystrdup(newword.c_str()); if (!wlst[nh].orig) return nh - 1; nh++; } - } else { - free(newword); } } } @@ -3484,15 +3363,13 @@ int AffixMgr::expand_rootword(struct guessword* wlst, ((badl > cptr->getKeyLen()) && (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { int l1 = strlen(wlst[j].word); - char* newword = cptr->add(wlst[j].word, l1); - if (newword) { + std::string newword = cptr->add(wlst[j].word, l1); + if (!newword.empty()) { if (nh < maxn) { - wlst[nh].word = newword; + wlst[nh].word = mystrdup(newword.c_str()); wlst[nh].allow = cptr->allowCross(); wlst[nh].orig = NULL; nh++; - } else { - free(newword); } } } @@ -3518,15 +3395,13 @@ int AffixMgr::expand_rootword(struct guessword* wlst, TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || 
(onlyincompound && TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) { - char* newword = ptr->add(ts, wl); - if (newword) { + std::string newword = ptr->add(ts, wl); + if (!newword.empty()) { if (nh < maxn) { - wlst[nh].word = newword; + wlst[nh].word = mystrdup(newword.c_str()); wlst[nh].allow = ptr->allowCross(); wlst[nh].orig = NULL; nh++; - } else { - free(newword); } } } @@ -3537,15 +3412,8 @@ int AffixMgr::expand_rootword(struct guessword* wlst, return nh; } -// return length of replacing table -int AffixMgr::get_numrep() const { - return numrep; -} - // return replacing table -struct replentry* AffixMgr::get_reptable() const { - if (!reptable) - return NULL; +const std::vector<replentry>& AffixMgr::get_reptable() const { return reptable; } @@ -3570,35 +3438,21 @@ struct phonetable* AffixMgr::get_phonetable() const { return phone; } -// return length of character map table -int AffixMgr::get_nummap() const { - return nummap; -} - // return character map table -struct mapentry* AffixMgr::get_maptable() const { - if (!maptable) - return NULL; +const std::vector<mapentry>& AffixMgr::get_maptable() const { return maptable; } -// return length of word break table -int AffixMgr::get_numbreak() const { - return numbreak; -} - // return character map table -char** AffixMgr::get_breaktable() const { - if (!breaktable) - return NULL; +const std::vector<std::string>& AffixMgr::get_breaktable() const { return breaktable; } // return text encoding of dictionary -char* AffixMgr::get_encoding() { - if (!encoding) - encoding = mystrdup(SPELL_ENCODING); - return mystrdup(encoding); +const std::string& AffixMgr::get_encoding() { + if (encoding.empty()) + encoding = SPELL_ENCODING; + return encoding; } // return text encoding of dictionary @@ -3641,10 +3495,10 @@ char* AffixMgr::encode_flag(unsigned short aflag) const { } // return the preferred ignore string for suggestions -char* AffixMgr::get_ignore() const { - if (!ignorechars) +const char* 
AffixMgr::get_ignore() const { + if (ignorechars.empty()) return NULL; - return ignorechars; + return ignorechars.c_str(); } // return the preferred ignore string for suggestions @@ -3654,20 +3508,20 @@ const std::vector<w_char>& AffixMgr::get_ignore_utf16() const { // return the keyboard string for suggestions char* AffixMgr::get_key_string() { - if (!keystring) - keystring = mystrdup(SPELL_KEYSTRING); - return mystrdup(keystring); + if (keystring.empty()) + keystring = SPELL_KEYSTRING; + return mystrdup(keystring.c_str()); } // return the preferred try string for suggestions char* AffixMgr::get_try_string() const { - if (!trystring) + if (trystring.empty()) return NULL; - return mystrdup(trystring); + return mystrdup(trystring.c_str()); } // return the preferred try string for suggestions -const char* AffixMgr::get_wordchars() const { +const std::string& AffixMgr::get_wordchars() const { return wordchars; } @@ -3677,7 +3531,7 @@ const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const { // is there compounding? 
int AffixMgr::get_compound() const { - return compoundflag || compoundbegin || numdefcpd; + return compoundflag || compoundbegin || !defcpdtable.empty(); } // return the compound words control flag @@ -3710,49 +3564,16 @@ FLAG AffixMgr::get_onlyincompound() const { return onlyincompound; } -// return the compound word signal flag -FLAG AffixMgr::get_compoundroot() const { - return compoundroot; -} - -// return the compound begin signal flag -FLAG AffixMgr::get_compoundbegin() const { - return compoundbegin; -} - -// return the value of checknum -int AffixMgr::get_checknum() const { - return checknum; -} - -// return the value of prefix -const char* AffixMgr::get_prefix() const { - if (pfx) - return pfx->getKey(); - return NULL; -} - // return the value of suffix -const char* AffixMgr::get_suffix() const { - return sfxappnd; -} - -// return the value of suffix -const char* AffixMgr::get_version() const { +const std::string& AffixMgr::get_version() const { return version; } -// return lemma_present flag -FLAG AffixMgr::get_lemma_present() const { - return lemma_present; -} - // utility method to look up root words in hash table struct hentry* AffixMgr::lookup(const char* word) { - int i; struct hentry* he = NULL; - for (i = 0; i < *maxdic && !he; i++) { - he = (alldic[i])->lookup(word); + for (size_t i = 0; i < alldic.size() && !he; ++i) { + he = alldic[i]->lookup(word); } return he; } @@ -3794,839 +3615,751 @@ int AffixMgr::get_sugswithdots(void) const { } /* parse flag */ -int AffixMgr::parse_flag(char* line, unsigned short* out, FileMgr* af) { - char* s = NULL; +bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) { if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { HUNSPELL_WARNING( stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); - return 1; + return false; } - if (parse_string(line, &s, af->getlinenum())) - return 1; - *out = pHMgr->decode_flag(s); - free(s); - return 0; + 
std::string s; + if (!parse_string(line, s, af->getlinenum())) + return false; + *out = pHMgr->decode_flag(s.c_str()); + return true; } /* parse num */ -int AffixMgr::parse_num(char* line, int* out, FileMgr* af) { - char* s = NULL; +bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) { if (*out != -1) { HUNSPELL_WARNING( stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); - return 1; + return false; } - if (parse_string(line, &s, af->getlinenum())) - return 1; - *out = atoi(s); - free(s); - return 0; + std::string s; + if (!parse_string(line, s, af->getlinenum())) + return false; + *out = atoi(s.c_str()); + return true; } /* parse in the max syllablecount of compound words and */ -int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) { - char* tp = line; - char* piece; +bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) { int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - cpdmaxsyllable = atoi(piece); - np++; - break; - } - case 2: { - if (!utf8) { - cpdvowels = mystrdup(piece); - } else { - std::vector<w_char> w; - u8_u16(w, piece); - if (!w.empty()) { - std::sort(w.begin(), w.end()); - cpdvowels_utf16 = (w_char*)malloc(w.size() * sizeof(w_char)); - if (!cpdvowels_utf16) - return 1; - memcpy(cpdvowels_utf16, &w[0], w.size()); - } - cpdvowels_utf16_len = w.size(); - } - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str()); + np++; + break; + } + case 2: { + if (!utf8) { + cpdvowels.assign(start_piece, iter); + std::sort(cpdvowels.begin(), cpdvowels.end()); + } else { + std::string piece(start_piece, iter); + u8_u16(cpdvowels_utf16, piece); 
+ std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end()); } - default: - break; + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np < 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum()); - return 1; + return false; } if (np == 2) - cpdvowels = mystrdup("aeiouAEIOU"); - return 0; + cpdvowels = "AEIOUaeiou"; + return true; } /* parse in the typical fault correcting table */ -int AffixMgr::parse_reptable(char* line, FileMgr* af) { - if (numrep != 0) { +bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) { + if (parsedrep) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + parsedrep = true; + int numrep = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numrep = atoi(piece); - if (numrep < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", - af->getlinenum()); - return 1; - } - reptable = (replentry*)malloc(numrep * sizeof(struct replentry)); - if (!reptable) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrep = atoi(std::string(start_piece, iter).c_str()); + if (numrep < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return false; } - default: - break; + reptable.reserve(numrep); + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now 
parse the numrep lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < numrep; j++) { - if ((nl = af->getline()) == NULL) - return 1; + for (int j = 0; j < numrep; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; + reptable.push_back(replentry()); + iter = nl.begin(); i = 0; - reptable[j].pattern = NULL; - reptable[j].pattern2 = NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "REP", 3) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numrep = 0; - return 1; - } - break; - } - case 1: { - if (*piece == '^') - reptable[j].start = true; - else - reptable[j].start = false; - reptable[j].pattern = - mystrrep(mystrdup(piece + int(reptable[j].start)), "_", " "); - int lr = strlen(reptable[j].pattern) - 1; - if (reptable[j].pattern[lr] == '$') { - reptable[j].end = true; - reptable[j].pattern[lr] = '\0'; - } else - reptable[j].end = false; - break; + int type = 0; + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + reptable.clear(); + return false; } - case 2: { - reptable[j].pattern2 = mystrrep(mystrdup(piece), "_", " "); - break; + break; + } + case 1: { + if (*start_piece == '^') + type = 1; + reptable.back().pattern.assign(start_piece + type, iter); + mystrrep(reptable.back().pattern, "_", " "); + if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { + type += 2; + reptable.back().pattern.resize(reptable.back().pattern.size() - 1); } - default: - break; + break; } - i++; + case 2: { + reptable.back().outstrings[type].assign(start_piece, iter); + mystrrep(reptable.back().outstrings[type], "_", " "); + break; + } + default: + break; } 
- piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(nl, iter); } - if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { + if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - numrep = 0; - return 1; + reptable.clear(); + return false; } } - return 0; + return true; } /* parse in the typical fault correcting table */ -int AffixMgr::parse_convtable(char* line, +bool AffixMgr::parse_convtable(const std::string& line, FileMgr* af, RepList** rl, - const char* keyword) { + const std::string& keyword) { if (*rl) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; int i = 0; int np = 0; int numrl = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numrl = atoi(piece); - if (numrl < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", - af->getlinenum()); - return 1; - } - *rl = new RepList(numrl); - if (!*rl) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrl = atoi(std::string(start_piece, iter).c_str()); + if (numrl < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return false; } - default: - break; + *rl = new RepList(numrl); + if (!*rl) + return false; + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the num lines to read in the remainder of the table */ - char* nl; for (int j 
= 0; j < numrl; j++) { - if (!(nl = af->getline())) - return 1; + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - char* pattern = NULL; - char* pattern2 = NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { + std::string pattern; + std::string pattern2; + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + { switch (i) { case 0: { - if (strncmp(piece, keyword, strlen(keyword)) != 0) { + if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); delete *rl; *rl = NULL; - return 1; + return false; } break; } case 1: { - pattern = mystrrep(mystrdup(piece), "_", " "); + pattern.assign(start_piece, iter); break; } case 2: { - pattern2 = mystrrep(mystrdup(piece), "_", " "); + pattern2.assign(start_piece, iter); break; } default: break; } - i++; + ++i; } - piece = mystrsep(&tp, 0); + start_piece = mystrsep(nl, iter); } - if (!pattern || !pattern2) { - if (pattern) - free(pattern); - if (pattern2) - free(pattern2); + if (pattern.empty() || pattern2.empty()) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - return 1; + return false; } (*rl)->add(pattern, pattern2); } - return 0; + return true; } /* parse in the typical fault correcting table */ -int AffixMgr::parse_phonetable(char* line, FileMgr* af) { +bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) { if (phone) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + int num = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - phone = (phonetable*)malloc(sizeof(struct phonetable)); - if (!phone) - return 1; - phone->num = atoi(piece); - 
phone->rules = NULL; - phone->utf8 = (char)utf8; - if (phone->num < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - phone->rules = (char**)malloc(2 * (phone->num + 1) * sizeof(char*)); - if (!phone->rules) { - free(phone); - phone = NULL; - return 1; - } - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + num = atoi(std::string(start_piece, iter).c_str()); + if (num < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - default: - break; + phone = new phonetable; + phone->utf8 = (char)utf8; + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the phone->num lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < phone->num; j++) { - if (!(nl = af->getline())) - return 1; + for (int j = 0; j < num; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - phone->rules[j * 2] = NULL; - phone->rules[j * 2 + 1] = NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { + const size_t old_size = phone->rules.size(); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + { switch (i) { case 0: { - if (strncmp(piece, "PHONE", 5) != 0) { + if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - phone->num = 0; - return 1; + return false; } break; } case 1: { - phone->rules[j * 2] = mystrrep(mystrdup(piece), "_", ""); + 
phone->rules.push_back(std::string(start_piece, iter)); break; } case 2: { - phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece), "_", ""); + phone->rules.push_back(std::string(start_piece, iter)); + mystrrep(phone->rules.back(), "_", ""); break; } default: break; } - i++; + ++i; } - piece = mystrsep(&tp, 0); + start_piece = mystrsep(nl, iter); } - if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { + if (phone->rules.size() != old_size + 2) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - phone->num = 0; - return 1; + phone->rules.clear(); + return false; } } - phone->rules[phone->num * 2] = mystrdup(""); - phone->rules[phone->num * 2 + 1] = mystrdup(""); + phone->rules.push_back(""); + phone->rules.push_back(""); init_phonet_hash(*phone); - return 0; + return true; } /* parse in the checkcompoundpattern table */ -int AffixMgr::parse_checkcpdtable(char* line, FileMgr* af) { - if (numcheckcpd != 0) { +bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) { + if (parsedcheckcpd) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + parsedcheckcpd = true; + int numcheckcpd = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numcheckcpd = atoi(piece); - if (numcheckcpd < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - checkcpdtable = - (patentry*)malloc(numcheckcpd * sizeof(struct patentry)); - if (!checkcpdtable) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numcheckcpd = atoi(std::string(start_piece, iter).c_str()); + if (numcheckcpd 
< 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - default: - break; + checkcpdtable.reserve(numcheckcpd); + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the numcheckcpd lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < numcheckcpd; j++) { - if (!(nl = af->getline())) - return 1; + for (int j = 0; j < numcheckcpd; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - checkcpdtable[j].pattern = NULL; - checkcpdtable[j].pattern2 = NULL; - checkcpdtable[j].pattern3 = NULL; - checkcpdtable[j].cond = FLAG_NULL; - checkcpdtable[j].cond2 = FLAG_NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "CHECKCOMPOUNDPATTERN", 20) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numcheckcpd = 0; - return 1; - } - break; - } - case 1: { - checkcpdtable[j].pattern = mystrdup(piece); - char* p = strchr(checkcpdtable[j].pattern, '/'); - if (p) { - *p = '\0'; - checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); - } - break; + checkcpdtable.push_back(patentry()); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return false; } - case 2: { - checkcpdtable[j].pattern2 = mystrdup(piece); - char* p = strchr(checkcpdtable[j].pattern2, '/'); - if (p) { - *p = '\0'; - checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); - } - break; + break; + } + case 1: { + 
checkcpdtable.back().pattern.assign(start_piece, iter); + size_t slash_pos = checkcpdtable.back().pattern.find('/'); + if (slash_pos != std::string::npos) { + std::string chunk(checkcpdtable.back().pattern, slash_pos + 1); + checkcpdtable.back().pattern.resize(slash_pos); + checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str()); } - case 3: { - checkcpdtable[j].pattern3 = mystrdup(piece); - simplifiedcpd = 1; - break; + break; + } + case 2: { + checkcpdtable.back().pattern2.assign(start_piece, iter); + size_t slash_pos = checkcpdtable.back().pattern2.find('/'); + if (slash_pos != std::string::npos) { + std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1); + checkcpdtable.back().pattern2.resize(slash_pos); + checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str()); } - default: - break; + break; + } + case 3: { + checkcpdtable.back().pattern3.assign(start_piece, iter); + simplifiedcpd = 1; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); - } - if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numcheckcpd = 0; - return 1; + i++; + start_piece = mystrsep(nl, iter); } } - return 0; + return true; } /* parse in the compound rule table */ -int AffixMgr::parse_defcpdtable(char* line, FileMgr* af) { - if (numdefcpd != 0) { +bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) { + if (parseddefcpd) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + parseddefcpd = true; + int numdefcpd = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numdefcpd = atoi(piece); - if (numdefcpd < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - 
defcpdtable = (flagentry*)malloc(numdefcpd * sizeof(flagentry)); - if (!defcpdtable) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numdefcpd = atoi(std::string(start_piece, iter).c_str()); + if (numdefcpd < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - default: - break; + defcpdtable.reserve(numdefcpd); + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the numdefcpd lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < numdefcpd; j++) { - if (!(nl = af->getline())) - return 1; + for (int j = 0; j < numdefcpd; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - defcpdtable[j].def = NULL; - defcpdtable[j].len = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numdefcpd = 0; - return 1; - } - break; + defcpdtable.push_back(flagentry()); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numdefcpd = 0; + return false; } - case 1: { // handle parenthesized flags - if (strchr(piece, '(')) { - defcpdtable[j].def = (FLAG*)malloc(strlen(piece) * sizeof(FLAG)); - defcpdtable[j].len = 0; - int end = 0; - FLAG* conv; - while 
(!end) { - char* par = piece + 1; - while (*par != '(' && *par != ')' && *par != '\0') - par++; - if (*par == '\0') - end = 1; - else - *par = '\0'; - if (*piece == '(') - piece++; - if (*piece == '*' || *piece == '?') { - defcpdtable[j].def[defcpdtable[j].len++] = (FLAG)*piece; - } else if (*piece != '\0') { - int l = pHMgr->decode_flags(&conv, piece, af); - for (int k = 0; k < l; k++) - defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; - free(conv); + break; + } + case 1: { // handle parenthesized flags + if (std::find(start_piece, iter, '(') != iter) { + for (std::string::const_iterator k = start_piece; k != iter; ++k) { + std::string::const_iterator chb = k; + std::string::const_iterator che = k + 1; + if (*k == '(') { + std::string::const_iterator parpos = std::find(k, iter, ')'); + if (parpos != iter) { + chb = k + 1; + che = parpos; + k = parpos; } - piece = par + 1; } - } else { - defcpdtable[j].len = - pHMgr->decode_flags(&(defcpdtable[j].def), piece, af); + + if (*chb == '*' || *chb == '?') { + defcpdtable.back().push_back((FLAG)*chb); + } else { + pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af); + } } - break; + } else { + pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af); } - default: - break; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(nl, iter); } - if (!defcpdtable[j].len) { + if (defcpdtable.back().empty()) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - numdefcpd = 0; - return 1; + return false; } } - return 0; + return true; } /* parse in the character map table */ -int AffixMgr::parse_maptable(char* line, FileMgr* af) { - if (nummap != 0) { +bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) { + if (parsedmaptable) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + parsedmaptable = 
true; + int nummap = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - nummap = atoi(piece); - if (nummap < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - maptable = (mapentry*)malloc(nummap * sizeof(struct mapentry)); - if (!maptable) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + nummap = atoi(std::string(start_piece, iter).c_str()); + if (nummap < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - default: - break; + maptable.reserve(nummap); + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the nummap lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < nummap; j++) { - if (!(nl = af->getline())) - return 1; + for (int j = 0; j < nummap; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - maptable[j].set = NULL; - maptable[j].len = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "MAP", 3) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - nummap = 0; - return 1; - } - break; + maptable.push_back(mapentry()); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is 
corrupt\n", + af->getlinenum()); + nummap = 0; + return false; } - case 1: { - int setn = 0; - maptable[j].len = strlen(piece); - maptable[j].set = (char**)malloc(maptable[j].len * sizeof(char*)); - if (!maptable[j].set) - return 1; - for (int k = 0; k < maptable[j].len; k++) { - int chl = 1; - int chb = k; - if (piece[k] == '(') { - char* parpos = strchr(piece + k, ')'); - if (parpos != NULL) { - chb = k + 1; - chl = (int)(parpos - piece) - k - 1; - k = k + chl + 1; - } - } else { - if (utf8 && (piece[k] & 0xc0) == 0xc0) { - for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++) - ; - chl = k - chb; - k--; - } + break; + } + case 1: { + for (std::string::const_iterator k = start_piece; k != iter; ++k) { + std::string::const_iterator chb = k; + std::string::const_iterator che = k + 1; + if (*k == '(') { + std::string::const_iterator parpos = std::find(k, iter, ')'); + if (parpos != iter) { + chb = k + 1; + che = parpos; + k = parpos; + } + } else { + if (utf8 && (*k & 0xc0) == 0xc0) { + ++k; + while (k != iter && (*k & 0xc0) == 0x80) + ++k; + che = k; + --k; } - maptable[j].set[setn] = (char*)malloc(chl + 1); - if (!maptable[j].set[setn]) - return 1; - strncpy(maptable[j].set[setn], piece + chb, chl); - maptable[j].set[setn][chl] = '\0'; - setn++; } - maptable[j].len = setn; - break; + maptable.back().push_back(std::string(chb, che)); } - default: - break; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(nl, iter); } - if (!maptable[j].set || !maptable[j].len) { + if (maptable.back().empty()) { HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - nummap = 0; - return 1; + return false; } } - return 0; + return true; } /* parse in the word breakpoint table */ -int AffixMgr::parse_breaktable(char* line, FileMgr* af) { - if (numbreak > -1) { +bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) { + if (parsedbreaktable) { HUNSPELL_WARNING(stderr, "error: line %d: multiple 
table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; + parsedbreaktable = true; + int numbreak = -1; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numbreak = atoi(piece); - if (numbreak < 0) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - if (numbreak == 0) - return 0; - breaktable = (char**)malloc(numbreak * sizeof(char*)); - if (!breaktable) - return 1; - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numbreak = atoi(std::string(start_piece, iter).c_str()); + if (numbreak < 0) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - default: - break; + if (numbreak == 0) + return true; + breaktable.reserve(numbreak); + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the numbreak lines to read in the remainder of the table */ - char* nl; - for (int j = 0; j < numbreak; j++) { - if (!(nl = af->getline())) - return 1; + for (int j = 0; j < numbreak; ++j) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; i = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "BREAK", 5) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numbreak = 0; - return 1; - } - break; - } - case 1: { - breaktable[j] = mystrdup(piece); - break; + iter = nl.begin(); + start_piece = mystrsep(nl, 
iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numbreak = 0; + return false; } - default: - break; + break; + } + case 1: { + breaktable.push_back(std::string(start_piece, iter)); + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); - } - if (!breaktable) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numbreak = 0; - return 1; + ++i; + start_piece = mystrsep(nl, iter); } } - return 0; + + if (breaktable.size() != static_cast<size_t>(numbreak)) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return false; + } + + return true; } void AffixMgr::reverse_condition(std::string& piece) { @@ -4665,20 +4398,68 @@ void AffixMgr::reverse_condition(std::string& piece) { } } -int AffixMgr::parse_affix(char* line, +class entries_container { + std::vector<AffEntry*> entries; + AffixMgr* m_mgr; + char m_at; +public: + entries_container(char at, AffixMgr* mgr) + : m_mgr(mgr) + , m_at(at) { + } + void release() { + entries.clear(); + } + void initialize(int numents, + char opts, unsigned short aflag) { + entries.reserve(numents); + + if (m_at == 'P') { + entries.push_back(new PfxEntry(m_mgr)); + } else { + entries.push_back(new SfxEntry(m_mgr)); + } + + entries.back()->opts = opts; + entries.back()->aflag = aflag; + } + + AffEntry* add_entry(char opts) { + if (m_at == 'P') { + entries.push_back(new PfxEntry(m_mgr)); + } else { + entries.push_back(new SfxEntry(m_mgr)); + } + AffEntry* ret = entries.back(); + ret->opts = entries[0]->opts & opts; + return ret; + } + + AffEntry* first_entry() { + return entries.empty() ? 
NULL : entries[0]; + } + + ~entries_container() { + for (size_t i = 0; i < entries.size(); ++i) { + delete entries[i]; + } + } + + std::vector<AffEntry*>::iterator begin() { return entries.begin(); } + std::vector<AffEntry*>::iterator end() { return entries.end(); } +}; + +bool AffixMgr::parse_affix(const std::string& line, const char at, FileMgr* af, char* dupflags) { - int numents = 0; // number of affentry structures to parse + int numents = 0; // number of AffEntry structures to parse unsigned short aflag = 0; // affix char identifier char ff = 0; - std::vector<affentry> affentries; + entries_container affentries(at, this); - char* tp = line; - char* nl = line; - char* piece; int i = 0; // checking lines with bad syntax @@ -4689,71 +4470,68 @@ int AffixMgr::parse_affix(char* line, // split affix header line into pieces int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + // piece 1 - is type of affix + case 0: { + np++; + break; + } - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - // piece 1 - is type of affix - case 0: { - np++; - break; - } - - // piece 2 - is affix char - case 1: { - np++; - aflag = pHMgr->decode_flag(piece); - if (((at == 'S') && (dupflags[aflag] & dupSFX)) || - ((at == 'P') && (dupflags[aflag] & dupPFX))) { - HUNSPELL_WARNING( - stderr, - "error: line %d: multiple definitions of an affix flag\n", - af->getlinenum()); - // return 1; XXX permissive mode for bad dictionaries - } - dupflags[aflag] += (char)((at == 'S') ? 
dupSFX : dupPFX); - break; - } - // piece 3 - is cross product indicator - case 2: { - np++; - if (*piece == 'Y') - ff = aeXPRODUCT; - break; + // piece 2 - is affix char + case 1: { + np++; + aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str()); + if (((at == 'S') && (dupflags[aflag] & dupSFX)) || + ((at == 'P') && (dupflags[aflag] & dupPFX))) { + HUNSPELL_WARNING( + stderr, + "error: line %d: multiple definitions of an affix flag\n", + af->getlinenum()); } + dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX); + break; + } + // piece 3 - is cross product indicator + case 2: { + np++; + if (*start_piece == 'Y') + ff = aeXPRODUCT; + break; + } - // piece 4 - is number of affentries - case 3: { - np++; - numents = atoi(piece); - if ((numents <= 0) || ((std::numeric_limits<size_t>::max() / - sizeof(struct affentry)) < static_cast<size_t>(numents))) { - char* err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - free(err); - } - return 1; + // piece 4 - is number of affentries + case 3: { + np++; + numents = atoi(std::string(start_piece, iter).c_str()); + if ((numents <= 0) || ((std::numeric_limits<size_t>::max() / + sizeof(AffEntry)) < static_cast<size_t>(numents))) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + free(err); } - affentries.resize(numents); - affentries[0].opts = ff; - if (utf8) - affentries[0].opts += aeUTF8; - if (pHMgr->is_aliasf()) - affentries[0].opts += aeALIASF; - if (pHMgr->is_aliasm()) - affentries[0].opts += aeALIASM; - affentries[0].aflag = aflag; + return false; } - default: - break; + char opts = ff; + if (utf8) + opts += aeUTF8; + if (pHMgr->is_aliasf()) + opts += aeALIASF; + if (pHMgr->is_aliasm()) + opts += aeALIASM; + affentries.initialize(numents, opts, aflag); } - i++; + + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = 
mystrsep(line, iter); } // check to make sure we parsed enough pieces if (np != 4) { @@ -4763,196 +4541,193 @@ int AffixMgr::parse_affix(char* line, af->getlinenum()); free(err); } - return 1; + return false; } // now parse numents affentries for this affix - std::vector<affentry>::iterator start = affentries.begin(); - std::vector<affentry>::iterator end = affentries.end(); - for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { - if ((nl = af->getline()) == NULL) - return 1; + AffEntry* entry = affentries.first_entry(); + for (int ent = 0; ent < numents; ++ent) { + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; + + iter = nl.begin(); i = 0; np = 0; // split line into pieces - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - // piece 1 - is type - case 0: { - np++; - if (entry != start) - entry->opts = start->opts & - (char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); - break; - } + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + // piece 1 - is type + case 0: { + np++; + if (ent != 0) + entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM)); + break; + } - // piece 2 - is affix char - case 1: { - np++; - if (pHMgr->decode_flag(piece) != aflag) { - char* err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, - "error: line %d: affix %s is corrupt\n", - af->getlinenum(), err); - free(err); - } - return 1; + // piece 2 - is affix char + case 1: { + np++; + std::string chunk(start_piece, iter); + if (pHMgr->decode_flag(chunk.c_str()) != aflag) { + char* err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, + "error: line %d: affix %s is corrupt\n", + af->getlinenum(), err); + free(err); } - - if (entry != start) - entry->aflag = start->aflag; - break; + return false; } - // piece 3 - is string to strip or 0 for null - case 2: { - np++; - entry->strip = piece; - if 
(complexprefixes) { - if (utf8) - reverseword_utf(entry->strip); - else - reverseword(entry->strip); - } - if (entry->strip.compare("0") == 0) { - entry->strip.clear(); - } - break; + if (ent != 0) { + AffEntry* start_entry = affentries.first_entry(); + entry->aflag = start_entry->aflag; } + break; + } - // piece 4 - is affix string or 0 for null - case 3: { - char* dash; - entry->morphcode = NULL; - entry->contclass = NULL; - entry->contclasslen = 0; - np++; - dash = strchr(piece, '/'); - if (dash) { - *dash = '\0'; - - entry->appnd = piece; - - if (ignorechars) { - if (utf8) { - remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); - } else { - remove_ignored_chars(entry->appnd, ignorechars); - } - } - - if (complexprefixes) { - if (utf8) - reverseword_utf(entry->appnd); - else - reverseword(entry->appnd); - } + // piece 3 - is string to strip or 0 for null + case 2: { + np++; + entry->strip = std::string(start_piece, iter); + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->strip); + else + reverseword(entry->strip); + } + if (entry->strip.compare("0") == 0) { + entry->strip.clear(); + } + break; + } - if (pHMgr->is_aliasf()) { - int index = atoi(dash + 1); - entry->contclasslen = (unsigned short)pHMgr->get_aliasf( - index, &(entry->contclass), af); - if (!entry->contclasslen) - HUNSPELL_WARNING(stderr, - "error: bad affix flag alias: \"%s\"\n", - dash + 1); + // piece 4 - is affix string or 0 for null + case 3: { + entry->morphcode = NULL; + entry->contclass = NULL; + entry->contclasslen = 0; + np++; + std::string::const_iterator dash = std::find(start_piece, iter, '/'); + if (dash != iter) { + entry->appnd = std::string(start_piece, dash); + std::string dash_str(dash + 1, iter); + + if (!ignorechars.empty()) { + if (utf8) { + remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); } else { - entry->contclasslen = (unsigned short)pHMgr->decode_flags( - &(entry->contclass), dash + 1, af); - std::sort(entry->contclass, entry->contclass + 
entry->contclasslen); + remove_ignored_chars(entry->appnd, ignorechars); } - *dash = '/'; + } - havecontclass = 1; - for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { - contclasses[(entry->contclass)[_i]] = 1; - } + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->appnd); + else + reverseword(entry->appnd); + } + + if (pHMgr->is_aliasf()) { + int index = atoi(dash_str.c_str()); + entry->contclasslen = (unsigned short)pHMgr->get_aliasf( + index, &(entry->contclass), af); + if (!entry->contclasslen) + HUNSPELL_WARNING(stderr, + "error: bad affix flag alias: \"%s\"\n", + dash_str.c_str()); } else { - entry->appnd = piece; + entry->contclasslen = (unsigned short)pHMgr->decode_flags( + &(entry->contclass), dash_str.c_str(), af); + std::sort(entry->contclass, entry->contclass + entry->contclasslen); + } - if (ignorechars) { - if (utf8) { - remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); - } else { - remove_ignored_chars(entry->appnd, ignorechars); - } - } + havecontclass = 1; + for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { + contclasses[(entry->contclass)[_i]] = 1; + } + } else { + entry->appnd = std::string(start_piece, iter); - if (complexprefixes) { - if (utf8) - reverseword_utf(entry->appnd); - else - reverseword(entry->appnd); + if (!ignorechars.empty()) { + if (utf8) { + remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); + } else { + remove_ignored_chars(entry->appnd, ignorechars); } } - if (entry->appnd.compare("0") == 0) { - entry->appnd.clear(); + if (complexprefixes) { + if (utf8) + reverseword_utf(entry->appnd); + else + reverseword(entry->appnd); } - break; } - // piece 5 - is the conditions descriptions - case 4: { - std::string chunk(piece); - np++; - if (complexprefixes) { + if (entry->appnd.compare("0") == 0) { + entry->appnd.clear(); + } + break; + } + + // piece 5 - is the conditions descriptions + case 4: { + std::string chunk(start_piece, iter); + np++; + if (complexprefixes) { + if (utf8) + 
reverseword_utf(chunk); + else + reverseword(chunk); + reverse_condition(chunk); + } + if (!entry->strip.empty() && chunk != "." && + redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(), + af->getlinenum())) + chunk = "."; + if (at == 'S') { + reverseword(chunk); + reverse_condition(chunk); + } + if (encodeit(*entry, chunk.c_str())) + return false; + break; + } + + case 5: { + std::string chunk(start_piece, iter); + np++; + if (pHMgr->is_aliasm()) { + int index = atoi(chunk.c_str()); + entry->morphcode = pHMgr->get_aliasm(index); + } else { + if (complexprefixes) { // XXX - fix me for morph. gen. if (utf8) reverseword_utf(chunk); else reverseword(chunk); - reverse_condition(chunk); } - if (!entry->strip.empty() && chunk != "." && - redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(), - af->getlinenum())) - chunk = "."; - if (at == 'S') { - reverseword(chunk); - reverse_condition(chunk); - } - if (encodeit(*entry, chunk.c_str())) - return 1; - break; - } - - case 5: { - std::string chunk(piece); - np++; - if (pHMgr->is_aliasm()) { - int index = atoi(chunk.c_str()); - entry->morphcode = pHMgr->get_aliasm(index); - } else { - if (complexprefixes) { // XXX - fix me for morph. gen. 
- if (utf8) - reverseword_utf(chunk); - else - reverseword(chunk); - } - // add the remaining of the line - if (*tp) { - *(tp - 1) = ' '; - chunk.push_back(' '); - chunk.append(tp); - } - entry->morphcode = mystrdup(chunk.c_str()); - if (!entry->morphcode) - return 1; + // add the remaining of the line + std::string::const_iterator end = nl.end(); + if (iter != end) { + chunk.append(iter, end); } - break; + entry->morphcode = mystrdup(chunk.c_str()); + if (!entry->morphcode) + return false; } - default: - break; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + i++; + start_piece = mystrsep(nl, iter); } // check to make sure we parsed enough pieces if (np < 4) { @@ -4962,7 +4737,7 @@ int AffixMgr::parse_affix(char* line, af->getlinenum(), err); free(err); } - return 1; + return false; } #ifdef DEBUG @@ -4982,16 +4757,20 @@ int AffixMgr::parse_affix(char* line, // now create SfxEntry or PfxEntry objects and use links to // build an ordered (sorted by affix string) list - for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { + std::vector<AffEntry*>::iterator start = affentries.begin(); + std::vector<AffEntry*>::iterator end = affentries.end(); + for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) { if (at == 'P') { - PfxEntry* pfxptr = new PfxEntry(this, &(*entry)); - build_pfxtree(pfxptr); + build_pfxtree(static_cast<PfxEntry*>(*affentry)); } else { - SfxEntry* sfxptr = new SfxEntry(this, &(*entry)); - build_sfxtree(sfxptr); + build_sfxtree(static_cast<SfxEntry*>(*affentry)); } } - return 0; + + //contents belong to AffixMgr now + affentries.release(); + + return true; } int AffixMgr::redundant_condition(char ft, @@ -5088,11 +4867,10 @@ int AffixMgr::redundant_condition(char ft, return 0; } -int AffixMgr::get_suffix_words(short unsigned* suff, +std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff, int len, - const char* root_word, - char** slst) { - int suff_words_cnt = 
0; + const char* root_word) { + std::vector<std::string> slst; short unsigned* start_ptr = suff; for (int j = 0; j < SETSIZE; j++) { SfxEntry* ptr = sStart[j]; @@ -5102,10 +4880,9 @@ int AffixMgr::get_suffix_words(short unsigned* suff, if ((*suff) == ptr->getFlag()) { std::string nw(root_word); nw.append(ptr->getAffix()); - hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, NULL, 0, - NULL, 0, 0, 0); + hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0); if (ht) { - slst[suff_words_cnt++] = mystrdup(nw.c_str()); + slst.push_back(nw); } } suff++; @@ -5113,5 +4890,5 @@ int AffixMgr::get_suffix_words(short unsigned* suff, ptr = ptr->getNext(); } } - return suff_words_cnt; + return slst; } diff --git a/libs/hunspell/src/affixmgr.hxx b/libs/hunspell/src/affixmgr.hxx index d70e853388..d41e69cfd2 100644 --- a/libs/hunspell/src/affixmgr.hxx +++ b/libs/hunspell/src/affixmgr.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -71,14 +68,13 @@ * SUCH DAMAGE. 
*/ -#ifndef _AFFIXMGR_HXX_ -#define _AFFIXMGR_HXX_ - -#include "hunvisapi.h" +#ifndef AFFIXMGR_HXX_ +#define AFFIXMGR_HXX_ #include <stdio.h> #include <string> +#include <vector> #include "atypes.hxx" #include "baseaffix.hxx" @@ -93,17 +89,16 @@ class PfxEntry; class SfxEntry; -class LIBHUNSPELL_DLL_EXPORTED AffixMgr { +class AffixMgr { PfxEntry* pStart[SETSIZE]; SfxEntry* sStart[SETSIZE]; PfxEntry* pFlag[SETSIZE]; SfxEntry* sFlag[SETSIZE]; - HashMgr* pHMgr; - HashMgr** alldic; - int* maxdic; - char* keystring; - char* trystring; - char* encoding; + const std::vector<HashMgr*>& alldic; + const HashMgr* pHMgr; + std::string keystring; + std::string trystring; + std::string encoding; struct cs_info* csconv; int utf8; int complexprefixes; @@ -125,19 +120,19 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { FLAG nongramsuggest; FLAG needaffix; int cpdmin; - int numrep; - replentry* reptable; + bool parsedrep; + std::vector<replentry> reptable; RepList* iconvtable; RepList* oconvtable; - int nummap; - mapentry* maptable; - int numbreak; - char** breaktable; - int numcheckcpd; - patentry* checkcpdtable; + bool parsedmaptable; + std::vector<mapentry> maptable; + bool parsedbreaktable; + std::vector<std::string> breaktable; + bool parsedcheckcpd; + std::vector<patentry> checkcpdtable; int simplifiedcpd; - int numdefcpd; - flagentry* defcpdtable; + bool parseddefcpd; + std::vector<flagentry> defcpdtable; phonetable* phone; int maxngramsugs; int maxcpdsugs; @@ -147,10 +142,9 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { int sugswithdots; int cpdwordmax; int cpdmaxsyllable; - char* cpdvowels; - w_char* cpdvowels_utf16; - int cpdvowels_utf16_len; - char* cpdsyllablenum; + std::string cpdvowels; // vowels (for calculating of Hungarian compounding limit, + std::vector<w_char> cpdvowels_utf16; //vowels for UTF-8 encoding + std::string cpdsyllablenum; // syllable count incrementing flag const char* pfxappnd; // BUG: not stateless const char* sfxappnd; // BUG: not stateless int sfxextra; 
// BUG: not stateless @@ -159,12 +153,12 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { SfxEntry* sfx; // BUG: not stateless PfxEntry* pfx; // BUG: not stateless int checknum; - char* wordchars; + std::string wordchars; // letters + spec. word characters std::vector<w_char> wordchars_utf16; - char* ignorechars; + std::string ignorechars; // letters + spec. word characters std::vector<w_char> ignorechars_utf16; - char* version; - char* lang; + std::string version; // affix and dictionary file version string + std::string lang; // language int langnum; FLAG lemma_present; FLAG circumfix; @@ -182,7 +176,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { // affix) public: - AffixMgr(const char* affpath, HashMgr** ptr, int* md, const char* key = NULL); + AffixMgr(const char* affpath, const std::vector<HashMgr*>& ptr, const char* key = NULL); ~AffixMgr(); struct hentry* affix_check(const char* word, int len, @@ -202,9 +196,6 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { int len, int sfxopts, PfxEntry* ppfx, - char** wlst, - int maxSug, - int* ns, const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); @@ -214,39 +205,39 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { PfxEntry* ppfx, const FLAG needflag = FLAG_NULL); - char* affix_check_morph(const char* word, - int len, - const FLAG needflag = FLAG_NULL, - char in_compound = IN_CPD_NOT); - char* prefix_check_morph(const char* word, - int len, - char in_compound, - const FLAG needflag = FLAG_NULL); - char* suffix_check_morph(const char* word, - int len, - int sfxopts, - PfxEntry* ppfx, - const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, - char in_compound = IN_CPD_NOT); + std::string affix_check_morph(const char* word, + int len, + const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); + std::string prefix_check_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + std::string suffix_check_morph(const char* word, + int len, + 
int sfxopts, + PfxEntry* ppfx, + const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); - char* prefix_check_twosfx_morph(const char* word, - int len, - char in_compound, - const FLAG needflag = FLAG_NULL); - char* suffix_check_twosfx_morph(const char* word, - int len, - int sfxopts, - PfxEntry* ppfx, - const FLAG needflag = FLAG_NULL); + std::string prefix_check_twosfx_morph(const char* word, + int len, + char in_compound, + const FLAG needflag = FLAG_NULL); + std::string suffix_check_twosfx_morph(const char* word, + int len, + int sfxopts, + PfxEntry* ppfx, + const FLAG needflag = FLAG_NULL); - char* morphgen(const char* ts, - int wl, - const unsigned short* ap, - unsigned short al, - const char* morph, - const char* targetmorph, - int level); + std::string morphgen(const char* ts, + int wl, + const unsigned short* ap, + unsigned short al, + const char* morph, + const char* targetmorph, + int level); int expand_rootword(struct guessword* wlst, int maxn, @@ -273,8 +264,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { int cpdcase_check(const char* word, int len); inline int candidate_check(const char* word, int len); void setcminmax(int* cmin, int* cmax, const char* word, int len); - struct hentry* compound_check(const char* word, - int len, + struct hentry* compound_check(const std::string& word, short wordnum, short numsyllable, short maxwordnum, @@ -294,47 +284,37 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { hentry** words, hentry** rwords, char hu_mov_rule, - char** result, - char* partresult); + std::string& result, + const std::string* partresult); - int get_suffix_words(short unsigned* suff, + std::vector<std::string> get_suffix_words(short unsigned* suff, int len, - const char* root_word, - char** slst); + const char* root_word); struct hentry* lookup(const char* word); - int get_numrep() const; - struct replentry* get_reptable() const; + const std::vector<replentry>& get_reptable() const; RepList* get_iconvtable() 
const; RepList* get_oconvtable() const; struct phonetable* get_phonetable() const; - int get_nummap() const; - struct mapentry* get_maptable() const; - int get_numbreak() const; - char** get_breaktable() const; - char* get_encoding(); + const std::vector<mapentry>& get_maptable() const; + const std::vector<std::string>& get_breaktable() const; + const std::string& get_encoding(); int get_langnum() const; char* get_key_string(); char* get_try_string() const; - const char* get_wordchars() const; + const std::string& get_wordchars() const; const std::vector<w_char>& get_wordchars_utf16() const; - char* get_ignore() const; + const char* get_ignore() const; const std::vector<w_char>& get_ignore_utf16() const; int get_compound() const; FLAG get_compoundflag() const; - FLAG get_compoundbegin() const; FLAG get_forbiddenword() const; FLAG get_nosuggest() const; FLAG get_nongramsuggest() const; FLAG get_needaffix() const; FLAG get_onlyincompound() const; - FLAG get_compoundroot() const; - FLAG get_lemma_present() const; - int get_checknum() const; - const char* get_prefix() const; - const char* get_suffix() const; const char* get_derived() const; - const char* get_version() const; + const std::string& get_version() const; int have_contclass() const; int get_utf8() const; int get_complexprefixes() const; @@ -355,26 +335,25 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr { private: int parse_file(const char* affpath, const char* key); - int parse_flag(char* line, unsigned short* out, FileMgr* af); - int parse_num(char* line, int* out, FileMgr* af); - int parse_cpdsyllable(char* line, FileMgr* af); - int parse_reptable(char* line, FileMgr* af); - int parse_convtable(char* line, + bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af); + bool parse_num(const std::string& line, int* out, FileMgr* af); + bool parse_cpdsyllable(const std::string& line, FileMgr* af); + bool parse_reptable(const std::string& line, FileMgr* af); + bool parse_convtable(const std::string& 
line, FileMgr* af, RepList** rl, - const char* keyword); - int parse_phonetable(char* line, FileMgr* af); - int parse_maptable(char* line, FileMgr* af); - int parse_breaktable(char* line, FileMgr* af); - int parse_checkcpdtable(char* line, FileMgr* af); - int parse_defcpdtable(char* line, FileMgr* af); - int parse_affix(char* line, const char at, FileMgr* af, char* dupflags); + const std::string& keyword); + bool parse_phonetable(const std::string& line, FileMgr* af); + bool parse_maptable(const std::string& line, FileMgr* af); + bool parse_breaktable(const std::string& line, FileMgr* af); + bool parse_checkcpdtable(const std::string& line, FileMgr* af); + bool parse_defcpdtable(const std::string& line, FileMgr* af); + bool parse_affix(const std::string& line, const char at, FileMgr* af, char* dupflags); void reverse_condition(std::string&); - void debugflag(char* result, unsigned short flag); std::string& debugflag(std::string& result, unsigned short flag); int condlen(const char*); - int encodeit(affentry& entry, const char* cs); + int encodeit(AffEntry& entry, const char* cs); int build_pfxtree(PfxEntry* pfxptr); int build_sfxtree(SfxEntry* sfxptr); int process_pfx_order(); diff --git a/libs/hunspell/src/atypes.hxx b/libs/hunspell/src/atypes.hxx index 60826af20e..f841523189 100644 --- a/libs/hunspell/src/atypes.hxx +++ b/libs/hunspell/src/atypes.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). 
- * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -38,8 +35,8 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef _ATYPES_HXX_ -#define _ATYPES_HXX_ +#ifndef ATYPES_HXX_ +#define ATYPES_HXX_ #ifndef HUNSPELL_WARNING #include <stdio.h> @@ -55,15 +52,15 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {} // HUNSTEM def. #define HUNSTEM -#include "hashmgr.hxx" #include "w_char.hxx" #include <algorithm> #include <string> +#include <vector> #define SETSIZE 256 #define CONTSIZE 65536 -// affentry options +// AffEntry options #define aeXPRODUCT (1 << 0) #define aeUTF8 (1 << 1) #define aeALIASF (1 << 2) @@ -85,8 +82,6 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {} #define SPELL_ORIGCAP (1 << 5) #define SPELL_WARN (1 << 6) -#define MAXLNLEN 8192 - #define MINCPDLEN 3 #define MAXCOMPOUND 10 #define MAXCONDLEN 20 @@ -100,46 +95,25 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) 
{} #define TESTAFF(a, b, c) (std::binary_search(a, a + c, b)) -struct affentry { - std::string strip; - std::string appnd; - char numconds; - char opts; - unsigned short aflag; - unsigned short* contclass; - short contclasslen; - union { - char conds[MAXCONDLEN]; - struct { - char conds1[MAXCONDLEN_1]; - char* conds2; - } l; - } c; - char* morphcode; -}; - struct guessword { char* word; bool allow; char* orig; }; -struct mapentry { - char** set; - int len; -}; - -struct flagentry { - FLAG* def; - int len; -}; +typedef std::vector<std::string> mapentry; +typedef std::vector<FLAG> flagentry; struct patentry { - char* pattern; - char* pattern2; - char* pattern3; + std::string pattern; + std::string pattern2; + std::string pattern3; FLAG cond; FLAG cond2; + patentry() + : cond(FLAG_NULL) + , cond2(FLAG_NULL) { + } }; #endif diff --git a/libs/hunspell/src/baseaffix.hxx b/libs/hunspell/src/baseaffix.hxx index 59256e92f3..9191dba475 100644 --- a/libs/hunspell/src/baseaffix.hxx +++ b/libs/hunspell/src/baseaffix.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -38,18 +35,17 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef _BASEAFF_HXX_ -#define _BASEAFF_HXX_ +#ifndef BASEAFF_HXX_ +#define BASEAFF_HXX_ -#include "hunvisapi.h" #include <string> -class LIBHUNSPELL_DLL_EXPORTED AffEntry { +class AffEntry { private: AffEntry(const AffEntry&); AffEntry& operator=(const AffEntry&); - protected: + public: AffEntry() : numconds(0), opts(0), @@ -57,6 +53,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffEntry { morphcode(0), contclass(NULL), contclasslen(0) {} + virtual ~AffEntry(); std::string appnd; std::string strip; unsigned char numconds; diff --git a/libs/hunspell/src/config.h b/libs/hunspell/src/config.h index 1230ed0be7..f3b64fb819 100644 --- a/libs/hunspell/src/config.h +++ b/libs/hunspell/src/config.h @@ -201,5 +201,5 @@ #define PACKAGE_TARNAME /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.4.0" -#define VERSION "1.4.0" +#define PACKAGE_VERSION "1.6.2" +#define VERSION "1.6.2" diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.cxx index 1948e4a3b3..be43a5b597 100644 --- a/libs/hunspell/src/csutil.c++ +++ b/libs/hunspell/src/csutil.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. 
+ * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -76,6 +73,7 @@ #include <string.h> #include <stdio.h> #include <ctype.h> +#include <sstream> #include "csutil.hxx" #include "atypes.hxx" @@ -97,7 +95,7 @@ struct unicode_info { #include <unicode/uchar.h> #else #ifndef MOZILLA_CLIENT -#include "utf_info.cxx" +#include "utf_info.c++" #define UTF_LST_LEN (sizeof(utf_lst) / (sizeof(unicode_info))) #endif #endif @@ -122,26 +120,24 @@ static struct unicode_info2* utf_tbl = NULL; static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instances -FILE* myfopen(const char* path, const char* mode) { -#ifdef _WIN32 +void myopen(std::ifstream& stream, const char* path, std::ios_base::openmode mode) +{ +#if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LONG_PATH_PREFIX "\\\\?\\" if (strncmp(path, WIN32_LONG_PATH_PREFIX, 4) == 0) { int len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0); - wchar_t* buff = (wchar_t*)malloc(len * sizeof(wchar_t)); - wchar_t* buff2 = (wchar_t*)malloc(len * sizeof(wchar_t)); - FILE* f = NULL; - if (buff && buff2) { - MultiByteToWideChar(CP_UTF8, 0, path, -1, buff, len); - if (_wfullpath(buff2, buff, len) != NULL) { - f = _wfopen(buff2, (strcmp(mode, "r") == 0) ? 
L"r" : L"rb"); - } - free(buff); - free(buff2); + wchar_t* buff = new wchar_t[len]; + wchar_t* buff2 = new wchar_t[len]; + MultiByteToWideChar(CP_UTF8, 0, path, -1, buff, len); + if (_wfullpath(buff2, buff, len) != NULL) { + stream.open(buff2, mode); } - return f; + delete [] buff; + delete [] buff2; } + else #endif - return fopen(path, mode); + stream.open(path, mode); } std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) { @@ -218,7 +214,7 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) { case 0xd0: { // 2-byte UTF-8 codes if ((*(u8 + 1) & 0xc0) == 0x80) { u2.h = (*u8 & 0x1f) >> 2; - u2.l = (*u8 << 6) + (*(u8 + 1) & 0x3f); + u2.l = (static_cast<unsigned char>(*u8) << 6) + (*(u8 + 1) & 0x3f); ++u8; } else { HUNSPELL_WARNING(stderr, @@ -275,34 +271,35 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) { return dest.size(); } -// strip strings into token based on single char delimiter -// acts like strsep() but only uses a delim char and not -// a delim string -// default delimiter: white space characters - -char* mystrsep(char** stringp, const char delim) { - char* mp = *stringp; - if (*mp != '\0') { - char* dp; - if (delim) { - dp = strchr(mp, delim); - } else { - // don't use isspace() here, the string can be in some random charset - // that's way different than the locale's - for (dp = mp; (*dp && *dp != ' ' && *dp != '\t'); dp++) - ; - if (!*dp) - dp = NULL; - } - if (dp) { - *stringp = dp + 1; - *dp = '\0'; - } else { - *stringp = mp + strlen(mp); - } - return mp; - } - return NULL; +namespace { +class is_any_of { + public: + explicit is_any_of(const std::string& in) : chars(in) {} + + bool operator()(char c) { return chars.find(c) != std::string::npos; } + + private: + std::string chars; +}; +} + +std::string::const_iterator mystrsep(const std::string &str, + std::string::const_iterator& start) { + std::string::const_iterator end = str.end(); + + is_any_of op(" \t"); + // don't use isspace() here, the string 
can be in some random charset + // that's way different than the locale's + std::string::const_iterator sp = start; + while (sp != end && op(*sp)) + ++sp; + + std::string::const_iterator dp = sp; + while (dp != end && !op(*dp)) + ++dp; + + start = dp; + return sp; } // replaces strdup with ansi version @@ -320,142 +317,98 @@ char* mystrdup(const char* s) { return d; } -// strcat for limited length destination string -char* mystrcat(char* dest, const char* st, int max) { - int len; - int len2; - if (dest == NULL || st == NULL) - return dest; - len = strlen(dest); - len2 = strlen(st); - if (len + len2 + 1 > max) - return dest; - strcpy(dest + len, st); - return dest; -} - // remove cross-platform text line end characters -void mychomp(char* s) { - size_t k = strlen(s); - if ((k > 0) && ((*(s + k - 1) == '\r') || (*(s + k - 1) == '\n'))) - *(s + k - 1) = '\0'; - if ((k > 1) && (*(s + k - 2) == '\r')) - *(s + k - 2) = '\0'; +void mychomp(std::string& s) { + size_t k = s.size(); + size_t newsize = k; + if ((k > 0) && ((s[k - 1] == '\r') || (s[k - 1] == '\n'))) + --newsize; + if ((k > 1) && (s[k - 2] == '\r')) + --newsize; + s.resize(newsize); } // break text to lines -// return number of lines -int line_tok(const char* text, char*** lines, char breakchar) { - int linenum = 0; - if (!text) { - return linenum; - } - char* dup = mystrdup(text); - char* p = strchr(dup, breakchar); - while (p) { - linenum++; - *p = '\0'; - p++; - p = strchr(p, breakchar); - } - linenum++; - *lines = (char**)malloc(linenum * sizeof(char*)); - if (!(*lines)) { - free(dup); - return 0; +std::vector<std::string> line_tok(const std::string& text, char breakchar) { + std::vector<std::string> ret; + if (text.empty()) { + return ret; } - p = dup; - int l = 0; - for (int i = 0; i < linenum; i++) { - if (*p != '\0') { - (*lines)[l] = mystrdup(p); - if (!(*lines)[l]) { - for (i = 0; i < l; i++) - free((*lines)[i]); - free(dup); - return 0; - } - l++; + std::stringstream ss(text); + std::string tok; + 
while(std::getline(ss, tok, breakchar)) { + if (!tok.empty()) { + ret.push_back(tok); } - p += strlen(p) + 1; } - free(dup); - if (!l) { - free(*lines); - *lines = NULL; - } - return l; + + return ret; } // uniq line in place -char* line_uniq(char* text, char breakchar) { - char** lines; - int linenum = line_tok(text, &lines, breakchar); - int i; - strcpy(text, lines[0]); - for (i = 1; i < linenum; i++) { - int dup = 0; - for (int j = 0; j < i; j++) { - if (strcmp(lines[i], lines[j]) == 0) { - dup = 1; +void line_uniq(std::string& text, char breakchar) +{ + std::vector<std::string> lines = line_tok(text, breakchar); + text.clear(); + if (lines.empty()) { + return; + } + text = lines[0]; + for (size_t i = 1; i < lines.size(); ++i) { + bool dup = false; + for (size_t j = 0; j < i; ++j) { + if (lines[i] == lines[j]) { + dup = true; break; } } if (!dup) { - if ((i > 1) || (*(lines[0]) != '\0')) { - sprintf(text + strlen(text), "%c", breakchar); - } - strcat(text, lines[i]); + if (!text.empty()) + text.push_back(breakchar); + text.append(lines[i]); } } - for (i = 0; i < linenum; i++) { - free(lines[i]); - } - free(lines); - return text; } // uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " -char* line_uniq_app(char** text, char breakchar) { - if (!strchr(*text, breakchar)) { - return *text; +void line_uniq_app(std::string& text, char breakchar) { + if (text.find(breakchar) == std::string::npos) { + return; } - char** lines; - int i; - int linenum = line_tok(*text, &lines, breakchar); - int dup = 0; - for (i = 0; i < linenum; i++) { - for (int j = 0; j < (i - 1); j++) { - if (strcmp(lines[i], lines[j]) == 0) { - *(lines[i]) = '\0'; - dup++; + std::vector<std::string> lines = line_tok(text, breakchar); + text.clear(); + if (lines.empty()) { + return; + } + text = lines[0]; + for (size_t i = 1; i < lines.size(); ++i) { + bool dup = false; + for (size_t j = 0; j < i; ++j) { + if (lines[i] == lines[j]) { + dup = true; break; } } + if (!dup) { + if 
(!text.empty()) + text.push_back(breakchar); + text.append(lines[i]); + } } - if ((linenum - dup) == 1) { - strcpy(*text, lines[0]); - freelist(&lines, linenum); - return *text; + + if (lines.size() == 1) { + text = lines[0]; + return; } - char* newtext = (char*)malloc(strlen(*text) + 2 * linenum + 3 + 1); - if (newtext) { - free(*text); - *text = newtext; - } else { - freelist(&lines, linenum); - return *text; + + text.assign(" ( "); + for (size_t i = 0; i < lines.size(); ++i) { + text.append(lines[i]); + text.append(" | "); } - strcpy(*text, " ( "); - for (i = 0; i < linenum; i++) - if (*(lines[i])) { - sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); - } - (*text)[strlen(*text) - 2] = ')'; // " ) " - freelist(&lines, linenum); - return *text; + text[text.size() - 2] = ')'; // " ) " } // append s to ends of every lines in text @@ -469,111 +422,6 @@ std::string& strlinecat(std::string& str, const std::string& apd) { return str; } -// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields -// in the first line of the inputs -// return 0, if inputs equal -// return 1, if inputs may equal with a secondary suffix -// otherwise return -1 -int morphcmp(const char* s, const char* t) { - int se = 0; - int te = 0; - const char* sl; - const char* tl; - const char* olds; - const char* oldt; - if (!s || !t) - return 1; - olds = s; - sl = strchr(s, '\n'); - s = strstr(s, MORPH_DERI_SFX); - if (!s || (sl && sl < s)) - s = strstr(olds, MORPH_INFL_SFX); - if (!s || (sl && sl < s)) { - s = strstr(olds, MORPH_TERM_SFX); - olds = NULL; - } - oldt = t; - tl = strchr(t, '\n'); - t = strstr(t, MORPH_DERI_SFX); - if (!t || (tl && tl < t)) - t = strstr(oldt, MORPH_INFL_SFX); - if (!t || (tl && tl < t)) { - t = strstr(oldt, MORPH_TERM_SFX); - oldt = NULL; - } - while (s && t && (!sl || sl > s) && (!tl || tl > t)) { - s += MORPH_TAG_LEN; - t += MORPH_TAG_LEN; - se = 0; - te = 0; - while ((*s == *t) && !se && !te) { - s++; - t++; - switch (*s) { - case ' ': - 
case '\n': - case '\t': - case '\0': - se = 1; - } - switch (*t) { - case ' ': - case '\n': - case '\t': - case '\0': - te = 1; - } - } - if (!se || !te) { - // not terminal suffix difference - if (olds) - return -1; - return 1; - } - olds = s; - s = strstr(s, MORPH_DERI_SFX); - if (!s || (sl && sl < s)) - s = strstr(olds, MORPH_INFL_SFX); - if (!s || (sl && sl < s)) { - s = strstr(olds, MORPH_TERM_SFX); - olds = NULL; - } - oldt = t; - t = strstr(t, MORPH_DERI_SFX); - if (!t || (tl && tl < t)) - t = strstr(oldt, MORPH_INFL_SFX); - if (!t || (tl && tl < t)) { - t = strstr(oldt, MORPH_TERM_SFX); - oldt = NULL; - } - } - if (!s && !t && se && te) - return 0; - return 1; -} - -int get_sfxcount(const char* morph) { - if (!morph || !*morph) - return 0; - int n = 0; - const char* old = morph; - morph = strstr(morph, MORPH_DERI_SFX); - if (!morph) - morph = strstr(old, MORPH_INFL_SFX); - if (!morph) - morph = strstr(old, MORPH_TERM_SFX); - while (morph) { - n++; - old = morph; - morph = strstr(morph + 1, MORPH_DERI_SFX); - if (!morph) - morph = strstr(old + 1, MORPH_INFL_SFX); - if (!morph) - morph = strstr(old + 1, MORPH_TERM_SFX); - } - return n; -} - int fieldlen(const char* r) { int n = 0; while (r && *r != ' ' && *r != '\t' && *r != '\0' && *r != '\n') { @@ -615,33 +463,6 @@ std::string& mystrrep(std::string& str, return str; } -char* mystrrep(char* word, const char* pat, const char* rep) { - char* pos = strstr(word, pat); - if (pos) { - int replen = strlen(rep); - int patlen = strlen(pat); - while (pos) { - if (replen < patlen) { - char* end = word + strlen(word); - char* next = pos + replen; - char* prev = pos + strlen(pat); - for (; prev < end;* next = *prev, prev++, next++) - ; - *next = '\0'; - } else if (replen > patlen) { - char* end = pos + patlen; - char* next = word + strlen(word) + replen - patlen; - char* prev = next - replen + patlen; - for (; prev >= end;* next = *prev, prev--, next--) - ; - } - strncpy(pos, rep, replen); - pos = strstr(word, pat); - } 
- } - return word; -} - // reverse word size_t reverseword(std::string& word) { std::reverse(word.begin(), word.end()); @@ -657,35 +478,19 @@ size_t reverseword_utf(std::string& word) { return w.size(); } -int uniqlist(char** list, int n) { - int i; - if (n < 2) - return n; - for (i = 0; i < n; i++) { - for (int j = 0; j < i; j++) { - if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { - free(list[i]); - list[i] = NULL; - break; - } - } - } - int m = 1; - for (i = 1; i < n; i++) - if (list[i]) { - list[m] = list[i]; - m++; - } - return m; -} +void uniqlist(std::vector<std::string>& list) { + if (list.size() < 2) + return; -void freelist(char*** list, int n) { - if (list && *list) { - for (int i = 0; i < n; i++) - free((*list)[i]); - free(*list); - *list = NULL; + std::vector<std::string> ret; + ret.push_back(list[0]); + + for (size_t i = 1; i < list.size(); ++i) { + if (std::find(ret.begin(), ret.end(), list[i]) == ret.end()) + ret.push_back(list[i]); } + + list.swap(ret); } namespace { @@ -710,18 +515,20 @@ unsigned char ccase(const struct cs_info* csconv, int nIndex) { w_char upper_utf(w_char u, int langnum) { unsigned short idx = (u.h << 8) + u.l; - if (idx != unicodetoupper(idx, langnum)) { - u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); - u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + unsigned short upridx = unicodetoupper(idx, langnum); + if (idx != upridx) { + u.h = (unsigned char)(upridx >> 8); + u.l = (unsigned char)(upridx & 0x00FF); } return u; } w_char lower_utf(w_char u, int langnum) { unsigned short idx = (u.h << 8) + u.l; - if (idx != unicodetolower(idx, langnum)) { - u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8); - u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + unsigned short lwridx = unicodetolower(idx, langnum); + if (idx != lwridx) { + u.h = (unsigned char)(lwridx >> 8); + u.l = (unsigned char)(lwridx & 0x00FF); } return u; } @@ -743,12 +550,13 @@ std::string& 
mkallsmall(std::string& s, const struct cs_info* csconv) { } std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u, - int langnum) { + int langnum) { for (size_t i = 0; i < u.size(); ++i) { unsigned short idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetolower(idx, langnum)) { - u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + unsigned short lwridx = unicodetolower(idx, langnum); + if (idx != lwridx) { + u[i].h = (unsigned char)(lwridx >> 8); + u[i].l = (unsigned char)(lwridx & 0x00FF); } } return u; @@ -757,9 +565,10 @@ std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u, std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) { for (size_t i = 0; i < u.size(); i++) { unsigned short idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + unsigned short upridx = unicodetoupper(idx, langnum); + if (idx != upridx) { + u[i].h = (unsigned char)(upridx >> 8); + u[i].l = (unsigned char)(upridx & 0x00FF); } } return u; @@ -775,9 +584,10 @@ std::string& mkinitcap(std::string& s, const struct cs_info* csconv) { std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) { if (!u.empty()) { unsigned short idx = (u[0].h << 8) + u[0].l; - if (idx != unicodetoupper(idx, langnum)) { - u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); - u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + unsigned short upridx = unicodetoupper(idx, langnum); + if (idx != upridx) { + u[0].h = (unsigned char)(upridx >> 8); + u[0].l = (unsigned char)(upridx & 0x00FF); } } return u; @@ -793,9 +603,10 @@ std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) { std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) { if (!u.empty()) { unsigned short idx = (u[0].h << 8) + 
u[0].l; - if (idx != unicodetolower(idx, langnum)) { - u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); - u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + unsigned short lwridx = unicodetolower(idx, langnum); + if (idx != lwridx) { + u[0].h = (unsigned char)(lwridx >> 8); + u[0].l = (unsigned char)(lwridx & 0x00FF); } } return u; @@ -2457,9 +2268,9 @@ static void toAsciiLowerAndRemoveNonAlphanumeric(const char* pName, *pBuf = '\0'; } -struct cs_info* get_current_cs(const char* es) { - char* normalized_encoding = new char[strlen(es) + 1]; - toAsciiLowerAndRemoveNonAlphanumeric(es, normalized_encoding); +struct cs_info* get_current_cs(const std::string& es) { + char* normalized_encoding = new char[es.size() + 1]; + toAsciiLowerAndRemoveNonAlphanumeric(es.c_str(), normalized_encoding); struct cs_info* ccs = NULL; int n = sizeof(encds) / sizeof(encds[0]); @@ -2474,7 +2285,7 @@ struct cs_info* get_current_cs(const char* es) { if (!ccs) { HUNSPELL_WARNING(stderr, - "error: unknown encoding %s: using %s as fallback\n", es, + "error: unknown encoding %s: using %s as fallback\n", es.c_str(), encds[0].enc_name); ccs = encds[0].cs_table; } @@ -2485,7 +2296,7 @@ struct cs_info* get_current_cs(const char* es) { // XXX This function was rewritten for mozilla. Instead of storing the // conversion tables static in this file, create them when needed // with help the mozilla backend. -struct cs_info* get_current_cs(const char* es) { +struct cs_info* get_current_cs(const std::string& es) { struct cs_info* ccs = new cs_info[256]; // Initialze the array with dummy data so that we wouldn't need // to return null in case of failures. 
@@ -2500,7 +2311,7 @@ struct cs_info* get_current_cs(const char* es) { nsresult rv; - nsAutoCString label(es); + nsAutoCString label(es.c_str()); nsAutoCString encoding; if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) { return ccs; @@ -2565,21 +2376,18 @@ struct cs_info* get_current_cs(const char* es) { #endif // primitive isalpha() replacement for tokenization -char* get_casechars(const char* enc) { +std::string get_casechars(const char* enc) { struct cs_info* csconv = get_current_cs(enc); - char expw[MAXLNLEN]; - char* p = expw; - for (int i = 0; i <= 255; i++) { + std::string expw; + for (int i = 0; i <= 255; ++i) { if (cupper(csconv, i) != clower(csconv, i)) { - *p = static_cast<char>(i); - p++; + expw.push_back(static_cast<char>(i)); } } - *p = '\0'; #ifdef MOZILLA_CLIENT delete[] csconv; #endif - return mystrdup(expw); + return expw; } // language to encoding default map @@ -2606,10 +2414,10 @@ static struct lang_map lang2enc[] = {"tr_TR", LANG_tr}, // for back-compatibility {"ru", LANG_ru}, {"uk", LANG_uk}}; -int get_lang_num(const char* lang) { +int get_lang_num(const std::string& lang) { int n = sizeof(lang2enc) / sizeof(lang2enc[0]); for (int i = 0; i < n; i++) { - if (strcmp(lang, lang2enc[i].lang) == 0) { + if (strcmp(lang.c_str(), lang2enc[i].lang) == 0) { return lang2enc[i].num; } } @@ -2618,26 +2426,21 @@ int get_lang_num(const char* lang) { #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT -int initialize_utf_tbl() { +void initialize_utf_tbl() { utf_tbl_count++; if (utf_tbl) - return 0; - utf_tbl = (unicode_info2*)malloc(CONTSIZE * sizeof(unicode_info2)); - if (utf_tbl) { - size_t j; - for (j = 0; j < CONTSIZE; j++) { - utf_tbl[j].cletter = 0; - utf_tbl[j].clower = (unsigned short)j; - utf_tbl[j].cupper = (unsigned short)j; - } - for (j = 0; j < UTF_LST_LEN; j++) { - utf_tbl[utf_lst[j].c].cletter = 1; - utf_tbl[utf_lst[j].c].clower = utf_lst[j].clower; - utf_tbl[utf_lst[j].c].cupper = utf_lst[j].cupper; - } - } else - return 1; - 
return 0; + return; + utf_tbl = new unicode_info2[CONTSIZE]; + for (size_t j = 0; j < CONTSIZE; ++j) { + utf_tbl[j].cletter = 0; + utf_tbl[j].clower = (unsigned short)j; + utf_tbl[j].cupper = (unsigned short)j; + } + for (size_t j = 0; j < UTF_LST_LEN; ++j) { + utf_tbl[utf_lst[j].c].cletter = 1; + utf_tbl[utf_lst[j].c].clower = utf_lst[j].clower; + utf_tbl[utf_lst[j].c].cupper = utf_lst[j].cupper; + } } #endif #endif @@ -2646,7 +2449,7 @@ void free_utf_tbl() { if (utf_tbl_count > 0) utf_tbl_count--; if (utf_tbl && (utf_tbl_count == 0)) { - free(utf_tbl); + delete[] utf_tbl; utf_tbl = NULL; } } @@ -2731,12 +2534,17 @@ int get_captype_utf8(const std::vector<w_char>& word, int langnum) { size_t ncap = 0; size_t nneutral = 0; size_t firstcap = 0; - for (size_t i = 0; i < word.size(); ++i) { - unsigned short idx = (word[i].h << 8) + word[i].l; - if (idx != unicodetolower(idx, langnum)) + + std::vector<w_char>::const_iterator it = word.begin(); + std::vector<w_char>::const_iterator it_end = word.end(); + while (it != it_end) { + unsigned short idx = (it->h << 8) + it->l; + unsigned short lwridx = unicodetolower(idx, langnum); + if (idx != lwridx) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) + if (unicodetoupper(idx, langnum) == lwridx) nneutral++; + ++it; } if (ncap) { unsigned short idx = (word[0].h << 8) + word[0].l; @@ -2775,18 +2583,6 @@ size_t remove_ignored_chars_utf(std::string& word, return w2.size(); } -namespace { -class is_any_of { - public: - is_any_of(const std::string& in) : chars(in) {} - - bool operator()(char c) { return chars.find(c) != std::string::npos; } - - private: - std::string chars; -}; -} - // strip all ignored characters in the string size_t remove_ignored_chars(std::string& word, const std::string& ignored_chars) { @@ -2796,54 +2592,48 @@ size_t remove_ignored_chars(std::string& word, return word.size(); } -int parse_string(char* line, char** out, int ln) { - char* tp = line; - char* piece; - int i = 0; - int np 
= 0; - if (*out) { +bool parse_string(const std::string& line, std::string& out, int ln) { + if (!out.empty()) { HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); - return 1; + return false; } - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - *out = mystrdup(piece); - if (!*out) - return 1; - np++; - break; - } - default: - break; + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + out.assign(start_piece, iter); + np++; + break; } - i++; + default: + break; } - // free(piece); - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); - return 1; + return false; } - return 0; + return true; } -bool parse_array(char* line, - char** out, +bool parse_array(const std::string& line, + std::string& out, std::vector<w_char>& out_utf16, int utf8, int ln) { - if (parse_string(line, out, ln)) + if (!parse_string(line, out, ln)) return false; if (utf8) { - u8_u16(out_utf16, *out); + u8_u16(out_utf16, out); std::sort(out_utf16.begin(), out_utf16.end()); } return true; diff --git a/libs/hunspell/src/csutil.hxx b/libs/hunspell/src/csutil.hxx index ce7091df55..5d83f80970 100644 --- a/libs/hunspell/src/csutil.hxx +++ b/libs/hunspell/src/csutil.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. 
* - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -71,13 +68,14 @@ * SUCH DAMAGE. */ -#ifndef __CSUTILHXX__ -#define __CSUTILHXX__ +#ifndef CSUTIL_HXX_ +#define CSUTIL_HXX_ #include "hunvisapi.h" // First some base level utility routines +#include <fstream> #include <string> #include <vector> #include <string.h> @@ -127,8 +125,9 @@ #define FORBIDDENWORD 65510 #define ONLYUPCASEFLAG 65511 -// fopen or optional _wfopen to fix long pathname problem of WIN32 -LIBHUNSPELL_DLL_EXPORTED FILE* myfopen(const char* path, const char* mode); +// fix long pathname problem of WIN32 by using w_char std::fstream::open override +LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path, + std::ios_base::openmode mode); // convert UTF-16 characters to UTF-8 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest, @@ -139,21 +138,16 @@ LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest, const std::string& src); // remove end of line char(s) -LIBHUNSPELL_DLL_EXPORTED void mychomp(char* s); +LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s); // duplicate string LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s); -// strcat for limited length destination string -LIBHUNSPELL_DLL_EXPORTED char* mystrcat(char* dest, const char* st, int max); - // parse into tokens with char delimiter -LIBHUNSPELL_DLL_EXPORTED char* mystrsep(char** sptr, const char delim); +LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str, + std::string::const_iterator& start); // replace pat by rep 
in word and return word -LIBHUNSPELL_DLL_EXPORTED char* mystrrep(char* word, - const char* pat, - const char* rep); LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str, const std::string& search, const std::string& replace); @@ -163,13 +157,13 @@ LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str, const std::string& apd); // tokenize into lines with new line -LIBHUNSPELL_DLL_EXPORTED int line_tok(const char* text, - char*** lines, - char breakchar); +LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text, + char breakchar); // tokenize into lines with new line and uniq in place -LIBHUNSPELL_DLL_EXPORTED char* line_uniq(char* text, char breakchar); -LIBHUNSPELL_DLL_EXPORTED char* line_uniq_app(char** text, char breakchar); +LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar); + +LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar); // reverse word LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word); @@ -178,10 +172,7 @@ LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word); LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&); // remove duplicates -LIBHUNSPELL_DLL_EXPORTED int uniqlist(char** list, int n); - -// free character array list -LIBHUNSPELL_DLL_EXPORTED void freelist(char*** list, int n); +LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list); // character encoding information struct cs_info { @@ -190,7 +181,7 @@ struct cs_info { unsigned char cupper; }; -LIBHUNSPELL_DLL_EXPORTED int initialize_utf_tbl(); +LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl(); LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl(); LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c, int langnum); @@ -200,13 +191,13 @@ LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c, int langnum); LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c); -LIBHUNSPELL_DLL_EXPORTED struct cs_info* 
get_current_cs(const char* es); +LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es); // get language identifiers of language codes -LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const char* lang); +LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang); // get characters of the given 8bit encoding with lower- and uppercase forms -LIBHUNSPELL_DLL_EXPORTED char* get_casechars(const char* enc); +LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc); // convert std::string to all caps LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s, @@ -256,10 +247,12 @@ LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars( std::string& word, const std::string& ignored_chars); -LIBHUNSPELL_DLL_EXPORTED int parse_string(char* line, char** out, int ln); +LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line, + std::string& out, + int ln); -LIBHUNSPELL_DLL_EXPORTED bool parse_array(char* line, - char** out, +LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line, + std::string& out, std::vector<w_char>& out_utf16, int utf8, int ln); @@ -270,10 +263,6 @@ LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest, const std::string& morph, const std::string& var); -LIBHUNSPELL_DLL_EXPORTED int morphcmp(const char* s, const char* t); - -LIBHUNSPELL_DLL_EXPORTED int get_sfxcount(const char* morph); - // conversion function for protected memory LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source); diff --git a/libs/hunspell/src/dictmgr.c++ b/libs/hunspell/src/dictmgr.c++ deleted file mode 100644 index 473c09acfe..0000000000 --- a/libs/hunspell/src/dictmgr.c++ +++ /dev/null @@ -1,216 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. - * - * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, - * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, - * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, - * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, - * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. 
- * - * ***** END LICENSE BLOCK ***** */ - -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include <stdio.h> - -#include "dictmgr.hxx" -#include "csutil.hxx" - -DictMgr::DictMgr(const char* dictpath, const char* etype) : numdict(0) { - // load list of etype entries - pdentry = (dictentry*)malloc(MAXDICTIONARIES * sizeof(struct dictentry)); - if (pdentry) { - if (parse_file(dictpath, etype)) { - numdict = 0; - // no dictionary.lst found is okay - } - } -} - -DictMgr::~DictMgr() { - dictentry* pdict = NULL; - if (pdentry) { - pdict = pdentry; - for (int i = 0; i < numdict; i++) { - if (pdict->lang) { - free(pdict->lang); - pdict->lang = NULL; - } - if (pdict->region) { - free(pdict->region); - pdict->region = NULL; - } - if (pdict->filename) { - free(pdict->filename); - pdict->filename = NULL; - } - pdict++; - } - free(pdentry); - pdentry = NULL; - pdict = NULL; - } - numdict = 0; -} - -// read in list of etype entries and build up structure to describe them -int DictMgr::parse_file(const char* dictpath, const char* etype) { - int i; - char line[MAXDICTENTRYLEN + 1]; - dictentry* pdict = pdentry; - - // open the dictionary list file - FILE* dictlst; - dictlst = myfopen(dictpath, "r"); - if (!dictlst) { - return 1; - } - - // step one is to parse the dictionary list building up the - // descriptive structures - - // read in each line ignoring any that dont start with etype - while (fgets(line, MAXDICTENTRYLEN, dictlst)) { - mychomp(line); - - /* parse in a dictionary entry */ - if (strncmp(line, etype, 4) == 0) { - if (numdict < MAXDICTIONARIES) { - char* tp = line; - char* piece; - i = 0; - while ((piece = mystrsep(&tp, ' '))) { - if (*piece != '\0') { - switch (i) { - case 0: - break; - case 1: - pdict->lang = mystrdup(piece); - break; - case 2: - if (strcmp(piece, "ANY") == 0) - pdict->region = mystrdup(""); - else - pdict->region = mystrdup(piece); - break; - case 3: - pdict->filename = mystrdup(piece); - break; - default: - break; - } - i++; - } - 
free(piece); - } - if (i == 4) { - numdict++; - pdict++; - } else { - switch (i) { - case 3: - free(pdict->region); - pdict->region = NULL; - /* FALLTHROUGH */ - case 2: - free(pdict->lang); - pdict->lang = NULL; - default: - break; - } - fprintf(stderr, "dictionary list corruption in line \"%s\"\n", line); - fflush(stderr); - } - } - } - } - fclose(dictlst); - return 0; -} - -// return text encoding of dictionary -int DictMgr::get_list(dictentry** ppentry) { - *ppentry = pdentry; - return numdict; -} - -// strip strings into token based on single char delimiter -// acts like strsep() but only uses a delim char and not -// a delim string - -char* DictMgr::mystrsep(char** stringp, const char delim) { - char* rv = NULL; - char* mp = *stringp; - size_t n = strlen(mp); - if (n > 0) { - char* dp = (char*)memchr(mp, (int)((unsigned char)delim), n); - if (dp) { - *stringp = dp + 1; - size_t nc = dp - mp; - rv = (char*)malloc(nc + 1); - if (rv) { - memcpy(rv, mp, nc); - *(rv + nc) = '\0'; - } - } else { - rv = (char*)malloc(n + 1); - if (rv) { - memcpy(rv, mp, n); - *(rv + n) = '\0'; - *stringp = mp + n; - } - } - } - return rv; -} - -// replaces strdup with ansi version -char* DictMgr::mystrdup(const char* s) { - char* d = NULL; - if (s) { - int sl = strlen(s) + 1; - d = (char*)malloc(sl); - if (d) - memcpy(d, s, sl); - } - return d; -} - -// remove cross-platform text line end characters -void DictMgr::mychomp(char* s) { - int k = strlen(s); - if ((k > 0) && ((*(s + k - 1) == '\r') || (*(s + k - 1) == '\n'))) - *(s + k - 1) = '\0'; - if ((k > 1) && (*(s + k - 2) == '\r')) - *(s + k - 2) = '\0'; -} diff --git a/libs/hunspell/src/dictmgr.hxx b/libs/hunspell/src/dictmgr.hxx deleted file mode 100644 index 98134c3b2f..0000000000 --- a/libs/hunspell/src/dictmgr.hxx +++ /dev/null @@ -1,76 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); 
you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. - * - * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, - * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, - * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, - * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, - * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. 
- * - * ***** END LICENSE BLOCK ***** */ - -#ifndef _DICTMGR_HXX_ -#define _DICTMGR_HXX_ - -#include "hunvisapi.h" - -#define MAXDICTIONARIES 100 -#define MAXDICTENTRYLEN 1024 - -struct dictentry { - char* filename; - char* lang; - char* region; -}; - -class LIBHUNSPELL_DLL_EXPORTED DictMgr { - private: - DictMgr(const DictMgr&); - DictMgr& operator=(const DictMgr&); - - private: - int numdict; - dictentry* pdentry; - - public: - DictMgr(const char* dictpath, const char* etype); - ~DictMgr(); - int get_list(dictentry** ppentry); - - private: - int parse_file(const char* dictpath, const char* etype); - char* mystrsep(char** stringp, const char delim); - char* mystrdup(const char* s); - void mychomp(char* s); -}; - -#endif diff --git a/libs/hunspell/src/filemgr.c++ b/libs/hunspell/src/filemgr.cxx index 2218bc79e1..4a14de8762 100644 --- a/libs/hunspell/src/filemgr.c++ +++ b/libs/hunspell/src/filemgr.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -86,33 +83,33 @@ int FileMgr::fail(const char* err, const char* par) { FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) { in[0] = '\0'; - fin = myfopen(file, "r"); - if (!fin) { + myopen(fin, file, std::ios_base::in); + if (!fin.is_open()) { // check hzipped file std::string st(file); st.append(HZIP_EXTENSION); hin = new Hunzip(st.c_str(), key); } - if (!fin && !hin) + if (!fin.is_open() && !hin->is_open()) fail(MSG_OPEN, file); } FileMgr::~FileMgr() { - if (fin) - fclose(fin); - if (hin) - delete hin; + delete hin; } -char* FileMgr::getline() { - const char* l; - linenum++; - if (fin) - return fgets(in, BUFSIZE - 1, fin); - if (hin && ((l = hin->getline()) != NULL)) - return strcpy(in, l); - linenum--; - return NULL; +bool FileMgr::getline(std::string& dest) { + bool ret = false; + ++linenum; + if (fin.is_open()) { + ret = static_cast<bool>(std::getline(fin, dest)); + } else if (hin->is_open()) { + ret = hin->getline(dest); + } + if (!ret) { + --linenum; + } + return ret; } int FileMgr::getlinenum() { diff --git a/libs/hunspell/src/filemgr.hxx b/libs/hunspell/src/filemgr.hxx index 8b69931ddb..62433aeefe 100644 --- a/libs/hunspell/src/filemgr.hxx +++ b/libs/hunspell/src/filemgr.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). 
- * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -72,21 +69,21 @@ */ /* file manager class - read lines of files [filename] OR [filename.hz] */ -#ifndef _FILEMGR_HXX_ -#define _FILEMGR_HXX_ - -#include "hunvisapi.h" +#ifndef FILEMGR_HXX_ +#define FILEMGR_HXX_ #include "hunzip.hxx" #include <stdio.h> +#include <string> +#include <fstream> -class LIBHUNSPELL_DLL_EXPORTED FileMgr { +class FileMgr { private: FileMgr(const FileMgr&); FileMgr& operator=(const FileMgr&); protected: - FILE* fin; + std::ifstream fin; Hunzip* hin; char in[BUFSIZE + 50]; // input buffer int fail(const char* err, const char* par); @@ -95,7 +92,7 @@ class LIBHUNSPELL_DLL_EXPORTED FileMgr { public: FileMgr(const char* filename, const char* key = NULL); ~FileMgr(); - char* getline(); + bool getline(std::string&); int getlinenum(); }; #endif diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.cxx index c3cd95420f..23421b567a 100644 --- a/libs/hunspell/src/hashmgr.c++ +++ b/libs/hunspell/src/hashmgr.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). 
- * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -98,20 +95,19 @@ HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) numaliasm(0), aliasm(NULL) { langnum = 0; - lang = NULL; - enc = NULL; csconv = 0; - ignorechars = NULL; load_config(apath, key); int ec = load_tables(tpath, key); if (ec) { /* error condition - what should we do here */ HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec); - if (tableptr) { - free(tableptr); - tableptr = NULL; + free(tableptr); + //keep tablesize to 1 to fix possible division with zero + tablesize = 1; + tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*)); + if (!tableptr) { + tablesize = 0; } - tablesize = 0; } } @@ -159,14 +155,6 @@ HashMgr::~HashMgr() { #endif #endif - if (enc) - free(enc); - if (lang) - free(lang); - - if (ignorechars) - free(ignorechars); - #ifdef MOZILLA_CLIENT delete[] csconv; #endif @@ -189,20 +177,21 @@ struct hentry* HashMgr::lookup(const char* word) const { } // add a word to the hash table (private) -int HashMgr::add_word(const char* word, - int wbl, +int HashMgr::add_word(const std::string& in_word, int wcl, unsigned short* aff, int al, - const char* desc, + const std::string* in_desc, bool onlyupcase) { + const std::string* word = &in_word; + const std::string* desc = in_desc; std::string *word_copy = NULL; std::string *desc_copy = NULL; - if (ignorechars || complexprefixes) { - word_copy = new std::string(word, wbl); + if (!ignorechars.empty() || complexprefixes) { + word_copy = new std::string(in_word); - if (ignorechars != NULL) { + if (!ignorechars.empty()) { if (utf8) { wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16); } else { @@ -216,8 +205,8 @@ int 
HashMgr::add_word(const char* word, else reverseword(*word_copy); - if (desc && !aliasm) { - desc_copy = new std::string(desc); + if (in_desc && !aliasm) { + desc_copy = new std::string(*in_desc); if (complexprefixes) { if (utf8) @@ -225,19 +214,18 @@ int HashMgr::add_word(const char* word, else reverseword(*desc_copy); } - desc = desc_copy->c_str(); + desc = desc_copy; } } - wbl = word_copy->size(); - word = word_copy->c_str(); + word = word_copy; } bool upcasehomonym = false; - int descl = desc ? (aliasm ? sizeof(char*) : strlen(desc) + 1) : 0; + int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0; // variable-length hash record with word and optional fields struct hentry* hp = - (struct hentry*)malloc(sizeof(struct hentry) + wbl + descl); + (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl); if (!hp) { delete desc_copy; delete word_copy; @@ -245,11 +233,11 @@ int HashMgr::add_word(const char* word, } char* hpw = hp->word; - strcpy(hpw, word); + strcpy(hpw, word->c_str()); int i = hash(hpw); - hp->blen = (unsigned char)wbl; + hp->blen = (unsigned char)word->size(); hp->clen = (unsigned char)wcl; hp->alen = (short)al; hp->astr = aff; @@ -261,9 +249,9 @@ int HashMgr::add_word(const char* word, hp->var = H_OPT; if (aliasm) { hp->var += H_OPT_ALIASM; - store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); + store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str()))); } else { - strcpy(hpw + wbl + 1, desc); + strcpy(hpw + word->size() + 1, desc->c_str()); } if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; @@ -334,7 +322,7 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word, int wcl, unsigned short* flags, int flagslen, - char* dp, + const std::string* dp, int captype) { if (flags == NULL) flagslen = 0; @@ -359,12 +347,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word, mkallsmall_utf(w, langnum); mkinitcap_utf(w, langnum); u16_u8(st, w); - return add_word(st.c_str(), 
st.size(), wcl, flags2, flagslen + 1, dp, true); + return add_word(st, wcl, flags2, flagslen + 1, dp, true); } else { std::string new_word(word); mkallsmall(new_word, csconv); mkinitcap(new_word, csconv); - int ret = add_word(new_word.c_str(), new_word.size(), wcl, flags2, flagslen + 1, dp, true); + int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true); return ret; } } @@ -372,12 +360,11 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word, } // detect captype and modify word length for UTF-8 encoding -int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { +int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) { int len; if (utf8) { - std::vector<w_char> dest_utf; - len = u8_u16(dest_utf, word); - *captype = get_captype_utf8(dest_utf, langnum); + len = u8_u16(workbuf, word); + *captype = get_captype_utf8(workbuf, langnum); } else { len = word.size(); *captype = get_captype(word, csconv); @@ -385,9 +372,14 @@ int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { return len; } +int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { + std::vector<w_char> workbuf; + return get_clen_and_captype(word, captype, workbuf); +} + // remove word (personal dictionary function for standalone applications) -int HashMgr::remove(const char* word) { - struct hentry* dp = lookup(word); +int HashMgr::remove(const std::string& word) { + struct hentry* dp = lookup(word.c_str()); while (dp) { if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { unsigned short* flags = @@ -397,6 +389,7 @@ int HashMgr::remove(const char* word) { for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; flags[dp->alen] = forbiddenword; + free(dp->astr); dp->astr = flags; dp->alen++; std::sort(flags, flags + dp->alen); @@ -426,6 +419,7 @@ int HashMgr::remove_forbidden_flag(const std::string& word) { flags2[j++] = dp->astr[i]; } dp->alen--; + free(dp->astr); 
dp->astr = flags2; // XXX allowed forbidden words } } @@ -436,36 +430,34 @@ int HashMgr::remove_forbidden_flag(const std::string& word) { // add a custom dic. word to the hash table (public) int HashMgr::add(const std::string& word) { - unsigned short* flags = NULL; - int al = 0; if (remove_forbidden_flag(word)) { int captype; - int wbl = word.size(); + int al = 0; + unsigned short* flags = NULL; int wcl = get_clen_and_captype(word, &captype); - add_word(word.c_str(), wbl, wcl, flags, al, NULL, false); + add_word(word, wcl, flags, al, NULL, false); return add_hidden_capitalized_word(word, wcl, flags, al, NULL, captype); } return 0; } -int HashMgr::add_with_affix(const char* word, const char* example) { +int HashMgr::add_with_affix(const std::string& word, const std::string& example) { // detect captype and modify word length for UTF-8 encoding - struct hentry* dp = lookup(example); + struct hentry* dp = lookup(example.c_str()); remove_forbidden_flag(word); if (dp && dp->astr) { int captype; - int wbl = strlen(word); int wcl = get_clen_and_captype(word, &captype); if (aliasf) { - add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); + add_word(word, wcl, dp->astr, dp->alen, NULL, false); } else { unsigned short* flags = (unsigned short*)malloc(dp->alen * sizeof(unsigned short)); if (flags) { memcpy((void*)flags, (void*)dp->astr, dp->alen * sizeof(unsigned short)); - add_word(word, wbl, wcl, flags, dp->alen, NULL, false); + add_word(word, wcl, flags, dp->alen, NULL, false); } else return 1; } @@ -491,20 +483,14 @@ struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { // load a munched word list and build a hash table on the fly int HashMgr::load_tables(const char* tpath, const char* key) { - int al; - char* ap; - char* dp; - char* dp2; - unsigned short* flags; - char* ts; - // open dictionary file FileMgr* dict = new FileMgr(tpath, key); if (dict == NULL) return 1; // first read the first line of file to get hash table size */ - if ((ts = 
dict->getline()) == NULL) { + std::string ts; + if (!dict->getline(ts)) { HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath); delete dict; return 2; @@ -512,13 +498,11 @@ int HashMgr::load_tables(const char* tpath, const char* key) { mychomp(ts); /* remove byte order mark */ - if (strncmp(ts, "\xEF\xBB\xBF", 3) == 0) { - memmove(ts, ts + 3, strlen(ts + 3) + 1); - // warning: dic file begins with byte order mark: possible incompatibility - // with old Hunspell versions + if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { + ts.erase(0, 3); } - tablesize = atoi(ts); + tablesize = atoi(ts.c_str()); int nExtra = 5 + USERWORD; @@ -544,60 +528,67 @@ int HashMgr::load_tables(const char* tpath, const char* key) { // loop through all words on much list and add to hash // table and create word and affix strings - while ((ts = dict->getline()) != NULL) { + std::vector<w_char> workbuf; + + while (dict->getline(ts)) { mychomp(ts); // split each line into word and morphological description - dp = ts; - while ((dp = strchr(dp, ':')) != NULL) { - if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { - for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--) + size_t dp_pos = 0; + while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) { + if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) { + for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos) ; - if (dp < ts) { // missing word - dp = NULL; + if (dp_pos == 0) { // missing word + dp_pos = std::string::npos; } else { - *(dp + 1) = '\0'; - dp = dp + 2; + ++dp_pos; } break; } - dp++; + ++dp_pos; } // tabulator is the old morphological field separator - dp2 = strchr(ts, '\t'); - if (dp2 && (!dp || dp2 < dp)) { - *dp2 = '\0'; - dp = dp2 + 1; + size_t dp2_pos = ts.find('\t'); + if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) { + dp_pos = dp2_pos + 1; + } + + std::string dp; + if (dp_pos != std::string::npos) { + 
dp.assign(ts.substr(dp_pos)); + ts.resize(dp_pos - 1); } // split each line into word and affix char strings // "\/" signs slash in words (not affix separator) // "/" at beginning of the line is word character (not affix separator) - ap = strchr(ts, '/'); - while (ap) { - if (ap == ts) { - ap++; + size_t ap_pos = ts.find('/'); + while (ap_pos != std::string::npos) { + if (ap_pos == 0) { + ++ap_pos; continue; - } else if (*(ap - 1) != '\\') + } else if (ts[ap_pos - 1] != '\\') break; // replace "\/" with "/" - for (char *sp = ap - 1; *sp; *sp = *(sp + 1), sp++) - ; - ap = strchr(ap, '/'); + ts.erase(ap_pos - 1, 1); + ap_pos = ts.find('/', ap_pos); } - if (ap) { - *ap = '\0'; + unsigned short* flags; + int al; + if (ap_pos != std::string::npos && ap_pos != ts.size()) { + std::string ap(ts.substr(ap_pos + 1)); + ts.resize(ap_pos); if (aliasf) { - int index = atoi(ap + 1); + int index = atoi(ap.c_str()); al = get_aliasf(index, &flags, dict); if (!al) { HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); - *ap = '\0'; } } else { - al = decode_flags(&flags, ap + 1, dict); + al = decode_flags(&flags, ap.c_str(), dict); if (al == -1) { HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); delete dict; @@ -607,16 +598,15 @@ int HashMgr::load_tables(const char* tpath, const char* key) { } } else { al = 0; - ap = NULL; flags = NULL; } int captype; - int wbl = strlen(ts); - int wcl = get_clen_and_captype(ts, &captype); + int wcl = get_clen_and_captype(ts, &captype, workbuf); + const std::string *dp_str = dp.empty() ? 
NULL : &dp; // add the word and its index plus its capitalized form optionally - if (add_word(ts, wbl, wcl, flags, al, dp, false) || - add_hidden_capitalized_word(ts, wcl, flags, al, dp, captype)) { + if (add_word(ts, wcl, flags, al, dp_str, false) || + add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) { delete dict; return 5; } @@ -639,15 +629,15 @@ int HashMgr::hash(const char* word) const { return (unsigned long)hv % tablesize; } -int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { +int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const { int len; - if (*flags == '\0') { + if (flags.empty()) { *result = NULL; return 0; } switch (flag_mode) { case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) - len = strlen(flags); + len = flags.size(); if (len % 2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum()); @@ -656,29 +646,27 @@ int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { if (!*result) return -1; for (int i = 0; i < len; i++) { - (*result)[i] = (((unsigned short)flags[i * 2]) << 8) + - (unsigned short)flags[i * 2 + 1]; + (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) + + (unsigned char)flags[i * 2 + 1]; } break; } case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 // 23 233) - int i; len = 1; - char* src = flags; unsigned short* dest; - char* p; - for (p = flags; *p; p++) { - if (*p == ',') + for (size_t i = 0; i < flags.size(); ++i) { + if (flags[i] == ',') len++; } *result = (unsigned short*)malloc(len * sizeof(unsigned short)); if (!*result) return -1; dest = *result; - for (p = flags; *p; p++) { + const char* src = flags.c_str(); + for (const char* p = src; *p; p++) { if (*p == ',') { - i = atoi(src); + int i = atoi(src); if (i >= DEFAULTFLAGS) HUNSPELL_WARNING( stderr, "error: line %d: flag id %d is too large (max: %d)\n", @@ -691,7 +679,7 @@ int 
HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { dest++; } } - i = atoi(src); + int i = atoi(src); if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n", @@ -714,13 +702,13 @@ int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { } default: { // Ispell's one-character flags (erfg -> e r f g) unsigned short* dest; - len = strlen(flags); + len = flags.size(); *result = (unsigned short*)malloc(len * sizeof(unsigned short)); if (!*result) return -1; dest = *result; - for (unsigned char* p = (unsigned char*)flags; *p; p++) { - *dest = (unsigned short)*p; + for (size_t i = 0; i < flags.size(); ++i) { + *dest = (unsigned char)flags[i]; dest++; } } @@ -728,12 +716,77 @@ int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) { return len; } -unsigned short HashMgr::decode_flag(const char* f) { +bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const { + if (flags.empty()) { + return false; + } + switch (flag_mode) { + case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) + size_t len = flags.size(); + if (len % 2 == 1) + HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", + af->getlinenum()); + len /= 2; + result.reserve(result.size() + len); + for (size_t i = 0; i < len; ++i) { + result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) + + (unsigned char)flags[i * 2 + 1]); + } + break; + } + case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 + // 23 233) + const char* src = flags.c_str(); + for (const char* p = src; *p; p++) { + if (*p == ',') { + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING( + stderr, "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + result.push_back((unsigned short)i); + if (result.back() == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + 
af->getlinenum()); + src = p + 1; + } + } + int i = atoi(src); + if (i >= DEFAULTFLAGS) + HUNSPELL_WARNING(stderr, + "error: line %d: flag id %d is too large (max: %d)\n", + af->getlinenum(), i, DEFAULTFLAGS - 1); + result.push_back((unsigned short)i); + if (result.back() == 0) + HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", + af->getlinenum()); + break; + } + case FLAG_UNI: { // UTF-8 characters + std::vector<w_char> w; + u8_u16(w, flags); + size_t len = w.size(); + size_t origsize = result.size(); + result.resize(origsize + len); + memcpy(&result[origsize], &w[0], len * sizeof(short)); + break; + } + default: { // Ispell's one-character flags (erfg -> e r f g) + result.reserve(flags.size()); + for (size_t i = 0; i < flags.size(); ++i) { + result.push_back((unsigned char)flags[i]); + } + } + } + return true; +} + +unsigned short HashMgr::decode_flag(const char* f) const { unsigned short s = 0; int i; switch (flag_mode) { case FLAG_LONG: - s = ((unsigned short)f[0] << 8) + (unsigned short)f[1]; + s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1]; break; case FLAG_NUM: i = atoi(f); @@ -750,14 +803,14 @@ unsigned short HashMgr::decode_flag(const char* f) { break; } default: - s = (unsigned short)*((unsigned char*)f); + s = *(unsigned char*)f; } if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); return s; } -char* HashMgr::encode_flag(unsigned short f) { +char* HashMgr::encode_flag(unsigned short f) const { if (f == 0) return mystrdup("(NULL)"); std::string ch; @@ -780,7 +833,6 @@ char* HashMgr::encode_flag(unsigned short f) { // read in aff file and set flag mode int HashMgr::load_config(const char* affpath, const char* key) { - char* line; // io buffers int firstline = 1; // open the affix file @@ -794,29 +846,31 @@ int HashMgr::load_config(const char* affpath, const char* key) { // read in each line ignoring any that do not // start with a known line type indicator - while ((line = afflst->getline()) != 
NULL) { + std::string line; + while (afflst->getline(line)) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line, "\xEF\xBB\xBF", 3) == 0) - memmove(line, line + 3, strlen(line + 3) + 1); + if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) { + line.erase(0, 3); + } } /* parse in the try string */ - if ((strncmp(line, "FLAG", 4) == 0) && isspace(line[4])) { + if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) { if (flag_mode != FLAG_CHAR) { HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG " "affix file parameter\n", afflst->getlinenum()); } - if (strstr(line, "long")) + if (line.find("long") != std::string::npos) flag_mode = FLAG_LONG; - if (strstr(line, "num")) + if (line.find("num") != std::string::npos) flag_mode = FLAG_NUM; - if (strstr(line, "UTF-8")) + if (line.find("UTF-8") != std::string::npos) flag_mode = FLAG_UNI; if (flag_mode == FLAG_CHAR) { HUNSPELL_WARNING( @@ -825,21 +879,22 @@ int HashMgr::load_config(const char* affpath, const char* key) { afflst->getlinenum()); } } - if (strncmp(line, "FORBIDDENWORD", 13) == 0) { - char* st = NULL; - if (parse_string(line, &st, afflst->getlinenum())) { + + if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) { + std::string st; + if (!parse_string(line, st, afflst->getlinenum())) { delete afflst; return 1; } - forbiddenword = decode_flag(st); - free(st); + forbiddenword = decode_flag(st.c_str()); } - if (strncmp(line, "SET", 3) == 0) { - if (parse_string(line, &enc, afflst->getlinenum())) { + + if (line.compare(0, 3, "SET", 3) == 0) { + if (!parse_string(line, enc, afflst->getlinenum())) { delete afflst; return 1; } - if (strcmp(enc, "UTF-8") == 0) { + if (enc == "UTF-8") { utf8 = 1; #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT @@ -849,8 +904,9 @@ int HashMgr::load_config(const char* affpath, const char* key) { } else csconv = get_current_cs(enc); } - if (strncmp(line, "LANG", 4) == 0) { - if (parse_string(line, &lang, 
afflst->getlinenum())) { + + if (line.compare(0, 4, "LANG", 4) == 0) { + if (!parse_string(line, lang, afflst->getlinenum())) { delete afflst; return 1; } @@ -859,34 +915,36 @@ int HashMgr::load_config(const char* affpath, const char* key) { /* parse in the ignored characters (for example, Arabic optional diacritics * characters */ - if (strncmp(line, "IGNORE", 6) == 0) { - if (!parse_array(line, &ignorechars, ignorechars_utf16, + if (line.compare(0, 6, "IGNORE", 6) == 0) { + if (!parse_array(line, ignorechars, ignorechars_utf16, utf8, afflst->getlinenum())) { delete afflst; return 1; } } - if ((strncmp(line, "AF", 2) == 0) && isspace(line[2])) { - if (parse_aliasf(line, afflst)) { + if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) { + if (!parse_aliasf(line, afflst)) { delete afflst; return 1; } } - if ((strncmp(line, "AM", 2) == 0) && isspace(line[2])) { - if (parse_aliasm(line, afflst)) { + if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) { + if (!parse_aliasm(line, afflst)) { delete afflst; return 1; } } - if (strncmp(line, "COMPLEXPREFIXES", 15) == 0) + if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) complexprefixes = 1; - if (((strncmp(line, "SFX", 3) == 0) || (strncmp(line, "PFX", 3) == 0)) && - isspace(line[3])) + + if (((line.compare(0, 3, "SFX", 3) == 0) || + (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3])) break; } + if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); delete afflst; @@ -894,57 +952,54 @@ int HashMgr::load_config(const char* affpath, const char* key) { } /* parse in the ALIAS table */ -int HashMgr::parse_aliasf(char* line, FileMgr* af) { +bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { if (numaliasf != 0) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while 
(piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasf = atoi(std::string(start_piece, iter).c_str()); + if (numaliasf < 1) { + numaliasf = 0; + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - case 1: { - numaliasf = atoi(piece); - if (numaliasf < 1) { - numaliasf = 0; - aliasf = NULL; - aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - aliasf = - (unsigned short**)malloc(numaliasf * sizeof(unsigned short*)); - aliasflen = - (unsigned short*)malloc(numaliasf * sizeof(unsigned short)); - if (!aliasf || !aliasflen) { - numaliasf = 0; - if (aliasf) - free(aliasf); - if (aliasflen) - free(aliasflen); - aliasf = NULL; - aliasflen = NULL; - return 1; - } - np++; - break; + aliasf = + (unsigned short**)malloc(numaliasf * sizeof(unsigned short*)); + aliasflen = + (unsigned short*)malloc(numaliasf * sizeof(unsigned short)); + if (!aliasf || !aliasflen) { + numaliasf = 0; + if (aliasf) + free(aliasf); + if (aliasflen) + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + return false; } - default: - break; + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { numaliasf = 0; @@ -954,48 +1009,47 @@ int HashMgr::parse_aliasf(char* line, FileMgr* af) { aliasflen = NULL; HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the numaliasf lines to read in the remainder of the table */ - char* nl; for (int j = 0; j < numaliasf; j++) { - if ((nl = af->getline()) == NULL) - return 1; + std::string nl; + if (!af->getline(nl)) 
+ return false; mychomp(nl); - tp = nl; i = 0; aliasf[j] = NULL; aliasflen[j] = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - if (strncmp(piece, "AF", 2) != 0) { - numaliasf = 0; - free(aliasf); - free(aliasflen); - aliasf = NULL; - aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - return 1; - } - break; - } - case 1: { - aliasflen[j] = - (unsigned short)decode_flags(&(aliasf[j]), piece, af); - std::sort(aliasf[j], aliasf[j] + aliasflen[j]); - break; + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { + numaliasf = 0; + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + return false; } - default: - break; + break; + } + case 1: { + std::string piece(start_piece, iter); + aliasflen[j] = + (unsigned short)decode_flags(&(aliasf[j]), piece, af); + std::sort(aliasf[j], aliasf[j] + aliasflen[j]); + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(nl, iter); } if (!aliasf[j]) { free(aliasf); @@ -1005,17 +1059,17 @@ int HashMgr::parse_aliasf(char* line, FileMgr* af) { numaliasf = 0; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - return 1; + return false; } } - return 0; + return true; } -int HashMgr::is_aliasf() { +int HashMgr::is_aliasf() const { return (aliasf != NULL); } -int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) { +int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const { if ((index > 0) && (index <= numaliasf)) { *fvec = aliasf[index - 1]; return aliasflen[index - 1]; @@ -1027,45 +1081,42 @@ int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) { } /* parse morph alias 
definitions */ -int HashMgr::parse_aliasm(char* line, FileMgr* af) { +bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) { if (numaliasm != 0) { HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); - return 1; + return false; } - char* tp = line; - char* piece; int i = 0; int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 0: { - np++; - break; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numaliasm = atoi(std::string(start_piece, iter).c_str()); + if (numaliasm < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", + af->getlinenum()); + return false; } - case 1: { - numaliasm = atoi(piece); - if (numaliasm < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", - af->getlinenum()); - return 1; - } - aliasm = (char**)malloc(numaliasm * sizeof(char*)); - if (!aliasm) { - numaliasm = 0; - return 1; - } - np++; - break; + aliasm = (char**)malloc(numaliasm * sizeof(char*)); + if (!aliasm) { + numaliasm = 0; + return false; } - default: - break; + np++; + break; } - i++; + default: + break; } - piece = mystrsep(&tp, 0); + ++i; + start_piece = mystrsep(line, iter); } if (np != 2) { numaliasm = 0; @@ -1073,55 +1124,50 @@ int HashMgr::parse_aliasm(char* line, FileMgr* af) { aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); - return 1; + return false; } /* now parse the numaliasm lines to read in the remainder of the table */ - char* nl = line; for (int j = 0; j < numaliasm; j++) { - if ((nl = af->getline()) == NULL) - return 1; + std::string nl; + if (!af->getline(nl)) + return false; mychomp(nl); - tp = nl; - i = 0; aliasm[j] = NULL; - piece = mystrsep(&tp, ' '); - while (piece) { - if (*piece != '\0') { - switch (i) { - case 
0: { - if (strncmp(piece, "AM", 2) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numaliasm = 0; - free(aliasm); - aliasm = NULL; - return 1; - } - break; + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + while (start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + numaliasm = 0; + free(aliasm); + aliasm = NULL; + return false; } - case 1: { - // add the remaining of the line - if (*tp) { - *(tp - 1) = ' '; - tp = tp + strlen(tp); - } - std::string chunk(piece); - if (complexprefixes) { - if (utf8) - reverseword_utf(chunk); - else - reverseword(chunk); - } - aliasm[j] = mystrdup(chunk.c_str()); - break; + break; + } + case 1: { + // add the remaining of the line + std::string::const_iterator end = nl.end(); + std::string chunk(start_piece, end); + if (complexprefixes) { + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); } - default: - break; + aliasm[j] = mystrdup(chunk.c_str()); + break; } - i++; + default: + break; } - piece = mystrsep(&tp, ' '); + ++i; + start_piece = mystrsep(nl, iter); } if (!aliasm[j]) { numaliasm = 0; @@ -1129,17 +1175,17 @@ int HashMgr::parse_aliasm(char* line, FileMgr* af) { aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - return 1; + return false; } } - return 0; + return true; } -int HashMgr::is_aliasm() { +int HashMgr::is_aliasm() const { return (aliasm != NULL); } -char* HashMgr::get_aliasm(int index) { +char* HashMgr::get_aliasm(int index) const { if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1]; HUNSPELL_WARNING(stderr, "error: bad morph. 
alias index: %d\n", index); diff --git a/libs/hunspell/src/hashmgr.hxx b/libs/hunspell/src/hashmgr.hxx index 95b06b13f9..da485d7afa 100644 --- a/libs/hunspell/src/hashmgr.hxx +++ b/libs/hunspell/src/hashmgr.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -71,10 +68,8 @@ * SUCH DAMAGE. 
*/ -#ifndef _HASHMGR_HXX_ -#define _HASHMGR_HXX_ - -#include "hunvisapi.h" +#ifndef HASHMGR_HXX_ +#define HASHMGR_HXX_ #include <stdio.h> #include <string> @@ -86,7 +81,7 @@ enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; -class LIBHUNSPELL_DLL_EXPORTED HashMgr { +class HashMgr { int tablesize; struct hentry** tableptr; flag flag_mode; @@ -94,10 +89,10 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr { int utf8; unsigned short forbiddenword; int langnum; - char* enc; - char* lang; + std::string enc; + std::string lang; struct cs_info* csconv; - char* ignorechars; + std::string ignorechars; std::vector<w_char> ignorechars_utf16; int numaliasf; // flag vector `compression' with aliases unsigned short** aliasf; @@ -114,35 +109,36 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr { struct hentry* walk_hashtable(int& col, struct hentry* hp) const; int add(const std::string& word); - int add_with_affix(const char* word, const char* pattern); - int remove(const char* word); - int decode_flags(unsigned short** result, char* flags, FileMgr* af); - unsigned short decode_flag(const char* flag); - char* encode_flag(unsigned short flag); - int is_aliasf(); - int get_aliasf(int index, unsigned short** fvec, FileMgr* af); - int is_aliasm(); - char* get_aliasm(int index); + int add_with_affix(const std::string& word, const std::string& pattern); + int remove(const std::string& word); + int decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const; + bool decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const; + unsigned short decode_flag(const char* flag) const; + char* encode_flag(unsigned short flag) const; + int is_aliasf() const; + int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const; + int is_aliasm() const; + char* get_aliasm(int index) const; private: int get_clen_and_captype(const std::string& word, int* captype); + int get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> 
&workbuf); int load_tables(const char* tpath, const char* key); - int add_word(const char* word, - int wbl, + int add_word(const std::string& word, int wcl, unsigned short* ap, int al, - const char* desc, + const std::string* desc, bool onlyupcase); int load_config(const char* affpath, const char* key); - int parse_aliasf(char* line, FileMgr* af); + bool parse_aliasf(const std::string& line, FileMgr* af); int add_hidden_capitalized_word(const std::string& word, int wcl, unsigned short* flags, int al, - char* dp, + const std::string* dp, int captype); - int parse_aliasm(char* line, FileMgr* af); + bool parse_aliasm(const std::string& line, FileMgr* af); int remove_forbidden_flag(const std::string& word); }; diff --git a/libs/hunspell/src/htypes.hxx b/libs/hunspell/src/htypes.hxx index d244394416..8f66a0080e 100644 --- a/libs/hunspell/src/htypes.hxx +++ b/libs/hunspell/src/htypes.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -38,8 +35,8 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef _HTYPES_HXX_ -#define _HTYPES_HXX_ +#ifndef HTYPES_HXX_ +#define HTYPES_HXX_ #define ROTATE_LEN 5 diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.cxx index f7c1581087..1ef11df341 100644 --- a/libs/hunspell/src/hunspell.c++ +++ b/libs/hunspell/src/hunspell.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -75,35 +72,100 @@ #include <string.h> #include <stdio.h> +#include "affixmgr.hxx" #include "hunspell.hxx" +#include "suggestmgr.hxx" #include "hunspell.h" -#ifndef MOZILLA_CLIENT -#include "config.h" -#endif #include "csutil.hxx" #include <limits> #include <string> -#define MAXWORDLEN 176 #define MAXWORDUTF8LEN (MAXWORDLEN * 3) -Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { - encoding = NULL; +class HunspellImpl +{ +public: + HunspellImpl(const char* affpath, const char* dpath, const char* key); + ~HunspellImpl(); + int add_dic(const char* dpath, const char* key); + std::vector<std::string> suffix_suggest(const std::string& root_word); + std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl); + std::vector<std::string> generate(const std::string& word, const std::string& pattern); + std::vector<std::string> stem(const std::string& word); + std::vector<std::string> stem(const std::vector<std::string>& morph); + std::vector<std::string> analyze(const std::string& word); + int get_langnum() const; + bool input_conv(const std::string& word, std::string& dest); + bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); + std::vector<std::string> suggest(const std::string& word); + const std::string& get_wordchars() const; + const std::vector<w_char>& get_wordchars_utf16() const; + const std::string& get_dict_encoding() const; + int add(const std::string& word); + int add_with_affix(const std::string& word, const std::string& example); + int remove(const std::string& word); + const std::string& get_version() const; + struct cs_info* get_csconv(); + std::vector<char> dic_encoding_vec; + +private: + AffixMgr* pAMgr; + std::vector<HashMgr*> m_HMgrs; + SuggestMgr* pSMgr; + char* affixpath; + std::string encoding; + struct cs_info* csconv; + int langnum; 
+ int utf8; + int complexprefixes; + std::vector<std::string> wordbreak; + +private: + void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev); + size_t cleanword2(std::string& dest, + std::vector<w_char>& dest_u, + const std::string& src, + int* pcaptype, + size_t* pabbrev); + void mkinitcap(std::string& u8); + int mkinitcap2(std::string& u8, std::vector<w_char>& u16); + int mkinitsmall2(std::string& u8, std::vector<w_char>& u16); + void mkallcap(std::string& u8); + int mkallsmall2(std::string& u8, std::vector<w_char>& u16); + struct hentry* checkword(const std::string& source, int* info, std::string* root); + std::string sharps_u8_l1(const std::string& source); + hentry* + spellsharps(std::string& base, size_t start_pos, int, int, int* info, std::string* root); + int is_keepcase(const hentry* rv); + void insert_sug(std::vector<std::string>& slst, const std::string& word); + void cat_result(std::string& result, const std::string& st); + std::vector<std::string> spellml(const std::string& word); + std::string get_xml_par(const char* par); + const char* get_xml_pos(const char* s, const char* attr); + std::vector<std::string> get_xml_list(const char* list, const char* tag); + int check_xml_par(const char* q, const char* attr, const char* value); +private: + HunspellImpl(const HunspellImpl&); + HunspellImpl& operator=(const HunspellImpl&); +}; + +Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) + : m_Impl(new HunspellImpl(affpath, dpath, key)) { +} + +HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) { csconv = NULL; utf8 = 0; complexprefixes = 0; affixpath = mystrdup(affpath); - maxdic = 0; /* first set up the hash manager */ - pHMgr[0] = new HashMgr(dpath, affpath, key); - if (pHMgr[0]) - maxdic = 1; + m_HMgrs.push_back(new HashMgr(dpath, affpath, key)); /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(affpath, 
pHMgr, &maxdic, key); + pAMgr = new AffixMgr(affpath, m_HMgrs, key); /* get the preferred try string and the dictionary */ /* encoding from the Affix Manager for that dictionary */ @@ -116,6 +178,9 @@ Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { complexprefixes = pAMgr->get_complexprefixes(); wordbreak = pAMgr->get_breaktable(); + dic_encoding_vec.resize(encoding.size()+1); + strcpy(&dic_encoding_vec[0], encoding.c_str()); + /* and finally set up the suggestion manager */ pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); if (try_string) @@ -123,20 +188,20 @@ Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { } Hunspell::~Hunspell() { + delete m_Impl; +} + +HunspellImpl::~HunspellImpl() { delete pSMgr; delete pAMgr; - for (int i = 0; i < maxdic; i++) - delete pHMgr[i]; - maxdic = 0; + for (size_t i = 0; i < m_HMgrs.size(); ++i) + delete m_HMgrs[i]; pSMgr = NULL; pAMgr = NULL; #ifdef MOZILLA_CLIENT delete[] csconv; #endif csconv = NULL; - if (encoding) - free(encoding); - encoding = NULL; if (affixpath) free(affixpath); affixpath = NULL; @@ -144,13 +209,14 @@ Hunspell::~Hunspell() { // load extra dictionaries int Hunspell::add_dic(const char* dpath, const char* key) { - if (maxdic == MAXDIC || !affixpath) - return 1; - pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); - if (pHMgr[maxdic]) - maxdic++; - else + return m_Impl->add_dic(dpath, key); +} + +// load extra dictionaries +int HunspellImpl::add_dic(const char* dpath, const char* key) { + if (!affixpath) return 1; + m_HMgrs.push_back(new HashMgr(dpath, affixpath, key)); return 0; } @@ -161,20 +227,19 @@ int Hunspell::add_dic(const char* dpath, const char* key) { // set the capitalization type // return the length of the "cleaned" (and UTF-8 encoded) word -size_t Hunspell::cleanword2(std::string& dest, +size_t HunspellImpl::cleanword2(std::string& dest, std::vector<w_char>& dest_utf, - const char* src, - int* nc, + const std::string& src, int* 
pcaptype, size_t* pabbrev) { dest.clear(); dest_utf.clear(); - const char* q = src; + const char* q = src.c_str(); // first skip over any leading blanks - while ((*q != '\0') && (*q == ' ')) - q++; + while (*q == ' ') + ++q; // now strip off any trailing periods (recording their presence) *pabbrev = 0; @@ -193,26 +258,25 @@ size_t Hunspell::cleanword2(std::string& dest, dest.append(q, nl); nl = dest.size(); if (utf8) { - *nc = u8_u16(dest_utf, dest); + u8_u16(dest_utf, dest); *pcaptype = get_captype_utf8(dest_utf, langnum); } else { *pcaptype = get_captype(dest, csconv); - *nc = nl; } return nl; } -void Hunspell::cleanword(std::string& dest, - const char* src, +void HunspellImpl::cleanword(std::string& dest, + const std::string& src, int* pcaptype, int* pabbrev) { dest.clear(); - const unsigned char* q = (const unsigned char*)src; + const unsigned char* q = (const unsigned char*)src.c_str(); int firstcap = 0; // first skip over any leading blanks - while ((*q != '\0') && (*q == ' ')) - q++; + while (*q == ' ') + ++q; // now strip off any trailing periods (recording their presence) *pabbrev = 0; @@ -277,7 +341,7 @@ void Hunspell::cleanword(std::string& dest, } } -void Hunspell::mkallcap(std::string& u8) { +void HunspellImpl::mkallcap(std::string& u8) { if (utf8) { std::vector<w_char> u16; u8_u16(u16, u8); @@ -288,7 +352,7 @@ void Hunspell::mkallcap(std::string& u8) { } } -int Hunspell::mkallsmall2(std::string& u8, std::vector<w_char>& u16) { +int HunspellImpl::mkallsmall2(std::string& u8, std::vector<w_char>& u16) { if (utf8) { ::mkallsmall_utf(u16, langnum); u16_u8(u8, u16); @@ -299,19 +363,19 @@ int Hunspell::mkallsmall2(std::string& u8, std::vector<w_char>& u16) { } // convert UTF-8 sharp S codes to latin 1 -std::string Hunspell::sharps_u8_l1(const std::string& source) { +std::string HunspellImpl::sharps_u8_l1(const std::string& source) { std::string dest(source); mystrrep(dest, "\xC3\x9F", "\xDF"); return dest; } // recursive search for right ss - sharp s 
permutations -hentry* Hunspell::spellsharps(std::string& base, +hentry* HunspellImpl::spellsharps(std::string& base, size_t n_pos, int n, int repnum, int* info, - char** root) { + std::string* root) { size_t pos = base.find("ss", n_pos); if (pos != std::string::npos && (n < MAXSHARPS)) { base[pos] = '\xC3'; @@ -326,36 +390,28 @@ hentry* Hunspell::spellsharps(std::string& base, return h; } else if (repnum > 0) { if (utf8) - return checkword(base.c_str(), info, root); + return checkword(base, info, root); std::string tmp(sharps_u8_l1(base)); - return checkword(tmp.c_str(), info, root); + return checkword(tmp, info, root); } return NULL; } -int Hunspell::is_keepcase(const hentry* rv) { +int HunspellImpl::is_keepcase(const hentry* rv) { return pAMgr && rv->astr && pAMgr->get_keepcase() && TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); } -/* insert a word to the beginning of the suggestion array and return ns */ -int Hunspell::insert_sug(char*** slst, const char* word, int ns) { - if (!*slst) - return ns; - char* dup = mystrdup(word); - if (!dup) - return ns; - if (ns == MAXSUGGESTION) { - ns--; - free((*slst)[ns]); - } - for (int k = ns; k > 0; k--) - (*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = dup; - return ns + 1; +/* insert a word to the beginning of the suggestion array */ +void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& word) { + slst.insert(slst.begin(), word); } -int Hunspell::spell(const char* word, int* info, char** root) { +bool Hunspell::spell(const std::string& word, int* info, std::string* root) { + return m_Impl->spell(word, info, root); +} + +bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { struct hentry* rv = NULL; int info2 = 0; @@ -365,15 +421,14 @@ int Hunspell::spell(const char* word, int* info, char** root) { *info = 0; // Hunspell supports XML input of the simplified API (see manual) - if (strcmp(word, SPELL_XML) == 0) - return 1; - int nc = strlen(word); + if (word == 
SPELL_XML) + return true; if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; + if (word.size() >= MAXWORDUTF8LEN) + return false; } else { - if (nc >= MAXWORDLEN) - return 0; + if (word.size() >= MAXWORDLEN) + return false; } int captype = NOCAP; size_t abbv = 0; @@ -383,17 +438,15 @@ int Hunspell::spell(const char* word, int* info, char** root) { std::vector<w_char> sunicw; // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL; { std::string wspace; - int convstatus = rl ? rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + bool convstatus = rl ? rl->conv(word, wspace) : false; + if (convstatus) + wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); else - wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + wl = cleanword2(scw, sunicw, word, &captype, &abbv); } #ifdef MOZILLA_CLIENT @@ -402,10 +455,10 @@ int Hunspell::spell(const char* word, int* info, char** root) { abbv = 1; #endif - if (wl == 0 || maxdic == 0) - return 1; + if (wl == 0 || m_HMgrs.empty()) + return true; if (root) - *root = NULL; + root->clear(); // allow numbers with dots, dashes and commas (but forbid double separators: // "..", "--" etc.) 
@@ -424,7 +477,7 @@ int Hunspell::spell(const char* word, int* info, char** root) { break; } if ((i == wl) && (nstate == NNUM)) - return 1; + return true; switch (captype) { case HUHCAP: @@ -433,22 +486,22 @@ int Hunspell::spell(const char* word, int* info, char** root) { *info += SPELL_ORIGCAP; /* FALLTHROUGH */ case NOCAP: - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if ((abbv) && !(rv)) { std::string u8buffer(scw); u8buffer.push_back('.'); - rv = checkword(u8buffer.c_str(), info, root); + rv = checkword(u8buffer, info, root); } break; case ALLCAP: { *info += SPELL_ORIGCAP; - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if (rv) break; if (abbv) { std::string u8buffer(scw); u8buffer.push_back('.'); - rv = checkword(u8buffer.c_str(), info, root); + rv = checkword(u8buffer, info, root); if (rv) break; } @@ -470,18 +523,18 @@ int Hunspell::spell(const char* word, int* info, char** root) { scw = part1 + part2; sunicw = part1u; sunicw.insert(sunicw.end(), part2u.begin(), part2u.end()); - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if (rv) break; } else { mkinitcap2(part2, sunicw); scw = part1 + part2; - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if (rv) break; } mkinitcap2(scw, sunicw); - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if (rv) break; } @@ -516,7 +569,7 @@ int Hunspell::spell(const char* word, int* info, char** root) { mkinitcap2(scw, sunicw); if (captype == INITCAP) *info += SPELL_INITCAP; - rv = checkword(scw.c_str(), info, root); + rv = checkword(scw, info, root); if (captype == INITCAP) *info -= SPELL_INITCAP; // forbid bad capitalization @@ -531,16 +584,16 @@ int Hunspell::spell(const char* word, int* info, char** root) { if (rv) break; - rv = checkword(u8buffer.c_str(), info, root); + rv = checkword(u8buffer, info, root); if (abbv && !rv) { u8buffer.push_back('.'); - rv = 
checkword(u8buffer.c_str(), info, root); + rv = checkword(u8buffer, info, root); if (!rv) { u8buffer = scw; u8buffer.push_back('.'); if (captype == INITCAP) *info += SPELL_INITCAP; - rv = checkword(u8buffer.c_str(), info, root); + rv = checkword(u8buffer, info, root); if (captype == INITCAP) *info -= SPELL_INITCAP; if (rv && is_keepcase(rv) && (captype == ALLCAP)) @@ -565,89 +618,86 @@ int Hunspell::spell(const char* word, int* info, char** root) { TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { *info += SPELL_WARN; if (pAMgr->get_forbidwarn()) - return 0; - return HUNSPELL_OK_WARN; + return false; + return true; } - return HUNSPELL_OK; + return true; } // recursive breaking at break points - if (wordbreak) { + if (!wordbreak.empty()) { int nbr = 0; wl = scw.size(); - int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; // calculate break points for recursion limit - for (int j = 0; j < numbreak; j++) { - size_t len = strlen(wordbreak[j]); + for (size_t j = 0; j < wordbreak.size(); ++j) { size_t pos = 0; - while ((pos = scw.find(wordbreak[j], pos, len)) != std::string::npos) { + while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) { ++nbr; - pos += len; + pos += wordbreak[j].size(); } } if (nbr >= 10) - return 0; + return false; // check boundary patterns (^begin and end$) - for (int j = 0; j < numbreak; j++) { - size_t plen = strlen(wordbreak[j]); + for (size_t j = 0; j < wordbreak.size(); ++j) { + size_t plen = wordbreak[j].size(); if (plen == 1 || plen > wl) continue; if (wordbreak[j][0] == '^' && - scw.compare(0, plen - 1, wordbreak[j] + 1, plen -1) == 0 && spell(scw.c_str() + plen - 1)) - return 1; + scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1))) + return true; if (wordbreak[j][plen - 1] == '$' && - scw.compare(wl - plen + 1, plen - 1, wordbreak[j], plen - 1) == 0) { - char r = scw[wl - plen + 1]; - scw[wl - plen + 1] = '\0'; - if (spell(scw.c_str())) - return 1; - scw[wl - plen + 1] = r; + scw.compare(wl - 
plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) { + std::string suffix(scw.substr(wl - plen + 1)); + scw.resize(wl - plen + 1); + if (spell(scw)) + return true; + scw.append(suffix); } } // other patterns - for (int j = 0; j < numbreak; j++) { - size_t plen = strlen(wordbreak[j]); + for (size_t j = 0; j < wordbreak.size(); ++j) { + size_t plen = wordbreak[j].size(); size_t found = scw.find(wordbreak[j]); if ((found > 0) && (found < wl - plen)) { - if (!spell(scw.c_str() + found + plen)) + if (!spell(scw.substr(found + plen))) continue; - char r = scw[found]; - scw[found] = '\0'; + std::string suffix(scw.substr(found)); + scw.resize(found); // examine 2 sides of the break point - if (spell(scw.c_str())) - return 1; - scw[found] = r; + if (spell(scw)) + return true; + scw.append(suffix); // LANG_hu: spec. dash rule - if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { - r = scw[found + 1]; - scw[found + 1] = '\0'; - if (spell(scw.c_str())) - return 1; // check the first part with dash - scw[found + 1] = r; + if (langnum == LANG_hu && wordbreak[j] == "-") { + suffix = scw.substr(found + 1); + scw.resize(found + 1); + if (spell(scw)) + return true; // check the first part with dash + scw.append(suffix); } // end of LANG specific region } } } - return 0; + return false; } -struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { - struct hentry* he = NULL; +struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) { bool usebuffer = false; - int len, i; std::string w2; const char* word; + int len; - char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; + const char* ignoredchars = pAMgr ? 
pAMgr->get_ignore() : NULL; if (ignoredchars != NULL) { w2.assign(w); if (utf8) { @@ -658,11 +708,12 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { remove_ignored_chars(w2, ignoredchars); } word = w2.c_str(); + len = w2.size(); usebuffer = true; - } else - word = w; - - len = strlen(word); + } else { + word = w.c_str(); + len = w.size(); + } if (!len) return NULL; @@ -684,8 +735,9 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { } // look word in hash table - for (i = 0; (i < maxdic) && !he; i++) { - he = (pHMgr[i])->lookup(word); + struct hentry* he = NULL; + for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { + he = m_HMgrs[i]->lookup(word); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && @@ -736,40 +788,33 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { return NULL; } if (root) { - std::string word_root(he->word); + root->assign(he->word); if (complexprefixes) { if (utf8) - reverseword_utf(word_root); + reverseword_utf(*root); else - reverseword(word_root); + reverseword(*root); } - *root = mystrdup(word_root.c_str()); } // try check compound word } else if (pAMgr->get_compound()) { struct hentry* rwords[100]; // buffer for COMPOUND pattern checking - he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info); + he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info); // LANG_hu section: `moving rule' with last dash if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) { - char* dup = mystrdup(word); - if (!dup) - return NULL; - dup[len - 1] = '\0'; - he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, - info); - free(dup); + std::string dup(word, len - 1); + he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info); } // end of LANG specific region if (he) { if (root) { - std::string word_root(he->word); + 
root->assign(he->word); if (complexprefixes) { if (utf8) - reverseword_utf(word_root); + reverseword_utf(*root); else - reverseword(word_root); + reverseword(*root); } - *root = mystrdup(word_root.c_str()); } if (info) *info += SPELL_COMPOUND; @@ -780,22 +825,27 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { return he; } -int Hunspell::suggest(char*** slst, const char* word) { +std::vector<std::string> Hunspell::suggest(const std::string& word) { + return m_Impl->suggest(word); +} + +std::vector<std::string> HunspellImpl::suggest(const std::string& word) { + std::vector<std::string> slst; + int onlycmpdsug = 0; - if (!pSMgr || maxdic == 0) - return 0; - *slst = NULL; + if (!pSMgr || m_HMgrs.empty()) + return slst; + // process XML input of the simplified API (see manual) - if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { - return spellml(slst, word); + if (word.compare(0, sizeof(SPELL_XML) - 3, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { + return spellml(word); } - int nc = strlen(word); if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; + if (word.size() >= MAXWORDUTF8LEN) + return slst; } else { - if (nc >= MAXWORDLEN) - return 0; + if (word.size() >= MAXWORDLEN) + return slst; } int captype = NOCAP; size_t abbv = 0; @@ -809,121 +859,102 @@ int Hunspell::suggest(char*** slst, const char* word) { { std::string wspace; - int convstatus = rl ? rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + bool convstatus = rl ? 
rl->conv(word, wspace) : false; + if (convstatus) + wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); else - wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + wl = cleanword2(scw, sunicw, word, &captype, &abbv); if (wl == 0) - return 0; + return slst; } - int ns = 0; int capwords = 0; // check capitalized form for FORCEUCASE if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { int info = SPELL_ORIGCAP; - if (checkword(scw.c_str(), &info, NULL)) { + if (checkword(scw, &info, NULL)) { std::string form(scw); mkinitcap(form); - - char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); - if (wlst == NULL) - return -1; - *slst = wlst; - wlst[0] = mystrdup(form.c_str()); - for (int i = 1; i < MAXSUGGESTION; ++i) { - wlst[i] = NULL; - } - - return 1; + slst.push_back(form); + return slst; } } switch (captype) { case NOCAP: { - ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); + pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); break; } case INITCAP: { capwords = 1; - ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); - if (ns == -1) - break; + pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); std::string wspace(scw); mkallsmall2(wspace, sunicw); - ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); + pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); break; } case HUHINITCAP: capwords = 1; case HUHCAP: { - ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug); - if (ns != -1) { - // something.The -> something. 
The - size_t dot_pos = scw.find('.'); - if (dot_pos != std::string::npos) { - std::string postdot = scw.substr(dot_pos + 1); - int captype_; - if (utf8) { - std::vector<w_char> postdotu; - u8_u16(postdotu, postdot); - captype_ = get_captype_utf8(postdotu, langnum); - } else { - captype_ = get_captype(postdot, csconv); - } - if (captype_ == INITCAP) { - std::string str(scw); - str.insert(dot_pos + 1, 1, ' '); - ns = insert_sug(slst, str.c_str(), ns); - } + pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + // something.The -> something. The + size_t dot_pos = scw.find('.'); + if (dot_pos != std::string::npos) { + std::string postdot = scw.substr(dot_pos + 1); + int captype_; + if (utf8) { + std::vector<w_char> postdotu; + u8_u16(postdotu, postdot); + captype_ = get_captype_utf8(postdotu, langnum); + } else { + captype_ = get_captype(postdot, csconv); + } + if (captype_ == INITCAP) { + std::string str(scw); + str.insert(dot_pos + 1, 1, ' '); + insert_sug(slst, str); } + } - std::string wspace; + std::string wspace; - if (captype == HUHINITCAP) { - // TheOpenOffice.org -> The OpenOffice.org - wspace = scw; - mkinitsmall2(wspace, sunicw); - ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); - } + if (captype == HUHINITCAP) { + // TheOpenOffice.org -> The OpenOffice.org wspace = scw; - mkallsmall2(wspace, sunicw); + mkinitsmall2(wspace, sunicw); + pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + } + wspace = scw; + mkallsmall2(wspace, sunicw); + if (spell(wspace.c_str())) + insert_sug(slst, wspace); + size_t prevns = slst.size(); + pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (captype == HUHINITCAP) { + mkinitcap2(wspace, sunicw); if (spell(wspace.c_str())) - ns = insert_sug(slst, wspace.c_str(), ns); - int prevns = ns; - ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); - if (captype == HUHINITCAP) { - mkinitcap2(wspace, sunicw); - if (spell(wspace.c_str())) - ns = insert_sug(slst, wspace.c_str(), ns); - ns = pSMgr->suggest(slst, 
wspace.c_str(), ns, &onlycmpdsug); - } - // aNew -> "a New" (instead of "a new") - for (int j = prevns; j < ns; j++) { - char* space = strchr((*slst)[j], ' '); - if (space) { - size_t slen = strlen(space + 1); - // different case after space (need capitalisation) - if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) { - std::string first((*slst)[j], space + 1); - std::string second(space + 1); - std::vector<w_char> w; - if (utf8) - u8_u16(w, second); - mkinitcap2(second, w); - // set as first suggestion - char* r = (*slst)[j]; - for (int k = j; k > 0; k--) - (*slst)[k] = (*slst)[k - 1]; - free(r); - (*slst)[0] = mystrdup((first + second).c_str()); - } + insert_sug(slst, wspace); + pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + } + // aNew -> "a New" (instead of "a new") + for (size_t j = prevns; j < slst.size(); ++j) { + const char* space = strchr(slst[j].c_str(), ' '); + if (space) { + size_t slen = strlen(space + 1); + // different case after space (need capitalisation) + if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) { + std::string first(slst[j].c_str(), space + 1); + std::string second(space + 1); + std::vector<w_char> w; + if (utf8) + u8_u16(w, second); + mkinitcap2(second, w); + // set as first suggestion + slst.erase(slst.begin() + j); + slst.insert(slst.begin(), first + second); } } } @@ -933,28 +964,20 @@ int Hunspell::suggest(char*** slst, const char* word) { case ALLCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); - if (ns == -1) - break; + pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) - ns = insert_sug(slst, wspace.c_str(), ns); + insert_sug(slst, wspace); mkinitcap2(wspace, sunicw); - ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug); - for (int j = 0; j < ns; j++) { - std::string form((*slst)[j]); - mkallcap(form); - + pSMgr->suggest(slst, wspace.c_str(), 
&onlycmpdsug); + for (size_t j = 0; j < slst.size(); ++j) { + mkallcap(slst[j]); if (pAMgr && pAMgr->get_checksharps()) { if (utf8) { - mystrrep(form, "\xC3\x9F", "SS"); + mystrrep(slst[j], "\xC3\x9F", "SS"); } else { - mystrrep(form, "\xDF", "SS"); + mystrrep(slst[j], "\xDF", "SS"); } } - - free((*slst)[j]); - (*slst)[j] = mystrdup(form.c_str()); - } break; } @@ -962,29 +985,27 @@ int Hunspell::suggest(char*** slst, const char* word) { // LANG_hu section: replace '-' with ' ' in Hungarian if (langnum == LANG_hu) { - for (int j = 0; j < ns; j++) { - char* pos = strchr((*slst)[j], '-'); - if (pos) { + for (size_t j = 0; j < slst.size(); ++j) { + size_t pos = slst[j].find('-'); + if (pos != std::string::npos) { int info; - *pos = '\0'; - std::string w((*slst)[j]); - w.append(pos + 1); - (void)spell(w.c_str(), &info, NULL); + std::string w(slst[j].substr(0, pos)); + w.append(slst[j].substr(pos + 1)); + (void)spell(w, &info, NULL); if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { - *pos = ' '; + slst[j][pos] = ' '; } else - *pos = '-'; + slst[j][pos] = '-'; } } } // END OF LANG_hu section // try ngram approach since found nothing or only compound words - if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && - (*slst)) { + if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) { switch (captype) { case NOCAP: { - ns = pSMgr->ngsuggest(*slst, scw.c_str(), ns, pHMgr, maxdic); + pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs); break; } case HUHINITCAP: @@ -992,26 +1013,23 @@ int Hunspell::suggest(char*** slst, const char* word) { case HUHCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); + pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); break; } case INITCAP: { capwords = 1; std::string wspace(scw); mkallsmall2(wspace, sunicw); - ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); + pSMgr->ngsuggest(slst, wspace.c_str(), 
m_HMgrs); break; } case ALLCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - int oldns = ns; - ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic); - for (int j = oldns; j < ns; j++) { - std::string form((*slst)[j]); - mkallcap(form); - free((*slst)[j]); - (*slst)[j] = mystrdup(form.c_str()); + size_t oldns = slst.size(); + pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); + for (size_t j = oldns; j < slst.size(); ++j) { + mkallcap(slst[j]); } break; } @@ -1022,8 +1040,8 @@ int Hunspell::suggest(char*** slst, const char* word) { size_t dash_pos = scw.find('-'); if (dash_pos != std::string::npos) { int nodashsug = 1; - for (int j = 0; j < ns && nodashsug == 1; j++) { - if (strchr((*slst)[j], '-')) + for (size_t j = 0; j < slst.size() && nodashsug == 1; ++j) { + if (slst[j].find('-') != std::string::npos) nodashsug = 0; } @@ -1035,20 +1053,16 @@ int Hunspell::suggest(char*** slst, const char* word) { last = 1; std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); if (!spell(chunk.c_str())) { - char** nlst = NULL; - int nn = suggest(&nlst, chunk.c_str()); - for (int j = nn - 1; j >= 0; j--) { + std::vector<std::string> nlst = suggest(chunk.c_str()); + for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) { std::string wspace = scw.substr(0, prev_pos); - wspace.append(nlst[j]); + wspace.append(*j); if (!last) { wspace.append("-"); wspace.append(scw.substr(dash_pos + 1)); } - ns = insert_sug(slst, wspace.c_str(), ns); - free(nlst[j]); + insert_sug(slst, wspace); } - if (nlst != NULL) - free(nlst); nodashsug = 0; } if (!last) { @@ -1062,31 +1076,24 @@ int Hunspell::suggest(char*** slst, const char* word) { // word reversing wrapper for complex prefixes if (complexprefixes) { - for (int j = 0; j < ns; j++) { - std::string root((*slst)[j]); - free((*slst)[j]); + for (size_t j = 0; j < slst.size(); ++j) { if (utf8) - reverseword_utf(root); + reverseword_utf(slst[j]); else - reverseword(root); - (*slst)[j] 
= mystrdup(root.c_str()); + reverseword(slst[j]); } } // capitalize if (capwords) - for (int j = 0; j < ns; j++) { - std::string form((*slst)[j]); - free((*slst)[j]); - mkinitcap(form); - (*slst)[j] = mystrdup(form.c_str()); + for (size_t j = 0; j < slst.size(); ++j) { + mkinitcap(slst[j]); } // expand suggestions with dot(s) if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (int j = 0; j < ns; j++) { - (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); - strcat((*slst)[j], word + strlen(word) - abbv); + for (size_t j = 0; j < slst.size(); ++j) { + slst[j].append(word.substr(word.size() - abbv)); } } @@ -1095,96 +1102,90 @@ int Hunspell::suggest(char*** slst, const char* word) { switch (captype) { case INITCAP: case ALLCAP: { - int l = 0; - for (int j = 0; j < ns; j++) { - if (!strchr((*slst)[j], ' ') && !spell((*slst)[j])) { + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { std::string s; std::vector<w_char> w; if (utf8) { - u8_u16(w, (*slst)[j]); + u8_u16(w, slst[j]); } else { - s = (*slst)[j]; + s = slst[j]; } mkallsmall2(s, w); - free((*slst)[j]); - if (spell(s.c_str())) { - (*slst)[l] = mystrdup(s.c_str()); - if ((*slst)[l]) - l++; + if (spell(s)) { + slst[l] = s; + ++l; } else { mkinitcap2(s, w); - if (spell(s.c_str())) { - (*slst)[l] = mystrdup(s.c_str()); - if ((*slst)[l]) - l++; + if (spell(s)) { + slst[l] = s; + ++l; } } } else { - (*slst)[l] = (*slst)[j]; - l++; + slst[l] = slst[j]; + ++l; } } - ns = l; + slst.resize(l); } } } // remove duplications - int l = 0; - for (int j = 0; j < ns; j++) { - (*slst)[l] = (*slst)[j]; - for (int k = 0; k < l; k++) { - if (strcmp((*slst)[k], (*slst)[j]) == 0) { - free((*slst)[j]); - l--; + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + slst[l] = slst[j]; + for (size_t k = 0; k < l; ++k) { + if (slst[k] == slst[j]) { + --l; break; } } - l++; + ++l; } - ns = l; + slst.resize(l); // output conversion rl = 
(pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (int j = 0; rl && j < ns; j++) { + for (size_t j = 0; rl && j < slst.size(); ++j) { std::string wspace; - if (rl->conv((*slst)[j], wspace) > 0) { - free((*slst)[j]); - (*slst)[j] = mystrdup(wspace.c_str()); + if (rl->conv(slst[j], wspace)) { + slst[j] = wspace; } } - // if suggestions removed by nosuggest, onlyincompound parameters - if (l == 0 && *slst) { - free(*slst); - *slst = NULL; - } - return l; + return slst; } -void Hunspell::free_list(char*** slst, int n) { - freelist(slst, n); +const std::string& Hunspell::get_dict_encoding() const { + return m_Impl->get_dict_encoding(); } -char* Hunspell::get_dic_encoding() { +const std::string& HunspellImpl::get_dict_encoding() const { return encoding; } -int Hunspell::stem(char*** slst, char** desc, int n) { +std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) { + return m_Impl->stem(desc); +} + +std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) { + std::vector<std::string> slst; std::string result2; - *slst = NULL; - if (n == 0) - return 0; - for (int i = 0; i < n; i++) { + if (desc.empty()) + return slst; + for (size_t i = 0; i < desc.size(); ++i) { std::string result; // add compound word parts (except the last one) - char* s = (char*)desc[i]; - char* part = strstr(s, MORPH_PART); + const char* s = desc[i].c_str(); + const char* part = strstr(s, MORPH_PART); if (part) { - char* nextpart = strstr(part + 1, MORPH_PART); + const char* nextpart = strstr(part + 1, MORPH_PART); while (nextpart) { std::string field; copy_field(field, part, MORPH_PART); @@ -1195,36 +1196,34 @@ int Hunspell::stem(char*** slst, char** desc, int n) { s = part; } - char** pl; std::string tok(s); size_t alt = 0; while ((alt = tok.find(" | ", alt)) != std::string::npos) { tok[alt + 1] = MSEP_ALT; } - int pln = line_tok(tok.c_str(), &pl, MSEP_ALT); - for (int k = 0; k < pln; k++) { + std::vector<std::string> pl = line_tok(tok, MSEP_ALT); + 
for (size_t k = 0; k < pl.size(); ++k) { // add derivational suffixes - if (strstr(pl[k], MORPH_DERI_SFX)) { + if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) { // remove inflectional suffixes - char* is = strstr(pl[k], MORPH_INFL_SFX); - if (is) - *is = '\0'; - char* sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); - if (sg) { - char** gen; - int genl = line_tok(sg, &gen, MSEP_REC); - free(sg); - for (int j = 0; j < genl; j++) { + const size_t is = pl[k].find(MORPH_INFL_SFX); + if (is != std::string::npos) + pl[k].resize(is); + std::vector<std::string> singlepl; + singlepl.push_back(pl[k]); + std::string sg = pSMgr->suggest_gen(singlepl, pl[k]); + if (!sg.empty()) { + std::vector<std::string> gen = line_tok(sg, MSEP_REC); + for (size_t j = 0; j < gen.size(); ++j) { result2.push_back(MSEP_REC); result2.append(result); result2.append(gen[j]); } - freelist(&gen, genl); } } else { result2.push_back(MSEP_REC); result2.append(result); - if (strstr(pl[k], MORPH_SURF_PFX)) { + if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) { std::string field; copy_field(field, pl[k], MORPH_SURF_PFX); result2.append(field); @@ -1234,29 +1233,41 @@ int Hunspell::stem(char*** slst, char** desc, int n) { result2.append(field); } } - freelist(&pl, pln); } - int sln = line_tok(result2.c_str(), slst, MSEP_REC); - return uniqlist(*slst, sln); + slst = line_tok(result2, MSEP_REC); + uniqlist(slst); + return slst; } -int Hunspell::stem(char*** slst, const char* word) { - char** pl; - int pln = analyze(&pl, word); - int pln2 = stem(slst, pl, pln); - freelist(&pl, pln); - return pln2; +std::vector<std::string> Hunspell::stem(const std::string& word) { + return m_Impl->stem(word); +} + +std::vector<std::string> HunspellImpl::stem(const std::string& word) { + return stem(analyze(word)); +} + +const char* Hunspell::get_wordchars() const { + return m_Impl->get_wordchars().c_str(); } -const char* Hunspell::get_wordchars() { +const std::string& Hunspell::get_wordchars_cpp() const { + return 
m_Impl->get_wordchars(); +} + +const std::string& HunspellImpl::get_wordchars() const { return pAMgr->get_wordchars(); } -const std::vector<w_char>& Hunspell::get_wordchars_utf16() { +const std::vector<w_char>& Hunspell::get_wordchars_utf16() const { + return m_Impl->get_wordchars_utf16(); +} + +const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const { return pAMgr->get_wordchars_utf16(); } -void Hunspell::mkinitcap(std::string& u8) { +void HunspellImpl::mkinitcap(std::string& u8) { if (utf8) { std::vector<w_char> u16; u8_u16(u16, u8); @@ -1267,7 +1278,7 @@ void Hunspell::mkinitcap(std::string& u8) { } } -int Hunspell::mkinitcap2(std::string& u8, std::vector<w_char>& u16) { +int HunspellImpl::mkinitcap2(std::string& u8, std::vector<w_char>& u16) { if (utf8) { ::mkinitcap_utf(u16, langnum); u16_u8(u8, u16); @@ -1277,7 +1288,7 @@ int Hunspell::mkinitcap2(std::string& u8, std::vector<w_char>& u16) { return u8.size(); } -int Hunspell::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) { +int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) { if (utf8) { ::mkinitsmall_utf(u16, langnum); u16_u8(u8, u16); @@ -1287,52 +1298,78 @@ int Hunspell::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) { return u8.size(); } -int Hunspell::add(const char* word) { - if (pHMgr[0]) - return (pHMgr[0])->add(word); +int Hunspell::add(const std::string& word) { + return m_Impl->add(word); +} + +int HunspellImpl::add(const std::string& word) { + if (!m_HMgrs.empty()) + return m_HMgrs[0]->add(word); return 0; } -int Hunspell::add_with_affix(const char* word, const char* example) { - if (pHMgr[0]) - return (pHMgr[0])->add_with_affix(word, example); +int Hunspell::add_with_affix(const std::string& word, const std::string& example) { + return m_Impl->add_with_affix(word, example); +} + +int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) { + if (!m_HMgrs.empty()) + return m_HMgrs[0]->add_with_affix(word, example); 
return 0; } -int Hunspell::remove(const char* word) { - if (pHMgr[0]) - return (pHMgr[0])->remove(word); +int Hunspell::remove(const std::string& word) { + return m_Impl->remove(word); +} + +int HunspellImpl::remove(const std::string& word) { + if (!m_HMgrs.empty()) + return m_HMgrs[0]->remove(word); return 0; } -const char* Hunspell::get_version() { +const char* Hunspell::get_version() const { + return m_Impl->get_version().c_str(); +} + +const std::string& Hunspell::get_version_cpp() const { + return m_Impl->get_version(); +} + +const std::string& HunspellImpl::get_version() const { return pAMgr->get_version(); } -struct cs_info* Hunspell::get_csconv() { +struct cs_info* HunspellImpl::get_csconv() { return csconv; } -void Hunspell::cat_result(std::string& result, char* st) { - if (st) { +struct cs_info* Hunspell::get_csconv() { + return m_Impl->get_csconv(); +} + +void HunspellImpl::cat_result(std::string& result, const std::string& st) { + if (!st.empty()) { if (!result.empty()) result.append("\n"); result.append(st); - free(st); } } -int Hunspell::analyze(char*** slst, const char* word) { - *slst = NULL; - if (!pSMgr || maxdic == 0) - return 0; - int nc = strlen(word); +std::vector<std::string> Hunspell::analyze(const std::string& word) { + return m_Impl->analyze(word); +} + +std::vector<std::string> HunspellImpl::analyze(const std::string& word) { + std::vector<std::string> slst; + if (!pSMgr || m_HMgrs.empty()) + return slst; if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; + if (word.size() >= MAXWORDUTF8LEN) + return slst; } else { - if (nc >= MAXWORDLEN) - return 0; + if (word.size() >= MAXWORDLEN) + return slst; } int captype = NOCAP; size_t abbv = 0; @@ -1346,13 +1383,11 @@ int Hunspell::analyze(char*** slst, const char* word) { { std::string wspace; - int convstatus = rl ? 
rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv); + bool convstatus = rl ? rl->conv(word, wspace) : false; + if (convstatus) + wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); else - wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv); + wl = cleanword2(scw, sunicw, word, &captype, &abbv); } if (wl == 0) { @@ -1362,18 +1397,18 @@ int Hunspell::analyze(char*** slst, const char* word) { scw.push_back('.'); abbv = 0; } else - return 0; + return slst; } std::string result; size_t n = 0; - size_t n2 = 0; - size_t n3 = 0; - // test numbers // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { + size_t n2 = 0; + size_t n3 = 0; + while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) || (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) { n++; @@ -1387,22 +1422,20 @@ int Hunspell::analyze(char*** slst, const char* word) { } if ((n == wl) && (n3 > 0) && (n - n3 > 3)) - return 0; + return slst; if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) && - checkword(scw.c_str() + n, NULL, NULL))) { + checkword(scw.substr(n), NULL, NULL))) { result.append(scw); result.resize(n - 1); if (n == wl) - cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1)); + cat_result(result, pSMgr->suggest_morph(scw.substr(n - 1))); else { - char sign = scw[n]; - scw[n] = '\0'; - cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1)); + std::string chunk = scw.substr(n - 1, 1); + cat_result(result, pSMgr->suggest_morph(chunk)); result.push_back('+'); // XXX SPEC. 
MORPHCODE - scw[n] = sign; - cat_result(result, pSMgr->suggest_morph(scw.c_str() + n)); + cat_result(result, pSMgr->suggest_morph(scw.substr(n))); } - return line_tok(result.c_str(), slst, MSEP_REC); + return line_tok(result, MSEP_REC); } } // END OF LANG_hu section @@ -1411,52 +1444,52 @@ int Hunspell::analyze(char*** slst, const char* word) { case HUHCAP: case HUHINITCAP: case NOCAP: { - cat_result(result, pSMgr->suggest_morph(scw.c_str())); + cat_result(result, pSMgr->suggest_morph(scw)); if (abbv) { std::string u8buffer(scw); u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); } break; } case INITCAP: { - wl = mkallsmall2(scw, sunicw); + mkallsmall2(scw, sunicw); std::string u8buffer(scw); mkinitcap2(scw, sunicw); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); - cat_result(result, pSMgr->suggest_morph(scw.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); + cat_result(result, pSMgr->suggest_morph(scw)); if (abbv) { u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); u8buffer = scw; u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); } break; } case ALLCAP: { - cat_result(result, pSMgr->suggest_morph(scw.c_str())); + cat_result(result, pSMgr->suggest_morph(scw)); if (abbv) { std::string u8buffer(scw); u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); } mkallsmall2(scw, sunicw); std::string u8buffer(scw); mkinitcap2(scw, sunicw); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); - cat_result(result, pSMgr->suggest_morph(scw.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); + cat_result(result, pSMgr->suggest_morph(scw)); if (abbv) { u8buffer.push_back('.'); 
- cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); u8buffer = scw; u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer.c_str())); + cat_result(result, pSMgr->suggest_morph(u8buffer)); } break; } @@ -1470,62 +1503,58 @@ int Hunspell::analyze(char*** slst, const char* word) { else reverseword(result); } - return line_tok(result.c_str(), slst, MSEP_REC); + return line_tok(result, MSEP_REC); } // compound word with dash (HU) I18n // LANG_hu section: set dash information for suggestions size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos; - int nresult = 0; if (dash_pos != std::string::npos) { + int nresult = 0; + std::string part1 = scw.substr(0, dash_pos); std::string part2 = scw.substr(dash_pos+1); // examine 2 sides of the dash if (part2.empty()) { // base word ending with dash - if (spell(part1.c_str())) { - char* p = pSMgr->suggest_morph(part1.c_str()); - if (p) { - int ret = line_tok(p, slst, MSEP_REC); - free(p); - return ret; + if (spell(part1)) { + std::string p = pSMgr->suggest_morph(part1); + if (!p.empty()) { + slst = line_tok(p, MSEP_REC); + return slst; } } } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat. - if (spell(part1.c_str()) && (spell("-e"))) { - char* st = pSMgr->suggest_morph(part1.c_str()); - if (st) { + if (spell(part1) && (spell("-e"))) { + std::string st = pSMgr->suggest_morph(part1); + if (!st.empty()) { result.append(st); - free(st); } result.push_back('+'); // XXX spec. separator in MORPHCODE st = pSMgr->suggest_morph("-e"); - if (st) { + if (!st.empty()) { result.append(st); - free(st); } - return line_tok(result.c_str(), slst, MSEP_REC); + return line_tok(result, MSEP_REC); } } else { // first word ending with dash: word- XXX ??? 
part1.push_back(' '); - nresult = spell(part1.c_str()); + nresult = spell(part1); part1.erase(part1.size() - 1); - if (nresult && spell(part2.c_str()) && + if (nresult && spell(part2) && ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) { - char* st = pSMgr->suggest_morph(part1.c_str()); - if (st) { + std::string st = pSMgr->suggest_morph(part1); + if (!st.empty()) { result.append(st); - free(st); result.push_back('+'); // XXX spec. separator in MORPHCODE } - st = pSMgr->suggest_morph(part2.c_str()); - if (st) { + st = pSMgr->suggest_morph(part2); + if (!st.empty()) { result.append(st); - free(st); } - return line_tok(result.c_str(), slst, MSEP_REC); + return line_tok(result, MSEP_REC); } } // affixed number in correct word @@ -1550,37 +1579,38 @@ int Hunspell::analyze(char*** slst, const char* word) { continue; } std::string chunk = scw.substr(dash_pos - n); - if (checkword(chunk.c_str(), NULL, NULL)) { + if (checkword(chunk, NULL, NULL)) { result.append(chunk); - char* st = pSMgr->suggest_morph(chunk.c_str()); - if (st) { + std::string st = pSMgr->suggest_morph(chunk); + if (!st.empty()) { result.append(st); - free(st); } - return line_tok(result.c_str(), slst, MSEP_REC); + return line_tok(result, MSEP_REC); } } } } - return 0; + return slst; } -int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { - *slst = NULL; - if (!pSMgr || !pln) - return 0; - char** pl2; - int pl2n = analyze(&pl2, word); +std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) { + return m_Impl->generate(word, pl); +} + +std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) { + std::vector<std::string> slst; + if (!pSMgr || pl.empty()) + return slst; + std::vector<std::string> pl2 = analyze(word); int captype = NOCAP; int abbv = 0; std::string cw; cleanword(cw, word, &captype, &abbv); std::string result; - for (int i = 0; i < pln; i++) { - 
cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + for (size_t i = 0; i < pl.size(); ++i) { + cat_result(result, pSMgr->suggest_gen(pl2, pl[i])); } - freelist(&pl2, pl2n); if (!result.empty()) { // allcap @@ -1588,50 +1618,42 @@ int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { mkallcap(result); // line split - int linenum = line_tok(result.c_str(), slst, MSEP_REC); + slst = line_tok(result, MSEP_REC); // capitalize if (captype == INITCAP || captype == HUHINITCAP) { - for (int j = 0; j < linenum; j++) { - std::string form((*slst)[j]); - free((*slst)[j]); - mkinitcap(form); - (*slst)[j] = mystrdup(form.c_str()); + for (size_t j = 0; j < slst.size(); ++j) { + mkinitcap(slst[j]); } } // temporary filtering of prefix related errors (eg. // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") - - int r = 0; - for (int j = 0; j < linenum; j++) { - if (!spell((*slst)[j])) { - free((*slst)[j]); - (*slst)[j] = NULL; - } else { - if (r < j) - (*slst)[r] = (*slst)[j]; - r++; + std::vector<std::string>::iterator it = slst.begin(); + while (it != slst.end()) { + if (!spell(*it)) { + it = slst.erase(it); + } else { + ++it; } } - if (r > 0) - return r; - free(*slst); - *slst = NULL; } - return 0; + return slst; } -int Hunspell::generate(char*** slst, const char* word, const char* pattern) { - char** pl; - int pln = analyze(&pl, pattern); - int n = generate(slst, word, pl, pln); - freelist(&pl, pln); - return uniqlist(*slst, n); +std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) { + return m_Impl->generate(word, pattern); +} + +std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) { + std::vector<std::string> pl = analyze(pattern); + std::vector<std::string> slst = generate(word, pl); + uniqlist(slst); + return slst; } // minimal XML parser functions -std::string Hunspell::get_xml_par(const char* par) { +std::string 
HunspellImpl::get_xml_par(const char* par) { std::string dest; if (!par) return dest; @@ -1639,7 +1661,7 @@ std::string Hunspell::get_xml_par(const char* par) { if (end == '>') end = '<'; else if (end != '\'' && end != '"') - return 0; // bad XML + return dest; // bad XML for (par++; *par != '\0' && *par != end; ++par) { dest.push_back(*par); } @@ -1649,29 +1671,54 @@ std::string Hunspell::get_xml_par(const char* par) { } int Hunspell::get_langnum() const { + return m_Impl->get_langnum(); +} + +int HunspellImpl::get_langnum() const { return langnum; } +bool Hunspell::input_conv(const std::string& word, std::string& dest) { + return m_Impl->input_conv(word, dest); +} + int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - return (rl && rl->conv(word, dest, destsize) > 0); + std::string d; + bool ret = input_conv(word, d); + if (ret && d.size() < destsize) { + strncpy(dest, d.c_str(), destsize); + return 1; + } + return 0; +} + +bool HunspellImpl::input_conv(const std::string& word, std::string& dest) { + RepList* rl = pAMgr ? 
pAMgr->get_iconvtable() : NULL; + if (rl) { + return rl->conv(word, dest); + } + dest.assign(word); + return false; } // return the beginning of the element (attr == NULL) or the attribute -const char* Hunspell::get_xml_pos(const char* s, const char* attr) { +const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) { const char* end = strchr(s, '>'); - const char* p = s; if (attr == NULL) return end; - do { + const char* p = s; + while (1) { p = strstr(p, attr); if (!p || p >= end) return 0; - } while (*(p - 1) != ' ' && *(p - 1) != '\n'); + if (*(p - 1) == ' ' || *(p - 1) == '\n') + break; + p += strlen(attr); + } return p + strlen(attr); } -int Hunspell::check_xml_par(const char* q, +int HunspellImpl::check_xml_par(const char* q, const char* attr, const char* value) { std::string cw = get_xml_par(get_xml_pos(q, attr)); @@ -1680,53 +1727,48 @@ int Hunspell::check_xml_par(const char* q, return 0; } -int Hunspell::get_xml_list(char*** slst, const char* list, const char* tag) { +std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) { + std::vector<std::string> slst; if (!list) - return 0; - int n = 0; - const char* p; - for (p = list; ((p = strstr(p, tag)) != NULL); p++) - n++; - if (n == 0) - return 0; - *slst = (char**)malloc(sizeof(char*) * n); - if (!*slst) - return 0; - for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) { + return slst; + const char* p = list; + for (size_t n = 0; ((p = strstr(p, tag)) != NULL); ++p, ++n) { std::string cw = get_xml_par(p + strlen(tag) - 1); if (cw.empty()) { break; } - (*slst)[n] = mystrdup(cw.c_str()); + slst.push_back(cw); } - return n; + return slst; } -int Hunspell::spellml(char*** slst, const char* word) { +std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { + std::vector<std::string> slst; + + const char* word = in_word.c_str(); + const char* q = strstr(word, "<query"); if (!q) - return 0; // bad XML input + return slst; // bad XML input 
const char* q2 = strchr(q, '>'); if (!q2) - return 0; // bad XML input + return slst; // bad XML input q2 = strstr(q2, "<word"); if (!q2) - return 0; // bad XML input + return slst; // bad XML input if (check_xml_par(q, "type=", "analyze")) { - int n = 0; std::string cw = get_xml_par(strchr(q2, '>')); if (!cw.empty()) - n = analyze(slst, cw.c_str()); - if (n == 0) - return 0; + slst = analyze(cw); + if (slst.empty()) + return slst; // convert the result to <code><a>ana1</a><a>ana2</a></code> format std::string r; r.append("<code>"); - for (int i = 0; i < n; i++) { + for (size_t i = 0; i < slst.size(); ++i) { r.append("<a>"); - std::string entry((*slst)[i]); - free((*slst)[i]); + std::string entry(slst[i]); mystrrep(entry, "\t", " "); mystrrep(entry, "&", "&amp;"); mystrrep(entry, "<", "&lt;"); @@ -1735,36 +1777,101 @@ int Hunspell::spellml(char*** slst, const char* word) { r.append("</a>"); } r.append("</code>"); - (*slst)[0] = mystrdup(r.c_str()); - return 1; + slst.clear(); + slst.push_back(r); + return slst; } else if (check_xml_par(q, "type=", "stem")) { std::string cw = get_xml_par(strchr(q2, '>')); if (!cw.empty()) - return stem(slst, cw.c_str()); + return stem(cw); } else if (check_xml_par(q, "type=", "generate")) { std::string cw = get_xml_par(strchr(q2, '>')); if (cw.empty()) - return 0; + return slst; const char* q3 = strstr(q2 + 1, "<word"); if (q3) { std::string cw2 = get_xml_par(strchr(q3, '>')); if (!cw2.empty()) { - return generate(slst, cw.c_str(), cw2.c_str()); + return generate(cw, cw2); } } else { if ((q2 = strstr(q2 + 1, "<code")) != NULL) { - char** slst2; - int n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"); - if (n != 0) { - int n2 = generate(slst, cw.c_str(), slst2, n); - freelist(&slst2, n); - return uniqlist(*slst, n2); + std::vector<std::string> slst2 = get_xml_list(strchr(q2, '>'), "<a>"); + if (!slst2.empty()) { + slst = generate(cw, slst2); + uniqlist(slst); + return slst; } - freelist(&slst2, n); } } } - return 0; + return slst; +} + +int 
Hunspell::spell(const char* word, int* info, char** root) { + std::string sroot; + bool ret = m_Impl->spell(word, info, root ? &sroot : NULL); + if (root) { + if (sroot.empty()) { + *root = NULL; + } else { + *root = mystrdup(sroot.c_str()); + } + } + return ret; +} + +namespace { + int munge_vector(char*** slst, const std::vector<std::string>& items) { + if (items.empty()) { + *slst = NULL; + return 0; + } else { + *slst = (char**)malloc(sizeof(char*) * items.size()); + if (!*slst) + return 0; + for (size_t i = 0; i < items.size(); ++i) + (*slst)[i] = mystrdup(items[i].c_str()); + } + return items.size(); + } +} + +void Hunspell::free_list(char*** slst, int n) { + Hunspell_free_list((Hunhandle*)(this), slst, n); +} + +int Hunspell::suggest(char*** slst, const char* word) { + return Hunspell_suggest((Hunhandle*)(this), slst, word); +} + +int Hunspell::suffix_suggest(char*** slst, const char* root_word) { + std::vector<std::string> stems = m_Impl->suffix_suggest(root_word); + return munge_vector(slst, stems); +} + +char* Hunspell::get_dic_encoding() { + return &(m_Impl->dic_encoding_vec[0]); +} + +int Hunspell::stem(char*** slst, char** desc, int n) { + return Hunspell_stem2((Hunhandle*)(this), slst, desc, n); +} + +int Hunspell::stem(char*** slst, const char* word) { + return Hunspell_stem((Hunhandle*)(this), slst, word); +} + +int Hunspell::analyze(char*** slst, const char* word) { + return Hunspell_analyze((Hunhandle*)(this), slst, word); +} + +int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { + return Hunspell_generate2((Hunhandle*)(this), slst, word, pl, pln); +} + +int Hunspell::generate(char*** slst, const char* word, const char* pattern) { + return Hunspell_generate((Hunhandle*)(this), slst, word, pattern); } Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { @@ -1774,46 +1881,56 @@ Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { Hunhandle* Hunspell_create_key(const char* affpath, const char* 
dpath, const char* key) { - return (Hunhandle*)(new Hunspell(affpath, dpath, key)); + return reinterpret_cast<Hunhandle*>(new Hunspell(affpath, dpath, key)); } void Hunspell_destroy(Hunhandle* pHunspell) { - delete (Hunspell*)(pHunspell); + delete reinterpret_cast<Hunspell*>(pHunspell); } int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { - return ((Hunspell*)pHunspell)->add_dic(dpath); + return reinterpret_cast<Hunspell*>(pHunspell)->add_dic(dpath); } int Hunspell_spell(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->spell(word); + return reinterpret_cast<Hunspell*>(pHunspell)->spell(std::string(word)); } char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) { - return ((Hunspell*)pHunspell)->get_dic_encoding(); + return reinterpret_cast<Hunspell*>(pHunspell)->get_dic_encoding(); } int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->suggest(slst, word); + std::vector<std::string> suggests = reinterpret_cast<Hunspell*>(pHunspell)->suggest(word); + return munge_vector(slst, suggests); } int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->analyze(slst, word); + std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->analyze(word); + return munge_vector(slst, stems); } int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->stem(slst, word); + + std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(word); + return munge_vector(slst, stems); } int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) { - return ((Hunspell*)pHunspell)->stem(slst, desc, n); + std::vector<std::string> morph; + for (int i = 0; i < n; ++i) + morph.push_back(desc[i]); + + std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(morph); + return munge_vector(slst, stems); } int Hunspell_generate(Hunhandle* 
pHunspell, char*** slst, const char* word, - const char* word2) { - return ((Hunspell*)pHunspell)->generate(slst, word, word2); + const char* pattern) { + std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, pattern); + return munge_vector(slst, stems); } int Hunspell_generate2(Hunhandle* pHunspell, @@ -1821,7 +1938,12 @@ int Hunspell_generate2(Hunhandle* pHunspell, const char* word, char** desc, int n) { - return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); + std::vector<std::string> morph; + for (int i = 0; i < n; ++i) + morph.push_back(desc[i]); + + std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, morph); + return munge_vector(slst, stems); } /* functions for run-time modification of the dictionary */ @@ -1829,7 +1951,7 @@ int Hunspell_generate2(Hunhandle* pHunspell, /* add word to the run-time dictionary */ int Hunspell_add(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->add(word); + return reinterpret_cast<Hunspell*>(pHunspell)->add(word); } /* add word to the run-time dictionary with affix flags of @@ -1840,25 +1962,35 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) { int Hunspell_add_with_affix(Hunhandle* pHunspell, const char* word, const char* example) { - return ((Hunspell*)pHunspell)->add_with_affix(word, example); + return reinterpret_cast<Hunspell*>(pHunspell)->add_with_affix(word, example); } /* remove word from the run-time dictionary */ int Hunspell_remove(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->remove(word); + return reinterpret_cast<Hunspell*>(pHunspell)->remove(word); } -void Hunspell_free_list(Hunhandle*, char*** slst, int n) { - freelist(slst, n); +void Hunspell_free_list(Hunhandle*, char*** list, int n) { + if (list && *list) { + for (int i = 0; i < n; i++) + free((*list)[i]); + free(*list); + *list = NULL; + } } -int Hunspell::suffix_suggest(char*** slst, const char* root_word) { 
+std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) { + return m_Impl->suffix_suggest(root_word); +} + +std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) { + std::vector<std::string> slst; struct hentry* he = NULL; int len; std::string w2; const char* word; - char* ignoredchars = pAMgr->get_ignore(); + const char* ignoredchars = pAMgr->get_ignore(); if (ignoredchars != NULL) { w2.assign(root_word); if (utf8) { @@ -1870,26 +2002,18 @@ int Hunspell::suffix_suggest(char*** slst, const char* root_word) { } word = w2.c_str(); } else - word = root_word; + word = root_word.c_str(); len = strlen(word); if (!len) - return 0; + return slst; - char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); - if (wlst == NULL) - return -1; - *slst = wlst; - for (int i = 0; i < MAXSUGGESTION; i++) { - wlst[i] = NULL; - } - - for (int i = 0; (i < maxdic) && !he; i++) { - he = (pHMgr[i])->lookup(word); + for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { + he = m_HMgrs[i]->lookup(word); } if (he) { - return pAMgr->get_suffix_words(he->astr, he->alen, root_word, *slst); + slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str()); } - return 0; + return slst; } diff --git a/libs/hunspell/src/hunspell.h b/libs/hunspell/src/hunspell.h index 726bbe2077..3aca30ab2f 100644 --- a/libs/hunspell/src/hunspell.h +++ b/libs/hunspell/src/hunspell.h @@ -38,8 +38,8 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef _MYSPELLMGR_H_ -#define _MYSPELLMGR_H_ +#ifndef MYSPELLMGR_H_ +#define MYSPELLMGR_H_ #include "hunvisapi.h" diff --git a/libs/hunspell/src/hunspell.hxx b/libs/hunspell/src/hunspell.hxx index 3bcf75e39c..a06bdd43ab 100644 --- a/libs/hunspell/src/hunspell.hxx +++ b/libs/hunspell/src/hunspell.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 
(the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -70,26 +67,33 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +#ifndef MYSPELLMGR_HXX_ +#define MYSPELLMGR_HXX_ #include "hunvisapi.h" - -#include "hashmgr.hxx" -#include "affixmgr.hxx" -#include "suggestmgr.hxx" -#include "langnum.hxx" +#include "w_char.hxx" +#include "atypes.hxx" +#include <string> #include <vector> #define SPELL_XML "<?xml?>" -#define MAXDIC 20 #define MAXSUGGESTION 15 #define MAXSHARPS 5 -#define HUNSPELL_OK (1 << 0) -#define HUNSPELL_OK_WARN (1 << 1) +#ifndef MAXWORDLEN +#define MAXWORDLEN 100 +#endif -#ifndef _MYSPELLMGR_HXX_ -#define _MYSPELLMGR_HXX_ +#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) +# define H_DEPRECATED __attribute__((__deprecated__)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1300) +# define H_DEPRECATED __declspec(deprecated) +#else +# define H_DEPRECATED +#endif + +class HunspellImpl; class LIBHUNSPELL_DLL_EXPORTED Hunspell { private: @@ -97,17 +101,7 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { Hunspell& operator=(const Hunspell&); private: - AffixMgr* pAMgr; - HashMgr* pHMgr[MAXDIC]; - int maxdic; - SuggestMgr* pSMgr; - char* affixpath; - char* encoding; - struct cs_info* csconv; - int langnum; - int utf8; - int complexprefixes; 
- char** wordbreak; + HunspellImpl* m_Impl; public: /* Hunspell(aff, dic) - constructor of Hunspell class @@ -125,7 +119,7 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { int add_dic(const char* dpath, const char* key = NULL); /* spell(word) - spellcheck word - * output: 0 = bad word, not 0 = good word + * output: false = bad word, true = good word * * plus output: * info: information bit array, fields: @@ -133,8 +127,8 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { * SPELL_FORBIDDEN = an explicit forbidden word * root: root (stem), when input is a word with affix(es) */ - - int spell(const char* word, int* info = NULL, char** root = NULL); + bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); + H_DEPRECATED int spell(const char* word, int* info = NULL, char** root = NULL); /* suggest(suggestions, word) - search suggestions * input: pointer to an array of strings pointer and the (bad) word @@ -143,8 +137,8 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) */ - - int suggest(char*** slst, const char* word); + std::vector<std::string> suggest(const std::string& word); + H_DEPRECATED int suggest(char*** slst, const char* word); /* Suggest words from suffix rules * suffix_suggest(suggestions, root_word) @@ -154,36 +148,37 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) 
*/ - int suffix_suggest(char*** slst, const char* root_word); + std::vector<std::string> suffix_suggest(const std::string& root_word); + H_DEPRECATED int suffix_suggest(char*** slst, const char* root_word); /* deallocate suggestion lists */ + H_DEPRECATED void free_list(char*** slst, int n); - void free_list(char*** slst, int n); - + const std::string& get_dict_encoding() const; char* get_dic_encoding(); /* morphological functions */ /* analyze(result, word) - morphological analysis of the word */ + std::vector<std::string> analyze(const std::string& word); + H_DEPRECATED int analyze(char*** slst, const char* word); - int analyze(char*** slst, const char* word); + /* stem(word) - stemmer function */ + std::vector<std::string> stem(const std::string& word); + H_DEPRECATED int stem(char*** slst, const char* word); - /* stem(result, word) - stemmer function */ - - int stem(char*** slst, const char* word); - - /* stem(result, analysis, n) - get stems from a morph. analysis + /* stem(analysis, n) - get stems from a morph. analysis * example: * char ** result, result2; * int n1 = analyze(&result, "words"); * int n2 = stem(&result2, result, n1); */ - - int stem(char*** slst, char** morph, int n); + std::vector<std::string> stem(const std::vector<std::string>& morph); + H_DEPRECATED int stem(char*** slst, char** morph, int n); /* generate(result, word, word2) - morphological generation by example(s) */ - - int generate(char*** slst, const char* word, const char* word2); + std::vector<std::string> generate(const std::string& word, const std::string& word2); + H_DEPRECATED int generate(char*** slst, const char* word, const char* word2); /* generate(result, word, desc, n) - generation by morph. 
description(s) * example: @@ -192,71 +187,43 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { * int n = generate(&result, "word", &affix, 1); * for (int i = 0; i < n; i++) printf("%s\n", result[i]); */ - - int generate(char*** slst, const char* word, char** desc, int n); + std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl); + H_DEPRECATED int generate(char*** slst, const char* word, char** desc, int n); /* functions for run-time modification of the dictionary */ /* add word to the run-time dictionary */ - int add(const char* word); + int add(const std::string& word); /* add word to the run-time dictionary with affix flags of * the example (a dictionary word): Hunspell will recognize * affixed forms of the new word, too. */ - int add_with_affix(const char* word, const char* example); + int add_with_affix(const std::string& word, const std::string& example); /* remove word from the run-time dictionary */ - int remove(const char* word); + int remove(const std::string& word); /* other */ /* get extra word characters definied in affix file for tokenization */ - const char* get_wordchars(); - const std::vector<w_char>& get_wordchars_utf16(); + const char* get_wordchars() const; + const std::string& get_wordchars_cpp() const; + const std::vector<w_char>& get_wordchars_utf16() const; struct cs_info* get_csconv(); - const char* get_version(); + + const char* get_version() const; + const std::string& get_version_cpp() const; int get_langnum() const; /* need for putdic */ - int input_conv(const char* word, char* dest, size_t destsize); - - inline char *get_try_string() - { - return pAMgr->get_try_string(); - } - - private: - void cleanword(std::string& dest, const char*, int* pcaptype, int* pabbrev); - size_t cleanword2(std::string& dest, - std::vector<w_char>& dest_u, - const char*, - int* w_len, - int* pcaptype, - size_t* pabbrev); - void mkinitcap(std::string& u8); - int mkinitcap2(std::string& u8, std::vector<w_char>& u16); - int 
mkinitsmall2(std::string& u8, std::vector<w_char>& u16); - void mkallcap(std::string& u8); - int mkallsmall2(std::string& u8, std::vector<w_char>& u16); - struct hentry* checkword(const char*, int* info, char** root); - std::string sharps_u8_l1(const std::string& source); - hentry* - spellsharps(std::string& base, size_t start_pos, int, int, int* info, char** root); - int is_keepcase(const hentry* rv); - int insert_sug(char*** slst, const char* word, int ns); - void cat_result(std::string& result, char* st); - char* stem_description(const char* desc); - int spellml(char*** slst, const char* word); - std::string get_xml_par(const char* par); - const char* get_xml_pos(const char* s, const char* attr); - int get_xml_list(char*** slst, const char* list, const char* tag); - int check_xml_par(const char* q, const char* attr, const char* value); + bool input_conv(const std::string& word, std::string& dest); + H_DEPRECATED int input_conv(const char* word, char* dest, size_t destsize); }; #endif diff --git a/libs/hunspell/src/hunspelldll.h b/libs/hunspell/src/hunspelldll.h deleted file mode 100644 index 32d168236a..0000000000 --- a/libs/hunspell/src/hunspelldll.h +++ /dev/null @@ -1,39 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * Copyright (C) 2006
- * Miha Vrhovnik (http://simail.sf.net, http://xcollect.sf.net)
- * All Rights Reserved.
- *
- * Contributor(s):
- *
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** **/
-#include "hunspell.hxx"
-
-#ifndef _DLL_H_
-#define _DLL_H_
-
-#endif /* _DLL_H_ */
diff --git a/libs/hunspell/src/hunvisapi.h b/libs/hunspell/src/hunvisapi.h index abf025ae97..eb2b348091 100644 --- a/libs/hunspell/src/hunvisapi.h +++ b/libs/hunspell/src/hunvisapi.h @@ -1,5 +1,5 @@ -#ifndef _HUNSPELL_VISIBILITY_H_ -#define _HUNSPELL_VISIBILITY_H_ +#ifndef HUNSPELL_VISIBILITY_H_ +#define HUNSPELL_VISIBILITY_H_ #if defined(HUNSPELL_STATIC) # define LIBHUNSPELL_DLL_EXPORTED @@ -9,7 +9,7 @@ # else # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllimport) # endif -#elif defined(BUILDING_LIBHUNSPELL) && @HAVE_VISIBILITY@ +#elif defined(BUILDING_LIBHUNSPELL) && 1 # define LIBHUNSPELL_DLL_EXPORTED __attribute__((__visibility__("default"))) #else # define LIBHUNSPELL_DLL_EXPORTED diff --git a/libs/hunspell/src/hunzip.c++ b/libs/hunspell/src/hunzip.cxx index b2788a1055..8962b100b1 100644 --- a/libs/hunspell/src/hunzip.c++ +++ b/libs/hunspell/src/hunzip.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -59,7 +56,7 @@ int Hunzip::fail(const char* err, const char* par) { } Hunzip::Hunzip(const char* file, const char* key) - : fin(NULL), bufsiz(0), lastbit(0), inc(0), inbits(0), outc(0), dec(NULL) { + : bufsiz(0), lastbit(0), inc(0), inbits(0), outc(0) { in[0] = out[0] = line[0] = '\0'; filename = mystrdup(file); if (getcode(key) == -1) @@ -70,19 +67,19 @@ Hunzip::Hunzip(const char* file, const char* key) int Hunzip::getcode(const char* key) { unsigned char c[2]; - int i, j, n, p; + int i, j, n; int allocatedbit = BASEBITREC; const char* enc = key; if (!filename) return -1; - fin = myfopen(filename, "rb"); - if (!fin) + myopen(fin, filename, std::ios_base::in | std::ios_base::binary); + if (!fin.is_open()) return -1; // read magic number - if ((fread(in, 1, 3, fin) < MAGICLEN) || + if (!fin.read(in, 3) || !(strncmp(MAGIC, in, MAGICLEN) == 0 || strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { return fail(MSG_FORMAT, filename); @@ -93,7 +90,7 @@ int Hunzip::getcode(const char* key) { unsigned char cs; if (!key) return fail(MSG_KEY, filename); - if (fread(&c, 1, 1, fin) < 1) + if (!fin.read(reinterpret_cast<char*>(c), 1)) return fail(MSG_FORMAT, filename); for (cs = 0; *enc; enc++) cs ^= *enc; @@ -104,7 +101,7 @@ int Hunzip::getcode(const char* key) { key = NULL; // read record count - if (fread(&c, 1, 2, fin) < 2) + if (!fin.read(reinterpret_cast<char*>(c), 2)) return fail(MSG_FORMAT, filename); if (key) { @@ -115,16 +112,14 @@ int Hunzip::getcode(const char* key) { } n = ((int)c[0] << 8) + c[1]; - dec = (struct bit*)malloc(BASEBITREC * sizeof(struct bit)); - if (!dec) - return fail(MSG_MEMORY, filename); + dec.resize(BASEBITREC); dec[0].v[0] = 0; dec[0].v[1] = 0; // read codes for (i = 0; i < n; i++) { unsigned char l; - if (fread(c, 1, 2, fin) < 2) + if (!fin.read(reinterpret_cast<char*>(c), 2)) return fail(MSG_FORMAT, filename); if (key) { 
if (*(++enc) == '\0') @@ -134,14 +129,14 @@ int Hunzip::getcode(const char* key) { enc = key; c[1] ^= *enc; } - if (fread(&l, 1, 1, fin) < 1) + if (!fin.read(reinterpret_cast<char*>(&l), 1)) return fail(MSG_FORMAT, filename); if (key) { if (*(++enc) == '\0') enc = key; l ^= *enc; } - if (fread(in, 1, l / 8 + 1, fin) < (size_t)l / 8 + 1) + if (!fin.read(in, l / 8 + 1)) return fail(MSG_FORMAT, filename); if (key) for (j = 0; j <= l / 8; j++) { @@ -149,7 +144,7 @@ int Hunzip::getcode(const char* key) { enc = key; in[j] ^= *enc; } - p = 0; + int p = 0; for (j = 0; j < l; j++) { int b = (in[j / 8] & (1 << (7 - (j % 8)))) ? 1 : 0; int oldp = p; @@ -158,7 +153,7 @@ int Hunzip::getcode(const char* key) { lastbit++; if (lastbit == allocatedbit) { allocatedbit += BASEBITREC; - dec = (struct bit*)realloc(dec, allocatedbit * sizeof(struct bit)); + dec.resize(allocatedbit); } dec[lastbit].v[0] = 0; dec[lastbit].v[1] = 0; @@ -173,10 +168,6 @@ int Hunzip::getcode(const char* key) { } Hunzip::~Hunzip() { - if (dec) - free(dec); - if (fin) - fclose(fin); if (filename) free(filename); } @@ -185,16 +176,17 @@ int Hunzip::getbuf() { int p = 0; int o = 0; do { - if (inc == 0) - inbits = fread(in, 1, BUFSIZE, fin) * 8; + if (inc == 0) { + fin.read(in, BUFSIZE); + inbits = fin.gcount() * 8; + } for (; inc < inbits; inc++) { int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 
1 : 0; int oldp = p; p = dec[p].v[b]; if (p == 0) { if (oldp == lastbit) { - fclose(fin); - fin = NULL; + fin.close(); // add last odd byte if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1]; @@ -212,11 +204,11 @@ int Hunzip::getbuf() { return fail(MSG_FORMAT, filename); } -const char* Hunzip::getline() { +bool Hunzip::getline(std::string& dest) { char linebuf[BUFSIZE]; int l = 0, eol = 0, left = 0, right = 0; if (bufsiz == -1) - return NULL; + return false; while (l < bufsiz && !eol) { linebuf[l++] = out[outc]; switch (out[outc]) { @@ -251,7 +243,7 @@ const char* Hunzip::getline() { } if (++outc == bufsiz) { outc = 0; - bufsiz = fin ? getbuf() : -1; + bufsiz = fin.is_open() ? getbuf() : -1; } } if (right) @@ -259,5 +251,6 @@ const char* Hunzip::getline() { else linebuf[l] = '\0'; strcpy(line + left, linebuf); - return line; + dest.assign(line); + return true; } diff --git a/libs/hunspell/src/hunzip.hxx b/libs/hunspell/src/hunzip.hxx index 5082adddb0..ea2bc58d26 100644 --- a/libs/hunspell/src/hunzip.hxx +++ b/libs/hunspell/src/hunzip.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -41,12 +38,14 @@ /* hunzip: file decompression for sorted dictionaries with optional encryption, * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ -#ifndef _HUNZIP_HXX_ -#define _HUNZIP_HXX_ +#ifndef HUNZIP_HXX_ +#define HUNZIP_HXX_ #include "hunvisapi.h" #include <stdio.h> +#include <fstream> +#include <vector> #define BUFSIZE 65536 #define HZIP_EXTENSION ".hz" @@ -68,9 +67,9 @@ class LIBHUNSPELL_DLL_EXPORTED Hunzip { protected: char* filename; - FILE* fin; + std::ifstream fin; int bufsiz, lastbit, inc, inbits, outc; - struct bit* dec; // code table + std::vector<bit> dec; // code table char in[BUFSIZE]; // input buffer char out[BUFSIZE + 1]; // Huffman-decoded buffer char line[BUFSIZE + 50]; // decoded line @@ -81,7 +80,8 @@ class LIBHUNSPELL_DLL_EXPORTED Hunzip { public: Hunzip(const char* filename, const char* key = NULL); ~Hunzip(); - const char* getline(); + bool is_open() { return fin.is_open(); } + bool getline(std::string& dest); }; #endif diff --git a/libs/hunspell/src/langnum.hxx b/libs/hunspell/src/langnum.hxx index af5c86e4fe..a64d3d7869 100644 --- a/libs/hunspell/src/langnum.hxx +++ b/libs/hunspell/src/langnum.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). 
- * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -38,12 +35,12 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef _LANGNUM_HXX_ -#define _LANGNUM_HXX_ +#ifndef LANGNUM_HXX_ +#define LANGNUM_HXX_ /* language numbers for language specific codes - see http://l10n.openoffice.org/languages.html + see https://wiki.openoffice.org/w/index.php?title=Languages&oldid=230199 */ enum { diff --git a/libs/hunspell/src/phonet.c++ b/libs/hunspell/src/phonet.cxx index 17350e74a7..69601a2872 100644 --- a/libs/hunspell/src/phonet.c++ +++ b/libs/hunspell/src/phonet.cxx @@ -36,15 +36,13 @@ #include "phonet.hxx" void init_phonet_hash(phonetable& parms) { - int i, k; - - for (i = 0; i < HASHSIZE; i++) { + for (int i = 0; i < HASHSIZE; i++) { parms.hash[i] = -1; } - for (i = 0; parms.rules[i][0] != '\0'; i += 2) { + for (int i = 0; parms.rules[i][0] != '\0'; i += 2) { /** set hash value **/ - k = (unsigned char)parms.rules[i][0]; + int k = (unsigned char)parms.rules[i][0]; if (parms.hash[k] < 0) { parms.hash[k] = i; @@ -73,9 +71,8 @@ static int myisalpha(char ch) { std::string phonet(const std::string& inword, phonetable& parms) { int i, k = 0, p, z; - int k0, n0, p0 = -333, z0; + int k0, n0, p0 = -333; char c; - const char* s; typedef unsigned char uchar; size_t len = inword.size(); @@ -90,15 +87,15 @@ std::string phonet(const std::string& inword, phonetable& parms) { i = z = 0; while ((c = word[i]) != '\0') { int n = parms.hash[(uchar)c]; - z0 = 0; + int z0 = 0; - if (n >= 0) { + if (n >= 0 && !parms.rules[n].empty()) { /** check all rules for the same letter **/ while (parms.rules[n][0] == c) { /** check whole string **/ k = 1; /** number of found letters **/ p = 5; /** default priority **/ - s = 
parms.rules[n]; + const char*s = parms.rules[n].c_str(); s++; /** important for (see below) "*(s-1)" **/ while (*s != '\0' && word[i + k] == *s && !isdigit((unsigned char)*s) && @@ -142,13 +139,13 @@ std::string phonet(const std::string& inword, phonetable& parms) { n0 = parms.hash[(uchar)c0]; // if (parms.followup && k > 1 && n0 >= 0 - if (k > 1 && n0 >= 0 && p0 != (int)'-' && word[i + k] != '\0') { + if (k > 1 && n0 >= 0 && p0 != (int)'-' && word[i + k] != '\0' && !parms.rules[n0].empty()) { /** test follow-up rule for "word[i+k]" **/ while (parms.rules[n0][0] == c0) { /** check whole string **/ k0 = k; p0 = 5; - s = parms.rules[n0]; + s = parms.rules[n0].c_str(); s++; while (*s != '\0' && word[i + k0] == *s && !isdigit((unsigned char)*s) && @@ -206,9 +203,9 @@ std::string phonet(const std::string& inword, phonetable& parms) { } /** end of follow-up stuff **/ /** replace string **/ - s = parms.rules[n + 1]; - p0 = (parms.rules[n][0] != '\0' && - strchr(parms.rules[n] + 1, '<') != NULL) + s = parms.rules[n + 1].c_str(); + p0 = (!parms.rules[n].empty() && + strchr(parms.rules[n].c_str() + 1, '<') != NULL) ? 
1 : 0; if (p0 == 1 && z == 0) { @@ -241,8 +238,8 @@ std::string phonet(const std::string& inword, phonetable& parms) { } /** new "actual letter" **/ c = *s; - if (parms.rules[n][0] != '\0' && - strstr(parms.rules[n] + 1, "^^") != NULL) { + if (!parms.rules[n].empty() && + strstr(parms.rules[n].c_str() + 1, "^^") != NULL) { if (c != '\0') { target.push_back(c); } @@ -257,8 +254,7 @@ std::string phonet(const std::string& inword, phonetable& parms) { } /** end of while (parms.rules[n][0] == c) **/ } /** end of if (n >= 0) **/ if (z0 == 0) { - if (k && !p0 && target.size() < len && c != '\0' && - (1 || target.empty() || target[target.size()-1] != c)) { + if (k && !p0 && target.size() < len && c != '\0') { /** condense only double letters **/ target.push_back(c); /// printf("\n setting \n"); diff --git a/libs/hunspell/src/phonet.hxx b/libs/hunspell/src/phonet.hxx index eb9fd0c628..2d58b3ba1b 100644 --- a/libs/hunspell/src/phonet.hxx +++ b/libs/hunspell/src/phonet.hxx @@ -27,8 +27,8 @@ Porting from Aspell to Hunspell using C-like structs */ -#ifndef __PHONETHXX__ -#define __PHONETHXX__ +#ifndef PHONET_HXX_ +#define PHONET_HXX_ #define HASHSIZE 256 #define MAXPHONETLEN 256 @@ -38,9 +38,7 @@ struct phonetable { char utf8; - cs_info* lang; - int num; - char** rules; + std::vector<std::string> rules; int hash[HASHSIZE]; }; diff --git a/libs/hunspell/src/replist.c++ b/libs/hunspell/src/replist.cxx index b3e6b37d20..cabe382bfd 100644 --- a/libs/hunspell/src/replist.c++ +++ b/libs/hunspell/src/replist.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. 
* - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -90,104 +87,110 @@ RepList::RepList(int n) { RepList::~RepList() { for (int i = 0; i < pos; i++) { - free(dat[i]->pattern); - free(dat[i]->pattern2); - free(dat[i]); + delete dat[i]; } free(dat); } -int RepList::get_pos() { - return pos; -} - replentry* RepList::item(int n) { return dat[n]; } -int RepList::near(const char* word) { +int RepList::find(const char* word) { int p1 = 0; - int p2 = pos; - while ((p2 - p1) > 1) { - int m = (p1 + p2) / 2; - int c = strcmp(word, dat[m]->pattern); - if (c <= 0) { - if (c < 0) - p2 = m; - else - p1 = p2 = m; - } else - p1 = m; + int p2 = pos - 1; + int ret = -1; + while (p1 <= p2) { + int m = ((unsigned)p1 + (unsigned)p2) >> 1; + int c = strncmp(word, dat[m]->pattern.c_str(), dat[m]->pattern.size()); + if (c < 0) + p2 = m - 1; + else if (c > 0) + p1 = m + 1; + else { // scan in the right half for a longer match + ret = m; + p1 = m + 1; + } } - return p1; + return ret; } -int RepList::match(const char* word, int n) { - if (strncmp(word, dat[n]->pattern, strlen(dat[n]->pattern)) == 0) - return strlen(dat[n]->pattern); - return 0; +std::string RepList::replace(const char* word, int ind, bool atstart) { + int type = atstart ? 1 : 0; + if (ind < 0) + return std::string(); + if (strlen(word) == dat[ind]->pattern.size()) + type = atstart ? 3 : 2; + while (type && dat[ind]->outstrings[type].empty()) + type = (type == 2 && !atstart) ? 
0 : type - 1; + return dat[ind]->outstrings[type]; } -int RepList::add(char* pat1, char* pat2) { - if (pos >= size || pat1 == NULL || pat2 == NULL) +int RepList::add(const std::string& in_pat1, const std::string& pat2) { + if (pos >= size || in_pat1.empty() || pat2.empty()) { return 1; - replentry* r = (replentry*)malloc(sizeof(replentry)); + } + // analyse word context + int type = 0; + std::string pat1(in_pat1); + if (pat1[0] == '_') { + pat1.erase(0, 1); + type = 1; + } + if (!pat1.empty() && pat1[pat1.size() - 1] == '_') { + type = type + 2; + pat1.erase(pat1.size() - 1); + } + mystrrep(pat1, "_", " "); + + // find existing entry + int m = find(pat1.c_str()); + if (m >= 0 && dat[m]->pattern == pat1) { + // since already used + dat[m]->outstrings[type] = pat2; + mystrrep(dat[m]->outstrings[type], "_", " "); + return 0; + } + + // make a new entry if none exists + replentry* r = new replentry; if (r == NULL) return 1; - r->pattern = mystrrep(pat1, "_", " "); - r->pattern2 = mystrrep(pat2, "_", " "); - r->start = false; - r->end = false; + r->pattern = pat1; + r->outstrings[type] = pat2; + mystrrep(r->outstrings[type], "_", " "); dat[pos++] = r; - for (int i = pos - 1; i > 0; i--) { - r = dat[i]; - if (strcmp(r->pattern, dat[i - 1]->pattern) < 0) { + // sort to the right place in the list + int i; + for (i = pos - 1; i > 0; i--) { + if (strcmp(r->pattern.c_str(), dat[i - 1]->pattern.c_str()) < 0) { dat[i] = dat[i - 1]; - dat[i - 1] = r; } else break; } + dat[i] = r; return 0; } -int RepList::conv(const char* word, char* dest, size_t destsize) { - size_t stl = 0; - int change = 0; - for (size_t i = 0; i < strlen(word); i++) { - int n = near(word + i); - int l = match(word + i, n); - if (l) { - size_t replen = strlen(dat[n]->pattern2); - if (stl + replen >= destsize) - return -1; - strcpy(dest + stl, dat[n]->pattern2); - stl += replen; - i += l - 1; - change = 1; - } else { - if (stl + 1 >= destsize) - return -1; - dest[stl++] = word[i]; - } - } - dest[stl] = '\0'; 
- return change; -} - -bool RepList::conv(const char* word, std::string& dest) { +bool RepList::conv(const std::string& in_word, std::string& dest) { dest.clear(); + size_t wordlen = in_word.size(); + const char* word = in_word.c_str(); + bool change = false; - for (size_t i = 0; i < strlen(word); i++) { - int n = near(word + i); - int l = match(word + i, n); - if (l) { - dest.append(dat[n]->pattern2); - i += l - 1; + for (size_t i = 0; i < wordlen; ++i) { + int n = find(word + i); + std::string l = replace(word + i, n, i == 0); + if (!l.empty()) { + dest.append(l); + i += dat[n]->pattern.size() - 1; change = true; } else { dest.push_back(word[i]); } } + return change; } + diff --git a/libs/hunspell/src/replist.hxx b/libs/hunspell/src/replist.hxx index 59366e9e02..1e3efa4131 100644 --- a/libs/hunspell/src/replist.hxx +++ b/libs/hunspell/src/replist.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -72,17 +69,15 @@ */ /* string replacement list class */ -#ifndef _REPLIST_HXX_ -#define _REPLIST_HXX_ - -#include "hunvisapi.h" +#ifndef REPLIST_HXX_ +#define REPLIST_HXX_ #include "w_char.hxx" #include <string> #include <vector> -class LIBHUNSPELL_DLL_EXPORTED RepList { +class RepList { private: RepList(const RepList&); RepList& operator=(const RepList&); @@ -93,16 +88,13 @@ class LIBHUNSPELL_DLL_EXPORTED RepList { int pos; public: - RepList(int n); + explicit RepList(int n); ~RepList(); - int get_pos(); - int add(char* pat1, char* pat2); + int add(const std::string& pat1, const std::string& pat2); replentry* item(int n); -#undef near - int near(const char* word); - int match(const char* word, int n); - int conv(const char* word, char* dest, size_t destsize); - bool conv(const char* word, std::string& dest); + int find(const char* word); + std::string replace(const char* word, int n, bool atstart); + bool conv(const std::string& word, std::string& dest); }; #endif diff --git a/libs/hunspell/src/resource.h b/libs/hunspell/src/resource.h deleted file mode 100644 index e1df211357..0000000000 --- a/libs/hunspell/src/resource.h +++ /dev/null @@ -1,14 +0,0 @@ -//{{NO_DEPENDENCIES}}
-// Microsoft Visual C++ generated include file.
-// Used by hunspell.rc
-
-// Следующие стандартные значения для новых объектов
-//
-#ifdef APSTUDIO_INVOKED
-#ifndef APSTUDIO_READONLY_SYMBOLS
-#define _APS_NEXT_RESOURCE_VALUE 101
-#define _APS_NEXT_COMMAND_VALUE 40001
-#define _APS_NEXT_CONTROL_VALUE 1001
-#define _APS_NEXT_SYMED_VALUE 101
-#endif
-#endif
diff --git a/libs/hunspell/src/suggestmgr.c++ b/libs/hunspell/src/suggestmgr.cxx index 17becd7582..73ea91e3a3 100644 --- a/libs/hunspell/src/suggestmgr.c++ +++ b/libs/hunspell/src/suggestmgr.cxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 
* * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -82,7 +79,7 @@ const w_char W_VLINE = {'\0', '|'}; -SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) { +SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) { // register affix manager and check in string of chars to // try when building candidate suggestions pAMgr = aptr; @@ -91,11 +88,9 @@ SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) { ckeyl = 0; ckey = NULL; - ckey_utf = NULL; ctryl = 0; ctry = NULL; - ctry_utf = NULL; utf8 = 0; langnum = 0; @@ -116,22 +111,14 @@ SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) { if (pAMgr->get_maxcpdsugs() >= 0) maxcpdsugs = pAMgr->get_maxcpdsugs(); if (!utf8) { - char* enc = pAMgr->get_encoding(); - csconv = get_current_cs(enc); - free(enc); + csconv = get_current_cs(pAMgr->get_encoding()); } complexprefixes = pAMgr->get_complexprefixes(); } if (ckey) { if (utf8) { - std::vector<w_char> t; - ckeyl = u8_u16(t, ckey); - ckey_utf = (w_char*)malloc(ckeyl * sizeof(w_char)); - if (ckey_utf) - memcpy(ckey_utf, &t[0], ckeyl * sizeof(w_char)); - else - ckeyl = 0; + ckeyl = u8_u16(ckey_utf, ckey); } else { ckeyl = strlen(ckey); } @@ -142,13 +129,7 @@ SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) { if (ctry) ctryl = strlen(ctry); if (ctry && utf8) { - std::vector<w_char> t; - ctryl = u8_u16(t, tryme); - ctry_utf = (w_char*)malloc(ctryl * sizeof(w_char)); - if (ctry_utf) - memcpy(ctry_utf, &t[0], ctryl * sizeof(w_char)); - else - ctryl = 0; + ctryl = u8_u16(ctry_utf, tryme); } } } @@ -158,16 +139,10 @@ SuggestMgr::~SuggestMgr() { if (ckey) free(ckey); ckey = NULL; - if (ckey_utf) - free(ckey_utf); - ckey_utf = NULL; ckeyl = 0; if (ctry) free(ctry); ctry = NULL; - if (ctry_utf) - free(ctry_utf); - ctry_utf = NULL; ctryl = 0; maxSug = 0; #ifdef MOZILLA_CLIENT @@ -175,50 +150,38 @@ SuggestMgr::~SuggestMgr() { 
#endif } -int SuggestMgr::testsug(char** wlst, - const char* candidate, - int wl, - int ns, +void SuggestMgr::testsug(std::vector<std::string>& wlst, + const std::string& candidate, int cpdsuggest, int* timer, clock_t* timelimit) { int cwrd = 1; - if (ns == maxSug) - return maxSug; - for (int k = 0; k < ns; k++) { - if (strcmp(candidate, wlst[k]) == 0) { + if (wlst.size() == maxSug) + return; + for (size_t k = 0; k < wlst.size(); ++k) { + if (wlst[k] == candidate) { cwrd = 0; break; } } - if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) { - wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) { - for (int j = 0; j < ns; j++) - free(wlst[j]); - return -1; - } - ns++; + if ((cwrd) && checkword(candidate, cpdsuggest, timer, timelimit)) { + wlst.push_back(candidate); } - return ns; } // generate suggestions for a misspelled word // pass in address of array of char * pointers // onlycompoundsug: probably bad suggestions (need for ngram sugs, too) - -int SuggestMgr::suggest(char*** slst, +void SuggestMgr::suggest(std::vector<std::string>& slst, const char* w, - int nsug, int* onlycompoundsug) { int nocompoundtwowords = 0; - char** wlst; std::vector<w_char> word_utf; int wl = 0; - int nsugorig = nsug; + size_t nsugorig = slst.size(); std::string w2; const char* word = w; - int oldSug = 0; + size_t oldSug = 0; // word reversing wrapper for complex prefixes if (complexprefixes) { @@ -230,22 +193,10 @@ int SuggestMgr::suggest(char*** slst, word = w2.c_str(); } - if (*slst) { - wlst = *slst; - } else { - wlst = (char**)malloc(maxSug * sizeof(char*)); - if (wlst == NULL) - return -1; - for (int i = 0; i < maxSug; i++) { - wlst[i] = NULL; - } - } - if (utf8) { wl = u8_u16(word_utf, word); if (wl == -1) { - *slst = wlst; - return nsug; + return; } } @@ -253,139 +204,131 @@ int SuggestMgr::suggest(char*** slst, cpdsuggest++) { // limit compound suggestion if (cpdsuggest > 0) - oldSug = nsug; + oldSug = slst.size(); // suggestions for an uppercase word 
(html -> HTML) - if ((nsug < maxSug) && (nsug > -1)) { - nsug = (utf8) ? capchars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : capchars(wlst, word, nsug, cpdsuggest); + if (slst.size() < maxSug) { + if (utf8) + capchars_utf(slst, &word_utf[0], wl, cpdsuggest); + else + capchars(slst, word, cpdsuggest); } // perhaps we made a typical fault of spelling - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = replchars(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + replchars(slst, word, cpdsuggest); } // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = mapchars(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && + (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + mapchars(slst, word, cpdsuggest); } // only suggest compound words when no other suggestion - if ((cpdsuggest == 0) && (nsug > nsugorig)) + if ((cpdsuggest == 0) && (slst.size() > nsugorig)) nocompoundtwowords = 1; // did we swap the order of chars by mistake - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? swapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : swapchar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + swapchar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + swapchar(slst, word, cpdsuggest); } // did we swap the order of non adjacent chars by mistake - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? 
longswapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : longswapchar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + longswapchar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + longswapchar(slst, word, cpdsuggest); } // did we just hit the wrong key in place of a good char (case and keyboard) - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? badcharkey_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : badcharkey(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + badcharkey_utf(slst, &word_utf[0], wl, cpdsuggest); + else + badcharkey(slst, word, cpdsuggest); } // did we add a char that should not be there - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? extrachar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : extrachar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + extrachar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + extrachar(slst, word, cpdsuggest); } // did we forgot a char - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? forgotchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : forgotchar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + forgotchar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + forgotchar(slst, word, cpdsuggest); } // did we move a char - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? 
movechar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : movechar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + movechar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + movechar(slst, word, cpdsuggest); } // did we just hit the wrong key in place of a good char - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? badchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : badchar(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + badchar_utf(slst, &word_utf[0], wl, cpdsuggest); + else + badchar(slst, word, cpdsuggest); } // did we double two characters - if ((nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = (utf8) ? doubletwochars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest) - : doubletwochars(wlst, word, nsug, cpdsuggest); + if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + if (utf8) + doubletwochars_utf(slst, &word_utf[0], wl, cpdsuggest); + else + doubletwochars(slst, word, cpdsuggest); } // perhaps we forgot to hit space and two words ran together - if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && - (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { - nsug = twowords(wlst, word, nsug, cpdsuggest); + if (!nosplitsugs && (slst.size() < maxSug) && + (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + twowords(slst, word, cpdsuggest); } } // repeating ``for'' statement compounding support - if (nsug < 0) { - // we ran out of memory - we should free up as much as possible - for (int i = 0; i < maxSug; i++) - if (wlst[i] != NULL) - free(wlst[i]); - free(wlst); - wlst = NULL; - } - - if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) + if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug) *onlycompoundsug = 1; - - *slst = wlst; - return 
nsug; } // suggestions for an uppercase word (html -> HTML) -int SuggestMgr::capchars_utf(char** wlst, - const w_char* word, - int wl, - int ns, - int cpdsuggest) { +void SuggestMgr::capchars_utf(std::vector<std::string>& wlst, + const w_char* word, + int wl, + int cpdsuggest) { std::vector<w_char> candidate_utf(word, word + wl); mkallcap_utf(candidate_utf, langnum); std::string candidate; u16_u8(candidate, candidate_utf); - return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } // suggestions for an uppercase word (html -> HTML) -int SuggestMgr::capchars(char** wlst, - const char* word, - int ns, - int cpdsuggest) { +void SuggestMgr::capchars(std::vector<std::string>& wlst, + const char* word, + int cpdsuggest) { std::string candidate(word); mkallcap(candidate, csconv); - return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } // suggestions for when chose the wrong char out of a related set -int SuggestMgr::mapchars(char** wlst, +int SuggestMgr::mapchars(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate; clock_t timelimit; @@ -393,120 +336,108 @@ int SuggestMgr::mapchars(char** wlst, int wl = strlen(word); if (wl < 2 || !pAMgr) - return ns; + return wlst.size(); - int nummap = pAMgr->get_nummap(); - struct mapentry* maptable = pAMgr->get_maptable(); - if (maptable == NULL) - return ns; + const std::vector<mapentry>& maptable = pAMgr->get_maptable(); + if (maptable.empty()) + return wlst.size(); timelimit = clock(); timer = MINTIMER; - return map_related(word, candidate, 0, wlst, cpdsuggest, ns, - maptable, nummap, &timer, &timelimit); + return map_related(word, candidate, 0, wlst, cpdsuggest, + maptable, &timer, &timelimit); } int SuggestMgr::map_related(const char* word, std::string& candidate, int wn, - char** wlst, + 
std::vector<std::string>& wlst, int cpdsuggest, - int ns, - const mapentry* maptable, - int nummap, + const std::vector<mapentry>& maptable, int* timer, clock_t* timelimit) { if (*(word + wn) == '\0') { int cwrd = 1; - for (int m = 0; m < ns; m++) { - if (candidate == wlst[m]) { + for (size_t m = 0; m < wlst.size(); ++m) { + if (wlst[m] == candidate) { cwrd = 0; break; } } - if ((cwrd) && checkword(candidate.c_str(), candidate.size(), cpdsuggest, timer, timelimit)) { - if (ns < maxSug) { - wlst[ns] = mystrdup(candidate.c_str()); - if (wlst[ns] == NULL) - return -1; - ns++; + if ((cwrd) && checkword(candidate, cpdsuggest, timer, timelimit)) { + if (wlst.size() < maxSug) { + wlst.push_back(candidate); } } - return ns; + return wlst.size(); } int in_map = 0; - for (int j = 0; j < nummap; j++) { - for (int k = 0; k < maptable[j].len; k++) { - int len = strlen(maptable[j].set[k]); - if (strncmp(maptable[j].set[k], word + wn, len) == 0) { + for (size_t j = 0; j < maptable.size(); ++j) { + for (size_t k = 0; k < maptable[j].size(); ++k) { + size_t len = maptable[j][k].size(); + if (strncmp(maptable[j][k].c_str(), word + wn, len) == 0) { in_map = 1; size_t cn = candidate.size(); - for (int l = 0; l < maptable[j].len; l++) { + for (size_t l = 0; l < maptable[j].size(); ++l) { candidate.resize(cn); - candidate.append(maptable[j].set[l]); - ns = map_related(word, candidate, wn + len, wlst, - cpdsuggest, ns, maptable, nummap, timer, timelimit); + candidate.append(maptable[j][l]); + map_related(word, candidate, wn + len, wlst, + cpdsuggest, maptable, timer, timelimit); if (!(*timer)) - return ns; + return wlst.size(); } } } } if (!in_map) { candidate.push_back(*(word + wn)); - ns = map_related(word, candidate, wn + 1, wlst, cpdsuggest, ns, - maptable, nummap, timer, timelimit); + map_related(word, candidate, wn + 1, wlst, cpdsuggest, + maptable, timer, timelimit); } - return ns; + return wlst.size(); } // suggestions for a typical fault of spelling, that // differs with more, 
than 1 letter from the right form. -int SuggestMgr::replchars(char** wlst, +int SuggestMgr::replchars(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate; int wl = strlen(word); if (wl < 2 || !pAMgr) - return ns; - int numrep = pAMgr->get_numrep(); - struct replentry* reptable = pAMgr->get_reptable(); - if (reptable == NULL) - return ns; - for (int i = 0; i < numrep; i++) { + return wlst.size(); + const std::vector<replentry>& reptable = pAMgr->get_reptable(); + for (size_t i = 0; i < reptable.size(); ++i) { const char* r = word; // search every occurence of the pattern in the word - while ((r = strstr(r, reptable[i].pattern)) != NULL && - (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) && - (!reptable[i].start || r == word)) { + while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) { + int type = (r == word) ? 1 : 0; + if (r - word + reptable[i].pattern.size() == strlen(word)) + type += 2; + while (type && reptable[i].outstrings[type].empty()) + type = (type == 2 && r != word) ? 
0 : type - 1; + const std::string&out = reptable[i].outstrings[type]; + if (out.empty()) { + ++r; + continue; + } candidate.assign(word); candidate.resize(r - word); - candidate.append(reptable[i].pattern2); - int lenp = strlen(reptable[i].pattern); - candidate.append(r + lenp); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + candidate.append(reptable[i].outstrings[type]); + candidate.append(r + reptable[i].pattern.size()); + testsug(wlst, candidate, cpdsuggest, NULL, NULL); // check REP suggestions with space size_t sp = candidate.find(' '); if (sp != std::string::npos) { size_t prev = 0; while (sp != std::string::npos) { std::string prev_chunk = candidate.substr(prev, sp - prev); - if (checkword(prev_chunk.c_str(), prev_chunk.size(), 0, NULL, NULL)) { - int oldns = ns; + if (checkword(prev_chunk, 0, NULL, NULL)) { + size_t oldns = wlst.size(); std::string post_chunk = candidate.substr(sp + 1); - ns = testsug(wlst, post_chunk.c_str(), post_chunk.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; - if (oldns < ns) { - free(wlst[ns - 1]); - wlst[ns - 1] = mystrdup(candidate.c_str()); - if (!wlst[ns - 1]) - return -1; + testsug(wlst, post_chunk, cpdsuggest, NULL, NULL); + if (oldns < wlst.size()) { + wlst[wlst.size() - 1] = candidate; } } prev = sp + 1; @@ -516,47 +447,43 @@ int SuggestMgr::replchars(char** wlst, r++; // search for the next letter } } - return ns; + return wlst.size(); } // perhaps we doubled two characters (pattern aba -> ababa, for example vacation // -> vacacation) -int SuggestMgr::doubletwochars(char** wlst, +int SuggestMgr::doubletwochars(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { int state = 0; int wl = strlen(word); if (wl < 5 || !pAMgr) - return ns; + return wlst.size(); for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; if (state == 3) { std::string candidate(word, word + i - 1); 
candidate.insert(candidate.end(), word + i + 1, word + wl); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); state = 0; } } else { state = 0; } } - return ns; + return wlst.size(); } // perhaps we doubled two characters (pattern aba -> ababa, for example vacation // -> vacacation) -int SuggestMgr::doubletwochars_utf(char** wlst, +int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { int state = 0; if (wl < 5 || !pAMgr) - return ns; + return wlst.size(); for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; @@ -565,24 +492,20 @@ int SuggestMgr::doubletwochars_utf(char** wlst, candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl); std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); state = 0; } } else { state = 0; } } - return ns; + return wlst.size(); } // error is wrong char in place of correct one (case and keyboard related // version) -int SuggestMgr::badcharkey(char** wlst, +int SuggestMgr::badcharkey(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); @@ -593,9 +516,7 @@ int SuggestMgr::badcharkey(char** wlst, // check with uppercase letters candidate[i] = csconv[((unsigned char)tmpc)].cupper; if (tmpc != candidate[i]) { - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); candidate[i] = tmpc; } // check neighbor characters in keyboard string @@ -605,29 +526,24 @@ int SuggestMgr::badcharkey(char** wlst, while (loc) { if ((loc > ckey) && (*(loc - 1) != '|')) { candidate[i] = *(loc - 1); - ns = testsug(wlst, 
candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { candidate[i] = *(loc + 1); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } loc = strchr(loc + 1, tmpc); } candidate[i] = tmpc; } - return ns; + return wlst.size(); } // error is wrong char in place of correct one (case and keyboard related // version) -int SuggestMgr::badcharkey_utf(char** wlst, +int SuggestMgr::badcharkey_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::string candidate; std::vector<w_char> candidate_utf(word, word + wl); @@ -639,73 +555,61 @@ int SuggestMgr::badcharkey_utf(char** wlst, candidate_utf[i] = upper_utf(candidate_utf[i], 1); if (tmpc != candidate_utf[i]) { u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); candidate_utf[i] = tmpc; } // check neighbor characters in keyboard string if (!ckey) continue; - w_char* loc = ckey_utf; - while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc) - loc++; - while (loc < (ckey_utf + ckeyl)) { - if ((loc > ckey_utf) && *(loc - 1) != W_VLINE) { - candidate_utf[i] = *(loc - 1); + size_t loc = 0; + while ((loc < ckeyl) && ckey_utf[loc] != tmpc) + ++loc; + while (loc < ckeyl) { + if ((loc > 0) && ckey_utf[loc - 1] != W_VLINE) { + candidate_utf[i] = ckey_utf[loc - 1]; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } - if (((loc + 1) < (ckey_utf + ckeyl)) && (*(loc + 1) != W_VLINE)) { - candidate_utf[i] = *(loc + 1); + if (((loc + 1) < ckeyl) && 
(ckey_utf[loc + 1] != W_VLINE)) { + candidate_utf[i] = ckey_utf[loc + 1]; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } do { loc++; - } while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc); + } while ((loc < ckeyl) && ckey_utf[loc] != tmpc); } candidate_utf[i] = tmpc; } - return ns; + return wlst.size(); } // error is wrong char in place of correct one -int SuggestMgr::badchar(char** wlst, const char* word, int ns, int cpdsuggest) { +int SuggestMgr::badchar(std::vector<std::string>& wlst, const char* word, int cpdsuggest) { std::string candidate(word); clock_t timelimit = clock(); int timer = MINTIMER; // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int j = 0; j < ctryl; j++) { + for (size_t j = 0; j < ctryl; ++j) { for (std::string::reverse_iterator aI = candidate.rbegin(), aEnd = candidate.rend(); aI != aEnd; ++aI) { char tmpc = *aI; if (ctry[j] == tmpc) continue; *aI = ctry[j]; - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, &timer, &timelimit); if (!timer) - return ns; + return wlst.size(); *aI = tmpc; } } - return ns; + return wlst.size(); } // error is wrong char in place of correct one -int SuggestMgr::badchar_utf(char** wlst, +int SuggestMgr::badchar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> candidate_utf(word, word + wl); std::string candidate; @@ -713,34 +617,30 @@ int SuggestMgr::badchar_utf(char** wlst, int timer = MINTIMER; // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int j = 0; j < ctryl; j++) { + for (size_t j = 0; j < ctryl; ++j) { for (int i = wl - 1; i >= 0; i--) { w_char 
tmpc = candidate_utf[i]; if (tmpc == ctry_utf[j]) continue; candidate_utf[i] = ctry_utf[j]; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, - &timelimit); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, &timer, &timelimit); if (!timer) - return ns; + return wlst.size(); candidate_utf[i] = tmpc; } } - return ns; + return wlst.size(); } // error is word has an extra letter it does not need -int SuggestMgr::extrachar_utf(char** wlst, +int SuggestMgr::extrachar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> candidate_utf(word, word + wl); if (candidate_utf.size() < 2) - return ns; + return wlst.size(); // try omitting one char of word at a time for (size_t i = 0; i < candidate_utf.size(); ++i) { size_t index = candidate_utf.size() - 1 - i; @@ -748,39 +648,33 @@ int SuggestMgr::extrachar_utf(char** wlst, candidate_utf.erase(candidate_utf.begin() + index); std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); candidate_utf.insert(candidate_utf.begin() + index, tmpc); } - return ns; + return wlst.size(); } // error is word has an extra letter it does not need -int SuggestMgr::extrachar(char** wlst, +int SuggestMgr::extrachar(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); if (candidate.size() < 2) - return ns; + return wlst.size(); // try omitting one char of word at a time for (size_t i = 0; i < candidate.size(); ++i) { size_t index = candidate.size() - 1 - i; char tmpc = candidate[index]; candidate.erase(candidate.begin() + index); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, 
NULL); candidate.insert(candidate.begin() + index, tmpc); } - return ns; + return wlst.size(); } // error is missing a letter it needs -int SuggestMgr::forgotchar(char** wlst, +int SuggestMgr::forgotchar(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); clock_t timelimit = clock(); @@ -788,26 +682,23 @@ int SuggestMgr::forgotchar(char** wlst, // try inserting a tryme character before every letter (and the null // terminator) - for (int k = 0; k < ctryl; ++k) { + for (size_t k = 0; k < ctryl; ++k) { for (size_t i = 0; i <= candidate.size(); ++i) { size_t index = candidate.size() - i; candidate.insert(candidate.begin() + index, ctry[k]); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, &timer, &timelimit); if (!timer) - return ns; + return wlst.size(); candidate.erase(candidate.begin() + index); } } - return ns; + return wlst.size(); } // error is missing a letter it needs -int SuggestMgr::forgotchar_utf(char** wlst, +int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> candidate_utf(word, word + wl); clock_t timelimit = clock(); @@ -815,36 +706,32 @@ int SuggestMgr::forgotchar_utf(char** wlst, // try inserting a tryme character at the end of the word and before every // letter - for (int k = 0; k < ctryl; ++k) { + for (size_t k = 0; k < ctryl; ++k) { for (size_t i = 0; i <= candidate_utf.size(); ++i) { size_t index = candidate_utf.size() - i; candidate_utf.insert(candidate_utf.begin() + index, ctry_utf[k]); std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, - &timelimit); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, &timer, &timelimit); if (!timer) - return ns; + return wlst.size(); 
candidate_utf.erase(candidate_utf.begin() + index); } } - return ns; + return wlst.size(); } /* error is should have been two words */ -int SuggestMgr::twowords(char** wlst, +int SuggestMgr::twowords(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { - int c1, c2; + int c2; int forbidden = 0; int cwrd; int wl = strlen(word); if (wl < 3) - return ns; + return wlst.size(); if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); @@ -864,9 +751,9 @@ int SuggestMgr::twowords(char** wlst, if (utf8 && p[1] == '\0') break; // last UTF-8 character *p = '\0'; - c1 = checkword(candidate, strlen(candidate), cpdsuggest, NULL, NULL); + int c1 = checkword(candidate, cpdsuggest, NULL, NULL); if (c1) { - c2 = checkword((p + 1), strlen(p + 1), cpdsuggest, NULL, NULL); + c2 = checkword((p + 1), cpdsuggest, NULL, NULL); if (c2) { *p = ' '; @@ -880,24 +767,19 @@ int SuggestMgr::twowords(char** wlst, *p = '-'; cwrd = 1; - for (int k = 0; k < ns; k++) { - if (strcmp(candidate, wlst[k]) == 0) { + for (size_t k = 0; k < wlst.size(); ++k) { + if (wlst[k] == candidate) { cwrd = 0; break; } } - if (ns < maxSug) { + if (wlst.size() < maxSug) { if (cwrd) { - wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) { - free(candidate); - return -1; - } - ns++; + wlst.push_back(candidate); } } else { free(candidate); - return ns; + return wlst.size(); } // add two word suggestion with dash, if TRY string contains // "a" or "-" @@ -905,48 +787,40 @@ int SuggestMgr::twowords(char** wlst, if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) { *p = '-'; - for (int k = 0; k < ns; k++) { - if (strcmp(candidate, wlst[k]) == 0) { + for (size_t k = 0; k < wlst.size(); ++k) { + if (wlst[k] == candidate) { cwrd = 0; break; } } - if (ns < maxSug) { + if (wlst.size() < maxSug) { if (cwrd) { - wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) { - free(candidate); - return -1; - } - ns++; + 
wlst.push_back(candidate); } } else { free(candidate); - return ns; + return wlst.size(); } } } } } free(candidate); - return ns; + return wlst.size(); } // error is adjacent letter were swapped -int SuggestMgr::swapchar(char** wlst, +int SuggestMgr::swapchar(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); if (candidate.size() < 2) - return ns; + return wlst.size(); // try swapping adjacent chars one by one for (size_t i = 0; i < candidate.size() - 1; ++i) { std::swap(candidate[i], candidate[i+1]); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(candidate[i], candidate[i+1]); } @@ -958,40 +832,33 @@ int SuggestMgr::swapchar(char** wlst, candidate[2] = word[2]; candidate[candidate.size() - 2] = word[candidate.size() - 1]; candidate[candidate.size() - 1] = word[candidate.size() - 2]; - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); if (candidate.size() == 5) { candidate[0] = word[0]; candidate[1] = word[2]; candidate[2] = word[1]; - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } } - return ns; + return wlst.size(); } // error is adjacent letter were swapped -int SuggestMgr::swapchar_utf(char** wlst, +int SuggestMgr::swapchar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> candidate_utf(word, word + wl); if (candidate_utf.size() < 2) - return ns; + return wlst.size(); std::string candidate; // try swapping adjacent chars one by one for (size_t i = 0; i < candidate_utf.size() - 1; ++i) { std::swap(candidate_utf[i], candidate_utf[i+1]); u16_u8(candidate, candidate_utf); - ns = 
testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(candidate_utf[i], candidate_utf[i+1]); } @@ -1004,76 +871,64 @@ int SuggestMgr::swapchar_utf(char** wlst, candidate_utf[candidate_utf.size() - 2] = word[candidate_utf.size() - 1]; candidate_utf[candidate_utf.size() - 1] = word[candidate_utf.size() - 2]; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); if (candidate_utf.size() == 5) { candidate_utf[0] = word[0]; candidate_utf[1] = word[2]; candidate_utf[2] = word[1]; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } } - return ns; + return wlst.size(); } // error is not adjacent letter were swapped -int SuggestMgr::longswapchar(char** wlst, +int SuggestMgr::longswapchar(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); // try swapping not adjacent chars one by one for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) { - if (abs(std::distance(q, p)) > 1) { + if (std::abs(std::distance(q, p)) > 1) { std::swap(*p, *q); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(*p, *q); } } } - return ns; + return wlst.size(); } // error is adjacent letter were swapped -int SuggestMgr::longswapchar_utf(char** wlst, +int SuggestMgr::longswapchar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> candidate_utf(word, word 
+ wl); // try swapping not adjacent chars for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) { - if (abs(std::distance(q, p)) > 1) { + if (std::abs(std::distance(q, p)) > 1) { std::swap(*p, *q); std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(*p, *q); } } } - return ns; + return wlst.size(); } // error is a letter was moved -int SuggestMgr::movechar(char** wlst, +int SuggestMgr::movechar(std::vector<std::string>& wlst, const char* word, - int ns, int cpdsuggest) { std::string candidate(word); if (candidate.size() < 2) - return ns; + return wlst.size(); // try moving a char for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { @@ -1081,9 +936,7 @@ int SuggestMgr::movechar(char** wlst, std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } std::copy(word, word + candidate.size(), candidate.begin()); } @@ -1093,25 +946,22 @@ int SuggestMgr::movechar(char** wlst, std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } std::copy(word, word + candidate.size(), candidate.begin()); } - return ns; + return wlst.size(); } // error is a letter was moved -int SuggestMgr::movechar_utf(char** wlst, +int SuggestMgr::movechar_utf(std::vector<std::string>& wlst, const w_char* word, int wl, - int ns, int cpdsuggest) { std::vector<w_char> 
candidate_utf(word, word + wl); if (candidate_utf.size() < 2) - return ns; + return wlst.size(); // try moving a char for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { @@ -1121,39 +971,30 @@ int SuggestMgr::movechar_utf(char** wlst, continue; // omit swap char std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } std::copy(word, word + candidate_utf.size(), candidate_utf.begin()); } - for (std::vector<w_char>::iterator p = candidate_utf.begin() + candidate_utf.size() - 1; p > candidate_utf.begin(); --p) { - for (std::vector<w_char>::iterator q = p - 1; q >= candidate_utf.begin() && std::distance(q, p) < 10; --q) { - std::swap(*q, *(q + 1)); - if (std::distance(q, p) < 2) + for (std::vector<w_char>::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) { + for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) < 10; ++q) { + std::swap(*q, *(q - 1)); + if (std::distance(p, q) < 2) continue; // omit swap char std::string candidate; u16_u8(candidate, candidate_utf); - ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, - NULL); - if (ns == -1) - return -1; + testsug(wlst, candidate, cpdsuggest, NULL, NULL); } std::copy(word, word + candidate_utf.size(), candidate_utf.begin()); } - return ns; + return wlst.size(); } // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, +void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, const char* w, - int ns, - HashMgr** pHMgr, - int md) { - int i, j; + const std::vector<HashMgr*>& rHMgr) { int lval; int sc; int lp, lpphon; @@ -1165,7 +1006,7 @@ int SuggestMgr::ngsuggest(char** wlst, char* rootsphon[MAX_ROOTS]; int scores[MAX_ROOTS]; int scoresphon[MAX_ROOTS]; - for (i = 
0; i < MAX_ROOTS; i++) { + for (int i = 0; i < MAX_ROOTS; i++) { roots[i] = NULL; scores[i] = -100 * i; rootsphon[i] = NULL; @@ -1206,12 +1047,12 @@ int SuggestMgr::ngsuggest(char** wlst, phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; std::string target; std::string candidate; + std::vector<w_char> w_candidate; if (ph) { if (utf8) { - std::vector<w_char> _w; - u8_u16(_w, word); - mkallcap_utf(_w, langnum); - u16_u8(candidate, _w); + u8_u16(w_candidate, word); + mkallcap_utf(w_candidate, langnum); + u16_u8(candidate, w_candidate); } else { candidate.assign(word); if (!nonbmp) @@ -1225,8 +1066,17 @@ int SuggestMgr::ngsuggest(char** wlst, FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; - for (i = 0; i < md; i++) { - while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { + std::vector<w_char> w_word, w_target; + if (utf8) { + u8_u16(w_word, word); + u8_u16(w_target, target); + } + + std::string f; + std::vector<w_char> w_f; + + for (size_t i = 0; i < rHMgr.size(); ++i) { + while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, forbiddenword, hp->alen) || TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || @@ -1235,15 +1085,48 @@ int SuggestMgr::ngsuggest(char** wlst, TESTAFF(hp->astr, onlyincompound, hp->alen))) continue; - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(word, HENTRY_WORD(hp)); + if (utf8) { + u8_u16(w_f, HENTRY_WORD(hp)); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + } + sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon; + } else { + f.assign(HENTRY_WORD(hp)); + + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon; + } // check special 
pronounciation - std::string f; + f.clear(); if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { - int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + - +leftcommonsubstring(word, f.c_str()); + int sc2; + if (utf8) { + u8_u16(w_f, f); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + } + sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon; + } else { + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon; + } if (sc2 > sc) sc = sc2; } @@ -1251,23 +1134,29 @@ int SuggestMgr::ngsuggest(char** wlst, int scphon = -20000; if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { if (utf8) { - std::vector<w_char> _w; - u8_u16(_w, HENTRY_WORD(hp)); - mkallcap_utf(_w, langnum); - u16_u8(candidate, _w); + u8_u16(w_candidate, HENTRY_WORD(hp)); + mkallcap_utf(w_candidate, langnum); + u16_u8(candidate, w_candidate); } else { - candidate.assign(HENTRY_WORD(hp)); + candidate = HENTRY_WORD(hp); mkallcap(candidate, csconv); } - std::string target2 = phonet(candidate, *ph); - scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); + f = phonet(candidate, *ph); + if (utf8) { + u8_u16(w_f, f); + scphon = 2 * ngram(3, w_target, w_f, + NGRAM_LONGER_WORSE); + } else { + scphon = 2 * ngram(3, target, f, + NGRAM_LONGER_WORSE); + } } if (sc > scores[lp]) { scores[lp] = sc; roots[lp] = hp; lval = sc; - for (j = 0; j < MAX_ROOTS; j++) + for (int j = 0; j < MAX_ROOTS; j++) if (scores[j] < lval) { lp = j; lval = scores[j]; @@ -1278,7 +1167,7 @@ int SuggestMgr::ngsuggest(char** wlst, scoresphon[lpphon] = scphon; rootsphon[lpphon] = HENTRY_WORD(hp); lval = scphon; - for (j = 0; j < MAX_ROOTS; j++) + for (int j = 0; j < MAX_ROOTS; j++) if (scoresphon[j] < lval) { lpphon = j; lval = scoresphon[j]; @@ -1290,21 +1179,33 @@ int 
SuggestMgr::ngsuggest(char** wlst, // find minimum threshold for a passable suggestion // mangle original word three differnt ways // and score them to generate a minimum acceptable score + std::vector<w_char> w_mw; int thresh = 0; for (int sp = 1; sp < 4; sp++) { if (utf8) { + w_mw = w_word; for (int k = sp; k < n; k += 4) { - u8[k].l = '*'; - u8[k].h = 0; + w_mw[k].l = '*'; + w_mw[k].h = 0; } - std::string mw; - u16_u8(mw, u8); - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + + if (low) { + // lowering dictionary word + mkallsmall_utf(w_mw, langnum); + } + + thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH); } else { - std::string mw(word); + std::string mw = word; for (int k = sp; k < n; k += 4) mw[k] = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + + if (low) { + // lowering dictionary word + mkallsmall(mw, csconv); + } + + thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH); } } thresh = thresh / 3; @@ -1316,7 +1217,7 @@ int SuggestMgr::ngsuggest(char** wlst, char* guess[MAX_GUESS]; char* guessorig[MAX_GUESS]; int gscore[MAX_GUESS]; - for (i = 0; i < MAX_GUESS; i++) { + for (int i = 0; i < MAX_GUESS; i++) { guess[i] = NULL; guessorig[i] = NULL; gscore[i] = -100 * i; @@ -1329,14 +1230,14 @@ int SuggestMgr::ngsuggest(char** wlst, if (!glst) { if (nonbmp) utf8 = 1; - return ns; + return; } - for (i = 0; i < MAX_ROOTS; i++) { + for (int i = 0; i < MAX_ROOTS; i++) { if (roots[i]) { struct hentry* rp = roots[i]; - std::string f; + f.clear(); const char *field = NULL; if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) field = f.c_str(); @@ -1345,8 +1246,27 @@ int SuggestMgr::ngsuggest(char** wlst, nc, field); for (int k = 0; k < nw; k++) { - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + - leftcommonsubstring(word, glst[k].word); + if (utf8) { + u8_u16(w_f, glst[k].word); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + 
mkallsmall_utf(w_f, langnum); + } + + sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon; + } else { + f = glst[k].word; + + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + + sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon; + } if (sc > thresh) { if (sc > gscore[lp]) { @@ -1361,7 +1281,7 @@ int SuggestMgr::ngsuggest(char** wlst, guess[lp] = glst[k].word; guessorig[lp] = glst[k].orig; lval = sc; - for (j = 0; j < MAX_GUESS; j++) + for (int j = 0; j < MAX_GUESS; j++) if (gscore[j] < lval) { lp = j; lval = gscore[j]; @@ -1400,16 +1320,16 @@ int SuggestMgr::ngsuggest(char** wlst, fact = (10.0 - maxd) / 5.0; } - for (i = 0; i < MAX_GUESS; i++) { + std::vector<w_char> w_gl; + for (int i = 0; i < MAX_GUESS; i++) { if (guess[i]) { // lowering guess[i] std::string gl; int len; if (utf8) { - std::vector<w_char> _w; - len = u8_u16(_w, guess[i]); - mkallsmall_utf(_w, langnum); - u16_u8(gl, _w); + len = u8_u16(w_gl, guess[i]); + mkallsmall_utf(w_gl, langnum); + u16_u8(gl, w_gl); } else { gl.assign(guess[i]); if (!nonbmp) @@ -1426,14 +1346,46 @@ int SuggestMgr::ngsuggest(char** wlst, } // using 2-gram instead of 3, and other weightening - re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + - ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + if (utf8) { + u8_u16(w_gl, gl); + //w_gl is lowercase already at this point + re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + if (low) { + w_f = w_word; + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } else { + re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } + } else { + //gl is lowercase already at this point + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + if (low) { + f = word; + // lowering dictionary word + mkallsmall(f, csconv); + re += ngram(2, gl, f, 
NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } else { + re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } + } + int ngram_score, leftcommon_score; + if (utf8) { + //w_gl is lowercase already at this point + ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH); + leftcommon_score = leftcommonsubstring(w_word, w_gl); + } else { + //gl is lowercase already at this point + ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH); + leftcommon_score = leftcommonsubstring(word, gl.c_str()); + } gscore[i] = // length of longest common subsequent minus length difference 2 * _lcs - abs((int)(n - len)) + // weight length of the left common substring - leftcommonsubstring(word, gl.c_str()) + + leftcommon_score + // weight equal character positions (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) ? 1 @@ -1441,7 +1393,7 @@ int SuggestMgr::ngsuggest(char** wlst, // swap character (not neighboring) ((is_swap) ? 10 : 0) + // ngram - ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + + ngram_score + // weighted ngrams re + // different limit for dictionaries with PHONE rules @@ -1454,16 +1406,15 @@ int SuggestMgr::ngsuggest(char** wlst, // phonetic version if (ph) - for (i = 0; i < MAX_ROOTS; i++) { + for (int i = 0; i < MAX_ROOTS; i++) { if (rootsphon[i]) { // lowering rootphon[i] std::string gl; int len; if (utf8) { - std::vector<w_char> _w; - len = u8_u16(_w, rootsphon[i]); - mkallsmall_utf(_w, langnum); - u16_u8(gl, _w); + len = u8_u16(w_gl, rootsphon[i]); + mkallsmall_utf(w_gl, langnum); + u16_u8(gl, w_gl); } else { gl.assign(rootsphon[i]); if (!nonbmp) @@ -1471,10 +1422,15 @@ int SuggestMgr::ngsuggest(char** wlst, len = strlen(rootsphon[i]); } + // weight length of the left common substring + int leftcommon_score; + if (utf8) + leftcommon_score = leftcommonsubstring(w_word, w_gl); + else + leftcommon_score = leftcommonsubstring(word, gl.c_str()); // heuristic weigthing of ngram scores scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + - // 
weight length of the left common substring - leftcommonsubstring(word, gl.c_str()); + leftcommon_score; } } @@ -1482,12 +1438,12 @@ int SuggestMgr::ngsuggest(char** wlst, bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); // copy over - int oldns = ns; + size_t oldns = wlst.size(); int same = 0; - for (i = 0; i < MAX_GUESS; i++) { + for (int i = 0; i < MAX_GUESS; i++) { if (guess[i]) { - if ((ns < oldns + maxngramsugs) && (ns < maxSug) && + if ((wlst.size() < oldns + maxngramsugs) && (wlst.size() < maxSug) && (!same || (gscore[i] > 1000))) { int unique = 1; // leave only excellent suggestions, if exists @@ -1496,35 +1452,34 @@ int SuggestMgr::ngsuggest(char** wlst, else if (gscore[i] < -100) { same = 1; // keep the best ngram suggestions, unless in ONLYMAXDIFF mode - if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) { + if (wlst.size() > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) { free(guess[i]); if (guessorig[i]) free(guessorig[i]); continue; } } - for (j = 0; j < ns; j++) { + for (size_t j = 0; j < wlst.size(); ++j) { // don't suggest previous suggestions or a previous suggestion with // prefixes or affixes - if ((!guessorig[i] && strstr(guess[i], wlst[j])) || - (guessorig[i] && strstr(guessorig[i], wlst[j])) || + if ((!guessorig[i] && strstr(guess[i], wlst[j].c_str())) || + (guessorig[i] && strstr(guessorig[i], wlst[j].c_str())) || // check forbidden words - !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) { + !checkword(guess[i], 0, NULL, NULL)) { unique = 0; break; } } if (unique) { - wlst[ns++] = guess[i]; if (guessorig[i]) { - free(guess[i]); - wlst[ns - 1] = guessorig[i]; + wlst.push_back(guessorig[i]); + } else { + wlst.push_back(guess[i]); } - } else { - free(guess[i]); - if (guessorig[i]) - free(guessorig[i]); } + free(guess[i]); + if (guessorig[i]) + free(guessorig[i]); } else { free(guess[i]); if (guessorig[i]) @@ -1533,26 +1488,24 @@ int SuggestMgr::ngsuggest(char** wlst, } } - oldns = ns; + oldns = wlst.size(); if (ph) - for 
(i = 0; i < MAX_ROOTS; i++) { + for (int i = 0; i < MAX_ROOTS; i++) { if (rootsphon[i]) { - if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { + if ((wlst.size() < oldns + MAXPHONSUGS) && (wlst.size() < maxSug)) { int unique = 1; - for (j = 0; j < ns; j++) { + for (size_t j = 0; j < wlst.size(); ++j) { // don't suggest previous suggestions or a previous suggestion with // prefixes or affixes - if (strstr(rootsphon[i], wlst[j]) || + if (strstr(rootsphon[i], wlst[j].c_str()) || // check forbidden words - !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) { + !checkword(rootsphon[i], 0, NULL, NULL)) { unique = 0; break; } } if (unique) { - wlst[ns++] = mystrdup(rootsphon[i]); - if (!wlst[ns - 1]) - return ns - 1; + wlst.push_back(rootsphon[i]); } } } @@ -1560,7 +1513,6 @@ int SuggestMgr::ngsuggest(char** wlst, if (nonbmp) utf8 = 1; - return ns; } // see if a candidate suggestion is spelled correctly @@ -1569,15 +1521,10 @@ int SuggestMgr::ngsuggest(char** wlst, // obsolote MySpell-HU modifications: // return value 2 and 3 marks compounding with hyphen (-) // `3' marks roots without suffix -int SuggestMgr::checkword(const char* word, - int len, +int SuggestMgr::checkword(const std::string& word, int cpdsuggest, int* timer, clock_t* timelimit) { - struct hentry* rv = NULL; - struct hentry* rv2 = NULL; - int nosuffix = 0; - // check time limit if (timer) { (*timer)--; @@ -1589,13 +1536,16 @@ int SuggestMgr::checkword(const char* word, } if (pAMgr) { + struct hentry* rv = NULL; + int nosuffix = 0; + if (cpdsuggest == 1) { if (pAMgr->get_compound()) { + struct hentry* rv2 = NULL; struct hentry* rwords[100]; // buffer for COMPOUND pattern checking - rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 1, - 0); // EXT + rv = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 1, 0); // EXT if (rv && - (!(rv2 = pAMgr->lookup(word)) || !rv2->astr || + (!(rv2 = pAMgr->lookup(word.c_str())) || !rv2->astr || 
!(TESTAFF(rv2->astr, pAMgr->get_forbiddenword(), rv2->alen) || TESTAFF(rv2->astr, pAMgr->get_nosuggest(), rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix @@ -1604,7 +1554,7 @@ int SuggestMgr::checkword(const char* word, return 0; } - rv = pAMgr->lookup(word); + rv = pAMgr->lookup(word.c_str()); if (rv) { if ((rv->astr) && @@ -1621,20 +1571,20 @@ int SuggestMgr::checkword(const char* word, break; } } else - rv = pAMgr->prefix_check(word, len, + rv = pAMgr->prefix_check(word.c_str(), word.size(), 0); // only prefix, and prefix + suffix XXX if (rv) { nosuffix = 1; } else { - rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, - NULL); // only suffix + rv = pAMgr->suffix_check(word.c_str(), word.size(), 0, NULL, + FLAG_NULL, FLAG_NULL, IN_CPD_NOT); // only suffix } if (!rv && pAMgr->have_contclass()) { - rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL); + rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL); if (!rv) - rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL); + rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 1, FLAG_NULL); } // check forbidden words @@ -1656,17 +1606,15 @@ int SuggestMgr::checkword(const char* word, } int SuggestMgr::check_forbidden(const char* word, int len) { - struct hentry* rv = NULL; - if (pAMgr) { - rv = pAMgr->lookup(word); + struct hentry* rv = pAMgr->lookup(word); if (rv && rv->astr && (TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word, len, 1))) - rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, - NULL); // prefix+suffix, suffix + rv = pAMgr->suffix_check(word, len, 0, NULL, + FLAG_NULL, FLAG_NULL, IN_CPD_NOT); // prefix+suffix, suffix // check forbidden words if ((rv) && (rv->astr) && TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen)) @@ -1675,32 +1623,25 @@ int SuggestMgr::check_forbidden(const char* word, int len) { return 
0; } -char* SuggestMgr::suggest_morph(const char* w) { - char result[MAXLNLEN]; - char* r = (char*)result; - char* st; +std::string SuggestMgr::suggest_morph(const std::string& in_w) { + std::string result; struct hentry* rv = NULL; - *result = '\0'; - if (!pAMgr) - return NULL; + return std::string(); - std::string w2; - const char* word = w; + std::string w(in_w); // word reversing wrapper for complex prefixes if (complexprefixes) { - w2.assign(w); if (utf8) - reverseword_utf(w2); + reverseword_utf(w); else - reverseword(w2); - word = w2.c_str(); + reverseword(w); } - rv = pAMgr->lookup(word); + rv = pAMgr->lookup(w.c_str()); while (rv) { if ((!rv->astr) || @@ -1708,65 +1649,83 @@ char* SuggestMgr::suggest_morph(const char* w) { TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) { if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, word, MAXLNLEN); + result.append(" "); + result.append(MORPH_STEM); + result.append(w); } if (HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + result.append(" "); + result.append(HENTRY_DATA2(rv)); } - mystrcat(result, "\n", MAXLNLEN); + result.append("\n"); } rv = rv->next_homonym; } - st = pAMgr->affix_check_morph(word, strlen(word)); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); + std::string st = pAMgr->affix_check_morph(w.c_str(), w.size()); + if (!st.empty()) { + result.append(st); } - if (pAMgr->get_compound() && (*result == '\0')) { + if (pAMgr->get_compound() && result.empty()) { struct hentry* rwords[100]; // buffer for COMPOUND pattern checking - pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, &r, + pAMgr->compound_check_morph(w.c_str(), w.size(), 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, result, NULL); } - return (*result) ? 
mystrdup(line_uniq(result, MSEP_REC)) : NULL; + line_uniq(result, MSEP_REC); + + return result; +} + +static int get_sfxcount(const char* morph) { + if (!morph || !*morph) + return 0; + int n = 0; + const char* old = morph; + morph = strstr(morph, MORPH_DERI_SFX); + if (!morph) + morph = strstr(old, MORPH_INFL_SFX); + if (!morph) + morph = strstr(old, MORPH_TERM_SFX); + while (morph) { + n++; + old = morph; + morph = strstr(morph + 1, MORPH_DERI_SFX); + if (!morph) + morph = strstr(old + 1, MORPH_INFL_SFX); + if (!morph) + morph = strstr(old + 1, MORPH_TERM_SFX); + } + return n; } /* affixation */ -char* SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { - char result[MAXLNLEN]; - *result = '\0'; +std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { + std::string result; int sfxcount = get_sfxcount(pattern); if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) - return NULL; + return result; if (HENTRY_DATA(rv)) { - char* aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, - HENTRY_DATA(rv), pattern, 0); - if (aff) { - mystrcat(result, aff, MAXLNLEN); - mystrcat(result, "\n", MAXLNLEN); - free(aff); + std::string aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, + HENTRY_DATA(rv), pattern, 0); + if (!aff.empty()) { + result.append(aff); + result.append("\n"); } } // check all allomorphs - char allomorph[MAXLNLEN]; char* p = NULL; if (HENTRY_DATA(rv)) p = (char*)strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); while (p) { - struct hentry* rv2 = NULL; p += MORPH_TAG_LEN; int plen = fieldlen(p); - strncpy(allomorph, p, plen); - allomorph[plen] = '\0'; - rv2 = pAMgr->lookup(allomorph); + std::string allomorph(p, plen); + struct hentry* rv2 = pAMgr->lookup(allomorph.c_str()); while (rv2) { // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= // sfxcount) { @@ -1774,12 +1733,11 @@ char* SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { char* st = (char*)strstr(HENTRY_DATA2(rv2), 
MORPH_STEM); if (st && (strncmp(st + MORPH_TAG_LEN, HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { - char* aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, - rv2->alen, HENTRY_DATA(rv2), pattern, 0); - if (aff) { - mystrcat(result, aff, MAXLNLEN); - mystrcat(result, "\n", MAXLNLEN); - free(aff); + std::string aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, + rv2->alen, HENTRY_DATA(rv2), pattern, 0); + if (!aff.empty()) { + result.append(aff); + result.append("\n"); } } } @@ -1788,27 +1746,28 @@ char* SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { p = strstr(p + plen, MORPH_ALLOMORPH); } - return (*result) ? mystrdup(result) : NULL; + return result; } -char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) { - if (n == 0 || !pAMgr) - return NULL; +std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const std::string& in_pattern) { + if (desc.empty() || !pAMgr) + return std::string(); + const char* pattern = in_pattern.c_str(); std::string result2; std::string newpattern; struct hentry* rv = NULL; // search affixed forms with and without derivational suffixes while (1) { - for (int k = 0; k < n; k++) { + for (size_t k = 0; k < desc.size(); ++k) { std::string result; // add compound word parts (except the last one) - char* s = (char*)desc[k]; - char* part = strstr(s, MORPH_PART); + const char* s = desc[k].c_str(); + const char* part = strstr(s, MORPH_PART); if (part) { - char* nextpart = strstr(part + 1, MORPH_PART); + const char* nextpart = strstr(part + 1, MORPH_PART); while (nextpart) { std::string field; copy_field(field, part, MORPH_PART); @@ -1819,56 +1778,50 @@ char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) { s = part; } - char** pl; std::string tok(s); size_t pos = tok.find(" | "); while (pos != std::string::npos) { tok[pos + 1] = MSEP_ALT; pos = tok.find(" | ", pos); } - int pln = line_tok(tok.c_str(), &pl, MSEP_ALT); - for (int i = 0; i < pln; 
i++) { + std::vector<std::string> pl = line_tok(tok, MSEP_ALT); + for (size_t i = 0; i < pl.size(); ++i) { // remove inflectional and terminal suffixes - char* is = strstr(pl[i], MORPH_INFL_SFX); - if (is) - *is = '\0'; - char* ts = strstr(pl[i], MORPH_TERM_SFX); - while (ts) { - *ts = '_'; - ts = strstr(pl[i], MORPH_TERM_SFX); + size_t is = pl[i].find(MORPH_INFL_SFX); + if (is != std::string::npos) + pl[i].resize(is); + size_t ts = pl[i].find(MORPH_TERM_SFX); + while (ts != std::string::npos) { + pl[i][ts] = '_'; + ts = pl[i].find(MORPH_TERM_SFX); } - char* st = strstr(s, MORPH_STEM); + const char* st = strstr(s, MORPH_STEM); if (st) { copy_field(tok, st, MORPH_STEM); rv = pAMgr->lookup(tok.c_str()); while (rv) { std::string newpat(pl[i]); newpat.append(pattern); - char* sg = suggest_hentry_gen(rv, newpat.c_str()); - if (!sg) + std::string sg = suggest_hentry_gen(rv, newpat.c_str()); + if (sg.empty()) sg = suggest_hentry_gen(rv, pattern); - if (sg) { - char** gen; - int genl = line_tok(sg, &gen, MSEP_REC); - free(sg); - sg = NULL; - for (int j = 0; j < genl; j++) { + if (!sg.empty()) { + std::vector<std::string> gen = line_tok(sg, MSEP_REC); + for (size_t j = 0; j < gen.size(); ++j) { result2.push_back(MSEP_REC); result2.append(result); - if (strstr(pl[i], MORPH_SURF_PFX)) { + if (pl[i].find(MORPH_SURF_PFX) != std::string::npos) { std::string field; copy_field(field, pl[i], MORPH_SURF_PFX); result2.append(field); } result2.append(gen[j]); } - freelist(&gen, genl); } rv = rv->next_homonym; } } } - freelist(&pl, pln); } if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX)) @@ -1878,13 +1831,13 @@ char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) { mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX); pattern = newpattern.c_str(); } - return (!result2.empty() ? 
mystrdup(result2.c_str()) : NULL); + return result2; } -// generate an n-gram score comparing s1 and s2 +// generate an n-gram score comparing s1 and s2, UTF16 version int SuggestMgr::ngram(int n, - const std::string& s1, - const std::string& s2, + const std::vector<w_char>& su1, + const std::vector<w_char>& su2, int opt) { int nscore = 0; int ns; @@ -1892,68 +1845,36 @@ int SuggestMgr::ngram(int n, int l2; int test = 0; - if (utf8) { - std::vector<w_char> su1; - std::vector<w_char> su2; - l1 = u8_u16(su1, s1); - l2 = u8_u16(su2, s2); - if ((l2 <= 0) || (l1 == -1)) - return 0; - // lowering dictionary word - if (opt & NGRAM_LOWERING) - mkallsmall_utf(su2, langnum); - for (int j = 1; j <= n; j++) { - ns = 0; - for (int i = 0; i <= (l1 - j); i++) { - int k = 0; - for (int l = 0; l <= (l2 - j); l++) { - for (k = 0; k < j; k++) { - w_char& c1 = su1[i + k]; - w_char& c2 = su2[l + k]; - if ((c1.l != c2.l) || (c1.h != c2.h)) - break; - } - if (k == j) { - ns++; + l1 = su1.size(); + l2 = su2.size(); + if (l2 == 0) + return 0; + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + int k = 0; + for (int l = 0; l <= (l2 - j); l++) { + for (k = 0; k < j; k++) { + const w_char& c1 = su1[i + k]; + const w_char& c2 = su2[l + k]; + if ((c1.l != c2.l) || (c1.h != c2.h)) break; - } - } - if (k != j && opt & NGRAM_WEIGHTED) { - ns--; - test++; - if (i == 0 || i == l1 - j) - ns--; // side weight } - } - nscore = nscore + ns; - if (ns < 2 && !(opt & NGRAM_WEIGHTED)) - break; - } - } else { - l2 = s2.size(); - if (l2 == 0) - return 0; - l1 = s1.size(); - std::string t(s2); - if (opt & NGRAM_LOWERING) - mkallsmall(t, csconv); - for (int j = 1; j <= n; j++) { - ns = 0; - for (int i = 0; i <= (l1 - j); i++) { - std::string temp(s1.substr(i, j)); - if (t.find(temp) != std::string::npos) { + if (k == j) { ns++; - } else if (opt & NGRAM_WEIGHTED) { - ns--; - test++; - if (i == 0 || i == l1 - j) - ns--; // side weight + break; } } - nscore = nscore + ns; - if (ns 
< 2 && !(opt & NGRAM_WEIGHTED)) - break; + if (k != j && opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; } ns = 0; @@ -1965,46 +1886,92 @@ int SuggestMgr::ngram(int n, return ns; } -// length of the left common substring of s1 and (decapitalised) s2 -int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { - if (utf8) { - std::vector<w_char> su1; - std::vector<w_char> su2; - int l1 = u8_u16(su1, s1); - int l2 = u8_u16(su2, s2); - // decapitalize dictionary word - if (complexprefixes) { - if (su1[l1 - 1] == su2[l2 - 1]) - return 1; - } else { - unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; - unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; - if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) - return 0; - int i; - for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && - (su1[i].h == su2[i].h); - i++) - ; - return i; +// generate an n-gram score comparing s1 and s2, non-UTF16 version +int SuggestMgr::ngram(int n, + const std::string& s1, + const std::string& s2, + int opt) { + int nscore = 0; + int ns; + int l1; + int l2; + int test = 0; + + l2 = s2.size(); + if (l2 == 0) + return 0; + l1 = s1.size(); + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + //s2 is haystack, s1[i..i+j) is needle + if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) { + ns++; + } else if (opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; + } + + ns = 0; + if (opt & NGRAM_LONGER_WORSE) + ns = (l2 - l1) - 2; + if (opt & NGRAM_ANY_MISMATCH) + ns = abs(l2 - l1) - 2; + ns = (nscore - ((ns > 0) ? 
ns : 0)); + return ns; +} + +// length of the left common substring of s1 and (decapitalised) s2, UTF version +int SuggestMgr::leftcommonsubstring( + const std::vector<w_char>& su1, + const std::vector<w_char>& su2) { + int l1 = su1.size(); + int l2 = su2.size(); + // decapitalize dictionary word + if (complexprefixes) { + if (su1[l1 - 1] == su2[l2 - 1]) + return 1; } else { - if (complexprefixes) { - int l1 = strlen(s1); - int l2 = strlen(s2); - if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) - return 1; - } else if (csconv) { - const char* olds = s1; - // decapitalise dictionary word - if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) - return 0; - do { - s1++; - s2++; - } while ((*s1 == *s2) && (*s1 != '\0')); - return (int)(s1 - olds); - } + unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; + unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; + if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) + return 0; + int i; + for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && + (su1[i].h == su2[i].h); + i++) + ; + return i; + } + return 0; +} + +// length of the left common substring of s1 and (decapitalised) s2, non-UTF +int SuggestMgr::leftcommonsubstring( + const char* s1, + const char* s2) { + if (complexprefixes) { + int l1 = strlen(s1); + int l2 = strlen(s2); + if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) + return 1; + } else if (csconv) { + const char* olds = s1; + // decapitalise dictionary word + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) + return 0; + do { + s1++; + s2++; + } while ((*s1 == *s2) && (*s1 != '\0')); + return (int)(s1 - olds); } return 0; } @@ -2054,7 +2021,7 @@ int SuggestMgr::commoncharacterpositions(const char* s1, } else { mkallsmall(t, csconv); } - for (i = 0; (*(s1 + i) != 0) && i < t.size(); i++) { + for (i = 0; i < t.size() && (*(s1 + i) != 0); ++i) { if (*(s1 + i) == t[i]) { num++; } else { diff --git a/libs/hunspell/src/suggestmgr.hxx 
b/libs/hunspell/src/suggestmgr.hxx index 675d98eb8f..19ffc03a84 100644 --- a/libs/hunspell/src/suggestmgr.hxx +++ b/libs/hunspell/src/suggestmgr.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -71,8 +68,8 @@ * SUCH DAMAGE. 
*/ -#ifndef _SUGGESTMGR_HXX_ -#define _SUGGESTMGR_HXX_ +#ifndef SUGGESTMGR_HXX_ +#define SUGGESTMGR_HXX_ #define MAX_ROOTS 100 #define MAX_WORDS 100 @@ -91,8 +88,6 @@ #define NGRAM_LOWERING (1 << 2) #define NGRAM_WEIGHTED (1 << 3) -#include "hunvisapi.h" - #include "atypes.hxx" #include "affixmgr.hxx" #include "hashmgr.hxx" @@ -101,22 +96,22 @@ enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; -class LIBHUNSPELL_DLL_EXPORTED SuggestMgr { +class SuggestMgr { private: SuggestMgr(const SuggestMgr&); SuggestMgr& operator=(const SuggestMgr&); private: char* ckey; - int ckeyl; - w_char* ckey_utf; + size_t ckeyl; + std::vector<w_char> ckey_utf; char* ctry; - int ctryl; - w_char* ctry_utf; + size_t ctryl; + std::vector<w_char> ctry_utf; AffixMgr* pAMgr; - int maxSug; + unsigned int maxSug; struct cs_info* csconv; int utf8; int langnum; @@ -126,73 +121,68 @@ class LIBHUNSPELL_DLL_EXPORTED SuggestMgr { int complexprefixes; public: - SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr); + SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr); ~SuggestMgr(); - int suggest(char*** slst, const char* word, int nsug, int* onlycmpdsug); - int ngsuggest(char** wlst, const char* word, int ns, HashMgr** pHMgr, int md); - int suggest_auto(char*** slst, const char* word, int nsug); - int suggest_stems(char*** slst, const char* word, int nsug); - int suggest_pos_stems(char*** slst, const char* word, int nsug); + void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug); + void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr); - char* suggest_morph(const char* word); - char* suggest_gen(char** pl, int pln, const char* pattern); - char* suggest_morph_for_spelling_error(const char* word); + std::string suggest_morph(const std::string& word); + std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern); private: - int testsug(char** wlst, - const char* candidate, - int wl, - int ns, - int 
cpdsuggest, - int* timer, - clock_t* timelimit); - int checkword(const char*, int, int, int*, clock_t*); + void testsug(std::vector<std::string>& wlst, + const std::string& candidate, + int cpdsuggest, + int* timer, + clock_t* timelimit); + int checkword(const std::string& word, int, int*, clock_t*); int check_forbidden(const char*, int); - int capchars(char**, const char*, int, int); - int replchars(char**, const char*, int, int); - int doubletwochars(char**, const char*, int, int); - int forgotchar(char**, const char*, int, int); - int swapchar(char**, const char*, int, int); - int longswapchar(char**, const char*, int, int); - int movechar(char**, const char*, int, int); - int extrachar(char**, const char*, int, int); - int badcharkey(char**, const char*, int, int); - int badchar(char**, const char*, int, int); - int twowords(char**, const char*, int, int); - int fixstems(char**, const char*, int); - - int capchars_utf(char**, const w_char*, int wl, int, int); - int doubletwochars_utf(char**, const w_char*, int wl, int, int); - int forgotchar_utf(char**, const w_char*, int wl, int, int); - int extrachar_utf(char**, const w_char*, int wl, int, int); - int badcharkey_utf(char**, const w_char*, int wl, int, int); - int badchar_utf(char**, const w_char*, int wl, int, int); - int swapchar_utf(char**, const w_char*, int wl, int, int); - int longswapchar_utf(char**, const w_char*, int, int, int); - int movechar_utf(char**, const w_char*, int, int, int); - - int mapchars(char**, const char*, int, int); + void capchars(std::vector<std::string>&, const char*, int); + int replchars(std::vector<std::string>&, const char*, int); + int doubletwochars(std::vector<std::string>&, const char*, int); + int forgotchar(std::vector<std::string>&, const char*, int); + int swapchar(std::vector<std::string>&, const char*, int); + int longswapchar(std::vector<std::string>&, const char*, int); + int movechar(std::vector<std::string>&, const char*, int); + int 
extrachar(std::vector<std::string>&, const char*, int); + int badcharkey(std::vector<std::string>&, const char*, int); + int badchar(std::vector<std::string>&, const char*, int); + int twowords(std::vector<std::string>&, const char*, int); + + void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int); + int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int); + int forgotchar_utf(std::vector<std::string>&, const w_char*, int wl, int); + int extrachar_utf(std::vector<std::string>&, const w_char*, int wl, int); + int badcharkey_utf(std::vector<std::string>&, const w_char*, int wl, int); + int badchar_utf(std::vector<std::string>&, const w_char*, int wl, int); + int swapchar_utf(std::vector<std::string>&, const w_char*, int wl, int); + int longswapchar_utf(std::vector<std::string>&, const w_char*, int, int); + int movechar_utf(std::vector<std::string>&, const w_char*, int, int); + + int mapchars(std::vector<std::string>&, const char*, int); int map_related(const char*, std::string&, int, - char** wlst, - int, - int, - const mapentry*, + std::vector<std::string>& wlst, int, + const std::vector<mapentry>&, int*, clock_t*); + int ngram(int n, const std::vector<w_char>& su1, + const std::vector<w_char>& su2, int opt); int ngram(int n, const std::string& s1, const std::string& s2, int opt); int mystrlen(const char* word); + int leftcommonsubstring(const std::vector<w_char>& su1, + const std::vector<w_char>& su2); int leftcommonsubstring(const char* s1, const char* s2); int commoncharacterpositions(const char* s1, const char* s2, int* is_swap); void bubblesort(char** rwd, char** rwd2, int* rsc, int n); void lcs(const char* s, const char* s2, int* l1, int* l2, char** result); int lcslen(const char* s, const char* s2); int lcslen(const std::string& s, const std::string& s2); - char* suggest_hentry_gen(hentry* rv, const char* pattern); + std::string suggest_hentry_gen(hentry* rv, const char* pattern); }; #endif diff --git 
a/libs/hunspell/src/utf_info.cxx b/libs/hunspell/src/utf_info.c++ index 74742b8e43..6bb847f2a6 100644 --- a/libs/hunspell/src/utf_info.cxx +++ b/libs/hunspell/src/utf_info.c++ @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -39,7 +36,6 @@ * ***** END LICENSE BLOCK ***** */ #include "csutil.hxx" - /* fields: Unicode letter, toupper, tolower */ static struct unicode_info utf_lst[] = { {0x0041, 0x0041, 0x0061}, {0x0042, 0x0042, 0x0062}, @@ -9878,4 +9874,3 @@ static struct unicode_info utf_lst[] = { {0xFFD5, 0xFFD5, 0xFFD5}, {0xFFD6, 0xFFD6, 0xFFD6}, {0xFFD7, 0xFFD7, 0xFFD7}, {0xFFDA, 0xFFDA, 0xFFDA}, {0xFFDB, 0xFFDB, 0xFFDB}, {0xFFDC, 0xFFDC, 0xFFDC}}; -
\ No newline at end of file diff --git a/libs/hunspell/src/w_char.hxx b/libs/hunspell/src/w_char.hxx index 336c454f79..5accb7568f 100644 --- a/libs/hunspell/src/w_char.hxx +++ b/libs/hunspell/src/w_char.hxx @@ -1,6 +1,8 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * + * Copyright (C) 2002-2017 Németh László + * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -11,12 +13,7 @@ * for the specific language governing rights and limitations under the * License. * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. + * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, @@ -38,8 +35,10 @@ * * ***** END LICENSE BLOCK ***** */ -#ifndef __WCHARHXX__ -#define __WCHARHXX__ +#ifndef W_CHAR_HXX_ +#define W_CHAR_HXX_ + +#include <string> #ifndef GCC struct w_char { @@ -66,10 +65,8 @@ struct __attribute__((packed)) w_char { // two character arrays struct replentry { - char* pattern; - char* pattern2; - bool start; - bool end; + std::string pattern; + std::string outstrings[4]; // med, ini, fin, isol }; #endif |