diff options
author | George Hazan <ghazan@miranda.im> | 2022-08-30 17:13:21 +0300 |
---|---|---|
committer | George Hazan <ghazan@miranda.im> | 2022-08-30 17:13:21 +0300 |
commit | 3ad2f2b7c2bfb3166363239d67a6645692ffb2b6 (patch) | |
tree | 0201fd31d0c0e5c193752f7b80cdc69096b563cf /libs/hunspell/src | |
parent | d82b809f6af58a1d10fa503138b912d336dca75e (diff) |
fixes #3183 (Update hunspell to 1.7.1)
Diffstat (limited to 'libs/hunspell/src')
25 files changed, 1438 insertions, 884 deletions
diff --git a/libs/hunspell/src/affentry.c++ b/libs/hunspell/src/affentry.c++ index 4ef0c00d9b..2cf4f4671f 100644 --- a/libs/hunspell/src/affentry.c++ +++ b/libs/hunspell/src/affentry.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -146,7 +146,7 @@ inline int PfxEntry::test_condition(const char* st) { break; } case ']': { - if ((neg && ingroup) || (!neg && !ingroup)) + if (bool(neg) == bool(ingroup)) return 0; pos = NULL; p = nextchar(p); @@ -224,7 +224,7 @@ struct hentry* PfxEntry::checkword(const char* word, // back any characters that would have been stripped std::string tmpword(strip); - tmpword.append(word + appnd.size()); + tmpword.append(word + appnd.size(), tmpl); // now make sure all of the conditions on characters // are met. Please see the appendix at the end of @@ -399,28 +399,28 @@ std::string PfxEntry::check_morph(const char* word, ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) { if (morphcode) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(morphcode); } else result.append(getKey()); if (!HENTRY_FIND(he, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(he)); } // store the pointer of the hash entry if (HENTRY_DATA(he)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(he)); } else { // return with debug information char* flag = pmyMgr->encode_flag(getFlag()); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_FLAG); result.append(flag); free(flag); } - result.append("\n"); + result.push_back(MSEP_REC); } he = he->next_homonym; } while (he); @@ -804,7 +804,7 @@ std::string SfxEntry::check_twosfx_morph(const char* word, if (!st.empty()) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } result.append(st); mychomp(result); diff --git a/libs/hunspell/src/affentry.hxx b/libs/hunspell/src/affentry.hxx index 535a96bc42..b736bf0350 100644 --- a/libs/hunspell/src/affentry.hxx +++ b/libs/hunspell/src/affentry.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -118,7 +118,7 @@ class PfxEntry : public AffEntry { const char* getKey() { return appnd.c_str(); } std::string add(const char* word, size_t len); - inline short getKeyLen() { return (short)appnd.size(); } + inline short getKeyLen() { return appnd.size(); } inline const char* getMorph() { return morphcode; } @@ -199,7 +199,7 @@ class SfxEntry : public AffEntry { inline short getContLen() { return contclasslen; } inline const char* getAffix() { return appnd.c_str(); } - inline short getKeyLen() { return (short)appnd.size(); } + inline short getKeyLen() { return appnd.size(); } inline SfxEntry* getNext() { return next; } inline SfxEntry* getNextNE() { return nextne; } diff --git a/libs/hunspell/src/affixmgr.c++ b/libs/hunspell/src/affixmgr.c++ index 90c7eaff33..adb750dba1 100644 --- a/libs/hunspell/src/affixmgr.c++ +++ b/libs/hunspell/src/affixmgr.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -72,6 +72,7 @@ #include <string.h> #include <stdio.h> #include <ctype.h> +#include <time.h> #include <algorithm> #include <limits> @@ -96,7 +97,6 @@ AffixMgr::AffixMgr(const char* affpath, complexprefixes = 0; parsedmaptable = false; parsedbreaktable = false; - parsedrep = false; iconvtable = NULL; oconvtable = NULL; // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) @@ -113,7 +113,7 @@ AffixMgr::AffixMgr(const char* affpath, compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word compoundmoresuffixes = 0; // allow more suffixes within compound words checkcompounddup = 0; // forbid double words in compounds - checkcompoundrep = 0; // forbid bad compounds (may be non compound word with + checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with // a REP substitution) checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds @@ -439,7 +439,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } } - /* parse in the flag used by forbidden words */ + /* parse in the flag used by forbidden words (is deprecated) */ if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) { if (!parse_flag(line, &lemma_present, afflst)) { finishFileMgr(afflst); @@ -463,7 +463,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } } - /* parse in the flag used by `needaffixs' */ + /* parse in the flag used by `needaffixs' (is deprecated) */ if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) { if (!parse_flag(line, &needaffix, afflst)) { finishFileMgr(afflst); @@ -529,14 +529,6 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } } - /* parse in the typical fault correcting table */ - if (line.compare(0, 3, "REP", 3) == 0) { - if (!parse_reptable(line, afflst)) { - finishFileMgr(afflst); - return 1; - } - } - /* parse in the input conversion table */ if (line.compare(0, 5, "ICONV", 5) == 0) { if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) { @@ -545,7 +537,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } } - /* parse in the input conversion table */ + /* parse in the output conversion table */ if (line.compare(0, 5, "OCONV", 5) == 0) { if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) { finishFileMgr(afflst); @@ -1023,7 +1015,7 @@ int AffixMgr::process_sfx_order() { // add flags to the result for dictionary debugging std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) { char* st = encode_flag(flag); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_FLAG); if (st) { result.append(st); @@ -1060,7 +1052,7 @@ int AffixMgr::encodeit(AffEntry& entry, const char* cs) { } else if (cs[MAXCONDLEN]) { //there is more conditions than fit in fixed space, so its //a long condition - entry.opts += aeLONGCOND; + entry.opts |= aeLONGCOND; entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); if (!entry.c.l.conds2) return 1; @@ -1146,7 +1138,7 @@ struct hentry* AffixMgr::prefix_check(const char* word, return NULL; } -// check word for prefixes +// check word for prefixes and two-level suffixes struct hentry* AffixMgr::prefix_check_twosfx(const char* word, int len, char in_compound, @@ -1187,7 +1179,7 @@ struct hentry* AffixMgr::prefix_check_twosfx(const char* word, return NULL; } -// check word for prefixes +// check word for prefixes and morph std::string AffixMgr::prefix_check_morph(const char* word, int len, char in_compound, @@ -1234,7 +1226,7 @@ std::string AffixMgr::prefix_check_morph(const char* word, return result; } -// check word for prefixes +// check word for prefixes and morph and two-level suffixes std::string AffixMgr::prefix_check_twosfx_morph(const char* word, int len, char in_compound, @@ -1275,25 +1267,44 @@ std::string AffixMgr::prefix_check_twosfx_morph(const char* word, return result; } -// Is word a non compound with a REP substitution (see checkcompoundrep)? +// Is word a non-compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char* word, int wl) { - if ((wl < 2) || reptable.empty()) + if ((wl < 2) || get_reptable().empty()) return 0; - for (size_t i = 0; i < reptable.size(); ++i) { - const char* r = word; - const size_t lenp = reptable[i].pattern.size(); - // search every occurence of the pattern in the word - while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) { - std::string candidate(word); - size_t type = r == word && langnum != LANG_hu ? 1 : 0; - if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu) - type += 2; - candidate.replace(r - word, lenp, reptable[i].outstrings[type]); + for (size_t i = 0; i < get_reptable().size(); ++i) { + // use only available mid patterns + if (!get_reptable()[i].outstrings[0].empty()) { + const char* r = word; + const size_t lenp = get_reptable()[i].pattern.size(); + // search every occurence of the pattern in the word + while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) { + std::string candidate(word); + candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]); + if (candidate_check(candidate.c_str(), candidate.size())) + return 1; + ++r; // search for the next letter + } + } + } + + return 0; +} + +// forbid compound words, if they are in the dictionary as a +// word pair separated by space +int AffixMgr::cpdwordpair_check(const char * word, int wl) { + if (wl > 2) { + std::string candidate(word); + for (size_t i = 1; i < candidate.size(); i++) { + // go to end of the UTF-8 character + if (utf8 && ((word[i] & 0xc0) == 0x80)) + continue; + candidate.insert(i, 1, ' '); if (candidate_check(candidate.c_str(), candidate.size())) return 1; - ++r; // search for the next letter + candidate.erase(i, 1); } } @@ -1584,6 +1595,21 @@ struct hentry* AffixMgr::compound_check(const std::string& word, int checked_prefix; + // add a time limit to handle possible + // combinatorical explosion of the overlapping words + + HUNSPELL_THREAD_LOCAL clock_t timelimit; + + if (wordnum == 0) { + // get the start time, seeing as we're reusing this set to 0 + // to flag timeout, use clock() + 1 to avoid start clock() + // of 0 as being a timeout + timelimit = clock() + 1; + } + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { + timelimit = 0; + } + setcminmax(&cmin, &cmax, word.c_str(), len); st.assign(word); @@ -1608,6 +1634,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word, do { // simplified checkcompoundpattern loop + if (timelimit == 0) + return 0; + if (scpd > 0) { for (; scpd <= checkcpdtable.size() && (checkcpdtable[scpd - 1].pattern3.empty() || @@ -1647,6 +1676,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word, affixed = 1; rv = lookup(st.c_str()); // perhaps without prefix + // forbid dictionary stems with COMPOUNDFORBIDFLAG in + // compound words, overriding the effect of COMPOUNDPERMITFLAG + if ((rv) && compoundforbidflag && + TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) + continue; + // search homonym with compound flag while ((rv) && !hu_mov_rule && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || @@ -1854,7 +1889,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word, } // check FORCEUCASE - if (rv && forceucase && + if (rv && forceucase && (rv) && (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; @@ -1909,9 +1944,10 @@ struct hentry* AffixMgr::compound_check(const std::string& word, && (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL || TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) { - // forbid compound word, if it is a non compound word with typical + // forbid compound word, if it is a non-compound word with typical // fault - if (checkcompoundrep && cpdrep_check(word.c_str(), len)) + if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) || + cpdwordpair_check(word.c_str(), len)) return NULL; return rv_first; } @@ -1962,7 +1998,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word, } // check FORCEUCASE - if (rv && forceucase && + if (rv && forceucase && (rv) && (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; @@ -1989,7 +2025,9 @@ struct hentry* AffixMgr::compound_check(const std::string& word, if (sfxappnd) { std::string tmp(sfxappnd); reverseword(tmp); - numsyllable -= get_syllable(tmp) + sfxextra; + numsyllable -= short(get_syllable(tmp) + sfxextra); + } else { + numsyllable -= short(sfxextra); } // + 1 word, if syllable number of the prefix > 1 (hungarian @@ -2024,7 +2062,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word, (TESTAFF(rv->astr, compoundroot, rv->alen))) { wordnum++; } - // second word is acceptable, as a word with prefix or/and suffix? // hungarian conventions: compounding is acceptable, // when compound forms consist 2 word, otherwise @@ -2033,9 +2070,10 @@ struct hentry* AffixMgr::compound_check(const std::string& word, (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) && ((!checkcompounddup || (rv != rv_first)))) { - // forbid compound word, if it is a non compound word with typical + // forbid compound word, if it is a non-compound word with typical // fault - if (checkcompoundrep && cpdrep_check(word.c_str(), len)) + if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) || + cpdwordpair_check(word.c_str(), len)) return NULL; return rv_first; } @@ -2059,8 +2097,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word, rv = NULL; } if (rv) { - // forbid compound word, if it is a non compound word with typical - // fault + // forbid compound word, if it is a non-compound word with typical + // fault, or a dictionary word pair + + if (cpdwordpair_check(word.c_str(), len)) + return NULL; + if (checkcompoundrep || forbiddenword) { if (checkcompoundrep && cpdrep_check(word.c_str(), len)) @@ -2071,7 +2113,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word, char r = st[i + rv->blen]; st[i + rv->blen] = '\0'; - if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) { + if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) || + cpdwordpair_check(st.c_str(), i + rv->blen)) { st[ + i + rv->blen] = r; continue; } @@ -2162,6 +2205,21 @@ int AffixMgr::compound_check_morph(const char* word, char affixed = 0; hentry** oldwords = words; + // add a time limit to handle possible + // combinatorical explosion of the overlapping words + + HUNSPELL_THREAD_LOCAL clock_t timelimit; + + if (wordnum == 0) { + // get the start time, seeing as we're reusing this set to 0 + // to flag timeout, use clock() + 1 to avoid start clock() + // of 0 as being a timeout + timelimit = clock() + 1; + } + else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) { + timelimit = 0; + } + setcminmax(&cmin, &cmax, word, len); st.assign(word); @@ -2180,6 +2238,9 @@ int AffixMgr::compound_check_morph(const char* word, do { // onlycpdrule loop + if (timelimit == 0) + return 0; + oldnumsyllable = numsyllable; oldwordnum = wordnum; checked_prefix = 0; @@ -2198,6 +2259,12 @@ int AffixMgr::compound_check_morph(const char* word, rv = lookup(st.c_str()); // perhaps without prefix + // forbid dictionary stems with COMPOUNDFORBIDFLAG in + // compound words, overriding the effect of COMPOUNDPERMITFLAG + if ((rv) && compoundforbidflag && + TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule) + continue; + // search homonym with compound flag while ((rv) && !hu_mov_rule && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || @@ -2215,6 +2282,9 @@ int AffixMgr::compound_check_morph(const char* word, rv = rv->next_homonym; } + if (timelimit == 0) + return 0; + if (rv) affixed = 0; @@ -2405,22 +2475,22 @@ int AffixMgr::compound_check_morph(const char* word, if (rv && words && words[wnum + 1]) { result.append(presult); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_PART); result.append(word + i); if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } - result.append("\n"); + result.push_back(MSEP_REC); return 0; } @@ -2462,7 +2532,7 @@ int AffixMgr::compound_check_morph(const char* word, ((!checkcompounddup || (rv != rv_first)))) { // bad compound word result.append(presult); - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_PART); result.append(word + i); @@ -2470,17 +2540,17 @@ int AffixMgr::compound_check_morph(const char* word, if (complexprefixes) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } // store the pointer of the hash entry if (!complexprefixes) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } } - result.append("\n"); + result.push_back(MSEP_REC); ok = 1; } @@ -2519,7 +2589,7 @@ int AffixMgr::compound_check_morph(const char* word, line_uniq_app(m, MSEP_REC); result.append(m); } - result.append("\n"); + result.push_back(MSEP_REC); ok = 1; } } @@ -2552,7 +2622,9 @@ int AffixMgr::compound_check_morph(const char* word, if (sfxappnd) { std::string tmp(sfxappnd); reverseword(tmp); - numsyllable -= get_syllable(tmp) + sfxextra; + numsyllable -= short(get_syllable(tmp) + sfxextra); + } else { + numsyllable -= short(sfxextra); } // + 1 word, if syllable number of the prefix > 1 (hungarian @@ -2605,8 +2677,9 @@ int AffixMgr::compound_check_morph(const char* word, if (!m.empty()) { result.push_back(MSEP_FLD); result.append(MORPH_PART); - result.append(word + 1); + result.append(word + i); line_uniq_app(m, MSEP_REC); + result.push_back(MSEP_FLD); result.append(m); } result.push_back(MSEP_REC); @@ -2769,7 +2842,6 @@ struct hentry* AffixMgr::suffix_check(const char* word, } // check word for two-level suffixes - struct hentry* AffixMgr::suffix_check_twosfx(const char* word, int len, int sfxopts, @@ -2814,6 +2886,7 @@ struct hentry* AffixMgr::suffix_check_twosfx(const char* word, return NULL; } +// check word for two-level suffixes and morph std::string AffixMgr::suffix_check_twosfx_morph(const char* word, int len, int sfxopts, @@ -2832,17 +2905,17 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } result.append(st); if (se->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(se->getMorph()); } else debugflag(result, se->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); } } se = se->getNext(); @@ -2867,12 +2940,12 @@ std::string AffixMgr::suffix_check_twosfx_morph(const char* word, result3.clear(); if (sptr->getMorph()) { - result3.append(" "); + result3.push_back(MSEP_FLD); result3.append(sptr->getMorph()); } else debugflag(result3, sptr->getFlag()); strlinecat(result2, result3); - result2.append("\n"); + result2.push_back(MSEP_REC); result.append(result2); } } @@ -2935,28 +3008,28 @@ std::string AffixMgr::suffix_check_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } if (se->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(se->getMorph()); } else debugflag(result, se->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -3002,29 +3075,29 @@ std::string AffixMgr::suffix_check_morph(const char* word, if (ppfx) { if (ppfx->getMorph()) { result.append(ppfx->getMorph()); - result.append(" "); + result.push_back(MSEP_FLD); } else debugflag(result, ppfx->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) result.append(HENTRY_DATA2(rv)); if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(HENTRY_WORD(rv)); } if (!complexprefixes && HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } if (sptr->getMorph()) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(sptr->getMorph()); } else debugflag(result, sptr->getFlag()); - result.append("\n"); + result.push_back(MSEP_REC); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -3213,7 +3286,7 @@ std::string AffixMgr::morphgen(const char* ts, // use input suffix fields, if exist if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { mymorph.assign(morph); - mymorph.append(" "); + mymorph.push_back(MSEP_FLD); stemmorphcatpos = mymorph.size(); } else { stemmorphcatpos = std::string::npos; @@ -3414,7 +3487,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst, // return replacing table const std::vector<replentry>& AffixMgr::get_reptable() const { - return reptable; + return pHMgr->get_reptable(); } // return iconv table @@ -3554,6 +3627,11 @@ FLAG AffixMgr::get_nongramsuggest() const { return nongramsuggest; } +// return the substandard root/affix control flag +FLAG AffixMgr::get_substandard() const { + return substandard; +} + // return the forbidden words flag modify flag FLAG AffixMgr::get_needaffix() const { return needaffix; @@ -3692,103 +3770,6 @@ bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) { return true; } -/* parse in the typical fault correcting table */ -bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) { - if (parsedrep) { - HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", - af->getlinenum()); - return false; - } - parsedrep = true; - int numrep = -1; - int i = 0; - int np = 0; - std::string::const_iterator iter = line.begin(); - std::string::const_iterator start_piece = mystrsep(line, iter); - while (start_piece != line.end()) { - switch (i) { - case 0: { - np++; - break; - } - case 1: { - numrep = atoi(std::string(start_piece, iter).c_str()); - if (numrep < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", - af->getlinenum()); - return false; - } - reptable.reserve(numrep); - np++; - break; - } - default: - break; - } - ++i; - start_piece = mystrsep(line, iter); - } - if (np != 2) { - HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", - af->getlinenum()); - return false; - } - - /* now parse the numrep lines to read in the remainder of the table */ - for (int j = 0; j < numrep; ++j) { - std::string nl; - if (!af->getline(nl)) - return false; - mychomp(nl); - reptable.push_back(replentry()); - iter = nl.begin(); - i = 0; - int type = 0; - start_piece = mystrsep(nl, iter); - while (start_piece != nl.end()) { - switch (i) { - case 0: { - if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - reptable.clear(); - return false; - } - break; - } - case 1: { - if (*start_piece == '^') - type = 1; - reptable.back().pattern.assign(start_piece + type, iter); - mystrrep(reptable.back().pattern, "_", " "); - if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { - type += 2; - reptable.back().pattern.resize(reptable.back().pattern.size() - 1); - } - break; - } - case 2: { - reptable.back().outstrings[type].assign(start_piece, iter); - mystrrep(reptable.back().outstrings[type], "_", " "); - break; - } - default: - break; - } - ++i; - start_piece = mystrsep(nl, iter); - } - if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - reptable.clear(); - return false; - } - } - return true; -} - -/* parse in the typical fault correcting table */ bool AffixMgr::parse_convtable(const std::string& line, FileMgr* af, RepList** rl, @@ -4386,7 +4367,7 @@ void AffixMgr::reverse_condition(std::string& piece) { case '^': { if (*(k - 1) == ']') neg = 1; - else + else if (neg) *(k - 1) = *k; break; } @@ -4519,11 +4500,11 @@ bool AffixMgr::parse_affix(const std::string& line, char opts = ff; if (utf8) - opts += aeUTF8; + opts |= aeUTF8; if (pHMgr->is_aliasf()) - opts += aeALIASF; + opts |= aeALIASF; if (pHMgr->is_aliasm()) - opts += aeALIASM; + opts |= aeALIASM; affentries.initialize(numents, opts, aflag); } @@ -4617,7 +4598,7 @@ bool AffixMgr::parse_affix(const std::string& line, entry->appnd = std::string(start_piece, dash); std::string dash_str(dash + 1, iter); - if (!ignorechars.empty()) { + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { if (utf8) { remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); } else { @@ -4653,7 +4634,7 @@ bool AffixMgr::parse_affix(const std::string& line, } else { entry->appnd = std::string(start_piece, iter); - if (!ignorechars.empty()) { + if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) { if (utf8) { remove_ignored_chars_utf(entry->appnd, ignorechars_utf16); } else { diff --git a/libs/hunspell/src/affixmgr.hxx b/libs/hunspell/src/affixmgr.hxx index d41e69cfd2..450f50a65c 100644 --- a/libs/hunspell/src/affixmgr.hxx +++ b/libs/hunspell/src/affixmgr.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -120,8 +120,6 @@ class AffixMgr { FLAG nongramsuggest; FLAG needaffix; int cpdmin; - bool parsedrep; - std::vector<replentry> reptable; RepList* iconvtable; RepList* oconvtable; bool parsedmaptable; @@ -251,6 +249,7 @@ class AffixMgr { short get_syllable(const std::string& word); int cpdrep_check(const char* word, int len); + int cpdwordpair_check(const char * word, int len); int cpdpat_check(const char* word, int len, hentry* r1, @@ -311,6 +310,7 @@ class AffixMgr { FLAG get_forbiddenword() const; FLAG get_nosuggest() const; FLAG get_nongramsuggest() const; + FLAG get_substandard() const; FLAG get_needaffix() const; FLAG get_onlyincompound() const; const char* get_derived() const; @@ -338,7 +338,6 @@ class AffixMgr { bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af); bool parse_num(const std::string& line, int* out, FileMgr* af); bool parse_cpdsyllable(const std::string& line, FileMgr* af); - bool parse_reptable(const std::string& line, FileMgr* af); bool parse_convtable(const std::string& line, FileMgr* af, RepList** rl, diff --git a/libs/hunspell/src/atypes.hxx b/libs/hunspell/src/atypes.hxx index f841523189..1b78d4724b 100644 --- a/libs/hunspell/src/atypes.hxx +++ b/libs/hunspell/src/atypes.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -95,6 +95,16 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {} #define TESTAFF(a, b, c) (std::binary_search(a, a + c, b)) +// timelimit: max. ~1/4 sec (process time on Linux) for +// for a suggestion, including max. ~/10 sec for a case +// sensitive plain or compound word suggestion, within +// ~1/20 sec long time consuming suggestion functions +#define TIMELIMIT_GLOBAL (CLOCKS_PER_SEC / 4) +#define TIMELIMIT_SUGGESTION (CLOCKS_PER_SEC / 10) +#define TIMELIMIT (CLOCKS_PER_SEC / 20) +#define MINTIMER 100 +#define MAXPLUSTIMER 100 + struct guessword { char* word; bool allow; diff --git a/libs/hunspell/src/baseaffix.hxx b/libs/hunspell/src/baseaffix.hxx index 9191dba475..52cd60e028 100644 --- a/libs/hunspell/src/baseaffix.hxx +++ b/libs/hunspell/src/baseaffix.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.c++ index 59a9d28353..fbaa768b40 100644 --- a/libs/hunspell/src/csutil.c++ +++ b/libs/hunspell/src/csutil.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -69,6 +69,7 @@ */ #include <algorithm> +#include <assert.h> #include <stdlib.h> #include <string.h> #include <stdio.h> @@ -79,13 +80,6 @@ #include "atypes.hxx" #include "langnum.hxx" -// Unicode character encoding information -struct unicode_info { - unsigned short c; - unsigned short cupper; - unsigned short clower; -}; - #ifdef _WIN32 #include <windows.h> #include <wchar.h> @@ -102,12 +96,10 @@ struct unicode_info { #ifdef MOZILLA_CLIENT #include "nsCOMPtr.h" -#include "nsIUnicodeEncoder.h" -#include "nsIUnicodeDecoder.h" #include "nsUnicharUtils.h" -#include "mozilla/dom/EncodingUtils.h" +#include "mozilla/Encoding.h" -using mozilla::dom::EncodingUtils; +using namespace mozilla; #endif struct unicode_info2 { @@ -495,20 +487,17 @@ void uniqlist(std::vector<std::string>& list) { namespace { unsigned char cupper(const struct cs_info* csconv, int nIndex) { - if (nIndex < 0 || nIndex > 255) - return nIndex; + assert(nIndex >= 0 && nIndex <= 255); return csconv[nIndex].cupper; } unsigned char clower(const struct cs_info* csconv, int nIndex) { - if (nIndex < 0 || nIndex > 255) - return nIndex; + assert(nIndex >= 0 && nIndex <= 255); return csconv[nIndex].clower; } unsigned char ccase(const struct cs_info* csconv, int nIndex) { - if (nIndex < 0 || nIndex > 255) - return nIndex; + assert(nIndex >= 0 && nIndex <= 255); return csconv[nIndex].ccase; } } @@ -2306,20 +2295,12 @@ struct cs_info* get_current_cs(const std::string& es) { ccs[i].cupper = i; } - nsCOMPtr<nsIUnicodeEncoder> encoder; - nsCOMPtr<nsIUnicodeDecoder> decoder; - - nsresult rv; - - nsAutoCString label(es.c_str()); - nsAutoCString encoding; - if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) { + auto encoding = Encoding::ForLabelNoReplacement(es); + if (!encoding) { return ccs; } - encoder = EncodingUtils::EncoderForEncoding(encoding); - decoder = EncodingUtils::DecoderForEncoding(encoding); - encoder->SetOutputErrorBehavior(encoder->kOnError_Signal, nullptr, '?'); - decoder->SetInputErrorBehavior(decoder->kOnError_Signal); + auto encoder = encoding->NewEncoder(); + auto decoder = encoding->NewDecoderWithoutBOMHandling(); for (unsigned int i = 0; i <= 0xff; ++i) { bool success = false; @@ -2327,36 +2308,50 @@ struct cs_info* get_current_cs(const std::string& es) { // in this 1-byte character encoding. Call our encoding/decoding // APIs separately for each byte since they may reject some of the // bytes, and we want to handle errors separately for each byte. - char lower, upper; + uint8_t lower, upper; do { if (i == 0) break; - const char source = char(i); - char16_t uni, uniCased; - int32_t charLength = 1, uniLength = 1; - - rv = decoder->Convert(&source, &charLength, &uni, &uniLength); - // Explicitly check NS_OK because we don't want to allow - // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. - if (rv != NS_OK || charLength != 1 || uniLength != 1) + uint8_t source = uint8_t(i); + char16_t uni[2]; + char16_t uniCased; + uint8_t destination[4]; + auto src1 = MakeSpan(&source, 1); + auto dst1 = MakeSpan(uni); + auto src2 = MakeSpan(&uniCased, 1); + auto dst2 = MakeSpan(destination); + + uint32_t result; + size_t read; + size_t written; + Tie(result, read, written) = + decoder->DecodeToUTF16WithoutReplacement(src1, dst1, true); + if (result != kInputEmpty || read != 1 || written != 1) { break; - uniCased = ToLowerCase(uni); - rv = encoder->Convert(&uniCased, &uniLength, &lower, &charLength); - // Explicitly check NS_OK because we don't want to allow - // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. - if (rv != NS_OK || charLength != 1 || uniLength != 1) + } + + uniCased = ToLowerCase(uni[0]); + Tie(result, read, written) = + encoder->EncodeFromUTF16WithoutReplacement(src2, dst2, true); + if (result != kInputEmpty || read != 1 || written != 1) { break; + } + lower = destination[0]; - uniCased = ToUpperCase(uni); - rv = encoder->Convert(&uniCased, &uniLength, &upper, &charLength); - // Explicitly check NS_OK because we don't want to allow - // NS_OK_UDEC_MOREOUTPUT or NS_OK_UDEC_MOREINPUT. - if (rv != NS_OK || charLength != 1 || uniLength != 1) + uniCased = ToUpperCase(uni[0]); + Tie(result, read, written) = + encoder->EncodeFromUTF16WithoutReplacement(src2, dst2, true); + if (result != kInputEmpty || read != 1 || written != 1) { break; + } + upper = destination[0]; success = true; } while (0); + encoding->NewEncoderInto(*encoder); + encoding->NewDecoderWithoutBOMHandlingInto(*decoder); + if (success) { ccs[i].cupper = upper; ccs[i].clower = lower; @@ -2401,6 +2396,7 @@ static struct lang_map lang2enc[] = {{"ar", LANG_ar}, {"az", LANG_az}, {"az_AZ", LANG_az}, // for back-compatibility {"bg", LANG_bg}, {"ca", LANG_ca}, + {"crh", LANG_crh}, {"cs", LANG_cs}, {"da", LANG_da}, {"de", LANG_de}, {"el", LANG_el}, {"en", LANG_en}, {"es", LANG_es}, @@ -2458,7 +2454,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) { // In Azeri and Turkish, I and i dictinct letters: // There are a dotless lower case i pair of upper `I', // and an upper I with dot pair of lower `i'. - if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr))) + if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh))) return 0x0130; #ifdef OPENOFFICEORG return static_cast<unsigned short>(u_toupper(c)); @@ -2475,7 +2471,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) { // In Azeri and Turkish, I and i dictinct letters: // There are a dotless lower case i pair of upper `I', // and an upper I with dot pair of lower `i'. - if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr))) + if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh))) return 0x0131; #ifdef OPENOFFICEORG return static_cast<unsigned short>(u_tolower(c)); diff --git a/libs/hunspell/src/csutil.hxx b/libs/hunspell/src/csutil.hxx index 5d83f80970..c6f03d8f76 100644 --- a/libs/hunspell/src/csutil.hxx +++ b/libs/hunspell/src/csutil.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -269,10 +269,23 @@ LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source); // conversion function for protected memory LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s); + +// to avoid unnecessary string copies and Unicode conversions +// we simply check the ignored_chars characters in the word +// (in the case of UTF-8 encoded strings, "false" means +// "likely false", if ignored_chars characters are not ASCII) +inline bool has_no_ignored_chars(const std::string& word, + const std::string& ignored_chars) { + for (std::string::const_iterator it = ignored_chars.begin(), end = ignored_chars.end(); it != end; ++it) + if (word.find(*it) != std::string::npos) + return false; + return true; +} + // hash entry macros -LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) { +inline char* HENTRY_DATA(struct hentry* h) { char* ret; - if (!h->var) + if (!(h->var & H_OPT)) ret = NULL; else if (h->var & H_OPT_ALIASM) ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); @@ -281,10 +294,10 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) { return ret; } -LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA( +inline const char* HENTRY_DATA( const struct hentry* h) { const char* ret; - if (!h->var) + if (!(h->var & H_OPT)) ret = NULL; else if (h->var & H_OPT_ALIASM) ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); @@ -294,10 +307,10 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA( } // NULL-free version for warning-free OOo build -LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2( +inline const char* HENTRY_DATA2( const struct hentry* h) { const char* ret; - if (!h->var) + if (!(h->var & H_OPT)) ret = ""; else if (h->var & H_OPT_ALIASM) ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); @@ -306,7 +319,7 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2( return ret; } -LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h, +inline char* HENTRY_FIND(struct hentry* h, const char* p) { return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL); } diff --git a/libs/hunspell/src/filemgr.c++ b/libs/hunspell/src/filemgr.c++ index 4a14de8762..4a754e52a8 100644 --- a/libs/hunspell/src/filemgr.c++ +++ b/libs/hunspell/src/filemgr.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -83,6 +83,8 @@ int FileMgr::fail(const char* err, const char* par) { FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) { in[0] = '\0'; + if (!file || !strlen(file)) + return; myopen(fin, file, std::ios_base::in); if (!fin.is_open()) { // check hzipped file @@ -103,7 +105,7 @@ bool FileMgr::getline(std::string& dest) { ++linenum; if (fin.is_open()) { ret = static_cast<bool>(std::getline(fin, dest)); - } else if (hin->is_open()) { + } else if (hin && hin->is_open()) { ret = hin->getline(dest); } if (!ret) { diff --git a/libs/hunspell/src/filemgr.hxx b/libs/hunspell/src/filemgr.hxx index 62433aeefe..88fe88388a 100644 --- a/libs/hunspell/src/filemgr.hxx +++ b/libs/hunspell/src/filemgr.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.c++ index 23421b567a..3ec263de1d 100644 --- a/libs/hunspell/src/hashmgr.c++ +++ b/libs/hunspell/src/hashmgr.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -78,6 +78,7 @@ #include "hashmgr.hxx" #include "csutil.hxx" #include "atypes.hxx" +#include "langnum.hxx" // build a hash table from a munched word list @@ -182,13 +183,14 @@ int HashMgr::add_word(const std::string& in_word, unsigned short* aff, int al, const std::string* in_desc, - bool onlyupcase) { + bool onlyupcase, + int captype) { const std::string* word = &in_word; const std::string* desc = in_desc; std::string *word_copy = NULL; std::string *desc_copy = NULL; - if (!ignorechars.empty() || complexprefixes) { + if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) { word_copy = new std::string(in_word); if (!ignorechars.empty()) { @@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word, hp->astr = aff; hp->next = NULL; hp->next_homonym = NULL; + hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0; // store the description string or its pointer if (desc) { - hp->var = H_OPT; + hp->var |= H_OPT; if (aliasm) { - hp->var += H_OPT_ALIASM; + hp->var |= H_OPT_ALIASM; store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str()))); } else { strcpy(hpw + word->size() + 1, desc->c_str()); } - if (strstr(HENTRY_DATA(hp), MORPH_PHON)) - hp->var += H_OPT_PHON; - } else - hp->var = 0; + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) { + hp->var |= H_OPT_PHON; + // store ph: fields (pronounciation, misspellings, old orthography etc.) + // of a morphological description in reptable to use in REP replacements. + if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO)) + reptable.reserve(tablesize/MORPH_PHON_RATIO); + std::string fields = HENTRY_DATA(hp); + std::string::const_iterator iter = fields.begin(); + std::string::const_iterator start_piece = mystrsep(fields, iter); + while (start_piece != fields.end()) { + if (std::string(start_piece, iter).find(MORPH_PHON) == 0) { + std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1); + if (ph.size() > 0) { + std::vector<w_char> w; + size_t strippatt; + std::string wordpart; + // dictionary based REP replacement, separated by "->" + // for example "pretty ph:prity ph:priti->pretti" to handle + // both prity -> pretty and pritier -> prettiest suggestions. + if (((strippatt = ph.find("->")) != std::string::npos) && + (strippatt > 0) && (strippatt < ph.size() - 2)) { + wordpart = ph.substr(strippatt + 2); + ph.erase(ph.begin() + strippatt, ph.end()); + } else + wordpart = in_word; + // when the ph: field ends with the character *, + // strip last character of the pattern and the replacement + // to match in REP suggestions also at character changes, + // for example, "pretty ph:prity*" results "prit->prett" + // REP replacement instead of "prity->pretty", to get + // prity->pretty and pritiest->prettiest suggestions. + if (ph.at(ph.size()-1) == '*') { + strippatt = 1; + size_t stripword = 0; + if (utf8) { + while ((strippatt < ph.size()) && + ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80)) + ++strippatt; + while ((stripword < wordpart.size()) && + ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80)) + ++stripword; + } + ++strippatt; + ++stripword; + if ((ph.size() > strippatt) && (wordpart.size() > stripword)) { + ph.erase(ph.size()-strippatt, strippatt); + wordpart.erase(in_word.size()-stripword, stripword); + } + } + // capitalize lowercase pattern for capitalized words to support + // good suggestions also for capitalized misspellings, eg. + // Wednesday ph:wendsay + // results wendsay -> Wednesday and Wendsay -> Wednesday, too. + if (captype==INITCAP) { + std::string ph_capitalized; + if (utf8) { + u8_u16(w, ph); + if (get_captype_utf8(w, langnum) == NOCAP) { + mkinitcap_utf(w, langnum); + u16_u8(ph_capitalized, w); + } + } else if (get_captype(ph, csconv) == NOCAP) + mkinitcap(ph_capitalized, csconv); + + if (ph_capitalized.size() > 0) { + // add also lowercase word in the case of German or + // Hungarian to support lowercase suggestions lowercased by + // compound word generation or derivational suffixes + // (for example by adjectival suffix "-i" of geographical + // names in Hungarian: + // Massachusetts ph:messzecsuzec + // messzecsuzeci -> massachusettsi (adjective) + // For lowercasing by conditional PFX rules, see + // tests/germancompounding test example or the + // Hungarian dictionary.) + if (langnum == LANG_de || langnum == LANG_hu) { + std::string wordpart_lower(wordpart); + if (utf8) { + u8_u16(w, wordpart_lower); + mkallsmall_utf(w, langnum); + u16_u8(wordpart_lower, w); + } else { + mkallsmall(wordpart_lower, csconv); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart_lower); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph_capitalized); + reptable.back().outstrings[0].assign(wordpart); + } + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart); + } + } + start_piece = mystrsep(fields, iter); + } + } + } struct hentry* dp = tableptr[i]; if (!dp) { @@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word, mkallsmall_utf(w, langnum); mkinitcap_utf(w, langnum); u16_u8(st, w); - return add_word(st, wcl, flags2, flagslen + 1, dp, true); + return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP); } else { std::string new_word(word); mkallsmall(new_word, csconv); mkinitcap(new_word, csconv); - int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true); + int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP); return ret; } } @@ -405,24 +506,8 @@ int HashMgr::remove_forbidden_flag(const std::string& word) { if (!dp) return 1; while (dp) { - if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { - if (dp->alen == 1) - dp->alen = 0; // XXX forbidden words of personal dic. - else { - unsigned short* flags2 = - (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1)); - if (!flags2) - return 1; - int i, j = 0; - for (i = 0; i < dp->alen; i++) { - if (dp->astr[i] != forbiddenword) - flags2[j++] = dp->astr[i]; - } - dp->alen--; - free(dp->astr); - dp->astr = flags2; // XXX allowed forbidden words - } - } + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) + dp->alen = 0; // XXX forbidden words of personal dic. dp = dp->next_homonym; } return 0; @@ -435,7 +520,7 @@ int HashMgr::add(const std::string& word) { int al = 0; unsigned short* flags = NULL; int wcl = get_clen_and_captype(word, &captype); - add_word(word, wcl, flags, al, NULL, false); + add_word(word, wcl, flags, al, NULL, false, captype); return add_hidden_capitalized_word(word, wcl, flags, al, NULL, captype); } @@ -450,14 +535,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example) int captype; int wcl = get_clen_and_captype(word, &captype); if (aliasf) { - add_word(word, wcl, dp->astr, dp->alen, NULL, false); + add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype); } else { unsigned short* flags = (unsigned short*)malloc(dp->alen * sizeof(unsigned short)); if (flags) { memcpy((void*)flags, (void*)dp->astr, dp->alen * sizeof(unsigned short)); - add_word(word, wcl, flags, dp->alen, NULL, false); + add_word(word, wcl, flags, dp->alen, NULL, false, captype); } else return 1; } @@ -605,7 +690,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) { int wcl = get_clen_and_captype(ts, &captype, workbuf); const std::string *dp_str = dp.empty() ? NULL : &dp; // add the word and its index plus its capitalized form optionally - if (add_word(ts, wcl, flags, al, dp_str, false) || + if (add_word(ts, wcl, flags, al, dp_str, false, captype) || add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) { delete dict; return 5; @@ -697,7 +782,7 @@ int HashMgr::decode_flags(unsigned short** result, const std::string& flags, Fil *result = (unsigned short*)malloc(len * sizeof(unsigned short)); if (!*result) return -1; - memcpy(*result, &w[0], len * sizeof(short)); + memcpy(*result, w.data(), len * sizeof(short)); break; } default: { // Ispell's one-character flags (erfg -> e r f g) @@ -768,7 +853,7 @@ bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::strin size_t len = w.size(); size_t origsize = result.size(); result.resize(origsize + len); - memcpy(&result[origsize], &w[0], len * sizeof(short)); + memcpy(result.data() + origsize, w.data(), len * sizeof(short)); break; } default: { // Ispell's one-character flags (erfg -> e r f g) @@ -799,7 +884,7 @@ unsigned short HashMgr::decode_flag(const char* f) const { std::vector<w_char> w; u8_u16(w, f); if (!w.empty()) - memcpy(&s, &w[0], 1 * sizeof(short)); + memcpy(&s, w.data(), 1 * sizeof(short)); break; } default: @@ -940,8 +1025,19 @@ int HashMgr::load_config(const char* affpath, const char* key) { if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) complexprefixes = 1; + /* parse in the typical fault correcting table */ + if (line.compare(0, 3, "REP", 3) == 0) { + if (!parse_reptable(line, afflst)) { + delete afflst; + return 1; + } + } + + // don't check the full affix file, yet if (((line.compare(0, 3, "SFX", 3) == 0) || - (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3])) + (line.compare(0, 3, "PFX", 3) == 0)) && + line.size() > 3 && isspace(line[3]) && + !reptable.empty()) // (REP table is in the end of Afrikaans aff file) break; } @@ -1015,43 +1111,41 @@ bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { /* now parse the numaliasf lines to read in the remainder of the table */ for (int j = 0; j < numaliasf; j++) { std::string nl; - if (!af->getline(nl)) - return false; - mychomp(nl); - i = 0; aliasf[j] = NULL; aliasflen[j] = 0; - iter = nl.begin(); - start_piece = mystrsep(nl, iter); - while (start_piece != nl.end()) { - switch (i) { - case 0: { - if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { - numaliasf = 0; - free(aliasf); - free(aliasflen); - aliasf = NULL; - aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - return false; + i = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { + errored = true; + break; + } + break; } - break; - } - case 1: { - std::string piece(start_piece, iter); - aliasflen[j] = - (unsigned short)decode_flags(&(aliasf[j]), piece, af); - std::sort(aliasf[j], aliasf[j] + aliasflen[j]); - break; + case 1: { + std::string piece(start_piece, iter); + aliasflen[j] = + (unsigned short)decode_flags(&(aliasf[j]), piece, af); + std::sort(aliasf[j], aliasf[j] + aliasflen[j]); + break; + } + default: + break; } - default: - break; + ++i; + start_piece = mystrsep(nl, iter); } - ++i; - start_piece = mystrsep(nl, iter); } if (!aliasf[j]) { + for (int k = 0; k < j; ++k) { + free(aliasf[k]); + } free(aliasf); free(aliasflen); aliasf = NULL; @@ -1130,47 +1224,47 @@ bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) { /* now parse the numaliasm lines to read in the remainder of the table */ for (int j = 0; j < numaliasm; j++) { std::string nl; - if (!af->getline(nl)) - return false; - mychomp(nl); aliasm[j] = NULL; - iter = nl.begin(); - i = 0; - start_piece = mystrsep(nl, iter); - while (start_piece != nl.end()) { - switch (i) { - case 0: { - if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numaliasm = 0; - free(aliasm); - aliasm = NULL; - return false; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { + errored = true; + break; + } + break; } - break; - } - case 1: { - // add the remaining of the line - std::string::const_iterator end = nl.end(); - std::string chunk(start_piece, end); - if (complexprefixes) { - if (utf8) - reverseword_utf(chunk); - else - reverseword(chunk); + case 1: { + // add the remaining of the line + std::string::const_iterator end = nl.end(); + std::string chunk(start_piece, end); + if (complexprefixes) { + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); + } + aliasm[j] = mystrdup(chunk.c_str()); + break; } - aliasm[j] = mystrdup(chunk.c_str()); - break; + default: + break; } - default: - break; + ++i; + start_piece = mystrsep(nl, iter); } - ++i; - start_piece = mystrsep(nl, iter); } if (!aliasm[j]) { numaliasm = 0; + for (int k = 0; k < j; ++k) { + free(aliasm[k]); + } free(aliasm); aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", @@ -1191,3 +1285,102 @@ char* HashMgr::get_aliasm(int index) const { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } + +/* parse in the typical fault correcting table */ +bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) { + if (!reptable.empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return false; + } + int numrep = -1; + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrep = atoi(std::string(start_piece, iter).c_str()); + if (numrep < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return false; + } + reptable.reserve(numrep); + np++; + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(line, iter); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return false; + } + + /* now parse the numrep lines to read in the remainder of the table */ + for (int j = 0; j < numrep; ++j) { + std::string nl; + reptable.push_back(replentry()); + int type = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { + errored = true; + break; + } + break; + } + case 1: { + if (*start_piece == '^') + type = 1; + reptable.back().pattern.assign(start_piece + type, iter); + mystrrep(reptable.back().pattern, "_", " "); + if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { + type += 2; + reptable.back().pattern.resize(reptable.back().pattern.size() - 1); + } + break; + } + case 2: { + reptable.back().outstrings[type].assign(start_piece, iter); + mystrrep(reptable.back().outstrings[type], "_", " "); + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(nl, iter); + } + } + if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + reptable.clear(); + return false; + } + } + return true; +} + +// return replacing table +const std::vector<replentry>& HashMgr::get_reptable() const { + return reptable; +} diff --git a/libs/hunspell/src/hashmgr.hxx b/libs/hunspell/src/hashmgr.hxx index da485d7afa..98b09e2569 100644 --- a/libs/hunspell/src/hashmgr.hxx +++ b/libs/hunspell/src/hashmgr.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -81,6 +81,12 @@ enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; +// morphological description of a dictionary item can contain +// arbitrary number "ph:" (MORPH_PHON) fields to store typical +// phonetic or other misspellings of that word. +// ratio of lines/lines with "ph:" in the dic file: 1/MORPH_PHON_RATIO +#define MORPH_PHON_RATIO 500 + class HashMgr { int tablesize; struct hentry** tableptr; @@ -99,6 +105,10 @@ class HashMgr { unsigned short* aliasflen; int numaliasm; // morphological desciption `compression' with aliases char** aliasm; + // reptable created from REP table of aff file and from "ph:" fields + // of the dic file. It contains phonetic and other common misspellings + // (letters, letter groups and words) for better suggestions + std::vector<replentry> reptable; public: HashMgr(const char* tpath, const char* apath, const char* key = NULL); @@ -119,6 +129,7 @@ class HashMgr { int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const; int is_aliasm() const; char* get_aliasm(int index) const; + const std::vector<replentry>& get_reptable() const; private: int get_clen_and_captype(const std::string& word, int* captype); @@ -129,7 +140,8 @@ class HashMgr { unsigned short* ap, int al, const std::string* desc, - bool onlyupcase); + bool onlyupcase, + int captype); int load_config(const char* affpath, const char* key); bool parse_aliasf(const std::string& line, FileMgr* af); int add_hidden_capitalized_word(const std::string& word, @@ -139,6 +151,7 @@ class HashMgr { const std::string* dp, int captype); bool parse_aliasm(const std::string& line, FileMgr* af); + bool parse_reptable(const std::string& line, FileMgr* af); int remove_forbidden_flag(const std::string& word); }; diff --git a/libs/hunspell/src/htypes.hxx b/libs/hunspell/src/htypes.hxx index 8f66a0080e..44366b1d68 100644 --- a/libs/hunspell/src/htypes.hxx +++ b/libs/hunspell/src/htypes.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -44,9 +44,10 @@ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q)) - 1)); // hentry options -#define H_OPT (1 << 0) -#define H_OPT_ALIASM (1 << 1) -#define H_OPT_PHON (1 << 2) +#define H_OPT (1 << 0) // is there optional morphological data? +#define H_OPT_ALIASM (1 << 1) // using alias compression? +#define H_OPT_PHON (1 << 2) // is there ph: field in the morphological data? +#define H_OPT_INITCAP (1 << 3) // is dictionary word capitalized? // see also csutil.hxx #define HENTRY_WORD(h) &(h->word[0]) @@ -54,6 +55,12 @@ // approx. number of user defined words #define USERWORD 1000 +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +# define HUNSPELL_THREAD_LOCAL thread_local +#else +# define HUNSPELL_THREAD_LOCAL static +#endif + struct hentry { unsigned char blen; // word length in bytes unsigned char clen; // word length in characters (different for UTF-8 enc.) @@ -61,7 +68,7 @@ struct hentry { unsigned short* astr; // affix flag vector struct hentry* next; // next word with same hash code struct hentry* next_homonym; // next homonym word (with same hash code) - char var; // variable fields (only for special pronounciation yet) + char var; // bit vector of H_OPT hentry options char word[1]; // variable-length word (8-bit or UTF-8 encoding) }; diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++ index b1535013fe..4afafdadc1 100644 --- a/libs/hunspell/src/hunspell.c++ +++ b/libs/hunspell/src/hunspell.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -71,6 +71,7 @@ #include <stdlib.h> #include <string.h> #include <stdio.h> +#include <time.h> #include "affixmgr.hxx" #include "hunspell.hxx" @@ -86,30 +87,41 @@ class HunspellImpl { public: - HunspellImpl(const char* affpath, const char* dpath, const char* key); + HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL); ~HunspellImpl(); - int add_dic(const char* dpath, const char* key); + int add_dic(const char* dpath, const char* key = NULL); std::vector<std::string> suffix_suggest(const std::string& root_word); std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl); std::vector<std::string> generate(const std::string& word, const std::string& pattern); std::vector<std::string> stem(const std::string& word); std::vector<std::string> stem(const std::vector<std::string>& morph); std::vector<std::string> analyze(const std::string& word); + int get_langnum() const; bool input_conv(const std::string& word, std::string& dest); bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); std::vector<std::string> suggest(const std::string& word); - const std::string& get_wordchars() const; + const std::string& get_wordchars_cpp() const; const std::vector<w_char>& get_wordchars_utf16() const; const std::string& get_dict_encoding() const; int add(const std::string& word); int add_with_affix(const std::string& word, const std::string& example); int remove(const std::string& word); + const std::string& get_version_cpp() const; struct cs_info* get_csconv(); - std::vector<char> dic_encoding_vec; - int get_langnum() const { return langnum; } - const char* get_try_string() const { return pAMgr->get_try_string(); } - const std::string& get_version() const { return pAMgr->get_version(); } + int spell(const char* word, int* info = NULL, char** root = NULL); + int suggest(char*** slst, const char* word); + int suffix_suggest(char*** slst, const char* root_word); + void free_list(char*** slst, int n); + char* get_dic_encoding(); + int analyze(char*** slst, const char* word); + int stem(char*** slst, const char* word); + int stem(char*** slst, char** morph, int n); + int generate(char*** slst, const char* word, const char* word2); + int generate(char*** slst, const char* word, char** desc, int n); + const char* get_wordchars() const; + const char* get_version() const; + int input_conv(const char* word, char* dest, size_t destsize); private: AffixMgr* pAMgr; @@ -124,12 +136,17 @@ private: std::vector<std::string> wordbreak; private: + std::vector<std::string> analyze_internal(const std::string& word); + bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL); + std::vector<std::string> suggest_internal(const std::string& word, + bool& capitalized, size_t& abbreviated, int& captype); void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev); size_t cleanword2(std::string& dest, std::vector<w_char>& dest_u, const std::string& src, int* pcaptype, size_t* pabbrev); + void clean_ignore(std::string& dest, const std::string& src); void mkinitcap(std::string& u8); int mkinitcap2(std::string& u8, std::vector<w_char>& u16); int mkinitsmall2(std::string& u8, std::vector<w_char>& u16); @@ -143,19 +160,15 @@ private: void insert_sug(std::vector<std::string>& slst, const std::string& word); void cat_result(std::string& result, const std::string& st); std::vector<std::string> spellml(const std::string& word); - std::string get_xml_par(const char* par); - const char* get_xml_pos(const char* s, const char* attr); - std::vector<std::string> get_xml_list(const char* list, const char* tag); - int check_xml_par(const char* q, const char* attr, const char* value); + std::string get_xml_par(const std::string& par, std::string::size_type pos); + std::string::size_type get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr); + std::vector<std::string> get_xml_list(const std::string& list, std::string::size_type pos, const char* tag); + int check_xml_par(const std::string& q, std::string::size_type pos, const char* attr, const char* value); private: HunspellImpl(const HunspellImpl&); HunspellImpl& operator=(const HunspellImpl&); }; -Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) - : m_Impl(new HunspellImpl(affpath, dpath, key)) { -} - HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) { csconv = NULL; utf8 = 0; @@ -180,19 +193,12 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k complexprefixes = pAMgr->get_complexprefixes(); wordbreak = pAMgr->get_breaktable(); - dic_encoding_vec.resize(encoding.size()+1); - strcpy(&dic_encoding_vec[0], encoding.c_str()); - /* and finally set up the suggestion manager */ pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); if (try_string) free(try_string); } -Hunspell::~Hunspell() { - delete m_Impl; -} - HunspellImpl::~HunspellImpl() { delete pSMgr; delete pAMgr; @@ -210,11 +216,6 @@ HunspellImpl::~HunspellImpl() { } // load extra dictionaries -int Hunspell::add_dic(const char* dpath, const char* key) { - return m_Impl->add_dic(dpath, key); -} - -// load extra dictionaries int HunspellImpl::add_dic(const char* dpath, const char* key) { if (!affixpath) return 1; @@ -222,6 +223,26 @@ int HunspellImpl::add_dic(const char* dpath, const char* key) { return 0; } + +// make a copy of src at dest while removing all characters +// specified in IGNORE rule +void HunspellImpl::clean_ignore(std::string& dest, + const std::string& src) { + dest.clear(); + dest.assign(src); + const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; + if (ignoredchars != NULL) { + if (utf8) { + const std::vector<w_char>& ignoredchars_utf16 = + pAMgr->get_ignore_utf16(); + remove_ignored_chars_utf(dest, ignoredchars_utf16); + } else { + remove_ignored_chars(dest, ignoredchars); + } + } +} + + // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording // their presence with the abbreviation flag @@ -237,7 +258,11 @@ size_t HunspellImpl::cleanword2(std::string& dest, dest.clear(); dest_utf.clear(); - const char* q = src.c_str(); + // remove IGNORE characters from the string + std::string w2; + clean_ignore(w2, src); + + const char* q = w2.c_str(); // first skip over any leading blanks while (*q == ' ') @@ -409,11 +434,22 @@ void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& slst.insert(slst.begin(), word); } -bool Hunspell::spell(const std::string& word, int* info, std::string* root) { - return m_Impl->spell(word, info, root); +bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { + bool r = spell_internal(word, info, root); + if (r && root) { + // output conversion + RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + if (rl) { + std::string wspace; + if (rl->conv(*root, wspace)) { + *root = wspace; + } + } + } + return r; } -bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { +bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) { struct hentry* rv = NULL; int info2 = 0; @@ -485,7 +521,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) case HUHCAP: /* FALLTHROUGH */ case HUHINITCAP: - *info += SPELL_ORIGCAP; + *info |= SPELL_ORIGCAP; /* FALLTHROUGH */ case NOCAP: rv = checkword(scw, info, root); @@ -496,7 +532,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) } break; case ALLCAP: { - *info += SPELL_ORIGCAP; + *info |= SPELL_ORIGCAP; rv = checkword(scw, info, root); if (rv) break; @@ -563,17 +599,22 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) break; } } + /* FALLTHROUGH */ case INITCAP: { - - *info += SPELL_ORIGCAP; - mkallsmall2(scw, sunicw); - std::string u8buffer(scw); - mkinitcap2(scw, sunicw); + // handle special capitalization of dotted I + bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0); + *info |= SPELL_ORIGCAP; + if (captype == ALLCAP) { + mkallsmall2(scw, sunicw); + mkinitcap2(scw, sunicw); + if (Idot) + scw.replace(0, 1, "\xc4\xb0"); + } if (captype == INITCAP) - *info += SPELL_INITCAP; + *info |= SPELL_INITCAP; rv = checkword(scw, info, root); if (captype == INITCAP) - *info -= SPELL_INITCAP; + *info &= ~SPELL_INITCAP; // forbid bad capitalization // (for example, ijs -> Ijs instead of IJs in Dutch) // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) @@ -583,9 +624,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) } if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; - if (rv) + if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh)) break; + mkallsmall2(scw, sunicw); + std::string u8buffer(scw); + mkinitcap2(scw, sunicw); + rv = checkword(u8buffer, info, root); if (abbv && !rv) { u8buffer.push_back('.'); @@ -594,10 +639,10 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) u8buffer = scw; u8buffer.push_back('.'); if (captype == INITCAP) - *info += SPELL_INITCAP; + *info |= SPELL_INITCAP; rv = checkword(u8buffer, info, root); if (captype == INITCAP) - *info -= SPELL_INITCAP; + *info &= ~SPELL_INITCAP; if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; break; @@ -618,7 +663,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) if (rv) { if (pAMgr && pAMgr->get_warn() && rv->astr && TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { - *info += SPELL_WARN; + *info |= SPELL_WARN; if (pAMgr->get_forbidwarn()) return false; return true; @@ -627,7 +672,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) } // recursive breaking at break points - if (!wordbreak.empty()) { + if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) { int nbr = 0; wl = scw.size(); @@ -668,6 +713,37 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) size_t plen = wordbreak[j].size(); size_t found = scw.find(wordbreak[j]); if ((found > 0) && (found < wl - plen)) { + size_t found2 = scw.find(wordbreak[j], found + 1); + // try to break at the second occurance + // to recognize dictionary words with wordbreak + if (found2 > 0 && (found2 < wl - plen)) + found = found2; + if (!spell(scw.substr(found + plen))) + continue; + std::string suffix(scw.substr(found)); + scw.resize(found); + // examine 2 sides of the break point + if (spell(scw)) + return true; + scw.append(suffix); + + // LANG_hu: spec. dash rule + if (langnum == LANG_hu && wordbreak[j] == "-") { + suffix = scw.substr(found + 1); + scw.resize(found + 1); + if (spell(scw)) + return true; // check the first part with dash + scw.append(suffix); + } + // end of LANG specific region + } + } + + // other patterns (break at first break point) + for (size_t j = 0; j < wordbreak.size(); ++j) { + size_t plen = wordbreak[j].size(); + size_t found = scw.find(wordbreak[j]); + if ((found > 0) && (found < wl - plen)) { if (!spell(scw.substr(found + plen))) continue; std::string suffix(scw.substr(found)); @@ -694,47 +770,28 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) } struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) { - bool usebuffer = false; std::string w2; const char* word; int len; - const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; - if (ignoredchars != NULL) { - w2.assign(w); - if (utf8) { - const std::vector<w_char>& ignoredchars_utf16 = - pAMgr->get_ignore_utf16(); - remove_ignored_chars_utf(w2, ignoredchars_utf16); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - len = w2.size(); - usebuffer = true; - } else { - word = w.c_str(); - len = w.size(); - } + // remove IGNORE characters from the string + clean_ignore(w2, w); + + word = w2.c_str(); + len = w2.size(); if (!len) return NULL; // word reversing wrapper for complex prefixes if (complexprefixes) { - if (!usebuffer) { - w2.assign(word); - usebuffer = true; - } if (utf8) reverseword_utf(w2); else reverseword(w2); } - if (usebuffer) { - word = w2.c_str(); - } + word = w2.c_str(); // look word in hash table struct hentry* he = NULL; @@ -745,13 +802,13 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { if (info) - *info += SPELL_FORBIDDEN; + *info |= SPELL_FORBIDDEN; // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { if (pAMgr->get_compoundflag() && TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { if (info) - *info += SPELL_COMPOUND; + *info |= SPELL_COMPOUND; } } return NULL; @@ -786,7 +843,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { if (info) - *info += SPELL_FORBIDDEN; + *info |= SPELL_FORBIDDEN; return NULL; } if (root) { @@ -819,7 +876,7 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str } } if (info) - *info += SPELL_COMPOUND; + *info |= SPELL_COMPOUND; } } } @@ -827,11 +884,103 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str return he; } -std::vector<std::string> Hunspell::suggest(const std::string& word) { - return m_Impl->suggest(word); +std::vector<std::string> HunspellImpl::suggest(const std::string& word) { + bool capwords; + size_t abbv; + int captype; + std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype); + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (size_t j = 0; j < slst.size(); ++j) { + if (utf8) + reverseword_utf(slst[j]); + else + reverseword(slst[j]); + } + } + + // capitalize + if (capwords) + for (size_t j = 0; j < slst.size(); ++j) { + mkinitcap(slst[j]); + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (size_t j = 0; j < slst.size(); ++j) { + slst[j].append(word.substr(word.size() - abbv)); + } + } + + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { + switch (captype) { + case INITCAP: + case ALLCAP: { + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { + std::string s; + std::vector<w_char> w; + if (utf8) { + u8_u16(w, slst[j]); + } else { + s = slst[j]; + } + mkallsmall2(s, w); + if (spell(s)) { + slst[l] = s; + ++l; + } else { + mkinitcap2(s, w); + if (spell(s)) { + slst[l] = s; + ++l; + } + } + } else { + slst[l] = slst[j]; + ++l; + } + } + slst.resize(l); + } + } + } + + // remove duplications + size_t l = 0; + for (size_t j = 0; j < slst.size(); ++j) { + slst[l] = slst[j]; + for (size_t k = 0; k < l; ++k) { + if (slst[k] == slst[j]) { + --l; + break; + } + } + ++l; + } + slst.resize(l); + + // output conversion + RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + if (rl) { + for (size_t i = 0; rl && i < slst.size(); ++i) { + std::string wspace; + if (rl->conv(slst[i], wspace)) { + slst[i] = wspace; + } + } + } + return slst; } -std::vector<std::string> HunspellImpl::suggest(const std::string& word) { +std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word, + bool& capwords, size_t& abbv, int& captype) { + captype = NOCAP; + abbv = 0; + capwords = false; + std::vector<std::string> slst; int onlycmpdsug = 0; @@ -849,8 +998,6 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { if (word.size() >= MAXWORDLEN) return slst; } - int captype = NOCAP; - size_t abbv = 0; size_t wl = 0; std::string scw; @@ -871,7 +1018,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { return slst; } - int capwords = 0; + bool good = false; + + clock_t timelimit; + // initialize in every suggestion call + timelimit = clock(); // check capitalized form for FORCEUCASE if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { @@ -886,22 +1037,38 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { switch (captype) { case NOCAP: { - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; + if (abbv) { + std::string wspace(scw); + wspace.push_back('.'); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; + } break; } case INITCAP: { - capwords = 1; - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + capwords = true; + good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; std::string wspace(scw); mkallsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case HUHINITCAP: - capwords = 1; + capwords = true; + /* FALLTHROUGH */ case HUHCAP: { - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; // something.The -> something. The size_t dot_pos = scw.find('.'); if (dot_pos != std::string::npos) { @@ -927,19 +1094,25 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { // TheOpenOffice.org -> The OpenOffice.org wspace = scw; mkinitsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; } wspace = scw; mkallsmall2(wspace, sunicw); if (spell(wspace.c_str())) insert_sug(slst, wspace); size_t prevns = slst.size(); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; if (captype == HUHINITCAP) { mkinitcap2(wspace, sunicw); if (spell(wspace.c_str())) insert_sug(slst, wspace); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; } // aNew -> "a New" (instead of "a new") for (size_t j = prevns; j < slst.size(); ++j) { @@ -966,11 +1139,15 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { case ALLCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) insert_sug(slst, wspace); mkinitcap2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; for (size_t j = 0; j < slst.size(); ++j) { mkallcap(slst[j]); if (pAMgr && pAMgr->get_checksharps()) { @@ -1002,34 +1179,43 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { } } // END OF LANG_hu section - - // try ngram approach since found nothing or only compound words - if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) { + // try ngram approach since found nothing good suggestion + if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) { switch (captype) { case NOCAP: { - pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs); + pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } + /* FALLTHROUGH */ case HUHINITCAP: - capwords = 1; + capwords = true; + /* FALLTHROUGH */ case HUHCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); + pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case INITCAP: { - capwords = 1; + capwords = true; std::string wspace(scw); mkallsmall2(wspace, sunicw); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); + pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; break; } case ALLCAP: { std::string wspace(scw); mkallsmall2(wspace, sunicw); size_t oldns = slst.size(); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); + pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; for (size_t j = oldns; j < slst.size(); ++j) { mkallcap(slst[j]); } @@ -1039,6 +1225,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { } // try dash suggestion (Afo-American -> Afro-American) + // Note: LibreOffice was modified to treat dashes as word + // characters to check "scot-free" etc. word forms, but + // we need to handle suggestions for "Afo-American", etc., + // while "Afro-American" is missing from the dictionary. + // TODO avoid possible overgeneration size_t dash_pos = scw.find('-'); if (dash_pos != std::string::npos) { int nodashsug = 1; @@ -1050,12 +1241,14 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { size_t prev_pos = 0; bool last = false; - while (nodashsug && !last) { + while (!good && nodashsug && !last) { if (dash_pos == scw.size()) last = 1; std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); if (!spell(chunk.c_str())) { std::vector<std::string> nlst = suggest(chunk.c_str()); + if (clock() > timelimit + TIMELIMIT_GLOBAL) + return slst; for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) { std::string wspace = scw.substr(0, prev_pos); wspace.append(*j); @@ -1063,7 +1256,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { wspace.append("-"); wspace.append(scw.substr(dash_pos + 1)); } - insert_sug(slst, wspace); + int info = 0; + if (pAMgr && pAMgr->get_forbiddenword()) + checkword(wspace, &info, NULL); + if (!(info & SPELL_FORBIDDEN)) + insert_sug(slst, wspace); } nodashsug = 0; } @@ -1075,104 +1272,13 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) { dash_pos = scw.size(); } } - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - for (size_t j = 0; j < slst.size(); ++j) { - if (utf8) - reverseword_utf(slst[j]); - else - reverseword(slst[j]); - } - } - - // capitalize - if (capwords) - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); - } - - // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (size_t j = 0; j < slst.size(); ++j) { - slst[j].append(word.substr(word.size() - abbv)); - } - } - - // remove bad capitalized and forbidden forms - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { - switch (captype) { - case INITCAP: - case ALLCAP: { - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { - std::string s; - std::vector<w_char> w; - if (utf8) { - u8_u16(w, slst[j]); - } else { - s = slst[j]; - } - mkallsmall2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } else { - mkinitcap2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } - } - } else { - slst[l] = slst[j]; - ++l; - } - } - slst.resize(l); - } - } - } - - // remove duplications - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - slst[l] = slst[j]; - for (size_t k = 0; k < l; ++k) { - if (slst[k] == slst[j]) { - --l; - break; - } - } - ++l; - } - slst.resize(l); - - // output conversion - rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (size_t j = 0; rl && j < slst.size(); ++j) { - std::string wspace; - if (rl->conv(slst[j], wspace)) { - slst[j] = wspace; - } - } - return slst; } -const std::string& Hunspell::get_dict_encoding() const { - return m_Impl->get_dict_encoding(); -} - const std::string& HunspellImpl::get_dict_encoding() const { return encoding; } -std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) { - return m_Impl->stem(desc); -} - std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) { std::vector<std::string> slst; @@ -1241,30 +1347,14 @@ std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc return slst; } -std::vector<std::string> Hunspell::stem(const std::string& word) { - return m_Impl->stem(word); -} - std::vector<std::string> HunspellImpl::stem(const std::string& word) { return stem(analyze(word)); } -const char* Hunspell::get_wordchars() const { - return m_Impl->get_wordchars().c_str(); -} - -const std::string& Hunspell::get_wordchars_cpp() const { - return m_Impl->get_wordchars(); -} - -const std::string& HunspellImpl::get_wordchars() const { +const std::string& HunspellImpl::get_wordchars_cpp() const { return pAMgr->get_wordchars(); } -const std::vector<w_char>& Hunspell::get_wordchars_utf16() const { - return m_Impl->get_wordchars_utf16(); -} - const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const { return pAMgr->get_wordchars_utf16(); } @@ -1300,56 +1390,32 @@ int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) { return u8.size(); } -int Hunspell::add(const std::string& word) { - return m_Impl->add(word); -} - int HunspellImpl::add(const std::string& word) { if (!m_HMgrs.empty()) return m_HMgrs[0]->add(word); return 0; } -int Hunspell::add_with_affix(const std::string& word, const std::string& example) { - return m_Impl->add_with_affix(word, example); -} - int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) { if (!m_HMgrs.empty()) return m_HMgrs[0]->add_with_affix(word, example); return 0; } -int Hunspell::remove(const std::string& word) { - return m_Impl->remove(word); -} - int HunspellImpl::remove(const std::string& word) { if (!m_HMgrs.empty()) return m_HMgrs[0]->remove(word); return 0; } -const char* Hunspell::get_version() const { - return m_Impl->get_version().c_str(); -} - -const std::string& Hunspell::get_version_cpp() const { - return m_Impl->get_version(); -} - -const char* Hunspell::get_try_string() const { - return m_Impl->get_try_string(); +const std::string& HunspellImpl::get_version_cpp() const { + return pAMgr->get_version(); } struct cs_info* HunspellImpl::get_csconv() { return csconv; } -struct cs_info* Hunspell::get_csconv() { - return m_Impl->get_csconv(); -} - void HunspellImpl::cat_result(std::string& result, const std::string& st) { if (!st.empty()) { if (!result.empty()) @@ -1358,11 +1424,22 @@ void HunspellImpl::cat_result(std::string& result, const std::string& st) { } } -std::vector<std::string> Hunspell::analyze(const std::string& word) { - return m_Impl->analyze(word); +std::vector<std::string> HunspellImpl::analyze(const std::string& word) { + std::vector<std::string> slst = analyze_internal(word); + // output conversion + RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + if (rl) { + for (size_t i = 0; rl && i < slst.size(); ++i) { + std::string wspace; + if (rl->conv(slst[i], wspace)) { + slst[i] = wspace; + } + } + } + return slst; } -std::vector<std::string> HunspellImpl::analyze(const std::string& word) { +std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) { std::vector<std::string> slst; if (!pSMgr || m_HMgrs.empty()) return slst; @@ -1595,10 +1672,6 @@ std::vector<std::string> HunspellImpl::analyze(const std::string& word) { return slst; } -std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) { - return m_Impl->generate(word, pl); -} - std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) { std::vector<std::string> slst; if (!pSMgr || pl.empty()) @@ -1643,10 +1716,6 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s return slst; } -std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) { - return m_Impl->generate(word, pattern); -} - std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) { std::vector<std::string> pl = analyze(pattern); std::vector<std::string> slst = generate(word, pl); @@ -1655,10 +1724,11 @@ std::vector<std::string> HunspellImpl::generate(const std::string& word, const s } // minimal XML parser functions -std::string HunspellImpl::get_xml_par(const char* par) { +std::string HunspellImpl::get_xml_par(const std::string& in_par, std::string::size_type pos) { std::string dest; - if (!par) + if (pos == std::string::npos) return dest; + const char* par = in_par.c_str() + pos; char end = *par; if (end == '>') end = '<'; @@ -1672,22 +1742,8 @@ std::string HunspellImpl::get_xml_par(const char* par) { return dest; } -int Hunspell::get_langnum() const { - return m_Impl->get_langnum(); -} - -bool Hunspell::input_conv(const std::string& word, std::string& dest) { - return m_Impl->input_conv(word, dest); -} - -int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { - std::string d; - bool ret = input_conv(word, d); - if (ret && d.size() < destsize) { - strncpy(dest, d.c_str(), destsize); - return 1; - } - return 0; +int HunspellImpl::get_langnum() const { + return langnum; } bool HunspellImpl::input_conv(const std::string& word, std::string& dest) { @@ -1700,42 +1756,47 @@ bool HunspellImpl::input_conv(const std::string& word, std::string& dest) { } // return the beginning of the element (attr == NULL) or the attribute -const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) { - const char* end = strchr(s, '>'); +std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::string::size_type pos, const char* attr) { + if (pos == std::string::npos) + return std::string::npos; + + std::string::size_type endpos = s.find('>', pos); if (attr == NULL) - return end; - const char* p = s; - while (1) { - p = strstr(p, attr); - if (!p || p >= end) - return 0; - if (*(p - 1) == ' ' || *(p - 1) == '\n') + return endpos; + while (true) { + pos = s.find(attr, pos); + if (pos == std::string::npos || pos >= endpos) + return std::string::npos; + if (s[pos - 1] == ' ' || s[pos - 1] == '\n') break; - p += strlen(attr); + pos += strlen(attr); } - return p + strlen(attr); + return pos + strlen(attr); } -int HunspellImpl::check_xml_par(const char* q, - const char* attr, - const char* value) { - std::string cw = get_xml_par(get_xml_pos(q, attr)); +int HunspellImpl::check_xml_par(const std::string& q, std::string::size_type pos, + const char* attr, + const char* value) { + std::string cw = get_xml_par(q, get_xml_pos(q, pos, attr)); if (cw == value) return 1; return 0; } -std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) { +std::vector<std::string> HunspellImpl::get_xml_list(const std::string& list, std::string::size_type pos, const char* tag) { std::vector<std::string> slst; - if (!list) + if (pos == std::string::npos) return slst; - const char* p = list; - for (size_t n = 0; ((p = strstr(p, tag)) != NULL); ++p, ++n) { - std::string cw = get_xml_par(p + strlen(tag) - 1); + while (true) { + pos = list.find(tag, pos); + if (pos == std::string::npos) + break; + std::string cw = get_xml_par(list, pos + strlen(tag) - 1); if (cw.empty()) { break; } slst.push_back(cw); + ++pos; } return slst; } @@ -1743,19 +1804,20 @@ std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { std::vector<std::string> slst; - const char* word = in_word.c_str(); - - const char* q = strstr(word, "<query"); - if (!q) + std::string::size_type qpos = in_word.find("<query"); + if (qpos == std::string::npos) return slst; // bad XML input - const char* q2 = strchr(q, '>'); - if (!q2) + + std::string::size_type q2pos = in_word.find('>', qpos); + if (q2pos == std::string::npos) return slst; // bad XML input - q2 = strstr(q2, "<word"); - if (!q2) + + q2pos = in_word.find("<word", q2pos); + if (q2pos == std::string::npos) return slst; // bad XML input - if (check_xml_par(q, "type=", "analyze")) { - std::string cw = get_xml_par(strchr(q2, '>')); + + if (check_xml_par(in_word, qpos, "type=", "analyze")) { + std::string cw = get_xml_par(in_word, in_word.find('>', q2pos)); if (!cw.empty()) slst = analyze(cw); if (slst.empty()) @@ -1778,23 +1840,24 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { slst.clear(); slst.push_back(r); return slst; - } else if (check_xml_par(q, "type=", "stem")) { - std::string cw = get_xml_par(strchr(q2, '>')); + } else if (check_xml_par(in_word, qpos, "type=", "stem")) { + std::string cw = get_xml_par(in_word, in_word.find('>', q2pos)); if (!cw.empty()) return stem(cw); - } else if (check_xml_par(q, "type=", "generate")) { - std::string cw = get_xml_par(strchr(q2, '>')); + } else if (check_xml_par(in_word, qpos, "type=", "generate")) { + std::string cw = get_xml_par(in_word, in_word.find('>', q2pos)); if (cw.empty()) return slst; - const char* q3 = strstr(q2 + 1, "<word"); - if (q3) { - std::string cw2 = get_xml_par(strchr(q3, '>')); + std::string::size_type q3pos = in_word.find("<word", q2pos + 1); + if (q3pos != std::string::npos) { + std::string cw2 = get_xml_par(in_word, in_word.find('>', q3pos)); if (!cw2.empty()) { return generate(cw, cw2); } } else { - if ((q2 = strstr(q2 + 1, "<code")) != NULL) { - std::vector<std::string> slst2 = get_xml_list(strchr(q2, '>'), "<a>"); + q2pos = in_word.find("<code", q2pos + 1); + if (q2pos != std::string::npos) { + std::vector<std::string> slst2 = get_xml_list(in_word, in_word.find('>', q2pos), "<a>"); if (!slst2.empty()) { slst = generate(cw, slst2); uniqlist(slst); @@ -1802,21 +1865,57 @@ std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { } } } + } else if (check_xml_par(in_word, qpos, "type=", "add")) { + std::string cw = get_xml_par(in_word, in_word.find('>', q2pos)); + if (cw.empty()) + return slst; + std::string::size_type q3pos = in_word.find("<word", q2pos + 1); + if (q3pos != std::string::npos) { + std::string cw2 = get_xml_par(in_word, in_word.find('>', q3pos)); + if (!cw2.empty()) { + add_with_affix(cw, cw2); + } else { + add(cw); + } + } else { + add(cw); + } } return slst; } -int Hunspell::spell(const char* word, int* info, char** root) { - std::string sroot; - bool ret = m_Impl->spell(word, info, root ? &sroot : NULL); - if (root) { - if (sroot.empty()) { - *root = NULL; +std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) { + std::vector<std::string> slst; + struct hentry* he = NULL; + int len; + std::string w2; + const char* word; + const char* ignoredchars = pAMgr->get_ignore(); + if (ignoredchars != NULL) { + w2.assign(root_word); + if (utf8) { + const std::vector<w_char>& ignoredchars_utf16 = + pAMgr->get_ignore_utf16(); + remove_ignored_chars_utf(w2, ignoredchars_utf16); } else { - *root = mystrdup(sroot.c_str()); + remove_ignored_chars(w2, ignoredchars); } + word = w2.c_str(); + } else + word = root_word.c_str(); + + len = strlen(word); + + if (!len) + return slst; + + for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { + he = m_HMgrs[i]->lookup(word); } - return ret; + if (he) { + slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str()); + } + return slst; } namespace { @@ -1835,113 +1934,289 @@ namespace { } } -void Hunspell::free_list(char*** slst, int n) { - Hunspell_free_list((Hunhandle*)(this), slst, n); +int HunspellImpl::spell(const char* word, int* info, char** root) { + std::string sroot; + bool ret = spell(word, info, root ? &sroot : NULL); + if (root) { + if (sroot.empty()) { + *root = NULL; + } else { + *root = mystrdup(sroot.c_str()); + } + } + return ret; +} + +int HunspellImpl::suggest(char*** slst, const char* word) { + std::vector<std::string> suggests = suggest(word); + return munge_vector(slst, suggests); +} + +int HunspellImpl::suffix_suggest(char*** slst, const char* root_word) { + std::vector<std::string> stems = suffix_suggest(root_word); + return munge_vector(slst, stems); +} + +void HunspellImpl::free_list(char*** slst, int n) { + if (slst && *slst) { + for (int i = 0; i < n; i++) + free((*slst)[i]); + free(*slst); + *slst = NULL; + } +} + +char* HunspellImpl::get_dic_encoding() { + return &encoding[0]; +} + +int HunspellImpl::analyze(char*** slst, const char* word) { + std::vector<std::string> stems = analyze(word); + return munge_vector(slst, stems); +} + +int HunspellImpl::stem(char*** slst, const char* word) { + std::vector<std::string> stems = stem(word); + return munge_vector(slst, stems); +} + +int HunspellImpl::stem(char*** slst, char** desc, int n) { + std::vector<std::string> morph; + morph.reserve(n); + for (int i = 0; i < n; ++i) + morph.push_back(desc[i]); + + std::vector<std::string> stems = stem(morph); + return munge_vector(slst, stems); +} + +int HunspellImpl::generate(char*** slst, const char* word, const char* pattern) { + std::vector<std::string> stems = generate(word, pattern); + return munge_vector(slst, stems); +} + +int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) { + std::vector<std::string> morph; + morph.reserve(pln); + for (int i = 0; i < pln; ++i) + morph.push_back(pl[i]); + + std::vector<std::string> stems = generate(word, morph); + return munge_vector(slst, stems); +} + +const char* HunspellImpl::get_wordchars() const { + return get_wordchars_cpp().c_str(); +} + +const char* HunspellImpl::get_version() const { + return get_version_cpp().c_str(); +} + +int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) { + std::string d; + bool ret = input_conv(word, d); + if (ret && d.size() < destsize) { + strncpy(dest, d.c_str(), destsize); + return 1; + } + return 0; +} + +Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) + : m_Impl(new HunspellImpl(affpath, dpath, key)) { +} + +Hunspell::~Hunspell() { + delete m_Impl; +} + +// load extra dictionaries +int Hunspell::add_dic(const char* dpath, const char* key) { + return m_Impl->add_dic(dpath, key); +} + +bool Hunspell::spell(const std::string& word, int* info, std::string* root) { + return m_Impl->spell(word, info, root); +} + +std::vector<std::string> Hunspell::suggest(const std::string& word) { + return m_Impl->suggest(word); +} + +std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) { + return m_Impl->suffix_suggest(root_word); +} + +const std::string& Hunspell::get_dict_encoding() const { + return m_Impl->get_dict_encoding(); +} + +std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) { + return m_Impl->stem(desc); +} + +std::vector<std::string> Hunspell::stem(const std::string& word) { + return m_Impl->stem(word); +} + +const std::string& Hunspell::get_wordchars_cpp() const { + return m_Impl->get_wordchars_cpp(); +} + +const std::vector<w_char>& Hunspell::get_wordchars_utf16() const { + return m_Impl->get_wordchars_utf16(); +} + +int Hunspell::add(const std::string& word) { + return m_Impl->add(word); +} + +int Hunspell::add_with_affix(const std::string& word, const std::string& example) { + return m_Impl->add_with_affix(word, example); +} + +int Hunspell::remove(const std::string& word) { + return m_Impl->remove(word); +} + +const std::string& Hunspell::get_version_cpp() const { + return m_Impl->get_version_cpp(); +} + +struct cs_info* Hunspell::get_csconv() { + return m_Impl->get_csconv(); +} + +std::vector<std::string> Hunspell::analyze(const std::string& word) { + return m_Impl->analyze(word); +} + +std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) { + return m_Impl->generate(word, pl); +} + +std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) { + return m_Impl->generate(word, pattern); +} + +int Hunspell::get_langnum() const { + return m_Impl->get_langnum(); +} + +bool Hunspell::input_conv(const std::string& word, std::string& dest) { + return m_Impl->input_conv(word, dest); +} + +int Hunspell::spell(const char* word, int* info, char** root) { + return m_Impl->spell(word, info, root); } int Hunspell::suggest(char*** slst, const char* word) { - return Hunspell_suggest((Hunhandle*)(this), slst, word); + return m_Impl->suggest(slst, word); } int Hunspell::suffix_suggest(char*** slst, const char* root_word) { - std::vector<std::string> stems = m_Impl->suffix_suggest(root_word); - return munge_vector(slst, stems); + return m_Impl->suffix_suggest(slst, root_word); +} + +void Hunspell::free_list(char*** slst, int n) { + m_Impl->free_list(slst, n); } char* Hunspell::get_dic_encoding() { - return &(m_Impl->dic_encoding_vec[0]); + return m_Impl->get_dic_encoding(); } -int Hunspell::stem(char*** slst, char** desc, int n) { - return Hunspell_stem2((Hunhandle*)(this), slst, desc, n); +int Hunspell::analyze(char*** slst, const char* word) { + return m_Impl->analyze(slst, word); } int Hunspell::stem(char*** slst, const char* word) { - return Hunspell_stem((Hunhandle*)(this), slst, word); + return m_Impl->stem(slst, word); } -int Hunspell::analyze(char*** slst, const char* word) { - return Hunspell_analyze((Hunhandle*)(this), slst, word); +int Hunspell::stem(char*** slst, char** desc, int n) { + return m_Impl->stem(slst, desc, n); +} + +int Hunspell::generate(char*** slst, const char* word, const char* pattern) { + return m_Impl->generate(slst, word, pattern); } int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { - return Hunspell_generate2((Hunhandle*)(this), slst, word, pl, pln); + return m_Impl->generate(slst, word, pl, pln); } -int Hunspell::generate(char*** slst, const char* word, const char* pattern) { - return Hunspell_generate((Hunhandle*)(this), slst, word, pattern); +const char* Hunspell::get_wordchars() const { + return m_Impl->get_wordchars(); +} + +const char* Hunspell::get_version() const { + return m_Impl->get_version(); +} + +int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { + return m_Impl->input_conv(word, dest, destsize); } Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { - return (Hunhandle*)(new Hunspell(affpath, dpath)); + return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath)); } Hunhandle* Hunspell_create_key(const char* affpath, const char* dpath, const char* key) { - return reinterpret_cast<Hunhandle*>(new Hunspell(affpath, dpath, key)); + return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath, key)); } void Hunspell_destroy(Hunhandle* pHunspell) { - delete reinterpret_cast<Hunspell*>(pHunspell); + delete reinterpret_cast<HunspellImpl*>(pHunspell); } int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { - return reinterpret_cast<Hunspell*>(pHunspell)->add_dic(dpath); + return reinterpret_cast<HunspellImpl*>(pHunspell)->add_dic(dpath); } int Hunspell_spell(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->spell(std::string(word)); + return reinterpret_cast<HunspellImpl*>(pHunspell)->spell(word); } char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) { - return reinterpret_cast<Hunspell*>(pHunspell)->get_dic_encoding(); + return reinterpret_cast<HunspellImpl*>(pHunspell)->get_dic_encoding(); } int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { - std::vector<std::string> suggests = reinterpret_cast<Hunspell*>(pHunspell)->suggest(word); - return munge_vector(slst, suggests); + return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word); } int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->analyze(word); - return munge_vector(slst, stems); + return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word); } int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) { - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(word); - return munge_vector(slst, stems); + return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, word); } int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) { - std::vector<std::string> morph; - for (int i = 0; i < n; ++i) - morph.push_back(desc[i]); - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(morph); - return munge_vector(slst, stems); + return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, desc, n); } int Hunspell_generate(Hunhandle* pHunspell, char*** slst, const char* word, - const char* pattern) { - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, pattern); - return munge_vector(slst, stems); + const char* pattern) +{ + return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, pattern); } int Hunspell_generate2(Hunhandle* pHunspell, char*** slst, const char* word, char** desc, - int n) { - std::vector<std::string> morph; - for (int i = 0; i < n; ++i) - morph.push_back(desc[i]); - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, morph); - return munge_vector(slst, stems); + int n) +{ + return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, desc, n); } /* functions for run-time modification of the dictionary */ @@ -1949,7 +2224,7 @@ int Hunspell_generate2(Hunhandle* pHunspell, /* add word to the run-time dictionary */ int Hunspell_add(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->add(word); + return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word); } /* add word to the run-time dictionary with affix flags of @@ -1960,58 +2235,15 @@ int Hunspell_add(Hunhandle* pHunspell, const char* word) { int Hunspell_add_with_affix(Hunhandle* pHunspell, const char* word, const char* example) { - return reinterpret_cast<Hunspell*>(pHunspell)->add_with_affix(word, example); + return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_affix(word, example); } /* remove word from the run-time dictionary */ int Hunspell_remove(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->remove(word); + return reinterpret_cast<HunspellImpl*>(pHunspell)->remove(word); } -void Hunspell_free_list(Hunhandle*, char*** list, int n) { - if (list && *list) { - for (int i = 0; i < n; i++) - free((*list)[i]); - free(*list); - *list = NULL; - } -} - -std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) { - return m_Impl->suffix_suggest(root_word); -} - -std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) { - std::vector<std::string> slst; - struct hentry* he = NULL; - int len; - std::string w2; - const char* word; - const char* ignoredchars = pAMgr->get_ignore(); - if (ignoredchars != NULL) { - w2.assign(root_word); - if (utf8) { - const std::vector<w_char>& ignoredchars_utf16 = - pAMgr->get_ignore_utf16(); - remove_ignored_chars_utf(w2, ignoredchars_utf16); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - } else - word = root_word.c_str(); - - len = strlen(word); - - if (!len) - return slst; - - for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { - he = m_HMgrs[i]->lookup(word); - } - if (he) { - slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str()); - } - return slst; +void Hunspell_free_list(Hunhandle* pHunspell, char*** list, int n) { + reinterpret_cast<HunspellImpl*>(pHunspell)->free_list(list, n); } diff --git a/libs/hunspell/src/hunspell.hxx b/libs/hunspell/src/hunspell.hxx index f728f829c2..8640a35ca1 100644 --- a/libs/hunspell/src/hunspell.hxx +++ b/libs/hunspell/src/hunspell.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -78,7 +78,10 @@ #define SPELL_XML "<?xml?>" +#ifndef MAXSUGGESTION #define MAXSUGGESTION 15 +#endif + #define MAXSHARPS 5 #ifndef MAXWORDLEN @@ -216,7 +219,6 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { struct cs_info* get_csconv(); - const char* get_try_string() const; const char* get_version() const; const std::string& get_version_cpp() const; diff --git a/libs/hunspell/src/hunvisapi.h b/libs/hunspell/src/hunvisapi.h index eb2b348091..ed0a502ba2 100644 --- a/libs/hunspell/src/hunvisapi.h +++ b/libs/hunspell/src/hunvisapi.h @@ -3,7 +3,7 @@ #if defined(HUNSPELL_STATIC) # define LIBHUNSPELL_DLL_EXPORTED -#elif defined(_MSC_VER) +#elif defined(_WIN32) # if defined(BUILDING_LIBHUNSPELL) # define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport) # else diff --git a/libs/hunspell/src/hunzip.c++ b/libs/hunspell/src/hunzip.c++ index 8962b100b1..64a9169c4b 100644 --- a/libs/hunspell/src/hunzip.c++ +++ b/libs/hunspell/src/hunzip.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -178,7 +178,7 @@ int Hunzip::getbuf() { do { if (inc == 0) { fin.read(in, BUFSIZE); - inbits = fin.gcount() * 8; + inbits = int(fin.gcount() * 8); } for (; inc < inbits; inc++) { int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0; diff --git a/libs/hunspell/src/hunzip.hxx b/libs/hunspell/src/hunzip.hxx index ea2bc58d26..f57ea41cc0 100644 --- a/libs/hunspell/src/hunzip.hxx +++ b/libs/hunspell/src/hunzip.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with diff --git a/libs/hunspell/src/langnum.hxx b/libs/hunspell/src/langnum.hxx index a64d3d7869..39e63efdaa 100644 --- a/libs/hunspell/src/langnum.hxx +++ b/libs/hunspell/src/langnum.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -48,6 +48,7 @@ enum { LANG_az = 100, // custom number LANG_bg = 41, LANG_ca = 37, + LANG_crh = 102, // custom number LANG_cs = 42, LANG_da = 45, LANG_de = 49, diff --git a/libs/hunspell/src/replist.c++ b/libs/hunspell/src/replist.c++ index cabe382bfd..1395ade607 100644 --- a/libs/hunspell/src/replist.c++ +++ b/libs/hunspell/src/replist.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with diff --git a/libs/hunspell/src/replist.hxx b/libs/hunspell/src/replist.hxx index 1e3efa4131..08daeb4488 100644 --- a/libs/hunspell/src/replist.hxx +++ b/libs/hunspell/src/replist.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with diff --git a/libs/hunspell/src/suggestmgr.c++ b/libs/hunspell/src/suggestmgr.c++ index 73ea91e3a3..6b363debd5 100644 --- a/libs/hunspell/src/suggestmgr.c++ +++ b/libs/hunspell/src/suggestmgr.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -72,6 +72,7 @@ #include <string.h> #include <stdio.h> #include <ctype.h> +#include <time.h> #include "suggestmgr.hxx" #include "htypes.hxx" @@ -79,6 +80,8 @@ const w_char W_VLINE = {'\0', '|'}; +#define MAX_CHAR_DISTANCE 4 + SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) { // register affix manager and check in string of chars to // try when building candidate suggestions @@ -132,6 +135,11 @@ SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) { ctryl = u8_u16(ctry_utf, tryme); } } + + // language with possible dash usage + // (latin letters or dash in TRY characters) + lang_with_dash_usage = (ctry && + ((strchr(ctry, '-') != NULL) || (strchr(ctry, 'a') != NULL))); } SuggestMgr::~SuggestMgr() { @@ -169,10 +177,13 @@ void SuggestMgr::testsug(std::vector<std::string>& wlst, } } -// generate suggestions for a misspelled word -// pass in address of array of char * pointers -// onlycompoundsug: probably bad suggestions (need for ngram sugs, too) -void SuggestMgr::suggest(std::vector<std::string>& slst, +/* generate suggestions for a misspelled word + * pass in address of array of char * pointers + * onlycompoundsug: probably bad suggestions (need for ngram sugs, too) + * return value: true, if there is a good suggestion + * (REP, ph: or a dictionary word pair) + */ +bool SuggestMgr::suggest(std::vector<std::string>& slst, const char* w, int* onlycompoundsug) { int nocompoundtwowords = 0; @@ -182,6 +193,7 @@ void SuggestMgr::suggest(std::vector<std::string>& slst, std::string w2; const char* word = w; size_t oldSug = 0; + bool good_suggestion = false; // word reversing wrapper for complex prefixes if (complexprefixes) { @@ -196,34 +208,49 @@ void SuggestMgr::suggest(std::vector<std::string>& slst, if (utf8) { wl = u8_u16(word_utf, word); if (wl == -1) { - return; + return false; } } - for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0); + for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion; cpdsuggest++) { + + clock_t timelimit; + // initialize both in non-compound and compound cycles + timelimit = clock(); + // limit compound suggestion if (cpdsuggest > 0) oldSug = slst.size(); // suggestions for an uppercase word (html -> HTML) if (slst.size() < maxSug) { + size_t i = slst.size(); if (utf8) - capchars_utf(slst, &word_utf[0], wl, cpdsuggest); + capchars_utf(slst, word_utf.data(), wl, cpdsuggest); else capchars(slst, word, cpdsuggest); + if (slst.size() > i) + good_suggestion = true; } // perhaps we made a typical fault of spelling if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { + size_t i = slst.size(); replchars(slst, word, cpdsuggest); + if (slst.size() > i) + good_suggestion = true; } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // perhaps we made chose the wrong char from a related set if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { mapchars(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // only suggest compound words when no other suggestion if ((cpdsuggest == 0) && (slst.size() > nsugorig)) @@ -232,77 +259,99 @@ void SuggestMgr::suggest(std::vector<std::string>& slst, // did we swap the order of chars by mistake if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - swapchar_utf(slst, &word_utf[0], wl, cpdsuggest); + swapchar_utf(slst, word_utf.data(), wl, cpdsuggest); else swapchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we swap the order of non adjacent chars by mistake if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - longswapchar_utf(slst, &word_utf[0], wl, cpdsuggest); + longswapchar_utf(slst, word_utf.data(), wl, cpdsuggest); else longswapchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we just hit the wrong key in place of a good char (case and keyboard) if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - badcharkey_utf(slst, &word_utf[0], wl, cpdsuggest); + badcharkey_utf(slst, word_utf.data(), wl, cpdsuggest); else badcharkey(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we add a char that should not be there if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - extrachar_utf(slst, &word_utf[0], wl, cpdsuggest); + extrachar_utf(slst, word_utf.data(), wl, cpdsuggest); else extrachar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we forgot a char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - forgotchar_utf(slst, &word_utf[0], wl, cpdsuggest); + forgotchar_utf(slst, word_utf.data(), wl, cpdsuggest); else forgotchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we move a char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - movechar_utf(slst, &word_utf[0], wl, cpdsuggest); + movechar_utf(slst, word_utf.data(), wl, cpdsuggest); else movechar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we just hit the wrong key in place of a good char if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - badchar_utf(slst, &word_utf[0], wl, cpdsuggest); + badchar_utf(slst, word_utf.data(), wl, cpdsuggest); else badchar(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // did we double two characters if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { if (utf8) - doubletwochars_utf(slst, &word_utf[0], wl, cpdsuggest); + doubletwochars_utf(slst, word_utf.data(), wl, cpdsuggest); else doubletwochars(slst, word, cpdsuggest); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; // perhaps we forgot to hit space and two words ran together - if (!nosplitsugs && (slst.size() < maxSug) && - (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) { - twowords(slst, word, cpdsuggest); + // (dictionary word pairs have top priority here, so + // we always suggest them, in despite of nosplitsugs, and + // drop compound word and other suggestions) + if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) { + good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion); } + if (clock() > timelimit + TIMELIMIT_SUGGESTION) + return good_suggestion; } // repeating ``for'' statement compounding support if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug) *onlycompoundsug = 1; + + return good_suggestion; } // suggestions for an uppercase word (html -> HTML) @@ -450,8 +499,11 @@ int SuggestMgr::replchars(std::vector<std::string>& wlst, return wlst.size(); } -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -// -> vacacation) +// perhaps we doubled two characters +// (for example vacation -> vacacation) +// The recognized pattern with regex back-references: +// "(.)(.)\1\2\1" or "..(.)(.)\1\2" + int SuggestMgr::doubletwochars(std::vector<std::string>& wlst, const char* word, int cpdsuggest) { @@ -462,7 +514,7 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst, for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; - if (state == 3) { + if (state == 3 || (state == 2 && i >= 4)) { std::string candidate(word, word + i - 1); candidate.insert(candidate.end(), word + i + 1, word + wl); testsug(wlst, candidate, cpdsuggest, NULL, NULL); @@ -475,8 +527,11 @@ int SuggestMgr::doubletwochars(std::vector<std::string>& wlst, return wlst.size(); } -// perhaps we doubled two characters (pattern aba -> ababa, for example vacation -// -> vacacation) +// perhaps we doubled two characters +// (for example vacation -> vacacation) +// The recognized pattern with regex back-references: +// "(.)(.)\1\2\1" or "..(.)(.)\1\2" + int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst, const w_char* word, int wl, @@ -487,7 +542,7 @@ int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst, for (int i = 2; i < wl; i++) { if (word[i] == word[i - 2]) { state++; - if (state == 3) { + if (state == 3 || (state == 2 && i >= 4)) { std::vector<w_char> candidate_utf(word, word + i - 1); candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl); std::string candidate; @@ -721,17 +776,22 @@ int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst, return wlst.size(); } -/* error is should have been two words */ -int SuggestMgr::twowords(std::vector<std::string>& wlst, +/* error is should have been two words + * return value is true, if there is a dictionary word pair, + * or there was already a good suggestion before calling + * this function. + */ +bool SuggestMgr::twowords(std::vector<std::string>& wlst, const char* word, - int cpdsuggest) { + int cpdsuggest, + bool good) { int c2; int forbidden = 0; int cwrd; int wl = strlen(word); if (wl < 3) - return wlst.size(); + return false; if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); @@ -750,63 +810,87 @@ int SuggestMgr::twowords(std::vector<std::string>& wlst, } if (utf8 && p[1] == '\0') break; // last UTF-8 character - *p = '\0'; - int c1 = checkword(candidate, cpdsuggest, NULL, NULL); - if (c1) { - c2 = checkword((p + 1), cpdsuggest, NULL, NULL); - if (c2) { - *p = ' '; - - // spec. Hungarian code (need a better compound word support) - if ((langnum == LANG_hu) && !forbidden && - // if 3 repeating letter, use - instead of space - (((p[-1] == p[1]) && - (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || - // or multiple compounding, with more, than 6 syllables - ((c1 == 3) && (c2 >= 2)))) - *p = '-'; - - cwrd = 1; - for (size_t k = 0; k < wlst.size(); ++k) { - if (wlst[k] == candidate) { - cwrd = 0; - break; - } - } - if (wlst.size() < maxSug) { - if (cwrd) { - wlst.push_back(candidate); - } - } else { - free(candidate); - return wlst.size(); + + // Suggest only word pairs, if they are listed in the dictionary. + // For example, adding "a lot" to the English dic file will + // result only "alot" -> "a lot" suggestion instead of + // "alto, slot, alt, lot, allot, aloft, aloe, clot, plot, blot, a lot". + // Note: using "ph:alot" keeps the other suggestions: + // a lot ph:alot + // alot -> a lot, alto, slot... + *p = ' '; + if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) { + // remove not word pair suggestions + if (!good) { + good = true; + wlst.clear(); + } + wlst.insert(wlst.begin(), candidate); + } + + // word pairs with dash? + if (lang_with_dash_usage) { + *p = '-'; + + if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) { + // remove not word pair suggestions + if (!good) { + good = true; + wlst.clear(); } - // add two word suggestion with dash, if TRY string contains - // "a" or "-" - // NOTE: cwrd doesn't modified for REP twoword sugg. - if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && - mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) { - *p = '-'; + wlst.insert(wlst.begin(), candidate); + } + } + + if (wlst.size() < maxSug && !nosplitsugs && !good) { + *p = '\0'; + int c1 = checkword(candidate, cpdsuggest, NULL, NULL); + if (c1) { + c2 = checkword((p + 1), cpdsuggest, NULL, NULL); + if (c2) { + // spec. Hungarian code (TODO need a better compound word support) + if ((langnum == LANG_hu) && !forbidden && + // if 3 repeating letter, use - instead of space + (((p[-1] == p[1]) && + (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || + // or multiple compounding, with more, than 6 syllables + ((c1 == 3) && (c2 >= 2)))) + *p = '-'; + else + *p = ' '; + + cwrd = 1; for (size_t k = 0; k < wlst.size(); ++k) { if (wlst[k] == candidate) { cwrd = 0; break; } } - if (wlst.size() < maxSug) { - if (cwrd) { + + if (cwrd && (wlst.size() < maxSug)) wlst.push_back(candidate); + + // add two word suggestion with dash, depending on the language + // Note that cwrd doesn't modified for REP twoword sugg. + if ( !nosplitsugs && lang_with_dash_usage && + mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) { + *p = '-'; + for (size_t k = 0; k < wlst.size(); ++k) { + if (wlst[k] == candidate) { + cwrd = 0; + break; + } } - } else { - free(candidate); - return wlst.size(); + + if ((wlst.size() < maxSug) && cwrd) + wlst.push_back(candidate); } } } } } free(candidate); - return wlst.size(); + return good; } // error is adjacent letter were swapped @@ -891,7 +975,8 @@ int SuggestMgr::longswapchar(std::vector<std::string>& wlst, // try swapping not adjacent chars one by one for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) { - if (std::abs(std::distance(q, p)) > 1) { + size_t distance = std::abs(std::distance(q, p)); + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) { std::swap(*p, *q); testsug(wlst, candidate, cpdsuggest, NULL, NULL); std::swap(*p, *q); @@ -910,7 +995,8 @@ int SuggestMgr::longswapchar_utf(std::vector<std::string>& wlst, // try swapping not adjacent chars for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) { - if (std::abs(std::distance(q, p)) > 1) { + size_t distance = std::abs(std::distance(q, p)); + if (distance > 1 && distance <= MAX_CHAR_DISTANCE) { std::swap(*p, *q); std::string candidate; u16_u8(candidate, candidate_utf); @@ -932,7 +1018,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst, // try moving a char for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) { - for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) { + for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -942,7 +1028,7 @@ int SuggestMgr::movechar(std::vector<std::string>& wlst, } for (std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) { - for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) { + for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -965,7 +1051,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst, // try moving a char for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) { - for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) { + for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -977,7 +1063,7 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst, } for (std::vector<w_char>::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) { - for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) < 10; ++q) { + for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) { std::swap(*q, *(q - 1)); if (std::distance(p, q) < 2) continue; // omit swap char @@ -994,7 +1080,8 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst, // generate a set of suggestions for very poorly spelled words void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, const char* w, - const std::vector<HashMgr*>& rHMgr) { + const std::vector<HashMgr*>& rHMgr, + int captype) { int lval; int sc; int lp, lpphon; @@ -1071,18 +1158,34 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, u8_u16(w_word, word); u8_u16(w_target, target); } - + std::string f; std::vector<w_char> w_f; - + for (size_t i = 0; i < rHMgr.size(); ++i) { while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { - if ((hp->astr) && (pAMgr) && - (TESTAFF(hp->astr, forbiddenword, hp->alen) || - TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || - TESTAFF(hp->astr, nosuggest, hp->alen) || - TESTAFF(hp->astr, nongramsuggest, hp->alen) || - TESTAFF(hp->astr, onlyincompound, hp->alen))) + // skip exceptions + if ( + // skip it, if the word length different by 5 or + // more characters (to avoid strange suggestions) + // (except Unicode characters over BMP) + (((abs(n - hp->clen) > 4) && !nonbmp)) || + // don't suggest capitalized dictionary words for + // lower case misspellings in ngram suggestions, except + // - PHONE usage, or + // - in the case of German, where not only proper + // nouns are capitalized, or + // - the capitalized word has special pronunciation + ((captype == NOCAP) && (hp->var & H_OPT_INITCAP) && + !ph && (langnum != LANG_de) && !(hp->var & H_OPT_PHON)) || + // or it has one of the following special flags + ((hp->astr) && (pAMgr) && + (TESTAFF(hp->astr, forbiddenword, hp->alen) || + TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || + TESTAFF(hp->astr, nosuggest, hp->alen) || + TESTAFF(hp->astr, nongramsuggest, hp->alen) || + TESTAFF(hp->astr, onlyincompound, hp->alen))) + ) continue; if (utf8) { @@ -1105,7 +1208,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst, sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon; } - // check special pronounciation + // check special pronunciation f.clear(); if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { @@ -1559,7 +1662,8 @@ int SuggestMgr::checkword(const std::string& word, if (rv) { if ((rv->astr) && (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || - TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen))) + TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_substandard(), rv->alen))) return 0; while (rv) { if (rv->astr && @@ -1584,7 +1688,7 @@ int SuggestMgr::checkword(const std::string& word, if (!rv && pAMgr->have_contclass()) { rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL); if (!rv) - rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 1, FLAG_NULL); + rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 0, FLAG_NULL); } // check forbidden words @@ -1649,15 +1753,15 @@ std::string SuggestMgr::suggest_morph(const std::string& in_w) { TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) { if (!HENTRY_FIND(rv, MORPH_STEM)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(MORPH_STEM); result.append(w); } if (HENTRY_DATA(rv)) { - result.append(" "); + result.push_back(MSEP_FLD); result.append(HENTRY_DATA2(rv)); } - result.append("\n"); + result.push_back(MSEP_REC); } rv = rv->next_homonym; } @@ -1713,7 +1817,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { HENTRY_DATA(rv), pattern, 0); if (!aff.empty()) { result.append(aff); - result.append("\n"); + result.push_back(MSEP_REC); } } @@ -1737,7 +1841,7 @@ std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) { rv2->alen, HENTRY_DATA(rv2), pattern, 0); if (!aff.empty()) { result.append(aff); - result.append("\n"); + result.push_back(MSEP_REC); } } } @@ -1936,7 +2040,7 @@ int SuggestMgr::leftcommonsubstring( int l2 = su2.size(); // decapitalize dictionary word if (complexprefixes) { - if (su1[l1 - 1] == su2[l2 - 1]) + if (l1 && l2 && su1[l1 - 1] == su2[l2 - 1]) return 1; } else { unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; diff --git a/libs/hunspell/src/suggestmgr.hxx b/libs/hunspell/src/suggestmgr.hxx index 19ffc03a84..4c2fb69032 100644 --- a/libs/hunspell/src/suggestmgr.hxx +++ b/libs/hunspell/src/suggestmgr.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -78,11 +78,6 @@ #define MAXPHONSUGS 2 #define MAXCOMPOUNDSUGS 3 -// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function -#define TIMELIMIT (CLOCKS_PER_SEC >> 2) -#define MINTIMER 100 -#define MAXPLUSTIMER 100 - #define NGRAM_LONGER_WORSE (1 << 0) #define NGRAM_ANY_MISMATCH (1 << 1) #define NGRAM_LOWERING (1 << 2) @@ -92,7 +87,6 @@ #include "affixmgr.hxx" #include "hashmgr.hxx" #include "langnum.hxx" -#include <time.h> enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; @@ -109,6 +103,7 @@ class SuggestMgr { char* ctry; size_t ctryl; std::vector<w_char> ctry_utf; + bool lang_with_dash_usage; AffixMgr* pAMgr; unsigned int maxSug; @@ -124,8 +119,8 @@ class SuggestMgr { SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr); ~SuggestMgr(); - void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug); - void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr); + bool suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug); + void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr, int captype); std::string suggest_morph(const std::string& word); std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern); @@ -149,7 +144,7 @@ class SuggestMgr { int extrachar(std::vector<std::string>&, const char*, int); int badcharkey(std::vector<std::string>&, const char*, int); int badchar(std::vector<std::string>&, const char*, int); - int twowords(std::vector<std::string>&, const char*, int); + bool twowords(std::vector<std::string>&, const char*, int, bool); void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int); int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int); diff --git a/libs/hunspell/src/utf_info.hxx b/libs/hunspell/src/utf_info.hxx index 6bb847f2a6..9ab9f7a5fe 100644 --- a/libs/hunspell/src/utf_info.hxx +++ b/libs/hunspell/src/utf_info.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -35,9 +35,15 @@ * * ***** END LICENSE BLOCK ***** */ -#include "csutil.hxx" +// Unicode character encoding information +struct unicode_info { + unsigned short c; + unsigned short cupper; + unsigned short clower; +}; + /* fields: Unicode letter, toupper, tolower */ -static struct unicode_info utf_lst[] = { +static const struct unicode_info utf_lst[] = { {0x0041, 0x0041, 0x0061}, {0x0042, 0x0042, 0x0062}, {0x0043, 0x0043, 0x0063}, {0x0044, 0x0044, 0x0064}, {0x0045, 0x0045, 0x0065}, {0x0046, 0x0046, 0x0066}, diff --git a/libs/hunspell/src/w_char.hxx b/libs/hunspell/src/w_char.hxx index 5accb7568f..7e71d04680 100644 --- a/libs/hunspell/src/w_char.hxx +++ b/libs/hunspell/src/w_char.hxx @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with |