diff options
Diffstat (limited to 'libs/hunspell/src/hashmgr.c++')
-rw-r--r-- | libs/hunspell/src/hashmgr.c++ | 393 |
1 files changed, 293 insertions, 100 deletions
diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.c++ index 23421b567a..3ec263de1d 100644 --- a/libs/hunspell/src/hashmgr.c++ +++ b/libs/hunspell/src/hashmgr.c++ @@ -1,7 +1,7 @@ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * - * Copyright (C) 2002-2017 Németh László + * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with @@ -78,6 +78,7 @@ #include "hashmgr.hxx" #include "csutil.hxx" #include "atypes.hxx" +#include "langnum.hxx" // build a hash table from a munched word list @@ -182,13 +183,14 @@ int HashMgr::add_word(const std::string& in_word, unsigned short* aff, int al, const std::string* in_desc, - bool onlyupcase) { + bool onlyupcase, + int captype) { const std::string* word = &in_word; const std::string* desc = in_desc; std::string *word_copy = NULL; std::string *desc_copy = NULL; - if (!ignorechars.empty() || complexprefixes) { + if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) { word_copy = new std::string(in_word); if (!ignorechars.empty()) { @@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word, hp->astr = aff; hp->next = NULL; hp->next_homonym = NULL; + hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0; // store the description string or its pointer if (desc) { - hp->var = H_OPT; + hp->var |= H_OPT; if (aliasm) { - hp->var += H_OPT_ALIASM; + hp->var |= H_OPT_ALIASM; store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str()))); } else { strcpy(hpw + word->size() + 1, desc->c_str()); } - if (strstr(HENTRY_DATA(hp), MORPH_PHON)) - hp->var += H_OPT_PHON; - } else - hp->var = 0; + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) { + hp->var |= H_OPT_PHON; + // store ph: fields (pronounciation, misspellings, old orthography etc.) + // of a morphological description in reptable to use in REP replacements. + if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO)) + reptable.reserve(tablesize/MORPH_PHON_RATIO); + std::string fields = HENTRY_DATA(hp); + std::string::const_iterator iter = fields.begin(); + std::string::const_iterator start_piece = mystrsep(fields, iter); + while (start_piece != fields.end()) { + if (std::string(start_piece, iter).find(MORPH_PHON) == 0) { + std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1); + if (ph.size() > 0) { + std::vector<w_char> w; + size_t strippatt; + std::string wordpart; + // dictionary based REP replacement, separated by "->" + // for example "pretty ph:prity ph:priti->pretti" to handle + // both prity -> pretty and pritier -> prettiest suggestions. + if (((strippatt = ph.find("->")) != std::string::npos) && + (strippatt > 0) && (strippatt < ph.size() - 2)) { + wordpart = ph.substr(strippatt + 2); + ph.erase(ph.begin() + strippatt, ph.end()); + } else + wordpart = in_word; + // when the ph: field ends with the character *, + // strip last character of the pattern and the replacement + // to match in REP suggestions also at character changes, + // for example, "pretty ph:prity*" results "prit->prett" + // REP replacement instead of "prity->pretty", to get + // prity->pretty and pritiest->prettiest suggestions. + if (ph.at(ph.size()-1) == '*') { + strippatt = 1; + size_t stripword = 0; + if (utf8) { + while ((strippatt < ph.size()) && + ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80)) + ++strippatt; + while ((stripword < wordpart.size()) && + ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80)) + ++stripword; + } + ++strippatt; + ++stripword; + if ((ph.size() > strippatt) && (wordpart.size() > stripword)) { + ph.erase(ph.size()-strippatt, strippatt); + wordpart.erase(in_word.size()-stripword, stripword); + } + } + // capitalize lowercase pattern for capitalized words to support + // good suggestions also for capitalized misspellings, eg. + // Wednesday ph:wendsay + // results wendsay -> Wednesday and Wendsay -> Wednesday, too. + if (captype==INITCAP) { + std::string ph_capitalized; + if (utf8) { + u8_u16(w, ph); + if (get_captype_utf8(w, langnum) == NOCAP) { + mkinitcap_utf(w, langnum); + u16_u8(ph_capitalized, w); + } + } else if (get_captype(ph, csconv) == NOCAP) + mkinitcap(ph_capitalized, csconv); + + if (ph_capitalized.size() > 0) { + // add also lowercase word in the case of German or + // Hungarian to support lowercase suggestions lowercased by + // compound word generation or derivational suffixes + // (for example by adjectival suffix "-i" of geographical + // names in Hungarian: + // Massachusetts ph:messzecsuzec + // messzecsuzeci -> massachusettsi (adjective) + // For lowercasing by conditional PFX rules, see + // tests/germancompounding test example or the + // Hungarian dictionary.) + if (langnum == LANG_de || langnum == LANG_hu) { + std::string wordpart_lower(wordpart); + if (utf8) { + u8_u16(w, wordpart_lower); + mkallsmall_utf(w, langnum); + u16_u8(wordpart_lower, w); + } else { + mkallsmall(wordpart_lower, csconv); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart_lower); + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph_capitalized); + reptable.back().outstrings[0].assign(wordpart); + } + } + reptable.push_back(replentry()); + reptable.back().pattern.assign(ph); + reptable.back().outstrings[0].assign(wordpart); + } + } + start_piece = mystrsep(fields, iter); + } + } + } struct hentry* dp = tableptr[i]; if (!dp) { @@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word, mkallsmall_utf(w, langnum); mkinitcap_utf(w, langnum); u16_u8(st, w); - return add_word(st, wcl, flags2, flagslen + 1, dp, true); + return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP); } else { std::string new_word(word); mkallsmall(new_word, csconv); mkinitcap(new_word, csconv); - int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true); + int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP); return ret; } } @@ -405,24 +506,8 @@ int HashMgr::remove_forbidden_flag(const std::string& word) { if (!dp) return 1; while (dp) { - if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { - if (dp->alen == 1) - dp->alen = 0; // XXX forbidden words of personal dic. - else { - unsigned short* flags2 = - (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1)); - if (!flags2) - return 1; - int i, j = 0; - for (i = 0; i < dp->alen; i++) { - if (dp->astr[i] != forbiddenword) - flags2[j++] = dp->astr[i]; - } - dp->alen--; - free(dp->astr); - dp->astr = flags2; // XXX allowed forbidden words - } - } + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) + dp->alen = 0; // XXX forbidden words of personal dic. dp = dp->next_homonym; } return 0; @@ -435,7 +520,7 @@ int HashMgr::add(const std::string& word) { int al = 0; unsigned short* flags = NULL; int wcl = get_clen_and_captype(word, &captype); - add_word(word, wcl, flags, al, NULL, false); + add_word(word, wcl, flags, al, NULL, false, captype); return add_hidden_capitalized_word(word, wcl, flags, al, NULL, captype); } @@ -450,14 +535,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example) int captype; int wcl = get_clen_and_captype(word, &captype); if (aliasf) { - add_word(word, wcl, dp->astr, dp->alen, NULL, false); + add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype); } else { unsigned short* flags = (unsigned short*)malloc(dp->alen * sizeof(unsigned short)); if (flags) { memcpy((void*)flags, (void*)dp->astr, dp->alen * sizeof(unsigned short)); - add_word(word, wcl, flags, dp->alen, NULL, false); + add_word(word, wcl, flags, dp->alen, NULL, false, captype); } else return 1; } @@ -605,7 +690,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) { int wcl = get_clen_and_captype(ts, &captype, workbuf); const std::string *dp_str = dp.empty() ? NULL : &dp; // add the word and its index plus its capitalized form optionally - if (add_word(ts, wcl, flags, al, dp_str, false) || + if (add_word(ts, wcl, flags, al, dp_str, false, captype) || add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) { delete dict; return 5; @@ -697,7 +782,7 @@ int HashMgr::decode_flags(unsigned short** result, const std::string& flags, Fil *result = (unsigned short*)malloc(len * sizeof(unsigned short)); if (!*result) return -1; - memcpy(*result, &w[0], len * sizeof(short)); + memcpy(*result, w.data(), len * sizeof(short)); break; } default: { // Ispell's one-character flags (erfg -> e r f g) @@ -768,7 +853,7 @@ bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::strin size_t len = w.size(); size_t origsize = result.size(); result.resize(origsize + len); - memcpy(&result[origsize], &w[0], len * sizeof(short)); + memcpy(result.data() + origsize, w.data(), len * sizeof(short)); break; } default: { // Ispell's one-character flags (erfg -> e r f g) @@ -799,7 +884,7 @@ unsigned short HashMgr::decode_flag(const char* f) const { std::vector<w_char> w; u8_u16(w, f); if (!w.empty()) - memcpy(&s, &w[0], 1 * sizeof(short)); + memcpy(&s, w.data(), 1 * sizeof(short)); break; } default: @@ -940,8 +1025,19 @@ int HashMgr::load_config(const char* affpath, const char* key) { if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0) complexprefixes = 1; + /* parse in the typical fault correcting table */ + if (line.compare(0, 3, "REP", 3) == 0) { + if (!parse_reptable(line, afflst)) { + delete afflst; + return 1; + } + } + + // don't check the full affix file, yet if (((line.compare(0, 3, "SFX", 3) == 0) || - (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3])) + (line.compare(0, 3, "PFX", 3) == 0)) && + line.size() > 3 && isspace(line[3]) && + !reptable.empty()) // (REP table is in the end of Afrikaans aff file) break; } @@ -1015,43 +1111,41 @@ bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { /* now parse the numaliasf lines to read in the remainder of the table */ for (int j = 0; j < numaliasf; j++) { std::string nl; - if (!af->getline(nl)) - return false; - mychomp(nl); - i = 0; aliasf[j] = NULL; aliasflen[j] = 0; - iter = nl.begin(); - start_piece = mystrsep(nl, iter); - while (start_piece != nl.end()) { - switch (i) { - case 0: { - if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { - numaliasf = 0; - free(aliasf); - free(aliasflen); - aliasf = NULL; - aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - return false; + i = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) { + errored = true; + break; + } + break; } - break; - } - case 1: { - std::string piece(start_piece, iter); - aliasflen[j] = - (unsigned short)decode_flags(&(aliasf[j]), piece, af); - std::sort(aliasf[j], aliasf[j] + aliasflen[j]); - break; + case 1: { + std::string piece(start_piece, iter); + aliasflen[j] = + (unsigned short)decode_flags(&(aliasf[j]), piece, af); + std::sort(aliasf[j], aliasf[j] + aliasflen[j]); + break; + } + default: + break; } - default: - break; + ++i; + start_piece = mystrsep(nl, iter); } - ++i; - start_piece = mystrsep(nl, iter); } if (!aliasf[j]) { + for (int k = 0; k < j; ++k) { + free(aliasf[k]); + } free(aliasf); free(aliasflen); aliasf = NULL; @@ -1130,47 +1224,47 @@ bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) { /* now parse the numaliasm lines to read in the remainder of the table */ for (int j = 0; j < numaliasm; j++) { std::string nl; - if (!af->getline(nl)) - return false; - mychomp(nl); aliasm[j] = NULL; - iter = nl.begin(); - i = 0; - start_piece = mystrsep(nl, iter); - while (start_piece != nl.end()) { - switch (i) { - case 0: { - if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", - af->getlinenum()); - numaliasm = 0; - free(aliasm); - aliasm = NULL; - return false; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) { + errored = true; + break; + } + break; } - break; - } - case 1: { - // add the remaining of the line - std::string::const_iterator end = nl.end(); - std::string chunk(start_piece, end); - if (complexprefixes) { - if (utf8) - reverseword_utf(chunk); - else - reverseword(chunk); + case 1: { + // add the remaining of the line + std::string::const_iterator end = nl.end(); + std::string chunk(start_piece, end); + if (complexprefixes) { + if (utf8) + reverseword_utf(chunk); + else + reverseword(chunk); + } + aliasm[j] = mystrdup(chunk.c_str()); + break; } - aliasm[j] = mystrdup(chunk.c_str()); - break; + default: + break; } - default: - break; + ++i; + start_piece = mystrsep(nl, iter); } - ++i; - start_piece = mystrsep(nl, iter); } if (!aliasm[j]) { numaliasm = 0; + for (int k = 0; k < j; ++k) { + free(aliasm[k]); + } free(aliasm); aliasm = NULL; HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", @@ -1191,3 +1285,102 @@ char* HashMgr::get_aliasm(int index) const { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } + +/* parse in the typical fault correcting table */ +bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) { + if (!reptable.empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", + af->getlinenum()); + return false; + } + int numrep = -1; + int i = 0; + int np = 0; + std::string::const_iterator iter = line.begin(); + std::string::const_iterator start_piece = mystrsep(line, iter); + while (start_piece != line.end()) { + switch (i) { + case 0: { + np++; + break; + } + case 1: { + numrep = atoi(std::string(start_piece, iter).c_str()); + if (numrep < 1) { + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", + af->getlinenum()); + return false; + } + reptable.reserve(numrep); + np++; + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(line, iter); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", + af->getlinenum()); + return false; + } + + /* now parse the numrep lines to read in the remainder of the table */ + for (int j = 0; j < numrep; ++j) { + std::string nl; + reptable.push_back(replentry()); + int type = 0; + if (af->getline(nl)) { + mychomp(nl); + iter = nl.begin(); + i = 0; + start_piece = mystrsep(nl, iter); + bool errored = false; + while (!errored && start_piece != nl.end()) { + switch (i) { + case 0: { + if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) { + errored = true; + break; + } + break; + } + case 1: { + if (*start_piece == '^') + type = 1; + reptable.back().pattern.assign(start_piece + type, iter); + mystrrep(reptable.back().pattern, "_", " "); + if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') { + type += 2; + reptable.back().pattern.resize(reptable.back().pattern.size() - 1); + } + break; + } + case 2: { + reptable.back().outstrings[type].assign(start_piece, iter); + mystrrep(reptable.back().outstrings[type], "_", " "); + break; + } + default: + break; + } + ++i; + start_piece = mystrsep(nl, iter); + } + } + if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", + af->getlinenum()); + reptable.clear(); + return false; + } + } + return true; +} + +// return replacing table +const std::vector<replentry>& HashMgr::get_reptable() const { + return reptable; +} |