summaryrefslogtreecommitdiff
path: root/libs/hunspell/src/hashmgr.c++
diff options
context:
space:
mode:
Diffstat (limited to 'libs/hunspell/src/hashmgr.c++')
-rw-r--r--libs/hunspell/src/hashmgr.c++393
1 files changed, 293 insertions, 100 deletions
diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.c++
index 23421b567a..3ec263de1d 100644
--- a/libs/hunspell/src/hashmgr.c++
+++ b/libs/hunspell/src/hashmgr.c++
@@ -1,7 +1,7 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
- * Copyright (C) 2002-2017 Németh László
+ * Copyright (C) 2002-2022 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
@@ -78,6 +78,7 @@
#include "hashmgr.hxx"
#include "csutil.hxx"
#include "atypes.hxx"
+#include "langnum.hxx"
// build a hash table from a munched word list
@@ -182,13 +183,14 @@ int HashMgr::add_word(const std::string& in_word,
unsigned short* aff,
int al,
const std::string* in_desc,
- bool onlyupcase) {
+ bool onlyupcase,
+ int captype) {
const std::string* word = &in_word;
const std::string* desc = in_desc;
std::string *word_copy = NULL;
std::string *desc_copy = NULL;
- if (!ignorechars.empty() || complexprefixes) {
+ if ((!ignorechars.empty() && !has_no_ignored_chars(in_word, ignorechars)) || complexprefixes) {
word_copy = new std::string(in_word);
if (!ignorechars.empty()) {
@@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word,
hp->astr = aff;
hp->next = NULL;
hp->next_homonym = NULL;
+ hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;
// store the description string or its pointer
if (desc) {
- hp->var = H_OPT;
+ hp->var |= H_OPT;
if (aliasm) {
- hp->var += H_OPT_ALIASM;
+ hp->var |= H_OPT_ALIASM;
store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
} else {
strcpy(hpw + word->size() + 1, desc->c_str());
}
- if (strstr(HENTRY_DATA(hp), MORPH_PHON))
- hp->var += H_OPT_PHON;
- } else
- hp->var = 0;
+ if (strstr(HENTRY_DATA(hp), MORPH_PHON)) {
+ hp->var |= H_OPT_PHON;
+ // store ph: fields (pronunciation, misspellings, old orthography etc.)
+ // of a morphological description in reptable to use in REP replacements.
+ if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO))
+ reptable.reserve(tablesize/MORPH_PHON_RATIO);
+ std::string fields = HENTRY_DATA(hp);
+ std::string::const_iterator iter = fields.begin();
+ std::string::const_iterator start_piece = mystrsep(fields, iter);
+ while (start_piece != fields.end()) {
+ if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
+ std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
+ if (ph.size() > 0) {
+ std::vector<w_char> w;
+ size_t strippatt;
+ std::string wordpart;
+ // dictionary based REP replacement, separated by "->"
+ // for example "pretty ph:prity ph:priti->pretti" to handle
+ // both prity -> pretty and pritier -> prettiest suggestions.
+ if (((strippatt = ph.find("->")) != std::string::npos) &&
+ (strippatt > 0) && (strippatt < ph.size() - 2)) {
+ wordpart = ph.substr(strippatt + 2);
+ ph.erase(ph.begin() + strippatt, ph.end());
+ } else
+ wordpart = in_word;
+ // when the ph: field ends with the character *,
+ // strip last character of the pattern and the replacement
+ // to match in REP suggestions also at character changes,
+ // for example, "pretty ph:prity*" results "prit->prett"
+ // REP replacement instead of "prity->pretty", to get
+ // prity->pretty and pritiest->prettiest suggestions.
+ if (ph.at(ph.size()-1) == '*') {
+ strippatt = 1;
+ size_t stripword = 0;
+ if (utf8) {
+ while ((strippatt < ph.size()) &&
+ ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
+ ++strippatt;
+ while ((stripword < wordpart.size()) &&
+ ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
+ ++stripword;
+ }
+ ++strippatt;
+ ++stripword;
+ if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
+ ph.erase(ph.size()-strippatt, strippatt);
+ wordpart.erase(in_word.size()-stripword, stripword);
+ }
+ }
+ // capitalize lowercase pattern for capitalized words to support
+ // good suggestions also for capitalized misspellings, eg.
+ // Wednesday ph:wendsay
+ // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
+ if (captype==INITCAP) {
+ std::string ph_capitalized;
+ if (utf8) {
+ u8_u16(w, ph);
+ if (get_captype_utf8(w, langnum) == NOCAP) {
+ mkinitcap_utf(w, langnum);
+ u16_u8(ph_capitalized, w);
+ }
+ } else if (get_captype(ph, csconv) == NOCAP)
+ mkinitcap(ph_capitalized, csconv);
+
+ if (ph_capitalized.size() > 0) {
+ // add also lowercase word in the case of German or
+ // Hungarian to support lowercase suggestions lowercased by
+ // compound word generation or derivational suffixes
+ // (for example by adjectival suffix "-i" of geographical
+ // names in Hungarian:
+ // Massachusetts ph:messzecsuzec
+ // messzecsuzeci -> massachusettsi (adjective)
+ // For lowercasing by conditional PFX rules, see
+ // tests/germancompounding test example or the
+ // Hungarian dictionary.)
+ if (langnum == LANG_de || langnum == LANG_hu) {
+ std::string wordpart_lower(wordpart);
+ if (utf8) {
+ u8_u16(w, wordpart_lower);
+ mkallsmall_utf(w, langnum);
+ u16_u8(wordpart_lower, w);
+ } else {
+ mkallsmall(wordpart_lower, csconv);
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph);
+ reptable.back().outstrings[0].assign(wordpart_lower);
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph_capitalized);
+ reptable.back().outstrings[0].assign(wordpart);
+ }
+ }
+ reptable.push_back(replentry());
+ reptable.back().pattern.assign(ph);
+ reptable.back().outstrings[0].assign(wordpart);
+ }
+ }
+ start_piece = mystrsep(fields, iter);
+ }
+ }
+ }
struct hentry* dp = tableptr[i];
if (!dp) {
@@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word,
mkallsmall_utf(w, langnum);
mkinitcap_utf(w, langnum);
u16_u8(st, w);
- return add_word(st, wcl, flags2, flagslen + 1, dp, true);
+ return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
} else {
std::string new_word(word);
mkallsmall(new_word, csconv);
mkinitcap(new_word, csconv);
- int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
+ int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
return ret;
}
}
@@ -405,24 +506,8 @@ int HashMgr::remove_forbidden_flag(const std::string& word) {
if (!dp)
return 1;
while (dp) {
- if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
- if (dp->alen == 1)
- dp->alen = 0; // XXX forbidden words of personal dic.
- else {
- unsigned short* flags2 =
- (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
- if (!flags2)
- return 1;
- int i, j = 0;
- for (i = 0; i < dp->alen; i++) {
- if (dp->astr[i] != forbiddenword)
- flags2[j++] = dp->astr[i];
- }
- dp->alen--;
- free(dp->astr);
- dp->astr = flags2; // XXX allowed forbidden words
- }
- }
+ if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen))
+ dp->alen = 0; // XXX forbidden words of personal dic.
dp = dp->next_homonym;
}
return 0;
@@ -435,7 +520,7 @@ int HashMgr::add(const std::string& word) {
int al = 0;
unsigned short* flags = NULL;
int wcl = get_clen_and_captype(word, &captype);
- add_word(word, wcl, flags, al, NULL, false);
+ add_word(word, wcl, flags, al, NULL, false, captype);
return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
captype);
}
@@ -450,14 +535,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example)
int captype;
int wcl = get_clen_and_captype(word, &captype);
if (aliasf) {
- add_word(word, wcl, dp->astr, dp->alen, NULL, false);
+ add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
} else {
unsigned short* flags =
(unsigned short*)malloc(dp->alen * sizeof(unsigned short));
if (flags) {
memcpy((void*)flags, (void*)dp->astr,
dp->alen * sizeof(unsigned short));
- add_word(word, wcl, flags, dp->alen, NULL, false);
+ add_word(word, wcl, flags, dp->alen, NULL, false, captype);
} else
return 1;
}
@@ -605,7 +690,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
int wcl = get_clen_and_captype(ts, &captype, workbuf);
const std::string *dp_str = dp.empty() ? NULL : &dp;
// add the word and its index plus its capitalized form optionally
- if (add_word(ts, wcl, flags, al, dp_str, false) ||
+ if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
delete dict;
return 5;
@@ -697,7 +782,7 @@ int HashMgr::decode_flags(unsigned short** result, const std::string& flags, Fil
*result = (unsigned short*)malloc(len * sizeof(unsigned short));
if (!*result)
return -1;
- memcpy(*result, &w[0], len * sizeof(short));
+ memcpy(*result, w.data(), len * sizeof(short));
break;
}
default: { // Ispell's one-character flags (erfg -> e r f g)
@@ -768,7 +853,7 @@ bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::strin
size_t len = w.size();
size_t origsize = result.size();
result.resize(origsize + len);
- memcpy(&result[origsize], &w[0], len * sizeof(short));
+ memcpy(result.data() + origsize, w.data(), len * sizeof(short));
break;
}
default: { // Ispell's one-character flags (erfg -> e r f g)
@@ -799,7 +884,7 @@ unsigned short HashMgr::decode_flag(const char* f) const {
std::vector<w_char> w;
u8_u16(w, f);
if (!w.empty())
- memcpy(&s, &w[0], 1 * sizeof(short));
+ memcpy(&s, w.data(), 1 * sizeof(short));
break;
}
default:
@@ -940,8 +1025,19 @@ int HashMgr::load_config(const char* affpath, const char* key) {
if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
complexprefixes = 1;
+ /* parse in the typical fault correcting table */
+ if (line.compare(0, 3, "REP", 3) == 0) {
+ if (!parse_reptable(line, afflst)) {
+ delete afflst;
+ return 1;
+ }
+ }
+
+ // don't check the full affix file, yet
if (((line.compare(0, 3, "SFX", 3) == 0) ||
- (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
+ (line.compare(0, 3, "PFX", 3) == 0)) &&
+ line.size() > 3 && isspace(line[3]) &&
+ !reptable.empty()) // (REP table is in the end of Afrikaans aff file)
break;
}
@@ -1015,43 +1111,41 @@ bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
/* now parse the numaliasf lines to read in the remainder of the table */
for (int j = 0; j < numaliasf; j++) {
std::string nl;
- if (!af->getline(nl))
- return false;
- mychomp(nl);
- i = 0;
aliasf[j] = NULL;
aliasflen[j] = 0;
- iter = nl.begin();
- start_piece = mystrsep(nl, iter);
- while (start_piece != nl.end()) {
- switch (i) {
- case 0: {
- if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
- numaliasf = 0;
- free(aliasf);
- free(aliasflen);
- aliasf = NULL;
- aliasflen = NULL;
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- return false;
+ i = 0;
+ if (af->getline(nl)) {
+ mychomp(nl);
+ iter = nl.begin();
+ start_piece = mystrsep(nl, iter);
+ bool errored = false;
+ while (!errored && start_piece != nl.end()) {
+ switch (i) {
+ case 0: {
+ if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
+ errored = true;
+ break;
+ }
+ break;
}
- break;
- }
- case 1: {
- std::string piece(start_piece, iter);
- aliasflen[j] =
- (unsigned short)decode_flags(&(aliasf[j]), piece, af);
- std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
- break;
+ case 1: {
+ std::string piece(start_piece, iter);
+ aliasflen[j] =
+ (unsigned short)decode_flags(&(aliasf[j]), piece, af);
+ std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
+ break;
+ }
+ default:
+ break;
}
- default:
- break;
+ ++i;
+ start_piece = mystrsep(nl, iter);
}
- ++i;
- start_piece = mystrsep(nl, iter);
}
if (!aliasf[j]) {
+ for (int k = 0; k < j; ++k) {
+ free(aliasf[k]);
+ }
free(aliasf);
free(aliasflen);
aliasf = NULL;
@@ -1130,47 +1224,47 @@ bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
/* now parse the numaliasm lines to read in the remainder of the table */
for (int j = 0; j < numaliasm; j++) {
std::string nl;
- if (!af->getline(nl))
- return false;
- mychomp(nl);
aliasm[j] = NULL;
- iter = nl.begin();
- i = 0;
- start_piece = mystrsep(nl, iter);
- while (start_piece != nl.end()) {
- switch (i) {
- case 0: {
- if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
- af->getlinenum());
- numaliasm = 0;
- free(aliasm);
- aliasm = NULL;
- return false;
+ if (af->getline(nl)) {
+ mychomp(nl);
+ iter = nl.begin();
+ i = 0;
+ start_piece = mystrsep(nl, iter);
+ bool errored = false;
+ while (!errored && start_piece != nl.end()) {
+ switch (i) {
+ case 0: {
+ if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
+ errored = true;
+ break;
+ }
+ break;
}
- break;
- }
- case 1: {
- // add the remaining of the line
- std::string::const_iterator end = nl.end();
- std::string chunk(start_piece, end);
- if (complexprefixes) {
- if (utf8)
- reverseword_utf(chunk);
- else
- reverseword(chunk);
+ case 1: {
+ // add the remaining of the line
+ std::string::const_iterator end = nl.end();
+ std::string chunk(start_piece, end);
+ if (complexprefixes) {
+ if (utf8)
+ reverseword_utf(chunk);
+ else
+ reverseword(chunk);
+ }
+ aliasm[j] = mystrdup(chunk.c_str());
+ break;
}
- aliasm[j] = mystrdup(chunk.c_str());
- break;
+ default:
+ break;
}
- default:
- break;
+ ++i;
+ start_piece = mystrsep(nl, iter);
}
- ++i;
- start_piece = mystrsep(nl, iter);
}
if (!aliasm[j]) {
numaliasm = 0;
+ for (int k = 0; k < j; ++k) {
+ free(aliasm[k]);
+ }
free(aliasm);
aliasm = NULL;
HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
@@ -1191,3 +1285,102 @@ char* HashMgr::get_aliasm(int index) const {
HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
return NULL;
}
+
+/**
+ * Parse in the typical fault correcting (REP) table.
+ *
+ * `line` is the table header: its first field is skipped and its second
+ * field is read as the number of entries.  That many further lines are then
+ * read from `af`; each must start with "REP", followed by a pattern field
+ * and a replacement field.  Both fields are passed through
+ * mystrrep(..., "_", " ").
+ *
+ * A leading '^' and/or a trailing '$' is stripped from the pattern and
+ * selects the outstrings slot that receives the replacement:
+ *   0 = neither, 1 = '^' only, 2 = '$' only, 3 = both.
+ *
+ * Returns false (reporting through HUNSPELL_WARNING) when reptable is not
+ * empty on entry, when the header has no valid entry count, or when an entry
+ * ends up without a pattern or without a replacement; in that last case the
+ * partially filled table is cleared.  Returns true otherwise.
+ */
+bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
+  if (!reptable.empty()) {
+    HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
+                     af->getlinenum());
+    return false;
+  }
+  int numrep = -1;
+  int i = 0;
+  int np = 0;
+  std::string::const_iterator iter = line.begin();
+  std::string::const_iterator start_piece = mystrsep(line, iter);
+  while (start_piece != line.end()) {
+    switch (i) {
+      case 0: {
+        np++;
+        break;
+      }
+      case 1: {
+        numrep = atoi(std::string(start_piece, iter).c_str());
+        if (numrep < 1) {
+          HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
+                           af->getlinenum());
+          return false;
+        }
+        // The count is taken verbatim from the affix file, so only use it
+        // as a bounded hint: a bogus huge value must not force an enormous
+        // up-front allocation.  push_back below still grows the vector as
+        // needed when the table really is larger than the cap.
+        reptable.reserve(numrep < 16384 ? numrep : 16384);
+        np++;
+        break;
+      }
+      default:
+        break;
+    }
+    ++i;
+    start_piece = mystrsep(line, iter);
+  }
+  if (np != 2) {
+    HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
+                     af->getlinenum());
+    return false;
+  }
+
+  /* now parse the numrep lines to read in the remainder of the table */
+  for (int j = 0; j < numrep; ++j) {
+    std::string nl;
+    reptable.push_back(replentry());
+    // stays valid for this iteration: nothing else is appended until the
+    // next pass through the loop
+    replentry& entry = reptable.back();
+    int type = 0;
+    if (af->getline(nl)) {
+      mychomp(nl);
+      iter = nl.begin();
+      i = 0;
+      start_piece = mystrsep(nl, iter);
+      bool errored = false;
+      while (!errored && start_piece != nl.end()) {
+        switch (i) {
+          case 0: {
+            // every entry line has to begin with the REP keyword
+            if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0)
+              errored = true;
+            break;
+          }
+          case 1: {
+            // '^' prefix: skip it and remember it in bit 0 of type
+            if (*start_piece == '^')
+              type = 1;
+            entry.pattern.assign(start_piece + type, iter);
+            mystrrep(entry.pattern, "_", " ");
+            // '$' suffix: drop it and remember it in bit 1 of type
+            if (!entry.pattern.empty() &&
+                entry.pattern[entry.pattern.size() - 1] == '$') {
+              type += 2;
+              entry.pattern.resize(entry.pattern.size() - 1);
+            }
+            break;
+          }
+          case 2: {
+            entry.outstrings[type].assign(start_piece, iter);
+            mystrrep(entry.outstrings[type], "_", " ");
+            break;
+          }
+          default:
+            break;
+        }
+        ++i;
+        start_piece = mystrsep(nl, iter);
+      }
+    }
+    // also reached when getline failed or the keyword check failed:
+    // the freshly pushed entry is then still empty
+    if (entry.pattern.empty() || entry.outstrings[type].empty()) {
+      HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
+                       af->getlinenum());
+      reptable.clear();
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * Read-only access to the replacement table stored in `reptable`.
+ */
+const std::vector<replentry>& HashMgr::get_reptable() const {
+  return this->reptable;
+}