summaryrefslogtreecommitdiff
path: root/libs/hunspell/src/affixmgr.c++
diff options
context:
space:
mode:
Diffstat (limited to 'libs/hunspell/src/affixmgr.c++')
-rw-r--r--libs/hunspell/src/affixmgr.c++381
1 files changed, 185 insertions, 196 deletions
diff --git a/libs/hunspell/src/affixmgr.c++ b/libs/hunspell/src/affixmgr.c++
index d21ff49573..d6bb677982 100644
--- a/libs/hunspell/src/affixmgr.c++
+++ b/libs/hunspell/src/affixmgr.c++
@@ -72,13 +72,13 @@
*/
#include <stdlib.h>
-#include <string>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
+#include <algorithm>
#include <limits>
-
+#include <string>
#include <vector>
#include "affixmgr.hxx"
@@ -152,11 +152,7 @@ AffixMgr::AffixMgr(const char* affpath,
cpdsyllablenum = NULL; // syllable count incrementing flag
checknum = 0; // checking numbers, and word with numbers
wordchars = NULL; // letters + spec. word characters
- wordchars_utf16 = NULL; // letters + spec. word characters
- wordchars_utf16_len = 0; // letters + spec. word characters
ignorechars = NULL; // letters + spec. word characters
- ignorechars_utf16 = NULL; // letters + spec. word characters
- ignorechars_utf16_len = 0; // letters + spec. word characters
version = NULL; // affix and dictionary file version string
havecontclass = 0; // flags of possible continuing classes (double affix)
// LEMMA_PRESENT: not put root into the morphological output. Lemma presents
@@ -336,12 +332,8 @@ AffixMgr::~AffixMgr() {
free(lang);
if (wordchars)
free(wordchars);
- if (wordchars_utf16)
- free(wordchars_utf16);
if (ignorechars)
free(ignorechars);
- if (ignorechars_utf16)
- free(ignorechars_utf16);
if (version)
free(version);
checknum = 0;
@@ -632,8 +624,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
/* parse in the extra word characters */
if (strncmp(line, "WORDCHARS", 9) == 0) {
- if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len,
- utf8, afflst->getlinenum())) {
+ if (!parse_array(line, &wordchars, wordchars_utf16,
+ utf8, afflst->getlinenum())) {
finishFileMgr(afflst);
return 1;
}
@@ -642,8 +634,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
/* parse in the ignored characters (for example, Arabic optional diacretics
* charachters */
if (strncmp(line, "IGNORE", 6) == 0) {
- if (parse_array(line, &ignorechars, &ignorechars_utf16,
- &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
+ if (!parse_array(line, &ignorechars, ignorechars_utf16,
+ utf8, afflst->getlinenum())) {
finishFileMgr(afflst);
return 1;
}
@@ -1174,7 +1166,7 @@ std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
}
// calculate the character length of the condition
-int AffixMgr::condlen(char* st) {
+int AffixMgr::condlen(const char* st) {
int l = 0;
bool group = false;
for (; *st; st++) {
@@ -1189,7 +1181,7 @@ int AffixMgr::condlen(char* st) {
return l;
}
-int AffixMgr::encodeit(affentry& entry, char* cs) {
+int AffixMgr::encodeit(affentry& entry, const char* cs) {
if (strcmp(cs, ".") != 0) {
entry.numconds = (char)condlen(cs);
// coverity[buffer_size_warning] - deliberate use of lack of end of conds
@@ -1328,7 +1320,6 @@ char* AffixMgr::prefix_check_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- char* st;
char result[MAXLNLEN];
result[0] = '\0';
@@ -1340,7 +1331,7 @@ char* AffixMgr::prefix_check_morph(const char* word,
// first handle the special case of 0 length prefixes
PfxEntry* pe = pStart[0];
while (pe) {
- st = pe->check_morph(word, len, in_compound, needflag);
+ char* st = pe->check_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1355,7 +1346,7 @@ char* AffixMgr::prefix_check_morph(const char* word,
while (pptr) {
if (isSubset(pptr->getKey(), word)) {
- st = pptr->check_morph(word, len, in_compound, needflag);
+ char* st = pptr->check_morph(word, len, in_compound, needflag);
if (st) {
// fogemorpheme
if ((in_compound != IN_CPD_NOT) ||
@@ -1382,8 +1373,6 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- char* st;
-
char result[MAXLNLEN];
result[0] = '\0';
@@ -1394,7 +1383,7 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
// first handle the special case of 0 length prefixes
PfxEntry* pe = pStart[0];
while (pe) {
- st = pe->check_twosfx_morph(word, len, in_compound, needflag);
+ char* st = pe->check_twosfx_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1408,7 +1397,7 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
while (pptr) {
if (isSubset(pptr->getKey(), word)) {
- st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
+ char* st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1427,13 +1416,12 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
// Is word a non compound with a REP substitution (see checkcompoundrep)?
int AffixMgr::cpdrep_check(const char* word, int wl) {
- const char* r;
if ((wl < 2) || !numrep)
return 0;
for (int i = 0; i < numrep; i++) {
- r = word;
+ const char* r = word;
int lenp = strlen(reptable[i].pattern);
// search every occurence of the pattern in the word
while ((r = strstr(r, reptable[i].pattern)) != NULL) {
@@ -1478,14 +1466,14 @@ int AffixMgr::cpdpat_check(const char* word,
// bounds
int AffixMgr::cpdcase_check(const char* word, int pos) {
if (utf8) {
- w_char u, w;
const char* p;
- u8_u16(&u, 1, word + pos);
for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
;
- u8_u16(&w, 1, p);
- unsigned short a = (u.h << 8) + u.l;
- unsigned short b = (w.h << 8) + w.l;
+ std::string pair(p);
+ std::vector<w_char> pair_u;
+ u8_u16(pair_u, pair);
+ unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
+ unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
if (((unicodetoupper(a, langnum) == a) ||
(unicodetoupper(b, langnum) == b)) &&
(a != '-') && (b != '-'))
@@ -1499,20 +1487,18 @@ int AffixMgr::cpdcase_check(const char* word, int pos) {
return 0;
}
+struct metachar_data {
+ signed short btpp; // metacharacter (*, ?) position for backtracking
+ signed short btwp; // word position for metacharacters
+ int btnum; // number of matched characters in metacharacter
+};
+
// check compound patterns
int AffixMgr::defcpd_check(hentry*** words,
short wnum,
hentry* rv,
hentry** def,
char all) {
- signed short
- btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
- signed short btwp[MAXWORDLEN]; // word positions for metacharacters
- int btnum[MAXWORDLEN]; // number of matched characters in metacharacter
- // positions
- short bt = 0;
- int i, j;
- int ok;
int w = 0;
if (!*words) {
@@ -1524,6 +1510,11 @@ int AffixMgr::defcpd_check(hentry*** words,
return 0;
}
+ std::vector<metachar_data> btinfo(1);
+
+ short bt = 0;
+ int i, j;
+
(*words)[wnum] = rv;
// has the last word COMPOUNDRULE flag?
@@ -1533,7 +1524,7 @@ int AffixMgr::defcpd_check(hentry*** words,
*words = NULL;
return 0;
}
- ok = 0;
+ int ok = 0;
for (i = 0; i < numdefcpd; i++) {
for (j = 0; j < defcpdtable[i].len; j++) {
if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
@@ -1564,8 +1555,8 @@ int AffixMgr::defcpd_check(hentry*** words,
int wend = (defcpdtable[i].def[pp + 1] == '?') ? wp : wnum;
ok2 = 1;
pp += 2;
- btpp[bt] = pp;
- btwp[bt] = wp;
+ btinfo[bt].btpp = pp;
+ btinfo[bt].btwp = wp;
while (wp <= wend) {
if (!(*words)[wp]->alen ||
!TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp - 2],
@@ -1577,9 +1568,11 @@ int AffixMgr::defcpd_check(hentry*** words,
}
if (wp <= wnum)
ok2 = 0;
- btnum[bt] = wp - btwp[bt];
- if (btnum[bt] > 0)
- bt++;
+ btinfo[bt].btnum = wp - btinfo[bt].btwp;
+ if (btinfo[bt].btnum > 0) {
+ ++bt;
+ btinfo.resize(bt+1);
+ }
if (ok2)
break;
} else {
@@ -1609,10 +1602,10 @@ int AffixMgr::defcpd_check(hentry*** words,
if (bt)
do {
ok = 1;
- btnum[bt - 1]--;
- pp = btpp[bt - 1];
- wp = btwp[bt - 1] + (signed short)btnum[bt - 1];
- } while ((btnum[bt - 1] < 0) && --bt);
+ btinfo[bt - 1].btnum--;
+ pp = btinfo[bt - 1].btpp;
+ wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
+ } while ((btinfo[bt - 1].btnum < 0) && --bt);
} while (bt);
if (ok && ok2 && (!all || (defcpdtable[i].len <= pp)))
@@ -1650,24 +1643,26 @@ inline int AffixMgr::candidate_check(const char* word, int len) {
}
// calculate number of syllable for compound-checking
-short AffixMgr::get_syllable(const char* word, int wlen) {
+short AffixMgr::get_syllable(const std::string& word) {
if (cpdmaxsyllable == 0)
return 0;
short num = 0;
if (!utf8) {
- for (int i = 0; i < wlen; i++) {
+ for (size_t i = 0; i < word.size(); ++i) {
if (strchr(cpdvowels, word[i]))
num++;
}
} else if (cpdvowels_utf16) {
- w_char w[MAXWORDUTF8LEN];
- int i = u8_u16(w, MAXWORDUTF8LEN, word);
+ std::vector<w_char> w;
+ int i = u8_u16(w, word);
for (; i > 0; i--) {
- if (flag_bsearch((unsigned short*)cpdvowels_utf16,
- ((unsigned short*)w)[i - 1], cpdvowels_utf16_len))
- num++;
+ if (std::binary_search(cpdvowels_utf16,
+ cpdvowels_utf16 + cpdvowels_utf16_len,
+ w[i - 1])) {
+ ++num;
+ }
}
}
return num;
@@ -1676,12 +1671,12 @@ short AffixMgr::get_syllable(const char* word, int wlen) {
void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
if (utf8) {
int i;
- for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
- for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
+ for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
+ for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
;
}
- for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
- for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
+ for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
+ for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
;
}
} else {
@@ -1699,6 +1694,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
short maxwordnum,
short wnum,
hentry** words = NULL,
+ hentry** rwords = NULL,
char hu_mov_rule = 0,
char is_sug = 0,
int* info = NULL) {
@@ -1706,8 +1702,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
struct hentry* rv = NULL;
struct hentry* rv_first;
- struct hentry* rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
- char st[MAXWORDUTF8LEN + 4];
+ std::string st;
char ch = '\0';
int cmin;
int cmax;
@@ -1726,7 +1721,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
setcminmax(&cmin, &cmax, word, len);
- strcpy(st, word);
+ st.assign(word);
for (i = cmin; i < cmax; i++) {
// go to end of the UTF-8 character
@@ -1758,11 +1753,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (scpd > numcheckcpd)
break; // break simplified checkcompoundpattern loop
- strcpy(st + i, checkcpdtable[scpd - 1].pattern);
+ st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
soldi = i;
i += strlen(checkcpdtable[scpd - 1].pattern);
- strcpy(st + i, checkcpdtable[scpd - 1].pattern2);
- strcpy(st + i + strlen(checkcpdtable[scpd - 1].pattern2),
+ st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
+ st.replace(i + strlen(checkcpdtable[scpd - 1].pattern2), std::string::npos,
word + soldi + strlen(checkcpdtable[scpd - 1].pattern3));
oldlen = len;
@@ -1771,7 +1766,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
strlen(checkcpdtable[scpd - 1].pattern3);
oldcmin = cmin;
oldcmax = cmax;
- setcminmax(&cmin, &cmax, st, len);
+ setcminmax(&cmin, &cmax, st.c_str(), len);
cmax = len - cpdmin + 1;
}
@@ -1785,7 +1780,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
// FIRST WORD
affixed = 1;
- rv = lookup(st); // perhaps without prefix
+ rv = lookup(st.c_str()); // perhaps without prefix
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
@@ -1798,9 +1793,9 @@ struct hentry* AffixMgr::compound_check(const char* word,
TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
(numdefcpd && onlycpdrule &&
((!words && !wordnum &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0)) ||
+ defcpd_check(&words, wnum, rv, rwords, 0)) ||
(words &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0))))) ||
+ defcpd_check(&words, wnum, rv, rwords, 0))))) ||
(scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
!TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
rv = rv->next_homonym;
@@ -1813,14 +1808,14 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (onlycpdrule)
break;
if (compoundflag &&
- !(rv = prefix_check(st, i,
+ !(rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundflag))) {
if (((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
- (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) &&
+ (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
!hu_mov_rule && sfx->getCont() &&
((compoundforbidflag &&
TESTAFF(sfx->getCont(), compoundforbidflag,
@@ -1834,24 +1829,24 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (rv ||
(((wordnum == 0) && compoundbegin &&
((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundbegin))) || // twofold suffixes + compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundbegin)))) ||
((wordnum > 0) && compoundmiddle &&
((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundmiddle))) || // twofold suffixes + compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundmiddle))))))
checked_prefix = 1;
@@ -1942,7 +1937,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
cpdcase_check(word, i))))
// LANG_hu section: spec. Hungarian rule
|| ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
- (rv = affix_check(st, i)) &&
+ (rv = affix_check(st.c_str(), i)) &&
(sfx && sfx->getCont() &&
( // XXX hardwired Hungarian dic. codes
TESTAFF(sfx->getCont(), (unsigned short)'x',
@@ -1954,10 +1949,10 @@ struct hentry* AffixMgr::compound_check(const char* word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st, i);
+ numsyllable += get_syllable(st.substr(i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
}
// END of LANG_hu section
@@ -1977,7 +1972,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
striple = 1;
}
- rv = lookup((st + i)); // perhaps without prefix
+ rv = lookup(st.c_str() + i); // perhaps without prefix
// search homonym with compound flag
while ((rv) &&
@@ -2039,7 +2034,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
- (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen) <=
+ (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->clen)) <=
cpdmaxsyllable))) &&
(
// test CHECKCOMPOUNDPATTERN
@@ -2123,20 +2118,19 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(word + i, strlen(word + i));
+ numsyllable += get_syllable(word + i);
// - affix syllable num.
// XXX only second suffix (inflections, not derivations)
if (sfxappnd) {
- char* tmp = myrevstrdup(sfxappnd);
- numsyllable -= get_syllable(tmp, strlen(tmp)) + sfxextra;
- free(tmp);
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx &&
- (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
// increment syllable num, if last word has a SYLLABLENUM flag
@@ -2187,8 +2181,8 @@ struct hentry* AffixMgr::compound_check(const char* word,
// perhaps second word is a compound word (recursive call)
if (wordnum < maxwordnum) {
- rv = compound_check((st + i), strlen(st + i), wordnum + 1,
- numsyllable, maxwordnum, wnum + 1, words, 0,
+ rv = compound_check(st.c_str() + i, strlen(st.c_str() + i), wordnum + 1,
+ numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
is_sug, info);
if (rv && numcheckcpd &&
@@ -2211,11 +2205,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
// check first part
if (strncmp(rv->word, word + i, rv->blen) == 0) {
- char r = *(st + i + rv->blen);
- *(st + i + rv->blen) = '\0';
+ char r = st[i + rv->blen];
+ st[i + rv->blen] = '\0';
- if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
- *(st + i + rv->blen) = r;
+ if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
+ st[ + i + rv->blen] = r;
continue;
}
@@ -2225,11 +2219,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
rv2 = affix_check(word, len);
if (rv2 && rv2->astr &&
TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
- (strncmp(rv2->word, st, i + rv->blen) == 0)) {
+ (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
return NULL;
}
}
- *(st + i + rv->blen) = r;
+ st[i + rv->blen] = r;
}
}
return rv_first;
@@ -2262,7 +2256,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (soldi != 0) {
i = soldi;
- strcpy(st, word); // XXX add more optim.
+ st.assign(word); // XXX add more optim.
soldi = 0;
} else
st[i] = ch;
@@ -2283,6 +2277,7 @@ int AffixMgr::compound_check_morph(const char* word,
short maxwordnum,
short wnum,
hentry** words,
+ hentry** rwords,
char hu_mov_rule = 0,
char** result = NULL,
char* partresult = NULL) {
@@ -2292,8 +2287,7 @@ int AffixMgr::compound_check_morph(const char* word,
struct hentry* rv = NULL;
struct hentry* rv_first;
- struct hentry* rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
- char st[MAXWORDUTF8LEN + 4];
+ std::string st;
char ch;
int checked_prefix;
@@ -2308,7 +2302,7 @@ int AffixMgr::compound_check_morph(const char* word,
setcminmax(&cmin, &cmax, word, len);
- strcpy(st, word);
+ st.assign(word);
for (i = cmin; i < cmax; i++) {
// go to end of the UTF-8 character
@@ -2340,7 +2334,7 @@ int AffixMgr::compound_check_morph(const char* word,
if (partresult)
mystrcat(presult, partresult, MAXLNLEN);
- rv = lookup(st); // perhaps without prefix
+ rv = lookup(st.c_str()); // perhaps without prefix
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
@@ -2353,9 +2347,9 @@ int AffixMgr::compound_check_morph(const char* word,
TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
(numdefcpd && onlycpdrule &&
((!words && !wordnum &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0)) ||
+ defcpd_check(&words, wnum, rv, rwords, 0)) ||
(words &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0))))))) {
+ defcpd_check(&words, wnum, rv, rwords, 0))))))) {
rv = rv->next_homonym;
}
@@ -2363,10 +2357,10 @@ int AffixMgr::compound_check_morph(const char* word,
affixed = 0;
if (rv) {
- sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
+ sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st.c_str());
if (!HENTRY_FIND(rv, MORPH_STEM)) {
sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM,
- st);
+ st.c_str());
}
// store the pointer of the hash entry
// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD,
@@ -2382,13 +2376,13 @@ int AffixMgr::compound_check_morph(const char* word,
break;
if (compoundflag &&
!(rv =
- prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
+ prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundflag))) {
- if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ if (((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundflag,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
- (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) &&
+ (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
!hu_mov_rule && sfx->getCont() &&
((compoundforbidflag &&
TESTAFF(sfx->getCont(), compoundforbidflag,
@@ -2401,44 +2395,44 @@ int AffixMgr::compound_check_morph(const char* word,
if (rv ||
(((wordnum == 0) && compoundbegin &&
- ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundbegin,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundbegin))) || // twofold suffix+compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundbegin)))) ||
((wordnum > 0) && compoundmiddle &&
- ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundmiddle,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundmiddle))) || // twofold suffix+compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundmiddle)))))) {
// char * p = prefix_check_morph(st, i, 0, compound);
char* p = NULL;
if (compoundflag)
- p = affix_check_morph(st, i, compoundflag);
+ p = affix_check_morph(st.c_str(), i, compoundflag);
if (!p || (*p == '\0')) {
if (p)
free(p);
p = NULL;
if ((wordnum == 0) && compoundbegin) {
- p = affix_check_morph(st, i, compoundbegin);
+ p = affix_check_morph(st.c_str(), i, compoundbegin);
} else if ((wordnum > 0) && compoundmiddle) {
- p = affix_check_morph(st, i, compoundmiddle);
+ p = affix_check_morph(st.c_str(), i, compoundmiddle);
}
}
if (p && (*p != '\0')) {
sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, MORPH_PART,
- st, line_uniq_app(&p, MSEP_REC));
+ st.c_str(), line_uniq_app(&p, MSEP_REC));
}
if (p)
free(p);
@@ -2519,7 +2513,7 @@ int AffixMgr::compound_check_morph(const char* word,
// LANG_hu section: spec. Hungarian rule
||
((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
- (rv = affix_check(st, i)) &&
+ (rv = affix_check(st.c_str(), i)) &&
(sfx && sfx->getCont() &&
(TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
@@ -2528,11 +2522,11 @@ int AffixMgr::compound_check_morph(const char* word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st, i);
+ numsyllable += get_syllable(st.substr(i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
}
// END of LANG_hu section
@@ -2608,7 +2602,7 @@ int AffixMgr::compound_check_morph(const char* word,
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
- (numsyllable + get_syllable(HENTRY_WORD(rv), rv->blen) <=
+ (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
cpdmaxsyllable))) &&
((!checkcompounddup || (rv != rv_first)))) {
// bad compound word
@@ -2701,19 +2695,19 @@ int AffixMgr::compound_check_morph(const char* word,
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(word + i, strlen(word + i));
+ numsyllable += get_syllable(word + i);
// - affix syllable num.
// XXX only second suffix (inflections, not derivations)
if (sfxappnd) {
- char* tmp = myrevstrdup(sfxappnd);
- numsyllable -= get_syllable(tmp, strlen(tmp)) + sfxextra;
- free(tmp);
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
// increment syllable num, if last word has a SYLLABLENUM flag
@@ -2779,7 +2773,7 @@ int AffixMgr::compound_check_morph(const char* word,
// perhaps second word is a compound word (recursive call)
if ((wordnum < maxwordnum) && (ok == 0)) {
compound_check_morph((word + i), strlen(word + i), wordnum + 1,
- numsyllable, maxwordnum, wnum + 1, words, 0,
+ numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
result, presult);
} else {
rv = NULL;
@@ -2795,6 +2789,7 @@ int AffixMgr::compound_check_morph(const char* word,
return 0;
}
+
// return 1 if s1 (reversed) is a leading subset of end of s2
/* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int
len)
@@ -3402,7 +3397,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst,
unsigned short al,
const char* bad,
int badl,
- char* phon) {
+ const char* phon) {
int nh = 0;
// first add root word to list
if ((nh < maxn) &&
@@ -3653,8 +3648,7 @@ char* AffixMgr::get_ignore() const {
}
// return the preferred ignore string for suggestions
-unsigned short* AffixMgr::get_ignore_utf16(int* len) const {
- *len = ignorechars_utf16_len;
+const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
return ignorechars_utf16;
}
@@ -3677,8 +3671,7 @@ const char* AffixMgr::get_wordchars() const {
return wordchars;
}
-unsigned short* AffixMgr::get_wordchars_utf16(int* len) const {
- *len = wordchars_utf16_len;
+const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
return wordchars_utf16;
}
@@ -3840,7 +3833,6 @@ int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) {
char* piece;
int i = 0;
int np = 0;
- w_char w[MAXWORDLEN];
piece = mystrsep(&tp, 0);
while (piece) {
if (*piece != '\0') {
@@ -3858,15 +3850,16 @@ int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) {
if (!utf8) {
cpdvowels = mystrdup(piece);
} else {
- int n = u8_u16(w, MAXWORDLEN, piece);
- if (n > 0) {
- flag_qsort((unsigned short*)w, 0, n);
- cpdvowels_utf16 = (w_char*)malloc(n * sizeof(w_char));
+ std::vector<w_char> w;
+ u8_u16(w, piece);
+ if (!w.empty()) {
+ std::sort(w.begin(), w.end());
+ cpdvowels_utf16 = (w_char*)malloc(w.size() * sizeof(w_char));
if (!cpdvowels_utf16)
return 1;
- memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
+ memcpy(cpdvowels_utf16, &w[0], w.size());
}
- cpdvowels_utf16_len = n;
+ cpdvowels_utf16_len = w.size();
}
np++;
break;
@@ -4636,13 +4629,16 @@ int AffixMgr::parse_breaktable(char* line, FileMgr* af) {
return 0;
}
-void AffixMgr::reverse_condition(char* piece) {
+void AffixMgr::reverse_condition(std::string& piece) {
+ if (piece.empty())
+ return;
+
int neg = 0;
- for (char* k = piece + strlen(piece) - 1; k >= piece; k--) {
+ for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
switch (*k) {
case '[': {
if (neg)
- *(k + 1) = '[';
+ *(k - 1) = '[';
else
*k = ']';
break;
@@ -4650,20 +4646,20 @@ void AffixMgr::reverse_condition(char* piece) {
case ']': {
*k = '[';
if (neg)
- *(k + 1) = '^';
+ *(k - 1) = '^';
neg = 0;
break;
}
case '^': {
- if (*(k + 1) == ']')
+ if (*(k - 1) == ']')
neg = 1;
else
- *(k + 1) = *k;
+ *(k - 1) = *k;
break;
}
default: {
if (neg)
- *(k + 1) = *k;
+ *(k - 1) = *k;
}
}
}
@@ -4731,8 +4727,8 @@ int AffixMgr::parse_affix(char* line,
case 3: {
np++;
numents = atoi(piece);
- if ((numents <= 0) || ((::std::numeric_limits<size_t>::max() /
- sizeof(struct affentry)) < numents)) {
+ if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
+ sizeof(struct affentry)) < static_cast<size_t>(numents))) {
char* err = pHMgr->encode_flag(aflag);
if (err) {
HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
@@ -4817,18 +4813,15 @@ int AffixMgr::parse_affix(char* line,
// piece 3 - is string to strip or 0 for null
case 2: {
np++;
+ entry->strip = piece;
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->strip);
else
- reverseword(piece);
+ reverseword(entry->strip);
}
- entry->strip = mystrdup(piece);
- entry->stripl = (unsigned char)strlen(entry->strip);
- if (strcmp(entry->strip, "0") == 0) {
- free(entry->strip);
- entry->strip = mystrdup("");
- entry->stripl = 0;
+ if (entry->strip.compare("0") == 0) {
+ entry->strip.clear();
}
break;
}
@@ -4844,22 +4837,22 @@ int AffixMgr::parse_affix(char* line,
if (dash) {
*dash = '\0';
+ entry->appnd = piece;
+
if (ignorechars) {
if (utf8) {
- remove_ignored_chars_utf(piece, ignorechars_utf16,
- ignorechars_utf16_len);
+ remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
- remove_ignored_chars(piece, ignorechars);
+ remove_ignored_chars(entry->appnd, ignorechars);
}
}
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->appnd);
else
- reverseword(piece);
+ reverseword(entry->appnd);
}
- entry->appnd = mystrdup(piece);
if (pHMgr->is_aliasf()) {
int index = atoi(dash + 1);
@@ -4872,7 +4865,7 @@ int AffixMgr::parse_affix(char* line,
} else {
entry->contclasslen = (unsigned short)pHMgr->decode_flags(
&(entry->contclass), dash + 1, af);
- flag_qsort(entry->contclass, 0, entry->contclasslen);
+ std::sort(entry->contclass, entry->contclass + entry->contclasslen);
}
*dash = '/';
@@ -4881,74 +4874,74 @@ int AffixMgr::parse_affix(char* line,
contclasses[(entry->contclass)[_i]] = 1;
}
} else {
+ entry->appnd = piece;
+
if (ignorechars) {
if (utf8) {
- remove_ignored_chars_utf(piece, ignorechars_utf16,
- ignorechars_utf16_len);
+ remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
- remove_ignored_chars(piece, ignorechars);
+ remove_ignored_chars(entry->appnd, ignorechars);
}
}
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->appnd);
else
- reverseword(piece);
+ reverseword(entry->appnd);
}
- entry->appnd = mystrdup(piece);
}
- entry->appndl = (unsigned char)strlen(entry->appnd);
- if (strcmp(entry->appnd, "0") == 0) {
- free(entry->appnd);
- entry->appnd = mystrdup("");
- entry->appndl = 0;
+ if (entry->appnd.compare("0") == 0) {
+ entry->appnd.clear();
}
break;
}
// piece 5 - is the conditions descriptions
case 4: {
+ std::string chunk(piece);
np++;
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(chunk);
else
- reverseword(piece);
- reverse_condition(piece);
+ reverseword(chunk);
+ reverse_condition(chunk);
}
- if (entry->stripl && (strcmp(piece, ".") != 0) &&
- redundant_condition(at, entry->strip, entry->stripl, piece,
+ if (!entry->strip.empty() && chunk != "." &&
+ redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
af->getlinenum()))
- strcpy(piece, ".");
+ chunk = ".";
if (at == 'S') {
- reverseword(piece);
- reverse_condition(piece);
+ reverseword(chunk);
+ reverse_condition(chunk);
}
- if (encodeit(*entry, piece))
+ if (encodeit(*entry, chunk.c_str()))
return 1;
break;
}
case 5: {
+ std::string chunk(piece);
np++;
if (pHMgr->is_aliasm()) {
- int index = atoi(piece);
+ int index = atoi(chunk.c_str());
entry->morphcode = pHMgr->get_aliasm(index);
} else {
if (complexprefixes) { // XXX - fix me for morph. gen.
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(chunk);
else
- reverseword(piece);
+ reverseword(chunk);
}
// add the remaining of the line
if (*tp) {
*(tp - 1) = ' ';
- tp = tp + strlen(tp);
+ chunk.push_back(' ');
+ chunk.append(tp);
}
- entry->morphcode = mystrdup(piece);
+ entry->morphcode = mystrdup(chunk.c_str());
if (!entry->morphcode)
return 1;
}
@@ -5002,7 +4995,7 @@ int AffixMgr::parse_affix(char* line,
}
int AffixMgr::redundant_condition(char ft,
- char* strip,
+ const char* strip,
int stripl,
const char* cond,
int linenum) {
@@ -5112,11 +5105,7 @@ int AffixMgr::get_suffix_words(short unsigned* suff,
hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, NULL, 0,
NULL, 0, 0, 0);
if (ht) {
- slst[suff_words_cnt] = (char*)malloc(MAXWORDUTF8LEN * sizeof(char));
- if (slst[suff_words_cnt]) {
- strcpy(slst[suff_words_cnt], nw.c_str());
- suff_words_cnt++;
- }
+ slst[suff_words_cnt++] = mystrdup(nw.c_str());
}
}
suff++;