summary | refs | log | tree | commit | diff
path: root/libs/hunspell/src/hunspell.c++
diff options
context:
space:
mode:
author: Tobias Weimer <wishmaster51@googlemail.com>, 2016-05-13 19:32:21 +0000
committer: Tobias Weimer <wishmaster51@googlemail.com>, 2016-05-13 19:32:21 +0000
commit: 37c98eaad76b7f1bf86c75fe2c32cf6aa11f7c6f (patch)
tree: 32aede144aa0cd0f2dd058b003cdbd534a2f969c /libs/hunspell/src/hunspell.c++
parent: e73bb3845517a31fa795e8d2174fcc8572835b33 (diff)
SpellChecker: Updated hunspell to 1.4.1
git-svn-id: http://svn.miranda-ng.org/main/trunk@16828 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c
Diffstat (limited to 'libs/hunspell/src/hunspell.c++')
-rw-r--r-- libs/hunspell/src/hunspell.c++ 1554
1 files changed, 541 insertions, 1013 deletions
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++
index 726c72931a..f7c1581087 100644
--- a/libs/hunspell/src/hunspell.c++
+++ b/libs/hunspell/src/hunspell.c++
@@ -85,6 +85,9 @@
#include <limits>
#include <string>
+#define MAXWORDLEN 176
+#define MAXWORDUTF8LEN (MAXWORDLEN * 3)
+
Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) {
encoding = NULL;
csconv = NULL;
@@ -158,14 +161,16 @@ int Hunspell::add_dic(const char* dpath, const char* key) {
// set the capitalization type
// return the length of the "cleaned" (and UTF-8 encoded) word
-int Hunspell::cleanword2(char* dest,
+size_t Hunspell::cleanword2(std::string& dest,
+ std::vector<w_char>& dest_utf,
const char* src,
- w_char* dest_utf,
int* nc,
int* pcaptype,
- int* pabbrev) {
- unsigned char* p = (unsigned char*)dest;
- const unsigned char* q = (const unsigned char*)src;
+ size_t* pabbrev) {
+ dest.clear();
+ dest_utf.clear();
+
+ const char* q = src;
// first skip over any leading blanks
while ((*q != '\0') && (*q == ' '))
@@ -173,7 +178,7 @@ int Hunspell::cleanword2(char* dest,
// now strip off any trailing periods (recording their presence)
*pabbrev = 0;
- int nl = strlen((const char*)q);
+ int nl = strlen(q);
while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
@@ -182,35 +187,26 @@ int Hunspell::cleanword2(char* dest,
// if no characters are left it can't be capitalized
if (nl <= 0) {
*pcaptype = NOCAP;
- *p = '\0';
return 0;
}
- strncpy(dest, (char*)q, nl);
- *(dest + nl) = '\0';
- nl = strlen(dest);
+ dest.append(q, nl);
+ nl = dest.size();
if (utf8) {
- *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
- // don't check too long words
- if (*nc >= MAXWORDLEN)
- return 0;
- if (*nc == -1) { // big Unicode character (non BMP area)
- *pcaptype = NOCAP;
- return nl;
- }
- *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
+ *nc = u8_u16(dest_utf, dest);
+ *pcaptype = get_captype_utf8(dest_utf, langnum);
} else {
- *pcaptype = get_captype(dest, nl, csconv);
+ *pcaptype = get_captype(dest, csconv);
*nc = nl;
}
return nl;
}
-int Hunspell::cleanword(char* dest,
+void Hunspell::cleanword(std::string& dest,
const char* src,
int* pcaptype,
int* pabbrev) {
- unsigned char* p = (unsigned char*)dest;
+ dest.clear();
const unsigned char* q = (const unsigned char*)src;
int firstcap = 0;
@@ -229,8 +225,7 @@ int Hunspell::cleanword(char* dest,
// if no characters are left it can't be capitalized
if (nl <= 0) {
*pcaptype = NOCAP;
- *p = '\0';
- return 0;
+ return;
}
// now determine the capitalization type of the first nl letters
@@ -245,27 +240,25 @@ int Hunspell::cleanword(char* dest,
ncap++;
if (csconv[(*q)].cupper == csconv[(*q)].clower)
nneutral++;
- *p++ = *q++;
+ dest.push_back(*q++);
nl--;
}
// remember to terminate the destination string
- *p = '\0';
- firstcap = csconv[(unsigned char)(*dest)].ccase;
+ firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase;
} else {
- unsigned short idx;
- w_char t[MAXWORDLEN];
- nc = u8_u16(t, MAXWORDLEN, src);
- for (int i = 0; i < nc; i++) {
- idx = (t[i].h << 8) + t[i].l;
+ std::vector<w_char> t;
+ u8_u16(t, src);
+ for (size_t i = 0; i < t.size(); ++i) {
+ unsigned short idx = (t[i].h << 8) + t[i].l;
unsigned short low = unicodetolower(idx, langnum);
if (idx != low)
ncap++;
if (unicodetoupper(idx, langnum) == low)
nneutral++;
}
- u16_u8(dest, MAXWORDUTF8LEN, t, nc);
+ u16_u8(dest, t);
if (ncap) {
- idx = (t[0].h << 8) + t[0].l;
+ unsigned short idx = (t[0].h << 8) + t[0].l;
firstcap = (idx != unicodetolower(idx, langnum));
}
}
@@ -282,117 +275,60 @@ int Hunspell::cleanword(char* dest,
} else {
*pcaptype = HUHCAP;
}
- return strlen(dest);
}
-void Hunspell::mkallcap(char* p) {
+void Hunspell::mkallcap(std::string& u8) {
if (utf8) {
- w_char u[MAXWORDLEN];
- int nc = u8_u16(u, MAXWORDLEN, p);
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
+ std::vector<w_char> u16;
+ u8_u16(u16, u8);
+ ::mkallcap_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].cupper;
- p++;
- }
- }
-}
-
-int Hunspell::mkallcap2(char* p, w_char* u, int nc) {
- if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short up = unicodetoupper(idx, langnum);
- if (idx != up) {
- u[i].h = (unsigned char)(up >> 8);
- u[i].l = (unsigned char)(up & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
- } else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].cupper;
- p++;
- }
- }
- return nc;
-}
-
-void Hunspell::mkallsmall(char* p) {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].clower;
- p++;
+ ::mkallcap(u8, csconv);
}
}
-int Hunspell::mkallsmall2(char* p, w_char* u, int nc) {
+int Hunspell::mkallsmall2(std::string& u8, std::vector<w_char>& u16) {
if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short low = unicodetolower(idx, langnum);
- if (idx != low) {
- u[i].h = (unsigned char)(low >> 8);
- u[i].l = (unsigned char)(low & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+ ::mkallsmall_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].clower;
- p++;
- }
+ ::mkallsmall(u8, csconv);
}
- return nc;
+ return u8.size();
}
// convert UTF-8 sharp S codes to latin 1
-char* Hunspell::sharps_u8_l1(char* dest, char* source) {
- char* p = dest;
- *p = *source;
- for (p++, source++; *(source - 1); p++, source++) {
- *p = *source;
- if (*source == '\x9F')
- *--p = '\xDF';
- }
+std::string Hunspell::sharps_u8_l1(const std::string& source) {
+ std::string dest(source);
+ mystrrep(dest, "\xC3\x9F", "\xDF");
return dest;
}
// recursive search for right ss - sharp s permutations
-hentry* Hunspell::spellsharps(char* base,
- char* pos,
+hentry* Hunspell::spellsharps(std::string& base,
+ size_t n_pos,
int n,
int repnum,
- char* tmp,
int* info,
char** root) {
- pos = strstr(pos, "ss");
- if (pos && (n < MAXSHARPS)) {
- *pos = '\xC3';
- *(pos + 1) = '\x9F';
- hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
+ size_t pos = base.find("ss", n_pos);
+ if (pos != std::string::npos && (n < MAXSHARPS)) {
+ base[pos] = '\xC3';
+ base[pos + 1] = '\x9F';
+ hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, info, root);
if (h)
return h;
- *pos = 's';
- *(pos + 1) = 's';
- h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
+ base[pos] = 's';
+ base[pos + 1] = 's';
+ h = spellsharps(base, pos + 2, n + 1, repnum, info, root);
if (h)
return h;
} else if (repnum > 0) {
if (utf8)
- return checkword(base, info, root);
- return checkword(sharps_u8_l1(tmp, base), info, root);
+ return checkword(base.c_str(), info, root);
+ std::string tmp(sharps_u8_l1(base));
+ return checkword(tmp.c_str(), info, root);
}
return NULL;
}
@@ -403,7 +339,7 @@ int Hunspell::is_keepcase(const hentry* rv) {
}
/* insert a word to the beginning of the suggestion array and return ns */
-int Hunspell::insert_sug(char*** slst, char* word, int ns) {
+int Hunspell::insert_sug(char*** slst, const char* word, int ns) {
if (!*slst)
return ns;
char* dup = mystrdup(word);
@@ -421,11 +357,6 @@ int Hunspell::insert_sug(char*** slst, char* word, int ns) {
int Hunspell::spell(const char* word, int* info, char** root) {
struct hentry* rv = NULL;
- // need larger vector. For example, Turkish capital letter I converted a
- // 2-byte UTF-8 character (dotless i) by mkallsmall.
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- w_char unicw[MAXWORDLEN];
int info2 = 0;
if (!info)
@@ -437,7 +368,6 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (strcmp(word, SPELL_XML) == 0)
return 1;
int nc = strlen(word);
- int wl2 = 0;
if (utf8) {
if (nc >= MAXWORDUTF8LEN)
return 0;
@@ -445,19 +375,26 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+ }
#ifdef MOZILLA_CLIENT
// accept the abbreviated words without dots
@@ -474,12 +411,12 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// "..", "--" etc.)
enum { NBEGIN, NNUM, NSEP };
int nstate = NBEGIN;
- int i;
+ size_t i;
for (i = 0; (i < wl); i++) {
- if ((cw[i] <= '9') && (cw[i] >= '0')) {
+ if ((scw[i] <= '9') && (scw[i] >= '0')) {
nstate = NNUM;
- } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
+ } else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) {
if ((nstate == NSEP) || (i == 0))
break;
nstate = NSEP;
@@ -496,75 +433,75 @@ int Hunspell::spell(const char* word, int* info, char** root) {
*info += SPELL_ORIGCAP;
/* FALLTHROUGH */
case NOCAP:
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if ((abbv) && !(rv)) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
}
break;
case ALLCAP: {
*info += SPELL_ORIGCAP;
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if (rv)
break;
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
if (rv)
break;
}
// Spec. prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
- if (pAMgr && strchr(cw, '\'')) {
- mkallsmall2(cw, unicw, nc);
- // There are no really sane circumstances where this could fail,
- // but anyway...
- if (char* apostrophe = strchr(cw, '\'')) {
+ size_t apos = pAMgr ? scw.find('\'') : std::string::npos;
+ if (apos != std::string::npos) {
+ mkallsmall2(scw, sunicw);
+ //conversion may result in string with different len to pre-mkallsmall2
+ //so re-scan
+ if (apos != std::string::npos && apos < scw.size() - 1) {
+ std::string part1 = scw.substr(0, apos+1);
+ std::string part2 = scw.substr(apos+1);
if (utf8) {
- w_char tmpword[MAXWORDLEN];
- *apostrophe = '\0';
- wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
- *apostrophe = '\'';
- if (wl2 >= 0 && wl2 < nc) {
- mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
- rv = checkword(cw, info, root);
- if (rv)
- break;
- }
+ std::vector<w_char> part1u, part2u;
+ u8_u16(part1u, part1);
+ u8_u16(part2u, part2);
+ mkinitcap2(part2, part2u);
+ scw = part1 + part2;
+ sunicw = part1u;
+ sunicw.insert(sunicw.end(), part2u.begin(), part2u.end());
+ rv = checkword(scw.c_str(), info, root);
+ if (rv)
+ break;
} else {
- mkinitcap2(apostrophe + 1, unicw, nc);
- rv = checkword(cw, info, root);
+ mkinitcap2(part2, sunicw);
+ scw = part1 + part2;
+ rv = checkword(scw.c_str(), info, root);
if (rv)
break;
}
+ mkinitcap2(scw, sunicw);
+ rv = checkword(scw.c_str(), info, root);
+ if (rv)
+ break;
}
- mkinitcap2(cw, unicw, nc);
- rv = checkword(cw, info, root);
- if (rv)
- break;
}
- if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
- char tmpword[MAXWORDUTF8LEN];
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) {
+
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
- wl2 = mkinitcap2(cw, unicw, nc);
- rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
+ mkinitcap2(scw, sunicw);
+ rv = spellsharps(scw, 0, 0, 0, info, root);
}
if ((abbv) && !(rv)) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ u8buffer.push_back('.');
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ u8buffer = std::string(scw);
+ u8buffer.push_back('.');
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
}
}
if (rv)
@@ -572,13 +509,14 @@ int Hunspell::spell(const char* word, int* info, char** root) {
}
}
case INITCAP: {
+
*info += SPELL_ORIGCAP;
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
if (captype == INITCAP)
*info += SPELL_INITCAP;
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
// forbid bad capitalization
@@ -593,18 +531,16 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (rv)
break;
- rv = checkword(wspace, info, root);
+ rv = checkword(u8buffer.c_str(), info, root);
if (abbv && !rv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
if (captype == INITCAP)
*info += SPELL_INITCAP;
- rv = checkword(wspace, info, root);
+ rv = checkword(u8buffer.c_str(), info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
if (rv && is_keepcase(rv) && (captype == ALLCAP))
@@ -617,8 +553,8 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// if CHECKSHARPS: KEEPCASE words with \xDF are allowed
// in INITCAP form, too.
!(pAMgr->get_checksharps() &&
- ((utf8 && strstr(wspace, "\xC3\x9F")) ||
- (!utf8 && strchr(wspace, '\xDF'))))))
+ ((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) ||
+ (!utf8 && u8buffer.find('\xDF') != std::string::npos)))))
rv = NULL;
break;
}
@@ -637,67 +573,66 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// recursive breaking at break points
if (wordbreak) {
- char* s;
- char r;
+
int nbr = 0;
- wl = strlen(cw);
+ wl = scw.size();
int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
// calculate break points for recursion limit
for (int j = 0; j < numbreak; j++) {
- s = cw;
- do {
- s = (char*)strstr(s, wordbreak[j]);
- if (s) {
- nbr++;
- s++;
- }
- } while (s);
+ size_t len = strlen(wordbreak[j]);
+ size_t pos = 0;
+ while ((pos = scw.find(wordbreak[j], pos, len)) != std::string::npos) {
+ ++nbr;
+ pos += len;
+ }
}
if (nbr >= 10)
return 0;
// check boundary patterns (^begin and end$)
for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
+ size_t plen = strlen(wordbreak[j]);
if (plen == 1 || plen > wl)
continue;
+
if (wordbreak[j][0] == '^' &&
- strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 && spell(cw + plen - 1))
+ scw.compare(0, plen - 1, wordbreak[j] + 1, plen -1) == 0 && spell(scw.c_str() + plen - 1))
return 1;
+
if (wordbreak[j][plen - 1] == '$' &&
- strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
- r = cw[wl - plen + 1];
- cw[wl - plen + 1] = '\0';
- if (spell(cw))
+ scw.compare(wl - plen + 1, plen - 1, wordbreak[j], plen - 1) == 0) {
+ char r = scw[wl - plen + 1];
+ scw[wl - plen + 1] = '\0';
+ if (spell(scw.c_str()))
return 1;
- cw[wl - plen + 1] = r;
+ scw[wl - plen + 1] = r;
}
}
// other patterns
for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
- s = (char*)strstr(cw, wordbreak[j]);
- if (s && (s > cw) && (s < cw + wl - plen)) {
- if (!spell(s + plen))
+ size_t plen = strlen(wordbreak[j]);
+ size_t found = scw.find(wordbreak[j]);
+ if ((found > 0) && (found < wl - plen)) {
+ if (!spell(scw.c_str() + found + plen))
continue;
- r = *s;
- *s = '\0';
+ char r = scw[found];
+ scw[found] = '\0';
// examine 2 sides of the break point
- if (spell(cw))
+ if (spell(scw.c_str()))
return 1;
- *s = r;
+ scw[found] = r;
// LANG_hu: spec. dash rule
if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
- r = s[1];
- s[1] = '\0';
- if (spell(cw))
+ r = scw[found + 1];
+ scw[found + 1] = '\0';
+ if (spell(scw.c_str()))
return 1; // check the first part with dash
- s[1] = r;
+ scw[found + 1] = r;
}
- // end of LANG speficic region
+ // end of LANG specific region
}
}
}
@@ -716,10 +651,9 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
if (ignoredchars != NULL) {
w2.assign(w);
if (utf8) {
- int ignoredchars_utf16_len;
- unsigned short* ignoredchars_utf16 =
- pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
- remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
+ const std::vector<w_char>& ignoredchars_utf16 =
+ pAMgr->get_ignore_utf16();
+ remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
remove_ignored_chars(w2, ignoredchars);
}
@@ -802,37 +736,40 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
return NULL;
}
if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
+ std::string word_root(he->word);
+ if (complexprefixes) {
if (utf8)
- reverseword_utf(*root);
+ reverseword_utf(word_root);
else
- reverseword(*root);
+ reverseword(word_root);
}
+ *root = mystrdup(word_root.c_str());
}
// try check compound word
} else if (pAMgr->get_compound()) {
- he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
+ struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
+ he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
// LANG_hu section: `moving rule' with last dash
if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
char* dup = mystrdup(word);
if (!dup)
return NULL;
dup[len - 1] = '\0';
- he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, 1, 0,
+ he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0,
info);
free(dup);
}
- // end of LANG speficic region
+ // end of LANG specific region
if (he) {
if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
+ std::string word_root(he->word);
+ if (complexprefixes) {
if (utf8)
- reverseword_utf(*root);
+ reverseword_utf(word_root);
else
- reverseword(*root);
+ reverseword(word_root);
}
+ *root = mystrdup(word_root.c_str());
}
if (info)
*info += SPELL_COMPOUND;
@@ -845,11 +782,8 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
int Hunspell::suggest(char*** slst, const char* word) {
int onlycmpdsug = 0;
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
if (!pSMgr || maxdic == 0)
return 0;
- w_char unicw[MAXWORDLEN];
*slst = NULL;
// process XML input of the simplified API (see manual)
if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
@@ -863,130 +797,132 @@ int Hunspell::suggest(char*** slst, const char* word) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+
+ if (wl == 0)
+ return 0;
+ }
- if (wl == 0)
- return 0;
int ns = 0;
int capwords = 0;
// check capitalized form for FORCEUCASE
if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
int info = SPELL_ORIGCAP;
- char** wlst;
- if (checkword(cw, &info, NULL)) {
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*));
- if (wlst == NULL)
- return -1;
- *slst = wlst;
- for (int i = 0; i < MAXSUGGESTION; i++) {
- wlst[i] = NULL;
- }
+ if (checkword(scw.c_str(), &info, NULL)) {
+ std::string form(scw);
+ mkinitcap(form);
+
+ char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*));
+ if (wlst == NULL)
+ return -1;
+ *slst = wlst;
+ wlst[0] = mystrdup(form.c_str());
+ for (int i = 1; i < MAXSUGGESTION; ++i) {
+ wlst[i] = NULL;
}
- wlst[0] = mystrdup(cw);
- mkinitcap(wlst[0]);
+
return 1;
}
}
switch (captype) {
case NOCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
break;
}
case INITCAP: {
capwords = 1;
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
if (ns == -1)
break;
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
if (ns != -1) {
- int prevns;
// something.The -> something. The
- char* dot = strchr(cw, '.');
- if (dot && (dot > cw)) {
+ size_t dot_pos = scw.find('.');
+ if (dot_pos != std::string::npos) {
+ std::string postdot = scw.substr(dot_pos + 1);
int captype_;
if (utf8) {
- w_char w_[MAXWORDLEN];
- int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
- captype_ = get_captype_utf8(w_, wl_, langnum);
- } else
- captype_ = get_captype(dot + 1, strlen(dot + 1), csconv);
+ std::vector<w_char> postdotu;
+ u8_u16(postdotu, postdot);
+ captype_ = get_captype_utf8(postdotu, langnum);
+ } else {
+ captype_ = get_captype(postdot, csconv);
+ }
if (captype_ == INITCAP) {
- char* st = mystrdup(cw);
- if (st) {
- char* newst = (char*)realloc(st, wl + 2);
- if (newst == NULL)
- free(st);
- st = newst;
- }
- if (st) {
- st[(dot - cw) + 1] = ' ';
- strcpy(st + (dot - cw) + 2, dot + 1);
- ns = insert_sug(slst, st, ns);
- free(st);
- }
+ std::string str(scw);
+ str.insert(dot_pos + 1, 1, ' ');
+ ns = insert_sug(slst, str.c_str(), ns);
}
}
+
+ std::string wspace;
+
if (captype == HUHINITCAP) {
// TheOpenOffice.org -> The OpenOffice.org
- memcpy(wspace, cw, (wl + 1));
- mkinitsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ wspace = scw;
+ mkinitsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
}
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- if (spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- prevns = ns;
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ wspace = scw;
+ mkallsmall2(wspace, sunicw);
+ if (spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ int prevns = ns;
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
if (captype == HUHINITCAP) {
- mkinitcap2(wspace, unicw, nc);
- if (spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ mkinitcap2(wspace, sunicw);
+ if (spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
}
// aNew -> "a New" (instead of "a new")
for (int j = prevns; j < ns; j++) {
char* space = strchr((*slst)[j], ' ');
if (space) {
- int slen = strlen(space + 1);
+ size_t slen = strlen(space + 1);
// different case after space (need capitalisation)
- if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
- w_char w[MAXWORDLEN];
- int wc = 0;
- char* r = (*slst)[j];
+ if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) {
+ std::string first((*slst)[j], space + 1);
+ std::string second(space + 1);
+ std::vector<w_char> w;
if (utf8)
- wc = u8_u16(w, MAXWORDLEN, space + 1);
- mkinitcap2(space + 1, w, wc);
+ u8_u16(w, second);
+ mkinitcap2(second, w);
// set as first suggestion
+ char* r = (*slst)[j];
for (int k = j; k > 0; k--)
(*slst)[k] = (*slst)[k - 1];
- (*slst)[0] = r;
+ free(r);
+ (*slst)[0] = mystrdup((first + second).c_str());
}
}
}
@@ -995,35 +931,30 @@ int Hunspell::suggest(char*** slst, const char* word) {
}
case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
if (ns == -1)
break;
- if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- mkinitcap2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ mkinitcap2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
for (int j = 0; j < ns; j++) {
- mkallcap((*slst)[j]);
+ std::string form((*slst)[j]);
+ mkallcap(form);
+
if (pAMgr && pAMgr->get_checksharps()) {
- char* pos;
if (utf8) {
- pos = strstr((*slst)[j], "\xC3\x9F");
- while (pos) {
- *pos = 'S';
- *(pos + 1) = 'S';
- pos = strstr(pos + 2, "\xC3\x9F");
- }
+ mystrrep(form, "\xC3\x9F", "SS");
} else {
- pos = strchr((*slst)[j], '\xDF');
- while (pos) {
- (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 2);
- mystrrep((*slst)[j], "\xDF", "SS");
- pos = strchr((*slst)[j], '\xDF');
- }
+ mystrrep(form, "\xDF", "SS");
}
}
+
+ free((*slst)[j]);
+ (*slst)[j] = mystrdup(form.c_str());
+
}
break;
}
@@ -1035,11 +966,10 @@ int Hunspell::suggest(char*** slst, const char* word) {
char* pos = strchr((*slst)[j], '-');
if (pos) {
int info;
- char w[MAXWORDUTF8LEN];
*pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- (void)spell(w, &info, NULL);
+ std::string w((*slst)[j]);
+ w.append(pos + 1);
+ (void)spell(w.c_str(), &info, NULL);
if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
*pos = ' ';
} else
@@ -1054,64 +984,67 @@ int Hunspell::suggest(char*** slst, const char* word) {
(*slst)) {
switch (captype) {
case NOCAP: {
- ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
+ ns = pSMgr->ngsuggest(*slst, scw.c_str(), ns, pHMgr, maxdic);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
break;
}
case INITCAP: {
capwords = 1;
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
break;
}
case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
int oldns = ns;
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
- for (int j = oldns; j < ns; j++)
- mkallcap((*slst)[j]);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
+ for (int j = oldns; j < ns; j++) {
+ std::string form((*slst)[j]);
+ mkallcap(form);
+ free((*slst)[j]);
+ (*slst)[j] = mystrdup(form.c_str());
+ }
break;
}
}
}
// try dash suggestion (Afo-American -> Afro-American)
- if (char* pos = strchr(cw, '-')) {
- char* ppos = cw;
+ size_t dash_pos = scw.find('-');
+ if (dash_pos != std::string::npos) {
int nodashsug = 1;
- char** nlst = NULL;
- int nn = 0;
- int last = 0;
- if (*slst) {
- for (int j = 0; j < ns && nodashsug == 1; j++) {
- if (strchr((*slst)[j], '-'))
- nodashsug = 0;
- }
+ for (int j = 0; j < ns && nodashsug == 1; j++) {
+ if (strchr((*slst)[j], '-'))
+ nodashsug = 0;
}
+
+ size_t prev_pos = 0;
+ bool last = false;
+
while (nodashsug && !last) {
- if (*pos == '\0')
+ if (dash_pos == scw.size())
last = 1;
- else
- *pos = '\0';
- if (!spell(ppos)) {
- nn = suggest(&nlst, ppos);
+ std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
+ if (!spell(chunk.c_str())) {
+ char** nlst = NULL;
+ int nn = suggest(&nlst, chunk.c_str());
for (int j = nn - 1; j >= 0; j--) {
- strncpy(wspace, cw, ppos - cw);
- strcpy(wspace + (ppos - cw), nlst[j]);
+ std::string wspace = scw.substr(0, prev_pos);
+ wspace.append(nlst[j]);
if (!last) {
- strcat(wspace, "-");
- strcat(wspace, pos + 1);
+ wspace.append("-");
+ wspace.append(scw.substr(dash_pos + 1));
}
- ns = insert_sug(slst, wspace, ns);
+ ns = insert_sug(slst, wspace.c_str(), ns);
free(nlst[j]);
}
if (nlst != NULL)
@@ -1119,29 +1052,34 @@ int Hunspell::suggest(char*** slst, const char* word) {
nodashsug = 0;
}
if (!last) {
- *pos = '-';
- ppos = pos + 1;
- pos = strchr(ppos, '-');
+ prev_pos = dash_pos + 1;
+ dash_pos = scw.find('-', prev_pos);
}
- if (!pos)
- pos = cw + strlen(cw);
+ if (dash_pos == std::string::npos)
+ dash_pos = scw.size();
}
}
// word reversing wrapper for complex prefixes
if (complexprefixes) {
for (int j = 0; j < ns; j++) {
+ std::string root((*slst)[j]);
+ free((*slst)[j]);
if (utf8)
- reverseword_utf((*slst)[j]);
+ reverseword_utf(root);
else
- reverseword((*slst)[j]);
+ reverseword(root);
+ (*slst)[j] = mystrdup(root.c_str());
}
}
// capitalize
if (capwords)
for (int j = 0; j < ns; j++) {
- mkinitcap((*slst)[j]);
+ std::string form((*slst)[j]);
+ free((*slst)[j]);
+ mkinitcap(form);
+ (*slst)[j] = mystrdup(form.c_str());
}
// expand suggestions with dot(s)
@@ -1160,25 +1098,23 @@ int Hunspell::suggest(char*** slst, const char* word) {
int l = 0;
for (int j = 0; j < ns; j++) {
if (!strchr((*slst)[j], ' ') && !spell((*slst)[j])) {
- char s[MAXSWUTF8L];
- w_char w[MAXSWL];
- int len;
+ std::string s;
+ std::vector<w_char> w;
if (utf8) {
- len = u8_u16(w, MAXSWL, (*slst)[j]);
+ u8_u16(w, (*slst)[j]);
} else {
- strcpy(s, (*slst)[j]);
- len = strlen(s);
+ s = (*slst)[j];
}
- mkallsmall2(s, w, len);
+ mkallsmall2(s, w);
free((*slst)[j]);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
+ if (spell(s.c_str())) {
+ (*slst)[l] = mystrdup(s.c_str());
if ((*slst)[l])
l++;
} else {
- mkinitcap2(s, w, len);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
+ mkinitcap2(s, w);
+ if (spell(s.c_str())) {
+ (*slst)[l] = mystrdup(s.c_str());
if ((*slst)[l])
l++;
}
@@ -1211,9 +1147,10 @@ int Hunspell::suggest(char*** slst, const char* word) {
// output conversion
rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace, MAXWORDUTF8LEN) > 0) {
+ std::string wspace;
+ if (rl->conv((*slst)[j], wspace) > 0) {
free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
+ (*slst)[j] = mystrdup(wspace.c_str());
}
}
@@ -1233,151 +1170,25 @@ char* Hunspell::get_dic_encoding() {
return encoding;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX UTF-8 support is OK?
-int Hunspell::suggest_auto(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return 0;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return 0;
- } else {
- if (nc >= MAXWORDLEN)
- return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return 0;
- int ns = 0;
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch (captype) {
- case NOCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns > 0)
- break;
- break;
- }
-
- case INITCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- for (int j = 0; j < ns; j++)
- mkinitcap((*slst)[j]);
- ns = pSMgr->suggest_auto(slst, cw, ns);
- break;
- }
-
- case HUHINITCAP:
- case HUHCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns == 0) {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- mkinitcap(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- for (int j = 0; j < ns; j++)
- mkallcap((*slst)[j]);
- break;
- }
- }
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- for (int j = 0; j < ns; j++) {
- if (utf8)
- reverseword_utf((*slst)[j]);
- else
- reverseword((*slst)[j]);
- }
- }
-
- // expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (int j = 0; j < ns; j++) {
- (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
- strcat((*slst)[j], word + strlen(word) - abbv);
- }
- }
-
- // LANG_hu section: replace '-' with ' ' in Hungarian
- if (langnum == LANG_hu) {
- for (int j = 0; j < ns; j++) {
- char* pos = strchr((*slst)[j], '-');
- if (pos) {
- int info;
- char w[MAXWORDUTF8LEN];
- *pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- spell(w, &info, NULL);
- if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
- *pos = ' ';
- } else
- *pos = '-';
- }
- }
- }
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace) > 0) {
- free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
- }
- }
-
- // END OF LANG_hu section
- return ns;
-}
-#endif
-
int Hunspell::stem(char*** slst, char** desc, int n) {
- char result[MAXLNLEN];
- char result2[MAXLNLEN];
+
+ std::string result2;
*slst = NULL;
if (n == 0)
return 0;
- *result2 = '\0';
for (int i = 0; i < n; i++) {
- *result = '\0';
+
+ std::string result;
+
// add compound word parts (except the last one)
char* s = (char*)desc[i];
char* part = strstr(s, MORPH_PART);
if (part) {
char* nextpart = strstr(part + 1, MORPH_PART);
while (nextpart) {
- copy_field(result + strlen(result), part, MORPH_PART);
+ std::string field;
+ copy_field(field, part, MORPH_PART);
+ result.append(field);
part = nextpart;
nextpart = strstr(part + 1, MORPH_PART);
}
@@ -1404,22 +1215,28 @@ int Hunspell::stem(char*** slst, char** desc, int n) {
int genl = line_tok(sg, &gen, MSEP_REC);
free(sg);
for (int j = 0; j < genl; j++) {
- sprintf(result2 + strlen(result2), "%c%s%s", MSEP_REC, result,
- gen[j]);
+ result2.push_back(MSEP_REC);
+ result2.append(result);
+ result2.append(gen[j]);
}
freelist(&gen, genl);
}
} else {
- sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
+ result2.push_back(MSEP_REC);
+ result2.append(result);
if (strstr(pl[k], MORPH_SURF_PFX)) {
- copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
+ std::string field;
+ copy_field(field, pl[k], MORPH_SURF_PFX);
+ result2.append(field);
}
- copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
+ std::string field;
+ copy_field(field, pl[k], MORPH_STEM);
+ result2.append(field);
}
}
freelist(&pl, pln);
}
- int sln = line_tok(result2, slst, MSEP_REC);
+ int sln = line_tok(result2.c_str(), slst, MSEP_REC);
return uniqlist(*slst, sln);
}
@@ -1431,148 +1248,43 @@ int Hunspell::stem(char*** slst, const char* word) {
return pln2;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-int Hunspell::suggest_pos_stems(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return 0;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return 0;
- } else {
- if (nc >= MAXWORDLEN)
- return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return 0;
-
- int ns = 0; // ns=0 = normalized input
-
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch (captype) {
- case HUHCAP:
- case NOCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if ((abbv) && (ns == 0)) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
- }
-
- case INITCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if (ns == 0 || ((*slst)[0][0] == '#')) {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
- }
-
- case ALLCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
- if (ns != 0)
- break;
-
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
-
- if (ns == 0) {
- mkinitcap(wspace);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
- break;
- }
- }
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace) > 0) {
- free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
- }
- }
-
- return ns;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
const char* Hunspell::get_wordchars() {
return pAMgr->get_wordchars();
}
-unsigned short* Hunspell::get_wordchars_utf16(int* len) {
- return pAMgr->get_wordchars_utf16(len);
+const std::vector<w_char>& Hunspell::get_wordchars_utf16() {
+ return pAMgr->get_wordchars_utf16();
}
-void Hunspell::mkinitcap(char* p) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].cupper;
+void Hunspell::mkinitcap(std::string& u8) {
+ if (utf8) {
+ std::vector<w_char> u16;
+ u8_u16(u16, u8);
+ ::mkinitcap_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- int len;
- w_char u[MAXWORDLEN];
- len = u8_u16(u, MAXWORDLEN, p);
- unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, len);
+ ::mkinitcap(u8, csconv);
}
}
-int Hunspell::mkinitcap2(char* p, w_char* u, int nc) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].cupper;
- } else if (nc > 0) {
- unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+int Hunspell::mkinitcap2(std::string& u8, std::vector<w_char>& u16) {
+ if (utf8) {
+ ::mkinitcap_utf(u16, langnum);
+ u16_u8(u8, u16);
+ } else {
+ ::mkinitcap(u8, csconv);
}
- return nc;
+ return u8.size();
}
-int Hunspell::mkinitsmall2(char* p, w_char* u, int nc) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].clower;
- } else if (nc > 0) {
- unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+int Hunspell::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) {
+ if (utf8) {
+ ::mkinitsmall_utf(u16, langnum);
+ u16_u8(u8, u16);
+ } else {
+ ::mkinitsmall(u8, csconv);
}
- return nc;
+ return u8.size();
}
int Hunspell::add(const char* word) {
@@ -1601,20 +1313,16 @@ struct cs_info* Hunspell::get_csconv() {
return csconv;
}
-void Hunspell::cat_result(char* result, char* st) {
+void Hunspell::cat_result(std::string& result, char* st) {
if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
+ if (!result.empty())
+ result.append("\n");
+ result.append(st);
free(st);
}
}
int Hunspell::analyze(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- w_char unicw[MAXWORDLEN];
- int wl2 = 0;
*slst = NULL;
if (!pSMgr || maxdic == 0)
return 0;
@@ -1626,48 +1334,52 @@ int Hunspell::analyze(char*** slst, const char* word) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+ }
if (wl == 0) {
if (abbv) {
+ scw.clear();
for (wl = 0; wl < abbv; wl++)
- cw[wl] = '.';
- cw[wl] = '\0';
+ scw.push_back('.');
abbv = 0;
} else
return 0;
}
- char result[MAXLNLEN];
- char* st = NULL;
-
- *result = '\0';
+ std::string result;
- int n = 0;
- int n2 = 0;
- int n3 = 0;
+ size_t n = 0;
+ size_t n2 = 0;
+ size_t n3 = 0;
// test numbers
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
- while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) ||
- (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
+ while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) ||
+ (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) {
n++;
- if ((cw[n] == '.') || (cw[n] == ',')) {
+ if ((scw[n] == '.') || (scw[n] == ',')) {
if (((n2 == 0) && (n > 3)) ||
- ((n2 > 0) && ((cw[n - 1] == '.') || (cw[n - 1] == ','))))
+ ((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ','))))
break;
n2++;
n3 = n;
@@ -1676,21 +1388,21 @@ int Hunspell::analyze(char*** slst, const char* word) {
if ((n == wl) && (n3 > 0) && (n - n3 > 3))
return 0;
- if ((n == wl) || ((n > 0) && ((cw[n] == '%') || (cw[n] == '\xB0')) &&
- checkword(cw + n, NULL, NULL))) {
- mystrcat(result, cw, MAXLNLEN);
- result[n - 1] = '\0';
+ if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) &&
+ checkword(scw.c_str() + n, NULL, NULL))) {
+ result.append(scw);
+ result.resize(n - 1);
if (n == wl)
- cat_result(result, pSMgr->suggest_morph(cw + n - 1));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1));
else {
- char sign = cw[n];
- cw[n] = '\0';
- cat_result(result, pSMgr->suggest_morph(cw + n - 1));
- mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
- cw[n] = sign;
- cat_result(result, pSMgr->suggest_morph(cw + n));
+ char sign = scw[n];
+ scw[n] = '\0';
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1));
+ result.push_back('+'); // XXX SPEC. MORPHCODE
+ scw[n] = sign;
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n));
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
// END OF LANG_hu section
@@ -1699,64 +1411,58 @@ int Hunspell::analyze(char*** slst, const char* word) {
case HUHCAP:
case HUHINITCAP:
case NOCAP: {
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
case INITCAP: {
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
- cat_result(result, pSMgr->suggest_morph(wspace));
- cat_result(result, pSMgr->suggest_morph(cw));
+ wl = mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
- cat_result(result, pSMgr->suggest_morph(wspace));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
case ALLCAP: {
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(cw));
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
- cat_result(result, pSMgr->suggest_morph(wspace));
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
- cat_result(result, pSMgr->suggest_morph(wspace));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
}
- if (*result) {
+ if (!result.empty()) {
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (utf8)
@@ -1764,95 +1470,94 @@ int Hunspell::analyze(char*** slst, const char* word) {
else
reverseword(result);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
// compound word with dash (HU) I18n
- char* dash = NULL;
- int nresult = 0;
// LANG_hu section: set dash information for suggestions
- if (langnum == LANG_hu)
- dash = (char*)strchr(cw, '-');
- if ((langnum == LANG_hu) && dash) {
- *dash = '\0';
+
+ size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos;
+ int nresult = 0;
+ if (dash_pos != std::string::npos) {
+ std::string part1 = scw.substr(0, dash_pos);
+ std::string part2 = scw.substr(dash_pos+1);
+
// examine 2 sides of the dash
- if (dash[1] == '\0') { // base word ending with dash
- if (spell(cw)) {
- char* p = pSMgr->suggest_morph(cw);
+ if (part2.empty()) { // base word ending with dash
+ if (spell(part1.c_str())) {
+ char* p = pSMgr->suggest_morph(part1.c_str());
if (p) {
int ret = line_tok(p, slst, MSEP_REC);
free(p);
return ret;
}
}
- } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
- if (spell(cw) && (spell("-e"))) {
- st = pSMgr->suggest_morph(cw);
+ } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat.
+ if (spell(part1.c_str()) && (spell("-e"))) {
+ char* st = pSMgr->suggest_morph(part1.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- mystrcat(result, "+", MAXLNLEN); // XXX spec. separator in MORPHCODE
+ result.push_back('+'); // XXX spec. separator in MORPHCODE
st = pSMgr->suggest_morph("-e");
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
} else {
// first word ending with dash: word- XXX ???
- char r2 = *(dash + 1);
- dash[0] = '-';
- dash[1] = '\0';
- nresult = spell(cw);
- dash[1] = r2;
- dash[0] = '\0';
- if (nresult && spell(dash + 1) &&
- ((strlen(dash + 1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) {
- st = pSMgr->suggest_morph(cw);
+ part1.push_back('-'); // probe "word-" with the dash kept, as the removed dash[0] = '-' code did
+ nresult = spell(part1.c_str());
+ part1.erase(part1.size() - 1);
+ if (nresult && spell(part2.c_str()) &&
+ ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
+ char* st = pSMgr->suggest_morph(part1.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
- mystrcat(result, "+", MAXLNLEN); // XXX spec. separator in MORPHCODE
+ result.push_back('+'); // XXX spec. separator in MORPHCODE
}
- st = pSMgr->suggest_morph(dash + 1);
+ st = pSMgr->suggest_morph(part2.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
// affixed number in correct word
- if (nresult && (dash > cw) &&
- (((*(dash - 1) <= '9') && (*(dash - 1) >= '0')) ||
- (*(dash - 1) == '.'))) {
- *dash = '-';
+ if (nresult && (dash_pos > 0) &&
+ (((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) ||
+ (scw[dash_pos - 1] == '.'))) {
n = 1;
- if (*(dash - n) == '.')
+ if (scw[dash_pos - n] == '.')
n++;
// search first not a number character to left from dash
- while (((dash - n) >= cw) && ((*(dash - n) == '0') || (n < 3)) &&
+ while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) &&
(n < 6)) {
n++;
}
- if ((dash - n) < cw)
+ if (dash_pos < n)
n--;
// numbers: valami1000000-hoz
// examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
// 56-hoz, 6-hoz
for (; n >= 1; n--) {
- if ((*(dash - n) >= '0') && (*(dash - n) <= '9') &&
- checkword(dash - n, NULL, NULL)) {
- mystrcat(result, cw, MAXLNLEN);
- result[dash - cw - n] = '\0';
- st = pSMgr->suggest_morph(dash - n);
+ if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') {
+ continue;
+ }
+ std::string chunk = scw.substr(dash_pos - n);
+ if (checkword(chunk.c_str(), NULL, NULL)) {
+ result.append(scw, 0, dash_pos - n); // text before the number, as the removed mystrcat + truncation produced
+ char* st = pSMgr->suggest_morph(chunk.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
}
@@ -1866,30 +1571,33 @@ int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) {
return 0;
char** pl2;
int pl2n = analyze(&pl2, word);
- int captype = 0;
+ int captype = NOCAP;
int abbv = 0;
- char cw[MAXWORDUTF8LEN];
+ std::string cw;
cleanword(cw, word, &captype, &abbv);
- char result[MAXLNLEN];
- *result = '\0';
+ std::string result;
for (int i = 0; i < pln; i++) {
cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
}
freelist(&pl2, pl2n);
- if (*result) {
+ if (!result.empty()) {
// allcap
if (captype == ALLCAP)
mkallcap(result);
// line split
- int linenum = line_tok(result, slst, MSEP_REC);
+ int linenum = line_tok(result.c_str(), slst, MSEP_REC);
// capitalize
if (captype == INITCAP || captype == HUHINITCAP) {
- for (int j = 0; j < linenum; j++)
- mkinitcap((*slst)[j]);
+ for (int j = 0; j < linenum; j++) {
+ std::string form((*slst)[j]);
+ free((*slst)[j]);
+ mkinitcap(form);
+ (*slst)[j] = mystrdup(form.c_str());
+ }
}
// temporary filtering of prefix related errors (eg.
@@ -1923,22 +1631,21 @@ int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
}
// minimal XML parser functions
-int Hunspell::get_xml_par(char* dest, const char* par, int max) {
- char* d = dest;
+std::string Hunspell::get_xml_par(const char* par) {
+ std::string dest;
if (!par)
- return 0;
+ return dest;
char end = *par;
- char* dmax = dest + max;
if (end == '>')
end = '<';
else if (end != '\'' && end != '"')
- return 0; // bad XML
+ return dest; // bad XML: return the empty string; "return 0" would build std::string from a null pointer
- for (par++; d < dmax && *par != '\0' && *par != end; par++, d++)
- *d = *par;
- *d = '\0';
+ for (par++; *par != '\0' && *par != end; ++par) {
+ dest.push_back(*par);
+ }
mystrrep(dest, "&lt;", "<");
mystrrep(dest, "&amp;", "&");
- return (int)(d - dest);
+ return dest;
}
int Hunspell::get_langnum() const {
@@ -1967,18 +1674,17 @@ const char* Hunspell::get_xml_pos(const char* s, const char* attr) {
int Hunspell::check_xml_par(const char* q,
const char* attr,
const char* value) {
- char cw[MAXWORDUTF8LEN];
- if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) &&
- strcmp(cw, value) == 0)
+ std::string cw = get_xml_par(get_xml_pos(q, attr));
+ if (cw == value)
return 1;
return 0;
}
-int Hunspell::get_xml_list(char*** slst, char* list, const char* tag) {
- int n = 0;
- char* p;
+int Hunspell::get_xml_list(char*** slst, const char* list, const char* tag) {
if (!list)
return 0;
+ int n = 0;
+ const char* p;
for (p = list; ((p = strstr(p, tag)) != NULL); p++)
n++;
if (n == 0)
@@ -1987,25 +1693,20 @@ int Hunspell::get_xml_list(char*** slst, char* list, const char* tag) {
if (!*slst)
return 0;
for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) {
- int l = strlen(p);
- (*slst)[n] = (char*)malloc(l + 1);
- if (!(*slst)[n])
- return n;
- if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) {
- free((*slst)[n]);
+ std::string cw = get_xml_par(p + strlen(tag) - 1);
+ if (cw.empty()) {
break;
}
+ (*slst)[n] = mystrdup(cw.c_str());
}
return n;
}
int Hunspell::spellml(char*** slst, const char* word) {
- char *q, *q2;
- char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
- q = (char*)strstr(word, "<query");
+ const char* q = strstr(word, "<query");
if (!q)
return 0; // bad XML input
- q2 = strchr(q, '>');
+ const char* q2 = strchr(q, '>');
if (!q2)
return 0; // bad XML input
q2 = strstr(q2, "<word");
@@ -2013,8 +1714,9 @@ int Hunspell::spellml(char*** slst, const char* word) {
return 0; // bad XML input
if (check_xml_par(q, "type=", "analyze")) {
int n = 0;
- if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10))
- n = analyze(slst, cw);
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (!cw.empty())
+ n = analyze(slst, cw.c_str());
if (n == 0)
return 0;
// convert the result to <code><a>ana1</a><a>ana2</a></code> format
@@ -2036,22 +1738,25 @@ int Hunspell::spellml(char*** slst, const char* word) {
(*slst)[0] = mystrdup(r.c_str());
return 1;
} else if (check_xml_par(q, "type=", "stem")) {
- if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1))
- return stem(slst, cw);
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (!cw.empty())
+ return stem(slst, cw.c_str());
} else if (check_xml_par(q, "type=", "generate")) {
- int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
- if (n == 0)
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (cw.empty())
return 0;
- char* q3 = strstr(q2 + 1, "<word");
+ const char* q3 = strstr(q2 + 1, "<word");
if (q3) {
- if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
- return generate(slst, cw, cw2);
+ std::string cw2 = get_xml_par(strchr(q3, '>'));
+ if (!cw2.empty()) {
+ return generate(slst, cw.c_str(), cw2.c_str());
}
} else {
if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
char** slst2;
- if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) {
- int n2 = generate(slst, cw, slst2, n);
+ int n = get_xml_list(&slst2, strchr(q2, '>'), "<a>");
+ if (n != 0) {
+ int n2 = generate(slst, cw.c_str(), slst2, n);
freelist(&slst2, n);
return uniqlist(*slst, n2);
}
@@ -2062,182 +1767,6 @@ int Hunspell::spellml(char*** slst, const char* word) {
return 0;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX is UTF-8 support OK?
-char* Hunspell::morph_with_correction(const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return NULL;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return NULL;
- } else {
- if (nc >= MAXWORDLEN)
- return NULL;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return NULL;
-
- char result[MAXLNLEN];
- char* st = NULL;
-
- *result = '\0';
-
- switch (captype) {
- case NOCAP: {
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- case INITCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- case HUHCAP: {
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- break;
- }
- case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, (wl + 1));
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- }
-
- if (*result)
- return mystrdup(result);
- return NULL;
-}
-
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
Hunhandle* Hunspell_create(const char* affpath, const char* dpath) {
return (Hunhandle*)(new Hunspell(affpath, dpath));
}
@@ -2333,10 +1862,9 @@ int Hunspell::suffix_suggest(char*** slst, const char* root_word) {
if (ignoredchars != NULL) {
w2.assign(root_word);
if (utf8) {
- int ignoredchars_utf16_len;
- unsigned short* ignoredchars_utf16 =
- pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
- remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
+ const std::vector<w_char>& ignoredchars_utf16 =
+ pAMgr->get_ignore_utf16();
+ remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
remove_ignored_chars(w2, ignoredchars);
}