summary | refs | log | tree | commit | diff
path: root/libs/hunspell/src
diff options
context:
space:
mode:
Diffstat (limited to 'libs/hunspell/src')
-rw-r--r-- libs/hunspell/src/affentry.c++ 201
-rw-r--r-- libs/hunspell/src/affentry.hxx 8
-rw-r--r-- libs/hunspell/src/affixmgr.c++ 381
-rw-r--r-- libs/hunspell/src/affixmgr.hxx 24
-rw-r--r-- libs/hunspell/src/atypes.hxx 13
-rw-r--r-- libs/hunspell/src/baseaffix.hxx 2
-rw-r--r-- libs/hunspell/src/config.h 7
-rw-r--r-- libs/hunspell/src/csutil.c++ 520
-rw-r--r-- libs/hunspell/src/csutil.hxx 121
-rw-r--r-- libs/hunspell/src/filemgr.c++ 10
-rw-r--r-- libs/hunspell/src/hashmgr.c++ 193
-rw-r--r-- libs/hunspell/src/hashmgr.hxx 15
-rw-r--r-- libs/hunspell/src/hunspell.c++ 1554
-rw-r--r-- libs/hunspell/src/hunspell.hxx 52
-rw-r--r-- libs/hunspell/src/phonet.c++ 59
-rw-r--r-- libs/hunspell/src/phonet.hxx 6
-rw-r--r-- libs/hunspell/src/replist.c++ 20
-rw-r--r-- libs/hunspell/src/replist.hxx 4
-rw-r--r-- libs/hunspell/src/suggestmgr.c++ 812
-rw-r--r-- libs/hunspell/src/suggestmgr.hxx 7
-rw-r--r-- libs/hunspell/src/w_char.hxx 20
21 files changed, 1477 insertions, 2552 deletions
diff --git a/libs/hunspell/src/affentry.c++ b/libs/hunspell/src/affentry.c++
index 983fe2c1ec..bd28274368 100644
--- a/libs/hunspell/src/affentry.c++
+++ b/libs/hunspell/src/affentry.c++
@@ -79,8 +79,6 @@
#include "affentry.hxx"
#include "csutil.hxx"
-#define MAXTEMPWORDLEN (MAXWORDUTF8LEN + 4)
-
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
// register affix manager
: pmyMgr(pmgr),
@@ -117,11 +115,10 @@ PfxEntry::~PfxEntry() {
}
// add prefix to this word assuming conditions hold
-char* PfxEntry::add(const char* word, int len) {
+char* PfxEntry::add(const char* word, size_t len) {
if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
(len >= numconds) && test_condition(word) &&
- (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0)) &&
- ((MAXTEMPWORDLEN) > (len + appnd.size() - strip.size()))) {
+ (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
/* we have a match so add prefix */
std::string tword(appnd);
tword.append(word + strip.size());
@@ -233,26 +230,21 @@ struct hentry* PfxEntry::checkword(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- int tmpl; // length of tmpword
struct hentry* he; // hash entry of root word or NULL
- char tmpword[MAXTEMPWORDLEN];
// on entry prefix is 0 length or already matches the beginning of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
// generate new root word by removing prefix and adding
// back any characters that would have been stripped
- if (strip.size()) {
- strncpy(tmpword, strip.c_str(), MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- }
- strcpy((tmpword + strip.size()), (word + appnd.size()));
+ std::string tmpword(strip);
+ tmpword.append(word + appnd.size());
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -262,9 +254,9 @@ struct hentry* PfxEntry::checkword(const char* word,
// if all conditions are met then check if resulting
// root word in the dictionary
- if (test_condition(tmpword)) {
+ if (test_condition(tmpword.c_str())) {
tmpl += strip.size();
- if ((he = pmyMgr->lookup(tmpword)) != NULL) {
+ if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
do {
if (TESTAFF(he->astr, aflag, he->alen) &&
// forbid single prefixes with needaffix flag
@@ -283,8 +275,9 @@ struct hentry* PfxEntry::checkword(const char* word,
// if ((opts & aeXPRODUCT) && in_compound) {
if ((opts & aeXPRODUCT)) {
- he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL, 0,
- NULL, FLAG_NULL, needflag, in_compound);
+ he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
+ NULL, 0, NULL, FLAG_NULL, needflag,
+ in_compound);
if (he)
return he;
}
@@ -298,27 +291,22 @@ struct hentry* PfxEntry::check_twosfx(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- int tmpl; // length of tmpword
struct hentry* he; // hash entry of root word or NULL
- char tmpword[MAXTEMPWORDLEN];
// on entry prefix is 0 length or already matches the beginning of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing prefix and adding
// back any characters that would have been stripped
- if (strip.size()) {
- strncpy(tmpword, strip.c_str(), MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- }
- strcpy((tmpword + strip.size()), (word + appnd.size()));
+ std::string tmpword(strip);
+ tmpword.append(word + appnd.size());
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -328,7 +316,7 @@ struct hentry* PfxEntry::check_twosfx(const char* word,
// if all conditions are met then check if resulting
// root word in the dictionary
- if (test_condition(tmpword)) {
+ if (test_condition(tmpword.c_str())) {
tmpl += strip.size();
// prefix matched but no root word was found
@@ -336,7 +324,7 @@ struct hentry* PfxEntry::check_twosfx(const char* word,
// cross checked combined with a suffix
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
- he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this,
+ he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
needflag);
if (he)
return he;
@@ -351,26 +339,20 @@ char* PfxEntry::check_twosfx_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- int tmpl; // length of tmpword
- char tmpword[MAXTEMPWORDLEN];
-
// on entry prefix is 0 length or already matches the beginning of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing prefix and adding
// back any characters that would have been stripped
- if (strip.size()) {
- strncpy(tmpword, strip.c_str(), MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- }
- strcpy((tmpword + strip.size()), (word + appnd.size()));
+ std::string tmpword(strip);
+ tmpword.append(word + appnd.size());
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -380,7 +362,7 @@ char* PfxEntry::check_twosfx_morph(const char* word,
// if all conditions are met then check if resulting
// root word in the dictionary
- if (test_condition(tmpword)) {
+ if (test_condition(tmpword.c_str())) {
tmpl += strip.size();
// prefix matched but no root word was found
@@ -388,7 +370,8 @@ char* PfxEntry::check_twosfx_morph(const char* word,
// ross checked combined with a suffix
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
- return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, aeXPRODUCT,
+ return pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
+ aeXPRODUCT,
this, needflag);
}
}
@@ -401,31 +384,23 @@ char* PfxEntry::check_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- int tmpl; // length of tmpword
struct hentry* he; // hash entry of root word or NULL
- char tmpword[MAXTEMPWORDLEN];
- char result[MAXLNLEN];
char* st;
- *result = '\0';
-
// on entry prefix is 0 length or already matches the beginning of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing prefix and adding
// back any characters that would have been stripped
- if (strip.size()) {
- strncpy(tmpword, strip.c_str(), MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- }
- strcpy(tmpword + strip.size(), word + appnd.size());
+ std::string tmpword(strip);
+ tmpword.append(word + appnd.size());
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -435,9 +410,11 @@ char* PfxEntry::check_morph(const char* word,
// if all conditions are met then check if resulting
// root word in the dictionary
- if (test_condition(tmpword)) {
+ if (test_condition(tmpword.c_str())) {
+ std::string result;
+
tmpl += strip.size();
- if ((he = pmyMgr->lookup(tmpword)) != NULL) {
+ if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
do {
if (TESTAFF(he->astr, aflag, he->alen) &&
// forbid single prefixes with needaffix flag
@@ -446,28 +423,28 @@ char* PfxEntry::check_morph(const char* word,
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
(contclass && TESTAFF(contclass, needflag, contclasslen)))) {
if (morphcode) {
- mystrcat(result, " ", MAXLNLEN);
- mystrcat(result, morphcode, MAXLNLEN);
+ result.append(" ");
+ result.append(morphcode);
} else
- mystrcat(result, getKey(), MAXLNLEN);
+ result.append(getKey());
if (!HENTRY_FIND(he, MORPH_STEM)) {
- mystrcat(result, " ", MAXLNLEN);
- mystrcat(result, MORPH_STEM, MAXLNLEN);
- mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
+ result.append(" ");
+ result.append(MORPH_STEM);
+ result.append(HENTRY_WORD(he));
}
// store the pointer of the hash entry
if (HENTRY_DATA(he)) {
- mystrcat(result, " ", MAXLNLEN);
- mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
+ result.append(" ");
+ result.append(HENTRY_DATA2(he));
} else {
// return with debug information
char* flag = pmyMgr->encode_flag(getFlag());
- mystrcat(result, " ", MAXLNLEN);
- mystrcat(result, MORPH_FLAG, MAXLNLEN);
- mystrcat(result, flag, MAXLNLEN);
+ result.append(" ");
+ result.append(MORPH_FLAG);
+ result.append(flag);
free(flag);
}
- mystrcat(result, "\n", MAXLNLEN);
+ result.append("\n");
}
he = he->next_homonym;
} while (he);
@@ -478,18 +455,19 @@ char* PfxEntry::check_morph(const char* word,
// ross checked combined with a suffix
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
- st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
+ st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
FLAG_NULL, needflag);
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
}
+
+ if (!result.empty())
+ return mystrdup(result.c_str());
}
}
- if (*result)
- return mystrdup(result);
return NULL;
}
@@ -516,7 +494,8 @@ SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp)
c.l.conds2 = dp->c.l.conds2;
} else
memcpy(c.conds, dp->c.conds, MAXCONDLEN);
- rappnd = myrevstrdup(appnd.c_str());
+ rappnd = appnd;
+ reverseword(rappnd);
morphcode = dp->morphcode;
contclass = dp->contclass;
contclasslen = dp->contclasslen;
@@ -524,8 +503,6 @@ SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp)
SfxEntry::~SfxEntry() {
aflag = 0;
- if (rappnd)
- free(rappnd);
pmyMgr = NULL;
if (opts & aeLONGCOND)
free(c.l.conds2);
@@ -536,13 +513,12 @@ SfxEntry::~SfxEntry() {
}
// add suffix to this word assuming conditions hold
-char* SfxEntry::add(const char* word, int len) {
+char* SfxEntry::add(const char* word, size_t len) {
/* make sure all conditions match */
if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
(len >= numconds) && test_condition(word + len, word) &&
(!strip.size() ||
- (strcmp(word + len - strip.size(), strip.c_str()) == 0)) &&
- ((MAXTEMPWORDLEN) > (len + appnd.size() - strip.size()))) {
+ (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
std::string tword(word);
/* we have a match so add suffix */
tword.replace(len - strip.size(), std::string::npos, appnd);
@@ -699,10 +675,7 @@ struct hentry* SfxEntry::checkword(const char* word,
const FLAG cclass,
const FLAG needflag,
const FLAG badflag) {
- int tmpl; // length of tmpword
struct hentry* he; // hash entry pointer
- unsigned char* cp;
- char tmpword[MAXTEMPWORDLEN];
PfxEntry* ep = ppfx;
// if this suffix is being cross checked with a prefix
@@ -716,7 +689,7 @@ struct hentry* SfxEntry::checkword(const char* word,
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
// the second condition is not enough for UTF-8 strings
// it checked in test_condition()
@@ -726,15 +699,13 @@ struct hentry* SfxEntry::checkword(const char* word,
// back any characters that would have been stripped or
// or null terminating the shorter string
- strncpy(tmpword, word, MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- cp = (unsigned char*)(tmpword + tmpl);
+ std::string tmpstring(word, tmpl);
if (strip.size()) {
- strcpy((char*)cp, strip.c_str());
- tmpl += strip.size();
- cp = (unsigned char*)(tmpword + tmpl);
- } else
- *cp = '\0';
+ tmpstring.append(strip);
+ }
+
+ const char* tmpword = tmpstring.c_str();
+ const char* endword = tmpword + tmpstring.size();
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -744,7 +715,7 @@ struct hentry* SfxEntry::checkword(const char* word,
// if all conditions are met then check if resulting
// root word in the dictionary
- if (test_condition((char*)cp, (char*)tmpword)) {
+ if (test_condition(endword, tmpword)) {
#ifdef SZOSZABLYA_POSSIBLE_ROOTS
fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
#endif
@@ -804,10 +775,7 @@ struct hentry* SfxEntry::check_twosfx(const char* word,
int optflags,
PfxEntry* ppfx,
const FLAG needflag) {
- int tmpl; // length of tmpword
struct hentry* he; // hash entry pointer
- unsigned char* cp;
- char tmpword[MAXTEMPWORDLEN];
PfxEntry* ep = ppfx;
// if this suffix is being cross checked with a prefix
@@ -821,7 +789,7 @@ struct hentry* SfxEntry::check_twosfx(const char* word,
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
@@ -829,15 +797,13 @@ struct hentry* SfxEntry::check_twosfx(const char* word,
// back any characters that would have been stripped or
// or null terminating the shorter string
- strncpy(tmpword, word, MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- cp = (unsigned char*)(tmpword + tmpl);
- if (strip.size()) {
- strcpy((char*)cp, strip.c_str());
- tmpl += strip.size();
- cp = (unsigned char*)(tmpword + tmpl);
- } else
- *cp = '\0';
+ std::string tmpword(word);
+ tmpword.resize(tmpl);
+ tmpword.append(strip);
+ tmpl += strip.size();
+
+ const char* beg = tmpword.c_str();
+ const char* end = beg + tmpl;
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -846,17 +812,17 @@ struct hentry* SfxEntry::check_twosfx(const char* word,
// if all conditions are met then recall suffix_check
- if (test_condition((char*)cp, (char*)tmpword)) {
+ if (test_condition(end, beg)) {
if (ppfx) {
// handle conditional suffix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
- he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL,
+ he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
(FLAG)aflag, needflag);
else
- he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0,
+ he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, NULL, 0,
NULL, (FLAG)aflag, needflag);
} else {
- he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL,
+ he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
(FLAG)aflag, needflag);
}
if (he)
@@ -872,9 +838,6 @@ char* SfxEntry::check_twosfx_morph(const char* word,
int optflags,
PfxEntry* ppfx,
const FLAG needflag) {
- int tmpl; // length of tmpword
- unsigned char* cp;
- char tmpword[MAXTEMPWORDLEN];
PfxEntry* ep = ppfx;
char* st;
@@ -893,7 +856,7 @@ char* SfxEntry::check_twosfx_morph(const char* word,
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
- tmpl = len - appnd.size();
+ int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
@@ -901,15 +864,13 @@ char* SfxEntry::check_twosfx_morph(const char* word,
// back any characters that would have been stripped or
// or null terminating the shorter string
- strncpy(tmpword, word, MAXTEMPWORDLEN - 1);
- tmpword[MAXTEMPWORDLEN - 1] = '\0';
- cp = (unsigned char*)(tmpword + tmpl);
- if (strip.size()) {
- strcpy((char*)cp, strip.c_str());
- tmpl += strip.size();
- cp = (unsigned char*)(tmpword + tmpl);
- } else
- *cp = '\0';
+ std::string tmpword(word);
+ tmpword.resize(tmpl);
+ tmpword.append(strip);
+ tmpl += strip.size();
+
+ const char* beg = tmpword.c_str();
+ const char* end = beg + tmpl;
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
@@ -918,11 +879,11 @@ char* SfxEntry::check_twosfx_morph(const char* word,
// if all conditions are met then recall suffix_check
- if (test_condition((char*)cp, (char*)tmpword)) {
+ if (test_condition(end, beg)) {
if (ppfx) {
// handle conditional suffix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
- st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag,
+ st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
needflag);
if (st) {
if (ppfx->getMorph()) {
@@ -934,7 +895,7 @@ char* SfxEntry::check_twosfx_morph(const char* word,
mychomp(result);
}
} else {
- st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag,
+ st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
@@ -944,7 +905,7 @@ char* SfxEntry::check_twosfx_morph(const char* word,
}
} else {
st =
- pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
+ pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
diff --git a/libs/hunspell/src/affentry.hxx b/libs/hunspell/src/affentry.hxx
index f3db20013c..6311d83fff 100644
--- a/libs/hunspell/src/affentry.hxx
+++ b/libs/hunspell/src/affentry.hxx
@@ -122,7 +122,7 @@ class LIBHUNSPELL_DLL_EXPORTED PfxEntry : protected AffEntry {
inline FLAG getFlag() { return aflag; }
inline const char* getKey() { return appnd.c_str(); }
- char* add(const char* word, int len);
+ char* add(const char* word, size_t len);
inline short getKeyLen() { return appnd.size(); }
@@ -154,7 +154,7 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry {
private:
AffixMgr* pmyMgr;
- char* rappnd;
+ std::string rappnd;
SfxEntry* next;
SfxEntry* nexteq;
@@ -200,8 +200,8 @@ class LIBHUNSPELL_DLL_EXPORTED SfxEntry : protected AffEntry {
const FLAG needflag);
inline FLAG getFlag() { return aflag; }
- inline const char* getKey() { return rappnd; }
- char* add(const char* word, int len);
+ inline const char* getKey() { return rappnd.c_str(); }
+ char* add(const char* word, size_t len);
inline const char* getMorph() { return morphcode; }
diff --git a/libs/hunspell/src/affixmgr.c++ b/libs/hunspell/src/affixmgr.c++
index d21ff49573..d6bb677982 100644
--- a/libs/hunspell/src/affixmgr.c++
+++ b/libs/hunspell/src/affixmgr.c++
@@ -72,13 +72,13 @@
*/
#include <stdlib.h>
-#include <string>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
+#include <algorithm>
#include <limits>
-
+#include <string>
#include <vector>
#include "affixmgr.hxx"
@@ -152,11 +152,7 @@ AffixMgr::AffixMgr(const char* affpath,
cpdsyllablenum = NULL; // syllable count incrementing flag
checknum = 0; // checking numbers, and word with numbers
wordchars = NULL; // letters + spec. word characters
- wordchars_utf16 = NULL; // letters + spec. word characters
- wordchars_utf16_len = 0; // letters + spec. word characters
ignorechars = NULL; // letters + spec. word characters
- ignorechars_utf16 = NULL; // letters + spec. word characters
- ignorechars_utf16_len = 0; // letters + spec. word characters
version = NULL; // affix and dictionary file version string
havecontclass = 0; // flags of possible continuing classes (double affix)
// LEMMA_PRESENT: not put root into the morphological output. Lemma presents
@@ -336,12 +332,8 @@ AffixMgr::~AffixMgr() {
free(lang);
if (wordchars)
free(wordchars);
- if (wordchars_utf16)
- free(wordchars_utf16);
if (ignorechars)
free(ignorechars);
- if (ignorechars_utf16)
- free(ignorechars_utf16);
if (version)
free(version);
checknum = 0;
@@ -632,8 +624,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
/* parse in the extra word characters */
if (strncmp(line, "WORDCHARS", 9) == 0) {
- if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len,
- utf8, afflst->getlinenum())) {
+ if (!parse_array(line, &wordchars, wordchars_utf16,
+ utf8, afflst->getlinenum())) {
finishFileMgr(afflst);
return 1;
}
@@ -642,8 +634,8 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
/* parse in the ignored characters (for example, Arabic optional diacretics
* charachters */
if (strncmp(line, "IGNORE", 6) == 0) {
- if (parse_array(line, &ignorechars, &ignorechars_utf16,
- &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
+ if (!parse_array(line, &ignorechars, ignorechars_utf16,
+ utf8, afflst->getlinenum())) {
finishFileMgr(afflst);
return 1;
}
@@ -1174,7 +1166,7 @@ std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
}
// calculate the character length of the condition
-int AffixMgr::condlen(char* st) {
+int AffixMgr::condlen(const char* st) {
int l = 0;
bool group = false;
for (; *st; st++) {
@@ -1189,7 +1181,7 @@ int AffixMgr::condlen(char* st) {
return l;
}
-int AffixMgr::encodeit(affentry& entry, char* cs) {
+int AffixMgr::encodeit(affentry& entry, const char* cs) {
if (strcmp(cs, ".") != 0) {
entry.numconds = (char)condlen(cs);
// coverity[buffer_size_warning] - deliberate use of lack of end of conds
@@ -1328,7 +1320,6 @@ char* AffixMgr::prefix_check_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- char* st;
char result[MAXLNLEN];
result[0] = '\0';
@@ -1340,7 +1331,7 @@ char* AffixMgr::prefix_check_morph(const char* word,
// first handle the special case of 0 length prefixes
PfxEntry* pe = pStart[0];
while (pe) {
- st = pe->check_morph(word, len, in_compound, needflag);
+ char* st = pe->check_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1355,7 +1346,7 @@ char* AffixMgr::prefix_check_morph(const char* word,
while (pptr) {
if (isSubset(pptr->getKey(), word)) {
- st = pptr->check_morph(word, len, in_compound, needflag);
+ char* st = pptr->check_morph(word, len, in_compound, needflag);
if (st) {
// fogemorpheme
if ((in_compound != IN_CPD_NOT) ||
@@ -1382,8 +1373,6 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
int len,
char in_compound,
const FLAG needflag) {
- char* st;
-
char result[MAXLNLEN];
result[0] = '\0';
@@ -1394,7 +1383,7 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
// first handle the special case of 0 length prefixes
PfxEntry* pe = pStart[0];
while (pe) {
- st = pe->check_twosfx_morph(word, len, in_compound, needflag);
+ char* st = pe->check_twosfx_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1408,7 +1397,7 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
while (pptr) {
if (isSubset(pptr->getKey(), word)) {
- st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
+ char* st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
@@ -1427,13 +1416,12 @@ char* AffixMgr::prefix_check_twosfx_morph(const char* word,
// Is word a non compound with a REP substitution (see checkcompoundrep)?
int AffixMgr::cpdrep_check(const char* word, int wl) {
- const char* r;
if ((wl < 2) || !numrep)
return 0;
for (int i = 0; i < numrep; i++) {
- r = word;
+ const char* r = word;
int lenp = strlen(reptable[i].pattern);
// search every occurence of the pattern in the word
while ((r = strstr(r, reptable[i].pattern)) != NULL) {
@@ -1478,14 +1466,14 @@ int AffixMgr::cpdpat_check(const char* word,
// bounds
int AffixMgr::cpdcase_check(const char* word, int pos) {
if (utf8) {
- w_char u, w;
const char* p;
- u8_u16(&u, 1, word + pos);
for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
;
- u8_u16(&w, 1, p);
- unsigned short a = (u.h << 8) + u.l;
- unsigned short b = (w.h << 8) + w.l;
+ std::string pair(p);
+ std::vector<w_char> pair_u;
+ u8_u16(pair_u, pair);
+ unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
+ unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
if (((unicodetoupper(a, langnum) == a) ||
(unicodetoupper(b, langnum) == b)) &&
(a != '-') && (b != '-'))
@@ -1499,20 +1487,18 @@ int AffixMgr::cpdcase_check(const char* word, int pos) {
return 0;
}
+struct metachar_data {
+ signed short btpp; // metacharacter (*, ?) position for backtracking
+ signed short btwp; // word position for metacharacters
+ int btnum; // number of matched characters in metacharacter
+};
+
// check compound patterns
int AffixMgr::defcpd_check(hentry*** words,
short wnum,
hentry* rv,
hentry** def,
char all) {
- signed short
- btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
- signed short btwp[MAXWORDLEN]; // word positions for metacharacters
- int btnum[MAXWORDLEN]; // number of matched characters in metacharacter
- // positions
- short bt = 0;
- int i, j;
- int ok;
int w = 0;
if (!*words) {
@@ -1524,6 +1510,11 @@ int AffixMgr::defcpd_check(hentry*** words,
return 0;
}
+ std::vector<metachar_data> btinfo(1);
+
+ short bt = 0;
+ int i, j;
+
(*words)[wnum] = rv;
// has the last word COMPOUNDRULE flag?
@@ -1533,7 +1524,7 @@ int AffixMgr::defcpd_check(hentry*** words,
*words = NULL;
return 0;
}
- ok = 0;
+ int ok = 0;
for (i = 0; i < numdefcpd; i++) {
for (j = 0; j < defcpdtable[i].len; j++) {
if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
@@ -1564,8 +1555,8 @@ int AffixMgr::defcpd_check(hentry*** words,
int wend = (defcpdtable[i].def[pp + 1] == '?') ? wp : wnum;
ok2 = 1;
pp += 2;
- btpp[bt] = pp;
- btwp[bt] = wp;
+ btinfo[bt].btpp = pp;
+ btinfo[bt].btwp = wp;
while (wp <= wend) {
if (!(*words)[wp]->alen ||
!TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp - 2],
@@ -1577,9 +1568,11 @@ int AffixMgr::defcpd_check(hentry*** words,
}
if (wp <= wnum)
ok2 = 0;
- btnum[bt] = wp - btwp[bt];
- if (btnum[bt] > 0)
- bt++;
+ btinfo[bt].btnum = wp - btinfo[bt].btwp;
+ if (btinfo[bt].btnum > 0) {
+ ++bt;
+ btinfo.resize(bt+1);
+ }
if (ok2)
break;
} else {
@@ -1609,10 +1602,10 @@ int AffixMgr::defcpd_check(hentry*** words,
if (bt)
do {
ok = 1;
- btnum[bt - 1]--;
- pp = btpp[bt - 1];
- wp = btwp[bt - 1] + (signed short)btnum[bt - 1];
- } while ((btnum[bt - 1] < 0) && --bt);
+ btinfo[bt - 1].btnum--;
+ pp = btinfo[bt - 1].btpp;
+ wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
+ } while ((btinfo[bt - 1].btnum < 0) && --bt);
} while (bt);
if (ok && ok2 && (!all || (defcpdtable[i].len <= pp)))
@@ -1650,24 +1643,26 @@ inline int AffixMgr::candidate_check(const char* word, int len) {
}
// calculate number of syllable for compound-checking
-short AffixMgr::get_syllable(const char* word, int wlen) {
+short AffixMgr::get_syllable(const std::string& word) {
if (cpdmaxsyllable == 0)
return 0;
short num = 0;
if (!utf8) {
- for (int i = 0; i < wlen; i++) {
+ for (size_t i = 0; i < word.size(); ++i) {
if (strchr(cpdvowels, word[i]))
num++;
}
} else if (cpdvowels_utf16) {
- w_char w[MAXWORDUTF8LEN];
- int i = u8_u16(w, MAXWORDUTF8LEN, word);
+ std::vector<w_char> w;
+ int i = u8_u16(w, word);
for (; i > 0; i--) {
- if (flag_bsearch((unsigned short*)cpdvowels_utf16,
- ((unsigned short*)w)[i - 1], cpdvowels_utf16_len))
- num++;
+ if (std::binary_search(cpdvowels_utf16,
+ cpdvowels_utf16 + cpdvowels_utf16_len,
+ w[i - 1])) {
+ ++num;
+ }
}
}
return num;
@@ -1676,12 +1671,12 @@ short AffixMgr::get_syllable(const char* word, int wlen) {
void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
if (utf8) {
int i;
- for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
- for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
+ for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
+ for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
;
}
- for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
- for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
+ for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
+ for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
;
}
} else {
@@ -1699,6 +1694,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
short maxwordnum,
short wnum,
hentry** words = NULL,
+ hentry** rwords = NULL,
char hu_mov_rule = 0,
char is_sug = 0,
int* info = NULL) {
@@ -1706,8 +1702,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
struct hentry* rv = NULL;
struct hentry* rv_first;
- struct hentry* rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
- char st[MAXWORDUTF8LEN + 4];
+ std::string st;
char ch = '\0';
int cmin;
int cmax;
@@ -1726,7 +1721,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
setcminmax(&cmin, &cmax, word, len);
- strcpy(st, word);
+ st.assign(word);
for (i = cmin; i < cmax; i++) {
// go to end of the UTF-8 character
@@ -1758,11 +1753,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (scpd > numcheckcpd)
break; // break simplified checkcompoundpattern loop
- strcpy(st + i, checkcpdtable[scpd - 1].pattern);
+ st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
soldi = i;
i += strlen(checkcpdtable[scpd - 1].pattern);
- strcpy(st + i, checkcpdtable[scpd - 1].pattern2);
- strcpy(st + i + strlen(checkcpdtable[scpd - 1].pattern2),
+ st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
+ st.replace(i + strlen(checkcpdtable[scpd - 1].pattern2), std::string::npos,
word + soldi + strlen(checkcpdtable[scpd - 1].pattern3));
oldlen = len;
@@ -1771,7 +1766,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
strlen(checkcpdtable[scpd - 1].pattern3);
oldcmin = cmin;
oldcmax = cmax;
- setcminmax(&cmin, &cmax, st, len);
+ setcminmax(&cmin, &cmax, st.c_str(), len);
cmax = len - cpdmin + 1;
}
@@ -1785,7 +1780,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
// FIRST WORD
affixed = 1;
- rv = lookup(st); // perhaps without prefix
+ rv = lookup(st.c_str()); // perhaps without prefix
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
@@ -1798,9 +1793,9 @@ struct hentry* AffixMgr::compound_check(const char* word,
TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
(numdefcpd && onlycpdrule &&
((!words && !wordnum &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0)) ||
+ defcpd_check(&words, wnum, rv, rwords, 0)) ||
(words &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0))))) ||
+ defcpd_check(&words, wnum, rv, rwords, 0))))) ||
(scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
!TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
rv = rv->next_homonym;
@@ -1813,14 +1808,14 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (onlycpdrule)
break;
if (compoundflag &&
- !(rv = prefix_check(st, i,
+ !(rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundflag))) {
if (((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundflag,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
- (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) &&
+ (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
!hu_mov_rule && sfx->getCont() &&
((compoundforbidflag &&
TESTAFF(sfx->getCont(), compoundforbidflag,
@@ -1834,24 +1829,24 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (rv ||
(((wordnum == 0) && compoundbegin &&
((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundbegin))) || // twofold suffixes + compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundbegin)))) ||
((wordnum > 0) && compoundmiddle &&
((rv = suffix_check(
- st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle,
+ st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundmiddle))) || // twofold suffixes + compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundmiddle))))))
checked_prefix = 1;
@@ -1942,7 +1937,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
cpdcase_check(word, i))))
// LANG_hu section: spec. Hungarian rule
|| ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
- (rv = affix_check(st, i)) &&
+ (rv = affix_check(st.c_str(), i)) &&
(sfx && sfx->getCont() &&
( // XXX hardwired Hungarian dic. codes
TESTAFF(sfx->getCont(), (unsigned short)'x',
@@ -1954,10 +1949,10 @@ struct hentry* AffixMgr::compound_check(const char* word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st, i);
+ numsyllable += get_syllable(st.substr(0, i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
}
// END of LANG_hu section
@@ -1977,7 +1972,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
striple = 1;
}
- rv = lookup((st + i)); // perhaps without prefix
+ rv = lookup(st.c_str() + i); // perhaps without prefix
// search homonym with compound flag
while ((rv) &&
@@ -2039,7 +2034,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
- (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen) <=
+ (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->clen)) <=
cpdmaxsyllable))) &&
(
// test CHECKCOMPOUNDPATTERN
@@ -2123,20 +2118,19 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(word + i, strlen(word + i));
+ numsyllable += get_syllable(word + i);
// - affix syllable num.
// XXX only second suffix (inflections, not derivations)
if (sfxappnd) {
- char* tmp = myrevstrdup(sfxappnd);
- numsyllable -= get_syllable(tmp, strlen(tmp)) + sfxextra;
- free(tmp);
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx &&
- (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
// increment syllable num, if last word has a SYLLABLENUM flag
@@ -2187,8 +2181,8 @@ struct hentry* AffixMgr::compound_check(const char* word,
// perhaps second word is a compound word (recursive call)
if (wordnum < maxwordnum) {
- rv = compound_check((st + i), strlen(st + i), wordnum + 1,
- numsyllable, maxwordnum, wnum + 1, words, 0,
+ rv = compound_check(st.c_str() + i, strlen(st.c_str() + i), wordnum + 1,
+ numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
is_sug, info);
if (rv && numcheckcpd &&
@@ -2211,11 +2205,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
// check first part
if (strncmp(rv->word, word + i, rv->blen) == 0) {
- char r = *(st + i + rv->blen);
- *(st + i + rv->blen) = '\0';
+ char r = st[i + rv->blen];
+ st[i + rv->blen] = '\0';
- if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
- *(st + i + rv->blen) = r;
+ if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
+ st[i + rv->blen] = r;
continue;
}
@@ -2225,11 +2219,11 @@ struct hentry* AffixMgr::compound_check(const char* word,
rv2 = affix_check(word, len);
if (rv2 && rv2->astr &&
TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
- (strncmp(rv2->word, st, i + rv->blen) == 0)) {
+ (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
return NULL;
}
}
- *(st + i + rv->blen) = r;
+ st[i + rv->blen] = r;
}
}
return rv_first;
@@ -2262,7 +2256,7 @@ struct hentry* AffixMgr::compound_check(const char* word,
if (soldi != 0) {
i = soldi;
- strcpy(st, word); // XXX add more optim.
+ st.assign(word); // XXX add more optim.
soldi = 0;
} else
st[i] = ch;
@@ -2283,6 +2277,7 @@ int AffixMgr::compound_check_morph(const char* word,
short maxwordnum,
short wnum,
hentry** words,
+ hentry** rwords,
char hu_mov_rule = 0,
char** result = NULL,
char* partresult = NULL) {
@@ -2292,8 +2287,7 @@ int AffixMgr::compound_check_morph(const char* word,
struct hentry* rv = NULL;
struct hentry* rv_first;
- struct hentry* rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
- char st[MAXWORDUTF8LEN + 4];
+ std::string st;
char ch;
int checked_prefix;
@@ -2308,7 +2302,7 @@ int AffixMgr::compound_check_morph(const char* word,
setcminmax(&cmin, &cmax, word, len);
- strcpy(st, word);
+ st.assign(word);
for (i = cmin; i < cmax; i++) {
// go to end of the UTF-8 character
@@ -2340,7 +2334,7 @@ int AffixMgr::compound_check_morph(const char* word,
if (partresult)
mystrcat(presult, partresult, MAXLNLEN);
- rv = lookup(st); // perhaps without prefix
+ rv = lookup(st.c_str()); // perhaps without prefix
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
@@ -2353,9 +2347,9 @@ int AffixMgr::compound_check_morph(const char* word,
TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
(numdefcpd && onlycpdrule &&
((!words && !wordnum &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0)) ||
+ defcpd_check(&words, wnum, rv, rwords, 0)) ||
(words &&
- defcpd_check(&words, wnum, rv, (hentry**)&rwords, 0))))))) {
+ defcpd_check(&words, wnum, rv, rwords, 0))))))) {
rv = rv->next_homonym;
}
@@ -2363,10 +2357,10 @@ int AffixMgr::compound_check_morph(const char* word,
affixed = 0;
if (rv) {
- sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
+ sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st.c_str());
if (!HENTRY_FIND(rv, MORPH_STEM)) {
sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM,
- st);
+ st.c_str());
}
// store the pointer of the hash entry
// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD,
@@ -2382,13 +2376,13 @@ int AffixMgr::compound_check_morph(const char* word,
break;
if (compoundflag &&
!(rv =
- prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
+ prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundflag))) {
- if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ if (((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundflag,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
- (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) &&
+ (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
!hu_mov_rule && sfx->getCont() &&
((compoundforbidflag &&
TESTAFF(sfx->getCont(), compoundforbidflag,
@@ -2401,44 +2395,44 @@ int AffixMgr::compound_check_morph(const char* word,
if (rv ||
(((wordnum == 0) && compoundbegin &&
- ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundbegin,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundbegin))) || // twofold suffix+compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundbegin)))) ||
((wordnum > 0) && compoundmiddle &&
- ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
+ ((rv = suffix_check(st.c_str(), i, 0, NULL, NULL, 0, NULL, FLAG_NULL,
compoundmiddle,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
- st, i, 0, NULL,
+ st.c_str(), i, 0, NULL,
compoundmiddle))) || // twofold suffix+compound
- (rv = prefix_check(st, i,
+ (rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundmiddle)))))) {
// char * p = prefix_check_morph(st, i, 0, compound);
char* p = NULL;
if (compoundflag)
- p = affix_check_morph(st, i, compoundflag);
+ p = affix_check_morph(st.c_str(), i, compoundflag);
if (!p || (*p == '\0')) {
if (p)
free(p);
p = NULL;
if ((wordnum == 0) && compoundbegin) {
- p = affix_check_morph(st, i, compoundbegin);
+ p = affix_check_morph(st.c_str(), i, compoundbegin);
} else if ((wordnum > 0) && compoundmiddle) {
- p = affix_check_morph(st, i, compoundmiddle);
+ p = affix_check_morph(st.c_str(), i, compoundmiddle);
}
}
if (p && (*p != '\0')) {
sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, MORPH_PART,
- st, line_uniq_app(&p, MSEP_REC));
+ st.c_str(), line_uniq_app(&p, MSEP_REC));
}
if (p)
free(p);
@@ -2519,7 +2513,7 @@ int AffixMgr::compound_check_morph(const char* word,
// LANG_hu section: spec. Hungarian rule
||
((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
- (rv = affix_check(st, i)) &&
+ (rv = affix_check(st.c_str(), i)) &&
(sfx && sfx->getCont() &&
(TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
@@ -2528,11 +2522,11 @@ int AffixMgr::compound_check_morph(const char* word,
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(st, i);
+ numsyllable += get_syllable(st.substr(0, i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
}
// END of LANG_hu section
@@ -2608,7 +2602,7 @@ int AffixMgr::compound_check_morph(const char* word,
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
- (numsyllable + get_syllable(HENTRY_WORD(rv), rv->blen) <=
+ (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
cpdmaxsyllable))) &&
((!checkcompounddup || (rv != rv_first)))) {
// bad compound word
@@ -2701,19 +2695,19 @@ int AffixMgr::compound_check_morph(const char* word,
if (langnum == LANG_hu) {
// calculate syllable number of the word
- numsyllable += get_syllable(word + i, strlen(word + i));
+ numsyllable += get_syllable(word + i);
// - affix syllable num.
// XXX only second suffix (inflections, not derivations)
if (sfxappnd) {
- char* tmp = myrevstrdup(sfxappnd);
- numsyllable -= get_syllable(tmp, strlen(tmp)) + sfxextra;
- free(tmp);
+ std::string tmp(sfxappnd);
+ reverseword(tmp);
+ numsyllable -= get_syllable(tmp) + sfxextra;
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
- if (pfx && (get_syllable(pfx->getKey(), strlen(pfx->getKey())) > 1))
+ if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
// increment syllable num, if last word has a SYLLABLENUM flag
@@ -2779,7 +2773,7 @@ int AffixMgr::compound_check_morph(const char* word,
// perhaps second word is a compound word (recursive call)
if ((wordnum < maxwordnum) && (ok == 0)) {
compound_check_morph((word + i), strlen(word + i), wordnum + 1,
- numsyllable, maxwordnum, wnum + 1, words, 0,
+ numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
result, presult);
} else {
rv = NULL;
@@ -2795,6 +2789,7 @@ int AffixMgr::compound_check_morph(const char* word,
return 0;
}
+
// return 1 if s1 (reversed) is a leading subset of end of s2
/* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int
len)
@@ -3402,7 +3397,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst,
unsigned short al,
const char* bad,
int badl,
- char* phon) {
+ const char* phon) {
int nh = 0;
// first add root word to list
if ((nh < maxn) &&
@@ -3653,8 +3648,7 @@ char* AffixMgr::get_ignore() const {
}
// return the preferred ignore string for suggestions
-unsigned short* AffixMgr::get_ignore_utf16(int* len) const {
- *len = ignorechars_utf16_len;
+const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
return ignorechars_utf16;
}
@@ -3677,8 +3671,7 @@ const char* AffixMgr::get_wordchars() const {
return wordchars;
}
-unsigned short* AffixMgr::get_wordchars_utf16(int* len) const {
- *len = wordchars_utf16_len;
+const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
return wordchars_utf16;
}
@@ -3840,7 +3833,6 @@ int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) {
char* piece;
int i = 0;
int np = 0;
- w_char w[MAXWORDLEN];
piece = mystrsep(&tp, 0);
while (piece) {
if (*piece != '\0') {
@@ -3858,15 +3850,16 @@ int AffixMgr::parse_cpdsyllable(char* line, FileMgr* af) {
if (!utf8) {
cpdvowels = mystrdup(piece);
} else {
- int n = u8_u16(w, MAXWORDLEN, piece);
- if (n > 0) {
- flag_qsort((unsigned short*)w, 0, n);
- cpdvowels_utf16 = (w_char*)malloc(n * sizeof(w_char));
+ std::vector<w_char> w;
+ u8_u16(w, piece);
+ if (!w.empty()) {
+ std::sort(w.begin(), w.end());
+ cpdvowels_utf16 = (w_char*)malloc(w.size() * sizeof(w_char));
if (!cpdvowels_utf16)
return 1;
- memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
+ memcpy(cpdvowels_utf16, &w[0], w.size() * sizeof(w_char));
}
- cpdvowels_utf16_len = n;
+ cpdvowels_utf16_len = w.size();
}
np++;
break;
@@ -4636,13 +4629,16 @@ int AffixMgr::parse_breaktable(char* line, FileMgr* af) {
return 0;
}
-void AffixMgr::reverse_condition(char* piece) {
+void AffixMgr::reverse_condition(std::string& piece) {
+ if (piece.empty())
+ return;
+
int neg = 0;
- for (char* k = piece + strlen(piece) - 1; k >= piece; k--) {
+ for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
switch (*k) {
case '[': {
if (neg)
- *(k + 1) = '[';
+ *(k - 1) = '[';
else
*k = ']';
break;
@@ -4650,20 +4646,20 @@ void AffixMgr::reverse_condition(char* piece) {
case ']': {
*k = '[';
if (neg)
- *(k + 1) = '^';
+ *(k - 1) = '^';
neg = 0;
break;
}
case '^': {
- if (*(k + 1) == ']')
+ if (*(k - 1) == ']')
neg = 1;
else
- *(k + 1) = *k;
+ *(k - 1) = *k;
break;
}
default: {
if (neg)
- *(k + 1) = *k;
+ *(k - 1) = *k;
}
}
}
@@ -4731,8 +4727,8 @@ int AffixMgr::parse_affix(char* line,
case 3: {
np++;
numents = atoi(piece);
- if ((numents <= 0) || ((::std::numeric_limits<size_t>::max() /
- sizeof(struct affentry)) < numents)) {
+ if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
+ sizeof(struct affentry)) < static_cast<size_t>(numents))) {
char* err = pHMgr->encode_flag(aflag);
if (err) {
HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
@@ -4817,18 +4813,15 @@ int AffixMgr::parse_affix(char* line,
// piece 3 - is string to strip or 0 for null
case 2: {
np++;
+ entry->strip = piece;
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->strip);
else
- reverseword(piece);
+ reverseword(entry->strip);
}
- entry->strip = mystrdup(piece);
- entry->stripl = (unsigned char)strlen(entry->strip);
- if (strcmp(entry->strip, "0") == 0) {
- free(entry->strip);
- entry->strip = mystrdup("");
- entry->stripl = 0;
+ if (entry->strip.compare("0") == 0) {
+ entry->strip.clear();
}
break;
}
@@ -4844,22 +4837,22 @@ int AffixMgr::parse_affix(char* line,
if (dash) {
*dash = '\0';
+ entry->appnd = piece;
+
if (ignorechars) {
if (utf8) {
- remove_ignored_chars_utf(piece, ignorechars_utf16,
- ignorechars_utf16_len);
+ remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
- remove_ignored_chars(piece, ignorechars);
+ remove_ignored_chars(entry->appnd, ignorechars);
}
}
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->appnd);
else
- reverseword(piece);
+ reverseword(entry->appnd);
}
- entry->appnd = mystrdup(piece);
if (pHMgr->is_aliasf()) {
int index = atoi(dash + 1);
@@ -4872,7 +4865,7 @@ int AffixMgr::parse_affix(char* line,
} else {
entry->contclasslen = (unsigned short)pHMgr->decode_flags(
&(entry->contclass), dash + 1, af);
- flag_qsort(entry->contclass, 0, entry->contclasslen);
+ std::sort(entry->contclass, entry->contclass + entry->contclasslen);
}
*dash = '/';
@@ -4881,74 +4874,74 @@ int AffixMgr::parse_affix(char* line,
contclasses[(entry->contclass)[_i]] = 1;
}
} else {
+ entry->appnd = piece;
+
if (ignorechars) {
if (utf8) {
- remove_ignored_chars_utf(piece, ignorechars_utf16,
- ignorechars_utf16_len);
+ remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
} else {
- remove_ignored_chars(piece, ignorechars);
+ remove_ignored_chars(entry->appnd, ignorechars);
}
}
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(entry->appnd);
else
- reverseword(piece);
+ reverseword(entry->appnd);
}
- entry->appnd = mystrdup(piece);
}
- entry->appndl = (unsigned char)strlen(entry->appnd);
- if (strcmp(entry->appnd, "0") == 0) {
- free(entry->appnd);
- entry->appnd = mystrdup("");
- entry->appndl = 0;
+ if (entry->appnd.compare("0") == 0) {
+ entry->appnd.clear();
}
break;
}
// piece 5 - is the conditions descriptions
case 4: {
+ std::string chunk(piece);
np++;
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(chunk);
else
- reverseword(piece);
- reverse_condition(piece);
+ reverseword(chunk);
+ reverse_condition(chunk);
}
- if (entry->stripl && (strcmp(piece, ".") != 0) &&
- redundant_condition(at, entry->strip, entry->stripl, piece,
+ if (!entry->strip.empty() && chunk != "." &&
+ redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
af->getlinenum()))
- strcpy(piece, ".");
+ chunk = ".";
if (at == 'S') {
- reverseword(piece);
- reverse_condition(piece);
+ reverseword(chunk);
+ reverse_condition(chunk);
}
- if (encodeit(*entry, piece))
+ if (encodeit(*entry, chunk.c_str()))
return 1;
break;
}
case 5: {
+ std::string chunk(piece);
np++;
if (pHMgr->is_aliasm()) {
- int index = atoi(piece);
+ int index = atoi(chunk.c_str());
entry->morphcode = pHMgr->get_aliasm(index);
} else {
if (complexprefixes) { // XXX - fix me for morph. gen.
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(chunk);
else
- reverseword(piece);
+ reverseword(chunk);
}
// add the remaining of the line
if (*tp) {
*(tp - 1) = ' ';
- tp = tp + strlen(tp);
+ chunk.push_back(' ');
+ chunk.append(tp);
}
- entry->morphcode = mystrdup(piece);
+ entry->morphcode = mystrdup(chunk.c_str());
if (!entry->morphcode)
return 1;
}
@@ -5002,7 +4995,7 @@ int AffixMgr::parse_affix(char* line,
}
int AffixMgr::redundant_condition(char ft,
- char* strip,
+ const char* strip,
int stripl,
const char* cond,
int linenum) {
@@ -5112,11 +5105,7 @@ int AffixMgr::get_suffix_words(short unsigned* suff,
hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, NULL, 0,
NULL, 0, 0, 0);
if (ht) {
- slst[suff_words_cnt] = (char*)malloc(MAXWORDUTF8LEN * sizeof(char));
- if (slst[suff_words_cnt]) {
- strcpy(slst[suff_words_cnt], nw.c_str());
- suff_words_cnt++;
- }
+ slst[suff_words_cnt++] = mystrdup(nw.c_str());
}
}
suff++;
diff --git a/libs/hunspell/src/affixmgr.hxx b/libs/hunspell/src/affixmgr.hxx
index ca376953cd..d70e853388 100644
--- a/libs/hunspell/src/affixmgr.hxx
+++ b/libs/hunspell/src/affixmgr.hxx
@@ -160,11 +160,9 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
PfxEntry* pfx; // BUG: not stateless
int checknum;
char* wordchars;
- unsigned short* wordchars_utf16;
- int wordchars_utf16_len;
+ std::vector<w_char> wordchars_utf16;
char* ignorechars;
- unsigned short* ignorechars_utf16;
- int ignorechars_utf16_len;
+ std::vector<w_char> ignorechars_utf16;
char* version;
char* lang;
int langnum;
@@ -258,9 +256,9 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
unsigned short al,
const char* bad,
int,
- char*);
+ const char*);
- short get_syllable(const char* word, int wlen);
+ short get_syllable(const std::string& word);
int cpdrep_check(const char* word, int len);
int cpdpat_check(const char* word,
int len,
@@ -282,6 +280,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
short maxwordnum,
short wnum,
hentry** words,
+ hentry** rwords,
char hu_mov_rule,
char is_sug,
int* info);
@@ -293,6 +292,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
short maxwordnum,
short wnum,
hentry** words,
+ hentry** rwords,
char hu_mov_rule,
char** result,
char* partresult);
@@ -317,9 +317,9 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
char* get_key_string();
char* get_try_string() const;
const char* get_wordchars() const;
- unsigned short* get_wordchars_utf16(int* len) const;
+ const std::vector<w_char>& get_wordchars_utf16() const;
char* get_ignore() const;
- unsigned short* get_ignore_utf16(int* len) const;
+ const std::vector<w_char>& get_ignore_utf16() const;
int get_compound() const;
FLAG get_compoundflag() const;
FLAG get_compoundbegin() const;
@@ -370,11 +370,11 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
int parse_defcpdtable(char* line, FileMgr* af);
int parse_affix(char* line, const char at, FileMgr* af, char* dupflags);
- void reverse_condition(char*);
+ void reverse_condition(std::string&);
void debugflag(char* result, unsigned short flag);
std::string& debugflag(std::string& result, unsigned short flag);
- int condlen(char*);
- int encodeit(affentry& entry, char* cs);
+ int condlen(const char*);
+ int encodeit(affentry& entry, const char* cs);
int build_pfxtree(PfxEntry* pfxptr);
int build_sfxtree(SfxEntry* sfxptr);
int process_pfx_order();
@@ -383,7 +383,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffixMgr {
SfxEntry* process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr);
int process_pfx_tree_to_list();
int process_sfx_tree_to_list();
- int redundant_condition(char, char* strip, int stripl, const char* cond, int);
+ int redundant_condition(char, const char* strip, int stripl, const char* cond, int);
void finishFileMgr(FileMgr* afflst);
};
diff --git a/libs/hunspell/src/atypes.hxx b/libs/hunspell/src/atypes.hxx
index d71f62a32d..60826af20e 100644
--- a/libs/hunspell/src/atypes.hxx
+++ b/libs/hunspell/src/atypes.hxx
@@ -57,11 +57,11 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {}
#include "hashmgr.hxx"
#include "w_char.hxx"
+#include <algorithm>
+#include <string>
#define SETSIZE 256
#define CONTSIZE 65536
-#define MAXWORDLEN 100
-#define MAXWORDUTF8LEN 256
// affentry options
#define aeXPRODUCT (1 << 0)
@@ -98,14 +98,11 @@ static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {}
#define FLAG_NULL 0x00
#define FREE_FLAG(a) a = 0
-#define TESTAFF(a, b, c) \
- (flag_bsearch((unsigned short*)a, (unsigned short)b, c))
+#define TESTAFF(a, b, c) (std::binary_search(a, a + c, b))
struct affentry {
- char* strip;
- char* appnd;
- unsigned char stripl;
- unsigned char appndl;
+ std::string strip;
+ std::string appnd;
char numconds;
char opts;
unsigned short aflag;
diff --git a/libs/hunspell/src/baseaffix.hxx b/libs/hunspell/src/baseaffix.hxx
index cfc6b71ca4..59256e92f3 100644
--- a/libs/hunspell/src/baseaffix.hxx
+++ b/libs/hunspell/src/baseaffix.hxx
@@ -59,7 +59,7 @@ class LIBHUNSPELL_DLL_EXPORTED AffEntry {
contclasslen(0) {}
std::string appnd;
std::string strip;
- char numconds;
+ unsigned char numconds;
char opts;
unsigned short aflag;
union {
diff --git a/libs/hunspell/src/config.h b/libs/hunspell/src/config.h
index f1963148b6..1230ed0be7 100644
--- a/libs/hunspell/src/config.h
+++ b/libs/hunspell/src/config.h
@@ -179,9 +179,6 @@
/* Define to 1 if you have the `__argz_stringify' function. */
#define HAVE___ARGZ_STRINGIFY 1
-/* "Define if you use exterimental functions" */
-//#define HUNSPELL_EXPERIMENTAL 1
-
/* "Define if you need warning messages" */
#define HUNSPELL_WARNING_ON
@@ -204,5 +201,5 @@
#define PACKAGE_TARNAME
/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.3.4"
-#define VERSION "1.3.4"
+#define PACKAGE_VERSION "1.4.0"
+#define VERSION "1.4.0"
diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.c++
index d7411bb216..1948e4a3b3 100644
--- a/libs/hunspell/src/csutil.c++
+++ b/libs/hunspell/src/csutil.c++
@@ -144,53 +144,6 @@ FILE* myfopen(const char* path, const char* mode) {
return fopen(path, mode);
}
-/* only UTF-16 (BMP) implementation */
-char* u16_u8(char* dest, int size, const w_char* src, int srclen) {
- signed char* u8 = (signed char*)dest;
- signed char* u8_max = (signed char*)(u8 + size);
- const w_char* u2 = src;
- const w_char* u2_max = src + srclen;
- while ((u2 < u2_max) && (u8 < u8_max)) {
- if (u2->h) { // > 0xFF
- // XXX 4-byte haven't implemented yet.
- if (u2->h >= 0x08) { // >= 0x800 (3-byte UTF-8 character)
- *u8 = 0xe0 + (u2->h >> 4);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + ((u2->h & 0xf) << 2) + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- }
- } else { // < 0x800 (2-byte UTF-8 character)
- *u8 = 0xc0 + (u2->h << 2) + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- }
- } else { // <= 0xFF
- if (u2->l & 0x80) { // >0x80 (2-byte UTF-8 character)
- *u8 = 0xc0 + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- } else { // < 0x80 (1-byte UTF-8 character)
- *u8 = u2->l;
- u8++;
- }
- }
- u2++;
- }
- *u8 = '\0';
- return dest;
-}
-
std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) {
dest.clear();
std::vector<w_char>::const_iterator u2 = src.begin();
@@ -228,93 +181,6 @@ std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) {
return dest;
}
-/* only UTF-16 (BMP) implementation */
-int u8_u16(w_char* dest, int size, const char* src) {
- const signed char* u8 = (const signed char*)src;
- w_char* u2 = dest;
- w_char* u2_max = u2 + size;
-
- while ((u2 < u2_max) && *u8) {
- switch ((*u8) & 0xf0) {
- case 0x00:
- case 0x10:
- case 0x20:
- case 0x30:
- case 0x40:
- case 0x50:
- case 0x60:
- case 0x70: {
- u2->h = 0;
- u2->l = *u8;
- break;
- }
- case 0x80:
- case 0x90:
- case 0xa0:
- case 0xb0: {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Unexpected continuation bytes "
- "in %ld. character position\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- break;
- }
- case 0xc0:
- case 0xd0: { // 2-byte UTF-8 codes
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->h = (*u8 & 0x1f) >> 2;
- u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
- u8++;
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte in "
- "%ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- break;
- }
- case 0xe0: { // 3-byte UTF-8 codes
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2);
- u8++;
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
- u8++;
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte "
- "in %ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte in "
- "%ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- break;
- }
- case 0xf0: { // 4 or more byte UTF-8 codes
- HUNSPELL_WARNING(
- stderr, "This UTF-8 encoding can't convert to UTF-16:\n%s\n", src);
- u2->h = 0xff;
- u2->l = 0xfd;
- return -1;
- }
- }
- u8++;
- u2++;
- }
- return (int)(u2 - dest);
-}
-
int u8_u16(std::vector<w_char>& dest, const std::string& src) {
dest.clear();
std::string::const_iterator u8 = src.begin();
@@ -370,7 +236,7 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) {
u2.h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2);
++u8;
if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2.l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
+ u2.l = (static_cast<unsigned char>(*u8) << 6) + (*(u8 + 1) & 0x3f);
++u8;
} else {
HUNSPELL_WARNING(stderr,
@@ -409,48 +275,6 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) {
return dest.size();
}
-void flag_qsort(unsigned short flags[], int begin, int end) {
- unsigned short reg;
- if (end > begin) {
- unsigned short pivot = flags[begin];
- int l = begin + 1;
- int r = end;
- while (l < r) {
- if (flags[l] <= pivot) {
- l++;
- } else {
- r--;
- reg = flags[l];
- flags[l] = flags[r];
- flags[r] = reg;
- }
- }
- l--;
- reg = flags[begin];
- flags[begin] = flags[l];
- flags[l] = reg;
-
- flag_qsort(flags, begin, l);
- flag_qsort(flags, r, end);
- }
-}
-
-int flag_bsearch(unsigned short flags[], unsigned short flag, int length) {
- int mid;
- int left = 0;
- int right = length - 1;
- while (left <= right) {
- mid = (left + right) / 2;
- if (flags[mid] == flag)
- return 1;
- if (flag < flags[mid])
- right = mid - 1;
- else
- left = mid + 1;
- }
- return 0;
-}
-
// strip strings into token based on single char delimiter
// acts like strsep() but only uses a delim char and not
// a delim string
@@ -519,25 +343,6 @@ void mychomp(char* s) {
*(s + k - 2) = '\0';
}
-// does an ansi strdup of the reverse of a string
-char* myrevstrdup(const char* s) {
- char* d = NULL;
- if (s) {
- size_t sl = strlen(s);
- d = (char*)malloc(sl + 1);
- if (d) {
- const char* p = s + sl - 1;
- char* q = d;
- while (p >= s)
- *q++ = *p--;
- *q = '\0';
- } else {
- HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
- }
- }
- return d;
-}
-
// break text to lines
// return number of lines
int line_tok(const char* text, char*** lines, char breakchar) {
@@ -654,26 +459,6 @@ char* line_uniq_app(char** text, char breakchar) {
}
// append s to ends of every lines in text
-void strlinecat(char* dest, const char* s) {
- char* dup = mystrdup(dest);
- char* source = dup;
- int len = strlen(s);
- if (dup) {
- while (*source) {
- if (*source == '\n') {
- strncpy(dest, s, len);
- dest += len;
- }
- *dest = *source;
- source++;
- dest++;
- }
- strcpy(dest, s);
- free(dup);
- }
-}
-
-// append s to ends of every lines in text
std::string& strlinecat(std::string& str, const std::string& apd) {
size_t pos = 0;
while ((pos = str.find('\n', pos)) != std::string::npos) {
@@ -684,15 +469,6 @@ std::string& strlinecat(std::string& str, const std::string& apd) {
return str;
}
-// change \n to char c
-char* tr(char* text, char oldc, char newc) {
- char* p;
- for (p = text; *p; p++)
- if (*p == oldc)
- *p = newc;
- return text;
-}
-
// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
// in the first line of the inputs
// return 0, if inputs equal
@@ -807,23 +583,6 @@ int fieldlen(const char* r) {
return n;
}
-char* copy_field(char* dest, const char* morph, const char* var) {
- if (!morph)
- return NULL;
- const char* beg = strstr(morph, var);
- if (beg) {
- char* d = dest;
- for (beg += MORPH_TAG_LEN;
- *beg != ' ' && *beg != '\t' && *beg != '\n' && *beg != '\0';
- d++, beg++) {
- *d = *beg;
- }
- *d = '\0';
- return dest;
- }
- return NULL;
-}
-
bool copy_field(std::string& dest,
const std::string& morph,
const std::string& var) {
@@ -884,47 +643,18 @@ char* mystrrep(char* word, const char* pat, const char* rep) {
}
// reverse word
-int reverseword(char* word) {
- char r;
- for (char *dest = word + strlen(word) - 1; word < dest; word++, dest--) {
- r = *word;
- *word = *dest;
- *dest = r;
- }
- return 0;
-}
-
-// reverse word
-std::string& reverseword(std::string& word) {
+size_t reverseword(std::string& word) {
std::reverse(word.begin(), word.end());
- return word;
-}
-
-// reverse word (error: 1)
-int reverseword_utf(char* word) {
- w_char w[MAXWORDLEN];
- w_char* p;
- w_char r;
- int l = u8_u16(w, MAXWORDLEN, word);
- if (l == -1)
- return 1;
- p = w;
- for (w_char *dest = w + l - 1; p < dest; p++, dest--) {
- r = *p;
- *p = *dest;
- *dest = r;
- }
- u16_u8(word, MAXWORDUTF8LEN, w, l);
- return 0;
+ return word.size();
}
// reverse word
-std::string& reverseword_utf(std::string& word) {
+size_t reverseword_utf(std::string& word) {
std::vector<w_char> w;
u8_u16(w, word);
std::reverse(w.begin(), w.end());
u16_u8(word, w);
- return word;
+ return w.size();
}
int uniqlist(char** list, int n) {
@@ -978,12 +708,22 @@ unsigned char ccase(const struct cs_info* csconv, int nIndex) {
}
}
-// convert null terminated string to all caps
-void mkallcap(char* p, const struct cs_info* csconv) {
- while (*p != '\0') {
- *p = cupper(csconv, static_cast<unsigned char>(*p));
- p++;
+w_char upper_utf(w_char u, int langnum) {
+ unsigned short idx = (u.h << 8) + u.l;
+ if (idx != unicodetoupper(idx, langnum)) {
+ u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
+ u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ }
+ return u;
+}
+
+w_char lower_utf(w_char u, int langnum) {
+ unsigned short idx = (u.h << 8) + u.l;
+ if (idx != unicodetolower(idx, langnum)) {
+ u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
+ u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
}
+ return u;
}
// convert std::string to all caps
@@ -994,14 +734,6 @@ std::string& mkallcap(std::string& s, const struct cs_info* csconv) {
return s;
}
-// convert null terminated string to all little
-void mkallsmall(char* p, const struct cs_info* csconv) {
- while (*p != '\0') {
- *p = clower(csconv, static_cast<unsigned char>(*p));
- p++;
- }
-}
-
// convert std::string to all little
std::string& mkallsmall(std::string& s, const struct cs_info* csconv) {
for (std::string::iterator aI = s.begin(), aEnd = s.end(); aI != aEnd; ++aI) {
@@ -1010,20 +742,9 @@ std::string& mkallsmall(std::string& s, const struct cs_info* csconv) {
return s;
}
-void mkallsmall_utf(w_char* u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
- unsigned short idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetolower(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
- }
- }
-}
-
std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
- int nc,
int langnum) {
- for (int i = 0; i < nc; i++) {
+ for (size_t i = 0; i < u.size(); ++i) {
unsigned short idx = (u[i].h << 8) + u[i].l;
if (idx != unicodetolower(idx, langnum)) {
u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
@@ -1033,31 +754,51 @@ std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
return u;
}
-void mkallcap_utf(w_char* u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
+std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) {
+ for (size_t i = 0; i < u.size(); i++) {
unsigned short idx = (u[i].h << 8) + u[i].l;
if (idx != unicodetoupper(idx, langnum)) {
u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
}
}
+ return u;
}
-std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
- unsigned short idx = (u[i].h << 8) + u[i].l;
+std::string& mkinitcap(std::string& s, const struct cs_info* csconv) {
+ if (!s.empty()) {
+ s[0] = cupper(csconv, static_cast<unsigned char>(s[0]));
+ }
+ return s;
+}
+
+std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) {
+ if (!u.empty()) {
+ unsigned short idx = (u[0].h << 8) + u[0].l;
if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
+ u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
}
}
return u;
}
-// convert null terminated string to have initial capital
-void mkinitcap(char* p, const struct cs_info* csconv) {
- if (*p != '\0')
- *p = cupper(csconv, static_cast<unsigned char>(*p));
+std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) {
+ if (!s.empty()) {
+ s[0] = clower(csconv, static_cast<unsigned char>(s[0]));
+ }
+ return s;
+}
+
+std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) {
+ if (!u.empty()) {
+ unsigned short idx = (u[0].h << 8) + u[0].l;
+ if (idx != unicodetolower(idx, langnum)) {
+ u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
+ u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
+ }
+ }
+ return u;
}
// conversion function for protected memory
@@ -1073,35 +814,6 @@ char* get_stored_pointer(const char* s) {
}
#ifndef MOZILLA_CLIENT
-// convert null terminated string to all caps using encoding
-void enmkallcap(char* d, const char* p, const char* encoding)
-
-{
- struct cs_info* csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = cupper(csconv, static_cast<unsigned char>(*p));
- p++;
- }
- *d = '\0';
-}
-
-// convert null terminated string to all little using encoding
-void enmkallsmall(char* d, const char* p, const char* encoding) {
- struct cs_info* csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = clower(csconv, static_cast<unsigned char>(*p));
- p++;
- }
- *d = '\0';
-}
-
-// convert null terminated string to have initial capital using encoding
-void enmkinitcap(char* d, const char* p, const char* encoding) {
- struct cs_info* csconv = get_current_cs(encoding);
- memcpy(d, p, (strlen(p) + 1));
- if (*p != '\0')
- *d = cupper(csconv, static_cast<unsigned char>(*p));
-}
// these are simple character mappings for the
// encodings supported
@@ -2982,14 +2694,14 @@ int unicodeisalpha(unsigned short c) {
}
/* get type of capitalization */
-int get_captype(char* word, int nl, cs_info* csconv) {
+int get_captype(const std::string& word, cs_info* csconv) {
// now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int firstcap = 0;
+ size_t ncap = 0;
+ size_t nneutral = 0;
+ size_t firstcap = 0;
if (csconv == NULL)
return NOCAP;
- for (char* q = word; *q != '\0'; q++) {
+ for (std::string::const_iterator q = word.begin(); q != word.end(); ++q) {
unsigned char nIndex = static_cast<unsigned char>(*q);
if (ccase(csconv, nIndex))
ncap++;
@@ -3006,7 +2718,7 @@ int get_captype(char* word, int nl, cs_info* csconv) {
return NOCAP;
} else if ((ncap == 1) && firstcap) {
return INITCAP;
- } else if ((ncap == nl) || ((ncap + nneutral) == nl)) {
+ } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) {
return ALLCAP;
} else if ((ncap > 1) && firstcap) {
return HUHINITCAP;
@@ -3014,27 +2726,20 @@ int get_captype(char* word, int nl, cs_info* csconv) {
return HUHCAP;
}
-int get_captype_utf8(w_char* word, int nl, int langnum) {
+int get_captype_utf8(const std::vector<w_char>& word, int langnum) {
// now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int firstcap = 0;
- unsigned short idx;
- // don't check too long words
- if (nl >= MAXWORDLEN)
- return 0;
- // big Unicode character (non BMP area)
- if (nl == -1)
- return NOCAP;
- for (int i = 0; i < nl; i++) {
- idx = (word[i].h << 8) + word[i].l;
+ size_t ncap = 0;
+ size_t nneutral = 0;
+ size_t firstcap = 0;
+ for (size_t i = 0; i < word.size(); ++i) {
+ unsigned short idx = (word[i].h << 8) + word[i].l;
if (idx != unicodetolower(idx, langnum))
ncap++;
if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum))
nneutral++;
}
if (ncap) {
- idx = (word[0].h << 8) + word[0].l;
+ unsigned short idx = (word[0].h << 8) + word[0].l;
firstcap = (idx != unicodetolower(idx, langnum));
}
@@ -3043,7 +2748,7 @@ int get_captype_utf8(w_char* word, int nl, int langnum) {
return NOCAP;
} else if ((ncap == 1) && firstcap) {
return INITCAP;
- } else if ((ncap == nl) || ((ncap + nneutral) == nl)) {
+ } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) {
return ALLCAP;
} else if ((ncap > 1) && firstcap) {
return HUHINITCAP;
@@ -3052,63 +2757,22 @@ int get_captype_utf8(w_char* word, int nl, int langnum) {
}
// strip all ignored characters in the string
-void remove_ignored_chars_utf(char* word,
- unsigned short ignored_chars[],
- int ignored_len) {
- w_char w[MAXWORDLEN];
- w_char w2[MAXWORDLEN];
- int i;
- int j;
- int len = u8_u16(w, MAXWORDLEN, word);
- for (i = 0, j = 0; i < len; i++) {
- if (!flag_bsearch(ignored_chars, ((unsigned short*)w)[i], ignored_len)) {
- w2[j] = w[i];
- j++;
- }
- }
- if (j < i)
- u16_u8(word, MAXWORDUTF8LEN, w2, j);
-}
-
-namespace {
-union w_s {
- w_char w;
- unsigned short s;
-};
-
-unsigned short asushort(w_char in) {
- w_s c;
- c.w = in;
- return c.s;
-}
-}
-
-// strip all ignored characters in the string
-std::string& remove_ignored_chars_utf(std::string& word,
- unsigned short ignored_chars[],
- int ignored_len) {
+size_t remove_ignored_chars_utf(std::string& word,
+ const std::vector<w_char>& ignored_chars) {
std::vector<w_char> w;
std::vector<w_char> w2;
u8_u16(w, word);
for (size_t i = 0; i < w.size(); ++i) {
- if (!flag_bsearch(ignored_chars, asushort(w[i]), ignored_len))
+ if (!std::binary_search(ignored_chars.begin(),
+ ignored_chars.end(),
+ w[i])) {
w2.push_back(w[i]);
+ }
}
u16_u8(word, w2);
- return word;
-}
-
-// strip all ignored characters in the string
-void remove_ignored_chars(char* word, char* ignored_chars) {
- for (char* p = word; *p != '\0'; p++) {
- if (!strchr(ignored_chars, *p)) {
- *word = *p;
- word++;
- }
- }
- *word = '\0';
+ return w2.size();
}
namespace {
@@ -3119,16 +2783,17 @@ class is_any_of {
bool operator()(char c) { return chars.find(c) != std::string::npos; }
private:
- const std::string& chars;
+ std::string chars;
};
}
// strip all ignored characters in the string
-std::string& remove_ignored_chars(std::string& word,
- const std::string& ignored_chars) {
+size_t remove_ignored_chars(std::string& word,
+ const std::string& ignored_chars) {
word.erase(
- std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)));
- return word;
+ std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)),
+ word.end());
+ return word.size();
}
int parse_string(char* line, char** out, int ln) {
@@ -3170,25 +2835,16 @@ int parse_string(char* line, char** out, int ln) {
return 0;
}
-int parse_array(char* line,
- char** out,
- unsigned short** out_utf16,
- int* out_utf16_len,
- int utf8,
- int ln) {
+bool parse_array(char* line,
+ char** out,
+ std::vector<w_char>& out_utf16,
+ int utf8,
+ int ln) {
if (parse_string(line, out, ln))
- return 1;
+ return false;
if (utf8) {
- w_char w[MAXWORDLEN];
- int n = u8_u16(w, MAXWORDLEN, *out);
- if (n > 0) {
- flag_qsort((unsigned short*)w, 0, n);
- *out_utf16 = (unsigned short*)malloc(n * sizeof(unsigned short));
- if (!*out_utf16)
- return 1;
- memcpy(*out_utf16, w, n * sizeof(unsigned short));
- }
- *out_utf16_len = n;
+ u8_u16(out_utf16, *out);
+ std::sort(out_utf16.begin(), out_utf16.end());
}
- return 0;
+ return true;
}
diff --git a/libs/hunspell/src/csutil.hxx b/libs/hunspell/src/csutil.hxx
index cd582933b1..ce7091df55 100644
--- a/libs/hunspell/src/csutil.hxx
+++ b/libs/hunspell/src/csutil.hxx
@@ -131,30 +131,13 @@
LIBHUNSPELL_DLL_EXPORTED FILE* myfopen(const char* path, const char* mode);
// convert UTF-16 characters to UTF-8
-LIBHUNSPELL_DLL_EXPORTED char* u16_u8(char* dest,
- int size,
- const w_char* src,
- int srclen);
-// convert UTF-16 characters to UTF-8
LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
const std::vector<w_char>& src);
// convert UTF-8 characters to UTF-16
-LIBHUNSPELL_DLL_EXPORTED int u8_u16(w_char* dest, int size, const char* src);
-// convert UTF-8 characters to UTF-16
LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
const std::string& src);
-// sort 2-byte vector
-LIBHUNSPELL_DLL_EXPORTED void flag_qsort(unsigned short flags[],
- int begin,
- int end);
-
-// binary search in 2-byte vector
-LIBHUNSPELL_DLL_EXPORTED int flag_bsearch(unsigned short flags[],
- unsigned short flag,
- int right);
-
// remove end of line char(s)
LIBHUNSPELL_DLL_EXPORTED void mychomp(char* s);
@@ -164,13 +147,8 @@ LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
// strcat for limited length destination string
LIBHUNSPELL_DLL_EXPORTED char* mystrcat(char* dest, const char* st, int max);
-// duplicate reverse of string
-LIBHUNSPELL_DLL_EXPORTED char* myrevstrdup(const char* s);
-
// parse into tokens with char delimiter
LIBHUNSPELL_DLL_EXPORTED char* mystrsep(char** sptr, const char delim);
-// parse into tokens with char delimiter
-LIBHUNSPELL_DLL_EXPORTED char* mystrsep2(char** sptr, const char delim);
// replace pat by rep in word and return word
LIBHUNSPELL_DLL_EXPORTED char* mystrrep(char* word,
@@ -181,9 +159,6 @@ LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
const std::string& replace);
// append s to ends of every lines in text
-LIBHUNSPELL_DLL_EXPORTED void strlinecat(char* lines, const char* s);
-
-// append s to ends of every lines in text
LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
const std::string& apd);
@@ -196,18 +171,11 @@ LIBHUNSPELL_DLL_EXPORTED int line_tok(const char* text,
LIBHUNSPELL_DLL_EXPORTED char* line_uniq(char* text, char breakchar);
LIBHUNSPELL_DLL_EXPORTED char* line_uniq_app(char** text, char breakchar);
-// change oldchar to newchar in place
-LIBHUNSPELL_DLL_EXPORTED char* tr(char* text, char oldc, char newc);
-
// reverse word
-LIBHUNSPELL_DLL_EXPORTED int reverseword(char*);
-// reverse word
-LIBHUNSPELL_DLL_EXPORTED std::string& reverseword(std::string& word);
+LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
// reverse word
-LIBHUNSPELL_DLL_EXPORTED int reverseword_utf(char*);
-// reverse word
-LIBHUNSPELL_DLL_EXPORTED std::string& reverseword_utf(std::string&);
+LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
// remove duplicates
LIBHUNSPELL_DLL_EXPORTED int uniqlist(char** list, int n);
@@ -226,6 +194,8 @@ LIBHUNSPELL_DLL_EXPORTED int initialize_utf_tbl();
LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
int langnum);
+LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
+LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
int langnum);
LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
@@ -238,87 +208,64 @@ LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const char* lang);
// get characters of the given 8bit encoding with lower- and uppercase forms
LIBHUNSPELL_DLL_EXPORTED char* get_casechars(const char* enc);
-// convert null terminated string to all caps using encoding
-LIBHUNSPELL_DLL_EXPORTED void enmkallcap(char* d,
- const char* p,
- const char* encoding);
-
-// convert null terminated string to all little using encoding
-LIBHUNSPELL_DLL_EXPORTED void enmkallsmall(char* d,
- const char* p,
- const char* encoding);
-
-// convert null terminated string to have initial capital using encoding
-LIBHUNSPELL_DLL_EXPORTED void enmkinitcap(char* d,
- const char* p,
- const char* encoding);
-
-// convert null terminated string to all caps
-LIBHUNSPELL_DLL_EXPORTED void mkallcap(char* p, const struct cs_info* csconv);
// convert std::string to all caps
LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
const struct cs_info* csconv);
// convert null terminated string to all little
-LIBHUNSPELL_DLL_EXPORTED void mkallsmall(char* p, const struct cs_info* csconv);
-// convert null terminated string to all little
LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
const struct cs_info* csconv);
-// convert null terminated string to have initial capital
-LIBHUNSPELL_DLL_EXPORTED void mkinitcap(char* p, const struct cs_info* csconv);
+// convert first letter of string to little
+LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
+ const struct cs_info* csconv);
+
+// convert first letter of string to capital
+LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
+ const struct cs_info* csconv);
+
+// convert first letter of UTF-8 string to capital
+LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
+mkinitcap_utf(std::vector<w_char>& u, int langnum);
+
+// convert UTF-8 string to little
+LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
+mkallsmall_utf(std::vector<w_char>& u, int langnum);
-// convert first nc characters of UTF-8 string to little
-LIBHUNSPELL_DLL_EXPORTED void mkallsmall_utf(w_char* u, int nc, int langnum);
-// convert first nc characters of UTF-8 string to little
+// convert first letter of UTF-8 string to little
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
-mkallsmall_utf(std::vector<w_char>& u, int nc, int langnum);
+mkinitsmall_utf(std::vector<w_char>& u, int langnum);
-// convert first nc characters of UTF-8 string to capital
-LIBHUNSPELL_DLL_EXPORTED void mkallcap_utf(w_char* u, int nc, int langnum);
-// convert first nc characters of UTF-8 string to capital
+// convert UTF-8 string to capital
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
-mkallcap_utf(std::vector<w_char>& u, int nc, int langnum);
+mkallcap_utf(std::vector<w_char>& u, int langnum);
// get type of capitalization
-LIBHUNSPELL_DLL_EXPORTED int get_captype(char* q, int nl, cs_info*);
+LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
// get type of capitalization (UTF-8)
-LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(w_char* q, int nl, int langnum);
+LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
// strip all ignored characters in the string
-LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars_utf(
- char* word,
- unsigned short ignored_chars[],
- int ignored_len);
-// strip all ignored characters in the string
-LIBHUNSPELL_DLL_EXPORTED std::string& remove_ignored_chars_utf(
+LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
std::string& word,
- unsigned short ignored_chars[],
- int ignored_len);
+ const std::vector<w_char>& ignored_chars);
// strip all ignored characters in the string
-LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars(char* word,
- char* ignored_chars);
-// strip all ignored characters in the string
-LIBHUNSPELL_DLL_EXPORTED std::string& remove_ignored_chars(
+LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
std::string& word,
const std::string& ignored_chars);
LIBHUNSPELL_DLL_EXPORTED int parse_string(char* line, char** out, int ln);
-LIBHUNSPELL_DLL_EXPORTED int parse_array(char* line,
- char** out,
- unsigned short** out_utf16,
- int* out_utf16_len,
- int utf8,
- int ln);
+LIBHUNSPELL_DLL_EXPORTED bool parse_array(char* line,
+ char** out,
+ std::vector<w_char>& out_utf16,
+ int utf8,
+ int ln);
LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
-LIBHUNSPELL_DLL_EXPORTED char* copy_field(char* dest,
- const char* morph,
- const char* var);
LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
const std::string& morph,
const std::string& var);
@@ -375,6 +322,4 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
}
-#define w_char_eq(a, b) (((a).l == (b).l) && ((a).h == (b).h))
-
#endif
diff --git a/libs/hunspell/src/filemgr.c++ b/libs/hunspell/src/filemgr.c++
index cbe41c577b..2218bc79e1 100644
--- a/libs/hunspell/src/filemgr.c++
+++ b/libs/hunspell/src/filemgr.c++
@@ -89,13 +89,9 @@ FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) {
fin = myfopen(file, "r");
if (!fin) {
// check hzipped file
- char* st = (char*)malloc(strlen(file) + strlen(HZIP_EXTENSION) + 1);
- if (st) {
- strcpy(st, file);
- strcat(st, HZIP_EXTENSION);
- hin = new Hunzip(st, key);
- free(st);
- }
+ std::string st(file);
+ st.append(HZIP_EXTENSION);
+ hin = new Hunzip(st.c_str(), key);
}
if (!fin && !hin)
fail(MSG_OPEN, file);
diff --git a/libs/hunspell/src/hashmgr.c++ b/libs/hunspell/src/hashmgr.c++
index dbcf56a51c..c3cd95420f 100644
--- a/libs/hunspell/src/hashmgr.c++
+++ b/libs/hunspell/src/hashmgr.c++
@@ -76,6 +76,7 @@
#include <stdio.h>
#include <ctype.h>
#include <limits>
+#include <sstream>
#include "hashmgr.hxx"
#include "csutil.hxx"
@@ -101,8 +102,6 @@ HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
enc = NULL;
csconv = 0;
ignorechars = NULL;
- ignorechars_utf16 = NULL;
- ignorechars_utf16_len = 0;
load_config(apath, key);
int ec = load_tables(tpath, key);
if (ec) {
@@ -167,8 +166,6 @@ HashMgr::~HashMgr() {
if (ignorechars)
free(ignorechars);
- if (ignorechars_utf16)
- free(ignorechars_utf16);
#ifdef MOZILLA_CLIENT
delete[] csconv;
@@ -199,28 +196,56 @@ int HashMgr::add_word(const char* word,
int al,
const char* desc,
bool onlyupcase) {
+
+ std::string *word_copy = NULL;
+ std::string *desc_copy = NULL;
+ if (ignorechars || complexprefixes) {
+ word_copy = new std::string(word, wbl);
+
+ if (ignorechars != NULL) {
+ if (utf8) {
+ wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
+ } else {
+ remove_ignored_chars(*word_copy, ignorechars);
+ }
+ }
+
+ if (complexprefixes) {
+ if (utf8)
+ wcl = reverseword_utf(*word_copy);
+ else
+ reverseword(*word_copy);
+
+ if (desc && !aliasm) {
+ desc_copy = new std::string(desc);
+
+ if (complexprefixes) {
+ if (utf8)
+ reverseword_utf(*desc_copy);
+ else
+ reverseword(*desc_copy);
+ }
+ desc = desc_copy->c_str();
+ }
+ }
+
+ wbl = word_copy->size();
+ word = word_copy->c_str();
+ }
+
bool upcasehomonym = false;
int descl = desc ? (aliasm ? sizeof(char*) : strlen(desc) + 1) : 0;
// variable-length hash record with word and optional fields
struct hentry* hp =
(struct hentry*)malloc(sizeof(struct hentry) + wbl + descl);
- if (!hp)
+ if (!hp) {
+ delete desc_copy;
+ delete word_copy;
return 1;
+ }
+
char* hpw = hp->word;
strcpy(hpw, word);
- if (ignorechars != NULL) {
- if (utf8) {
- remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
- } else {
- remove_ignored_chars(hpw, ignorechars);
- }
- }
- if (complexprefixes) {
- if (utf8)
- reverseword_utf(hpw);
- else
- reverseword(hpw);
- }
int i = hash(hpw);
@@ -239,12 +264,6 @@ int HashMgr::add_word(const char* word,
store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
} else {
strcpy(hpw + wbl + 1, desc);
- if (complexprefixes) {
- if (utf8)
- reverseword_utf(HENTRY_DATA(hp));
- else
- reverseword(HENTRY_DATA(hp));
- }
}
if (strstr(HENTRY_DATA(hp), MORPH_PHON))
hp->var += H_OPT_PHON;
@@ -254,6 +273,8 @@ int HashMgr::add_word(const char* word,
struct hentry* dp = tableptr[i];
if (!dp) {
tableptr[i] = hp;
+ delete desc_copy;
+ delete word_copy;
return 0;
}
while (dp->next != NULL) {
@@ -265,6 +286,8 @@ int HashMgr::add_word(const char* word,
dp->astr = hp->astr;
dp->alen = hp->alen;
free(hp);
+ delete desc_copy;
+ delete word_copy;
return 0;
} else {
dp->next_homonym = hp;
@@ -283,6 +306,8 @@ int HashMgr::add_word(const char* word,
dp->astr = hp->astr;
dp->alen = hp->alen;
free(hp);
+ delete desc_copy;
+ delete word_copy;
return 0;
} else {
dp->next_homonym = hp;
@@ -299,11 +324,13 @@ int HashMgr::add_word(const char* word,
free(hp->astr);
free(hp);
}
+
+ delete desc_copy;
+ delete word_copy;
return 0;
}
-int HashMgr::add_hidden_capitalized_word(char* word,
- int wbl,
+int HashMgr::add_hidden_capitalized_word(const std::string& word,
int wcl,
unsigned short* flags,
int flagslen,
@@ -326,32 +353,34 @@ int HashMgr::add_hidden_capitalized_word(char* word,
memcpy(flags2, flags, flagslen * sizeof(unsigned short));
flags2[flagslen] = ONLYUPCASEFLAG;
if (utf8) {
- char st[BUFSIZE];
- w_char w[BUFSIZE];
- int wlen = u8_u16(w, BUFSIZE, word);
- mkallsmall_utf(w, wlen, langnum);
- mkallcap_utf(w, 1, langnum);
- u16_u8(st, BUFSIZE, w, wlen);
- return add_word(st, wbl, wcl, flags2, flagslen + 1, dp, true);
+ std::string st;
+ std::vector<w_char> w;
+ u8_u16(w, word);
+ mkallsmall_utf(w, langnum);
+ mkinitcap_utf(w, langnum);
+ u16_u8(st, w);
+ return add_word(st.c_str(), st.size(), wcl, flags2, flagslen + 1, dp, true);
} else {
- mkallsmall(word, csconv);
- mkinitcap(word, csconv);
- return add_word(word, wbl, wcl, flags2, flagslen + 1, dp, true);
+ std::string new_word(word);
+ mkallsmall(new_word, csconv);
+ mkinitcap(new_word, csconv);
+ int ret = add_word(new_word.c_str(), new_word.size(), wcl, flags2, flagslen + 1, dp, true);
+ return ret;
}
}
return 0;
}
// detect captype and modify word length for UTF-8 encoding
-int HashMgr::get_clen_and_captype(const char* word, int wbl, int* captype) {
+int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
int len;
if (utf8) {
- w_char dest_utf[BUFSIZE];
- len = u8_u16(dest_utf, BUFSIZE, word);
- *captype = get_captype_utf8(dest_utf, len, langnum);
+ std::vector<w_char> dest_utf;
+ len = u8_u16(dest_utf, word);
+ *captype = get_captype_utf8(dest_utf, langnum);
} else {
- len = wbl;
- *captype = get_captype((char*)word, len, csconv);
+ len = word.size();
+ *captype = get_captype(word, csconv);
}
return len;
}
@@ -370,7 +399,7 @@ int HashMgr::remove(const char* word) {
flags[dp->alen] = forbiddenword;
dp->astr = flags;
dp->alen++;
- flag_qsort(flags, 0, dp->alen);
+ std::sort(flags, flags + dp->alen);
}
dp = dp->next_homonym;
}
@@ -378,8 +407,8 @@ int HashMgr::remove(const char* word) {
}
/* remove forbidden flag to add a personal word to the hash */
-int HashMgr::remove_forbidden_flag(const char* word) {
- struct hentry* dp = lookup(word);
+int HashMgr::remove_forbidden_flag(const std::string& word) {
+ struct hentry* dp = lookup(word.c_str());
if (!dp)
return 1;
while (dp) {
@@ -406,15 +435,15 @@ int HashMgr::remove_forbidden_flag(const char* word) {
}
// add a custom dic. word to the hash table (public)
-int HashMgr::add(const char* word) {
+int HashMgr::add(const std::string& word) {
unsigned short* flags = NULL;
int al = 0;
if (remove_forbidden_flag(word)) {
int captype;
- int wbl = strlen(word);
- int wcl = get_clen_and_captype(word, wbl, &captype);
- add_word(word, wbl, wcl, flags, al, NULL, false);
- return add_hidden_capitalized_word((char*)word, wbl, wcl, flags, al, NULL,
+ int wbl = word.size();
+ int wcl = get_clen_and_captype(word, &captype);
+ add_word(word.c_str(), wbl, wcl, flags, al, NULL, false);
+ return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
captype);
}
return 0;
@@ -427,7 +456,7 @@ int HashMgr::add_with_affix(const char* word, const char* example) {
if (dp && dp->astr) {
int captype;
int wbl = strlen(word);
- int wcl = get_clen_and_captype(word, wbl, &captype);
+ int wcl = get_clen_and_captype(word, &captype);
if (aliasf) {
add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);
} else {
@@ -440,7 +469,7 @@ int HashMgr::add_with_affix(const char* word, const char* example) {
} else
return 1;
}
- return add_hidden_capitalized_word((char*)word, wbl, wcl, dp->astr,
+ return add_hidden_capitalized_word(word, wcl, dp->astr,
dp->alen, NULL, captype);
}
return 1;
@@ -574,7 +603,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
delete dict;
return 6;
}
- flag_qsort(flags, 0, al);
+ std::sort(flags, flags + al);
}
} else {
al = 0;
@@ -584,10 +613,10 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
int captype;
int wbl = strlen(ts);
- int wcl = get_clen_and_captype(ts, wbl, &captype);
+ int wcl = get_clen_and_captype(ts, &captype);
// add the word and its index plus its capitalized form optionally
if (add_word(ts, wbl, wcl, flags, al, dp, false) ||
- add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
+ add_hidden_capitalized_word(ts, wcl, flags, al, dp, captype)) {
delete dict;
return 5;
}
@@ -674,12 +703,13 @@ int HashMgr::decode_flags(unsigned short** result, char* flags, FileMgr* af) {
break;
}
case FLAG_UNI: { // UTF-8 characters
- w_char w[BUFSIZE / 2];
- len = u8_u16(w, BUFSIZE / 2, flags);
+ std::vector<w_char> w;
+ u8_u16(w, flags);
+ len = w.size();
*result = (unsigned short*)malloc(len * sizeof(unsigned short));
if (!*result)
return -1;
- memcpy(*result, w, len * sizeof(short));
+ memcpy(*result, &w[0], len * sizeof(short));
break;
}
default: { // Ispell's one-character flags (erfg -> e r f g)
@@ -712,9 +742,13 @@ unsigned short HashMgr::decode_flag(const char* f) {
i, DEFAULTFLAGS - 1);
s = (unsigned short)i;
break;
- case FLAG_UNI:
- u8_u16((w_char*)&s, 1, f);
+ case FLAG_UNI: {
+ std::vector<w_char> w;
+ u8_u16(w, f);
+ if (!w.empty())
+ memcpy(&s, &w[0], 1 * sizeof(short));
break;
+ }
default:
s = (unsigned short)*((unsigned char*)f);
}
@@ -724,22 +758,24 @@ unsigned short HashMgr::decode_flag(const char* f) {
}
char* HashMgr::encode_flag(unsigned short f) {
- unsigned char ch[10];
if (f == 0)
return mystrdup("(NULL)");
+ std::string ch;
if (flag_mode == FLAG_LONG) {
- ch[0] = (unsigned char)(f >> 8);
- ch[1] = (unsigned char)(f - ((f >> 8) << 8));
- ch[2] = '\0';
+ ch.push_back((unsigned char)(f >> 8));
+ ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
} else if (flag_mode == FLAG_NUM) {
- sprintf((char*)ch, "%d", f);
+ std::ostringstream stream;
+ stream << f;
+ ch = stream.str();
} else if (flag_mode == FLAG_UNI) {
- u16_u8((char*)&ch, 10, (w_char*)&f, 1);
+ const w_char* w_c = (const w_char*)&f;
+ std::vector<w_char> w(w_c, w_c + 1);
+ u16_u8(ch, w);
} else {
- ch[0] = (unsigned char)(f);
- ch[1] = '\0';
+ ch.push_back((unsigned char)(f));
}
- return mystrdup((char*)ch);
+ return mystrdup(ch.c_str());
}
// read in aff file and set flag mode
@@ -824,8 +860,8 @@ int HashMgr::load_config(const char* affpath, const char* key) {
/* parse in the ignored characters (for example, Arabic optional diacritics
* characters */
if (strncmp(line, "IGNORE", 6) == 0) {
- if (parse_array(line, &ignorechars, &ignorechars_utf16,
- &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
+ if (!parse_array(line, &ignorechars, ignorechars_utf16,
+ utf8, afflst->getlinenum())) {
delete afflst;
return 1;
}
@@ -951,7 +987,7 @@ int HashMgr::parse_aliasf(char* line, FileMgr* af) {
case 1: {
aliasflen[j] =
(unsigned short)decode_flags(&(aliasf[j]), piece, af);
- flag_qsort(aliasf[j], 0, aliasflen[j]);
+ std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
break;
}
default:
@@ -1070,19 +1106,14 @@ int HashMgr::parse_aliasm(char* line, FileMgr* af) {
*(tp - 1) = ' ';
tp = tp + strlen(tp);
}
+ std::string chunk(piece);
if (complexprefixes) {
if (utf8)
- reverseword_utf(piece);
+ reverseword_utf(chunk);
else
- reverseword(piece);
- }
- aliasm[j] = mystrdup(piece);
- if (!aliasm[j]) {
- numaliasm = 0;
- free(aliasm);
- aliasm = NULL;
- return 1;
+ reverseword(chunk);
}
+ aliasm[j] = mystrdup(chunk.c_str());
break;
}
default:
diff --git a/libs/hunspell/src/hashmgr.hxx b/libs/hunspell/src/hashmgr.hxx
index c6d72f3c40..95b06b13f9 100644
--- a/libs/hunspell/src/hashmgr.hxx
+++ b/libs/hunspell/src/hashmgr.hxx
@@ -77,9 +77,12 @@
#include "hunvisapi.h"
#include <stdio.h>
+#include <string>
+#include <vector>
#include "htypes.hxx"
#include "filemgr.hxx"
+#include "w_char.hxx"
enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
@@ -95,8 +98,7 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr {
char* lang;
struct cs_info* csconv;
char* ignorechars;
- unsigned short* ignorechars_utf16;
- int ignorechars_utf16_len;
+ std::vector<w_char> ignorechars_utf16;
int numaliasf; // flag vector `compression' with aliases
unsigned short** aliasf;
unsigned short* aliasflen;
@@ -111,7 +113,7 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr {
int hash(const char*) const;
struct hentry* walk_hashtable(int& col, struct hentry* hp) const;
- int add(const char* word);
+ int add(const std::string& word);
int add_with_affix(const char* word, const char* pattern);
int remove(const char* word);
int decode_flags(unsigned short** result, char* flags, FileMgr* af);
@@ -123,7 +125,7 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr {
char* get_aliasm(int index);
private:
- int get_clen_and_captype(const char* word, int wbl, int* captype);
+ int get_clen_and_captype(const std::string& word, int* captype);
int load_tables(const char* tpath, const char* key);
int add_word(const char* word,
int wbl,
@@ -134,15 +136,14 @@ class LIBHUNSPELL_DLL_EXPORTED HashMgr {
bool onlyupcase);
int load_config(const char* affpath, const char* key);
int parse_aliasf(char* line, FileMgr* af);
- int add_hidden_capitalized_word(char* word,
- int wbl,
+ int add_hidden_capitalized_word(const std::string& word,
int wcl,
unsigned short* flags,
int al,
char* dp,
int captype);
int parse_aliasm(char* line, FileMgr* af);
- int remove_forbidden_flag(const char* word);
+ int remove_forbidden_flag(const std::string& word);
};
#endif
diff --git a/libs/hunspell/src/hunspell.c++ b/libs/hunspell/src/hunspell.c++
index 726c72931a..f7c1581087 100644
--- a/libs/hunspell/src/hunspell.c++
+++ b/libs/hunspell/src/hunspell.c++
@@ -85,6 +85,9 @@
#include <limits>
#include <string>
+#define MAXWORDLEN 176
+#define MAXWORDUTF8LEN (MAXWORDLEN * 3)
+
Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) {
encoding = NULL;
csconv = NULL;
@@ -158,14 +161,16 @@ int Hunspell::add_dic(const char* dpath, const char* key) {
// set the capitalization type
// return the length of the "cleaned" (and UTF-8 encoded) word
-int Hunspell::cleanword2(char* dest,
+size_t Hunspell::cleanword2(std::string& dest,
+ std::vector<w_char>& dest_utf,
const char* src,
- w_char* dest_utf,
int* nc,
int* pcaptype,
- int* pabbrev) {
- unsigned char* p = (unsigned char*)dest;
- const unsigned char* q = (const unsigned char*)src;
+ size_t* pabbrev) {
+ dest.clear();
+ dest_utf.clear();
+
+ const char* q = src;
// first skip over any leading blanks
while ((*q != '\0') && (*q == ' '))
@@ -173,7 +178,7 @@ int Hunspell::cleanword2(char* dest,
// now strip off any trailing periods (recording their presence)
*pabbrev = 0;
- int nl = strlen((const char*)q);
+ int nl = strlen(q);
while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
@@ -182,35 +187,26 @@ int Hunspell::cleanword2(char* dest,
// if no characters are left it can't be capitalized
if (nl <= 0) {
*pcaptype = NOCAP;
- *p = '\0';
return 0;
}
- strncpy(dest, (char*)q, nl);
- *(dest + nl) = '\0';
- nl = strlen(dest);
+ dest.append(q, nl);
+ nl = dest.size();
if (utf8) {
- *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
- // don't check too long words
- if (*nc >= MAXWORDLEN)
- return 0;
- if (*nc == -1) { // big Unicode character (non BMP area)
- *pcaptype = NOCAP;
- return nl;
- }
- *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
+ *nc = u8_u16(dest_utf, dest);
+ *pcaptype = get_captype_utf8(dest_utf, langnum);
} else {
- *pcaptype = get_captype(dest, nl, csconv);
+ *pcaptype = get_captype(dest, csconv);
*nc = nl;
}
return nl;
}
-int Hunspell::cleanword(char* dest,
+void Hunspell::cleanword(std::string& dest,
const char* src,
int* pcaptype,
int* pabbrev) {
- unsigned char* p = (unsigned char*)dest;
+ dest.clear();
const unsigned char* q = (const unsigned char*)src;
int firstcap = 0;
@@ -229,8 +225,7 @@ int Hunspell::cleanword(char* dest,
// if no characters are left it can't be capitalized
if (nl <= 0) {
*pcaptype = NOCAP;
- *p = '\0';
- return 0;
+ return;
}
// now determine the capitalization type of the first nl letters
@@ -245,27 +240,25 @@ int Hunspell::cleanword(char* dest,
ncap++;
if (csconv[(*q)].cupper == csconv[(*q)].clower)
nneutral++;
- *p++ = *q++;
+ dest.push_back(*q++);
nl--;
}
// remember to terminate the destination string
- *p = '\0';
- firstcap = csconv[(unsigned char)(*dest)].ccase;
+ firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase;
} else {
- unsigned short idx;
- w_char t[MAXWORDLEN];
- nc = u8_u16(t, MAXWORDLEN, src);
- for (int i = 0; i < nc; i++) {
- idx = (t[i].h << 8) + t[i].l;
+ std::vector<w_char> t;
+ u8_u16(t, src);
+ for (size_t i = 0; i < t.size(); ++i) {
+ unsigned short idx = (t[i].h << 8) + t[i].l;
unsigned short low = unicodetolower(idx, langnum);
if (idx != low)
ncap++;
if (unicodetoupper(idx, langnum) == low)
nneutral++;
}
- u16_u8(dest, MAXWORDUTF8LEN, t, nc);
+ u16_u8(dest, t);
if (ncap) {
- idx = (t[0].h << 8) + t[0].l;
+ unsigned short idx = (t[0].h << 8) + t[0].l;
firstcap = (idx != unicodetolower(idx, langnum));
}
}
@@ -282,117 +275,60 @@ int Hunspell::cleanword(char* dest,
} else {
*pcaptype = HUHCAP;
}
- return strlen(dest);
}
-void Hunspell::mkallcap(char* p) {
+void Hunspell::mkallcap(std::string& u8) {
if (utf8) {
- w_char u[MAXWORDLEN];
- int nc = u8_u16(u, MAXWORDLEN, p);
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
+ std::vector<w_char> u16;
+ u8_u16(u16, u8);
+ ::mkallcap_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].cupper;
- p++;
- }
- }
-}
-
-int Hunspell::mkallcap2(char* p, w_char* u, int nc) {
- if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short up = unicodetoupper(idx, langnum);
- if (idx != up) {
- u[i].h = (unsigned char)(up >> 8);
- u[i].l = (unsigned char)(up & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
- } else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].cupper;
- p++;
- }
- }
- return nc;
-}
-
-void Hunspell::mkallsmall(char* p) {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].clower;
- p++;
+ ::mkallcap(u8, csconv);
}
}
-int Hunspell::mkallsmall2(char* p, w_char* u, int nc) {
+int Hunspell::mkallsmall2(std::string& u8, std::vector<w_char>& u16) {
if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short low = unicodetolower(idx, langnum);
- if (idx != low) {
- u[i].h = (unsigned char)(low >> 8);
- u[i].l = (unsigned char)(low & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+ ::mkallsmall_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- while (*p != '\0') {
- *p = csconv[((unsigned char)*p)].clower;
- p++;
- }
+ ::mkallsmall(u8, csconv);
}
- return nc;
+ return u8.size();
}
// convert UTF-8 sharp S codes to latin 1
-char* Hunspell::sharps_u8_l1(char* dest, char* source) {
- char* p = dest;
- *p = *source;
- for (p++, source++; *(source - 1); p++, source++) {
- *p = *source;
- if (*source == '\x9F')
- *--p = '\xDF';
- }
+std::string Hunspell::sharps_u8_l1(const std::string& source) {
+ std::string dest(source);
+ mystrrep(dest, "\xC3\x9F", "\xDF");
return dest;
}
// recursive search for right ss - sharp s permutations
-hentry* Hunspell::spellsharps(char* base,
- char* pos,
+hentry* Hunspell::spellsharps(std::string& base,
+ size_t n_pos,
int n,
int repnum,
- char* tmp,
int* info,
char** root) {
- pos = strstr(pos, "ss");
- if (pos && (n < MAXSHARPS)) {
- *pos = '\xC3';
- *(pos + 1) = '\x9F';
- hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
+ size_t pos = base.find("ss", n_pos);
+ if (pos != std::string::npos && (n < MAXSHARPS)) {
+ base[pos] = '\xC3';
+ base[pos + 1] = '\x9F';
+ hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, info, root);
if (h)
return h;
- *pos = 's';
- *(pos + 1) = 's';
- h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
+ base[pos] = 's';
+ base[pos + 1] = 's';
+ h = spellsharps(base, pos + 2, n + 1, repnum, info, root);
if (h)
return h;
} else if (repnum > 0) {
if (utf8)
- return checkword(base, info, root);
- return checkword(sharps_u8_l1(tmp, base), info, root);
+ return checkword(base.c_str(), info, root);
+ std::string tmp(sharps_u8_l1(base));
+ return checkword(tmp.c_str(), info, root);
}
return NULL;
}
@@ -403,7 +339,7 @@ int Hunspell::is_keepcase(const hentry* rv) {
}
/* insert a word to the beginning of the suggestion array and return ns */
-int Hunspell::insert_sug(char*** slst, char* word, int ns) {
+int Hunspell::insert_sug(char*** slst, const char* word, int ns) {
if (!*slst)
return ns;
char* dup = mystrdup(word);
@@ -421,11 +357,6 @@ int Hunspell::insert_sug(char*** slst, char* word, int ns) {
int Hunspell::spell(const char* word, int* info, char** root) {
struct hentry* rv = NULL;
- // need larger vector. For example, Turkish capital letter I converted a
- // 2-byte UTF-8 character (dotless i) by mkallsmall.
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- w_char unicw[MAXWORDLEN];
int info2 = 0;
if (!info)
@@ -437,7 +368,6 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (strcmp(word, SPELL_XML) == 0)
return 1;
int nc = strlen(word);
- int wl2 = 0;
if (utf8) {
if (nc >= MAXWORDUTF8LEN)
return 0;
@@ -445,19 +375,26 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+ }
#ifdef MOZILLA_CLIENT
// accept the abbreviated words without dots
@@ -474,12 +411,12 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// "..", "--" etc.)
enum { NBEGIN, NNUM, NSEP };
int nstate = NBEGIN;
- int i;
+ size_t i;
for (i = 0; (i < wl); i++) {
- if ((cw[i] <= '9') && (cw[i] >= '0')) {
+ if ((scw[i] <= '9') && (scw[i] >= '0')) {
nstate = NNUM;
- } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
+ } else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) {
if ((nstate == NSEP) || (i == 0))
break;
nstate = NSEP;
@@ -496,75 +433,75 @@ int Hunspell::spell(const char* word, int* info, char** root) {
*info += SPELL_ORIGCAP;
/* FALLTHROUGH */
case NOCAP:
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if ((abbv) && !(rv)) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
}
break;
case ALLCAP: {
*info += SPELL_ORIGCAP;
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if (rv)
break;
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
if (rv)
break;
}
// Spec. prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
- if (pAMgr && strchr(cw, '\'')) {
- mkallsmall2(cw, unicw, nc);
- // There are no really sane circumstances where this could fail,
- // but anyway...
- if (char* apostrophe = strchr(cw, '\'')) {
+ size_t apos = pAMgr ? scw.find('\'') : std::string::npos;
+ if (apos != std::string::npos) {
+ mkallsmall2(scw, sunicw);
+ //conversion may result in string with different len to pre-mkallsmall2
+ //so re-scan
+ if (apos != std::string::npos && apos < scw.size() - 1) {
+ std::string part1 = scw.substr(0, apos+1);
+ std::string part2 = scw.substr(apos+1);
if (utf8) {
- w_char tmpword[MAXWORDLEN];
- *apostrophe = '\0';
- wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
- *apostrophe = '\'';
- if (wl2 >= 0 && wl2 < nc) {
- mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
- rv = checkword(cw, info, root);
- if (rv)
- break;
- }
+ std::vector<w_char> part1u, part2u;
+ u8_u16(part1u, part1);
+ u8_u16(part2u, part2);
+ mkinitcap2(part2, part2u);
+ scw = part1 + part2;
+ sunicw = part1u;
+ sunicw.insert(sunicw.end(), part2u.begin(), part2u.end());
+ rv = checkword(scw.c_str(), info, root);
+ if (rv)
+ break;
} else {
- mkinitcap2(apostrophe + 1, unicw, nc);
- rv = checkword(cw, info, root);
+ mkinitcap2(part2, sunicw);
+ scw = part1 + part2;
+ rv = checkword(scw.c_str(), info, root);
if (rv)
break;
}
+ mkinitcap2(scw, sunicw);
+ rv = checkword(scw.c_str(), info, root);
+ if (rv)
+ break;
}
- mkinitcap2(cw, unicw, nc);
- rv = checkword(cw, info, root);
- if (rv)
- break;
}
- if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
- char tmpword[MAXWORDUTF8LEN];
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) {
+
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
- wl2 = mkinitcap2(cw, unicw, nc);
- rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
+ mkinitcap2(scw, sunicw);
+ rv = spellsharps(scw, 0, 0, 0, info, root);
}
if ((abbv) && !(rv)) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ u8buffer.push_back('.');
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
+ u8buffer = std::string(scw);
+ u8buffer.push_back('.');
+ rv = spellsharps(u8buffer, 0, 0, 0, info, root);
}
}
if (rv)
@@ -572,13 +509,14 @@ int Hunspell::spell(const char* word, int* info, char** root) {
}
}
case INITCAP: {
+
*info += SPELL_ORIGCAP;
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
if (captype == INITCAP)
*info += SPELL_INITCAP;
- rv = checkword(cw, info, root);
+ rv = checkword(scw.c_str(), info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
// forbid bad capitalization
@@ -593,18 +531,16 @@ int Hunspell::spell(const char* word, int* info, char** root) {
if (rv)
break;
- rv = checkword(wspace, info, root);
+ rv = checkword(u8buffer.c_str(), info, root);
if (abbv && !rv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- rv = checkword(wspace, info, root);
+ u8buffer.push_back('.');
+ rv = checkword(u8buffer.c_str(), info, root);
if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
if (captype == INITCAP)
*info += SPELL_INITCAP;
- rv = checkword(wspace, info, root);
+ rv = checkword(u8buffer.c_str(), info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
if (rv && is_keepcase(rv) && (captype == ALLCAP))
@@ -617,8 +553,8 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// if CHECKSHARPS: KEEPCASE words with \xDF are allowed
// in INITCAP form, too.
!(pAMgr->get_checksharps() &&
- ((utf8 && strstr(wspace, "\xC3\x9F")) ||
- (!utf8 && strchr(wspace, '\xDF'))))))
+ ((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) ||
+ (!utf8 && u8buffer.find('\xDF') != std::string::npos)))))
rv = NULL;
break;
}
@@ -637,67 +573,66 @@ int Hunspell::spell(const char* word, int* info, char** root) {
// recursive breaking at break points
if (wordbreak) {
- char* s;
- char r;
+
int nbr = 0;
- wl = strlen(cw);
+ wl = scw.size();
int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
// calculate break points for recursion limit
for (int j = 0; j < numbreak; j++) {
- s = cw;
- do {
- s = (char*)strstr(s, wordbreak[j]);
- if (s) {
- nbr++;
- s++;
- }
- } while (s);
+ size_t len = strlen(wordbreak[j]);
+ size_t pos = 0;
+ while ((pos = scw.find(wordbreak[j], pos, len)) != std::string::npos) {
+ ++nbr;
+ pos += len;
+ }
}
if (nbr >= 10)
return 0;
// check boundary patterns (^begin and end$)
for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
+ size_t plen = strlen(wordbreak[j]);
if (plen == 1 || plen > wl)
continue;
+
if (wordbreak[j][0] == '^' &&
- strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 && spell(cw + plen - 1))
+ scw.compare(0, plen - 1, wordbreak[j] + 1, plen -1) == 0 && spell(scw.c_str() + plen - 1))
return 1;
+
if (wordbreak[j][plen - 1] == '$' &&
- strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
- r = cw[wl - plen + 1];
- cw[wl - plen + 1] = '\0';
- if (spell(cw))
+ scw.compare(wl - plen + 1, plen - 1, wordbreak[j], plen - 1) == 0) {
+ char r = scw[wl - plen + 1];
+ scw[wl - plen + 1] = '\0';
+ if (spell(scw.c_str()))
return 1;
- cw[wl - plen + 1] = r;
+ scw[wl - plen + 1] = r;
}
}
// other patterns
for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
- s = (char*)strstr(cw, wordbreak[j]);
- if (s && (s > cw) && (s < cw + wl - plen)) {
- if (!spell(s + plen))
+ size_t plen = strlen(wordbreak[j]);
+ size_t found = scw.find(wordbreak[j]);
+ if ((found > 0) && (found < wl - plen)) {
+ if (!spell(scw.c_str() + found + plen))
continue;
- r = *s;
- *s = '\0';
+ char r = scw[found];
+ scw[found] = '\0';
// examine 2 sides of the break point
- if (spell(cw))
+ if (spell(scw.c_str()))
return 1;
- *s = r;
+ scw[found] = r;
// LANG_hu: spec. dash rule
if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
- r = s[1];
- s[1] = '\0';
- if (spell(cw))
+ r = scw[found + 1];
+ scw[found + 1] = '\0';
+ if (spell(scw.c_str()))
return 1; // check the first part with dash
- s[1] = r;
+ scw[found + 1] = r;
}
- // end of LANG speficic region
+ // end of LANG specific region
}
}
}
@@ -716,10 +651,9 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
if (ignoredchars != NULL) {
w2.assign(w);
if (utf8) {
- int ignoredchars_utf16_len;
- unsigned short* ignoredchars_utf16 =
- pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
- remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
+ const std::vector<w_char>& ignoredchars_utf16 =
+ pAMgr->get_ignore_utf16();
+ remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
remove_ignored_chars(w2, ignoredchars);
}
@@ -802,37 +736,40 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
return NULL;
}
if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
+ std::string word_root(he->word);
+ if (complexprefixes) {
if (utf8)
- reverseword_utf(*root);
+ reverseword_utf(word_root);
else
- reverseword(*root);
+ reverseword(word_root);
}
+ *root = mystrdup(word_root.c_str());
}
// try check compound word
} else if (pAMgr->get_compound()) {
- he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
+ struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
+ he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
// LANG_hu section: `moving rule' with last dash
if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
char* dup = mystrdup(word);
if (!dup)
return NULL;
dup[len - 1] = '\0';
- he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, 1, 0,
+ he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0,
info);
free(dup);
}
- // end of LANG speficic region
+ // end of LANG specific region
if (he) {
if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
+ std::string word_root(he->word);
+ if (complexprefixes) {
if (utf8)
- reverseword_utf(*root);
+ reverseword_utf(word_root);
else
- reverseword(*root);
+ reverseword(word_root);
}
+ *root = mystrdup(word_root.c_str());
}
if (info)
*info += SPELL_COMPOUND;
@@ -845,11 +782,8 @@ struct hentry* Hunspell::checkword(const char* w, int* info, char** root) {
int Hunspell::suggest(char*** slst, const char* word) {
int onlycmpdsug = 0;
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
if (!pSMgr || maxdic == 0)
return 0;
- w_char unicw[MAXWORDLEN];
*slst = NULL;
// process XML input of the simplified API (see manual)
if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
@@ -863,130 +797,132 @@ int Hunspell::suggest(char*** slst, const char* word) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+
+ if (wl == 0)
+ return 0;
+ }
- if (wl == 0)
- return 0;
int ns = 0;
int capwords = 0;
// check capitalized form for FORCEUCASE
if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
int info = SPELL_ORIGCAP;
- char** wlst;
- if (checkword(cw, &info, NULL)) {
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*));
- if (wlst == NULL)
- return -1;
- *slst = wlst;
- for (int i = 0; i < MAXSUGGESTION; i++) {
- wlst[i] = NULL;
- }
+ if (checkword(scw.c_str(), &info, NULL)) {
+ std::string form(scw);
+ mkinitcap(form);
+
+ char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*));
+ if (wlst == NULL)
+ return -1;
+ *slst = wlst;
+ wlst[0] = mystrdup(form.c_str());
+ for (int i = 1; i < MAXSUGGESTION; ++i) {
+ wlst[i] = NULL;
}
- wlst[0] = mystrdup(cw);
- mkinitcap(wlst[0]);
+
return 1;
}
}
switch (captype) {
case NOCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
break;
}
case INITCAP: {
capwords = 1;
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
if (ns == -1)
break;
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
+ ns = pSMgr->suggest(slst, scw.c_str(), ns, &onlycmpdsug);
if (ns != -1) {
- int prevns;
// something.The -> something. The
- char* dot = strchr(cw, '.');
- if (dot && (dot > cw)) {
+ size_t dot_pos = scw.find('.');
+ if (dot_pos != std::string::npos) {
+ std::string postdot = scw.substr(dot_pos + 1);
int captype_;
if (utf8) {
- w_char w_[MAXWORDLEN];
- int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
- captype_ = get_captype_utf8(w_, wl_, langnum);
- } else
- captype_ = get_captype(dot + 1, strlen(dot + 1), csconv);
+ std::vector<w_char> postdotu;
+ u8_u16(postdotu, postdot);
+ captype_ = get_captype_utf8(postdotu, langnum);
+ } else {
+ captype_ = get_captype(postdot, csconv);
+ }
if (captype_ == INITCAP) {
- char* st = mystrdup(cw);
- if (st) {
- char* newst = (char*)realloc(st, wl + 2);
- if (newst == NULL)
- free(st);
- st = newst;
- }
- if (st) {
- st[(dot - cw) + 1] = ' ';
- strcpy(st + (dot - cw) + 2, dot + 1);
- ns = insert_sug(slst, st, ns);
- free(st);
- }
+ std::string str(scw);
+ str.insert(dot_pos + 1, 1, ' ');
+ ns = insert_sug(slst, str.c_str(), ns);
}
}
+
+ std::string wspace;
+
if (captype == HUHINITCAP) {
// TheOpenOffice.org -> The OpenOffice.org
- memcpy(wspace, cw, (wl + 1));
- mkinitsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ wspace = scw;
+ mkinitsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
}
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- if (spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- prevns = ns;
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ wspace = scw;
+ mkallsmall2(wspace, sunicw);
+ if (spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ int prevns = ns;
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
if (captype == HUHINITCAP) {
- mkinitcap2(wspace, unicw, nc);
- if (spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ mkinitcap2(wspace, sunicw);
+ if (spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
}
// aNew -> "a New" (instead of "a new")
for (int j = prevns; j < ns; j++) {
char* space = strchr((*slst)[j], ' ');
if (space) {
- int slen = strlen(space + 1);
+ size_t slen = strlen(space + 1);
// different case after space (need capitalisation)
- if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
- w_char w[MAXWORDLEN];
- int wc = 0;
- char* r = (*slst)[j];
+ if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) {
+ std::string first((*slst)[j], space + 1);
+ std::string second(space + 1);
+ std::vector<w_char> w;
if (utf8)
- wc = u8_u16(w, MAXWORDLEN, space + 1);
- mkinitcap2(space + 1, w, wc);
+ u8_u16(w, second);
+ mkinitcap2(second, w);
// set as first suggestion
+ char* r = (*slst)[j];
for (int k = j; k > 0; k--)
(*slst)[k] = (*slst)[k - 1];
- (*slst)[0] = r;
+ free(r);
+ (*slst)[0] = mystrdup((first + second).c_str());
}
}
}
@@ -995,35 +931,30 @@ int Hunspell::suggest(char*** slst, const char* word) {
}
case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
if (ns == -1)
break;
- if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- mkinitcap2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
+ if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
+ ns = insert_sug(slst, wspace.c_str(), ns);
+ mkinitcap2(wspace, sunicw);
+ ns = pSMgr->suggest(slst, wspace.c_str(), ns, &onlycmpdsug);
for (int j = 0; j < ns; j++) {
- mkallcap((*slst)[j]);
+ std::string form((*slst)[j]);
+ mkallcap(form);
+
if (pAMgr && pAMgr->get_checksharps()) {
- char* pos;
if (utf8) {
- pos = strstr((*slst)[j], "\xC3\x9F");
- while (pos) {
- *pos = 'S';
- *(pos + 1) = 'S';
- pos = strstr(pos + 2, "\xC3\x9F");
- }
+ mystrrep(form, "\xC3\x9F", "SS");
} else {
- pos = strchr((*slst)[j], '\xDF');
- while (pos) {
- (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 2);
- mystrrep((*slst)[j], "\xDF", "SS");
- pos = strchr((*slst)[j], '\xDF');
- }
+ mystrrep(form, "\xDF", "SS");
}
}
+
+ free((*slst)[j]);
+ (*slst)[j] = mystrdup(form.c_str());
+
}
break;
}
@@ -1035,11 +966,10 @@ int Hunspell::suggest(char*** slst, const char* word) {
char* pos = strchr((*slst)[j], '-');
if (pos) {
int info;
- char w[MAXWORDUTF8LEN];
*pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- (void)spell(w, &info, NULL);
+ std::string w((*slst)[j]);
+ w.append(pos + 1);
+ (void)spell(w.c_str(), &info, NULL);
if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
*pos = ' ';
} else
@@ -1054,64 +984,67 @@ int Hunspell::suggest(char*** slst, const char* word) {
(*slst)) {
switch (captype) {
case NOCAP: {
- ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
+ ns = pSMgr->ngsuggest(*slst, scw.c_str(), ns, pHMgr, maxdic);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
break;
}
case INITCAP: {
capwords = 1;
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
break;
}
case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
+ std::string wspace(scw);
+ mkallsmall2(wspace, sunicw);
int oldns = ns;
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
- for (int j = oldns; j < ns; j++)
- mkallcap((*slst)[j]);
+ ns = pSMgr->ngsuggest(*slst, wspace.c_str(), ns, pHMgr, maxdic);
+ for (int j = oldns; j < ns; j++) {
+ std::string form((*slst)[j]);
+ mkallcap(form);
+ free((*slst)[j]);
+ (*slst)[j] = mystrdup(form.c_str());
+ }
break;
}
}
}
// try dash suggestion (Afo-American -> Afro-American)
- if (char* pos = strchr(cw, '-')) {
- char* ppos = cw;
+ size_t dash_pos = scw.find('-');
+ if (dash_pos != std::string::npos) {
int nodashsug = 1;
- char** nlst = NULL;
- int nn = 0;
- int last = 0;
- if (*slst) {
- for (int j = 0; j < ns && nodashsug == 1; j++) {
- if (strchr((*slst)[j], '-'))
- nodashsug = 0;
- }
+ for (int j = 0; j < ns && nodashsug == 1; j++) {
+ if (strchr((*slst)[j], '-'))
+ nodashsug = 0;
}
+
+ size_t prev_pos = 0;
+ bool last = false;
+
while (nodashsug && !last) {
- if (*pos == '\0')
+ if (dash_pos == scw.size())
last = 1;
- else
- *pos = '\0';
- if (!spell(ppos)) {
- nn = suggest(&nlst, ppos);
+ std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
+ if (!spell(chunk.c_str())) {
+ char** nlst = NULL;
+ int nn = suggest(&nlst, chunk.c_str());
for (int j = nn - 1; j >= 0; j--) {
- strncpy(wspace, cw, ppos - cw);
- strcpy(wspace + (ppos - cw), nlst[j]);
+ std::string wspace = scw.substr(0, prev_pos);
+ wspace.append(nlst[j]);
if (!last) {
- strcat(wspace, "-");
- strcat(wspace, pos + 1);
+ wspace.append("-");
+ wspace.append(scw.substr(dash_pos + 1));
}
- ns = insert_sug(slst, wspace, ns);
+ ns = insert_sug(slst, wspace.c_str(), ns);
free(nlst[j]);
}
if (nlst != NULL)
@@ -1119,29 +1052,34 @@ int Hunspell::suggest(char*** slst, const char* word) {
nodashsug = 0;
}
if (!last) {
- *pos = '-';
- ppos = pos + 1;
- pos = strchr(ppos, '-');
+ prev_pos = dash_pos + 1;
+ dash_pos = scw.find('-', prev_pos);
}
- if (!pos)
- pos = cw + strlen(cw);
+ if (dash_pos == std::string::npos)
+ dash_pos = scw.size();
}
}
// word reversing wrapper for complex prefixes
if (complexprefixes) {
for (int j = 0; j < ns; j++) {
+ std::string root((*slst)[j]);
+ free((*slst)[j]);
if (utf8)
- reverseword_utf((*slst)[j]);
+ reverseword_utf(root);
else
- reverseword((*slst)[j]);
+ reverseword(root);
+ (*slst)[j] = mystrdup(root.c_str());
}
}
// capitalize
if (capwords)
for (int j = 0; j < ns; j++) {
- mkinitcap((*slst)[j]);
+ std::string form((*slst)[j]);
+ free((*slst)[j]);
+ mkinitcap(form);
+ (*slst)[j] = mystrdup(form.c_str());
}
// expand suggestions with dot(s)
@@ -1160,25 +1098,23 @@ int Hunspell::suggest(char*** slst, const char* word) {
int l = 0;
for (int j = 0; j < ns; j++) {
if (!strchr((*slst)[j], ' ') && !spell((*slst)[j])) {
- char s[MAXSWUTF8L];
- w_char w[MAXSWL];
- int len;
+ std::string s;
+ std::vector<w_char> w;
if (utf8) {
- len = u8_u16(w, MAXSWL, (*slst)[j]);
+ u8_u16(w, (*slst)[j]);
} else {
- strcpy(s, (*slst)[j]);
- len = strlen(s);
+ s = (*slst)[j];
}
- mkallsmall2(s, w, len);
+ mkallsmall2(s, w);
free((*slst)[j]);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
+ if (spell(s.c_str())) {
+ (*slst)[l] = mystrdup(s.c_str());
if ((*slst)[l])
l++;
} else {
- mkinitcap2(s, w, len);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
+ mkinitcap2(s, w);
+ if (spell(s.c_str())) {
+ (*slst)[l] = mystrdup(s.c_str());
if ((*slst)[l])
l++;
}
@@ -1211,9 +1147,10 @@ int Hunspell::suggest(char*** slst, const char* word) {
// output conversion
rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace, MAXWORDUTF8LEN) > 0) {
+ std::string wspace;
+ if (rl->conv((*slst)[j], wspace) > 0) {
free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
+ (*slst)[j] = mystrdup(wspace.c_str());
}
}
@@ -1233,151 +1170,25 @@ char* Hunspell::get_dic_encoding() {
return encoding;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX UTF-8 support is OK?
-int Hunspell::suggest_auto(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return 0;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return 0;
- } else {
- if (nc >= MAXWORDLEN)
- return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return 0;
- int ns = 0;
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch (captype) {
- case NOCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns > 0)
- break;
- break;
- }
-
- case INITCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- for (int j = 0; j < ns; j++)
- mkinitcap((*slst)[j]);
- ns = pSMgr->suggest_auto(slst, cw, ns);
- break;
- }
-
- case HUHINITCAP:
- case HUHCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns == 0) {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- mkinitcap(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- for (int j = 0; j < ns; j++)
- mkallcap((*slst)[j]);
- break;
- }
- }
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- for (int j = 0; j < ns; j++) {
- if (utf8)
- reverseword_utf((*slst)[j]);
- else
- reverseword((*slst)[j]);
- }
- }
-
- // expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (int j = 0; j < ns; j++) {
- (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
- strcat((*slst)[j], word + strlen(word) - abbv);
- }
- }
-
- // LANG_hu section: replace '-' with ' ' in Hungarian
- if (langnum == LANG_hu) {
- for (int j = 0; j < ns; j++) {
- char* pos = strchr((*slst)[j], '-');
- if (pos) {
- int info;
- char w[MAXWORDUTF8LEN];
- *pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- spell(w, &info, NULL);
- if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
- *pos = ' ';
- } else
- *pos = '-';
- }
- }
- }
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace) > 0) {
- free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
- }
- }
-
- // END OF LANG_hu section
- return ns;
-}
-#endif
-
int Hunspell::stem(char*** slst, char** desc, int n) {
- char result[MAXLNLEN];
- char result2[MAXLNLEN];
+
+ std::string result2;
*slst = NULL;
if (n == 0)
return 0;
- *result2 = '\0';
for (int i = 0; i < n; i++) {
- *result = '\0';
+
+ std::string result;
+
// add compound word parts (except the last one)
char* s = (char*)desc[i];
char* part = strstr(s, MORPH_PART);
if (part) {
char* nextpart = strstr(part + 1, MORPH_PART);
while (nextpart) {
- copy_field(result + strlen(result), part, MORPH_PART);
+ std::string field;
+ copy_field(field, part, MORPH_PART);
+ result.append(field);
part = nextpart;
nextpart = strstr(part + 1, MORPH_PART);
}
@@ -1404,22 +1215,28 @@ int Hunspell::stem(char*** slst, char** desc, int n) {
int genl = line_tok(sg, &gen, MSEP_REC);
free(sg);
for (int j = 0; j < genl; j++) {
- sprintf(result2 + strlen(result2), "%c%s%s", MSEP_REC, result,
- gen[j]);
+ result2.push_back(MSEP_REC);
+ result2.append(result);
+ result2.append(gen[j]);
}
freelist(&gen, genl);
}
} else {
- sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
+ result2.push_back(MSEP_REC);
+ result2.append(result);
if (strstr(pl[k], MORPH_SURF_PFX)) {
- copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
+ std::string field;
+ copy_field(field, pl[k], MORPH_SURF_PFX);
+ result2.append(field);
}
- copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
+ std::string field;
+ copy_field(field, pl[k], MORPH_STEM);
+ result2.append(field);
}
}
freelist(&pl, pln);
}
- int sln = line_tok(result2, slst, MSEP_REC);
+ int sln = line_tok(result2.c_str(), slst, MSEP_REC);
return uniqlist(*slst, sln);
}
@@ -1431,148 +1248,43 @@ int Hunspell::stem(char*** slst, const char* word) {
return pln2;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-int Hunspell::suggest_pos_stems(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return 0;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return 0;
- } else {
- if (nc >= MAXWORDLEN)
- return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return 0;
-
- int ns = 0; // ns=0 = normalized input
-
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch (captype) {
- case HUHCAP:
- case NOCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if ((abbv) && (ns == 0)) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
- }
-
- case INITCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if (ns == 0 || ((*slst)[0][0] == '#')) {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
- }
-
- case ALLCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
- if (ns != 0)
- break;
-
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
-
- if (ns == 0) {
- mkinitcap(wspace);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
- break;
- }
- }
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace) > 0) {
- free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
- }
- }
-
- return ns;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
const char* Hunspell::get_wordchars() {
return pAMgr->get_wordchars();
}
-unsigned short* Hunspell::get_wordchars_utf16(int* len) {
- return pAMgr->get_wordchars_utf16(len);
+const std::vector<w_char>& Hunspell::get_wordchars_utf16() {
+ return pAMgr->get_wordchars_utf16();
}
-void Hunspell::mkinitcap(char* p) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].cupper;
+void Hunspell::mkinitcap(std::string& u8) {
+ if (utf8) {
+ std::vector<w_char> u16;
+ u8_u16(u16, u8);
+ ::mkinitcap_utf(u16, langnum);
+ u16_u8(u8, u16);
} else {
- int len;
- w_char u[MAXWORDLEN];
- len = u8_u16(u, MAXWORDLEN, p);
- unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, len);
+ ::mkinitcap(u8, csconv);
}
}
-int Hunspell::mkinitcap2(char* p, w_char* u, int nc) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].cupper;
- } else if (nc > 0) {
- unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+int Hunspell::mkinitcap2(std::string& u8, std::vector<w_char>& u16) {
+ if (utf8) {
+ ::mkinitcap_utf(u16, langnum);
+ u16_u8(u8, u16);
+ } else {
+ ::mkinitcap(u8, csconv);
}
- return nc;
+ return u8.size();
}
-int Hunspell::mkinitsmall2(char* p, w_char* u, int nc) {
- if (!utf8) {
- if (*p != '\0')
- *p = csconv[((unsigned char)*p)].clower;
- } else if (nc > 0) {
- unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum);
- u[0].h = (unsigned char)(i >> 8);
- u[0].l = (unsigned char)(i & 0x00FF);
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
+int Hunspell::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) {
+ if (utf8) {
+ ::mkinitsmall_utf(u16, langnum);
+ u16_u8(u8, u16);
+ } else {
+ ::mkinitsmall(u8, csconv);
}
- return nc;
+ return u8.size();
}
int Hunspell::add(const char* word) {
@@ -1601,20 +1313,16 @@ struct cs_info* Hunspell::get_csconv() {
return csconv;
}
-void Hunspell::cat_result(char* result, char* st) {
+void Hunspell::cat_result(std::string& result, char* st) {
if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
+ if (!result.empty())
+ result.append("\n");
+ result.append(st);
free(st);
}
}
int Hunspell::analyze(char*** slst, const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- w_char unicw[MAXWORDLEN];
- int wl2 = 0;
*slst = NULL;
if (!pSMgr || maxdic == 0)
return 0;
@@ -1626,48 +1334,52 @@ int Hunspell::analyze(char*** slst, const char* word) {
if (nc >= MAXWORDLEN)
return 0;
}
- int captype = 0;
- int abbv = 0;
- int wl = 0;
+ int captype = NOCAP;
+ size_t abbv = 0;
+ size_t wl = 0;
+
+ std::string scw;
+ std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+ {
+ std::string wspace;
+
+ int convstatus = rl ? rl->conv(word, wspace) : 0;
+ if (convstatus < 0)
+ return 0;
+ else if (convstatus > 0)
+ wl = cleanword2(scw, sunicw, wspace.c_str(), &nc, &captype, &abbv);
+ else
+ wl = cleanword2(scw, sunicw, word, &nc, &captype, &abbv);
+ }
if (wl == 0) {
if (abbv) {
+ scw.clear();
for (wl = 0; wl < abbv; wl++)
- cw[wl] = '.';
- cw[wl] = '\0';
+ scw.push_back('.');
abbv = 0;
} else
return 0;
}
- char result[MAXLNLEN];
- char* st = NULL;
-
- *result = '\0';
+ std::string result;
- int n = 0;
- int n2 = 0;
- int n3 = 0;
+ size_t n = 0;
+ size_t n2 = 0;
+ size_t n3 = 0;
// test numbers
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
- while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) ||
- (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
+ while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) ||
+ (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) {
n++;
- if ((cw[n] == '.') || (cw[n] == ',')) {
+ if ((scw[n] == '.') || (scw[n] == ',')) {
if (((n2 == 0) && (n > 3)) ||
- ((n2 > 0) && ((cw[n - 1] == '.') || (cw[n - 1] == ','))))
+ ((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ','))))
break;
n2++;
n3 = n;
@@ -1676,21 +1388,21 @@ int Hunspell::analyze(char*** slst, const char* word) {
if ((n == wl) && (n3 > 0) && (n - n3 > 3))
return 0;
- if ((n == wl) || ((n > 0) && ((cw[n] == '%') || (cw[n] == '\xB0')) &&
- checkword(cw + n, NULL, NULL))) {
- mystrcat(result, cw, MAXLNLEN);
- result[n - 1] = '\0';
+ if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) &&
+ checkword(scw.c_str() + n, NULL, NULL))) {
+ result.append(scw);
+ result.resize(n - 1);
if (n == wl)
- cat_result(result, pSMgr->suggest_morph(cw + n - 1));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1));
else {
- char sign = cw[n];
- cw[n] = '\0';
- cat_result(result, pSMgr->suggest_morph(cw + n - 1));
- mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
- cw[n] = sign;
- cat_result(result, pSMgr->suggest_morph(cw + n));
+ char sign = scw[n];
+ scw[n] = '\0';
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n - 1));
+ result.push_back('+'); // XXX SPEC. MORPHCODE
+ scw[n] = sign;
+ cat_result(result, pSMgr->suggest_morph(scw.c_str() + n));
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
// END OF LANG_hu section
@@ -1699,64 +1411,58 @@ int Hunspell::analyze(char*** slst, const char* word) {
case HUHCAP:
case HUHINITCAP:
case NOCAP: {
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
case INITCAP: {
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
- cat_result(result, pSMgr->suggest_morph(wspace));
- cat_result(result, pSMgr->suggest_morph(cw));
+ wl = mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
- cat_result(result, pSMgr->suggest_morph(wspace));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
case ALLCAP: {
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(cw));
+ std::string u8buffer(scw);
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace, cw, (wl + 1));
- wl2 = mkinitcap2(cw, unicw, nc);
+ mkallsmall2(scw, sunicw);
+ std::string u8buffer(scw);
+ mkinitcap2(scw, sunicw);
- cat_result(result, pSMgr->suggest_morph(wspace));
- cat_result(result, pSMgr->suggest_morph(cw));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
+ cat_result(result, pSMgr->suggest_morph(scw.c_str()));
if (abbv) {
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- cat_result(result, pSMgr->suggest_morph(wspace));
+ u8buffer.push_back('.');
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
- memcpy(wspace, cw, wl2);
- *(wspace + wl2) = '.';
- *(wspace + wl2 + 1) = '\0';
+ u8buffer = scw;
+ u8buffer.push_back('.');
- cat_result(result, pSMgr->suggest_morph(wspace));
+ cat_result(result, pSMgr->suggest_morph(u8buffer.c_str()));
}
break;
}
}
- if (*result) {
+ if (!result.empty()) {
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (utf8)
@@ -1764,95 +1470,94 @@ int Hunspell::analyze(char*** slst, const char* word) {
else
reverseword(result);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
// compound word with dash (HU) I18n
- char* dash = NULL;
- int nresult = 0;
// LANG_hu section: set dash information for suggestions
- if (langnum == LANG_hu)
- dash = (char*)strchr(cw, '-');
- if ((langnum == LANG_hu) && dash) {
- *dash = '\0';
+
+ size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos;
+ int nresult = 0;
+ if (dash_pos != std::string::npos) {
+ std::string part1 = scw.substr(0, dash_pos);
+ std::string part2 = scw.substr(dash_pos+1);
+
// examine 2 sides of the dash
- if (dash[1] == '\0') { // base word ending with dash
- if (spell(cw)) {
- char* p = pSMgr->suggest_morph(cw);
+ if (part2.empty()) { // base word ending with dash
+ if (spell(part1.c_str())) {
+ char* p = pSMgr->suggest_morph(part1.c_str());
if (p) {
int ret = line_tok(p, slst, MSEP_REC);
free(p);
return ret;
}
}
- } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
- if (spell(cw) && (spell("-e"))) {
- st = pSMgr->suggest_morph(cw);
+ } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat.
+ if (spell(part1.c_str()) && (spell("-e"))) {
+ char* st = pSMgr->suggest_morph(part1.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- mystrcat(result, "+", MAXLNLEN); // XXX spec. separator in MORPHCODE
+ result.push_back('+'); // XXX spec. separator in MORPHCODE
st = pSMgr->suggest_morph("-e");
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
} else {
// first word ending with dash: word- XXX ???
- char r2 = *(dash + 1);
- dash[0] = '-';
- dash[1] = '\0';
- nresult = spell(cw);
- dash[1] = r2;
- dash[0] = '\0';
- if (nresult && spell(dash + 1) &&
- ((strlen(dash + 1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) {
- st = pSMgr->suggest_morph(cw);
+ part1.push_back(' ');
+ nresult = spell(part1.c_str());
+ part1.erase(part1.size() - 1);
+ if (nresult && spell(part2.c_str()) &&
+ ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
+ char* st = pSMgr->suggest_morph(part1.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
- mystrcat(result, "+", MAXLNLEN); // XXX spec. separator in MORPHCODE
+ result.push_back('+'); // XXX spec. separator in MORPHCODE
}
- st = pSMgr->suggest_morph(dash + 1);
+ st = pSMgr->suggest_morph(part2.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
// affixed number in correct word
- if (nresult && (dash > cw) &&
- (((*(dash - 1) <= '9') && (*(dash - 1) >= '0')) ||
- (*(dash - 1) == '.'))) {
- *dash = '-';
+ if (nresult && (dash_pos > 0) &&
+ (((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) ||
+ (scw[dash_pos - 1] == '.'))) {
n = 1;
- if (*(dash - n) == '.')
+ if (scw[dash_pos - n] == '.')
n++;
// search first not a number character to left from dash
- while (((dash - n) >= cw) && ((*(dash - n) == '0') || (n < 3)) &&
+ while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) &&
(n < 6)) {
n++;
}
- if ((dash - n) < cw)
+ if (dash_pos < n)
n--;
// numbers: valami1000000-hoz
// examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
// 56-hoz, 6-hoz
for (; n >= 1; n--) {
- if ((*(dash - n) >= '0') && (*(dash - n) <= '9') &&
- checkword(dash - n, NULL, NULL)) {
- mystrcat(result, cw, MAXLNLEN);
- result[dash - cw - n] = '\0';
- st = pSMgr->suggest_morph(dash - n);
+ if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') {
+ continue;
+ }
+ std::string chunk = scw.substr(dash_pos - n);
+ if (checkword(chunk.c_str(), NULL, NULL)) {
+ result.append(chunk);
+ char* st = pSMgr->suggest_morph(chunk.c_str());
if (st) {
- mystrcat(result, st, MAXLNLEN);
+ result.append(st);
free(st);
}
- return line_tok(result, slst, MSEP_REC);
+ return line_tok(result.c_str(), slst, MSEP_REC);
}
}
}
@@ -1866,30 +1571,33 @@ int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) {
return 0;
char** pl2;
int pl2n = analyze(&pl2, word);
- int captype = 0;
+ int captype = NOCAP;
int abbv = 0;
- char cw[MAXWORDUTF8LEN];
+ std::string cw;
cleanword(cw, word, &captype, &abbv);
- char result[MAXLNLEN];
- *result = '\0';
+ std::string result;
for (int i = 0; i < pln; i++) {
cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
}
freelist(&pl2, pl2n);
- if (*result) {
+ if (!result.empty()) {
// allcap
if (captype == ALLCAP)
mkallcap(result);
// line split
- int linenum = line_tok(result, slst, MSEP_REC);
+ int linenum = line_tok(result.c_str(), slst, MSEP_REC);
// capitalize
if (captype == INITCAP || captype == HUHINITCAP) {
- for (int j = 0; j < linenum; j++)
- mkinitcap((*slst)[j]);
+ for (int j = 0; j < linenum; j++) {
+ std::string form((*slst)[j]);
+ free((*slst)[j]);
+ mkinitcap(form);
+ (*slst)[j] = mystrdup(form.c_str());
+ }
}
// temporary filtering of prefix related errors (eg.
@@ -1923,22 +1631,21 @@ int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
}
// minimal XML parser functions
-int Hunspell::get_xml_par(char* dest, const char* par, int max) {
- char* d = dest;
+std::string Hunspell::get_xml_par(const char* par) {
+ std::string dest;
if (!par)
- return 0;
+ return dest;
char end = *par;
- char* dmax = dest + max;
if (end == '>')
end = '<';
else if (end != '\'' && end != '"')
return 0; // bad XML
- for (par++; d < dmax && *par != '\0' && *par != end; par++, d++)
- *d = *par;
- *d = '\0';
+ for (par++; *par != '\0' && *par != end; ++par) {
+ dest.push_back(*par);
+ }
mystrrep(dest, "&lt;", "<");
mystrrep(dest, "&amp;", "&");
- return (int)(d - dest);
+ return dest;
}
int Hunspell::get_langnum() const {
@@ -1967,18 +1674,17 @@ const char* Hunspell::get_xml_pos(const char* s, const char* attr) {
int Hunspell::check_xml_par(const char* q,
const char* attr,
const char* value) {
- char cw[MAXWORDUTF8LEN];
- if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) &&
- strcmp(cw, value) == 0)
+ std::string cw = get_xml_par(get_xml_pos(q, attr));
+ if (cw == value)
return 1;
return 0;
}
-int Hunspell::get_xml_list(char*** slst, char* list, const char* tag) {
- int n = 0;
- char* p;
+int Hunspell::get_xml_list(char*** slst, const char* list, const char* tag) {
if (!list)
return 0;
+ int n = 0;
+ const char* p;
for (p = list; ((p = strstr(p, tag)) != NULL); p++)
n++;
if (n == 0)
@@ -1987,25 +1693,20 @@ int Hunspell::get_xml_list(char*** slst, char* list, const char* tag) {
if (!*slst)
return 0;
for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) {
- int l = strlen(p);
- (*slst)[n] = (char*)malloc(l + 1);
- if (!(*slst)[n])
- return n;
- if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) {
- free((*slst)[n]);
+ std::string cw = get_xml_par(p + strlen(tag) - 1);
+ if (cw.empty()) {
break;
}
+ (*slst)[n] = mystrdup(cw.c_str());
}
return n;
}
int Hunspell::spellml(char*** slst, const char* word) {
- char *q, *q2;
- char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
- q = (char*)strstr(word, "<query");
+ const char* q = strstr(word, "<query");
if (!q)
return 0; // bad XML input
- q2 = strchr(q, '>');
+ const char* q2 = strchr(q, '>');
if (!q2)
return 0; // bad XML input
q2 = strstr(q2, "<word");
@@ -2013,8 +1714,9 @@ int Hunspell::spellml(char*** slst, const char* word) {
return 0; // bad XML input
if (check_xml_par(q, "type=", "analyze")) {
int n = 0;
- if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10))
- n = analyze(slst, cw);
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (!cw.empty())
+ n = analyze(slst, cw.c_str());
if (n == 0)
return 0;
// convert the result to <code><a>ana1</a><a>ana2</a></code> format
@@ -2036,22 +1738,25 @@ int Hunspell::spellml(char*** slst, const char* word) {
(*slst)[0] = mystrdup(r.c_str());
return 1;
} else if (check_xml_par(q, "type=", "stem")) {
- if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1))
- return stem(slst, cw);
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (!cw.empty())
+ return stem(slst, cw.c_str());
} else if (check_xml_par(q, "type=", "generate")) {
- int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
- if (n == 0)
+ std::string cw = get_xml_par(strchr(q2, '>'));
+ if (cw.empty())
return 0;
- char* q3 = strstr(q2 + 1, "<word");
+ const char* q3 = strstr(q2 + 1, "<word");
if (q3) {
- if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
- return generate(slst, cw, cw2);
+ std::string cw2 = get_xml_par(strchr(q3, '>'));
+ if (!cw2.empty()) {
+ return generate(slst, cw.c_str(), cw2.c_str());
}
} else {
if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
char** slst2;
- if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) {
- int n2 = generate(slst, cw, slst2, n);
+ int n = get_xml_list(&slst2, strchr(q2, '>'), "<a>");
+ if (n != 0) {
+ int n2 = generate(slst, cw.c_str(), slst2, n);
freelist(&slst2, n);
return uniqlist(*slst, n2);
}
@@ -2062,182 +1767,6 @@ int Hunspell::spellml(char*** slst, const char* word) {
return 0;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX is UTF-8 support OK?
-char* Hunspell::morph_with_correction(const char* word) {
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0)
- return NULL;
- w_char unicw[MAXWORDLEN];
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN)
- return NULL;
- } else {
- if (nc >= MAXWORDLEN)
- return NULL;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- int convstatus = rl ? rl->conv(word, wspace) : 0;
- if (convstatus < 0)
- return 0;
- else if (convstatus > 0)
- wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else
- wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0)
- return NULL;
-
- char result[MAXLNLEN];
- char* st = NULL;
-
- *result = '\0';
-
- switch (captype) {
- case NOCAP: {
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- case INITCAP: {
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, wl);
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- case HUHCAP: {
- st = pSMgr->suggest_morph_for_spelling_error(cw);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- memcpy(wspace, cw, (wl + 1));
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- break;
- }
- case ALLCAP: {
- memcpy(wspace, cw, (wl + 1));
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- if (abbv) {
- memcpy(wspace, cw, (wl + 1));
- *(wspace + wl) = '.';
- *(wspace + wl + 1) = '\0';
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkallsmall2(wspace, unicw, nc);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- mkinitcap(wspace);
- st = pSMgr->suggest_morph_for_spelling_error(wspace);
- if (st) {
- if (*result)
- mystrcat(result, "\n", MAXLNLEN);
- mystrcat(result, st, MAXLNLEN);
- free(st);
- }
- }
- break;
- }
- }
-
- if (*result)
- return mystrdup(result);
- return NULL;
-}
-
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
Hunhandle* Hunspell_create(const char* affpath, const char* dpath) {
return (Hunhandle*)(new Hunspell(affpath, dpath));
}
@@ -2333,10 +1862,9 @@ int Hunspell::suffix_suggest(char*** slst, const char* root_word) {
if (ignoredchars != NULL) {
w2.assign(root_word);
if (utf8) {
- int ignoredchars_utf16_len;
- unsigned short* ignoredchars_utf16 =
- pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
- remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
+ const std::vector<w_char>& ignoredchars_utf16 =
+ pAMgr->get_ignore_utf16();
+ remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
remove_ignored_chars(w2, ignoredchars);
}
diff --git a/libs/hunspell/src/hunspell.hxx b/libs/hunspell/src/hunspell.hxx
index 259d44f86c..3bcf75e39c 100644
--- a/libs/hunspell/src/hunspell.hxx
+++ b/libs/hunspell/src/hunspell.hxx
@@ -77,6 +77,7 @@
#include "affixmgr.hxx"
#include "suggestmgr.hxx"
#include "langnum.hxx"
+#include <vector>
#define SPELL_XML "<?xml?>"
@@ -215,7 +216,7 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell {
/* get extra word characters definied in affix file for tokenization */
const char* get_wordchars();
- unsigned short* get_wordchars_utf16(int* len);
+ const std::vector<w_char>& get_wordchars_utf16();
struct cs_info* get_csconv();
const char* get_version();
@@ -229,45 +230,32 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell {
{
return pAMgr->get_try_string();
}
-/* experimental and deprecated functions */
-
-#ifdef HUNSPELL_EXPERIMENTAL
- /* suffix is an affix flag string, similarly in dictionary files */
- int put_word_suffix(const char* word, const char* suffix);
- char* morph_with_correction(const char* word);
-
- /* spec. suggestions */
- int suggest_auto(char*** slst, const char* word);
- int suggest_pos_stems(char*** slst, const char* word);
-#endif
private:
- int cleanword(char*, const char*, int* pcaptype, int* pabbrev);
- int cleanword2(char*,
- const char*,
- w_char*,
- int* w_len,
- int* pcaptype,
- int* pabbrev);
- void mkinitcap(char*);
- int mkinitcap2(char* p, w_char* u, int nc);
- int mkinitsmall2(char* p, w_char* u, int nc);
- void mkallcap(char*);
- int mkallcap2(char* p, w_char* u, int nc);
- void mkallsmall(char*);
- int mkallsmall2(char* p, w_char* u, int nc);
+ void cleanword(std::string& dest, const char*, int* pcaptype, int* pabbrev);
+ size_t cleanword2(std::string& dest,
+ std::vector<w_char>& dest_u,
+ const char*,
+ int* w_len,
+ int* pcaptype,
+ size_t* pabbrev);
+ void mkinitcap(std::string& u8);
+ int mkinitcap2(std::string& u8, std::vector<w_char>& u16);
+ int mkinitsmall2(std::string& u8, std::vector<w_char>& u16);
+ void mkallcap(std::string& u8);
+ int mkallsmall2(std::string& u8, std::vector<w_char>& u16);
struct hentry* checkword(const char*, int* info, char** root);
- char* sharps_u8_l1(char* dest, char* source);
+ std::string sharps_u8_l1(const std::string& source);
hentry*
- spellsharps(char* base, char*, int, int, char* tmp, int* info, char** root);
+ spellsharps(std::string& base, size_t start_pos, int, int, int* info, char** root);
int is_keepcase(const hentry* rv);
- int insert_sug(char*** slst, char* word, int ns);
- void cat_result(char* result, char* st);
+ int insert_sug(char*** slst, const char* word, int ns);
+ void cat_result(std::string& result, char* st);
char* stem_description(const char* desc);
int spellml(char*** slst, const char* word);
- int get_xml_par(char* dest, const char* par, int maxl);
+ std::string get_xml_par(const char* par);
const char* get_xml_pos(const char* s, const char* attr);
- int get_xml_list(char*** slst, char* list, const char* tag);
+ int get_xml_list(char*** slst, const char* list, const char* tag);
int check_xml_par(const char* q, const char* attr, const char* value);
};
diff --git a/libs/hunspell/src/phonet.c++ b/libs/hunspell/src/phonet.c++
index 2b4d2ae504..17350e74a7 100644
--- a/libs/hunspell/src/phonet.c++
+++ b/libs/hunspell/src/phonet.c++
@@ -66,33 +66,30 @@ static int myisalpha(char ch) {
return 1;
}
+/* Do phonetic transformation. */
/* phonetic transcription algorithm */
/* see: http://aspell.net/man-html/Phonetic-Code.html */
/* convert string to uppercase before this call */
-int phonet(const char* inword, char* target, int len, phonetable& parms) {
- /** Do phonetic transformation. **/
- /** "len" = length of "inword" incl. '\0'. **/
+std::string phonet(const std::string& inword, phonetable& parms) {
- /** result: >= 0: length of "target" **/
- /** otherwise: error **/
-
- int i, j, k = 0, n, p, z;
+ int i, k = 0, p, z;
int k0, n0, p0 = -333, z0;
- char c, c0;
+ char c;
const char* s;
typedef unsigned char uchar;
- char word[MAXPHONETUTF8LEN + 1];
- if (len == -1)
- len = strlen(inword);
+
+ size_t len = inword.size();
if (len > MAXPHONETUTF8LEN)
- return 0;
- strncpy(word, inword, MAXPHONETUTF8LEN);
+ return std::string();
+ char word[MAXPHONETUTF8LEN + 1];
+ strncpy(word, inword.c_str(), MAXPHONETUTF8LEN);
word[MAXPHONETUTF8LEN] = '\0';
+ std::string target;
/** check word **/
- i = j = z = 0;
+ i = z = 0;
while ((c = word[i]) != '\0') {
- n = parms.hash[(uchar)c];
+ int n = parms.hash[(uchar)c];
z0 = 0;
if (n >= 0) {
@@ -141,7 +138,7 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
(!myisalpha(word[i + k0])))) {
/** search for followup rules, if: **/
/** parms.followup and k > 1 and NO '-' in searchstring **/
- c0 = word[i + k - 1];
+ char c0 = word[i + k - 1];
n0 = parms.hash[(uchar)c0];
// if (parms.followup && k > 1 && n0 >= 0
@@ -216,9 +213,9 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
: 0;
if (p0 == 1 && z == 0) {
/** rule with '<' is used **/
- if (j > 0 && *s != '\0' &&
- (target[j - 1] == c || target[j - 1] == *s)) {
- j--;
+ if (!target.empty() && *s != '\0' &&
+ (target[target.size()-1] == c || target[target.size()-1] == *s)) {
+ target.erase(target.size() - 1);
}
z0 = 1;
z = 1;
@@ -236,10 +233,9 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
} else { /** no '<' rule used **/
i += k - 1;
z = 0;
- while (*s != '\0' && *(s + 1) != '\0' && j < len) {
- if (j == 0 || target[j - 1] != *s) {
- target[j] = *s;
- j++;
+ while (*s != '\0' && *(s + 1) != '\0' && target.size() < len) {
+ if (target.empty() || target[target.size()-1] != *s) {
+ target.push_back(*s);
}
s++;
}
@@ -248,8 +244,7 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
if (parms.rules[n][0] != '\0' &&
strstr(parms.rules[n] + 1, "^^") != NULL) {
if (c != '\0') {
- target[j] = c;
- j++;
+ target.push_back(c);
}
strmove(&word[0], &word[0] + i + 1);
i = 0;
@@ -262,15 +257,11 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
} /** end of while (parms.rules[n][0] == c) **/
} /** end of if (n >= 0) **/
if (z0 == 0) {
- // if (k && (assert(p0!=-333),!p0) && j < len && c != '\0'
- // && (!parms.collapse_result || j == 0 || target[j-1] !=
- // c)){
- if (k && !p0 && j < len && c != '\0' &&
- (1 || j == 0 || target[j - 1] != c)) {
+ if (k && !p0 && target.size() < len && c != '\0' &&
+ (1 || target.empty() || target[target.size()-1] != c)) {
/** condense only double letters **/
- target[j] = c;
+ target.push_back(c);
/// printf("\n setting \n");
- j++;
}
i++;
@@ -279,7 +270,5 @@ int phonet(const char* inword, char* target, int len, phonetable& parms) {
}
} /** end of while ((c = word[i]) != '\0') **/
- target[j] = '\0';
- return (j);
-
+ return target;
} /** end of function "phonet" **/
diff --git a/libs/hunspell/src/phonet.hxx b/libs/hunspell/src/phonet.hxx
index cb0dbed3fb..eb9fd0c628 100644
--- a/libs/hunspell/src/phonet.hxx
+++ b/libs/hunspell/src/phonet.hxx
@@ -46,9 +46,7 @@ struct phonetable {
LIBHUNSPELL_DLL_EXPORTED void init_phonet_hash(phonetable& parms);
-LIBHUNSPELL_DLL_EXPORTED int phonet(const char* inword,
- char* target,
- int len,
- phonetable& phone);
+LIBHUNSPELL_DLL_EXPORTED std::string phonet(const std::string& inword,
+ phonetable& phone);
#endif
diff --git a/libs/hunspell/src/replist.c++ b/libs/hunspell/src/replist.c++
index ace6c4aaf8..b3e6b37d20 100644
--- a/libs/hunspell/src/replist.c++
+++ b/libs/hunspell/src/replist.c++
@@ -151,7 +151,7 @@ int RepList::add(char* pat1, char* pat2) {
}
int RepList::conv(const char* word, char* dest, size_t destsize) {
- int stl = 0;
+ size_t stl = 0;
int change = 0;
for (size_t i = 0; i < strlen(word); i++) {
int n = near(word + i);
@@ -173,3 +173,21 @@ int RepList::conv(const char* word, char* dest, size_t destsize) {
dest[stl] = '\0';
return change;
}
+
+bool RepList::conv(const char* word, std::string& dest) {
+ dest.clear();
+
+ bool change = false;
+ for (size_t i = 0; i < strlen(word); i++) {
+ int n = near(word + i);
+ int l = match(word + i, n);
+ if (l) {
+ dest.append(dat[n]->pattern2);
+ i += l - 1;
+ change = true;
+ } else {
+ dest.push_back(word[i]);
+ }
+ }
+ return change;
+}
diff --git a/libs/hunspell/src/replist.hxx b/libs/hunspell/src/replist.hxx
index 319eb03fb0..59366e9e02 100644
--- a/libs/hunspell/src/replist.hxx
+++ b/libs/hunspell/src/replist.hxx
@@ -79,6 +79,9 @@
#include "w_char.hxx"
+#include <string>
+#include <vector>
+
class LIBHUNSPELL_DLL_EXPORTED RepList {
private:
RepList(const RepList&);
@@ -100,5 +103,6 @@ class LIBHUNSPELL_DLL_EXPORTED RepList {
int near(const char* word);
int match(const char* word, int n);
int conv(const char* word, char* dest, size_t destsize);
+ bool conv(const char* word, std::string& dest);
};
#endif
diff --git a/libs/hunspell/src/suggestmgr.c++ b/libs/hunspell/src/suggestmgr.c++
index 4269a1181a..17becd7582 100644
--- a/libs/hunspell/src/suggestmgr.c++
+++ b/libs/hunspell/src/suggestmgr.c++
@@ -125,11 +125,11 @@ SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) {
if (ckey) {
if (utf8) {
- w_char t[MAXSWL];
- ckeyl = u8_u16(t, MAXSWL, ckey);
+ std::vector<w_char> t;
+ ckeyl = u8_u16(t, ckey);
ckey_utf = (w_char*)malloc(ckeyl * sizeof(w_char));
if (ckey_utf)
- memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
+ memcpy(ckey_utf, &t[0], ckeyl * sizeof(w_char));
else
ckeyl = 0;
} else {
@@ -142,11 +142,11 @@ SuggestMgr::SuggestMgr(const char* tryme, int maxn, AffixMgr* aptr) {
if (ctry)
ctryl = strlen(ctry);
if (ctry && utf8) {
- w_char t[MAXSWL];
- ctryl = u8_u16(t, MAXSWL, tryme);
+ std::vector<w_char> t;
+ ctryl = u8_u16(t, tryme);
ctry_utf = (w_char*)malloc(ctryl * sizeof(w_char));
if (ctry_utf)
- memcpy(ctry_utf, t, ctryl * sizeof(w_char));
+ memcpy(ctry_utf, &t[0], ctryl * sizeof(w_char));
else
ctryl = 0;
}
@@ -213,7 +213,7 @@ int SuggestMgr::suggest(char*** slst,
int* onlycompoundsug) {
int nocompoundtwowords = 0;
char** wlst;
- w_char word_utf[MAXSWL];
+ std::vector<w_char> word_utf;
int wl = 0;
int nsugorig = nsug;
std::string w2;
@@ -242,7 +242,7 @@ int SuggestMgr::suggest(char*** slst,
}
if (utf8) {
- wl = u8_u16(word_utf, MAXSWL, word);
+ wl = u8_u16(word_utf, word);
if (wl == -1) {
*slst = wlst;
return nsug;
@@ -257,7 +257,7 @@ int SuggestMgr::suggest(char*** slst,
// suggestions for an uppercase word (html -> HTML)
if ((nsug < maxSug) && (nsug > -1)) {
- nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? capchars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: capchars(wlst, word, nsug, cpdsuggest);
}
@@ -280,56 +280,56 @@ int SuggestMgr::suggest(char*** slst,
// did we swap the order of chars by mistake
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? swapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: swapchar(wlst, word, nsug, cpdsuggest);
}
// did we swap the order of non adjacent chars by mistake
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? longswapchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: longswapchar(wlst, word, nsug, cpdsuggest);
}
// did we just hit the wrong key in place of a good char (case and keyboard)
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? badcharkey_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: badcharkey(wlst, word, nsug, cpdsuggest);
}
// did we add a char that should not be there
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? extrachar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: extrachar(wlst, word, nsug, cpdsuggest);
}
// did we forgot a char
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? forgotchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: forgotchar(wlst, word, nsug, cpdsuggest);
}
// did we move a char
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? movechar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: movechar(wlst, word, nsug, cpdsuggest);
}
// did we just hit the wrong key in place of a good char
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? badchar_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: badchar(wlst, word, nsug, cpdsuggest);
}
// did we double two characters
if ((nsug < maxSug) && (nsug > -1) &&
(!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
- nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest)
+ nsug = (utf8) ? doubletwochars_utf(wlst, &word_utf[0], wl, nsug, cpdsuggest)
: doubletwochars(wlst, word, nsug, cpdsuggest);
}
@@ -357,88 +357,17 @@ int SuggestMgr::suggest(char*** slst,
return nsug;
}
-// generate suggestions for a word with typical mistake
-// pass in address of array of char * pointers
-#ifdef HUNSPELL_EXPERIMENTAL
-int SuggestMgr::suggest_auto(char*** slst, const char* w, int nsug) {
- int nocompoundtwowords = 0;
- char** wlst;
- int oldSug;
-
- char w2[MAXWORDUTF8LEN];
- const char* word = w;
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- strcpy(w2, w);
- if (utf8)
- reverseword_utf(w2);
- else
- reverseword(w2);
- word = w2;
- }
-
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char**)malloc(maxSug * sizeof(char*));
- if (wlst == NULL)
- return -1;
- }
-
- for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0);
- cpdsuggest++) {
- // limit compound suggestion
- if (cpdsuggest > 0)
- oldSug = nsug;
-
- // perhaps we made a typical fault of spelling
- if ((nsug < maxSug) && (nsug > -1))
- nsug = replchars(wlst, word, nsug, cpdsuggest);
-
- // perhaps we made chose the wrong char from a related set
- if ((nsug < maxSug) && (nsug > -1) &&
- (!cpdsuggest || (nsug < oldSug + maxcpdsugs)))
- nsug = mapchars(wlst, word, nsug, cpdsuggest);
-
- if ((cpdsuggest == 0) && (nsug > 0))
- nocompoundtwowords = 1;
-
- // perhaps we forgot to hit space and two words ran together
-
- if ((nsug < maxSug) && (nsug > -1) &&
- (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) &&
- check_forbidden(word, strlen(word))) {
- nsug = twowords(wlst, word, nsug, cpdsuggest);
- }
-
- } // repeating ``for'' statement compounding support
-
- if (nsug < 0) {
- for (int i = 0; i < maxSug; i++)
- if (wlst[i] != NULL)
- free(wlst[i]);
- free(wlst);
- return -1;
- }
-
- *slst = wlst;
- return nsug;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
// suggestions for an uppercase word (html -> HTML)
int SuggestMgr::capchars_utf(char** wlst,
const w_char* word,
int wl,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- w_char candidate_utf[MAXSWL];
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- mkallcap_utf(candidate_utf, wl, langnum);
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ std::vector<w_char> candidate_utf(word, word + wl);
+ mkallcap_utf(candidate_utf, langnum);
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
}
@@ -449,7 +378,7 @@ int SuggestMgr::capchars(char** wlst,
int cpdsuggest) {
std::string candidate(word);
mkallcap(candidate, csconv);
- return testsug(wlst, candidate.data(), candidate.size(), ns, cpdsuggest, NULL,
+ return testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
}
@@ -458,10 +387,9 @@ int SuggestMgr::mapchars(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
+ std::string candidate;
clock_t timelimit;
int timer;
- candidate[0] = '\0';
int wl = strlen(word);
if (wl < 2 || !pAMgr)
@@ -474,14 +402,13 @@ int SuggestMgr::mapchars(char** wlst,
timelimit = clock();
timer = MINTIMER;
- return map_related(word, (char*)&candidate, 0, 0, wlst, cpdsuggest, ns,
+ return map_related(word, candidate, 0, wlst, cpdsuggest, ns,
maptable, nummap, &timer, &timelimit);
}
int SuggestMgr::map_related(const char* word,
- char* candidate,
+ std::string& candidate,
int wn,
- int cn,
char** wlst,
int cpdsuggest,
int ns,
@@ -491,17 +418,15 @@ int SuggestMgr::map_related(const char* word,
clock_t* timelimit) {
if (*(word + wn) == '\0') {
int cwrd = 1;
- *(candidate + cn) = '\0';
- int wl = strlen(candidate);
for (int m = 0; m < ns; m++) {
- if (strcmp(candidate, wlst[m]) == 0) {
+ if (candidate == wlst[m]) {
cwrd = 0;
break;
}
}
- if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
+ if ((cwrd) && checkword(candidate.c_str(), candidate.size(), cpdsuggest, timer, timelimit)) {
if (ns < maxSug) {
- wlst[ns] = mystrdup(candidate);
+ wlst[ns] = mystrdup(candidate.c_str());
if (wlst[ns] == NULL)
return -1;
ns++;
@@ -515,9 +440,11 @@ int SuggestMgr::map_related(const char* word,
int len = strlen(maptable[j].set[k]);
if (strncmp(maptable[j].set[k], word + wn, len) == 0) {
in_map = 1;
+ size_t cn = candidate.size();
for (int l = 0; l < maptable[j].len; l++) {
- strcpy(candidate + cn, maptable[j].set[l]);
- ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,
+ candidate.resize(cn);
+ candidate.append(maptable[j].set[l]);
+ ns = map_related(word, candidate, wn + len, wlst,
cpdsuggest, ns, maptable, nummap, timer, timelimit);
if (!(*timer))
return ns;
@@ -526,8 +453,8 @@ int SuggestMgr::map_related(const char* word,
}
}
if (!in_map) {
- *(candidate + cn) = *(word + wn);
- ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest, ns,
+ candidate.push_back(*(word + wn));
+ ns = map_related(word, candidate, wn + 1, wlst, cpdsuggest, ns,
maptable, nummap, timer, timelimit);
}
return ns;
@@ -539,9 +466,7 @@ int SuggestMgr::replchars(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- const char* r;
- int lenr, lenp;
+ std::string candidate;
int wl = strlen(word);
if (wl < 2 || !pAMgr)
return ns;
@@ -550,45 +475,42 @@ int SuggestMgr::replchars(char** wlst,
if (reptable == NULL)
return ns;
for (int i = 0; i < numrep; i++) {
- r = word;
- lenr = strlen(reptable[i].pattern2);
- lenp = strlen(reptable[i].pattern);
+ const char* r = word;
// search every occurence of the pattern in the word
while ((r = strstr(r, reptable[i].pattern)) != NULL &&
(!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) &&
(!reptable[i].start || r == word)) {
- strcpy(candidate, word);
- if (r - word + lenr + strlen(r + lenp) >= MAXSWUTF8L)
- break;
- strcpy(candidate + (r - word), reptable[i].pattern2);
- strcpy(candidate + (r - word) + lenr, r + lenp);
- ns = testsug(wlst, candidate, wl - lenp + lenr, ns, cpdsuggest, NULL,
+ candidate.assign(word);
+ candidate.resize(r - word);
+ candidate.append(reptable[i].pattern2);
+ int lenp = strlen(reptable[i].pattern);
+ candidate.append(r + lenp);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
// check REP suggestions with space
- char* sp = strchr(candidate, ' ');
- if (sp) {
- char* prev = candidate;
- while (sp) {
- *sp = '\0';
- if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
+ size_t sp = candidate.find(' ');
+ if (sp != std::string::npos) {
+ size_t prev = 0;
+ while (sp != std::string::npos) {
+ std::string prev_chunk = candidate.substr(prev, sp - prev);
+ if (checkword(prev_chunk.c_str(), prev_chunk.size(), 0, NULL, NULL)) {
int oldns = ns;
- *sp = ' ';
- ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL,
+ std::string post_chunk = candidate.substr(sp + 1);
+ ns = testsug(wlst, post_chunk.c_str(), post_chunk.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
if (oldns < ns) {
free(wlst[ns - 1]);
- wlst[ns - 1] = mystrdup(candidate);
+ wlst[ns - 1] = mystrdup(candidate.c_str());
if (!wlst[ns - 1])
return -1;
}
}
- *sp = ' ';
prev = sp + 1;
- sp = strchr(prev, ' ');
+ sp = candidate.find(' ', prev);
}
}
r++; // search for the next letter
@@ -603,7 +525,6 @@ int SuggestMgr::doubletwochars(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
int state = 0;
int wl = strlen(word);
if (wl < 5 || !pAMgr)
@@ -612,9 +533,9 @@ int SuggestMgr::doubletwochars(char** wlst,
if (word[i] == word[i - 2]) {
state++;
if (state == 3) {
- strcpy(candidate, word);
- strcpy(candidate + i - 1, word + i + 1);
- ns = testsug(wlst, candidate, wl - 2, ns, cpdsuggest, NULL, NULL);
+ std::string candidate(word, word + i - 1);
+ candidate.insert(candidate.end(), word + i + 1, word + wl);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
state = 0;
@@ -633,20 +554,18 @@ int SuggestMgr::doubletwochars_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
int state = 0;
if (wl < 5 || !pAMgr)
return ns;
for (int i = 2; i < wl; i++) {
- if (w_char_eq(word[i], word[i - 2])) {
+ if (word[i] == word[i - 2]) {
state++;
if (state == 3) {
- memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
- memcpy(candidate_utf + i - 1, word + i + 1,
- (wl - i - 1) * sizeof(w_char));
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 2);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ std::vector<w_char> candidate_utf(word, word + i - 1);
+ candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl);
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
@@ -665,19 +584,16 @@ int SuggestMgr::badcharkey(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char tmpc;
- char candidate[MAXSWUTF8L];
- int wl = strlen(word);
- strcpy(candidate, word);
+ std::string candidate(word);
+
// swap out each char one by one and try uppercase and neighbor
// keyboard chars in its place to see if that makes a good word
-
- for (int i = 0; i < wl; i++) {
- tmpc = candidate[i];
+ for (size_t i = 0; i < candidate.size(); ++i) {
+ char tmpc = candidate[i];
// check with uppercase letters
candidate[i] = csconv[((unsigned char)tmpc)].cupper;
if (tmpc != candidate[i]) {
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
candidate[i] = tmpc;
@@ -689,13 +605,13 @@ int SuggestMgr::badcharkey(char** wlst,
while (loc) {
if ((loc > ckey) && (*(loc - 1) != '|')) {
candidate[i] = *(loc - 1);
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
candidate[i] = *(loc + 1);
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
@@ -713,19 +629,17 @@ int SuggestMgr::badcharkey_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char tmpc;
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
- memcpy(candidate_utf, word, wl * sizeof(w_char));
+ std::string candidate;
+ std::vector<w_char> candidate_utf(word, word + wl);
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
for (int i = 0; i < wl; i++) {
- tmpc = candidate_utf[i];
+ w_char tmpc = candidate_utf[i];
// check with uppercase letters
- mkallcap_utf(candidate_utf + i, 1, langnum);
- if (!w_char_eq(tmpc, candidate_utf[i])) {
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ candidate_utf[i] = upper_utf(candidate_utf[i], 1);
+ if (tmpc != candidate_utf[i]) {
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
@@ -735,28 +649,28 @@ int SuggestMgr::badcharkey_utf(char** wlst,
if (!ckey)
continue;
w_char* loc = ckey_utf;
- while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc))
+ while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc)
loc++;
while (loc < (ckey_utf + ckeyl)) {
- if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
+ if ((loc > ckey_utf) && *(loc - 1) != W_VLINE) {
candidate_utf[i] = *(loc - 1);
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
}
- if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
+ if (((loc + 1) < (ckey_utf + ckeyl)) && (*(loc + 1) != W_VLINE)) {
candidate_utf[i] = *(loc + 1);
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
}
do {
loc++;
- } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
+ } while ((loc < (ckey_utf + ckeyl)) && *loc != tmpc);
}
candidate_utf[i] = tmpc;
}
@@ -765,26 +679,23 @@ int SuggestMgr::badcharkey_utf(char** wlst,
// error is wrong char in place of correct one
int SuggestMgr::badchar(char** wlst, const char* word, int ns, int cpdsuggest) {
- char tmpc;
- char candidate[MAXSWUTF8L];
+ std::string candidate(word);
clock_t timelimit = clock();
int timer = MINTIMER;
- int wl = strlen(word);
- strcpy(candidate, word);
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
for (int j = 0; j < ctryl; j++) {
- for (int i = wl - 1; i >= 0; i--) {
- tmpc = candidate[i];
+ for (std::string::reverse_iterator aI = candidate.rbegin(), aEnd = candidate.rend(); aI != aEnd; ++aI) {
+ char tmpc = *aI;
if (ctry[j] == tmpc)
continue;
- candidate[i] = ctry[j];
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
+ *aI = ctry[j];
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit);
if (ns == -1)
return -1;
if (!timer)
return ns;
- candidate[i] = tmpc;
+ *aI = tmpc;
}
}
return ns;
@@ -796,22 +707,20 @@ int SuggestMgr::badchar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char tmpc;
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
+ std::vector<w_char> candidate_utf(word, word + wl);
+ std::string candidate;
clock_t timelimit = clock();
int timer = MINTIMER;
- memcpy(candidate_utf, word, wl * sizeof(w_char));
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
for (int j = 0; j < ctryl; j++) {
for (int i = wl - 1; i >= 0; i--) {
- tmpc = candidate_utf[i];
- if (w_char_eq(tmpc, ctry_utf[j]))
+ w_char tmpc = candidate_utf[i];
+ if (tmpc == ctry_utf[j])
continue;
candidate_utf[i] = ctry_utf[j];
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer,
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer,
&timelimit);
if (ns == -1)
return -1;
@@ -829,24 +738,20 @@ int SuggestMgr::extrachar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- w_char candidate_utf[MAXSWL];
- w_char* p;
- w_char tmpc = W_VLINE; // not used value, only for VCC warning message
- if (wl < 2)
+ std::vector<w_char> candidate_utf(word, word + wl);
+ if (candidate_utf.size() < 2)
return ns;
// try omitting one char of word at a time
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
- w_char tmpc2 = *p;
- if (p < candidate_utf + wl - 1)
- *p = tmpc;
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
- ns =
- testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
+ for (size_t i = 0; i < candidate_utf.size(); ++i) {
+ size_t index = candidate_utf.size() - 1 - i;
+ w_char tmpc = candidate_utf[index];
+ candidate_utf.erase(candidate_utf.begin() + index);
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- tmpc = tmpc2;
+ candidate_utf.insert(candidate_utf.begin() + index, tmpc);
}
return ns;
}
@@ -856,21 +761,18 @@ int SuggestMgr::extrachar(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char tmpc = '\0';
- char candidate[MAXSWUTF8L];
- char* p;
- int wl = strlen(word);
- if (wl < 2)
+ std::string candidate(word);
+ if (candidate.size() < 2)
return ns;
// try omitting one char of word at a time
- strcpy(candidate, word);
- for (p = candidate + wl - 1; p >= candidate; p--) {
- char tmpc2 = *p;
- *p = tmpc;
- ns = testsug(wlst, candidate, wl - 1, ns, cpdsuggest, NULL, NULL);
+ for (size_t i = 0; i < candidate.size(); ++i) {
+ size_t index = candidate.size() - 1 - i;
+ char tmpc = candidate[index];
+ candidate.erase(candidate.begin() + index);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- tmpc = tmpc2;
+ candidate.insert(candidate.begin() + index, tmpc);
}
return ns;
}
@@ -880,23 +782,22 @@ int SuggestMgr::forgotchar(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L + 4];
- char* p;
+ std::string candidate(word);
clock_t timelimit = clock();
int timer = MINTIMER;
- int wl = strlen(word);
+
// try inserting a tryme character before every letter (and the null
// terminator)
- for (int i = 0; i < ctryl; i++) {
- strcpy(candidate, word);
- for (p = candidate + wl; p >= candidate; p--) {
- *(p + 1) = *p;
- *p = ctry[i];
- ns = testsug(wlst, candidate, wl + 1, ns, cpdsuggest, &timer, &timelimit);
+ for (int k = 0; k < ctryl; ++k) {
+ for (size_t i = 0; i <= candidate.size(); ++i) {
+ size_t index = candidate.size() - i;
+ candidate.insert(candidate.begin() + index, ctry[k]);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer, &timelimit);
if (ns == -1)
return -1;
if (!timer)
return ns;
+ candidate.erase(candidate.begin() + index);
}
}
return ns;
@@ -908,25 +809,25 @@ int SuggestMgr::forgotchar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char candidate_utf[MAXSWL + 1];
- char candidate[MAXSWUTF8L + 4];
- w_char* p;
+ std::vector<w_char> candidate_utf(word, word + wl);
clock_t timelimit = clock();
int timer = MINTIMER;
+
// try inserting a tryme character at the end of the word and before every
// letter
- for (int i = 0; i < ctryl; i++) {
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- for (p = candidate_utf + wl; p >= candidate_utf; p--) {
- *(p + 1) = *p;
- *p = ctry_utf[i];
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer,
+ for (int k = 0; k < ctryl; ++k) {
+ for (size_t i = 0; i <= candidate_utf.size(); ++i) {
+ size_t index = candidate_utf.size() - i;
+ candidate_utf.insert(candidate_utf.begin() + index, ctry_utf[k]);
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, &timer,
&timelimit);
if (ns == -1)
return -1;
if (!timer)
return ns;
+ candidate_utf.erase(candidate_utf.begin() + index);
}
}
return ns;
@@ -937,8 +838,6 @@ int SuggestMgr::twowords(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- char* p;
int c1, c2;
int forbidden = 0;
int cwrd;
@@ -950,10 +849,12 @@ int SuggestMgr::twowords(char** wlst,
if (langnum == LANG_hu)
forbidden = check_forbidden(word, wl);
+ char* candidate = (char*)malloc(wl + 2);
strcpy(candidate + 1, word);
+
// split the string into two pieces after every char
// if both pieces are good words make them a suggestion
- for (p = candidate + 1; p[1] != '\0'; p++) {
+ for (char* p = candidate + 1; p[1] != '\0'; p++) {
p[-1] = *p;
// go to end of the UTF-8 character
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
@@ -988,12 +889,16 @@ int SuggestMgr::twowords(char** wlst,
if (ns < maxSug) {
if (cwrd) {
wlst[ns] = mystrdup(candidate);
- if (wlst[ns] == NULL)
+ if (wlst[ns] == NULL) {
+ free(candidate);
return -1;
+ }
ns++;
}
- } else
+ } else {
+ free(candidate);
return ns;
+ }
// add two word suggestion with dash, if TRY string contains
// "a" or "-"
// NOTE: cwrd doesn't modified for REP twoword sugg.
@@ -1009,16 +914,21 @@ int SuggestMgr::twowords(char** wlst,
if (ns < maxSug) {
if (cwrd) {
wlst[ns] = mystrdup(candidate);
- if (wlst[ns] == NULL)
+ if (wlst[ns] == NULL) {
+ free(candidate);
return -1;
+ }
ns++;
}
- } else
+ } else {
+ free(candidate);
return ns;
+ }
}
}
}
}
+ free(candidate);
return ns;
}
@@ -1027,42 +937,40 @@ int SuggestMgr::swapchar(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- char* p;
- char tmpc;
- int wl = strlen(word);
+ std::string candidate(word);
+ if (candidate.size() < 2)
+ return ns;
+
// try swapping adjacent chars one by one
- strcpy(candidate, word);
- for (p = candidate; p[1] != 0; p++) {
- tmpc = *p;
- *p = p[1];
- p[1] = tmpc;
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ for (size_t i = 0; i < candidate.size() - 1; ++i) {
+ std::swap(candidate[i], candidate[i+1]);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- p[1] = *p;
- *p = tmpc;
+ std::swap(candidate[i], candidate[i+1]);
}
+
// try double swaps for short words
// ahev -> have, owudl -> would
- if (wl == 4 || wl == 5) {
+ if (candidate.size() == 4 || candidate.size() == 5) {
candidate[0] = word[1];
candidate[1] = word[0];
candidate[2] = word[2];
- candidate[wl - 2] = word[wl - 1];
- candidate[wl - 1] = word[wl - 2];
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ candidate[candidate.size() - 2] = word[candidate.size() - 1];
+ candidate[candidate.size() - 1] = word[candidate.size() - 2];
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- if (wl == 5) {
+ if (candidate.size() == 5) {
candidate[0] = word[0];
candidate[1] = word[2];
candidate[2] = word[1];
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
}
+
return ns;
}
@@ -1072,44 +980,39 @@ int SuggestMgr::swapchar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
- w_char* p;
- w_char tmpc;
- int len = 0;
+ std::vector<w_char> candidate_utf(word, word + wl);
+ if (candidate_utf.size() < 2)
+ return ns;
+
+ std::string candidate;
// try swapping adjacent chars one by one
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
- tmpc = *p;
- *p = p[1];
- p[1] = tmpc;
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- if (len == 0)
- len = strlen(candidate);
- ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
+ for (size_t i = 0; i < candidate_utf.size() - 1; ++i) {
+ std::swap(candidate_utf[i], candidate_utf[i+1]);
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- p[1] = *p;
- *p = tmpc;
+ std::swap(candidate_utf[i], candidate_utf[i+1]);
}
+
// try double swaps for short words
// ahev -> have, owudl -> would, suodn -> sound
- if (wl == 4 || wl == 5) {
+ if (candidate_utf.size() == 4 || candidate_utf.size() == 5) {
candidate_utf[0] = word[1];
candidate_utf[1] = word[0];
candidate_utf[2] = word[2];
- candidate_utf[wl - 2] = word[wl - 1];
- candidate_utf[wl - 1] = word[wl - 2];
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
+ candidate_utf[candidate_utf.size() - 2] = word[candidate_utf.size() - 1];
+ candidate_utf[candidate_utf.size() - 1] = word[candidate_utf.size() - 2];
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- if (wl == 5) {
+ if (candidate_utf.size() == 5) {
candidate_utf[0] = word[0];
candidate_utf[1] = word[2];
candidate_utf[2] = word[1];
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
@@ -1122,24 +1025,16 @@ int SuggestMgr::longswapchar(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- char* p;
- char* q;
- char tmpc;
- int wl = strlen(word);
+ std::string candidate(word);
// try swapping not adjacent chars one by one
- strcpy(candidate, word);
- for (p = candidate; *p != 0; p++) {
- for (q = candidate; *q != 0; q++) {
- if (abs((int)(p - q)) > 1) {
- tmpc = *p;
- *p = *q;
- *q = tmpc;
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
+ for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) {
+ if (abs(std::distance(q, p)) > 1) {
+ std::swap(*p, *q);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
- *q = *p;
- *p = tmpc;
+ std::swap(*p, *q);
}
}
}
@@ -1152,26 +1047,19 @@ int SuggestMgr::longswapchar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
- w_char* p;
- w_char* q;
- w_char tmpc;
+ std::vector<w_char> candidate_utf(word, word + wl);
// try swapping not adjacent chars
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- for (p = candidate_utf; p < (candidate_utf + wl); p++) {
- for (q = candidate_utf; q < (candidate_utf + wl); q++) {
- if (abs((int)(p - q)) > 1) {
- tmpc = *p;
- *p = *q;
- *q = tmpc;
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
+ for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) {
+ if (abs(std::distance(q, p)) > 1) {
+ std::swap(*p, *q);
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
- *q = *p;
- *p = tmpc;
+ std::swap(*p, *q);
}
}
}
@@ -1183,40 +1071,35 @@ int SuggestMgr::movechar(char** wlst,
const char* word,
int ns,
int cpdsuggest) {
- char candidate[MAXSWUTF8L];
- char* p;
- char* q;
- char tmpc;
+ std::string candidate(word);
+ if (candidate.size() < 2)
+ return ns;
- int wl = strlen(word);
// try moving a char
- strcpy(candidate, word);
- for (p = candidate; *p != 0; p++) {
- for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {
- tmpc = *(q - 1);
- *(q - 1) = *q;
- *q = tmpc;
- if ((q - p) < 2)
+ for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
+ for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) < 10; ++q) {
+ std::swap(*q, *(q - 1));
+ if (std::distance(p, q) < 2)
continue; // omit swap char
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
- strcpy(candidate, word);
+ std::copy(word, word + candidate.size(), candidate.begin());
}
- for (p = candidate + wl - 1; p > candidate; p--) {
- for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {
- tmpc = *(q + 1);
- *(q + 1) = *q;
- *q = tmpc;
- if ((p - q) < 2)
+
+ for (std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) {
+ for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) < 10; ++q) {
+ std::swap(*q, *(q - 1));
+ if (std::distance(p, q) < 2)
continue; // omit swap char
- ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL, NULL);
if (ns == -1)
return -1;
}
- strcpy(candidate, word);
+ std::copy(word, word + candidate.size(), candidate.begin());
}
+
return ns;
}
@@ -1226,49 +1109,47 @@ int SuggestMgr::movechar_utf(char** wlst,
int wl,
int ns,
int cpdsuggest) {
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
- w_char* p;
- w_char* q;
- w_char tmpc;
+ std::vector<w_char> candidate_utf(word, word + wl);
+ if (candidate_utf.size() < 2)
+ return ns;
+
// try moving a char
- memcpy(candidate_utf, word, wl * sizeof(w_char));
- for (p = candidate_utf; p < (candidate_utf + wl); p++) {
- for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {
- tmpc = *(q - 1);
- *(q - 1) = *q;
- *q = tmpc;
- if ((q - p) < 2)
+ for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
+ for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) < 10; ++q) {
+ std::swap(*q, *(q - 1));
+ if (std::distance(p, q) < 2)
continue; // omit swap char
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
}
- memcpy(candidate_utf, word, wl * sizeof(w_char));
+ std::copy(word, word + candidate_utf.size(), candidate_utf.begin());
}
- for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {
- for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {
- tmpc = *(q + 1);
- *(q + 1) = *q;
- *q = tmpc;
- if ((p - q) < 2)
+
+ for (std::vector<w_char>::iterator p = candidate_utf.begin() + candidate_utf.size() - 1; p > candidate_utf.begin(); --p) {
+ for (std::vector<w_char>::iterator q = p - 1; q >= candidate_utf.begin() && std::distance(q, p) < 10; --q) {
+ std::swap(*q, *(q + 1));
+ if (std::distance(q, p) < 2)
continue; // omit swap char
- u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
- ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL,
+ std::string candidate;
+ u16_u8(candidate, candidate_utf);
+ ns = testsug(wlst, candidate.c_str(), candidate.size(), ns, cpdsuggest, NULL,
NULL);
if (ns == -1)
return -1;
}
- memcpy(candidate_utf, word, wl * sizeof(w_char));
+ std::copy(word, word + candidate_utf.size(), candidate_utf.begin());
}
+
return ns;
}
// generate a set of suggestions for very poorly spelled words
int SuggestMgr::ngsuggest(char** wlst,
- char* w,
+ const char* w,
int ns,
HashMgr** pHMgr,
int md) {
@@ -1295,7 +1176,6 @@ int SuggestMgr::ngsuggest(char** wlst,
int low = NGRAM_LOWERING;
std::string w2;
- char f[MAXSWUTF8L];
const char* word = w;
// word reversing wrapper for complex prefixes
@@ -1308,10 +1188,9 @@ int SuggestMgr::ngsuggest(char** wlst,
word = w2.c_str();
}
- char mw[MAXSWUTF8L];
- w_char u8[MAXSWL];
+ std::vector<w_char> u8;
int nc = strlen(word);
- int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
+ int n = (utf8) ? u8_u16(u8, word) : nc;
// set character based ngram suggestion for words with non-BMP Unicode
// characters
@@ -1325,21 +1204,20 @@ int SuggestMgr::ngsuggest(char** wlst,
struct hentry* hp = NULL;
int col = -1;
phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
- char target[MAXSWUTF8L];
+ std::string target;
std::string candidate;
if (ph) {
if (utf8) {
std::vector<w_char> _w;
- int _wl = u8_u16(_w, word);
- mkallcap_utf(_w, _wl, langnum);
+ u8_u16(_w, word);
+ mkallcap_utf(_w, langnum);
u16_u8(candidate, _w);
} else {
candidate.assign(word);
if (!nonbmp)
mkallcap(candidate, csconv);
}
- phonet(candidate.c_str(), target, nc,
- *ph); // XXX phonet() is 8-bit (nc, not n)
+ target = phonet(candidate, *ph); // XXX phonet() is 8-bit (nc, not n)
}
FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
@@ -1361,27 +1239,27 @@ int SuggestMgr::ngsuggest(char** wlst,
leftcommonsubstring(word, HENTRY_WORD(hp));
// check special pronounciation
+ std::string f;
if ((hp->var & H_OPT_PHON) &&
copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
- +leftcommonsubstring(word, f);
+ +leftcommonsubstring(word, f.c_str());
if (sc2 > sc)
sc = sc2;
}
int scphon = -20000;
if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
- char target2[MAXSWUTF8L];
if (utf8) {
std::vector<w_char> _w;
- int _wl = u8_u16(_w, HENTRY_WORD(hp));
- mkallcap_utf(_w, _wl, langnum);
+ u8_u16(_w, HENTRY_WORD(hp));
+ mkallcap_utf(_w, langnum);
u16_u8(candidate, _w);
} else {
candidate.assign(HENTRY_WORD(hp));
mkallcap(candidate, csconv);
}
- phonet(candidate.c_str(), target2, -1, *ph);
+ std::string target2 = phonet(candidate, *ph);
scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
}
@@ -1415,14 +1293,17 @@ int SuggestMgr::ngsuggest(char** wlst,
int thresh = 0;
for (int sp = 1; sp < 4; sp++) {
if (utf8) {
- for (int k = sp; k < n; k += 4)
- *((unsigned short*)u8 + k) = '*';
- u16_u8(mw, MAXSWUTF8L, u8, n);
+ for (int k = sp; k < n; k += 4) {
+ u8[k].l = '*';
+ u8[k].h = 0;
+ }
+ std::string mw;
+ u16_u8(mw, u8);
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
} else {
- strcpy(mw, word);
+ std::string mw(word);
for (int k = sp; k < n; k += 4)
- *(mw + k) = '*';
+ mw[k] = '*';
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
}
}
@@ -1454,11 +1335,14 @@ int SuggestMgr::ngsuggest(char** wlst,
for (i = 0; i < MAX_ROOTS; i++) {
if (roots[i]) {
struct hentry* rp = roots[i];
+
+ std::string f;
+ const char *field = NULL;
+ if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
+ field = f.c_str();
int nw = pAMgr->expand_rootword(
glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word,
- nc,
- ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON)
- : NULL));
+ nc, field);
for (int k = 0; k < nw; k++) {
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
@@ -1524,7 +1408,7 @@ int SuggestMgr::ngsuggest(char** wlst,
if (utf8) {
std::vector<w_char> _w;
len = u8_u16(_w, guess[i]);
- mkallsmall_utf(_w, len, langnum);
+ mkallsmall_utf(_w, langnum);
u16_u8(gl, _w);
} else {
gl.assign(guess[i]);
@@ -1578,7 +1462,7 @@ int SuggestMgr::ngsuggest(char** wlst,
if (utf8) {
std::vector<w_char> _w;
len = u8_u16(_w, rootsphon[i]);
- mkallsmall_utf(_w, len, langnum);
+ mkallsmall_utf(_w, langnum);
u16_u8(gl, _w);
} else {
gl.assign(rootsphon[i]);
@@ -1707,7 +1591,8 @@ int SuggestMgr::checkword(const char* word,
if (pAMgr) {
if (cpdsuggest == 1) {
if (pAMgr->get_compound()) {
- rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1,
+ struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
+ rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 1,
0); // EXT
if (rv &&
(!(rv2 = pAMgr->lookup(word)) || !rv2->astr ||
@@ -1790,51 +1675,6 @@ int SuggestMgr::check_forbidden(const char* word, int len) {
return 0;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-// suggest possible stems
-int SuggestMgr::suggest_pos_stems(char*** slst, const char* w, int nsug) {
- char** wlst;
-
- struct hentry* rv = NULL;
-
- char w2[MAXSWUTF8L];
- const char* word = w;
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- strcpy(w2, w);
- if (utf8)
- reverseword_utf(w2);
- else
- reverseword(w2);
- word = w2;
- }
-
- int wl = strlen(word);
-
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char**)calloc(maxSug, sizeof(char*));
- if (wlst == NULL)
- return -1;
- }
-
- rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
-
- // delete dash from end of word
- if (nsug > 0) {
- for (int j = 0; j < nsug; j++) {
- if (wlst[j][strlen(wlst[j]) - 1] == '-')
- wlst[j][strlen(wlst[j]) - 1] = '\0';
- }
- }
-
- *slst = wlst;
- return nsug;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
char* SuggestMgr::suggest_morph(const char* w) {
char result[MAXLNLEN];
char* r = (char*)result;
@@ -1887,33 +1727,15 @@ char* SuggestMgr::suggest_morph(const char* w) {
free(st);
}
- if (pAMgr->get_compound() && (*result == '\0'))
- pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0, NULL, 0, &r,
+ if (pAMgr->get_compound() && (*result == '\0')) {
+ struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
+ pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, &r,
NULL);
+ }
return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
}
-#ifdef HUNSPELL_EXPERIMENTAL
-char* SuggestMgr::suggest_morph_for_spelling_error(const char* word) {
- char* p = NULL;
- char** wlst = (char**)calloc(maxSug, sizeof(char*));
- if (!**wlst)
- return NULL;
- // we will use only the first suggestion
- for (int i = 0; i < maxSug - 1; i++)
- wlst[i] = "";
- int ns = suggest(&wlst, word, maxSug - 1, NULL);
- if (ns == maxSug) {
- p = suggest_morph(wlst[maxSug - 1]);
- free(wlst[maxSug - 1]);
- }
- if (wlst)
- free(wlst);
- return p;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
/* affixation */
char* SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
char result[MAXLNLEN];
@@ -1973,23 +1795,24 @@ char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) {
if (n == 0 || !pAMgr)
return NULL;
- char result[MAXLNLEN];
- char result2[MAXLNLEN];
+ std::string result2;
std::string newpattern;
- *result2 = '\0';
struct hentry* rv = NULL;
// search affixed forms with and without derivational suffixes
while (1) {
for (int k = 0; k < n; k++) {
- *result = '\0';
+ std::string result;
+
// add compound word parts (except the last one)
char* s = (char*)desc[k];
char* part = strstr(s, MORPH_PART);
if (part) {
char* nextpart = strstr(part + 1, MORPH_PART);
while (nextpart) {
- copy_field(result + strlen(result), part, MORPH_PART);
+ std::string field;
+ copy_field(field, part, MORPH_PART);
+ result.append(field);
part = nextpart;
nextpart = strstr(part + 1, MORPH_PART);
}
@@ -2030,16 +1853,14 @@ char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) {
free(sg);
sg = NULL;
for (int j = 0; j < genl; j++) {
+ result2.push_back(MSEP_REC);
+ result2.append(result);
if (strstr(pl[i], MORPH_SURF_PFX)) {
- int r2l = strlen(result2);
- result2[r2l] = MSEP_REC;
- strcpy(result2 + r2l + 1, result);
- copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
- mystrcat(result2, gen[j], MAXLNLEN);
- } else {
- sprintf(result2 + strlen(result2), "%c%s%s", MSEP_REC, result,
- gen[j]);
+ std::string field;
+ copy_field(field, pl[i], MORPH_SURF_PFX);
+ result2.append(field);
}
+ result2.append(gen[j]);
}
freelist(&gen, genl);
}
@@ -2050,14 +1871,14 @@ char* SuggestMgr::suggest_gen(char** desc, int n, const char* pattern) {
freelist(&pl, pln);
}
- if (*result2 || !strstr(pattern, MORPH_DERI_SFX))
+ if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX))
break;
newpattern.assign(pattern);
mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);
pattern = newpattern.c_str();
}
- return (*result2 ? mystrdup(result2) : NULL);
+ return (!result2.empty() ? mystrdup(result2.c_str()) : NULL);
}
// generate an n-gram score comparing s1 and s2
@@ -2080,7 +1901,7 @@ int SuggestMgr::ngram(int n,
return 0;
// lowering dictionary word
if (opt & NGRAM_LOWERING)
- mkallsmall_utf(su2, l2, langnum);
+ mkallsmall_utf(su2, langnum);
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1 - j); i++) {
@@ -2147,25 +1968,20 @@ int SuggestMgr::ngram(int n,
// length of the left common substring of s1 and (decapitalised) s2
int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {
if (utf8) {
- w_char su1[MAXSWL];
- w_char su2[MAXSWL];
- su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0;
+ std::vector<w_char> su1;
+ std::vector<w_char> su2;
+ int l1 = u8_u16(su1, s1);
+ int l2 = u8_u16(su2, s2);
// decapitalize dictionary word
if (complexprefixes) {
- int l1 = u8_u16(su1, MAXSWL, s1);
- int l2 = u8_u16(su2, MAXSWL, s2);
- if (*((short*)su1 + l1 - 1) == *((short*)su2 + l2 - 1))
+ if (su1[l1 - 1] == su2[l2 - 1])
return 1;
} else {
- int i;
- u8_u16(su1, 1, s1);
- u8_u16(su2, 1, s2);
- unsigned short idx = (su2->h << 8) + su2->l;
- unsigned short otheridx = (su1->h << 8) + su1->l;
+ unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
+ unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
return 0;
- int l1 = u8_u16(su1, MAXSWL, s1);
- int l2 = u8_u16(su2, MAXSWL, s2);
+ int i;
for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
(su1[i].h == su2[i].h);
i++)
@@ -2176,9 +1992,9 @@ int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {
if (complexprefixes) {
int l1 = strlen(s1);
int l2 = strlen(s2);
- if (*(s2 + l1 - 1) == *(s2 + l2 - 1))
+ if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
return 1;
- } else {
+ } else if (csconv) {
const char* olds = s1;
// decapitalise dictionary word
if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
@@ -2201,22 +2017,22 @@ int SuggestMgr::commoncharacterpositions(const char* s1,
int diffpos[2];
*is_swap = 0;
if (utf8) {
- w_char su1[MAXSWL];
- w_char su2[MAXSWL];
- int l1 = u8_u16(su1, MAXSWL, s1);
- int l2 = u8_u16(su2, MAXSWL, s2);
+ std::vector<w_char> su1;
+ std::vector<w_char> su2;
+ int l1 = u8_u16(su1, s1);
+ int l2 = u8_u16(su2, s2);
if (l1 <= 0 || l2 <= 0)
return 0;
// decapitalize dictionary word
if (complexprefixes) {
- mkallsmall_utf(su2 + l2 - 1, 1, langnum);
+ su2[l2 - 1] = lower_utf(su2[l2 - 1], langnum);
} else {
- mkallsmall_utf(su2, 1, langnum);
+ su2[0] = lower_utf(su2[0], langnum);
}
for (int i = 0; (i < l1) && (i < l2); i++) {
- if (((short*)su1)[i] == ((short*)su2)[i]) {
+ if (su1[i] == su2[i]) {
num++;
} else {
if (diff < 2)
@@ -2225,8 +2041,8 @@ int SuggestMgr::commoncharacterpositions(const char* s1,
}
}
if ((diff == 2) && (l1 == l2) &&
- (((short*)su1)[diffpos[0]] == ((short*)su2)[diffpos[1]]) &&
- (((short*)su1)[diffpos[1]] == ((short*)su2)[diffpos[0]]))
+ (su1[diffpos[0]] == su2[diffpos[1]]) &&
+ (su1[diffpos[1]] == su2[diffpos[0]]))
*is_swap = 1;
} else {
size_t i;
@@ -2257,8 +2073,8 @@ int SuggestMgr::commoncharacterpositions(const char* s1,
int SuggestMgr::mystrlen(const char* word) {
if (utf8) {
- w_char w[MAXSWL];
- return u8_u16(w, MAXSWL, word);
+ std::vector<w_char> w;
+ return u8_u16(w, word);
} else
return strlen(word);
}
@@ -2297,15 +2113,15 @@ void SuggestMgr::lcs(const char* s,
int* l2,
char** result) {
int n, m;
- w_char su[MAXSWL];
- w_char su2[MAXSWL];
+ std::vector<w_char> su;
+ std::vector<w_char> su2;
char* b;
char* c;
int i;
int j;
if (utf8) {
- m = u8_u16(su, MAXSWL, s);
- n = u8_u16(su2, MAXSWL, s2);
+ m = u8_u16(su, s);
+ n = u8_u16(su2, s2);
} else {
m = strlen(s);
n = strlen(s2);
@@ -2326,8 +2142,8 @@ void SuggestMgr::lcs(const char* s,
c[j] = 0;
for (i = 1; i <= m; i++) {
for (j = 1; j <= n; j++) {
- if (((utf8) && (*((short*)su + i - 1) == *((short*)su2 + j - 1))) ||
- ((!utf8) && ((*(s + i - 1)) == (*(s2 + j - 1))))) {
+ if (((utf8) && (su[i - 1] == su2[j - 1])) ||
+ ((!utf8) && (s[i - 1] == s2[j - 1]))) {
c[i * (n + 1) + j] = c[(i - 1) * (n + 1) + j - 1] + 1;
b[i * (n + 1) + j] = LCS_UPLEFT;
} else if (c[(i - 1) * (n + 1) + j] >= c[i * (n + 1) + j - 1]) {
diff --git a/libs/hunspell/src/suggestmgr.hxx b/libs/hunspell/src/suggestmgr.hxx
index c8762f81ef..675d98eb8f 100644
--- a/libs/hunspell/src/suggestmgr.hxx
+++ b/libs/hunspell/src/suggestmgr.hxx
@@ -74,8 +74,6 @@
#ifndef _SUGGESTMGR_HXX_
#define _SUGGESTMGR_HXX_
-#define MAXSWL 100
-#define MAXSWUTF8L (MAXSWL * 4)
#define MAX_ROOTS 100
#define MAX_WORDS 100
#define MAX_GUESS 200
@@ -132,7 +130,7 @@ class LIBHUNSPELL_DLL_EXPORTED SuggestMgr {
~SuggestMgr();
int suggest(char*** slst, const char* word, int nsug, int* onlycmpdsug);
- int ngsuggest(char** wlst, char* word, int ns, HashMgr** pHMgr, int md);
+ int ngsuggest(char** wlst, const char* word, int ns, HashMgr** pHMgr, int md);
int suggest_auto(char*** slst, const char* word, int nsug);
int suggest_stems(char*** slst, const char* word, int nsug);
int suggest_pos_stems(char*** slst, const char* word, int nsug);
@@ -177,8 +175,7 @@ class LIBHUNSPELL_DLL_EXPORTED SuggestMgr {
int mapchars(char**, const char*, int, int);
int map_related(const char*,
- char*,
- int,
+ std::string&,
int,
char** wlst,
int,
diff --git a/libs/hunspell/src/w_char.hxx b/libs/hunspell/src/w_char.hxx
index 9de7989f4f..336c454f79 100644
--- a/libs/hunspell/src/w_char.hxx
+++ b/libs/hunspell/src/w_char.hxx
@@ -42,13 +42,27 @@
#define __WCHARHXX__
#ifndef GCC
-typedef struct {
+struct w_char {
#else
-typedef struct __attribute__((packed)) {
+struct __attribute__((packed)) w_char {
#endif
unsigned char l;
unsigned char h;
-} w_char;
+
+ friend bool operator<(const w_char a, const w_char b) {
+ unsigned short a_idx = (a.h << 8) + a.l;
+ unsigned short b_idx = (b.h << 8) + b.l;
+ return a_idx < b_idx;
+ }
+
+ friend bool operator==(const w_char a, const w_char b) {
+ return (((a).l == (b).l) && ((a).h == (b).h));
+ }
+
+ friend bool operator!=(const w_char a, const w_char b) {
+ return !(a == b);;
+ }
+};
// two character arrays
struct replentry {