summaryrefslogtreecommitdiff
path: root/libs/hunspell/src/csutil.c++
diff options
context:
space:
mode:
Diffstat (limited to 'libs/hunspell/src/csutil.c++')
-rw-r--r--libs/hunspell/src/csutil.c++520
1 files changed, 88 insertions, 432 deletions
diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.c++
index d7411bb216..1948e4a3b3 100644
--- a/libs/hunspell/src/csutil.c++
+++ b/libs/hunspell/src/csutil.c++
@@ -144,53 +144,6 @@ FILE* myfopen(const char* path, const char* mode) {
return fopen(path, mode);
}
-/* only UTF-16 (BMP) implementation */
-char* u16_u8(char* dest, int size, const w_char* src, int srclen) {
- signed char* u8 = (signed char*)dest;
- signed char* u8_max = (signed char*)(u8 + size);
- const w_char* u2 = src;
- const w_char* u2_max = src + srclen;
- while ((u2 < u2_max) && (u8 < u8_max)) {
- if (u2->h) { // > 0xFF
- // XXX 4-byte haven't implemented yet.
- if (u2->h >= 0x08) { // >= 0x800 (3-byte UTF-8 character)
- *u8 = 0xe0 + (u2->h >> 4);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + ((u2->h & 0xf) << 2) + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- }
- } else { // < 0x800 (2-byte UTF-8 character)
- *u8 = 0xc0 + (u2->h << 2) + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- }
- } else { // <= 0xFF
- if (u2->l & 0x80) { // >0x80 (2-byte UTF-8 character)
- *u8 = 0xc0 + (u2->l >> 6);
- u8++;
- if (u8 < u8_max) {
- *u8 = 0x80 + (u2->l & 0x3f);
- u8++;
- }
- } else { // < 0x80 (1-byte UTF-8 character)
- *u8 = u2->l;
- u8++;
- }
- }
- u2++;
- }
- *u8 = '\0';
- return dest;
-}
-
std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) {
dest.clear();
std::vector<w_char>::const_iterator u2 = src.begin();
@@ -228,93 +181,6 @@ std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) {
return dest;
}
-/* only UTF-16 (BMP) implementation */
-int u8_u16(w_char* dest, int size, const char* src) {
- const signed char* u8 = (const signed char*)src;
- w_char* u2 = dest;
- w_char* u2_max = u2 + size;
-
- while ((u2 < u2_max) && *u8) {
- switch ((*u8) & 0xf0) {
- case 0x00:
- case 0x10:
- case 0x20:
- case 0x30:
- case 0x40:
- case 0x50:
- case 0x60:
- case 0x70: {
- u2->h = 0;
- u2->l = *u8;
- break;
- }
- case 0x80:
- case 0x90:
- case 0xa0:
- case 0xb0: {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Unexpected continuation bytes "
- "in %ld. character position\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- break;
- }
- case 0xc0:
- case 0xd0: { // 2-byte UTF-8 codes
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->h = (*u8 & 0x1f) >> 2;
- u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
- u8++;
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte in "
- "%ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- break;
- }
- case 0xe0: { // 3-byte UTF-8 codes
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2);
- u8++;
- if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
- u8++;
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte "
- "in %ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- } else {
- HUNSPELL_WARNING(stderr,
- "UTF-8 encoding error. Missing continuation byte in "
- "%ld. character position:\n%s\n",
- static_cast<long>(u8 - (signed char*)src), src);
- u2->h = 0xff;
- u2->l = 0xfd;
- }
- break;
- }
- case 0xf0: { // 4 or more byte UTF-8 codes
- HUNSPELL_WARNING(
- stderr, "This UTF-8 encoding can't convert to UTF-16:\n%s\n", src);
- u2->h = 0xff;
- u2->l = 0xfd;
- return -1;
- }
- }
- u8++;
- u2++;
- }
- return (int)(u2 - dest);
-}
-
int u8_u16(std::vector<w_char>& dest, const std::string& src) {
dest.clear();
std::string::const_iterator u8 = src.begin();
@@ -370,7 +236,7 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) {
u2.h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2);
++u8;
if ((*(u8 + 1) & 0xc0) == 0x80) {
- u2.l = (*u8 << 6) + (*(u8 + 1) & 0x3f);
+ u2.l = (static_cast<unsigned char>(*u8) << 6) + (*(u8 + 1) & 0x3f);
++u8;
} else {
HUNSPELL_WARNING(stderr,
@@ -409,48 +275,6 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) {
return dest.size();
}
-void flag_qsort(unsigned short flags[], int begin, int end) {
- unsigned short reg;
- if (end > begin) {
- unsigned short pivot = flags[begin];
- int l = begin + 1;
- int r = end;
- while (l < r) {
- if (flags[l] <= pivot) {
- l++;
- } else {
- r--;
- reg = flags[l];
- flags[l] = flags[r];
- flags[r] = reg;
- }
- }
- l--;
- reg = flags[begin];
- flags[begin] = flags[l];
- flags[l] = reg;
-
- flag_qsort(flags, begin, l);
- flag_qsort(flags, r, end);
- }
-}
-
-int flag_bsearch(unsigned short flags[], unsigned short flag, int length) {
- int mid;
- int left = 0;
- int right = length - 1;
- while (left <= right) {
- mid = (left + right) / 2;
- if (flags[mid] == flag)
- return 1;
- if (flag < flags[mid])
- right = mid - 1;
- else
- left = mid + 1;
- }
- return 0;
-}
-
// strip strings into token based on single char delimiter
// acts like strsep() but only uses a delim char and not
// a delim string
@@ -519,25 +343,6 @@ void mychomp(char* s) {
*(s + k - 2) = '\0';
}
-// does an ansi strdup of the reverse of a string
-char* myrevstrdup(const char* s) {
- char* d = NULL;
- if (s) {
- size_t sl = strlen(s);
- d = (char*)malloc(sl + 1);
- if (d) {
- const char* p = s + sl - 1;
- char* q = d;
- while (p >= s)
- *q++ = *p--;
- *q = '\0';
- } else {
- HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
- }
- }
- return d;
-}
-
// break text to lines
// return number of lines
int line_tok(const char* text, char*** lines, char breakchar) {
@@ -654,26 +459,6 @@ char* line_uniq_app(char** text, char breakchar) {
}
// append s to ends of every lines in text
-void strlinecat(char* dest, const char* s) {
- char* dup = mystrdup(dest);
- char* source = dup;
- int len = strlen(s);
- if (dup) {
- while (*source) {
- if (*source == '\n') {
- strncpy(dest, s, len);
- dest += len;
- }
- *dest = *source;
- source++;
- dest++;
- }
- strcpy(dest, s);
- free(dup);
- }
-}
-
-// append s to ends of every lines in text
std::string& strlinecat(std::string& str, const std::string& apd) {
size_t pos = 0;
while ((pos = str.find('\n', pos)) != std::string::npos) {
@@ -684,15 +469,6 @@ std::string& strlinecat(std::string& str, const std::string& apd) {
return str;
}
-// change \n to char c
-char* tr(char* text, char oldc, char newc) {
- char* p;
- for (p = text; *p; p++)
- if (*p == oldc)
- *p = newc;
- return text;
-}
-
// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
// in the first line of the inputs
// return 0, if inputs equal
@@ -807,23 +583,6 @@ int fieldlen(const char* r) {
return n;
}
-char* copy_field(char* dest, const char* morph, const char* var) {
- if (!morph)
- return NULL;
- const char* beg = strstr(morph, var);
- if (beg) {
- char* d = dest;
- for (beg += MORPH_TAG_LEN;
- *beg != ' ' && *beg != '\t' && *beg != '\n' && *beg != '\0';
- d++, beg++) {
- *d = *beg;
- }
- *d = '\0';
- return dest;
- }
- return NULL;
-}
-
bool copy_field(std::string& dest,
const std::string& morph,
const std::string& var) {
@@ -884,47 +643,18 @@ char* mystrrep(char* word, const char* pat, const char* rep) {
}
// reverse word
-int reverseword(char* word) {
- char r;
- for (char *dest = word + strlen(word) - 1; word < dest; word++, dest--) {
- r = *word;
- *word = *dest;
- *dest = r;
- }
- return 0;
-}
-
-// reverse word
-std::string& reverseword(std::string& word) {
+size_t reverseword(std::string& word) {
std::reverse(word.begin(), word.end());
- return word;
-}
-
-// reverse word (error: 1)
-int reverseword_utf(char* word) {
- w_char w[MAXWORDLEN];
- w_char* p;
- w_char r;
- int l = u8_u16(w, MAXWORDLEN, word);
- if (l == -1)
- return 1;
- p = w;
- for (w_char *dest = w + l - 1; p < dest; p++, dest--) {
- r = *p;
- *p = *dest;
- *dest = r;
- }
- u16_u8(word, MAXWORDUTF8LEN, w, l);
- return 0;
+ return word.size();
}
// reverse word
-std::string& reverseword_utf(std::string& word) {
+size_t reverseword_utf(std::string& word) {
std::vector<w_char> w;
u8_u16(w, word);
std::reverse(w.begin(), w.end());
u16_u8(word, w);
- return word;
+ return w.size();
}
int uniqlist(char** list, int n) {
@@ -978,12 +708,22 @@ unsigned char ccase(const struct cs_info* csconv, int nIndex) {
}
}
-// convert null terminated string to all caps
-void mkallcap(char* p, const struct cs_info* csconv) {
- while (*p != '\0') {
- *p = cupper(csconv, static_cast<unsigned char>(*p));
- p++;
+w_char upper_utf(w_char u, int langnum) {
+ unsigned short idx = (u.h << 8) + u.l;
+ if (idx != unicodetoupper(idx, langnum)) {
+ u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
+ u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ }
+ return u;
+}
+
+w_char lower_utf(w_char u, int langnum) {
+ unsigned short idx = (u.h << 8) + u.l;
+ if (idx != unicodetolower(idx, langnum)) {
+ u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
+ u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
}
+ return u;
}
// convert std::string to all caps
@@ -994,14 +734,6 @@ std::string& mkallcap(std::string& s, const struct cs_info* csconv) {
return s;
}
-// convert null terminated string to all little
-void mkallsmall(char* p, const struct cs_info* csconv) {
- while (*p != '\0') {
- *p = clower(csconv, static_cast<unsigned char>(*p));
- p++;
- }
-}
-
// convert std::string to all little
std::string& mkallsmall(std::string& s, const struct cs_info* csconv) {
for (std::string::iterator aI = s.begin(), aEnd = s.end(); aI != aEnd; ++aI) {
@@ -1010,20 +742,9 @@ std::string& mkallsmall(std::string& s, const struct cs_info* csconv) {
return s;
}
-void mkallsmall_utf(w_char* u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
- unsigned short idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetolower(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
- }
- }
-}
-
std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
- int nc,
int langnum) {
- for (int i = 0; i < nc; i++) {
+ for (size_t i = 0; i < u.size(); ++i) {
unsigned short idx = (u[i].h << 8) + u[i].l;
if (idx != unicodetolower(idx, langnum)) {
u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
@@ -1033,31 +754,51 @@ std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u,
return u;
}
-void mkallcap_utf(w_char* u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
+std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) {
+ for (size_t i = 0; i < u.size(); i++) {
unsigned short idx = (u[i].h << 8) + u[i].l;
if (idx != unicodetoupper(idx, langnum)) {
u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
}
}
+ return u;
}
-std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int nc, int langnum) {
- for (int i = 0; i < nc; i++) {
- unsigned short idx = (u[i].h << 8) + u[i].l;
+std::string& mkinitcap(std::string& s, const struct cs_info* csconv) {
+ if (!s.empty()) {
+ s[0] = cupper(csconv, static_cast<unsigned char>(s[0]));
+ }
+ return s;
+}
+
+std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) {
+ if (!u.empty()) {
+ unsigned short idx = (u[0].h << 8) + u[0].l;
if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
+ u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8);
+ u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF);
}
}
return u;
}
-// convert null terminated string to have initial capital
-void mkinitcap(char* p, const struct cs_info* csconv) {
- if (*p != '\0')
- *p = cupper(csconv, static_cast<unsigned char>(*p));
+std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) {
+ if (!s.empty()) {
+ s[0] = clower(csconv, static_cast<unsigned char>(s[0]));
+ }
+ return s;
+}
+
+std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) {
+ if (!u.empty()) {
+ unsigned short idx = (u[0].h << 8) + u[0].l;
+ if (idx != unicodetolower(idx, langnum)) {
+ u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8);
+ u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF);
+ }
+ }
+ return u;
}
// conversion function for protected memory
@@ -1073,35 +814,6 @@ char* get_stored_pointer(const char* s) {
}
#ifndef MOZILLA_CLIENT
-// convert null terminated string to all caps using encoding
-void enmkallcap(char* d, const char* p, const char* encoding)
-
-{
- struct cs_info* csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = cupper(csconv, static_cast<unsigned char>(*p));
- p++;
- }
- *d = '\0';
-}
-
-// convert null terminated string to all little using encoding
-void enmkallsmall(char* d, const char* p, const char* encoding) {
- struct cs_info* csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = clower(csconv, static_cast<unsigned char>(*p));
- p++;
- }
- *d = '\0';
-}
-
-// convert null terminated string to have initial capital using encoding
-void enmkinitcap(char* d, const char* p, const char* encoding) {
- struct cs_info* csconv = get_current_cs(encoding);
- memcpy(d, p, (strlen(p) + 1));
- if (*p != '\0')
- *d = cupper(csconv, static_cast<unsigned char>(*p));
-}
// these are simple character mappings for the
// encodings supported
@@ -2982,14 +2694,14 @@ int unicodeisalpha(unsigned short c) {
}
/* get type of capitalization */
-int get_captype(char* word, int nl, cs_info* csconv) {
+int get_captype(const std::string& word, cs_info* csconv) {
// now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int firstcap = 0;
+ size_t ncap = 0;
+ size_t nneutral = 0;
+ size_t firstcap = 0;
if (csconv == NULL)
return NOCAP;
- for (char* q = word; *q != '\0'; q++) {
+ for (std::string::const_iterator q = word.begin(); q != word.end(); ++q) {
unsigned char nIndex = static_cast<unsigned char>(*q);
if (ccase(csconv, nIndex))
ncap++;
@@ -3006,7 +2718,7 @@ int get_captype(char* word, int nl, cs_info* csconv) {
return NOCAP;
} else if ((ncap == 1) && firstcap) {
return INITCAP;
- } else if ((ncap == nl) || ((ncap + nneutral) == nl)) {
+ } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) {
return ALLCAP;
} else if ((ncap > 1) && firstcap) {
return HUHINITCAP;
@@ -3014,27 +2726,20 @@ int get_captype(char* word, int nl, cs_info* csconv) {
return HUHCAP;
}
-int get_captype_utf8(w_char* word, int nl, int langnum) {
+int get_captype_utf8(const std::vector<w_char>& word, int langnum) {
// now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int firstcap = 0;
- unsigned short idx;
- // don't check too long words
- if (nl >= MAXWORDLEN)
- return 0;
- // big Unicode character (non BMP area)
- if (nl == -1)
- return NOCAP;
- for (int i = 0; i < nl; i++) {
- idx = (word[i].h << 8) + word[i].l;
+ size_t ncap = 0;
+ size_t nneutral = 0;
+ size_t firstcap = 0;
+ for (size_t i = 0; i < word.size(); ++i) {
+ unsigned short idx = (word[i].h << 8) + word[i].l;
if (idx != unicodetolower(idx, langnum))
ncap++;
if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum))
nneutral++;
}
if (ncap) {
- idx = (word[0].h << 8) + word[0].l;
+ unsigned short idx = (word[0].h << 8) + word[0].l;
firstcap = (idx != unicodetolower(idx, langnum));
}
@@ -3043,7 +2748,7 @@ int get_captype_utf8(w_char* word, int nl, int langnum) {
return NOCAP;
} else if ((ncap == 1) && firstcap) {
return INITCAP;
- } else if ((ncap == nl) || ((ncap + nneutral) == nl)) {
+ } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) {
return ALLCAP;
} else if ((ncap > 1) && firstcap) {
return HUHINITCAP;
@@ -3052,63 +2757,22 @@ int get_captype_utf8(w_char* word, int nl, int langnum) {
}
// strip all ignored characters in the string
-void remove_ignored_chars_utf(char* word,
- unsigned short ignored_chars[],
- int ignored_len) {
- w_char w[MAXWORDLEN];
- w_char w2[MAXWORDLEN];
- int i;
- int j;
- int len = u8_u16(w, MAXWORDLEN, word);
- for (i = 0, j = 0; i < len; i++) {
- if (!flag_bsearch(ignored_chars, ((unsigned short*)w)[i], ignored_len)) {
- w2[j] = w[i];
- j++;
- }
- }
- if (j < i)
- u16_u8(word, MAXWORDUTF8LEN, w2, j);
-}
-
-namespace {
-union w_s {
- w_char w;
- unsigned short s;
-};
-
-unsigned short asushort(w_char in) {
- w_s c;
- c.w = in;
- return c.s;
-}
-}
-
-// strip all ignored characters in the string
-std::string& remove_ignored_chars_utf(std::string& word,
- unsigned short ignored_chars[],
- int ignored_len) {
+size_t remove_ignored_chars_utf(std::string& word,
+ const std::vector<w_char>& ignored_chars) {
std::vector<w_char> w;
std::vector<w_char> w2;
u8_u16(w, word);
for (size_t i = 0; i < w.size(); ++i) {
- if (!flag_bsearch(ignored_chars, asushort(w[i]), ignored_len))
+ if (!std::binary_search(ignored_chars.begin(),
+ ignored_chars.end(),
+ w[i])) {
w2.push_back(w[i]);
+ }
}
u16_u8(word, w2);
- return word;
-}
-
-// strip all ignored characters in the string
-void remove_ignored_chars(char* word, char* ignored_chars) {
- for (char* p = word; *p != '\0'; p++) {
- if (!strchr(ignored_chars, *p)) {
- *word = *p;
- word++;
- }
- }
- *word = '\0';
+ return w2.size();
}
namespace {
@@ -3119,16 +2783,17 @@ class is_any_of {
bool operator()(char c) { return chars.find(c) != std::string::npos; }
private:
- const std::string& chars;
+ std::string chars;
};
}
// strip all ignored characters in the string
-std::string& remove_ignored_chars(std::string& word,
- const std::string& ignored_chars) {
+size_t remove_ignored_chars(std::string& word,
+ const std::string& ignored_chars) {
word.erase(
- std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)));
- return word;
+ std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)),
+ word.end());
+ return word.size();
}
int parse_string(char* line, char** out, int ln) {
@@ -3170,25 +2835,16 @@ int parse_string(char* line, char** out, int ln) {
return 0;
}
-int parse_array(char* line,
- char** out,
- unsigned short** out_utf16,
- int* out_utf16_len,
- int utf8,
- int ln) {
+bool parse_array(char* line,
+ char** out,
+ std::vector<w_char>& out_utf16,
+ int utf8,
+ int ln) {
if (parse_string(line, out, ln))
- return 1;
+ return false;
if (utf8) {
- w_char w[MAXWORDLEN];
- int n = u8_u16(w, MAXWORDLEN, *out);
- if (n > 0) {
- flag_qsort((unsigned short*)w, 0, n);
- *out_utf16 = (unsigned short*)malloc(n * sizeof(unsigned short));
- if (!*out_utf16)
- return 1;
- memcpy(*out_utf16, w, n * sizeof(unsigned short));
- }
- *out_utf16_len = n;
+ u8_u16(out_utf16, *out);
+ std::sort(out_utf16.begin(), out_utf16.end());
}
- return 0;
+ return true;
}