diff options
Diffstat (limited to 'libs/hunspell/src/csutil.c++')
-rw-r--r-- | libs/hunspell/src/csutil.c++ | 520 |
1 files changed, 88 insertions, 432 deletions
diff --git a/libs/hunspell/src/csutil.c++ b/libs/hunspell/src/csutil.c++ index d7411bb216..1948e4a3b3 100644 --- a/libs/hunspell/src/csutil.c++ +++ b/libs/hunspell/src/csutil.c++ @@ -144,53 +144,6 @@ FILE* myfopen(const char* path, const char* mode) { return fopen(path, mode); } -/* only UTF-16 (BMP) implementation */ -char* u16_u8(char* dest, int size, const w_char* src, int srclen) { - signed char* u8 = (signed char*)dest; - signed char* u8_max = (signed char*)(u8 + size); - const w_char* u2 = src; - const w_char* u2_max = src + srclen; - while ((u2 < u2_max) && (u8 < u8_max)) { - if (u2->h) { // > 0xFF - // XXX 4-byte haven't implemented yet. - if (u2->h >= 0x08) { // >= 0x800 (3-byte UTF-8 character) - *u8 = 0xe0 + (u2->h >> 4); - u8++; - if (u8 < u8_max) { - *u8 = 0x80 + ((u2->h & 0xf) << 2) + (u2->l >> 6); - u8++; - if (u8 < u8_max) { - *u8 = 0x80 + (u2->l & 0x3f); - u8++; - } - } - } else { // < 0x800 (2-byte UTF-8 character) - *u8 = 0xc0 + (u2->h << 2) + (u2->l >> 6); - u8++; - if (u8 < u8_max) { - *u8 = 0x80 + (u2->l & 0x3f); - u8++; - } - } - } else { // <= 0xFF - if (u2->l & 0x80) { // >0x80 (2-byte UTF-8 character) - *u8 = 0xc0 + (u2->l >> 6); - u8++; - if (u8 < u8_max) { - *u8 = 0x80 + (u2->l & 0x3f); - u8++; - } - } else { // < 0x80 (1-byte UTF-8 character) - *u8 = u2->l; - u8++; - } - } - u2++; - } - *u8 = '\0'; - return dest; -} - std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) { dest.clear(); std::vector<w_char>::const_iterator u2 = src.begin(); @@ -228,93 +181,6 @@ std::string& u16_u8(std::string& dest, const std::vector<w_char>& src) { return dest; } -/* only UTF-16 (BMP) implementation */ -int u8_u16(w_char* dest, int size, const char* src) { - const signed char* u8 = (const signed char*)src; - w_char* u2 = dest; - w_char* u2_max = u2 + size; - - while ((u2 < u2_max) && *u8) { - switch ((*u8) & 0xf0) { - case 0x00: - case 0x10: - case 0x20: - case 0x30: - case 0x40: - case 0x50: - case 0x60: - case 0x70: { - u2->h = 0; - u2->l = *u8; - break; - } - case 0x80: - case 0x90: - case 0xa0: - case 0xb0: { - HUNSPELL_WARNING(stderr, - "UTF-8 encoding error. Unexpected continuation bytes " - "in %ld. character position\n%s\n", - static_cast<long>(u8 - (signed char*)src), src); - u2->h = 0xff; - u2->l = 0xfd; - break; - } - case 0xc0: - case 0xd0: { // 2-byte UTF-8 codes - if ((*(u8 + 1) & 0xc0) == 0x80) { - u2->h = (*u8 & 0x1f) >> 2; - u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f); - u8++; - } else { - HUNSPELL_WARNING(stderr, - "UTF-8 encoding error. Missing continuation byte in " - "%ld. character position:\n%s\n", - static_cast<long>(u8 - (signed char*)src), src); - u2->h = 0xff; - u2->l = 0xfd; - } - break; - } - case 0xe0: { // 3-byte UTF-8 codes - if ((*(u8 + 1) & 0xc0) == 0x80) { - u2->h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2); - u8++; - if ((*(u8 + 1) & 0xc0) == 0x80) { - u2->l = (*u8 << 6) + (*(u8 + 1) & 0x3f); - u8++; - } else { - HUNSPELL_WARNING(stderr, - "UTF-8 encoding error. Missing continuation byte " - "in %ld. character position:\n%s\n", - static_cast<long>(u8 - (signed char*)src), src); - u2->h = 0xff; - u2->l = 0xfd; - } - } else { - HUNSPELL_WARNING(stderr, - "UTF-8 encoding error. Missing continuation byte in " - "%ld. character position:\n%s\n", - static_cast<long>(u8 - (signed char*)src), src); - u2->h = 0xff; - u2->l = 0xfd; - } - break; - } - case 0xf0: { // 4 or more byte UTF-8 codes - HUNSPELL_WARNING( - stderr, "This UTF-8 encoding can't convert to UTF-16:\n%s\n", src); - u2->h = 0xff; - u2->l = 0xfd; - return -1; - } - } - u8++; - u2++; - } - return (int)(u2 - dest); -} - int u8_u16(std::vector<w_char>& dest, const std::string& src) { dest.clear(); std::string::const_iterator u8 = src.begin(); @@ -370,7 +236,7 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) { u2.h = ((*u8 & 0x0f) << 4) + ((*(u8 + 1) & 0x3f) >> 2); ++u8; if ((*(u8 + 1) & 0xc0) == 0x80) { - u2.l = (*u8 << 6) + (*(u8 + 1) & 0x3f); + u2.l = (static_cast<unsigned char>(*u8) << 6) + (*(u8 + 1) & 0x3f); ++u8; } else { HUNSPELL_WARNING(stderr, @@ -409,48 +275,6 @@ int u8_u16(std::vector<w_char>& dest, const std::string& src) { return dest.size(); } -void flag_qsort(unsigned short flags[], int begin, int end) { - unsigned short reg; - if (end > begin) { - unsigned short pivot = flags[begin]; - int l = begin + 1; - int r = end; - while (l < r) { - if (flags[l] <= pivot) { - l++; - } else { - r--; - reg = flags[l]; - flags[l] = flags[r]; - flags[r] = reg; - } - } - l--; - reg = flags[begin]; - flags[begin] = flags[l]; - flags[l] = reg; - - flag_qsort(flags, begin, l); - flag_qsort(flags, r, end); - } -} - -int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { - int mid; - int left = 0; - int right = length - 1; - while (left <= right) { - mid = (left + right) / 2; - if (flags[mid] == flag) - return 1; - if (flag < flags[mid]) - right = mid - 1; - else - left = mid + 1; - } - return 0; -} - // strip strings into token based on single char delimiter // acts like strsep() but only uses a delim char and not // a delim string @@ -519,25 +343,6 @@ void mychomp(char* s) { *(s + k - 2) = '\0'; } -// does an ansi strdup of the reverse of a string -char* myrevstrdup(const char* s) { - char* d = NULL; - if (s) { - size_t sl = strlen(s); - d = (char*)malloc(sl + 1); - if (d) { - const char* p = s + sl - 1; - char* q = d; - while (p >= s) - *q++ = *p--; - *q = '\0'; - } else { - HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); - } - } - return d; -} - // break text to lines // return number of lines int line_tok(const char* text, char*** lines, char breakchar) { @@ -654,26 +459,6 @@ char* line_uniq_app(char** text, char breakchar) { } // append s to ends of every lines in text -void strlinecat(char* dest, const char* s) { - char* dup = mystrdup(dest); - char* source = dup; - int len = strlen(s); - if (dup) { - while (*source) { - if (*source == '\n') { - strncpy(dest, s, len); - dest += len; - } - *dest = *source; - source++; - dest++; - } - strcpy(dest, s); - free(dup); - } -} - -// append s to ends of every lines in text std::string& strlinecat(std::string& str, const std::string& apd) { size_t pos = 0; while ((pos = str.find('\n', pos)) != std::string::npos) { @@ -684,15 +469,6 @@ std::string& strlinecat(std::string& str, const std::string& apd) { return str; } -// change \n to char c -char* tr(char* text, char oldc, char newc) { - char* p; - for (p = text; *p; p++) - if (*p == oldc) - *p = newc; - return text; -} - // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields // in the first line of the inputs // return 0, if inputs equal @@ -807,23 +583,6 @@ int fieldlen(const char* r) { return n; } -char* copy_field(char* dest, const char* morph, const char* var) { - if (!morph) - return NULL; - const char* beg = strstr(morph, var); - if (beg) { - char* d = dest; - for (beg += MORPH_TAG_LEN; - *beg != ' ' && *beg != '\t' && *beg != '\n' && *beg != '\0'; - d++, beg++) { - *d = *beg; - } - *d = '\0'; - return dest; - } - return NULL; -} - bool copy_field(std::string& dest, const std::string& morph, const std::string& var) { @@ -884,47 +643,18 @@ char* mystrrep(char* word, const char* pat, const char* rep) { } // reverse word -int reverseword(char* word) { - char r; - for (char *dest = word + strlen(word) - 1; word < dest; word++, dest--) { - r = *word; - *word = *dest; - *dest = r; - } - return 0; -} - -// reverse word -std::string& reverseword(std::string& word) { +size_t reverseword(std::string& word) { std::reverse(word.begin(), word.end()); - return word; -} - -// reverse word (error: 1) -int reverseword_utf(char* word) { - w_char w[MAXWORDLEN]; - w_char* p; - w_char r; - int l = u8_u16(w, MAXWORDLEN, word); - if (l == -1) - return 1; - p = w; - for (w_char *dest = w + l - 1; p < dest; p++, dest--) { - r = *p; - *p = *dest; - *dest = r; - } - u16_u8(word, MAXWORDUTF8LEN, w, l); - return 0; + return word.size(); } // reverse word -std::string& reverseword_utf(std::string& word) { +size_t reverseword_utf(std::string& word) { std::vector<w_char> w; u8_u16(w, word); std::reverse(w.begin(), w.end()); u16_u8(word, w); - return word; + return w.size(); } int uniqlist(char** list, int n) { @@ -978,12 +708,22 @@ unsigned char ccase(const struct cs_info* csconv, int nIndex) { } } -// convert null terminated string to all caps -void mkallcap(char* p, const struct cs_info* csconv) { - while (*p != '\0') { - *p = cupper(csconv, static_cast<unsigned char>(*p)); - p++; +w_char upper_utf(w_char u, int langnum) { + unsigned short idx = (u.h << 8) + u.l; + if (idx != unicodetoupper(idx, langnum)) { + u.h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); + u.l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + } + return u; +} + +w_char lower_utf(w_char u, int langnum) { + unsigned short idx = (u.h << 8) + u.l; + if (idx != unicodetolower(idx, langnum)) { + u.h = (unsigned char)(unicodetolower(idx, langnum) >> 8); + u.l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); } + return u; } // convert std::string to all caps @@ -994,14 +734,6 @@ std::string& mkallcap(std::string& s, const struct cs_info* csconv) { return s; } -// convert null terminated string to all little -void mkallsmall(char* p, const struct cs_info* csconv) { - while (*p != '\0') { - *p = clower(csconv, static_cast<unsigned char>(*p)); - p++; - } -} - // convert std::string to all little std::string& mkallsmall(std::string& s, const struct cs_info* csconv) { for (std::string::iterator aI = s.begin(), aEnd = s.end(); aI != aEnd; ++aI) { @@ -1010,20 +742,9 @@ std::string& mkallsmall(std::string& s, const struct cs_info* csconv) { return s; } -void mkallsmall_utf(w_char* u, int nc, int langnum) { - for (int i = 0; i < nc; i++) { - unsigned short idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetolower(idx, langnum)) { - u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); - } - } -} - std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u, - int nc, int langnum) { - for (int i = 0; i < nc; i++) { + for (size_t i = 0; i < u.size(); ++i) { unsigned short idx = (u[i].h << 8) + u[i].l; if (idx != unicodetolower(idx, langnum)) { u[i].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); @@ -1033,31 +754,51 @@ std::vector<w_char>& mkallsmall_utf(std::vector<w_char>& u, return u; } -void mkallcap_utf(w_char* u, int nc, int langnum) { - for (int i = 0; i < nc; i++) { +std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int langnum) { + for (size_t i = 0; i < u.size(); i++) { unsigned short idx = (u[i].h << 8) + u[i].l; if (idx != unicodetoupper(idx, langnum)) { u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); } } + return u; } -std::vector<w_char>& mkallcap_utf(std::vector<w_char>& u, int nc, int langnum) { - for (int i = 0; i < nc; i++) { - unsigned short idx = (u[i].h << 8) + u[i].l; +std::string& mkinitcap(std::string& s, const struct cs_info* csconv) { + if (!s.empty()) { + s[0] = cupper(csconv, static_cast<unsigned char>(s[0])); + } + return s; +} + +std::vector<w_char>& mkinitcap_utf(std::vector<w_char>& u, int langnum) { + if (!u.empty()) { + unsigned short idx = (u[0].h << 8) + u[0].l; if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); + u[0].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); + u[0].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); } } return u; } -// convert null terminated string to have initial capital -void mkinitcap(char* p, const struct cs_info* csconv) { - if (*p != '\0') - *p = cupper(csconv, static_cast<unsigned char>(*p)); +std::string& mkinitsmall(std::string& s, const struct cs_info* csconv) { + if (!s.empty()) { + s[0] = clower(csconv, static_cast<unsigned char>(s[0])); + } + return s; +} + +std::vector<w_char>& mkinitsmall_utf(std::vector<w_char>& u, int langnum) { + if (!u.empty()) { + unsigned short idx = (u[0].h << 8) + u[0].l; + if (idx != unicodetolower(idx, langnum)) { + u[0].h = (unsigned char)(unicodetolower(idx, langnum) >> 8); + u[0].l = (unsigned char)(unicodetolower(idx, langnum) & 0x00FF); + } + } + return u; } // conversion function for protected memory @@ -1073,35 +814,6 @@ char* get_stored_pointer(const char* s) { } #ifndef MOZILLA_CLIENT -// convert null terminated string to all caps using encoding -void enmkallcap(char* d, const char* p, const char* encoding) - -{ - struct cs_info* csconv = get_current_cs(encoding); - while (*p != '\0') { - *d++ = cupper(csconv, static_cast<unsigned char>(*p)); - p++; - } - *d = '\0'; -} - -// convert null terminated string to all little using encoding -void enmkallsmall(char* d, const char* p, const char* encoding) { - struct cs_info* csconv = get_current_cs(encoding); - while (*p != '\0') { - *d++ = clower(csconv, static_cast<unsigned char>(*p)); - p++; - } - *d = '\0'; -} - -// convert null terminated string to have initial capital using encoding -void enmkinitcap(char* d, const char* p, const char* encoding) { - struct cs_info* csconv = get_current_cs(encoding); - memcpy(d, p, (strlen(p) + 1)); - if (*p != '\0') - *d = cupper(csconv, static_cast<unsigned char>(*p)); -} // these are simple character mappings for the // encodings supported @@ -2982,14 +2694,14 @@ int unicodeisalpha(unsigned short c) { } /* get type of capitalization */ -int get_captype(char* word, int nl, cs_info* csconv) { +int get_captype(const std::string& word, cs_info* csconv) { // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int firstcap = 0; + size_t ncap = 0; + size_t nneutral = 0; + size_t firstcap = 0; if (csconv == NULL) return NOCAP; - for (char* q = word; *q != '\0'; q++) { + for (std::string::const_iterator q = word.begin(); q != word.end(); ++q) { unsigned char nIndex = static_cast<unsigned char>(*q); if (ccase(csconv, nIndex)) ncap++; @@ -3006,7 +2718,7 @@ int get_captype(char* word, int nl, cs_info* csconv) { return NOCAP; } else if ((ncap == 1) && firstcap) { return INITCAP; - } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) { return ALLCAP; } else if ((ncap > 1) && firstcap) { return HUHINITCAP; @@ -3014,27 +2726,20 @@ int get_captype(char* word, int nl, cs_info* csconv) { return HUHCAP; } -int get_captype_utf8(w_char* word, int nl, int langnum) { +int get_captype_utf8(const std::vector<w_char>& word, int langnum) { // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int firstcap = 0; - unsigned short idx; - // don't check too long words - if (nl >= MAXWORDLEN) - return 0; - // big Unicode character (non BMP area) - if (nl == -1) - return NOCAP; - for (int i = 0; i < nl; i++) { - idx = (word[i].h << 8) + word[i].l; + size_t ncap = 0; + size_t nneutral = 0; + size_t firstcap = 0; + for (size_t i = 0; i < word.size(); ++i) { + unsigned short idx = (word[i].h << 8) + word[i].l; if (idx != unicodetolower(idx, langnum)) ncap++; if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; } if (ncap) { - idx = (word[0].h << 8) + word[0].l; + unsigned short idx = (word[0].h << 8) + word[0].l; firstcap = (idx != unicodetolower(idx, langnum)); } @@ -3043,7 +2748,7 @@ int get_captype_utf8(w_char* word, int nl, int langnum) { return NOCAP; } else if ((ncap == 1) && firstcap) { return INITCAP; - } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + } else if ((ncap == word.size()) || ((ncap + nneutral) == word.size())) { return ALLCAP; } else if ((ncap > 1) && firstcap) { return HUHINITCAP; @@ -3052,63 +2757,22 @@ int get_captype_utf8(w_char* word, int nl, int langnum) { } // strip all ignored characters in the string -void remove_ignored_chars_utf(char* word, - unsigned short ignored_chars[], - int ignored_len) { - w_char w[MAXWORDLEN]; - w_char w2[MAXWORDLEN]; - int i; - int j; - int len = u8_u16(w, MAXWORDLEN, word); - for (i = 0, j = 0; i < len; i++) { - if (!flag_bsearch(ignored_chars, ((unsigned short*)w)[i], ignored_len)) { - w2[j] = w[i]; - j++; - } - } - if (j < i) - u16_u8(word, MAXWORDUTF8LEN, w2, j); -} - -namespace { -union w_s { - w_char w; - unsigned short s; -}; - -unsigned short asushort(w_char in) { - w_s c; - c.w = in; - return c.s; -} -} - -// strip all ignored characters in the string -std::string& remove_ignored_chars_utf(std::string& word, - unsigned short ignored_chars[], - int ignored_len) { +size_t remove_ignored_chars_utf(std::string& word, + const std::vector<w_char>& ignored_chars) { std::vector<w_char> w; std::vector<w_char> w2; u8_u16(w, word); for (size_t i = 0; i < w.size(); ++i) { - if (!flag_bsearch(ignored_chars, asushort(w[i]), ignored_len)) + if (!std::binary_search(ignored_chars.begin(), + ignored_chars.end(), + w[i])) { w2.push_back(w[i]); + } } u16_u8(word, w2); - return word; -} - -// strip all ignored characters in the string -void remove_ignored_chars(char* word, char* ignored_chars) { - for (char* p = word; *p != '\0'; p++) { - if (!strchr(ignored_chars, *p)) { - *word = *p; - word++; - } - } - *word = '\0'; + return w2.size(); } namespace { @@ -3119,16 +2783,17 @@ class is_any_of { bool operator()(char c) { return chars.find(c) != std::string::npos; } private: - const std::string& chars; + std::string chars; }; } // strip all ignored characters in the string -std::string& remove_ignored_chars(std::string& word, - const std::string& ignored_chars) { +size_t remove_ignored_chars(std::string& word, + const std::string& ignored_chars) { word.erase( - std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars))); - return word; + std::remove_if(word.begin(), word.end(), is_any_of(ignored_chars)), + word.end()); + return word.size(); } int parse_string(char* line, char** out, int ln) { @@ -3170,25 +2835,16 @@ int parse_string(char* line, char** out, int ln) { return 0; } -int parse_array(char* line, - char** out, - unsigned short** out_utf16, - int* out_utf16_len, - int utf8, - int ln) { +bool parse_array(char* line, + char** out, + std::vector<w_char>& out_utf16, + int utf8, + int ln) { if (parse_string(line, out, ln)) - return 1; + return false; if (utf8) { - w_char w[MAXWORDLEN]; - int n = u8_u16(w, MAXWORDLEN, *out); - if (n > 0) { - flag_qsort((unsigned short*)w, 0, n); - *out_utf16 = (unsigned short*)malloc(n * sizeof(unsigned short)); - if (!*out_utf16) - return 1; - memcpy(*out_utf16, w, n * sizeof(unsigned short)); - } - *out_utf16_len = n; + u8_u16(out_utf16, *out); + std::sort(out_utf16.begin(), out_utf16.end()); } - return 0; + return true; } |