summaryrefslogtreecommitdiff
path: root/plugins/SpellChecker/src/hunspell/hunspell.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/SpellChecker/src/hunspell/hunspell.cxx')
-rw-r--r--plugins/SpellChecker/src/hunspell/hunspell.cxx2035
1 files changed, 0 insertions, 2035 deletions
diff --git a/plugins/SpellChecker/src/hunspell/hunspell.cxx b/plugins/SpellChecker/src/hunspell/hunspell.cxx
deleted file mode 100644
index b5dcfd57fb..0000000000
--- a/plugins/SpellChecker/src/hunspell/hunspell.cxx
+++ /dev/null
@@ -1,2035 +0,0 @@
-#include "..\commons.h"
-
-#ifndef MOZILLA_CLIENT
-# include "config.h"
-#endif
-
-Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
-{
- encoding = NULL;
- csconv = NULL;
- utf8 = 0;
- complexprefixes = 0;
- affixpath = mystrdup(affpath);
- maxdic = 0;
-
- /* first set up the hash manager */
- pHMgr[0] = new HashMgr(dpath, affpath, key);
- if (pHMgr[0]) maxdic = 1;
-
- /* next set up the affix manager */
- /* it needs access to the hash manager lookup methods */
- pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
-
- /* get the preferred try string and the dictionary */
- /* encoding from the Affix Manager for that dictionary */
- char * try_string = pAMgr->get_try_string();
- encoding = pAMgr->get_encoding();
- langnum = pAMgr->get_langnum();
- utf8 = pAMgr->get_utf8();
- if (!utf8)
- csconv = get_current_cs(encoding);
- complexprefixes = pAMgr->get_complexprefixes();
- wordbreak = pAMgr->get_breaktable();
-
- /* and finally set up the suggestion manager */
- pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
- if (try_string) free(try_string);
-}
-
-Hunspell::~Hunspell()
-{
- if (pSMgr) delete pSMgr;
- if (pAMgr) delete pAMgr;
- for (int i = 0; i < maxdic; i++) delete pHMgr[i];
- maxdic = 0;
- pSMgr = NULL;
- pAMgr = NULL;
-#ifdef MOZILLA_CLIENT
- delete [] csconv;
-#endif
- csconv= NULL;
- if (encoding) free(encoding);
- encoding = NULL;
- if (affixpath) free(affixpath);
- affixpath = NULL;
-}
-
-// load extra dictionaries
-int Hunspell::add_dic(const char * dpath, const char * key) {
- if (maxdic == MAXDIC || !affixpath) return 1;
- pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
- if (pHMgr[maxdic]) maxdic++; else return 1;
- return 0;
-}
-
-// make a copy of src at destination while removing all leading
-// blanks and removing any trailing periods after recording
-// their presence with the abbreviation flag
-// also since already going through character by character,
-// set the capitalization type
-// return the length of the "cleaned" (and UTF-8 encoded) word
-
-int Hunspell::cleanword2(char * dest, const char * src,
- w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
-{
- unsigned char * p = (unsigned char *) dest;
- const unsigned char * q = (const unsigned char * ) src;
-
- // first skip over any leading blanks
- while ((*q != '\0') && (*q == ' ')) q++;
-
- // now strip off any trailing periods (recording their presence)
- *pabbrev = 0;
- int nl = strlen((const char *)q);
- while ((nl > 0) && (*(q+nl-1)=='.')) {
- nl--;
- (*pabbrev)++;
- }
-
- // if no characters are left it can't be capitalized
- if (nl <= 0) {
- *pcaptype = NOCAP;
- *p = '\0';
- return 0;
- }
-
- strncpy(dest, (char *) q, nl);
- *(dest + nl) = '\0';
- nl = strlen(dest);
- if (utf8) {
- *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
- // don't check too long words
- if (*nc >= MAXWORDLEN) return 0;
- if (*nc == -1) { // big Unicode character (non BMP area)
- *pcaptype = NOCAP;
- return nl;
- }
- *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
- } else {
- *pcaptype = get_captype(dest, nl, csconv);
- *nc = nl;
- }
- return nl;
-}
-
-int Hunspell::cleanword(char * dest, const char * src,
- int * pcaptype, int * pabbrev)
-{
- unsigned char * p = (unsigned char *) dest;
- const unsigned char * q = (const unsigned char * ) src;
- int firstcap = 0;
-
- // first skip over any leading blanks
- while ((*q != '\0') && (*q == ' ')) q++;
-
- // now strip off any trailing periods (recording their presence)
- *pabbrev = 0;
- int nl = strlen((const char *)q);
- while ((nl > 0) && (*(q+nl-1)=='.')) {
- nl--;
- (*pabbrev)++;
- }
-
- // if no characters are left it can't be capitalized
- if (nl <= 0) {
- *pcaptype = NOCAP;
- *p = '\0';
- return 0;
- }
-
- // now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int nc = 0;
-
- if (!utf8) {
- while (nl > 0) {
- nc++;
- if (csconv[(*q)].ccase) ncap++;
- if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
- *p++ = *q++;
- nl--;
- }
- // remember to terminate the destination string
- *p = '\0';
- firstcap = csconv[(unsigned char)(*dest)].ccase;
- } else {
- unsigned short idx;
- w_char t[MAXWORDLEN];
- nc = u8_u16(t, MAXWORDLEN, src);
- for (int i = 0; i < nc; i++) {
- idx = (t[i].h << 8) + t[i].l;
- unsigned short low = unicodetolower(idx, langnum);
- if (idx != low) ncap++;
- if (unicodetoupper(idx, langnum) == low) nneutral++;
- }
- u16_u8(dest, MAXWORDUTF8LEN, t, nc);
- if (ncap) {
- idx = (t[0].h << 8) + t[0].l;
- firstcap = (idx != unicodetolower(idx, langnum));
- }
- }
-
- // now finally set the captype
- if (ncap == 0) {
- *pcaptype = NOCAP;
- } else if ((ncap == 1) && firstcap) {
- *pcaptype = INITCAP;
- } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
- *pcaptype = ALLCAP;
- } else if ((ncap > 1) && firstcap) {
- *pcaptype = HUHINITCAP;
- } else {
- *pcaptype = HUHCAP;
- }
- return strlen(dest);
-}
-
-void Hunspell::mkallcap(char * p)
-{
- if (utf8) {
- w_char u[MAXWORDLEN];
- int nc = u8_u16(u, MAXWORDLEN, p);
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- if (idx != unicodetoupper(idx, langnum)) {
- u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
- u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- } else {
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].cupper;
- p++;
- }
- }
-}
-
-int Hunspell::mkallcap2(char * p, w_char * u, int nc)
-{
- if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short up = unicodetoupper(idx, langnum);
- if (idx != up) {
- u[i].h = (unsigned char) (up >> 8);
- u[i].l = (unsigned char) (up & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
- } else {
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].cupper;
- p++;
- }
- }
- return nc;
-}
-
-
-void Hunspell::mkallsmall(char * p)
-{
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].clower;
- p++;
- }
-}
-
-int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
-{
- if (utf8) {
- unsigned short idx;
- for (int i = 0; i < nc; i++) {
- idx = (u[i].h << 8) + u[i].l;
- unsigned short low = unicodetolower(idx, langnum);
- if (idx != low) {
- u[i].h = (unsigned char) (low >> 8);
- u[i].l = (unsigned char) (low & 0x00FF);
- }
- }
- u16_u8(p, MAXWORDUTF8LEN, u, nc);
- return strlen(p);
- } else {
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].clower;
- p++;
- }
- }
- return nc;
-}
-
-// convert UTF-8 sharp S codes to latin 1
-char * Hunspell::sharps_u8_l1(char * dest, char * source) {
- char * p = dest;
- *p = *source;
- for (p++, source++; *(source - 1); p++, source++) {
- *p = *source;
- if (*source == '\x9F') *--p = '\xDF';
- }
- return dest;
-}
-
-// recursive search for right ss - sharp s permutations
-hentry * Hunspell::spellsharps(char * base, char * pos, int n,
- int repnum, char * tmp, int * info, char **root) {
- pos = strstr(pos, "ss");
- if (pos && (n < MAXSHARPS)) {
- *pos = '\xC3';
- *(pos + 1) = '\x9F';
- hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
- if (h) return h;
- *pos = 's';
- *(pos + 1) = 's';
- h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
- if (h) return h;
- } else if (repnum > 0) {
- if (utf8) return checkword(base, info, root);
- return checkword(sharps_u8_l1(tmp, base), info, root);
- }
- return NULL;
-}
-
-int Hunspell::is_keepcase(const hentry * rv) {
- return pAMgr && rv->astr && pAMgr->get_keepcase() &&
- TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
-}
-
-/* insert a word to the beginning of the suggestion array and return ns */
-int Hunspell::insert_sug(char ***slst, char * word, int ns) {
- char * dup = mystrdup(word);
- if (!dup) return ns;
- if (ns == MAXSUGGESTION) {
- ns--;
- free((*slst)[ns]);
- }
- for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
- (*slst)[0] = dup;
- return ns + 1;
-}
-
-int Hunspell::spell(const char * word, int * info, char ** root)
-{
- struct hentry * rv=NULL;
- // need larger vector. For example, Turkish capital letter I converted a
- // 2-byte UTF-8 character (dotless i) by mkallsmall.
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- w_char unicw[MAXWORDLEN];
-
- int info2 = 0;
- if (!info) info = &info2; else *info = 0;
-
- // Hunspell supports XML input of the simplified API (see manual)
- if (strcmp(word, SPELL_XML) == 0) return 1;
- int nc = strlen(word);
- int wl2 = 0;
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN) return 0;
- } else {
- if (nc >= MAXWORDLEN) return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0 || maxdic == 0) return 1;
- if (root) *root = NULL;
-
- // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
- enum { NBEGIN, NNUM, NSEP };
- int nstate = NBEGIN;
- int i;
-
- for (i = 0; (i < wl); i++) {
- if ((cw[i] <= '9') && (cw[i] >= '0')) {
- nstate = NNUM;
- } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
- if ((nstate == NSEP) || (i == 0)) break;
- nstate = NSEP;
- } else break;
- }
- if ((i == wl) && (nstate == NNUM)) return 1;
-
- switch(captype) {
- case HUHCAP:
- /* FALLTHROUGH */
- case HUHINITCAP:
- *info += SPELL_ORIGCAP;
- /* FALLTHROUGH */
- case NOCAP:
- rv = checkword(cw, info, root);
- if ((abbv) && !(rv)) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = checkword(wspace, info, root);
- }
- break;
- case ALLCAP: {
- *info += SPELL_ORIGCAP;
- rv = checkword(cw, info, root);
- if (rv) break;
- if (abbv) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = checkword(wspace, info, root);
- if (rv) break;
- }
- // Spec. prefix handling for Catalan, French, Italian:
- // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
- if (pAMgr && strchr(cw, '\'')) {
- wl = mkallsmall2(cw, unicw, nc);
- //There are no really sane circumstances where this could fail,
- //but anyway...
- if (char * apostrophe = strchr(cw, '\'')) {
- if (utf8) {
- w_char tmpword[MAXWORDLEN];
- *apostrophe = '\0';
- wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
- *apostrophe = '\'';
- if (wl2 >= 0 && wl2 < nc) {
- mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
- rv = checkword(cw, info, root);
- if (rv) break;
- }
- } else {
- mkinitcap2(apostrophe + 1, unicw, nc);
- rv = checkword(cw, info, root);
- if (rv) break;
- }
- }
- mkinitcap2(cw, unicw, nc);
- rv = checkword(cw, info, root);
- if (rv) break;
- }
- if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
- char tmpword[MAXWORDUTF8LEN];
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace,cw,(wl+1));
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
- if (!rv) {
- wl2 = mkinitcap2(cw, unicw, nc);
- rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
- }
- if ((abbv) && !(rv)) {
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
- if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace+wl2) = '.';
- *(wspace+wl2+1) = '\0';
- rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
- }
- }
- if (rv) break;
- }
- }
- case INITCAP: {
- *info += SPELL_ORIGCAP;
- wl = mkallsmall2(cw, unicw, nc);
- memcpy(wspace,cw,(wl+1));
- wl2 = mkinitcap2(cw, unicw, nc);
- if (captype == INITCAP) *info += SPELL_INITCAP;
- rv = checkword(cw, info, root);
- if (captype == INITCAP) *info -= SPELL_INITCAP;
- // forbid bad capitalization
- // (for example, ijs -> Ijs instead of IJs in Dutch)
- // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
- if (*info & SPELL_FORBIDDEN) {
- rv = NULL;
- break;
- }
- if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
- if (rv) break;
-
- rv = checkword(wspace, info, root);
- if (abbv && !rv) {
-
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = checkword(wspace, info, root);
- if (!rv) {
- memcpy(wspace, cw, wl2);
- *(wspace+wl2) = '.';
- *(wspace+wl2+1) = '\0';
- if (captype == INITCAP) *info += SPELL_INITCAP;
- rv = checkword(wspace, info, root);
- if (captype == INITCAP) *info -= SPELL_INITCAP;
- if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
- break;
- }
- }
- if (rv && is_keepcase(rv) &&
- ((captype == ALLCAP) ||
- // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
- // in INITCAP form, too.
- !(pAMgr->get_checksharps() &&
- ((utf8 && strstr(wspace, "\xC3\x9F")) ||
- (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
- break;
- }
- }
-
- if (rv) {
- if (pAMgr && pAMgr->get_warn() && rv->astr &&
- TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
- *info += SPELL_WARN;
- if (pAMgr->get_forbidwarn()) return 0;
- return HUNSPELL_OK_WARN;
- }
- return HUNSPELL_OK;
- }
-
- // recursive breaking at break points
- if (wordbreak) {
- char * s;
- char r;
- int nbr = 0;
- wl = strlen(cw);
- int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
-
- // calculate break points for recursion limit
- for (int j = 0; j < numbreak; j++) {
- s = cw;
- do {
- s = (char *) strstr(s, wordbreak[j]);
- if (s) {
- nbr++;
- s++;
- }
- } while (s);
- }
- if (nbr >= 10) return 0;
-
- // check boundary patterns (^begin and end$)
- for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
- if (plen == 1 || plen > wl) continue;
- if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
- && spell(cw + plen - 1)) return 1;
- if (wordbreak[j][plen - 1] == '$' &&
- strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
- r = cw[wl - plen + 1];
- cw[wl - plen + 1] = '\0';
- if (spell(cw)) return 1;
- cw[wl - plen + 1] = r;
- }
- }
-
- // other patterns
- for (int j = 0; j < numbreak; j++) {
- int plen = strlen(wordbreak[j]);
- s=(char *) strstr(cw, wordbreak[j]);
- if (s && (s > cw) && (s < cw + wl - plen)) {
- if (!spell(s + plen)) continue;
- r = *s;
- *s = '\0';
- // examine 2 sides of the break point
- if (spell(cw)) return 1;
- *s = r;
-
- // LANG_hu: spec. dash rule
- if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
- r = s[1];
- s[1] = '\0';
- if (spell(cw)) return 1; // check the first part with dash
- s[1] = r;
- }
- // end of LANG speficic region
-
- }
- }
- }
-
- return 0;
-}
-
-struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
-{
- struct hentry * he = NULL;
- int len, i;
- char w2[MAXWORDUTF8LEN];
- const char * word;
-
- char * ignoredchars = pAMgr->get_ignore();
- if (ignoredchars != NULL) {
- strcpy(w2, w);
- if (utf8) {
- int ignoredchars_utf16_len;
- unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
- remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
- } else {
- remove_ignored_chars(w2,ignoredchars);
- }
- word = w2;
- } else word = w;
-
- len = strlen(word);
-
- if (!len)
- return NULL;
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- if (word != w2) {
- strcpy(w2, word);
- word = w2;
- }
- if (utf8) reverseword_utf(w2); else reverseword(w2);
- }
-
- // look word in hash table
- for (i = 0; (i < maxdic) && !he; i ++) {
- he = (pHMgr[i])->lookup(word);
-
- // check forbidden and onlyincompound words
- if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
- if (info) *info += SPELL_FORBIDDEN;
- // LANG_hu section: set dash information for suggestions
- if (langnum == LANG_hu) {
- if (pAMgr->get_compoundflag() &&
- TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
- if (info) *info += SPELL_COMPOUND;
- }
- }
- return NULL;
- }
-
- // he = next not needaffix, onlyincompound homonym or onlyupcase word
- while (he && (he->astr) &&
- ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
- (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
- (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
- )) he = he->next_homonym;
- }
-
- // check with affixes
- if (!he && pAMgr) {
- // try stripping off affixes */
- he = pAMgr->affix_check(word, len, 0);
-
- // check compound restriction and onlyupcase
- if (he && he->astr && (
- (pAMgr->get_onlyincompound() &&
- TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
- (info && (*info & SPELL_INITCAP) &&
- TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
- he = NULL;
- }
-
- if (he) {
- if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
- if (info) *info += SPELL_FORBIDDEN;
- return NULL;
- }
- if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
- if (utf8) reverseword_utf(*root); else reverseword(*root);
- }
- }
- // try check compound word
- } else if (pAMgr->get_compound()) {
- he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
- // LANG_hu section: `moving rule' with last dash
- if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
- char * dup = mystrdup(word);
- if (!dup) return NULL;
- dup[len-1] = '\0';
- he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info);
- free(dup);
- }
- // end of LANG speficic region
- if (he) {
- if (root) {
- *root = mystrdup(he->word);
- if (*root && complexprefixes) {
- if (utf8) reverseword_utf(*root); else reverseword(*root);
- }
- }
- if (info) *info += SPELL_COMPOUND;
- }
- }
-
- }
-
- return he;
-}
-
-int Hunspell::suggest(char*** slst, const char * word)
-{
- int onlycmpdsug = 0;
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0) return 0;
- w_char unicw[MAXWORDLEN];
- *slst = NULL;
- // process XML input of the simplified API (see manual)
- if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
- return spellml(slst, word);
- }
- int nc = strlen(word);
- if (utf8) {
- if (nc >= MAXWORDUTF8LEN) return 0;
- } else {
- if (nc >= MAXWORDLEN) return 0;
- }
- int captype = 0;
- int abbv = 0;
- int wl = 0;
-
- // input conversion
- RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
- if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
- else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
- if (wl == 0) return 0;
- int ns = 0;
- int capwords = 0;
-
- // check capitalized form for FORCEUCASE
- if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
- int info = SPELL_ORIGCAP;
- char ** wlst;
- if (checkword(cw, &info, NULL)) {
- if (*slst) {
- wlst = *slst;
- } else {
- wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *));
- if (wlst == NULL) return -1;
- *slst = wlst;
- for (int i = 0; i < MAXSUGGESTION; i++) {
- wlst[i] = NULL;
- }
- }
- wlst[0] = mystrdup(cw);
- mkinitcap(wlst[0]);
- return 1;
- }
- }
-
- switch(captype) {
- case NOCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
- break;
- }
-
- case INITCAP: {
- capwords = 1;
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
- if (ns == -1) break;
- memcpy(wspace,cw,(wl+1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- break;
- }
- case HUHINITCAP:
- capwords = 1;
- case HUHCAP: {
- ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
- if (ns != -1) {
- int prevns;
- // something.The -> something. The
- char * dot = strchr(cw, '.');
- if (dot && (dot > cw)) {
- int captype_;
- if (utf8)
- {
- w_char w_[MAXWORDLEN];
- int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
- captype_ = get_captype_utf8(w_, wl_, langnum);
- } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
- if (captype_ == INITCAP)
- {
- char * st = mystrdup(cw);
- if (st)
- {
- char *newst = (char *) realloc(st, wl + 2);
- if (newst == NULL)
- free(st);
- st = newst;
- }
- if (st)
- {
- st[(dot - cw) + 1] = ' ';
- strcpy(st + (dot - cw) + 2, dot + 1);
- ns = insert_sug(slst, st, ns);
- free(st);
- }
- }
- }
- if (captype == HUHINITCAP) {
- // TheOpenOffice.org -> The OpenOffice.org
- memcpy(wspace,cw,(wl+1));
- mkinitsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- }
- memcpy(wspace,cw,(wl+1));
- mkallsmall2(wspace, unicw, nc);
- if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
- prevns = ns;
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- if (captype == HUHINITCAP) {
- mkinitcap2(wspace, unicw, nc);
- if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- }
- // aNew -> "a New" (instead of "a new")
- for (int j = prevns; j < ns; j++) {
- char * space = strchr((*slst)[j],' ');
- if (space) {
- int slen = strlen(space + 1);
- // different case after space (need capitalisation)
- if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
- w_char w[MAXWORDLEN];
- int wc = 0;
- char * r = (*slst)[j];
- if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
- mkinitcap2(space + 1, w, wc);
- // set as first suggestion
- for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
- (*slst)[0] = r;
- }
- }
- }
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace, cw, (wl+1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- if (ns == -1) break;
- if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
- ns = insert_sug(slst, wspace, ns);
- mkinitcap2(wspace, unicw, nc);
- ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
- for (int j=0; j < ns; j++) {
- mkallcap((*slst)[j]);
- if (pAMgr && pAMgr->get_checksharps()) {
- char * pos;
- if (utf8) {
- pos = strstr((*slst)[j], "\xC3\x9F");
- while (pos) {
- *pos = 'S';
- *(pos+1) = 'S';
- pos = strstr(pos+2, "\xC3\x9F");
- }
- } else {
- pos = strchr((*slst)[j], '\xDF');
- while (pos) {
- (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
- mystrrep((*slst)[j], "\xDF", "SS");
- pos = strchr((*slst)[j], '\xDF');
- }
- }
- }
- }
- break;
- }
- }
-
- // LANG_hu section: replace '-' with ' ' in Hungarian
- if (langnum == LANG_hu) {
- for (int j=0; j < ns; j++) {
- char * pos = strchr((*slst)[j],'-');
- if (pos) {
- int info;
- char w[MAXWORDUTF8LEN];
- *pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- (void)spell(w, &info, NULL);
- if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
- *pos = ' ';
- } else *pos = '-';
- }
- }
- }
- // END OF LANG_hu section
-
- // try ngram approach since found nothing or only compound words
- if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) {
- switch(captype) {
- case NOCAP: {
- ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
- break;
- }
- case HUHINITCAP:
- capwords = 1;
- case HUHCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
- break;
- }
- case INITCAP: {
- capwords = 1;
- memcpy(wspace,cw,(wl+1));
- mkallsmall2(wspace, unicw, nc);
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
- break;
- }
- case ALLCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall2(wspace, unicw, nc);
- int oldns = ns;
- ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
- for (int j = oldns; j < ns; j++)
- mkallcap((*slst)[j]);
- break;
- }
- }
- }
-
- // try dash suggestion (Afo-American -> Afro-American)
- if (char * pos = strchr(cw, '-')) {
- char * ppos = cw;
- int nodashsug = 1;
- char ** nlst = NULL;
- int nn = 0;
- int last = 0;
- if (*slst) {
- for (int j = 0; j < ns && nodashsug == 1; j++) {
- if (strchr((*slst)[j], '-')) nodashsug = 0;
- }
- }
- while (nodashsug && !last) {
- if (*pos == '\0') last = 1; else *pos = '\0';
- if (!spell(ppos)) {
- nn = suggest(&nlst, ppos);
- for (int j = nn - 1; j >= 0; j--) {
- strncpy(wspace, cw, ppos - cw);
- strcpy(wspace + (ppos - cw), nlst[j]);
- if (!last) {
- strcat(wspace, "-");
- strcat(wspace, pos + 1);
- }
- ns = insert_sug(slst, wspace, ns);
- free(nlst[j]);
- }
- if (nlst != NULL) free(nlst);
- nodashsug = 0;
- }
- if (!last) {
- *pos = '-';
- ppos = pos + 1;
- pos = strchr(ppos, '-');
- }
- if (!pos) pos = cw + strlen(cw);
- }
- }
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- for (int j = 0; j < ns; j++) {
- if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
- }
- }
-
- // capitalize
- if (capwords) for (int j=0; j < ns; j++) {
- mkinitcap((*slst)[j]);
- }
-
- // expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (int j = 0; j < ns; j++) {
- (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
- strcat((*slst)[j], word + strlen(word) - abbv);
- }
- }
-
- // remove bad capitalized and forbidden forms
- if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
- switch (captype) {
- case INITCAP:
- case ALLCAP: {
- int l = 0;
- for (int j=0; j < ns; j++) {
- if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
- char s[MAXSWUTF8L];
- w_char w[MAXSWL];
- int len;
- if (utf8) {
- len = u8_u16(w, MAXSWL, (*slst)[j]);
- } else {
- strcpy(s, (*slst)[j]);
- len = strlen(s);
- }
- mkallsmall2(s, w, len);
- free((*slst)[j]);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
- if ((*slst)[l]) l++;
- } else {
- mkinitcap2(s, w, len);
- if (spell(s)) {
- (*slst)[l] = mystrdup(s);
- if ((*slst)[l]) l++;
- }
- }
- } else {
- (*slst)[l] = (*slst)[j];
- l++;
- }
- }
- ns = l;
- }
- }
- }
-
- // remove duplications
- int l = 0;
- for (int j = 0; j < ns; j++) {
- (*slst)[l] = (*slst)[j];
- for (int k = 0; k < l; k++) {
- if (strcmp((*slst)[k], (*slst)[j]) == 0) {
- free((*slst)[j]);
- l--;
- break;
- }
- }
- l++;
- }
- ns = l;
-
- // output conversion
- rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
- for (int j = 0; rl && j < ns; j++) {
- if (rl->conv((*slst)[j], wspace)) {
- free((*slst)[j]);
- (*slst)[j] = mystrdup(wspace);
- }
- }
-
- // if suggestions removed by nosuggest, onlyincompound parameters
- if (l == 0 && *slst) {
- free(*slst);
- *slst = NULL;
- }
- return l;
-}
-
-void Hunspell::free_list(char *** slst, int n) {
- freelist(slst, n);
-}
-
-char * Hunspell::get_dic_encoding()
-{
- return encoding;
-}
-
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX need UTF-8 support
-int Hunspell::suggest_auto(char*** slst, const char * word)
-{
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (!pSMgr || maxdic == 0) return 0;
- int wl = strlen(word);
- if (utf8) {
- if (wl >= MAXWORDUTF8LEN) return 0;
- } else {
- if (wl >= MAXWORDLEN) return 0;
- }
- int captype = 0;
- int abbv = 0;
- wl = cleanword(cw, word, &captype, &abbv);
- if (wl == 0) return 0;
- int ns = 0;
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch(captype) {
- case NOCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns>0) break;
- break;
- }
-
- case INITCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- for (int j=0; j < ns; j++)
- mkinitcap((*slst)[j]);
- ns = pSMgr->suggest_auto(slst, cw, ns);
- break;
-
- }
-
- case HUHINITCAP:
- case HUHCAP: {
- ns = pSMgr->suggest_auto(slst, cw, ns);
- if (ns == 0) {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- mkinitcap(wspace);
- ns = pSMgr->suggest_auto(slst, wspace, ns);
-
- for (int j=0; j < ns; j++)
- mkallcap((*slst)[j]);
- break;
- }
- }
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
- for (int j = 0; j < ns; j++) {
- if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
- }
- }
-
- // expand suggestions with dot(s)
- if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
- for (int j = 0; j < ns; j++) {
- (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
- strcat((*slst)[j], word + strlen(word) - abbv);
- }
- }
-
- // LANG_hu section: replace '-' with ' ' in Hungarian
- if (langnum == LANG_hu) {
- for (int j=0; j < ns; j++) {
- char * pos = strchr((*slst)[j],'-');
- if (pos) {
- int info;
- char w[MAXWORDUTF8LEN];
- *pos = '\0';
- strcpy(w, (*slst)[j]);
- strcat(w, pos + 1);
- spell(w, &info, NULL);
- if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
- *pos = ' ';
- } else *pos = '-';
- }
- }
- }
- // END OF LANG_hu section
- return ns;
-}
-#endif
-
-int Hunspell::stem(char*** slst, char ** desc, int n)
-{
- char result[MAXLNLEN];
- char result2[MAXLNLEN];
- *slst = NULL;
- if (n == 0) return 0;
- *result2 = '\0';
- for (int i = 0; i < n; i++) {
- *result = '\0';
- // add compound word parts (except the last one)
- char * s = (char *) desc[i];
- char * part = strstr(s, MORPH_PART);
- if (part) {
- char * nextpart = strstr(part + 1, MORPH_PART);
- while (nextpart) {
- copy_field(result + strlen(result), part, MORPH_PART);
- part = nextpart;
- nextpart = strstr(part + 1, MORPH_PART);
- }
- s = part;
- }
-
- char **pl;
- char tok[MAXLNLEN];
- strcpy(tok, s);
- char * alt = strstr(tok, " | ");
- while (alt) {
- alt[1] = MSEP_ALT;
- alt = strstr(alt, " | ");
- }
- int pln = line_tok(tok, &pl, MSEP_ALT);
- for (int k = 0; k < pln; k++) {
- // add derivational suffixes
- if (strstr(pl[k], MORPH_DERI_SFX)) {
- // remove inflectional suffixes
- char * is = strstr(pl[k], MORPH_INFL_SFX);
- if (is) *is = '\0';
- char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
- if (sg) {
- char ** gen;
- int genl = line_tok(sg, &gen, MSEP_REC);
- free(sg);
- for (int j = 0; j < genl; j++) {
- sprintf(result2 + strlen(result2), "%c%s%s",
- MSEP_REC, result, gen[j]);
- }
- freelist(&gen, genl);
- }
- } else {
- sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
- if (strstr(pl[k], MORPH_SURF_PFX)) {
- copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
- }
- copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
- }
- }
- freelist(&pl, pln);
- }
- int sln = line_tok(result2, slst, MSEP_REC);
- return uniqlist(*slst, sln);
-
-}
-
-int Hunspell::stem(char*** slst, const char * word)
-{
- char ** pl;
- int pln = analyze(&pl, word);
- int pln2 = stem(slst, pl, pln);
- freelist(&pl, pln);
- return pln2;
-}
-
-#ifdef HUNSPELL_EXPERIMENTAL
-int Hunspell::suggest_pos_stems(char*** slst, const char * word)
-{
- char cw[MAXWORDUTF8LEN];
- char wspace[MAXWORDUTF8LEN];
- if (! pSMgr || maxdic == 0) return 0;
- int wl = strlen(word);
- if (utf8) {
- if (wl >= MAXWORDUTF8LEN) return 0;
- } else {
- if (wl >= MAXWORDLEN) return 0;
- }
- int captype = 0;
- int abbv = 0;
- wl = cleanword(cw, word, &captype, &abbv);
- if (wl == 0) return 0;
-
- int ns = 0; // ns=0 = normalized input
-
- *slst = NULL; // HU, nsug in pSMgr->suggest
-
- switch(captype) {
- case HUHCAP:
- case NOCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if ((abbv) && (ns == 0)) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
- }
-
- case INITCAP: {
-
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
-
- if (ns == 0 || ((*slst)[0][0] == '#')) {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
-
- break;
-
- }
-
- case ALLCAP: {
- ns = pSMgr->suggest_pos_stems(slst, cw, ns);
- if (ns != 0) break;
-
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
-
- if (ns == 0) {
- mkinitcap(wspace);
- ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
- }
- break;
- }
- }
-
- return ns;
-}
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
-// Return the word-character string reported by the affix manager.
-const char * Hunspell::get_wordchars()
-{
-  const char * wordchars = pAMgr->get_wordchars();
-  return wordchars;
-}
-
-// Return the affix manager's UTF-16 word-character table; `len` is passed
-// through to the affix manager (presumably filled with the table length).
-unsigned short * Hunspell::get_wordchars_utf16(int * len)
-{
-  unsigned short * wordchars = pAMgr->get_wordchars_utf16(len);
-  return wordchars;
-}
-
-// Return the affix manager's TRY string.
-// NOTE(review): the constructor free()s the string it gets from the same
-// affix-manager call, so the result is presumably owned by the caller -
-// confirm against AffixMgr::get_try_string.
-char * Hunspell::get_try_string()
-{
-  char * try_string = pAMgr->get_try_string();
-  return try_string;
-}
-
-// Upper-case the first character of `p` in place.
-// 8-bit encodings: the first byte is mapped through the csconv table.
-// UTF-8: the string is converted to UTF-16, the first unit is upper-cased
-// with unicodetoupper() for the dictionary language and the whole string is
-// written back into `p` (bounded by MAXWORDUTF8LEN).
-void Hunspell::mkinitcap(char * p)
-{
-  if (!utf8) {
-    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
-    return;
-  }
-
-  w_char u[MAXWORDLEN];
-  const int len = u8_u16(u, MAXWORDLEN, p);
-  // Nothing was decoded: u[0] is uninitialized, so leave p untouched.
-  // This is the same guard mkinitcap2() applies with `nc > 0`.
-  if (len <= 0) return;
-
-  const unsigned short upper = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
-  u[0].h = (unsigned char) (upper >> 8);
-  u[0].l = (unsigned char) (upper & 0x00FF);
-  u16_u8(p, MAXWORDUTF8LEN, u, len);
-}
-
-// Upper-case the first character of a word held both as bytes (`p`) and as
-// `nc` UTF-16 units (`u`).
-// 8-bit encodings: only `p` is touched and `nc` is returned unchanged.
-// UTF-8 with nc > 0: u[0] is upper-cased, `p` is regenerated from `u`, and
-// the new byte length of `p` is returned. Otherwise `nc` is returned.
-int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
-{
-  if (!utf8) {
-    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
-    return nc;
-  }
-  if (nc <= 0) return nc;
-
-  const unsigned short upper = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
-  u[0].h = (unsigned char) (upper >> 8);
-  u[0].l = (unsigned char) (upper & 0x00FF);
-  u16_u8(p, MAXWORDUTF8LEN, u, nc);
-  return strlen(p);
-}
-
-// Lower-case the first character of a word held both as bytes (`p`) and as
-// `nc` UTF-16 units (`u`).
-// 8-bit encodings: only `p` is touched and `nc` is returned unchanged.
-// UTF-8 with nc > 0: u[0] is lower-cased, `p` is regenerated from `u`, and
-// the new byte length of `p` is returned. Otherwise `nc` is returned.
-int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
-{
-  if (!utf8) {
-    if (*p != '\0') *p = csconv[((unsigned char)*p)].clower;
-    return nc;
-  }
-  if (nc <= 0) return nc;
-
-  const unsigned short lower = unicodetolower((u[0].h << 8) + u[0].l, langnum);
-  u[0].h = (unsigned char) (lower >> 8);
-  u[0].l = (unsigned char) (lower & 0x00FF);
-  u16_u8(p, MAXWORDUTF8LEN, u, nc);
-  return strlen(p);
-}
-
-// Add `word` to the first dictionary's hash manager.
-// Returns the hash manager's result, or 0 when there is no first dictionary.
-int Hunspell::add(const char * word)
-{
-  if (!pHMgr[0]) return 0;
-  return pHMgr[0]->add(word);
-}
-
-// Add `word` to the first dictionary's hash manager together with
-// `example`, forwarding both to HashMgr::add_with_affix.
-// Returns the hash manager's result, or 0 when there is no first dictionary.
-int Hunspell::add_with_affix(const char * word, const char * example)
-{
-  if (!pHMgr[0]) return 0;
-  return pHMgr[0]->add_with_affix(word, example);
-}
-
-// Remove `word` via the first dictionary's hash manager.
-// Returns the hash manager's result, or 0 when there is no first dictionary.
-int Hunspell::remove(const char * word)
-{
-  if (!pHMgr[0]) return 0;
-  return pHMgr[0]->remove(word);
-}
-
-// Return the version string reported by the affix manager.
-const char * Hunspell::get_version()
-{
-  const char * version = pAMgr->get_version();
-  return version;
-}
-
-// Return the character-set conversion table. The constructor only fills it
-// for non-UTF-8 dictionaries, so it is NULL when the dictionary is UTF-8.
-struct cs_info * Hunspell::get_csconv()
-{
-  return this->csconv;
-}
-
-// Append the heap string `st` to `result` (capacity MAXLNLEN), separated
-// from any existing content by a newline, then free `st`.
-// A NULL `st` is ignored.
-void Hunspell::cat_result(char * result, char * st)
-{
-  if (!st) return;
-
-  if (*result != '\0') {
-    mystrcat(result, "\n", MAXLNLEN);
-  }
-  mystrcat(result, st, MAXLNLEN);
-  free(st);
-}
-
-// Morphological analysis of `word`.
-// The word is optionally passed through the affix manager's input
-// conversion table, cleaned with cleanword2(), and analysed with
-// pSMgr->suggest_morph() in the capitalization variants appropriate for its
-// detected captype (plus dotted variants when trailing dots were stripped).
-// Hungarian (LANG_hu) gets extra handling for numbers and dashed compounds.
-// On success *slst receives the list produced by line_tok() and the number
-// of entries is returned; otherwise *slst is NULL and 0 is returned.
-int Hunspell::analyze(char*** slst, const char * word)
-{
-  char cw[MAXWORDUTF8LEN];
-  char wspace[MAXWORDUTF8LEN];
-  w_char unicw[MAXWORDLEN];
-  int wl2 = 0;
-  *slst = NULL;
-  if (! pSMgr || maxdic == 0) return 0;
-  int nc = strlen(word);
-  if (utf8) {
-    if (nc >= MAXWORDUTF8LEN) return 0;
-  } else {
-    if (nc >= MAXWORDLEN) return 0;
-  }
-  int captype = 0;
-  int abbv = 0;
-  int wl = 0;
-
-  // input conversion
-  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
-  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
-  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
-
-  if (wl == 0) {
-    if (abbv) {
-      // the word consisted of dots only: analyse the dots themselves
-      for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
-      cw[wl] = '\0';
-      abbv = 0;
-    } else return 0;
-  }
-
-  char result[MAXLNLEN];
-  char * st = NULL;
-
-  *result = '\0';
-
-  int n = 0;
-  int n2 = 0;
-  int n3 = 0;
-
-  // test numbers
-  // LANG_hu section: set dash information for suggestions
-  if (langnum == LANG_hu) {
-    // scan the leading run of digits with '.' / ',' separators
-    while ((n < wl) &&
-           (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
-      n++;
-      if ((cw[n] == '.') || (cw[n] == ',')) {
-        if (((n2 == 0) && (n > 3)) ||
-            ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
-        n2++;
-        n3 = n;
-      }
-    }
-
-    if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
-    if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
-      mystrcat(result, cw, MAXLNLEN);
-      result[n - 1] = '\0';
-      if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
-      else {
-        char sign = cw[n];
-        cw[n] = '\0';
-        cat_result(result, pSMgr->suggest_morph(cw + n - 1));
-        mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
-        cw[n] = sign;
-        cat_result(result, pSMgr->suggest_morph(cw + n));
-      }
-      return line_tok(result, slst, MSEP_REC);
-    }
-  }
-  // END OF LANG_hu section
-
-  switch(captype) {
-    case HUHCAP:
-    case HUHINITCAP:
-    case NOCAP: {
-      cat_result(result, pSMgr->suggest_morph(cw));
-      if (abbv) {
-        // also analyse the word with its abbreviation dot
-        memcpy(wspace,cw,wl);
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        cat_result(result, pSMgr->suggest_morph(wspace));
-      }
-      break;
-    }
-    case INITCAP: {
-      // wspace holds the all-lower-case form, cw the initial-capital form
-      wl = mkallsmall2(cw, unicw, nc);
-      memcpy(wspace,cw,(wl+1));
-      wl2 = mkinitcap2(cw, unicw, nc);
-      cat_result(result, pSMgr->suggest_morph(wspace));
-      cat_result(result, pSMgr->suggest_morph(cw));
-      if (abbv) {
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        cat_result(result, pSMgr->suggest_morph(wspace));
-
-        memcpy(wspace, cw, wl2);
-        wspace[wl2] = '.';
-        wspace[wl2+1] = '\0';
-
-        cat_result(result, pSMgr->suggest_morph(wspace));
-      }
-      break;
-    }
-    case ALLCAP: {
-      cat_result(result, pSMgr->suggest_morph(cw));
-      if (abbv) {
-        memcpy(wspace,cw,wl);
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        // analyse the dotted buffer just built; the original passed cw here,
-        // which repeated the previous lookup and never tried the dotted form
-        cat_result(result, pSMgr->suggest_morph(wspace));
-      }
-      // wspace holds the all-lower-case form, cw the initial-capital form
-      wl = mkallsmall2(cw, unicw, nc);
-      memcpy(wspace,cw,(wl+1));
-      wl2 = mkinitcap2(cw, unicw, nc);
-
-      cat_result(result, pSMgr->suggest_morph(wspace));
-      cat_result(result, pSMgr->suggest_morph(cw));
-      if (abbv) {
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        cat_result(result, pSMgr->suggest_morph(wspace));
-
-        memcpy(wspace, cw, wl2);
-        wspace[wl2] = '.';
-        wspace[wl2+1] = '\0';
-
-        cat_result(result, pSMgr->suggest_morph(wspace));
-      }
-      break;
-    }
-  }
-
-  if (*result) {
-    // word reversing wrapper for complex prefixes
-    if (complexprefixes) {
-      if (utf8) reverseword_utf(result); else reverseword(result);
-    }
-    return line_tok(result, slst, MSEP_REC);
-  }
-
-  // compound word with dash (HU) I18n
-  char * dash = NULL;
-  int nresult = 0;
-  // LANG_hu section: set dash information for suggestions
-  if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
-  if ((langnum == LANG_hu) && dash) {
-    *dash='\0';
-    // examine 2 sides of the dash
-    if (dash[1] == '\0') { // base word ending with dash
-      if (spell(cw)) {
-        char * p = pSMgr->suggest_morph(cw);
-        if (p) {
-          int ret = line_tok(p, slst, MSEP_REC);
-          free(p);
-          return ret;
-        }
-
-      }
-    } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
-      if (spell(cw) && (spell("-e"))) {
-        st = pSMgr->suggest_morph(cw);
-        if (st) {
-          mystrcat(result, st, MAXLNLEN);
-          free(st);
-        }
-        mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
-        st = pSMgr->suggest_morph("-e");
-        if (st) {
-          mystrcat(result, st, MAXLNLEN);
-          free(st);
-        }
-        return line_tok(result, slst, MSEP_REC);
-      }
-    } else {
-      // first word ending with dash: word- XXX ???
-      char r2 = *(dash + 1);
-      dash[0]='-';
-      dash[1]='\0';
-      nresult = spell(cw);
-      dash[1] = r2;
-      dash[0]='\0';
-      if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
-          ((dash[1] > '0') && (dash[1] < '9')))) {
-        st = pSMgr->suggest_morph(cw);
-        if (st) {
-          mystrcat(result, st, MAXLNLEN);
-          free(st);
-          mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
-        }
-        st = pSMgr->suggest_morph(dash+1);
-        if (st) {
-          mystrcat(result, st, MAXLNLEN);
-          free(st);
-        }
-        return line_tok(result, slst, MSEP_REC);
-      }
-    }
-    // affixed number in correct word
-    if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
-        (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
-      *dash='-';
-      n = 1;
-      if (*(dash - n) == '.') n++;
-      // search first not a number character to left from dash
-      while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
-        n++;
-      }
-      if ((dash - n) < cw) n--;
-      // numbers: valami1000000-hoz
-      // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
-      // 56-hoz, 6-hoz
-      for(; n >= 1; n--) {
-        if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
-          mystrcat(result, cw, MAXLNLEN);
-          result[dash - cw - n] = '\0';
-          st = pSMgr->suggest_morph(dash - n);
-          if (st) {
-            mystrcat(result, st, MAXLNLEN);
-            free(st);
-          }
-          return line_tok(result, slst, MSEP_REC);
-        }
-      }
-    }
-  }
-  return 0;
-}
-
-// Generate word forms of `word` from `pln` morphological descriptions in
-// `pl`.
-// `word` is analysed, each description is passed with those analyses to
-// pSMgr->suggest_gen(), and the combined output is split into *slst. The
-// capitalization of `word` is re-applied (all caps / initial capital) and
-// every candidate that spell() rejects is dropped.
-// Returns the number of surviving candidates; when none survive *slst is
-// NULL and 0 is returned.
-int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
-{
-  *slst = NULL;
-  if (!pSMgr || !pln) return 0;
-
-  char ** word_analyses;
-  const int word_analysis_count = analyze(&word_analyses, word);
-
-  // only the capitalization type of the input is needed from cleanword()
-  int captype = 0;
-  int abbv = 0;
-  char cleaned[MAXWORDUTF8LEN];
-  cleanword(cleaned, word, &captype, &abbv);
-
-  char generated[MAXLNLEN];
-  generated[0] = '\0';
-  for (int i = 0; i < pln; ++i) {
-    cat_result(generated, pSMgr->suggest_gen(word_analyses, word_analysis_count, pl[i]));
-  }
-  freelist(&word_analyses, word_analysis_count);
-
-  if (generated[0] == '\0') return 0;
-
-  // allcap
-  if (captype == ALLCAP) mkallcap(generated);
-
-  // line split
-  const int total = line_tok(generated, slst, MSEP_REC);
-
-  // capitalize
-  if (captype == INITCAP || captype == HUHINITCAP) {
-    for (int j = 0; j < total; ++j) mkinitcap((*slst)[j]);
-  }
-
-  // temporary filtering of prefix related errors (eg.
-  // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
-  int kept = 0;
-  for (int j = 0; j < total; ++j) {
-    if (spell((*slst)[j])) {
-      // compact accepted entries towards the front of the list
-      if (kept < j) (*slst)[kept] = (*slst)[j];
-      ++kept;
-    } else {
-      free((*slst)[j]);
-      (*slst)[j] = NULL;
-    }
-  }
-  if (kept > 0) return kept;
-
-  free(*slst);
-  *slst = NULL;
-  return 0;
-}
-
-// Generate word forms of `word` modelled on the example word `pattern`.
-// `pattern` is analysed, its analyses are passed to the
-// generate(slst, word, pl, pln) overload, and the result is de-duplicated
-// with uniqlist().
-int Hunspell::generate(char*** slst, const char * word, const char * pattern)
-{
-  char ** pattern_analyses;
-  const int pattern_count = analyze(&pattern_analyses, pattern);
-  const int generated = generate(slst, word, pattern_analyses, pattern_count);
-  freelist(&pattern_analyses, pattern_count);
-  return uniqlist(*slst, generated);
-}
-
-// minimal XML parser functions
-// Copy an XML attribute value or element text into `dest`.
-// `par` points at the delimiter that opens the value: a single or double
-// quote (value runs to the matching quote) or '>' (value runs to the next
-// '<'). At most `max` characters are copied and a terminating NUL is then
-// written, so `dest` needs room for max + 1 bytes. "&lt;" and "&amp;" are
-// decoded in `dest` afterwards.
-// Returns the number of characters copied (before entity decoding), or 0
-// when `par` is NULL or does not start with a recognised delimiter; in the
-// 0 case `dest` is left untouched.
-int Hunspell::get_xml_par(char * dest, const char * par, int max)
-{
-  if (!par) return 0;
-
-  char terminator = *par;
-  if (terminator == '>') {
-    terminator = '<';
-  } else if (terminator != '\'' && terminator != '"') {
-    return 0; // bad XML
-  }
-
-  char * out = dest;
-  char * const limit = dest + max;
-  const char * in = par + 1;
-  while (out < limit && *in != '\0' && *in != terminator) {
-    *out = *in;
-    ++in;
-    ++out;
-  }
-  *out = '\0';
-
-  mystrrep(dest, "&lt;", "<");
-  mystrrep(dest, "&amp;", "&");
-  return (int)(out - dest);
-}
-
-// Return the language number obtained from the affix manager at construction.
-int Hunspell::get_langnum() const
-{
-  return this->langnum;
-}
-
-// Apply the affix manager's input conversion table to `word`, writing the
-// converted text to `dest`.
-// Returns 1 when a table exists and its conv() call reports a non-zero
-// result, 0 otherwise (including when there is no affix manager or table).
-int Hunspell::input_conv(const char * word, char * dest)
-{
-  if (!pAMgr) return 0;
-
-  RepList * table = pAMgr->get_iconvtable();
-  if (!table) return 0;
-
-  return table->conv(word, dest) ? 1 : 0;
-}
-
-
-// Locate a position inside the XML start tag beginning at `s`.
-// attr == NULL: returns the tag's closing '>' (NULL if there is none), i.e.
-// the delimiter get_xml_par() expects in front of the element text.
-// attr != NULL: returns the position just after the first occurrence of
-// `attr` that lies before the closing '>' and is preceded by a space or a
-// newline; returns NULL when there is no such occurrence.
-const char * Hunspell::get_xml_pos(const char * s, const char * attr)
-{
-  const char * end = strchr(s, '>');
-  if (attr == NULL) return end;
-  if (end == NULL) return NULL;
-
-  const size_t attr_len = strlen(attr);
-  if (attr_len == 0) return NULL; // an empty name cannot identify an attribute
-
-  // Resume the search after a rejected match: searching again from the same
-  // position would return the same match forever.
-  for (const char * p = strstr(s, attr); p != NULL && p < end; p = strstr(p + attr_len, attr)) {
-    // p > s keeps p[-1] inside the buffer when the match is at the very start
-    if (p > s && (p[-1] == ' ' || p[-1] == '\n')) return p + attr_len;
-  }
-  return NULL;
-}
-
-// Return 1 when attribute `attr` of the XML tag at `q` has exactly the
-// value `value`, 0 otherwise (including when the attribute is missing or
-// malformed).
-int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
-  char parsed[MAXWORDUTF8LEN];
-  const char * value_pos = get_xml_pos(q, attr);
-  if (!get_xml_par(parsed, value_pos, MAXWORDUTF8LEN - 1)) return 0;
-  return (strcmp(parsed, value) == 0) ? 1 : 0;
-}
-
-// Extract the text of every `tag` element (e.g. "<a>") found in `list`
-// into a newly allocated string array.
-// *slst is always assigned: NULL when nothing was extracted, otherwise a
-// malloc'ed array whose first <return value> entries are malloc'ed strings.
-// Extraction stops at the first element that cannot be parsed or allocated.
-// Returns the number of strings stored.
-int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
-  // Callers free *slst even when 0 is returned, so never leave it unset.
-  *slst = NULL;
-  if (!list) return 0;
-
-  int n = 0;
-  char * p;
-  // first pass: count the occurrences of the tag to size the array
-  for (p = list; ((p = strstr(p, tag)) != NULL); p++) n++;
-  if (n == 0) return 0;
-  *slst = (char **) malloc(sizeof(char *) * n);
-  if (!*slst) return 0;
-
-  // second pass: copy the text of each element
-  for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) {
-    int l = strlen(p);
-    (*slst)[n] = (char *) malloc(l + 1);
-    if (!(*slst)[n]) break;
-    // p + strlen(tag) - 1 is the tag's closing '>', the delimiter
-    // get_xml_par() expects in front of element text
-    if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) {
-      free((*slst)[n]);
-      break;
-    }
-  }
-
-  if (n == 0) {
-    // nothing usable was stored: do not hand back an empty array
-    free(*slst);
-    *slst = NULL;
-  }
-  return n;
-}
-
-namespace
-{
-  // Replace every occurrence of `search` in `str` with `replace`, in place.
-  // Scanning resumes after each inserted replacement, so text produced by a
-  // replacement is never matched again. An empty `search` is a no-op.
-  void myrep(std::string& str, const std::string& search, const std::string& replace)
-  {
-    // find("") matches at every position, which would loop forever
-    if (search.empty()) return;
-
-    size_t pos = 0;
-    while ((pos = str.find(search, pos)) != std::string::npos)
-    {
-      str.replace(pos, search.length(), replace);
-      pos += replace.length();
-    }
-  }
-}
-
-// Handle an XML request of the form <query type="..."><word>...</word>...
-// Supported query types, as dispatched below:
-//   analyze  - analyse the word; the analyses are returned as ONE string
-//              "<code><a>ana1</a><a>ana2</a></code>" in (*slst)[0]
-//   stem     - return stem(slst, word)
-//   generate - with a second <word>: generate(slst, word, pattern);
-//              with a <code> list of <a> items: generate from those
-//              descriptions, de-duplicated with uniqlist()
-// Returns the number of entries in *slst; 0 on malformed input or when
-// nothing was produced.
-int Hunspell::spellml(char*** slst, const char * word)
-{
-  char *q, *q2;
-  char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
-  // give callers a defined (empty) list on every failure path
-  *slst = NULL;
-  q = (char *) strstr(word, "<query");
-  if (!q) return 0; // bad XML input
-  q2 = strchr(q, '>');
-  if (!q2) return 0; // bad XML input
-  q2 = strstr(q2, "<word");
-  if (!q2) return 0; // bad XML input
-  if (check_xml_par(q, "type=", "analyze")) {
-    int n = 0;
-    if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw);
-    if (n == 0) return 0;
-    // convert the result to <code><a>ana1</a><a>ana2</a></code> format
-    std::string r;
-    r.append("<code>");
-    for (int i = 0; i < n; i++) {
-      r.append("<a>");
-
-      std::string entry((*slst)[i]);
-      free((*slst)[i]);
-      myrep(entry, "\t", " ");
-      myrep(entry, "&", "&amp;");
-      myrep(entry, "<", "&lt;");
-      r.append(entry);
-
-      r.append("</a>");
-    }
-    r.append("</code>");
-    // every original entry was freed above; slot 0 now owns the XML string
-    (*slst)[0] = mystrdup(r.c_str());
-    return 1;
-  } else if (check_xml_par(q, "type=", "stem")) {
-    if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw);
-  } else if (check_xml_par(q, "type=", "generate")) {
-    int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
-    if (n == 0) return 0;
-    char * q3 = strstr(q2 + 1, "<word");
-    if (q3) {
-      if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
-        return generate(slst, cw, cw2);
-      }
-    } else {
-      if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
-        // must start as NULL: it is passed to freelist() below even when
-        // get_xml_list() returns 0 without assigning it
-        char ** slst2 = NULL;
-        if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) {
-          int n2 = generate(slst, cw, slst2, n);
-          freelist(&slst2, n);
-          return uniqlist(*slst, n2);
-        }
-        freelist(&slst2, n);
-      }
-    }
-  }
-  return 0;
-}
-
-
-#ifdef HUNSPELL_EXPERIMENTAL
-// XXX need UTF-8 support
-// Morphological analysis that goes through
-// pSMgr->suggest_morph_for_spelling_error() (experimental API).
-// The word is cleaned with cleanword() and looked up in the capitalization
-// variants appropriate for its detected captype, plus dotted variants when
-// trailing dots were stripped. The non-empty results are joined with '\n'.
-// Returns a newly allocated string (mystrdup), or NULL when there is no
-// suggestion manager or dictionary, the word is too long or empty after
-// cleaning, or nothing was found.
-char * Hunspell::morph_with_correction(const char * word)
-{
-  char cw[MAXWORDUTF8LEN];
-  char wspace[MAXWORDUTF8LEN];
-  if (! pSMgr || maxdic == 0) return NULL;
-  int wl = strlen(word);
-  if (utf8) {
-    if (wl >= MAXWORDUTF8LEN) return NULL;
-  } else {
-    if (wl >= MAXWORDLEN) return NULL;
-  }
-  int captype = 0;
-  int abbv = 0;
-  wl = cleanword(cw, word, &captype, &abbv);
-  if (wl == 0) return NULL;
-
-  char result[MAXLNLEN];
-  *result = '\0';
-
-  // cat_result() appends a lookup result only when it is non-NULL, inserts
-  // the '\n' separator only between entries, and frees the lookup result.
-  switch(captype) {
-    case NOCAP: {
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(cw));
-      if (abbv) {
-        memcpy(wspace,cw,wl);
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      }
-      break;
-    }
-    case INITCAP: {
-      // lower-case form first, then the word as given
-      memcpy(wspace,cw,(wl+1));
-      mkallsmall(wspace);
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(cw));
-      if (abbv) {
-        memcpy(wspace,cw,wl);
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        mkallsmall(wspace);
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-        mkinitcap(wspace);
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      }
-      break;
-    }
-    case HUHCAP: {
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(cw));
-      memcpy(wspace,cw,(wl+1));
-      mkallsmall(wspace);
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      break;
-    }
-    case ALLCAP: {
-      // as given, then all lower case, then initial capital
-      memcpy(wspace,cw,(wl+1));
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      mkallsmall(wspace);
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      mkinitcap(wspace);
-      cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      if (abbv) {
-        memcpy(wspace,cw,(wl+1));
-        wspace[wl] = '.';
-        wspace[wl+1] = '\0';
-        // The original appended "\n" here before knowing whether this lookup
-        // yields anything, which could leave empty lines in the output.
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-        mkallsmall(wspace);
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-        mkinitcap(wspace);
-        cat_result(result, pSMgr->suggest_morph_for_spelling_error(wspace));
-      }
-      break;
-    }
-  }
-
-  if (*result) return mystrdup(result);
-  return NULL;
-}
-
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE
-
-// C API: construct a Hunspell object from an affix file path and a
-// dictionary file path and return it as an opaque handle.
-Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
-{
-  Hunspell * instance = new Hunspell(affpath, dpath);
-  return reinterpret_cast<Hunhandle*>(instance);
-}
-
-// C API: like Hunspell_create(), additionally forwarding `key` to the
-// Hunspell constructor.
-Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
-    const char * key)
-{
-  Hunspell * instance = new Hunspell(affpath, dpath, key);
-  return reinterpret_cast<Hunhandle*>(instance);
-}
-
-// C API: destroy the Hunspell object behind the handle.
-void Hunspell_destroy(Hunhandle *pHunspell)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  delete instance;
-}
-
-// C API: forward to Hunspell::spell(word).
-int Hunspell_spell(Hunhandle *pHunspell, const char *word)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->spell(word);
-}
-
-// C API: forward to Hunspell::get_dic_encoding().
-char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->get_dic_encoding();
-}
-
-// C API: forward to Hunspell::suggest(slst, word).
-int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->suggest(slst, word);
-}
-
-// C API: forward to Hunspell::analyze(slst, word).
-int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->analyze(slst, word);
-}
-
-// C API: forward to Hunspell::stem(slst, word).
-int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->stem(slst, word);
-}
-
-// C API: forward to Hunspell::stem(slst, desc, n), the overload that takes
-// a list of `n` entries in `desc`.
-int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->stem(slst, desc, n);
-}
-
-// C API: forward to Hunspell::generate(slst, word, pattern), with `word2`
-// as the pattern word.
-int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
-    const char * word2)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->generate(slst, word, word2);
-}
-
-// C API: forward to Hunspell::generate(slst, word, pl, pln), with `desc`
-// and `n` as the description list and its length.
-int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
-    char** desc, int n)
-{
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->generate(slst, word, desc, n);
-}
-
- /* functions for run-time modification of the dictionary */
-
-  /* add word to the run-time dictionary (forwards to Hunspell::add) */
-
-int Hunspell_add(Hunhandle *pHunspell, const char * word) {
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->add(word);
-}
-
-  /* add word to the run-time dictionary with the affix flags of
-   * the example (a dictionary word): Hunspell will recognize
-   * affixed forms of the new word, too.
-   * Forwards to Hunspell::add_with_affix.
-   */
-
-int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
-    const char * example) {
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->add_with_affix(word, example);
-}
-
-  /* remove word from the run-time dictionary (forwards to Hunspell::remove) */
-
-int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
-  Hunspell * instance = reinterpret_cast<Hunspell*>(pHunspell);
-  return instance->remove(word);
-}
-
-// C API: release a string list of `n` entries with freelist(). The handle
-// argument is unused.
-void Hunspell_free_list(Hunhandle * /* unused */, char *** slst, int n) {
-  freelist(slst, n);
-}