diff options
Diffstat (limited to 'plugins/SpellChecker/hunspell/hunspell.cxx')
-rw-r--r-- | plugins/SpellChecker/hunspell/hunspell.cxx | 2011 |
1 files changed, 2011 insertions, 0 deletions
diff --git a/plugins/SpellChecker/hunspell/hunspell.cxx b/plugins/SpellChecker/hunspell/hunspell.cxx new file mode 100644 index 0000000000..8e00e1b8cc --- /dev/null +++ b/plugins/SpellChecker/hunspell/hunspell.cxx @@ -0,0 +1,2011 @@ +#include "license.hunspell" +#include "license.myspell" + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "hunspell.hxx" +#include "hunspell.h" +#ifndef MOZILLA_CLIENT +# include "config.h" +#endif +#include "csutil.hxx" + +Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key) +{ + encoding = NULL; + csconv = NULL; + utf8 = 0; + complexprefixes = 0; + affixpath = mystrdup(affpath); + maxdic = 0; + + /* first set up the hash manager */ + pHMgr[0] = new HashMgr(dpath, affpath, key); + if (pHMgr[0]) maxdic = 1; + + /* next set up the affix manager */ + /* it needs access to the hash manager lookup methods */ + pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); + + /* get the preferred try string and the dictionary */ + /* encoding from the Affix Manager for that dictionary */ + char * try_string = pAMgr->get_try_string(); + encoding = pAMgr->get_encoding(); + langnum = pAMgr->get_langnum(); + utf8 = pAMgr->get_utf8(); + if (!utf8) + csconv = get_current_cs(encoding); + complexprefixes = pAMgr->get_complexprefixes(); + wordbreak = pAMgr->get_breaktable(); + + /* and finally set up the suggestion manager */ + pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); + if (try_string) free(try_string); +} + +Hunspell::~Hunspell() +{ + if (pSMgr) delete pSMgr; + if (pAMgr) delete pAMgr; + for (int i = 0; i < maxdic; i++) delete pHMgr[i]; + maxdic = 0; + pSMgr = NULL; + pAMgr = NULL; +#ifdef MOZILLA_CLIENT + delete [] csconv; +#endif + csconv= NULL; + if (encoding) free(encoding); + encoding = NULL; + if (affixpath) free(affixpath); + affixpath = NULL; +} + +// load extra dictionaries +int Hunspell::add_dic(const char * dpath, const char * key) { + if (maxdic == MAXDIC || !affixpath) return 1; + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); + if (pHMgr[maxdic]) maxdic++; else return 1; + return 0; +} + +// make a copy of src at destination while removing all leading +// blanks and removing any trailing periods after recording +// their presence with the abbreviation flag +// also since already going through character by character, +// set the capitalization type +// return the length of the "cleaned" (and UTF-8 encoded) word + +int Hunspell::cleanword2(char * dest, const char * src, + w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) +{ + unsigned char * p = (unsigned char *) dest; + const unsigned char * q = (const unsigned char * ) src; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen((const char *)q); + while ((nl > 0) && (*(q+nl-1)=='.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + *p = '\0'; + return 0; + } + + strncpy(dest, (char *) q, nl); + *(dest + nl) = '\0'; + nl = strlen(dest); + if (utf8) { + *nc = u8_u16(dest_utf, MAXWORDLEN, dest); + // don't check too long words + if (*nc >= MAXWORDLEN) return 0; + if (*nc == -1) { // big Unicode character (non BMP area) + *pcaptype = NOCAP; + return nl; + } + *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); + } else { + *pcaptype = get_captype(dest, nl, csconv); + *nc = nl; + } + return nl; +} + +int Hunspell::cleanword(char * dest, const char * src, + int * pcaptype, int * pabbrev) +{ + unsigned char * p = (unsigned char *) dest; + const unsigned char * q = (const unsigned char * ) src; + int firstcap = 0; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen((const char *)q); + while ((nl > 0) && (*(q+nl-1)=='.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + *p = '\0'; + return 0; + } + + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int nc = 0; + + if (!utf8) { + while (nl > 0) { + nc++; + if (csconv[(*q)].ccase) ncap++; + if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; + *p++ = *q++; + nl--; + } + // remember to terminate the destination string + *p = '\0'; + firstcap = csconv[(unsigned char)(*dest)].ccase; + } else { + unsigned short idx; + w_char t[MAXWORDLEN]; + nc = u8_u16(t, MAXWORDLEN, src); + for (int i = 0; i < nc; i++) { + idx = (t[i].h << 8) + t[i].l; + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) ncap++; + if (unicodetoupper(idx, langnum) == low) nneutral++; + } + u16_u8(dest, MAXWORDUTF8LEN, t, nc); + if (ncap) { + idx = (t[0].h << 8) + t[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + } + + // now finally set the captype + if (ncap == 0) { + *pcaptype = NOCAP; + } else if ((ncap == 1) && firstcap) { + *pcaptype = INITCAP; + } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ + *pcaptype = ALLCAP; + } else if ((ncap > 1) && firstcap) { + *pcaptype = HUHINITCAP; + } else { + *pcaptype = HUHCAP; + } + return strlen(dest); +} + +void Hunspell::mkallcap(char * p) +{ + if (utf8) { + w_char u[MAXWORDLEN]; + int nc = u8_u16(u, MAXWORDLEN, p); + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + if (idx != unicodetoupper(idx, langnum)) { + u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].cupper; + p++; + } + } +} + +int Hunspell::mkallcap2(char * p, w_char * u, int nc) +{ + if (utf8) { + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + unsigned short up = unicodetoupper(idx, langnum); + if (idx != up) { + u[i].h = (unsigned char) (up >> 8); + u[i].l = (unsigned char) (up & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].cupper; + p++; + } + } + return nc; +} + + +void Hunspell::mkallsmall(char * p) +{ + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].clower; + p++; + } +} + +int Hunspell::mkallsmall2(char * p, w_char * u, int nc) +{ + if (utf8) { + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) { + u[i].h = (unsigned char) (low >> 8); + u[i].l = (unsigned char) (low & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].clower; + p++; + } + } + return nc; +} + +// convert UTF-8 sharp S codes to latin 1 +char * Hunspell::sharps_u8_l1(char * dest, char * source) { + char * p = dest; + *p = *source; + for (p++, source++; *(source - 1); p++, source++) { + *p = *source; + if (*source == '\x9F') *--p = '\xDF'; + } + return dest; +} + +// recursive search for right ss - sharp s permutations +hentry * Hunspell::spellsharps(char * base, char * pos, int n, + int repnum, char * tmp, int * info, char **root) { + pos = strstr(pos, "ss"); + if (pos && (n < MAXSHARPS)) { + *pos = '\xC3'; + *(pos + 1) = '\x9F'; + hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); + if (h) return h; + *pos = 's'; + *(pos + 1) = 's'; + h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root); + if (h) return h; + } else if (repnum > 0) { + if (utf8) return checkword(base, info, root); + return checkword(sharps_u8_l1(tmp, base), info, root); + } + return NULL; +} + +int Hunspell::is_keepcase(const hentry * rv) { + return pAMgr && rv->astr && pAMgr->get_keepcase() && + TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); +} + +/* insert a word to the beginning of the suggestion array and return ns */ +int Hunspell::insert_sug(char ***slst, char * word, int ns) { + char * dup = mystrdup(word); + if (!dup) return ns; + if (ns == MAXSUGGESTION) { + ns--; + free((*slst)[ns]); + } + for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = dup; + return ns + 1; +} + +int Hunspell::spell(const char * word, int * info, char ** root) +{ + struct hentry * rv=NULL; + // need larger vector. For example, Turkish capital letter I converted a + // 2-byte UTF-8 character (dotless i) by mkallsmall. + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + // Hunspell supports XML input of the simplified API (see manual) + if (strcmp(word, SPELL_XML) == 0) return 1; + int nc = strlen(word); + int wl2 = 0; + if (utf8) { + if (nc >= MAXWORDUTF8LEN) return 0; + } else { + if (nc >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + + int info2 = 0; + if (wl == 0 || maxdic == 0) return 1; + if (root) *root = NULL; + + // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) + enum { NBEGIN, NNUM, NSEP }; + int nstate = NBEGIN; + int i; + + for (i = 0; (i < wl); i++) { + if ((cw[i] <= '9') && (cw[i] >= '0')) { + nstate = NNUM; + } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) { + if ((nstate == NSEP) || (i == 0)) break; + nstate = NSEP; + } else break; + } + if ((i == wl) && (nstate == NNUM)) return 1; + if (!info) info = &info2; else *info = 0; + + switch(captype) { + case HUHCAP: + case HUHINITCAP: + *info += SPELL_ORIGCAP; + case NOCAP: { + rv = checkword(cw, info, root); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + } + break; + } + case ALLCAP: { + *info += SPELL_ORIGCAP; + rv = checkword(cw, info, root); + if (rv) break; + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (rv) break; + } + // Spec. prefix handling for Catalan, French, Italian: + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). + if (pAMgr && strchr(cw, '\'')) { + wl = mkallsmall2(cw, unicw, nc); + //There are no really sane circumstances where this could fail, + //but anyway... + if (char * apostrophe = strchr(cw, '\'')) { + if (utf8) { + w_char tmpword[MAXWORDLEN]; + *apostrophe = '\0'; + wl2 = u8_u16(tmpword, MAXWORDLEN, cw); + *apostrophe = '\''; + if (wl2 < nc) { + mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); + rv = checkword(cw, info, root); + if (rv) break; + } + } else { + mkinitcap2(apostrophe + 1, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + } + mkinitcap2(cw, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { + char tmpword[MAXWORDUTF8LEN]; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); + } + if ((abbv) && !(rv)) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + } + } + if (rv) break; + } + } + case INITCAP: { + *info += SPELL_ORIGCAP; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + wl2 = mkinitcap2(cw, unicw, nc); + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(cw, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + // forbid bad capitalization + // (for example, ijs -> Ijs instead of IJs in Dutch) + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) + if (*info & SPELL_FORBIDDEN) { + rv = NULL; + break; + } + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + if (rv) break; + + rv = checkword(wspace, info, root); + if (abbv && !rv) { + + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(wspace, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + break; + } + } + if (rv && is_keepcase(rv) && + ((captype == ALLCAP) || + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed + // in INITCAP form, too. + !(pAMgr->get_checksharps() && + ((utf8 && strstr(wspace, "\xC3\x9F")) || + (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; + break; + } + } + + if (rv) { + if (pAMgr && pAMgr->get_warn() && rv->astr && + TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { + *info += SPELL_WARN; + if (pAMgr->get_forbidwarn()) return 0; + return HUNSPELL_OK_WARN; + } + return HUNSPELL_OK; + } + + // recursive breaking at break points + if (wordbreak) { + char * s; + char r; + int nbr = 0; + wl = strlen(cw); + int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; + + // calculate break points for recursion limit + for (int j = 0; j < numbreak; j++) { + s = cw; + do { + s = (char *) strstr(s, wordbreak[j]); + if (s) { + nbr++; + s++; + } + } while (s); + } + if (nbr >= 10) return 0; + + // check boundary patterns (^begin and end$) + for (int j = 0; j < numbreak; j++) { + int plen = strlen(wordbreak[j]); + if (plen == 1 || plen > wl) continue; + if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 + && spell(cw + plen - 1)) return 1; + if (wordbreak[j][plen - 1] == '$' && + strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { + r = cw[wl - plen + 1]; + cw[wl - plen + 1] = '\0'; + if (spell(cw)) return 1; + cw[wl - plen + 1] = r; + } + } + + // other patterns + for (int j = 0; j < numbreak; j++) { + int plen = strlen(wordbreak[j]); + s=(char *) strstr(cw, wordbreak[j]); + if (s && (s > cw) && (s < cw + wl - plen)) { + if (!spell(s + plen)) continue; + r = *s; + *s = '\0'; + // examine 2 sides of the break point + if (spell(cw)) return 1; + *s = r; + + // LANG_hu: spec. dash rule + if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { + r = s[1]; + s[1] = '\0'; + if (spell(cw)) return 1; // check the first part with dash + s[1] = r; + } + // end of LANG speficic region + + } + } + } + + return 0; +} + +struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) +{ + struct hentry * he = NULL; + int len, i; + char w2[MAXWORDUTF8LEN]; + const char * word; + + char * ignoredchars = pAMgr->get_ignore(); + if (ignoredchars != NULL) { + strcpy(w2, w); + if (utf8) { + int ignoredchars_utf16_len; + unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len); + remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len); + } else { + remove_ignored_chars(w2,ignoredchars); + } + word = w2; + } else word = w; + + len = strlen(word); + + if (!len) + return NULL; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + if (word != w2) { + strcpy(w2, word); + word = w2; + } + if (utf8) reverseword_utf(w2); else reverseword(w2); + } + + // look word in hash table + for (i = 0; (i < maxdic) && !he; i ++) { + he = (pHMgr[i])->lookup(word); + + // check forbidden and onlyincompound words + if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { + if (info) *info += SPELL_FORBIDDEN; + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) { + if (pAMgr->get_compoundflag() && + TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { + if (info) *info += SPELL_COMPOUND; + } + } + return NULL; + } + + // he = next not needaffix, onlyincompound homonym or onlyupcase word + while (he && (he->astr) && + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) + )) he = he->next_homonym; + } + + // check with affixes + if (!he && pAMgr) { + // try stripping off affixes */ + he = pAMgr->affix_check(word, len, 0); + + // check compound restriction and onlyupcase + if (he && he->astr && ( + (pAMgr->get_onlyincompound() && + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && + TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { + he = NULL; + } + + if (he) { + if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { + if (info) *info += SPELL_FORBIDDEN; + return NULL; + } + if (root) { + *root = mystrdup(he->word); + if (*root && complexprefixes) { + if (utf8) reverseword_utf(*root); else reverseword(*root); + } + } + // try check compound word + } else if (pAMgr->get_compound()) { + he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info); + // LANG_hu section: `moving rule' with last dash + if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { + char * dup = mystrdup(word); + if (!dup) return NULL; + dup[len-1] = '\0'; + he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info); + free(dup); + } + // end of LANG speficic region + if (he) { + if (root) { + *root = mystrdup(he->word); + if (*root && complexprefixes) { + if (utf8) reverseword_utf(*root); else reverseword(*root); + } + } + if (info) *info += SPELL_COMPOUND; + } + } + + } + + return he; +} + +int Hunspell::suggest(char*** slst, const char * word) +{ + int onlycmpdsug = 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; + w_char unicw[MAXWORDLEN]; + *slst = NULL; + // process XML input of the simplified API (see manual) + if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { + return spellml(slst, word); + } + int nc = strlen(word); + if (utf8) { + if (nc >= MAXWORDUTF8LEN) return 0; + } else { + if (nc >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + + if (wl == 0) return 0; + int ns = 0; + int capwords = 0; + + // check capitalized form for FORCEUCASE + if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { + int info = SPELL_ORIGCAP; + char ** wlst; + if (checkword(cw, &info, NULL)) { + if (*slst) { + wlst = *slst; + } else { + wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *)); + if (wlst == NULL) return -1; + *slst = wlst; + for (int i = 0; i < MAXSUGGESTION; i++) { + wlst[i] = NULL; + } + } + wlst[0] = mystrdup(cw); + mkinitcap(wlst[0]); + return 1; + } + } + + switch(captype) { + case NOCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + break; + } + + case INITCAP: { + capwords = 1; + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + if (ns == -1) break; + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + break; + } + case HUHINITCAP: + capwords = 1; + case HUHCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + if (ns != -1) { + int prevns; + // something.The -> something. The + char * dot = strchr(cw, '.'); + if (dot && (dot > cw)) { + int captype_; + if (utf8) { + w_char w_[MAXWORDLEN]; + int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); + captype_ = get_captype_utf8(w_, wl_, langnum); + } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); + if (captype_ == INITCAP) { + char * st = mystrdup(cw); + if (st) st = (char *) realloc(st, wl + 2); + if (st) { + st[(dot - cw) + 1] = ' '; + strcpy(st + (dot - cw) + 2, dot + 1); + ns = insert_sug(slst, st, ns); + free(st); + } + } + } + if (captype == HUHINITCAP) { + // TheOpenOffice.org -> The OpenOffice.org + memcpy(wspace,cw,(wl+1)); + mkinitsmall2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + } + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); + prevns = ns; + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + if (captype == HUHINITCAP) { + mkinitcap2(wspace, unicw, nc); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + } + // aNew -> "a New" (instead of "a new") + for (int j = prevns; j < ns; j++) { + char * space = strchr((*slst)[j],' '); + if (space) { + int slen = strlen(space + 1); + // different case after space (need capitalisation) + if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { + w_char w[MAXWORDLEN]; + int wc = 0; + char * r = (*slst)[j]; + if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); + mkinitcap2(space + 1, w, wc); + // set as first suggestion + for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = r; + } + } + } + } + break; + } + + case ALLCAP: { + memcpy(wspace, cw, (wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + if (ns == -1) break; + if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) + ns = insert_sug(slst, wspace, ns); + mkinitcap2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + for (int j=0; j < ns; j++) { + mkallcap((*slst)[j]); + if (pAMgr && pAMgr->get_checksharps()) { + char * pos; + if (utf8) { + pos = strstr((*slst)[j], "\xC3\x9F"); + while (pos) { + *pos = 'S'; + *(pos+1) = 'S'; + pos = strstr(pos+2, "\xC3\x9F"); + } + } else { + pos = strchr((*slst)[j], '\xDF'); + while (pos) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); + mystrrep((*slst)[j], "\xDF", "SS"); + pos = strchr((*slst)[j], '\xDF'); + } + } + } + } + break; + } + } + + // LANG_hu section: replace '-' with ' ' in Hungarian + if (langnum == LANG_hu) { + for (int j=0; j < ns; j++) { + char * pos = strchr((*slst)[j],'-'); + if (pos) { + int info; + char w[MAXWORDUTF8LEN]; + *pos = '\0'; + strcpy(w, (*slst)[j]); + strcat(w, pos + 1); + spell(w, &info, NULL); + if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { + *pos = ' '; + } else *pos = '-'; + } + } + } + // END OF LANG_hu section + + // try ngram approach since found nothing or only compound words + if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) { + switch(captype) { + case NOCAP: { + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); + break; + } + case HUHINITCAP: + capwords = 1; + case HUHCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + break; + } + case INITCAP: { + capwords = 1; + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + int oldns = ns; + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + for (int j = oldns; j < ns; j++) + mkallcap((*slst)[j]); + break; + } + } + } + + // try dash suggestion (Afo-American -> Afro-American) + if (char * pos = strchr(cw, '-')) { + char * ppos = cw; + int nodashsug = 1; + char ** nlst = NULL; + int nn = 0; + int last = 0; + if (*slst) { + for (int j = 0; j < ns && nodashsug == 1; j++) { + if (strchr((*slst)[j], '-')) nodashsug = 0; + } + } + while (nodashsug && !last) { + if (*pos == '\0') last = 1; else *pos = '\0'; + if (!spell(ppos)) { + nn = suggest(&nlst, ppos); + for (int j = nn - 1; j >= 0; j--) { + strncpy(wspace, cw, ppos - cw); + strcpy(wspace + (ppos - cw), nlst[j]); + if (!last) { + strcat(wspace, "-"); + strcat(wspace, pos + 1); + } + ns = insert_sug(slst, wspace, ns); + free(nlst[j]); + } + if (nlst != NULL) free(nlst); + nodashsug = 0; + } + if (!last) { + *pos = '-'; + ppos = pos + 1; + pos = strchr(ppos, '-'); + } + if (!pos) pos = cw + strlen(cw); + } + } + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (int j = 0; j < ns; j++) { + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); + } + } + + // capitalize + if (capwords) for (int j=0; j < ns; j++) { + mkinitcap((*slst)[j]); + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (int j = 0; j < ns; j++) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); + strcat((*slst)[j], word + strlen(word) - abbv); + } + } + + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { + switch (captype) { + case INITCAP: + case ALLCAP: { + int l = 0; + for (int j=0; j < ns; j++) { + if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { + char s[MAXSWUTF8L]; + w_char w[MAXSWL]; + int len; + if (utf8) { + len = u8_u16(w, MAXSWL, (*slst)[j]); + } else { + strcpy(s, (*slst)[j]); + len = strlen(s); + } + mkallsmall2(s, w, len); + free((*slst)[j]); + if (spell(s)) { + (*slst)[l] = mystrdup(s); + if ((*slst)[l]) l++; + } else { + mkinitcap2(s, w, len); + if (spell(s)) { + (*slst)[l] = mystrdup(s); + if ((*slst)[l]) l++; + } + } + } else { + (*slst)[l] = (*slst)[j]; + l++; + } + } + ns = l; + } + } + } + + // remove duplications + int l = 0; + for (int j = 0; j < ns; j++) { + (*slst)[l] = (*slst)[j]; + for (int k = 0; k < l; k++) { + if (strcmp((*slst)[k], (*slst)[j]) == 0) { + free((*slst)[j]); + l--; + break; + } + } + l++; + } + ns = l; + + // output conversion + rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + for (int j = 0; rl && j < ns; j++) { + if (rl->conv((*slst)[j], wspace)) { + free((*slst)[j]); + (*slst)[j] = mystrdup(wspace); + } + } + + // if suggestions removed by nosuggest, onlyincompound parameters + if (l == 0 && *slst) { + free(*slst); + *slst = NULL; + } + return l; +} + +void Hunspell::free_list(char *** slst, int n) { + freelist(slst, n); +} + +char * Hunspell::get_dic_encoding() +{ + return encoding; +} + +#ifdef HUNSPELL_EXPERIMENTAL +// XXX need UTF-8 support +int Hunspell::suggest_auto(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + int ns = 0; + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case NOCAP: { + ns = pSMgr->suggest_auto(slst, cw, ns); + if (ns>0) break; + break; + } + + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + for (int j=0; j < ns; j++) + mkinitcap((*slst)[j]); + ns = pSMgr->suggest_auto(slst, cw, ns); + break; + + } + + case HUHINITCAP: + case HUHCAP: { + ns = pSMgr->suggest_auto(slst, cw, ns); + if (ns == 0) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + } + break; + } + + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + + mkinitcap(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + + for (int j=0; j < ns; j++) + mkallcap((*slst)[j]); + break; + } + } + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (int j = 0; j < ns; j++) { + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); + } + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (int j = 0; j < ns; j++) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); + strcat((*slst)[j], word + strlen(word) - abbv); + } + } + + // LANG_hu section: replace '-' with ' ' in Hungarian + if (langnum == LANG_hu) { + for (int j=0; j < ns; j++) { + char * pos = strchr((*slst)[j],'-'); + if (pos) { + int info; + char w[MAXWORDUTF8LEN]; + *pos = '\0'; + strcpy(w, (*slst)[j]); + strcat(w, pos + 1); + spell(w, &info, NULL); + if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { + *pos = ' '; + } else *pos = '-'; + } + } + } + // END OF LANG_hu section + return ns; +} +#endif + +int Hunspell::stem(char*** slst, char ** desc, int n) +{ + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + *slst = NULL; + if (n == 0) return 0; + *result2 = '\0'; + for (int i = 0; i < n; i++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[i]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } + + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int k = 0; k < pln; k++) { + // add derivational suffixes + if (strstr(pl[k], MORPH_DERI_SFX)) { + // remove inflectional suffixes + char * is = strstr(pl[k], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + for (int j = 0; j < genl; j++) { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + freelist(&gen, genl); + } + } else { + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); + if (strstr(pl[k], MORPH_SURF_PFX)) { + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); + } + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); + } + } + freelist(&pl, pln); + } + int sln = line_tok(result2, slst, MSEP_REC); + return uniqlist(*slst, sln); + +} + +int Hunspell::stem(char*** slst, const char * word) +{ + char ** pl; + int pln = analyze(&pl, word); + int pln2 = stem(slst, pl, pln); + freelist(&pl, pln); + return pln2; +} + +#ifdef HUNSPELL_EXPERIMENTAL +int Hunspell::suggest_pos_stems(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! pSMgr || maxdic == 0) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + + int ns = 0; // ns=0 = normalized input + + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case HUHCAP: + case NOCAP: { + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + + break; + } + + case INITCAP: { + + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + + if (ns == 0 || ((*slst)[0][0] == '#')) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + + break; + + } + + case ALLCAP: { + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + if (ns != 0) break; + + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + + if (ns == 0) { + mkinitcap(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + break; + } + } + + return ns; +} +#endif // END OF HUNSPELL_EXPERIMENTAL CODE + +const char * Hunspell::get_wordchars() +{ + return pAMgr->get_wordchars(); +} + +unsigned short * Hunspell::get_wordchars_utf16(int * len) +{ + return pAMgr->get_wordchars_utf16(len); +} + +char * Hunspell::get_try_string() +{ + return pAMgr->get_try_string(); +} + +void Hunspell::mkinitcap(char * p) +{ + if (!utf8) { + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; + } else { + int len; + w_char u[MAXWORDLEN]; + len = u8_u16(u, MAXWORDLEN, p); + unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); + u[0].h = (unsigned char) (i >> 8); + u[0].l = (unsigned char) (i & 0x00FF); + u16_u8(p, MAXWORDUTF8LEN, u, len); + } +} + +int Hunspell::mkinitcap2(char * p, w_char * u, int nc) +{ + if (!utf8) { + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; + } else if (nc > 0) { + unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); + u[0].h = (unsigned char) (i >> 8); + u[0].l = (unsigned char) (i & 0x00FF); + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } + return nc; +} + +int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) +{ + if (!utf8) { + if (*p != '\0') *p = csconv[((unsigned char)*p)].clower; + } else if (nc > 0) { + unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum); + u[0].h = (unsigned char) (i >> 8); + u[0].l = (unsigned char) (i & 0x00FF); + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } + return nc; +} + +int Hunspell::add(const char * word) +{ + if (pHMgr[0]) return (pHMgr[0])->add(word); + return 0; +} + +int Hunspell::add_with_affix(const char * word, const char * example) +{ + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); + return 0; +} + +int Hunspell::remove(const char * word) +{ + if (pHMgr[0]) return (pHMgr[0])->remove(word); + return 0; +} + +const char * Hunspell::get_version() +{ + return pAMgr->get_version(); +} + +struct cs_info * Hunspell::get_csconv() +{ + return csconv; +} + +void Hunspell::cat_result(char * result, char * st) +{ + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } +} + +int Hunspell::analyze(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + int wl2 = 0; + *slst = NULL; + if (! pSMgr || maxdic == 0) return 0; + int nc = strlen(word); + if (utf8) { + if (nc >= MAXWORDUTF8LEN) return 0; + } else { + if (nc >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + + if (wl == 0) { + if (abbv) { + for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; + cw[wl] = '\0'; + abbv = 0; + } else return 0; + } + + char result[MAXLNLEN]; + char * st = NULL; + + *result = '\0'; + + int n = 0; + int n2 = 0; + int n3 = 0; + + // test numbers + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) { + while ((n < wl) && + (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { + n++; + if ((cw[n] == '.') || (cw[n] == ',')) { + if (((n2 == 0) && (n > 3)) || + ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; + n2++; + n3 = n; + } + } + + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { + mystrcat(result, cw, MAXLNLEN); + result[n - 1] = '\0'; + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + else { + char sign = cw[n]; + cw[n] = '\0'; + cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE + cw[n] = sign; + cat_result(result, pSMgr->suggest_morph(cw + n)); + } + return line_tok(result, slst, MSEP_REC); + } + } + // END OF LANG_hu section + + switch(captype) { + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + wl2 = mkinitcap2(cw, unicw, nc); + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } + case ALLCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(cw)); + } + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + wl2 = mkinitcap2(cw, unicw, nc); + + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } + } + + if (*result) { + // word reversing wrapper for complex prefixes + if (complexprefixes) { + if (utf8) reverseword_utf(result); else reverseword(result); + } + return line_tok(result, slst, MSEP_REC); + } + + // compound word with dash (HU) I18n + char * dash = NULL; + int nresult = 0; + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); + if ((langnum == LANG_hu) && dash) { + *dash='\0'; + // examine 2 sides of the dash + if (dash[1] == '\0') { // base word ending with dash + if (spell(cw)) { + char * p = pSMgr->suggest_morph(cw); + if (p) { + int ret = line_tok(p, slst, MSEP_REC); + free(p); + return ret; + } + + } + } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. + if (spell(cw) && (spell("-e"))) { + st = pSMgr->suggest_morph(cw); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE + st = pSMgr->suggest_morph("-e"); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + return line_tok(result, slst, MSEP_REC); + } + } else { + // first word ending with dash: word- XXX ??? + char r2 = *(dash + 1); + dash[0]='-'; + dash[1]='\0'; + nresult = spell(cw); + dash[1] = r2; + dash[0]='\0'; + if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || + ((dash[1] > '0') && (dash[1] < '9')))) { + st = pSMgr->suggest_morph(cw); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE + } + st = pSMgr->suggest_morph(dash+1); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + return line_tok(result, slst, MSEP_REC); + } + } + // affixed number in correct word + if (nresult && (dash > cw) && (((*(dash-1)<='9') && + (*(dash-1)>='0')) || (*(dash-1)=='.'))) { + *dash='-'; + n = 1; + if (*(dash - n) == '.') n++; + // search first not a number character to left from dash + while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { + n++; + } + if ((dash - n) < cw) n--; + // numbers: valami1000000-hoz + // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, + // 56-hoz, 6-hoz + for(; n >= 1; n--) { + if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { + mystrcat(result, cw, MAXLNLEN); + result[dash - cw - n] = '\0'; + st = pSMgr->suggest_morph(dash - n); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + return line_tok(result, slst, MSEP_REC); + } + } + } + } + return 0; +} + +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) +{ + *slst = NULL; + if (!pSMgr || !pln) return 0; + char **pl2; + int pl2n = analyze(&pl2, word); + int captype = 0; + int abbv = 0; + char cw[MAXWORDUTF8LEN]; + cleanword(cw, word, &captype, &abbv); + char result[MAXLNLEN]; + *result = '\0'; + + for (int i = 0; i < pln; i++) { + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + } + freelist(&pl2, pl2n); + + if (*result) { + // allcap + if (captype == ALLCAP) mkallcap(result); + + // line split + int linenum = line_tok(result, slst, MSEP_REC); + + // capitalize + if (captype == INITCAP || captype == HUHINITCAP) { + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); + } + + // temporary filtering of prefix related errors (eg. + // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") + + int r = 0; + for (int j=0; j < linenum; j++) { + if (!spell((*slst)[j])) { + free((*slst)[j]); + (*slst)[j] = NULL; + } else { + if (r < j) (*slst)[r] = (*slst)[j]; + r++; + } + } + if (r > 0) return r; + free(*slst); + *slst = NULL; + } + return 0; +} + +int Hunspell::generate(char*** slst, const char * word, const char * pattern) +{ + char **pl; + int pln = analyze(&pl, pattern); + int n = generate(slst, word, pl, pln); + freelist(&pl, pln); + return uniqlist(*slst, n); +} + +// minimal XML parser functions +int Hunspell::get_xml_par(char * dest, const char * par, int max) +{ + char * d = dest; + if (!par) return 0; + char end = *par; + char * dmax = dest + max; + if (end == '>') end = '<'; + else if (end != '\'' && end != '"') return 0; // bad XML + for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; + *d = '\0'; + mystrrep(dest, "<", "<"); + mystrrep(dest, "&", "&"); + return (int)(d - dest); +} + +int Hunspell::get_langnum() const +{ + return langnum; +} + +// return the beginning of the element (attr == NULL) or the attribute +const char * Hunspell::get_xml_pos(const char * s, const char * attr) +{ + const char * end = strchr(s, '>'); + const char * p = s; + if (attr == NULL) return end; + do { + p = strstr(p, attr); + if (!p || p >= end) return 0; + } while (*(p-1) != ' ' && *(p-1) != '\n'); + return p + strlen(attr); +} + +int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { + char cw[MAXWORDUTF8LEN]; + if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && + strcmp(cw, value) == 0) return 1; + return 0; +} + +int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { + int n = 0; + char * p; + if (!list) return 0; + for (p = list; (p = strstr(p, tag)); p++) n++; + if (n == 0) return 0; + *slst = (char **) malloc(sizeof(char *) * n); + if (!*slst) return 0; + for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { + int l = strlen(p); + (*slst)[n] = (char *) malloc(l + 1); + if (!(*slst)[n]) return n; + if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) { + free((*slst)[n]); + break; + } + } + return n; +} + +int Hunspell::spellml(char*** slst, const char * word) +{ + char *q, *q2; + char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; + q = (char *) strstr(word, "<query"); + if (!q) return 0; // bad XML input + q2 = strchr(q, '>'); + if (!q2) return 0; // bad XML input + q2 = strstr(q2, "<word"); + if (!q2) return 0; // bad XML input + if (check_xml_par(q, "type=", "analyze")) { + int n = 0, s = 0; + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw); + if (n == 0) return 0; + // convert the result to <code><a>ana1</a><a>ana2</a></code> format + for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); + char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->& + if (!r) return 0; + strcpy(r, "<code>"); + for (int i = 0; i < n; i++) { + int l = strlen(r); + strcpy(r + l, "<a>"); + strcpy(r + l + 3, (*slst)[i]); + mystrrep(r + l + 3, "\t", " "); + mystrrep(r + l + 3, "<", "<"); + mystrrep(r + l + 3, "&", "&"); + strcat(r, "</a>"); + free((*slst)[i]); + } + strcat(r, "</code>"); + (*slst)[0] = r; + return 1; + } else if (check_xml_par(q, "type=", "stem")) { + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw); + } else if (check_xml_par(q, "type=", "generate")) { + int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1); + if (n == 0) return 0; + char * q3 = strstr(q2 + 1, "<word"); + if (q3) { + if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) { + return generate(slst, cw, cw2); + } + } else { + if ((q2 = strstr(q2 + 1, "<code"))) { + char ** slst2; + if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { + int n2 = generate(slst, cw, slst2, n); + freelist(&slst2, n); + return uniqlist(*slst, n2); + } + freelist(&slst2, n); + } + } + } + return 0; +} + + +#ifdef HUNSPELL_EXPERIMENTAL +// XXX need UTF-8 support +char * Hunspell::morph_with_correction(const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! pSMgr || maxdic == 0) return NULL; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return NULL; + } else { + if (wl >= MAXWORDLEN) return NULL; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return NULL; + + char result[MAXLNLEN]; + char * st = NULL; + + *result = '\0'; + + + switch(captype) { + case NOCAP: { + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + } + break; + } + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + } + break; + } + case HUHCAP: { + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + if (abbv) { + memcpy(wspace,cw,(wl+1)); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + if (*result) mystrcat(result, "\n", MAXLNLEN); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } + } + break; + } + } + + if (*result) return mystrdup(result); + return NULL; +} + +#endif // END OF HUNSPELL_EXPERIMENTAL CODE + +Hunhandle *Hunspell_create(const char * affpath, const char * dpath) +{ + return (Hunhandle*)(new Hunspell(affpath, dpath)); +} + +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key) +{ + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); +} + +void Hunspell_destroy(Hunhandle *pHunspell) +{ + delete (Hunspell*)(pHunspell); +} + +int Hunspell_spell(Hunhandle *pHunspell, const char *word) +{ + return ((Hunspell*)pHunspell)->spell(word); +} + +char *Hunspell_get_dic_encoding(Hunhandle *pHunspell) +{ + return ((Hunspell*)pHunspell)->get_dic_encoding(); +} + +int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->suggest(slst, word); +} + +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->analyze(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->stem(slst, word); +} + +int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n) +{ + return ((Hunspell*)pHunspell)->stem(slst, desc, n); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, word2); +} + +int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); +} + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +int Hunspell_add(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->add(word); +} + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, + const char * example) { + return ((Hunspell*)pHunspell)->add_with_affix(word, example); +} + + /* remove word from the run-time dictionary */ + +int Hunspell_remove(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->remove(word); +} + +void Hunspell_free_list(Hunhandle *, char *** slst, int n) { + freelist(slst, n); +} |