diff options
author | George Hazan <george.hazan@gmail.com> | 2016-01-31 16:14:52 +0000 |
---|---|---|
committer | George Hazan <george.hazan@gmail.com> | 2016-01-31 16:14:52 +0000 |
commit | 5707c6b2b1eafbf38ee7c14f39e42c2280d294ea (patch) | |
tree | cefc6ec223a91562809b3a885cbde5b44f4fc798 /libs/hunspell/src/hunspell.cxx | |
parent | 62767bfaf7f7fb988b826da797463545db14b3b5 (diff) |
smaller unified project for hunspell
git-svn-id: http://svn.miranda-ng.org/main/trunk@16202 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c
Diffstat (limited to 'libs/hunspell/src/hunspell.cxx')
-rw-r--r-- | libs/hunspell/src/hunspell.cxx | 2367 |
1 files changed, 0 insertions, 2367 deletions
diff --git a/libs/hunspell/src/hunspell.cxx b/libs/hunspell/src/hunspell.cxx deleted file mode 100644 index 726c72931a..0000000000 --- a/libs/hunspell/src/hunspell.cxx +++ /dev/null @@ -1,2367 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is Hunspell, based on MySpell. - * - * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). - * Portions created by the Initial Developers are Copyright (C) 2002-2005 - * the Initial Developers. All Rights Reserved. - * - * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, - * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, - * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, - * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, - * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. 
If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ -/* - * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada - * And Contributors. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. All modifications to the source code must be clearly marked as - * such. Binary redistributions based on modified source code - * must be clearly marked as modified versions in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * KEVIN B. 
HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <stdlib.h> -#include <string.h> -#include <stdio.h> - -#include "hunspell.hxx" -#include "hunspell.h" -#ifndef MOZILLA_CLIENT -#include "config.h" -#endif -#include "csutil.hxx" - -#include <limits> -#include <string> - -Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) { - encoding = NULL; - csconv = NULL; - utf8 = 0; - complexprefixes = 0; - affixpath = mystrdup(affpath); - maxdic = 0; - - /* first set up the hash manager */ - pHMgr[0] = new HashMgr(dpath, affpath, key); - if (pHMgr[0]) - maxdic = 1; - - /* next set up the affix manager */ - /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); - - /* get the preferred try string and the dictionary */ - /* encoding from the Affix Manager for that dictionary */ - char* try_string = pAMgr->get_try_string(); - encoding = pAMgr->get_encoding(); - langnum = pAMgr->get_langnum(); - utf8 = pAMgr->get_utf8(); - if (!utf8) - csconv = get_current_cs(encoding); - complexprefixes = pAMgr->get_complexprefixes(); - wordbreak = pAMgr->get_breaktable(); - - /* and finally set up the suggestion manager */ - pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); - if (try_string) - free(try_string); -} - -Hunspell::~Hunspell() { - delete pSMgr; - delete pAMgr; - for (int i = 0; i < maxdic; i++) - delete pHMgr[i]; - maxdic = 0; - pSMgr = NULL; - pAMgr = NULL; -#ifdef MOZILLA_CLIENT - delete[] csconv; -#endif - csconv = NULL; - 
if (encoding) - free(encoding); - encoding = NULL; - if (affixpath) - free(affixpath); - affixpath = NULL; -} - -// load extra dictionaries -int Hunspell::add_dic(const char* dpath, const char* key) { - if (maxdic == MAXDIC || !affixpath) - return 1; - pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); - if (pHMgr[maxdic]) - maxdic++; - else - return 1; - return 0; -} - -// make a copy of src at destination while removing all leading -// blanks and removing any trailing periods after recording -// their presence with the abbreviation flag -// also since already going through character by character, -// set the capitalization type -// return the length of the "cleaned" (and UTF-8 encoded) word - -int Hunspell::cleanword2(char* dest, - const char* src, - w_char* dest_utf, - int* nc, - int* pcaptype, - int* pabbrev) { - unsigned char* p = (unsigned char*)dest; - const unsigned char* q = (const unsigned char*)src; - - // first skip over any leading blanks - while ((*q != '\0') && (*q == ' ')) - q++; - - // now strip off any trailing periods (recording their presence) - *pabbrev = 0; - int nl = strlen((const char*)q); - while ((nl > 0) && (*(q + nl - 1) == '.')) { - nl--; - (*pabbrev)++; - } - - // if no characters are left it can't be capitalized - if (nl <= 0) { - *pcaptype = NOCAP; - *p = '\0'; - return 0; - } - - strncpy(dest, (char*)q, nl); - *(dest + nl) = '\0'; - nl = strlen(dest); - if (utf8) { - *nc = u8_u16(dest_utf, MAXWORDLEN, dest); - // don't check too long words - if (*nc >= MAXWORDLEN) - return 0; - if (*nc == -1) { // big Unicode character (non BMP area) - *pcaptype = NOCAP; - return nl; - } - *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); - } else { - *pcaptype = get_captype(dest, nl, csconv); - *nc = nl; - } - return nl; -} - -int Hunspell::cleanword(char* dest, - const char* src, - int* pcaptype, - int* pabbrev) { - unsigned char* p = (unsigned char*)dest; - const unsigned char* q = (const unsigned char*)src; - int firstcap = 0; - - // first 
skip over any leading blanks - while ((*q != '\0') && (*q == ' ')) - q++; - - // now strip off any trailing periods (recording their presence) - *pabbrev = 0; - int nl = strlen((const char*)q); - while ((nl > 0) && (*(q + nl - 1) == '.')) { - nl--; - (*pabbrev)++; - } - - // if no characters are left it can't be capitalized - if (nl <= 0) { - *pcaptype = NOCAP; - *p = '\0'; - return 0; - } - - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int nc = 0; - - if (!utf8) { - while (nl > 0) { - nc++; - if (csconv[(*q)].ccase) - ncap++; - if (csconv[(*q)].cupper == csconv[(*q)].clower) - nneutral++; - *p++ = *q++; - nl--; - } - // remember to terminate the destination string - *p = '\0'; - firstcap = csconv[(unsigned char)(*dest)].ccase; - } else { - unsigned short idx; - w_char t[MAXWORDLEN]; - nc = u8_u16(t, MAXWORDLEN, src); - for (int i = 0; i < nc; i++) { - idx = (t[i].h << 8) + t[i].l; - unsigned short low = unicodetolower(idx, langnum); - if (idx != low) - ncap++; - if (unicodetoupper(idx, langnum) == low) - nneutral++; - } - u16_u8(dest, MAXWORDUTF8LEN, t, nc); - if (ncap) { - idx = (t[0].h << 8) + t[0].l; - firstcap = (idx != unicodetolower(idx, langnum)); - } - } - - // now finally set the captype - if (ncap == 0) { - *pcaptype = NOCAP; - } else if ((ncap == 1) && firstcap) { - *pcaptype = INITCAP; - } else if ((ncap == nc) || ((ncap + nneutral) == nc)) { - *pcaptype = ALLCAP; - } else if ((ncap > 1) && firstcap) { - *pcaptype = HUHINITCAP; - } else { - *pcaptype = HUHCAP; - } - return strlen(dest); -} - -void Hunspell::mkallcap(char* p) { - if (utf8) { - w_char u[MAXWORDLEN]; - int nc = u8_u16(u, MAXWORDLEN, p); - unsigned short idx; - for (int i = 0; i < nc; i++) { - idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char)(unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char)(unicodetoupper(idx, langnum) & 0x00FF); - } - } - u16_u8(p, MAXWORDUTF8LEN, 
u, nc); - } else { - while (*p != '\0') { - *p = csconv[((unsigned char)*p)].cupper; - p++; - } - } -} - -int Hunspell::mkallcap2(char* p, w_char* u, int nc) { - if (utf8) { - unsigned short idx; - for (int i = 0; i < nc; i++) { - idx = (u[i].h << 8) + u[i].l; - unsigned short up = unicodetoupper(idx, langnum); - if (idx != up) { - u[i].h = (unsigned char)(up >> 8); - u[i].l = (unsigned char)(up & 0x00FF); - } - } - u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); - } else { - while (*p != '\0') { - *p = csconv[((unsigned char)*p)].cupper; - p++; - } - } - return nc; -} - -void Hunspell::mkallsmall(char* p) { - while (*p != '\0') { - *p = csconv[((unsigned char)*p)].clower; - p++; - } -} - -int Hunspell::mkallsmall2(char* p, w_char* u, int nc) { - if (utf8) { - unsigned short idx; - for (int i = 0; i < nc; i++) { - idx = (u[i].h << 8) + u[i].l; - unsigned short low = unicodetolower(idx, langnum); - if (idx != low) { - u[i].h = (unsigned char)(low >> 8); - u[i].l = (unsigned char)(low & 0x00FF); - } - } - u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); - } else { - while (*p != '\0') { - *p = csconv[((unsigned char)*p)].clower; - p++; - } - } - return nc; -} - -// convert UTF-8 sharp S codes to latin 1 -char* Hunspell::sharps_u8_l1(char* dest, char* source) { - char* p = dest; - *p = *source; - for (p++, source++; *(source - 1); p++, source++) { - *p = *source; - if (*source == '\x9F') - *--p = '\xDF'; - } - return dest; -} - -// recursive search for right ss - sharp s permutations -hentry* Hunspell::spellsharps(char* base, - char* pos, - int n, - int repnum, - char* tmp, - int* info, - char** root) { - pos = strstr(pos, "ss"); - if (pos && (n < MAXSHARPS)) { - *pos = '\xC3'; - *(pos + 1) = '\x9F'; - hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); - if (h) - return h; - *pos = 's'; - *(pos + 1) = 's'; - h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root); - if (h) - return h; - } else if (repnum > 0) { - if (utf8) - 
return checkword(base, info, root); - return checkword(sharps_u8_l1(tmp, base), info, root); - } - return NULL; -} - -int Hunspell::is_keepcase(const hentry* rv) { - return pAMgr && rv->astr && pAMgr->get_keepcase() && - TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); -} - -/* insert a word to the beginning of the suggestion array and return ns */ -int Hunspell::insert_sug(char*** slst, char* word, int ns) { - if (!*slst) - return ns; - char* dup = mystrdup(word); - if (!dup) - return ns; - if (ns == MAXSUGGESTION) { - ns--; - free((*slst)[ns]); - } - for (int k = ns; k > 0; k--) - (*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = dup; - return ns + 1; -} - -int Hunspell::spell(const char* word, int* info, char** root) { - struct hentry* rv = NULL; - // need larger vector. For example, Turkish capital letter I converted a - // 2-byte UTF-8 character (dotless i) by mkallsmall. - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - w_char unicw[MAXWORDLEN]; - - int info2 = 0; - if (!info) - info = &info2; - else - *info = 0; - - // Hunspell supports XML input of the simplified API (see manual) - if (strcmp(word, SPELL_XML) == 0) - return 1; - int nc = strlen(word); - int wl2 = 0; - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; - } else { - if (nc >= MAXWORDLEN) - return 0; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? 
rl->conv(word, wspace, MAXWORDUTF8LEN) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - -#ifdef MOZILLA_CLIENT - // accept the abbreviated words without dots - // workaround for the incomplete tokenization of Mozilla - abbv = 1; -#endif - - if (wl == 0 || maxdic == 0) - return 1; - if (root) - *root = NULL; - - // allow numbers with dots, dashes and commas (but forbid double separators: - // "..", "--" etc.) - enum { NBEGIN, NNUM, NSEP }; - int nstate = NBEGIN; - int i; - - for (i = 0; (i < wl); i++) { - if ((cw[i] <= '9') && (cw[i] >= '0')) { - nstate = NNUM; - } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) { - if ((nstate == NSEP) || (i == 0)) - break; - nstate = NSEP; - } else - break; - } - if ((i == wl) && (nstate == NNUM)) - return 1; - - switch (captype) { - case HUHCAP: - /* FALLTHROUGH */ - case HUHINITCAP: - *info += SPELL_ORIGCAP; - /* FALLTHROUGH */ - case NOCAP: - rv = checkword(cw, info, root); - if ((abbv) && !(rv)) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - rv = checkword(wspace, info, root); - } - break; - case ALLCAP: { - *info += SPELL_ORIGCAP; - rv = checkword(cw, info, root); - if (rv) - break; - if (abbv) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - rv = checkword(wspace, info, root); - if (rv) - break; - } - // Spec. prefix handling for Catalan, French, Italian: - // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). - if (pAMgr && strchr(cw, '\'')) { - mkallsmall2(cw, unicw, nc); - // There are no really sane circumstances where this could fail, - // but anyway... 
- if (char* apostrophe = strchr(cw, '\'')) { - if (utf8) { - w_char tmpword[MAXWORDLEN]; - *apostrophe = '\0'; - wl2 = u8_u16(tmpword, MAXWORDLEN, cw); - *apostrophe = '\''; - if (wl2 >= 0 && wl2 < nc) { - mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); - rv = checkword(cw, info, root); - if (rv) - break; - } - } else { - mkinitcap2(apostrophe + 1, unicw, nc); - rv = checkword(cw, info, root); - if (rv) - break; - } - } - mkinitcap2(cw, unicw, nc); - rv = checkword(cw, info, root); - if (rv) - break; - } - if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { - char tmpword[MAXWORDUTF8LEN]; - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace, cw, (wl + 1)); - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); - } - if ((abbv) && !(rv)) { - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace + wl2) = '.'; - *(wspace + wl2 + 1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - } - } - if (rv) - break; - } - } - case INITCAP: { - *info += SPELL_ORIGCAP; - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace, cw, (wl + 1)); - wl2 = mkinitcap2(cw, unicw, nc); - if (captype == INITCAP) - *info += SPELL_INITCAP; - rv = checkword(cw, info, root); - if (captype == INITCAP) - *info -= SPELL_INITCAP; - // forbid bad capitalization - // (for example, ijs -> Ijs instead of IJs in Dutch) - // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) - if (*info & SPELL_FORBIDDEN) { - rv = NULL; - break; - } - if (rv && is_keepcase(rv) && (captype == ALLCAP)) - rv = NULL; - if (rv) - break; - - rv = checkword(wspace, info, root); - if (abbv && !rv) { - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - rv = checkword(wspace, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace + wl2) = '.'; - *(wspace + wl2 + 
1) = '\0'; - if (captype == INITCAP) - *info += SPELL_INITCAP; - rv = checkword(wspace, info, root); - if (captype == INITCAP) - *info -= SPELL_INITCAP; - if (rv && is_keepcase(rv) && (captype == ALLCAP)) - rv = NULL; - break; - } - } - if (rv && is_keepcase(rv) && - ((captype == ALLCAP) || - // if CHECKSHARPS: KEEPCASE words with \xDF are allowed - // in INITCAP form, too. - !(pAMgr->get_checksharps() && - ((utf8 && strstr(wspace, "\xC3\x9F")) || - (!utf8 && strchr(wspace, '\xDF')))))) - rv = NULL; - break; - } - } - - if (rv) { - if (pAMgr && pAMgr->get_warn() && rv->astr && - TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { - *info += SPELL_WARN; - if (pAMgr->get_forbidwarn()) - return 0; - return HUNSPELL_OK_WARN; - } - return HUNSPELL_OK; - } - - // recursive breaking at break points - if (wordbreak) { - char* s; - char r; - int nbr = 0; - wl = strlen(cw); - int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; - - // calculate break points for recursion limit - for (int j = 0; j < numbreak; j++) { - s = cw; - do { - s = (char*)strstr(s, wordbreak[j]); - if (s) { - nbr++; - s++; - } - } while (s); - } - if (nbr >= 10) - return 0; - - // check boundary patterns (^begin and end$) - for (int j = 0; j < numbreak; j++) { - int plen = strlen(wordbreak[j]); - if (plen == 1 || plen > wl) - continue; - if (wordbreak[j][0] == '^' && - strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 && spell(cw + plen - 1)) - return 1; - if (wordbreak[j][plen - 1] == '$' && - strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { - r = cw[wl - plen + 1]; - cw[wl - plen + 1] = '\0'; - if (spell(cw)) - return 1; - cw[wl - plen + 1] = r; - } - } - - // other patterns - for (int j = 0; j < numbreak; j++) { - int plen = strlen(wordbreak[j]); - s = (char*)strstr(cw, wordbreak[j]); - if (s && (s > cw) && (s < cw + wl - plen)) { - if (!spell(s + plen)) - continue; - r = *s; - *s = '\0'; - // examine 2 sides of the break point - if (spell(cw)) - return 1; - *s = r; - - // LANG_hu: spec. 
dash rule - if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { - r = s[1]; - s[1] = '\0'; - if (spell(cw)) - return 1; // check the first part with dash - s[1] = r; - } - // end of LANG speficic region - } - } - } - - return 0; -} - -struct hentry* Hunspell::checkword(const char* w, int* info, char** root) { - struct hentry* he = NULL; - bool usebuffer = false; - int len, i; - std::string w2; - const char* word; - - char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; - if (ignoredchars != NULL) { - w2.assign(w); - if (utf8) { - int ignoredchars_utf16_len; - unsigned short* ignoredchars_utf16 = - pAMgr->get_ignore_utf16(&ignoredchars_utf16_len); - remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - usebuffer = true; - } else - word = w; - - len = strlen(word); - - if (!len) - return NULL; - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - if (!usebuffer) { - w2.assign(word); - usebuffer = true; - } - if (utf8) - reverseword_utf(w2); - else - reverseword(w2); - } - - if (usebuffer) { - word = w2.c_str(); - } - - // look word in hash table - for (i = 0; (i < maxdic) && !he; i++) { - he = (pHMgr[i])->lookup(word); - - // check forbidden and onlyincompound words - if ((he) && (he->astr) && (pAMgr) && - TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) - *info += SPELL_FORBIDDEN; - // LANG_hu section: set dash information for suggestions - if (langnum == LANG_hu) { - if (pAMgr->get_compoundflag() && - TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { - if (info) - *info += SPELL_COMPOUND; - } - } - return NULL; - } - - // he = next not needaffix, onlyincompound homonym or onlyupcase word - while (he && (he->astr) && pAMgr && - ((pAMgr->get_needaffix() && - TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || - (pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), 
he->alen)) || - (info && (*info & SPELL_INITCAP) && - TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) - he = he->next_homonym; - } - - // check with affixes - if (!he && pAMgr) { - // try stripping off affixes */ - he = pAMgr->affix_check(word, len, 0); - - // check compound restriction and onlyupcase - if (he && he->astr && - ((pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || - (info && (*info & SPELL_INITCAP) && - TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { - he = NULL; - } - - if (he) { - if ((he->astr) && (pAMgr) && - TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) - *info += SPELL_FORBIDDEN; - return NULL; - } - if (root) { - *root = mystrdup(he->word); - if (*root && complexprefixes) { - if (utf8) - reverseword_utf(*root); - else - reverseword(*root); - } - } - // try check compound word - } else if (pAMgr->get_compound()) { - he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info); - // LANG_hu section: `moving rule' with last dash - if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) { - char* dup = mystrdup(word); - if (!dup) - return NULL; - dup[len - 1] = '\0'; - he = pAMgr->compound_check(dup, len - 1, -5, 0, 100, 0, NULL, 1, 0, - info); - free(dup); - } - // end of LANG speficic region - if (he) { - if (root) { - *root = mystrdup(he->word); - if (*root && complexprefixes) { - if (utf8) - reverseword_utf(*root); - else - reverseword(*root); - } - } - if (info) - *info += SPELL_COMPOUND; - } - } - } - - return he; -} - -int Hunspell::suggest(char*** slst, const char* word) { - int onlycmpdsug = 0; - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) - return 0; - w_char unicw[MAXWORDLEN]; - *slst = NULL; - // process XML input of the simplified API (see manual) - if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { - return spellml(slst, word); - } - int nc = strlen(word); - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - 
return 0; - } else { - if (nc >= MAXWORDLEN) - return 0; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - - if (wl == 0) - return 0; - int ns = 0; - int capwords = 0; - - // check capitalized form for FORCEUCASE - if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { - int info = SPELL_ORIGCAP; - char** wlst; - if (checkword(cw, &info, NULL)) { - if (*slst) { - wlst = *slst; - } else { - wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); - if (wlst == NULL) - return -1; - *slst = wlst; - for (int i = 0; i < MAXSUGGESTION; i++) { - wlst[i] = NULL; - } - } - wlst[0] = mystrdup(cw); - mkinitcap(wlst[0]); - return 1; - } - } - - switch (captype) { - case NOCAP: { - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); - break; - } - - case INITCAP: { - capwords = 1; - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); - if (ns == -1) - break; - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - break; - } - case HUHINITCAP: - capwords = 1; - case HUHCAP: { - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); - if (ns != -1) { - int prevns; - // something.The -> something. 
The - char* dot = strchr(cw, '.'); - if (dot && (dot > cw)) { - int captype_; - if (utf8) { - w_char w_[MAXWORDLEN]; - int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); - captype_ = get_captype_utf8(w_, wl_, langnum); - } else - captype_ = get_captype(dot + 1, strlen(dot + 1), csconv); - if (captype_ == INITCAP) { - char* st = mystrdup(cw); - if (st) { - char* newst = (char*)realloc(st, wl + 2); - if (newst == NULL) - free(st); - st = newst; - } - if (st) { - st[(dot - cw) + 1] = ' '; - strcpy(st + (dot - cw) + 2, dot + 1); - ns = insert_sug(slst, st, ns); - free(st); - } - } - } - if (captype == HUHINITCAP) { - // TheOpenOffice.org -> The OpenOffice.org - memcpy(wspace, cw, (wl + 1)); - mkinitsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - } - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - if (spell(wspace)) - ns = insert_sug(slst, wspace, ns); - prevns = ns; - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - if (captype == HUHINITCAP) { - mkinitcap2(wspace, unicw, nc); - if (spell(wspace)) - ns = insert_sug(slst, wspace, ns); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - } - // aNew -> "a New" (instead of "a new") - for (int j = prevns; j < ns; j++) { - char* space = strchr((*slst)[j], ' '); - if (space) { - int slen = strlen(space + 1); - // different case after space (need capitalisation) - if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { - w_char w[MAXWORDLEN]; - int wc = 0; - char* r = (*slst)[j]; - if (utf8) - wc = u8_u16(w, MAXWORDLEN, space + 1); - mkinitcap2(space + 1, w, wc); - // set as first suggestion - for (int k = j; k > 0; k--) - (*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = r; - } - } - } - } - break; - } - - case ALLCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - if (ns == -1) - break; - if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) - ns = insert_sug(slst, wspace, ns); - mkinitcap2(wspace, 
unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); - for (int j = 0; j < ns; j++) { - mkallcap((*slst)[j]); - if (pAMgr && pAMgr->get_checksharps()) { - char* pos; - if (utf8) { - pos = strstr((*slst)[j], "\xC3\x9F"); - while (pos) { - *pos = 'S'; - *(pos + 1) = 'S'; - pos = strstr(pos + 2, "\xC3\x9F"); - } - } else { - pos = strchr((*slst)[j], '\xDF'); - while (pos) { - (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 2); - mystrrep((*slst)[j], "\xDF", "SS"); - pos = strchr((*slst)[j], '\xDF'); - } - } - } - } - break; - } - } - - // LANG_hu section: replace '-' with ' ' in Hungarian - if (langnum == LANG_hu) { - for (int j = 0; j < ns; j++) { - char* pos = strchr((*slst)[j], '-'); - if (pos) { - int info; - char w[MAXWORDUTF8LEN]; - *pos = '\0'; - strcpy(w, (*slst)[j]); - strcat(w, pos + 1); - (void)spell(w, &info, NULL); - if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { - *pos = ' '; - } else - *pos = '-'; - } - } - } - // END OF LANG_hu section - - // try ngram approach since found nothing or only compound words - if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && - (*slst)) { - switch (captype) { - case NOCAP: { - ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); - break; - } - case HUHINITCAP: - capwords = 1; - case HUHCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); - break; - } - case INITCAP: { - capwords = 1; - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); - break; - } - case ALLCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - int oldns = ns; - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); - for (int j = oldns; j < ns; j++) - mkallcap((*slst)[j]); - break; - } - } - } - - // try dash suggestion (Afo-American -> Afro-American) - if (char* pos = strchr(cw, '-')) { - char* ppos = cw; - int 
nodashsug = 1; - char** nlst = NULL; - int nn = 0; - int last = 0; - if (*slst) { - for (int j = 0; j < ns && nodashsug == 1; j++) { - if (strchr((*slst)[j], '-')) - nodashsug = 0; - } - } - while (nodashsug && !last) { - if (*pos == '\0') - last = 1; - else - *pos = '\0'; - if (!spell(ppos)) { - nn = suggest(&nlst, ppos); - for (int j = nn - 1; j >= 0; j--) { - strncpy(wspace, cw, ppos - cw); - strcpy(wspace + (ppos - cw), nlst[j]); - if (!last) { - strcat(wspace, "-"); - strcat(wspace, pos + 1); - } - ns = insert_sug(slst, wspace, ns); - free(nlst[j]); - } - if (nlst != NULL) - free(nlst); - nodashsug = 0; - } - if (!last) { - *pos = '-'; - ppos = pos + 1; - pos = strchr(ppos, '-'); - } - if (!pos) - pos = cw + strlen(cw); - } - } - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - for (int j = 0; j < ns; j++) { - if (utf8) - reverseword_utf((*slst)[j]); - else - reverseword((*slst)[j]); - } - } - - // capitalize - if (capwords) - for (int j = 0; j < ns; j++) { - mkinitcap((*slst)[j]); - } - - // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (int j = 0; j < ns; j++) { - (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); - strcat((*slst)[j], word + strlen(word) - abbv); - } - } - - // remove bad capitalized and forbidden forms - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { - switch (captype) { - case INITCAP: - case ALLCAP: { - int l = 0; - for (int j = 0; j < ns; j++) { - if (!strchr((*slst)[j], ' ') && !spell((*slst)[j])) { - char s[MAXSWUTF8L]; - w_char w[MAXSWL]; - int len; - if (utf8) { - len = u8_u16(w, MAXSWL, (*slst)[j]); - } else { - strcpy(s, (*slst)[j]); - len = strlen(s); - } - mkallsmall2(s, w, len); - free((*slst)[j]); - if (spell(s)) { - (*slst)[l] = mystrdup(s); - if ((*slst)[l]) - l++; - } else { - mkinitcap2(s, w, len); - if (spell(s)) { - (*slst)[l] = mystrdup(s); - if ((*slst)[l]) - l++; - } - } - } else { - (*slst)[l] = 
(*slst)[j]; - l++; - } - } - ns = l; - } - } - } - - // remove duplications - int l = 0; - for (int j = 0; j < ns; j++) { - (*slst)[l] = (*slst)[j]; - for (int k = 0; k < l; k++) { - if (strcmp((*slst)[k], (*slst)[j]) == 0) { - free((*slst)[j]); - l--; - break; - } - } - l++; - } - ns = l; - - // output conversion - rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (int j = 0; rl && j < ns; j++) { - if (rl->conv((*slst)[j], wspace, MAXWORDUTF8LEN) > 0) { - free((*slst)[j]); - (*slst)[j] = mystrdup(wspace); - } - } - - // if suggestions removed by nosuggest, onlyincompound parameters - if (l == 0 && *slst) { - free(*slst); - *slst = NULL; - } - return l; -} - -void Hunspell::free_list(char*** slst, int n) { - freelist(slst, n); -} - -char* Hunspell::get_dic_encoding() { - return encoding; -} - -#ifdef HUNSPELL_EXPERIMENTAL -// XXX UTF-8 support is OK? -int Hunspell::suggest_auto(char*** slst, const char* word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) - return 0; - w_char unicw[MAXWORDLEN]; - int nc = strlen(word); - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; - } else { - if (nc >= MAXWORDLEN) - return 0; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? 
rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - - if (wl == 0) - return 0; - int ns = 0; - *slst = NULL; // HU, nsug in pSMgr->suggest - - switch (captype) { - case NOCAP: { - ns = pSMgr->suggest_auto(slst, cw, ns); - if (ns > 0) - break; - break; - } - - case INITCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest_auto(slst, wspace, ns); - for (int j = 0; j < ns; j++) - mkinitcap((*slst)[j]); - ns = pSMgr->suggest_auto(slst, cw, ns); - break; - } - - case HUHINITCAP: - case HUHCAP: { - ns = pSMgr->suggest_auto(slst, cw, ns); - if (ns == 0) { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest_auto(slst, wspace, ns); - } - break; - } - - case ALLCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest_auto(slst, wspace, ns); - - mkinitcap(wspace); - ns = pSMgr->suggest_auto(slst, wspace, ns); - - for (int j = 0; j < ns; j++) - mkallcap((*slst)[j]); - break; - } - } - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - for (int j = 0; j < ns; j++) { - if (utf8) - reverseword_utf((*slst)[j]); - else - reverseword((*slst)[j]); - } - } - - // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (int j = 0; j < ns; j++) { - (*slst)[j] = (char*)realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); - strcat((*slst)[j], word + strlen(word) - abbv); - } - } - - // LANG_hu section: replace '-' with ' ' in Hungarian - if (langnum == LANG_hu) { - for (int j = 0; j < ns; j++) { - char* pos = strchr((*slst)[j], '-'); - if (pos) { - int info; - char w[MAXWORDUTF8LEN]; - *pos = '\0'; - strcpy(w, (*slst)[j]); - strcat(w, pos + 1); - spell(w, &info, NULL); - if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { - *pos = ' '; - } else - *pos 
= '-'; - } - } - } - - // output conversion - rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (int j = 0; rl && j < ns; j++) { - if (rl->conv((*slst)[j], wspace) > 0) { - free((*slst)[j]); - (*slst)[j] = mystrdup(wspace); - } - } - - // END OF LANG_hu section - return ns; -} -#endif - -int Hunspell::stem(char*** slst, char** desc, int n) { - char result[MAXLNLEN]; - char result2[MAXLNLEN]; - *slst = NULL; - if (n == 0) - return 0; - *result2 = '\0'; - for (int i = 0; i < n; i++) { - *result = '\0'; - // add compound word parts (except the last one) - char* s = (char*)desc[i]; - char* part = strstr(s, MORPH_PART); - if (part) { - char* nextpart = strstr(part + 1, MORPH_PART); - while (nextpart) { - copy_field(result + strlen(result), part, MORPH_PART); - part = nextpart; - nextpart = strstr(part + 1, MORPH_PART); - } - s = part; - } - - char** pl; - std::string tok(s); - size_t alt = 0; - while ((alt = tok.find(" | ", alt)) != std::string::npos) { - tok[alt + 1] = MSEP_ALT; - } - int pln = line_tok(tok.c_str(), &pl, MSEP_ALT); - for (int k = 0; k < pln; k++) { - // add derivational suffixes - if (strstr(pl[k], MORPH_DERI_SFX)) { - // remove inflectional suffixes - char* is = strstr(pl[k], MORPH_INFL_SFX); - if (is) - *is = '\0'; - char* sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); - if (sg) { - char** gen; - int genl = line_tok(sg, &gen, MSEP_REC); - free(sg); - for (int j = 0; j < genl; j++) { - sprintf(result2 + strlen(result2), "%c%s%s", MSEP_REC, result, - gen[j]); - } - freelist(&gen, genl); - } - } else { - sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); - if (strstr(pl[k], MORPH_SURF_PFX)) { - copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); - } - copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); - } - } - freelist(&pl, pln); - } - int sln = line_tok(result2, slst, MSEP_REC); - return uniqlist(*slst, sln); -} - -int Hunspell::stem(char*** slst, const char* word) { - char** pl; - int pln = analyze(&pl, word); - int pln2 
= stem(slst, pl, pln); - freelist(&pl, pln); - return pln2; -} - -#ifdef HUNSPELL_EXPERIMENTAL -int Hunspell::suggest_pos_stems(char*** slst, const char* word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) - return 0; - w_char unicw[MAXWORDLEN]; - int nc = strlen(word); - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; - } else { - if (nc >= MAXWORDLEN) - return 0; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - - if (wl == 0) - return 0; - - int ns = 0; // ns=0 = normalized input - - *slst = NULL; // HU, nsug in pSMgr->suggest - - switch (captype) { - case HUHCAP: - case NOCAP: { - ns = pSMgr->suggest_pos_stems(slst, cw, ns); - - if ((abbv) && (ns == 0)) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - ns = pSMgr->suggest_pos_stems(slst, wspace, ns); - } - - break; - } - - case INITCAP: { - ns = pSMgr->suggest_pos_stems(slst, cw, ns); - - if (ns == 0 || ((*slst)[0][0] == '#')) { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest_pos_stems(slst, wspace, ns); - } - - break; - } - - case ALLCAP: { - ns = pSMgr->suggest_pos_stems(slst, cw, ns); - if (ns != 0) - break; - - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest_pos_stems(slst, wspace, ns); - - if (ns == 0) { - mkinitcap(wspace); - ns = pSMgr->suggest_pos_stems(slst, wspace, ns); - } - break; - } - } - - // output conversion - rl = (pAMgr) ? 
pAMgr->get_oconvtable() : NULL; - for (int j = 0; rl && j < ns; j++) { - if (rl->conv((*slst)[j], wspace) > 0) { - free((*slst)[j]); - (*slst)[j] = mystrdup(wspace); - } - } - - return ns; -} -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - -const char* Hunspell::get_wordchars() { - return pAMgr->get_wordchars(); -} - -unsigned short* Hunspell::get_wordchars_utf16(int* len) { - return pAMgr->get_wordchars_utf16(len); -} - -void Hunspell::mkinitcap(char* p) { - if (!utf8) { - if (*p != '\0') - *p = csconv[((unsigned char)*p)].cupper; - } else { - int len; - w_char u[MAXWORDLEN]; - len = u8_u16(u, MAXWORDLEN, p); - unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); - u[0].h = (unsigned char)(i >> 8); - u[0].l = (unsigned char)(i & 0x00FF); - u16_u8(p, MAXWORDUTF8LEN, u, len); - } -} - -int Hunspell::mkinitcap2(char* p, w_char* u, int nc) { - if (!utf8) { - if (*p != '\0') - *p = csconv[((unsigned char)*p)].cupper; - } else if (nc > 0) { - unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); - u[0].h = (unsigned char)(i >> 8); - u[0].l = (unsigned char)(i & 0x00FF); - u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); - } - return nc; -} - -int Hunspell::mkinitsmall2(char* p, w_char* u, int nc) { - if (!utf8) { - if (*p != '\0') - *p = csconv[((unsigned char)*p)].clower; - } else if (nc > 0) { - unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum); - u[0].h = (unsigned char)(i >> 8); - u[0].l = (unsigned char)(i & 0x00FF); - u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); - } - return nc; -} - -int Hunspell::add(const char* word) { - if (pHMgr[0]) - return (pHMgr[0])->add(word); - return 0; -} - -int Hunspell::add_with_affix(const char* word, const char* example) { - if (pHMgr[0]) - return (pHMgr[0])->add_with_affix(word, example); - return 0; -} - -int Hunspell::remove(const char* word) { - if (pHMgr[0]) - return (pHMgr[0])->remove(word); - return 0; -} - -const char* Hunspell::get_version() { - return 
pAMgr->get_version(); -} - -struct cs_info* Hunspell::get_csconv() { - return csconv; -} - -void Hunspell::cat_result(char* result, char* st) { - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } -} - -int Hunspell::analyze(char*** slst, const char* word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - w_char unicw[MAXWORDLEN]; - int wl2 = 0; - *slst = NULL; - if (!pSMgr || maxdic == 0) - return 0; - int nc = strlen(word); - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return 0; - } else { - if (nc >= MAXWORDLEN) - return 0; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? rl->conv(word, wspace, MAXWORDUTF8LEN) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - - if (wl == 0) { - if (abbv) { - for (wl = 0; wl < abbv; wl++) - cw[wl] = '.'; - cw[wl] = '\0'; - abbv = 0; - } else - return 0; - } - - char result[MAXLNLEN]; - char* st = NULL; - - *result = '\0'; - - int n = 0; - int n2 = 0; - int n3 = 0; - - // test numbers - // LANG_hu section: set dash information for suggestions - if (langnum == LANG_hu) { - while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) || - (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { - n++; - if ((cw[n] == '.') || (cw[n] == ',')) { - if (((n2 == 0) && (n > 3)) || - ((n2 > 0) && ((cw[n - 1] == '.') || (cw[n - 1] == ',')))) - break; - n2++; - n3 = n; - } - } - - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) - return 0; - if ((n == wl) || ((n > 0) && ((cw[n] == '%') || (cw[n] == '\xB0')) && - checkword(cw + n, NULL, NULL))) { - mystrcat(result, cw, MAXLNLEN); - result[n - 1] = '\0'; - if (n == wl) - cat_result(result, pSMgr->suggest_morph(cw + n - 1)); - else { - char sign = cw[n]; - cw[n] = '\0'; - cat_result(result, 
pSMgr->suggest_morph(cw + n - 1)); - mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE - cw[n] = sign; - cat_result(result, pSMgr->suggest_morph(cw + n)); - } - return line_tok(result, slst, MSEP_REC); - } - } - // END OF LANG_hu section - - switch (captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - } - break; - } - case INITCAP: { - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace, cw, (wl + 1)); - wl2 = mkinitcap2(cw, unicw, nc); - cat_result(result, pSMgr->suggest_morph(wspace)); - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - - memcpy(wspace, cw, wl2); - *(wspace + wl2) = '.'; - *(wspace + wl2 + 1) = '\0'; - - cat_result(result, pSMgr->suggest_morph(wspace)); - } - break; - } - case ALLCAP: { - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - cat_result(result, pSMgr->suggest_morph(cw)); - } - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace, cw, (wl + 1)); - wl2 = mkinitcap2(cw, unicw, nc); - - cat_result(result, pSMgr->suggest_morph(wspace)); - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - - memcpy(wspace, cw, wl2); - *(wspace + wl2) = '.'; - *(wspace + wl2 + 1) = '\0'; - - cat_result(result, pSMgr->suggest_morph(wspace)); - } - break; - } - } - - if (*result) { - // word reversing wrapper for complex prefixes - if (complexprefixes) { - if (utf8) - reverseword_utf(result); - else - reverseword(result); - } - return line_tok(result, slst, MSEP_REC); - } - - // compound word with dash (HU) I18n - char* dash = NULL; - int 
nresult = 0; - // LANG_hu section: set dash information for suggestions - if (langnum == LANG_hu) - dash = (char*)strchr(cw, '-'); - if ((langnum == LANG_hu) && dash) { - *dash = '\0'; - // examine 2 sides of the dash - if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) { - char* p = pSMgr->suggest_morph(cw); - if (p) { - int ret = line_tok(p, slst, MSEP_REC); - free(p); - return ret; - } - } - } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. - if (spell(cw) && (spell("-e"))) { - st = pSMgr->suggest_morph(cw); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - mystrcat(result, "+", MAXLNLEN); // XXX spec. separator in MORPHCODE - st = pSMgr->suggest_morph("-e"); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - return line_tok(result, slst, MSEP_REC); - } - } else { - // first word ending with dash: word- XXX ??? - char r2 = *(dash + 1); - dash[0] = '-'; - dash[1] = '\0'; - nresult = spell(cw); - dash[1] = r2; - dash[0] = '\0'; - if (nresult && spell(dash + 1) && - ((strlen(dash + 1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) { - st = pSMgr->suggest_morph(cw); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - mystrcat(result, "+", MAXLNLEN); // XXX spec. 
separator in MORPHCODE - } - st = pSMgr->suggest_morph(dash + 1); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - return line_tok(result, slst, MSEP_REC); - } - } - // affixed number in correct word - if (nresult && (dash > cw) && - (((*(dash - 1) <= '9') && (*(dash - 1) >= '0')) || - (*(dash - 1) == '.'))) { - *dash = '-'; - n = 1; - if (*(dash - n) == '.') - n++; - // search first not a number character to left from dash - while (((dash - n) >= cw) && ((*(dash - n) == '0') || (n < 3)) && - (n < 6)) { - n++; - } - if ((dash - n) < cw) - n--; - // numbers: valami1000000-hoz - // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, - // 56-hoz, 6-hoz - for (; n >= 1; n--) { - if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && - checkword(dash - n, NULL, NULL)) { - mystrcat(result, cw, MAXLNLEN); - result[dash - cw - n] = '\0'; - st = pSMgr->suggest_morph(dash - n); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - return line_tok(result, slst, MSEP_REC); - } - } - } - } - return 0; -} - -int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { - *slst = NULL; - if (!pSMgr || !pln) - return 0; - char** pl2; - int pl2n = analyze(&pl2, word); - int captype = 0; - int abbv = 0; - char cw[MAXWORDUTF8LEN]; - cleanword(cw, word, &captype, &abbv); - char result[MAXLNLEN]; - *result = '\0'; - - for (int i = 0; i < pln; i++) { - cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); - } - freelist(&pl2, pl2n); - - if (*result) { - // allcap - if (captype == ALLCAP) - mkallcap(result); - - // line split - int linenum = line_tok(result, slst, MSEP_REC); - - // capitalize - if (captype == INITCAP || captype == HUHINITCAP) { - for (int j = 0; j < linenum; j++) - mkinitcap((*slst)[j]); - } - - // temporary filtering of prefix related errors (eg. 
- // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") - - int r = 0; - for (int j = 0; j < linenum; j++) { - if (!spell((*slst)[j])) { - free((*slst)[j]); - (*slst)[j] = NULL; - } else { - if (r < j) - (*slst)[r] = (*slst)[j]; - r++; - } - } - if (r > 0) - return r; - free(*slst); - *slst = NULL; - } - return 0; -} - -int Hunspell::generate(char*** slst, const char* word, const char* pattern) { - char** pl; - int pln = analyze(&pl, pattern); - int n = generate(slst, word, pl, pln); - freelist(&pl, pln); - return uniqlist(*slst, n); -} - -// minimal XML parser functions -int Hunspell::get_xml_par(char* dest, const char* par, int max) { - char* d = dest; - if (!par) - return 0; - char end = *par; - char* dmax = dest + max; - if (end == '>') - end = '<'; - else if (end != '\'' && end != '"') - return 0; // bad XML - for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) - *d = *par; - *d = '\0'; - mystrrep(dest, "<", "<"); - mystrrep(dest, "&", "&"); - return (int)(d - dest); -} - -int Hunspell::get_langnum() const { - return langnum; -} - -int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { - RepList* rl = (pAMgr) ? 
pAMgr->get_iconvtable() : NULL; - return (rl && rl->conv(word, dest, destsize) > 0); -} - -// return the beginning of the element (attr == NULL) or the attribute -const char* Hunspell::get_xml_pos(const char* s, const char* attr) { - const char* end = strchr(s, '>'); - const char* p = s; - if (attr == NULL) - return end; - do { - p = strstr(p, attr); - if (!p || p >= end) - return 0; - } while (*(p - 1) != ' ' && *(p - 1) != '\n'); - return p + strlen(attr); -} - -int Hunspell::check_xml_par(const char* q, - const char* attr, - const char* value) { - char cw[MAXWORDUTF8LEN]; - if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && - strcmp(cw, value) == 0) - return 1; - return 0; -} - -int Hunspell::get_xml_list(char*** slst, char* list, const char* tag) { - int n = 0; - char* p; - if (!list) - return 0; - for (p = list; ((p = strstr(p, tag)) != NULL); p++) - n++; - if (n == 0) - return 0; - *slst = (char**)malloc(sizeof(char*) * n); - if (!*slst) - return 0; - for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) { - int l = strlen(p); - (*slst)[n] = (char*)malloc(l + 1); - if (!(*slst)[n]) - return n; - if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) { - free((*slst)[n]); - break; - } - } - return n; -} - -int Hunspell::spellml(char*** slst, const char* word) { - char *q, *q2; - char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; - q = (char*)strstr(word, "<query"); - if (!q) - return 0; // bad XML input - q2 = strchr(q, '>'); - if (!q2) - return 0; // bad XML input - q2 = strstr(q2, "<word"); - if (!q2) - return 0; // bad XML input - if (check_xml_par(q, "type=", "analyze")) { - int n = 0; - if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) - n = analyze(slst, cw); - if (n == 0) - return 0; - // convert the result to <code><a>ana1</a><a>ana2</a></code> format - std::string r; - r.append("<code>"); - for (int i = 0; i < n; i++) { - r.append("<a>"); - - std::string entry((*slst)[i]); - free((*slst)[i]); - mystrrep(entry, "\t", " "); - 
mystrrep(entry, "&", "&"); - mystrrep(entry, "<", "<"); - r.append(entry); - - r.append("</a>"); - } - r.append("</code>"); - (*slst)[0] = mystrdup(r.c_str()); - return 1; - } else if (check_xml_par(q, "type=", "stem")) { - if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) - return stem(slst, cw); - } else if (check_xml_par(q, "type=", "generate")) { - int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1); - if (n == 0) - return 0; - char* q3 = strstr(q2 + 1, "<word"); - if (q3) { - if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) { - return generate(slst, cw, cw2); - } - } else { - if ((q2 = strstr(q2 + 1, "<code")) != NULL) { - char** slst2; - if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) { - int n2 = generate(slst, cw, slst2, n); - freelist(&slst2, n); - return uniqlist(*slst, n2); - } - freelist(&slst2, n); - } - } - } - return 0; -} - -#ifdef HUNSPELL_EXPERIMENTAL -// XXX is UTF-8 support OK? -char* Hunspell::morph_with_correction(const char* word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) - return NULL; - w_char unicw[MAXWORDLEN]; - int nc = strlen(word); - if (utf8) { - if (nc >= MAXWORDUTF8LEN) - return NULL; - } else { - if (nc >= MAXWORDLEN) - return NULL; - } - int captype = 0; - int abbv = 0; - int wl = 0; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - int convstatus = rl ? 
rl->conv(word, wspace) : 0; - if (convstatus < 0) - return 0; - else if (convstatus > 0) - wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else - wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - - if (wl == 0) - return NULL; - - char result[MAXLNLEN]; - char* st = NULL; - - *result = '\0'; - - switch (captype) { - case NOCAP: { - st = pSMgr->suggest_morph_for_spelling_error(cw); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - if (abbv) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - } - break; - } - case INITCAP: { - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - st = pSMgr->suggest_morph_for_spelling_error(cw); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - if (abbv) { - memcpy(wspace, cw, wl); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - mkallsmall2(wspace, unicw, nc); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - } - break; - } - case HUHCAP: { - st = pSMgr->suggest_morph_for_spelling_error(cw); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - memcpy(wspace, cw, (wl + 1)); - mkallsmall2(wspace, unicw, nc); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - break; - } - case 
ALLCAP: { - memcpy(wspace, cw, (wl + 1)); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - mkallsmall2(wspace, unicw, nc); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - if (abbv) { - memcpy(wspace, cw, (wl + 1)); - *(wspace + wl) = '.'; - *(wspace + wl + 1) = '\0'; - if (*result) - mystrcat(result, "\n", MAXLNLEN); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } - mkallsmall2(wspace, unicw, nc); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); - if (st) { - if (*result) - mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } - } - break; - } - } - - if (*result) - return mystrdup(result); - return NULL; -} - -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - -Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { - return (Hunhandle*)(new Hunspell(affpath, dpath)); -} - -Hunhandle* Hunspell_create_key(const char* affpath, - const char* dpath, - const char* key) { - return (Hunhandle*)(new Hunspell(affpath, dpath, key)); -} - -void Hunspell_destroy(Hunhandle* pHunspell) { - delete (Hunspell*)(pHunspell); -} - -int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { - return ((Hunspell*)pHunspell)->add_dic(dpath); -} - -int Hunspell_spell(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->spell(word); -} - -char* Hunspell_get_dic_encoding(Hunhandle* 
pHunspell) { - return ((Hunspell*)pHunspell)->get_dic_encoding(); -} - -int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->suggest(slst, word); -} - -int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->analyze(slst, word); -} - -int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) { - return ((Hunspell*)pHunspell)->stem(slst, word); -} - -int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) { - return ((Hunspell*)pHunspell)->stem(slst, desc, n); -} - -int Hunspell_generate(Hunhandle* pHunspell, - char*** slst, - const char* word, - const char* word2) { - return ((Hunspell*)pHunspell)->generate(slst, word, word2); -} - -int Hunspell_generate2(Hunhandle* pHunspell, - char*** slst, - const char* word, - char** desc, - int n) { - return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); -} - -/* functions for run-time modification of the dictionary */ - -/* add word to the run-time dictionary */ - -int Hunspell_add(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->add(word); -} - -/* add word to the run-time dictionary with affix flags of - * the example (a dictionary word): Hunspell will recognize - * affixed forms of the new word, too. 
- */ - -int Hunspell_add_with_affix(Hunhandle* pHunspell, - const char* word, - const char* example) { - return ((Hunspell*)pHunspell)->add_with_affix(word, example); -} - -/* remove word from the run-time dictionary */ - -int Hunspell_remove(Hunhandle* pHunspell, const char* word) { - return ((Hunspell*)pHunspell)->remove(word); -} - -void Hunspell_free_list(Hunhandle*, char*** slst, int n) { - freelist(slst, n); -} - -int Hunspell::suffix_suggest(char*** slst, const char* root_word) { - struct hentry* he = NULL; - int len; - std::string w2; - const char* word; - char* ignoredchars = pAMgr->get_ignore(); - if (ignoredchars != NULL) { - w2.assign(root_word); - if (utf8) { - int ignoredchars_utf16_len; - unsigned short* ignoredchars_utf16 = - pAMgr->get_ignore_utf16(&ignoredchars_utf16_len); - remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - } else - word = root_word; - - len = strlen(word); - - if (!len) - return 0; - - char** wlst = (char**)malloc(MAXSUGGESTION * sizeof(char*)); - if (wlst == NULL) - return -1; - *slst = wlst; - for (int i = 0; i < MAXSUGGESTION; i++) { - wlst[i] = NULL; - } - - for (int i = 0; (i < maxdic) && !he; i++) { - he = (pHMgr[i])->lookup(word); - } - if (he) { - return pAMgr->get_suffix_words(he->astr, he->alen, root_word, *slst); - } - return 0; -} |