diff options
author | George Hazan <ghazan@miranda.im> | 2018-03-10 13:56:24 +0300 |
---|---|---|
committer | George Hazan <ghazan@miranda.im> | 2018-03-10 13:56:24 +0300 |
commit | 97a16a6e09df80ffa3429e23a0174bd8daaa29a1 (patch) | |
tree | bc9ec915bfbcdfef2b655aacd8b4d02a80731196 /libs/hunspell/src/hunspell.cxx | |
parent | cb2caccb52c4044937c0d9e8eda7ddeb1d115e85 (diff) |
fix for loading hunspell project
Diffstat (limited to 'libs/hunspell/src/hunspell.cxx')
-rw-r--r-- | libs/hunspell/src/hunspell.cxx | 2017 |
1 files changed, 0 insertions, 2017 deletions
diff --git a/libs/hunspell/src/hunspell.cxx b/libs/hunspell/src/hunspell.cxx deleted file mode 100644 index b1535013fe..0000000000 --- a/libs/hunspell/src/hunspell.cxx +++ /dev/null @@ -1,2017 +0,0 @@ -/* ***** BEGIN LICENSE BLOCK ***** - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * Copyright (C) 2002-2017 Németh László - * - * The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. - * - * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, - * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, - * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, - * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, - * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ -/* - * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada - * And Contributors. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. All modifications to the source code must be clearly marked as - * such. Binary redistributions based on modified source code - * must be clearly marked as modified versions in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <stdlib.h> -#include <string.h> -#include <stdio.h> - -#include "affixmgr.hxx" -#include "hunspell.hxx" -#include "suggestmgr.hxx" -#include "hunspell.h" -#include "csutil.hxx" - -#include <limits> -#include <string> - -#define MAXWORDUTF8LEN (MAXWORDLEN * 3) - -class HunspellImpl -{ -public: - HunspellImpl(const char* affpath, const char* dpath, const char* key); - ~HunspellImpl(); - int add_dic(const char* dpath, const char* key); - std::vector<std::string> suffix_suggest(const std::string& root_word); - std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl); - std::vector<std::string> generate(const std::string& word, const std::string& pattern); - std::vector<std::string> stem(const std::string& word); - std::vector<std::string> stem(const std::vector<std::string>& morph); - std::vector<std::string> analyze(const std::string& word); - bool input_conv(const std::string& word, std::string& dest); - bool spell(const std::string& word, int* info = NULL, std::string* root = NULL); - std::vector<std::string> suggest(const std::string& word); - const std::string& get_wordchars() const; - const std::vector<w_char>& get_wordchars_utf16() const; - const std::string& get_dict_encoding() const; - int add(const std::string& word); - int add_with_affix(const std::string& word, const std::string& example); - int remove(const std::string& word); - struct cs_info* get_csconv(); - std::vector<char> dic_encoding_vec; - - int get_langnum() const { return langnum; } - const char* get_try_string() const { return pAMgr->get_try_string(); } - const std::string& get_version() const { return pAMgr->get_version(); } - -private: - AffixMgr* pAMgr; - std::vector<HashMgr*> m_HMgrs; - SuggestMgr* pSMgr; - char* affixpath; - std::string encoding; - struct cs_info* csconv; - int langnum; - int utf8; - int complexprefixes; - std::vector<std::string> wordbreak; - -private: - void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev); - size_t cleanword2(std::string& dest, - std::vector<w_char>& dest_u, - const std::string& src, - int* pcaptype, - size_t* pabbrev); - void mkinitcap(std::string& u8); - int mkinitcap2(std::string& u8, std::vector<w_char>& u16); - int mkinitsmall2(std::string& u8, std::vector<w_char>& u16); - void mkallcap(std::string& u8); - int mkallsmall2(std::string& u8, std::vector<w_char>& u16); - struct hentry* checkword(const std::string& source, int* info, std::string* root); - std::string sharps_u8_l1(const std::string& source); - hentry* - spellsharps(std::string& base, size_t start_pos, int, int, int* info, std::string* root); - int is_keepcase(const hentry* rv); - void insert_sug(std::vector<std::string>& slst, const std::string& word); - void cat_result(std::string& result, const std::string& st); - std::vector<std::string> spellml(const std::string& word); - std::string get_xml_par(const char* par); - const char* get_xml_pos(const char* s, const char* attr); - std::vector<std::string> get_xml_list(const char* list, const char* tag); - int check_xml_par(const char* q, const char* attr, const char* value); -private: - HunspellImpl(const HunspellImpl&); - HunspellImpl& operator=(const HunspellImpl&); -}; - -Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) - : m_Impl(new HunspellImpl(affpath, dpath, key)) { -} - -HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) { - csconv = NULL; - utf8 = 0; - complexprefixes = 0; - affixpath = mystrdup(affpath); - - /* first set up the hash manager */ - m_HMgrs.push_back(new HashMgr(dpath, affpath, key)); - - /* next set up the affix manager */ - /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(affpath, m_HMgrs, key); - - /* get the preferred try string and the dictionary */ - /* encoding from the Affix Manager for that dictionary */ - char* try_string = pAMgr->get_try_string(); - encoding = pAMgr->get_encoding(); - langnum = pAMgr->get_langnum(); - utf8 = pAMgr->get_utf8(); - if (!utf8) - csconv = get_current_cs(encoding); - complexprefixes = pAMgr->get_complexprefixes(); - wordbreak = pAMgr->get_breaktable(); - - dic_encoding_vec.resize(encoding.size()+1); - strcpy(&dic_encoding_vec[0], encoding.c_str()); - - /* and finally set up the suggestion manager */ - pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); - if (try_string) - free(try_string); -} - -Hunspell::~Hunspell() { - delete m_Impl; -} - -HunspellImpl::~HunspellImpl() { - delete pSMgr; - delete pAMgr; - for (size_t i = 0; i < m_HMgrs.size(); ++i) - delete m_HMgrs[i]; - pSMgr = NULL; - pAMgr = NULL; -#ifdef MOZILLA_CLIENT - delete[] csconv; -#endif - csconv = NULL; - if (affixpath) - free(affixpath); - affixpath = NULL; -} - -// load extra dictionaries -int Hunspell::add_dic(const char* dpath, const char* key) { - return m_Impl->add_dic(dpath, key); -} - -// load extra dictionaries -int HunspellImpl::add_dic(const char* dpath, const char* key) { - if (!affixpath) - return 1; - m_HMgrs.push_back(new HashMgr(dpath, affixpath, key)); - return 0; -} - -// make a copy of src at destination while removing all leading -// blanks and removing any trailing periods after recording -// their presence with the abbreviation flag -// also since already going through character by character, -// set the capitalization type -// return the length of the "cleaned" (and UTF-8 encoded) word - -size_t HunspellImpl::cleanword2(std::string& dest, - std::vector<w_char>& dest_utf, - const std::string& src, - int* pcaptype, - size_t* pabbrev) { - dest.clear(); - dest_utf.clear(); - - const char* q = src.c_str(); - - // first skip over any leading blanks - while (*q == ' ') - ++q; - - // now strip off any trailing periods (recording their presence) - *pabbrev = 0; - int nl = strlen(q); - while ((nl > 0) && (*(q + nl - 1) == '.')) { - nl--; - (*pabbrev)++; - } - - // if no characters are left it can't be capitalized - if (nl <= 0) { - *pcaptype = NOCAP; - return 0; - } - - dest.append(q, nl); - nl = dest.size(); - if (utf8) { - u8_u16(dest_utf, dest); - *pcaptype = get_captype_utf8(dest_utf, langnum); - } else { - *pcaptype = get_captype(dest, csconv); - } - return nl; -} - -void HunspellImpl::cleanword(std::string& dest, - const std::string& src, - int* pcaptype, - int* pabbrev) { - dest.clear(); - const unsigned char* q = (const unsigned char*)src.c_str(); - int firstcap = 0; - - // first skip over any leading blanks - while (*q == ' ') - ++q; - - // now strip off any trailing periods (recording their presence) - *pabbrev = 0; - int nl = strlen((const char*)q); - while ((nl > 0) && (*(q + nl - 1) == '.')) { - nl--; - (*pabbrev)++; - } - - // if no characters are left it can't be capitalized - if (nl <= 0) { - *pcaptype = NOCAP; - return; - } - - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int nc = 0; - - if (!utf8) { - while (nl > 0) { - nc++; - if (csconv[(*q)].ccase) - ncap++; - if (csconv[(*q)].cupper == csconv[(*q)].clower) - nneutral++; - dest.push_back(*q++); - nl--; - } - // remember to terminate the destination string - firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase; - } else { - std::vector<w_char> t; - u8_u16(t, src); - for (size_t i = 0; i < t.size(); ++i) { - unsigned short idx = (t[i].h << 8) + t[i].l; - unsigned short low = unicodetolower(idx, langnum); - if (idx != low) - ncap++; - if (unicodetoupper(idx, langnum) == low) - nneutral++; - } - u16_u8(dest, t); - if (ncap) { - unsigned short idx = (t[0].h << 8) + t[0].l; - firstcap = (idx != unicodetolower(idx, langnum)); - } - } - - // now finally set the captype - if (ncap == 0) { - *pcaptype = NOCAP; - } else if ((ncap == 1) && firstcap) { - *pcaptype = INITCAP; - } else if ((ncap == nc) || ((ncap + nneutral) == nc)) { - *pcaptype = ALLCAP; - } else if ((ncap > 1) && firstcap) { - *pcaptype = HUHINITCAP; - } else { - *pcaptype = HUHCAP; - } -} - -void HunspellImpl::mkallcap(std::string& u8) { - if (utf8) { - std::vector<w_char> u16; - u8_u16(u16, u8); - ::mkallcap_utf(u16, langnum); - u16_u8(u8, u16); - } else { - ::mkallcap(u8, csconv); - } -} - -int HunspellImpl::mkallsmall2(std::string& u8, std::vector<w_char>& u16) { - if (utf8) { - ::mkallsmall_utf(u16, langnum); - u16_u8(u8, u16); - } else { - ::mkallsmall(u8, csconv); - } - return u8.size(); -} - -// convert UTF-8 sharp S codes to latin 1 -std::string HunspellImpl::sharps_u8_l1(const std::string& source) { - std::string dest(source); - mystrrep(dest, "\xC3\x9F", "\xDF"); - return dest; -} - -// recursive search for right ss - sharp s permutations -hentry* HunspellImpl::spellsharps(std::string& base, - size_t n_pos, - int n, - int repnum, - int* info, - std::string* root) { - size_t pos = base.find("ss", n_pos); - if (pos != std::string::npos && (n < MAXSHARPS)) { - base[pos] = '\xC3'; - base[pos + 1] = '\x9F'; - hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, info, root); - if (h) - return h; - base[pos] = 's'; - base[pos + 1] = 's'; - h = spellsharps(base, pos + 2, n + 1, repnum, info, root); - if (h) - return h; - } else if (repnum > 0) { - if (utf8) - return checkword(base, info, root); - std::string tmp(sharps_u8_l1(base)); - return checkword(tmp, info, root); - } - return NULL; -} - -int HunspellImpl::is_keepcase(const hentry* rv) { - return pAMgr && rv->astr && pAMgr->get_keepcase() && - TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); -} - -/* insert a word to the beginning of the suggestion array */ -void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& word) { - slst.insert(slst.begin(), word); -} - -bool Hunspell::spell(const std::string& word, int* info, std::string* root) { - return m_Impl->spell(word, info, root); -} - -bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { - struct hentry* rv = NULL; - - int info2 = 0; - if (!info) - info = &info2; - else - *info = 0; - - // Hunspell supports XML input of the simplified API (see manual) - if (word == SPELL_XML) - return true; - if (utf8) { - if (word.size() >= MAXWORDUTF8LEN) - return false; - } else { - if (word.size() >= MAXWORDLEN) - return false; - } - int captype = NOCAP; - size_t abbv = 0; - size_t wl = 0; - - std::string scw; - std::vector<w_char> sunicw; - - // input conversion - RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL; - { - std::string wspace; - - bool convstatus = rl ? rl->conv(word, wspace) : false; - if (convstatus) - wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); - else - wl = cleanword2(scw, sunicw, word, &captype, &abbv); - } - -#ifdef MOZILLA_CLIENT - // accept the abbreviated words without dots - // workaround for the incomplete tokenization of Mozilla - abbv = 1; -#endif - - if (wl == 0 || m_HMgrs.empty()) - return true; - if (root) - root->clear(); - - // allow numbers with dots, dashes and commas (but forbid double separators: - // "..", "--" etc.) - enum { NBEGIN, NNUM, NSEP }; - int nstate = NBEGIN; - size_t i; - - for (i = 0; (i < wl); i++) { - if ((scw[i] <= '9') && (scw[i] >= '0')) { - nstate = NNUM; - } else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) { - if ((nstate == NSEP) || (i == 0)) - break; - nstate = NSEP; - } else - break; - } - if ((i == wl) && (nstate == NNUM)) - return true; - - switch (captype) { - case HUHCAP: - /* FALLTHROUGH */ - case HUHINITCAP: - *info += SPELL_ORIGCAP; - /* FALLTHROUGH */ - case NOCAP: - rv = checkword(scw, info, root); - if ((abbv) && !(rv)) { - std::string u8buffer(scw); - u8buffer.push_back('.'); - rv = checkword(u8buffer, info, root); - } - break; - case ALLCAP: { - *info += SPELL_ORIGCAP; - rv = checkword(scw, info, root); - if (rv) - break; - if (abbv) { - std::string u8buffer(scw); - u8buffer.push_back('.'); - rv = checkword(u8buffer, info, root); - if (rv) - break; - } - // Spec. prefix handling for Catalan, French, Italian: - // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). - size_t apos = pAMgr ? scw.find('\'') : std::string::npos; - if (apos != std::string::npos) { - mkallsmall2(scw, sunicw); - //conversion may result in string with different len to pre-mkallsmall2 - //so re-scan - if (apos != std::string::npos && apos < scw.size() - 1) { - std::string part1 = scw.substr(0, apos+1); - std::string part2 = scw.substr(apos+1); - if (utf8) { - std::vector<w_char> part1u, part2u; - u8_u16(part1u, part1); - u8_u16(part2u, part2); - mkinitcap2(part2, part2u); - scw = part1 + part2; - sunicw = part1u; - sunicw.insert(sunicw.end(), part2u.begin(), part2u.end()); - rv = checkword(scw, info, root); - if (rv) - break; - } else { - mkinitcap2(part2, sunicw); - scw = part1 + part2; - rv = checkword(scw, info, root); - if (rv) - break; - } - mkinitcap2(scw, sunicw); - rv = checkword(scw, info, root); - if (rv) - break; - } - } - if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) { - - mkallsmall2(scw, sunicw); - std::string u8buffer(scw); - rv = spellsharps(u8buffer, 0, 0, 0, info, root); - if (!rv) { - mkinitcap2(scw, sunicw); - rv = spellsharps(scw, 0, 0, 0, info, root); - } - if ((abbv) && !(rv)) { - u8buffer.push_back('.'); - rv = spellsharps(u8buffer, 0, 0, 0, info, root); - if (!rv) { - u8buffer = std::string(scw); - u8buffer.push_back('.'); - rv = spellsharps(u8buffer, 0, 0, 0, info, root); - } - } - if (rv) - break; - } - } - case INITCAP: { - - *info += SPELL_ORIGCAP; - mkallsmall2(scw, sunicw); - std::string u8buffer(scw); - mkinitcap2(scw, sunicw); - if (captype == INITCAP) - *info += SPELL_INITCAP; - rv = checkword(scw, info, root); - if (captype == INITCAP) - *info -= SPELL_INITCAP; - // forbid bad capitalization - // (for example, ijs -> Ijs instead of IJs in Dutch) - // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) - if (*info & SPELL_FORBIDDEN) { - rv = NULL; - break; - } - if (rv && is_keepcase(rv) && (captype == ALLCAP)) - rv = NULL; - if (rv) - break; - - rv = checkword(u8buffer, info, root); - if (abbv && !rv) { - u8buffer.push_back('.'); - rv = checkword(u8buffer, info, root); - if (!rv) { - u8buffer = scw; - u8buffer.push_back('.'); - if (captype == INITCAP) - *info += SPELL_INITCAP; - rv = checkword(u8buffer, info, root); - if (captype == INITCAP) - *info -= SPELL_INITCAP; - if (rv && is_keepcase(rv) && (captype == ALLCAP)) - rv = NULL; - break; - } - } - if (rv && is_keepcase(rv) && - ((captype == ALLCAP) || - // if CHECKSHARPS: KEEPCASE words with \xDF are allowed - // in INITCAP form, too. - !(pAMgr->get_checksharps() && - ((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) || - (!utf8 && u8buffer.find('\xDF') != std::string::npos))))) - rv = NULL; - break; - } - } - - if (rv) { - if (pAMgr && pAMgr->get_warn() && rv->astr && - TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { - *info += SPELL_WARN; - if (pAMgr->get_forbidwarn()) - return false; - return true; - } - return true; - } - - // recursive breaking at break points - if (!wordbreak.empty()) { - - int nbr = 0; - wl = scw.size(); - - // calculate break points for recursion limit - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t pos = 0; - while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) { - ++nbr; - pos += wordbreak[j].size(); - } - } - if (nbr >= 10) - return false; - - // check boundary patterns (^begin and end$) - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t plen = wordbreak[j].size(); - if (plen == 1 || plen > wl) - continue; - - if (wordbreak[j][0] == '^' && - scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1))) - return true; - - if (wordbreak[j][plen - 1] == '$' && - scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) { - std::string suffix(scw.substr(wl - plen + 1)); - scw.resize(wl - plen + 1); - if (spell(scw)) - return true; - scw.append(suffix); - } - } - - // other patterns - for (size_t j = 0; j < wordbreak.size(); ++j) { - size_t plen = wordbreak[j].size(); - size_t found = scw.find(wordbreak[j]); - if ((found > 0) && (found < wl - plen)) { - if (!spell(scw.substr(found + plen))) - continue; - std::string suffix(scw.substr(found)); - scw.resize(found); - // examine 2 sides of the break point - if (spell(scw)) - return true; - scw.append(suffix); - - // LANG_hu: spec. dash rule - if (langnum == LANG_hu && wordbreak[j] == "-") { - suffix = scw.substr(found + 1); - scw.resize(found + 1); - if (spell(scw)) - return true; // check the first part with dash - scw.append(suffix); - } - // end of LANG specific region - } - } - } - - return false; -} - -struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) { - bool usebuffer = false; - std::string w2; - const char* word; - int len; - - const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; - if (ignoredchars != NULL) { - w2.assign(w); - if (utf8) { - const std::vector<w_char>& ignoredchars_utf16 = - pAMgr->get_ignore_utf16(); - remove_ignored_chars_utf(w2, ignoredchars_utf16); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - len = w2.size(); - usebuffer = true; - } else { - word = w.c_str(); - len = w.size(); - } - - if (!len) - return NULL; - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - if (!usebuffer) { - w2.assign(word); - usebuffer = true; - } - if (utf8) - reverseword_utf(w2); - else - reverseword(w2); - } - - if (usebuffer) { - word = w2.c_str(); - } - - // look word in hash table - struct hentry* he = NULL; - for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { - he = m_HMgrs[i]->lookup(word); - - // check forbidden and onlyincompound words - if ((he) && (he->astr) && (pAMgr) && - TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) - *info += SPELL_FORBIDDEN; - // LANG_hu section: set dash information for suggestions - if (langnum == LANG_hu) { - if (pAMgr->get_compoundflag() && - TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { - if (info) - *info += SPELL_COMPOUND; - } - } - return NULL; - } - - // he = next not needaffix, onlyincompound homonym or onlyupcase word - while (he && (he->astr) && pAMgr && - ((pAMgr->get_needaffix() && - TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || - (pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || - (info && (*info & SPELL_INITCAP) && - TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) - he = he->next_homonym; - } - - // check with affixes - if (!he && pAMgr) { - // try stripping off affixes */ - he = pAMgr->affix_check(word, len, 0); - - // check compound restriction and onlyupcase - if (he && he->astr && - ((pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || - (info && (*info & SPELL_INITCAP) && - TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { - he = NULL; - } - - if (he) { - if ((he->astr) && (pAMgr) && - TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) - *info += SPELL_FORBIDDEN; - return NULL; - } - if (root) { - root->assign(he->word); - if (complexprefixes) { - if (utf8) - reverseword_utf(*root); - else - reverseword(*root); - } - } - // try check compound word - } else if (pAMgr->get_compound()) { - struct hentry* rwords[100]; // buffer for COMPOUND pattern checking - he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info); - // LANG_hu section: `moving rule' with last dash - if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) { - std::string dup(word, len - 1); - he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info); - } - // end of LANG specific region - if (he) { - if (root) { - root->assign(he->word); - if (complexprefixes) { - if (utf8) - reverseword_utf(*root); - else - reverseword(*root); - } - } - if (info) - *info += SPELL_COMPOUND; - } - } - } - - return he; -} - -std::vector<std::string> Hunspell::suggest(const std::string& word) { - return m_Impl->suggest(word); -} - -std::vector<std::string> HunspellImpl::suggest(const std::string& word) { - std::vector<std::string> slst; - - int onlycmpdsug = 0; - if (!pSMgr || m_HMgrs.empty()) - return slst; - - // process XML input of the simplified API (see manual) - if (word.compare(0, sizeof(SPELL_XML) - 3, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { - return spellml(word); - } - if (utf8) { - if (word.size() >= MAXWORDUTF8LEN) - return slst; - } else { - if (word.size() >= MAXWORDLEN) - return slst; - } - int captype = NOCAP; - size_t abbv = 0; - size_t wl = 0; - - std::string scw; - std::vector<w_char> sunicw; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - { - std::string wspace; - - bool convstatus = rl ? rl->conv(word, wspace) : false; - if (convstatus) - wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); - else - wl = cleanword2(scw, sunicw, word, &captype, &abbv); - - if (wl == 0) - return slst; - } - - int capwords = 0; - - // check capitalized form for FORCEUCASE - if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { - int info = SPELL_ORIGCAP; - if (checkword(scw, &info, NULL)) { - std::string form(scw); - mkinitcap(form); - slst.push_back(form); - return slst; - } - } - - switch (captype) { - case NOCAP: { - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); - break; - } - - case INITCAP: { - capwords = 1; - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); - std::string wspace(scw); - mkallsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - break; - } - case HUHINITCAP: - capwords = 1; - case HUHCAP: { - pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); - // something.The -> something. The - size_t dot_pos = scw.find('.'); - if (dot_pos != std::string::npos) { - std::string postdot = scw.substr(dot_pos + 1); - int captype_; - if (utf8) { - std::vector<w_char> postdotu; - u8_u16(postdotu, postdot); - captype_ = get_captype_utf8(postdotu, langnum); - } else { - captype_ = get_captype(postdot, csconv); - } - if (captype_ == INITCAP) { - std::string str(scw); - str.insert(dot_pos + 1, 1, ' '); - insert_sug(slst, str); - } - } - - std::string wspace; - - if (captype == HUHINITCAP) { - // TheOpenOffice.org -> The OpenOffice.org - wspace = scw; - mkinitsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - } - wspace = scw; - mkallsmall2(wspace, sunicw); - if (spell(wspace.c_str())) - insert_sug(slst, wspace); - size_t prevns = slst.size(); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - if (captype == HUHINITCAP) { - mkinitcap2(wspace, sunicw); - if (spell(wspace.c_str())) - insert_sug(slst, wspace); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - } - // aNew -> "a New" (instead of "a new") - for (size_t j = prevns; j < slst.size(); ++j) { - const char* space = strchr(slst[j].c_str(), ' '); - if (space) { - size_t slen = strlen(space + 1); - // different case after space (need capitalisation) - if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) { - std::string first(slst[j].c_str(), space + 1); - std::string second(space + 1); - std::vector<w_char> w; - if (utf8) - u8_u16(w, second); - mkinitcap2(second, w); - // set as first suggestion - slst.erase(slst.begin() + j); - slst.insert(slst.begin(), first + second); - } - } - } - break; - } - - case ALLCAP: { - std::string wspace(scw); - mkallsmall2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str())) - insert_sug(slst, wspace); - mkinitcap2(wspace, sunicw); - pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); - for (size_t j = 0; j < slst.size(); ++j) { - mkallcap(slst[j]); - if (pAMgr && pAMgr->get_checksharps()) { - if (utf8) { - mystrrep(slst[j], "\xC3\x9F", "SS"); - } else { - mystrrep(slst[j], "\xDF", "SS"); - } - } - } - break; - } - } - - // LANG_hu section: replace '-' with ' ' in Hungarian - if (langnum == LANG_hu) { - for (size_t j = 0; j < slst.size(); ++j) { - size_t pos = slst[j].find('-'); - if (pos != std::string::npos) { - int info; - std::string w(slst[j].substr(0, pos)); - w.append(slst[j].substr(pos + 1)); - (void)spell(w, &info, NULL); - if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { - slst[j][pos] = ' '; - } else - slst[j][pos] = '-'; - } - } - } - // END OF LANG_hu section - - // try ngram approach since found nothing or only compound words - if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) { - switch (captype) { - case NOCAP: { - pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs); - break; - } - case HUHINITCAP: - capwords = 1; - case HUHCAP: { - std::string wspace(scw); - mkallsmall2(wspace, sunicw); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); - break; - } - case INITCAP: { - capwords = 1; - std::string wspace(scw); - mkallsmall2(wspace, sunicw); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); - break; - } - case ALLCAP: { - std::string wspace(scw); - mkallsmall2(wspace, sunicw); - size_t oldns = slst.size(); - pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs); - for (size_t j = oldns; j < slst.size(); ++j) { - mkallcap(slst[j]); - } - break; - } - } - } - - // try dash suggestion (Afo-American -> Afro-American) - size_t dash_pos = scw.find('-'); - if (dash_pos != std::string::npos) { - int nodashsug = 1; - for (size_t j = 0; j < slst.size() && nodashsug == 1; ++j) { - if (slst[j].find('-') != std::string::npos) - nodashsug = 0; - } - - size_t prev_pos = 0; - bool last = false; - - while (nodashsug && !last) { - if (dash_pos == scw.size()) - last = 1; - std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); - if (!spell(chunk.c_str())) { - std::vector<std::string> nlst = suggest(chunk.c_str()); - for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) { - std::string wspace = scw.substr(0, prev_pos); - wspace.append(*j); - if (!last) { - wspace.append("-"); - wspace.append(scw.substr(dash_pos + 1)); - } - insert_sug(slst, wspace); - } - nodashsug = 0; - } - if (!last) { - prev_pos = dash_pos + 1; - dash_pos = scw.find('-', prev_pos); - } - if (dash_pos == std::string::npos) - dash_pos = scw.size(); - } - } - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - for (size_t j = 0; j < slst.size(); ++j) { - if (utf8) - reverseword_utf(slst[j]); - else - reverseword(slst[j]); - } - } - - // capitalize - if (capwords) - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); - } - - // expand suggestions with dot(s) - if (abbv && pAMgr && pAMgr->get_sugswithdots()) { - for (size_t j = 0; j < slst.size(); ++j) { - slst[j].append(word.substr(word.size() - abbv)); - } - } - - // remove bad capitalized and forbidden forms - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { - switch (captype) { - case INITCAP: - case ALLCAP: { - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) { - std::string s; - std::vector<w_char> w; - if (utf8) { - u8_u16(w, slst[j]); - } else { - s = slst[j]; - } - mkallsmall2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } else { - mkinitcap2(s, w); - if (spell(s)) { - slst[l] = s; - ++l; - } - } - } else { - slst[l] = slst[j]; - ++l; - } - } - slst.resize(l); - } - } - } - - // remove duplications - size_t l = 0; - for (size_t j = 0; j < slst.size(); ++j) { - slst[l] = slst[j]; - for (size_t k = 0; k < l; ++k) { - if (slst[k] == slst[j]) { - --l; - break; - } - } - ++l; - } - slst.resize(l); - - // output conversion - rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (size_t j = 0; rl && j < slst.size(); ++j) { - std::string wspace; - if (rl->conv(slst[j], wspace)) { - slst[j] = wspace; - } - } - - return slst; -} - -const std::string& Hunspell::get_dict_encoding() const { - return m_Impl->get_dict_encoding(); -} - -const std::string& HunspellImpl::get_dict_encoding() const { - return encoding; -} - -std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) { - return m_Impl->stem(desc); -} - -std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) { - std::vector<std::string> slst; - - std::string result2; - if (desc.empty()) - return slst; - for (size_t i = 0; i < desc.size(); ++i) { - - std::string result; - - // add compound word parts (except the last one) - const char* s = desc[i].c_str(); - const char* part = strstr(s, MORPH_PART); - if (part) { - const char* nextpart = strstr(part + 1, MORPH_PART); - while (nextpart) { - std::string field; - copy_field(field, part, MORPH_PART); - result.append(field); - part = nextpart; - nextpart = strstr(part + 1, MORPH_PART); - } - s = part; - } - - std::string tok(s); - size_t alt = 0; - while ((alt = tok.find(" | ", alt)) != std::string::npos) { - tok[alt + 1] = MSEP_ALT; - } - std::vector<std::string> pl = line_tok(tok, MSEP_ALT); - for (size_t k = 0; k < pl.size(); ++k) { - // add derivational suffixes - if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) { - // remove inflectional suffixes - const size_t is = pl[k].find(MORPH_INFL_SFX); - if (is != std::string::npos) - pl[k].resize(is); - std::vector<std::string> singlepl; - singlepl.push_back(pl[k]); - std::string sg = pSMgr->suggest_gen(singlepl, pl[k]); - if (!sg.empty()) { - std::vector<std::string> gen = line_tok(sg, MSEP_REC); - for (size_t j = 0; j < gen.size(); ++j) { - result2.push_back(MSEP_REC); - result2.append(result); - result2.append(gen[j]); - } - } - } else { - result2.push_back(MSEP_REC); - result2.append(result); - if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) { - std::string field; - copy_field(field, pl[k], MORPH_SURF_PFX); - result2.append(field); - } - std::string field; - copy_field(field, pl[k], MORPH_STEM); - result2.append(field); - } - } - } - slst = line_tok(result2, MSEP_REC); - uniqlist(slst); - return slst; -} - -std::vector<std::string> Hunspell::stem(const std::string& word) { - return m_Impl->stem(word); -} - -std::vector<std::string> HunspellImpl::stem(const std::string& word) { - return stem(analyze(word)); -} - -const char* Hunspell::get_wordchars() const { - return m_Impl->get_wordchars().c_str(); -} - -const std::string& Hunspell::get_wordchars_cpp() const { - return m_Impl->get_wordchars(); -} - -const std::string& HunspellImpl::get_wordchars() const { - return pAMgr->get_wordchars(); -} - -const std::vector<w_char>& Hunspell::get_wordchars_utf16() const { - return m_Impl->get_wordchars_utf16(); -} - -const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const { - return pAMgr->get_wordchars_utf16(); -} - -void HunspellImpl::mkinitcap(std::string& u8) { - if (utf8) { - std::vector<w_char> u16; - u8_u16(u16, u8); - ::mkinitcap_utf(u16, langnum); - u16_u8(u8, u16); - } else { - ::mkinitcap(u8, csconv); - } -} - -int HunspellImpl::mkinitcap2(std::string& u8, std::vector<w_char>& u16) { - if (utf8) { - ::mkinitcap_utf(u16, langnum); - u16_u8(u8, u16); - } else { - ::mkinitcap(u8, csconv); - } - return u8.size(); -} - -int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) { - if (utf8) { - ::mkinitsmall_utf(u16, langnum); - u16_u8(u8, u16); - } else { - ::mkinitsmall(u8, csconv); - } - return u8.size(); -} - -int Hunspell::add(const std::string& word) { - return m_Impl->add(word); -} - -int HunspellImpl::add(const std::string& word) { - if (!m_HMgrs.empty()) - return m_HMgrs[0]->add(word); - return 0; -} - -int Hunspell::add_with_affix(const std::string& word, const std::string& example) { - return m_Impl->add_with_affix(word, example); -} - -int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) { - if (!m_HMgrs.empty()) - return m_HMgrs[0]->add_with_affix(word, example); - return 0; -} - -int Hunspell::remove(const std::string& word) { - return m_Impl->remove(word); -} - -int HunspellImpl::remove(const std::string& word) { - if (!m_HMgrs.empty()) - return m_HMgrs[0]->remove(word); - return 0; -} - -const char* Hunspell::get_version() const { - return m_Impl->get_version().c_str(); -} - -const std::string& Hunspell::get_version_cpp() const { - return m_Impl->get_version(); -} - -const char* Hunspell::get_try_string() const { - return m_Impl->get_try_string(); -} - -struct cs_info* HunspellImpl::get_csconv() { - return csconv; -} - -struct cs_info* Hunspell::get_csconv() { - return m_Impl->get_csconv(); -} - -void HunspellImpl::cat_result(std::string& result, const std::string& st) { - if (!st.empty()) { - if (!result.empty()) - result.append("\n"); - result.append(st); - } -} - -std::vector<std::string> Hunspell::analyze(const std::string& word) { - return m_Impl->analyze(word); -} - -std::vector<std::string> HunspellImpl::analyze(const std::string& word) { - std::vector<std::string> slst; - if (!pSMgr || m_HMgrs.empty()) - return slst; - if (utf8) { - if (word.size() >= MAXWORDUTF8LEN) - return slst; - } else { - if (word.size() >= MAXWORDLEN) - return slst; - } - int captype = NOCAP; - size_t abbv = 0; - size_t wl = 0; - - std::string scw; - std::vector<w_char> sunicw; - - // input conversion - RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - { - std::string wspace; - - bool convstatus = rl ? rl->conv(word, wspace) : false; - if (convstatus) - wl = cleanword2(scw, sunicw, wspace, &captype, &abbv); - else - wl = cleanword2(scw, sunicw, word, &captype, &abbv); - } - - if (wl == 0) { - if (abbv) { - scw.clear(); - for (wl = 0; wl < abbv; wl++) - scw.push_back('.'); - abbv = 0; - } else - return slst; - } - - std::string result; - - size_t n = 0; - // test numbers - // LANG_hu section: set dash information for suggestions - if (langnum == LANG_hu) { - size_t n2 = 0; - size_t n3 = 0; - - while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) || - (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) { - n++; - if ((scw[n] == '.') || (scw[n] == ',')) { - if (((n2 == 0) && (n > 3)) || - ((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ',')))) - break; - n2++; - n3 = n; - } - } - - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) - return slst; - if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) && - checkword(scw.substr(n), NULL, NULL))) { - result.append(scw); - result.resize(n - 1); - if (n == wl) - cat_result(result, pSMgr->suggest_morph(scw.substr(n - 1))); - else { - std::string chunk = scw.substr(n - 1, 1); - cat_result(result, pSMgr->suggest_morph(chunk)); - result.push_back('+'); // XXX SPEC. MORPHCODE - cat_result(result, pSMgr->suggest_morph(scw.substr(n))); - } - return line_tok(result, MSEP_REC); - } - } - // END OF LANG_hu section - - switch (captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - cat_result(result, pSMgr->suggest_morph(scw)); - if (abbv) { - std::string u8buffer(scw); - u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer)); - } - break; - } - case INITCAP: { - mkallsmall2(scw, sunicw); - std::string u8buffer(scw); - mkinitcap2(scw, sunicw); - cat_result(result, pSMgr->suggest_morph(u8buffer)); - cat_result(result, pSMgr->suggest_morph(scw)); - if (abbv) { - u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer)); - - u8buffer = scw; - u8buffer.push_back('.'); - - cat_result(result, pSMgr->suggest_morph(u8buffer)); - } - break; - } - case ALLCAP: { - cat_result(result, pSMgr->suggest_morph(scw)); - if (abbv) { - std::string u8buffer(scw); - u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer)); - } - mkallsmall2(scw, sunicw); - std::string u8buffer(scw); - mkinitcap2(scw, sunicw); - - cat_result(result, pSMgr->suggest_morph(u8buffer)); - cat_result(result, pSMgr->suggest_morph(scw)); - if (abbv) { - u8buffer.push_back('.'); - cat_result(result, pSMgr->suggest_morph(u8buffer)); - - u8buffer = scw; - u8buffer.push_back('.'); - - cat_result(result, pSMgr->suggest_morph(u8buffer)); - } - break; - } - } - - if (!result.empty()) { - // word reversing wrapper for complex prefixes - if (complexprefixes) { - if (utf8) - reverseword_utf(result); - else - reverseword(result); - } - return line_tok(result, MSEP_REC); - } - - // compound word with dash (HU) I18n - // LANG_hu section: set dash information for suggestions - - size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos; - if (dash_pos != std::string::npos) { - int nresult = 0; - - std::string part1 = scw.substr(0, dash_pos); - std::string part2 = scw.substr(dash_pos+1); - - // examine 2 sides of the dash - if (part2.empty()) { // base word ending with dash - if (spell(part1)) { - std::string p = pSMgr->suggest_morph(part1); - if (!p.empty()) { - slst = line_tok(p, MSEP_REC); - return slst; - } - } - } else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat. - if (spell(part1) && (spell("-e"))) { - std::string st = pSMgr->suggest_morph(part1); - if (!st.empty()) { - result.append(st); - } - result.push_back('+'); // XXX spec. separator in MORPHCODE - st = pSMgr->suggest_morph("-e"); - if (!st.empty()) { - result.append(st); - } - return line_tok(result, MSEP_REC); - } - } else { - // first word ending with dash: word- XXX ??? - part1.push_back(' '); - nresult = spell(part1); - part1.erase(part1.size() - 1); - if (nresult && spell(part2) && - ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) { - std::string st = pSMgr->suggest_morph(part1); - if (!st.empty()) { - result.append(st); - result.push_back('+'); // XXX spec. separator in MORPHCODE - } - st = pSMgr->suggest_morph(part2); - if (!st.empty()) { - result.append(st); - } - return line_tok(result, MSEP_REC); - } - } - // affixed number in correct word - if (nresult && (dash_pos > 0) && - (((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) || - (scw[dash_pos - 1] == '.'))) { - n = 1; - if (scw[dash_pos - n] == '.') - n++; - // search first not a number character to left from dash - while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) && - (n < 6)) { - n++; - } - if (dash_pos < n) - n--; - // numbers: valami1000000-hoz - // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, - // 56-hoz, 6-hoz - for (; n >= 1; n--) { - if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') { - continue; - } - std::string chunk = scw.substr(dash_pos - n); - if (checkword(chunk, NULL, NULL)) { - result.append(chunk); - std::string st = pSMgr->suggest_morph(chunk); - if (!st.empty()) { - result.append(st); - } - return line_tok(result, MSEP_REC); - } - } - } - } - return slst; -} - -std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) { - return m_Impl->generate(word, pl); -} - -std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) { - std::vector<std::string> slst; - if (!pSMgr || pl.empty()) - return slst; - std::vector<std::string> pl2 = analyze(word); - int captype = NOCAP; - int abbv = 0; - std::string cw; - cleanword(cw, word, &captype, &abbv); - std::string result; - - for (size_t i = 0; i < pl.size(); ++i) { - cat_result(result, pSMgr->suggest_gen(pl2, pl[i])); - } - - if (!result.empty()) { - // allcap - if (captype == ALLCAP) - mkallcap(result); - - // line split - slst = line_tok(result, MSEP_REC); - - // capitalize - if (captype == INITCAP || captype == HUHINITCAP) { - for (size_t j = 0; j < slst.size(); ++j) { - mkinitcap(slst[j]); - } - } - - // temporary filtering of prefix related errors (eg. - // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") - std::vector<std::string>::iterator it = slst.begin(); - while (it != slst.end()) { - if (!spell(*it)) { - it = slst.erase(it); - } else { - ++it; - } - } - } - return slst; -} - -std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) { - return m_Impl->generate(word, pattern); -} - -std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) { - std::vector<std::string> pl = analyze(pattern); - std::vector<std::string> slst = generate(word, pl); - uniqlist(slst); - return slst; -} - -// minimal XML parser functions -std::string HunspellImpl::get_xml_par(const char* par) { - std::string dest; - if (!par) - return dest; - char end = *par; - if (end == '>') - end = '<'; - else if (end != '\'' && end != '"') - return dest; // bad XML - for (par++; *par != '\0' && *par != end; ++par) { - dest.push_back(*par); - } - mystrrep(dest, "<", "<"); - mystrrep(dest, "&", "&"); - return dest; -} - -int Hunspell::get_langnum() const { - return m_Impl->get_langnum(); -} - -bool Hunspell::input_conv(const std::string& word, std::string& dest) { - return m_Impl->input_conv(word, dest); -} - -int Hunspell::input_conv(const char* word, char* dest, size_t destsize) { - std::string d; - bool ret = input_conv(word, d); - if (ret && d.size() < destsize) { - strncpy(dest, d.c_str(), destsize); - return 1; - } - return 0; -} - -bool HunspellImpl::input_conv(const std::string& word, std::string& dest) { - RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL; - if (rl) { - return rl->conv(word, dest); - } - dest.assign(word); - return false; -} - -// return the beginning of the element (attr == NULL) or the attribute -const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) { - const char* end = strchr(s, '>'); - if (attr == NULL) - return end; - const char* p = s; - while (1) { - p = strstr(p, attr); - if (!p || p >= end) - return 0; - if (*(p - 1) == ' ' || *(p - 1) == '\n') - break; - p += strlen(attr); - } - return p + strlen(attr); -} - -int HunspellImpl::check_xml_par(const char* q, - const char* attr, - const char* value) { - std::string cw = get_xml_par(get_xml_pos(q, attr)); - if (cw == value) - return 1; - return 0; -} - -std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) { - std::vector<std::string> slst; - if (!list) - return slst; - const char* p = list; - for (size_t n = 0; ((p = strstr(p, tag)) != NULL); ++p, ++n) { - std::string cw = get_xml_par(p + strlen(tag) - 1); - if (cw.empty()) { - break; - } - slst.push_back(cw); - } - return slst; -} - -std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) { - std::vector<std::string> slst; - - const char* word = in_word.c_str(); - - const char* q = strstr(word, "<query"); - if (!q) - return slst; // bad XML input - const char* q2 = strchr(q, '>'); - if (!q2) - return slst; // bad XML input - q2 = strstr(q2, "<word"); - if (!q2) - return slst; // bad XML input - if (check_xml_par(q, "type=", "analyze")) { - std::string cw = get_xml_par(strchr(q2, '>')); - if (!cw.empty()) - slst = analyze(cw); - if (slst.empty()) - return slst; - // convert the result to <code><a>ana1</a><a>ana2</a></code> format - std::string r; - r.append("<code>"); - for (size_t i = 0; i < slst.size(); ++i) { - r.append("<a>"); - - std::string entry(slst[i]); - mystrrep(entry, "\t", " "); - mystrrep(entry, "&", "&"); - mystrrep(entry, "<", "<"); - r.append(entry); - - r.append("</a>"); - } - r.append("</code>"); - slst.clear(); - slst.push_back(r); - return slst; - } else if (check_xml_par(q, "type=", "stem")) { - std::string cw = get_xml_par(strchr(q2, '>')); - if (!cw.empty()) - return stem(cw); - } else if (check_xml_par(q, "type=", "generate")) { - std::string cw = get_xml_par(strchr(q2, '>')); - if (cw.empty()) - return slst; - const char* q3 = strstr(q2 + 1, "<word"); - if (q3) { - std::string cw2 = get_xml_par(strchr(q3, '>')); - if (!cw2.empty()) { - return generate(cw, cw2); - } - } else { - if ((q2 = strstr(q2 + 1, "<code")) != NULL) { - std::vector<std::string> slst2 = get_xml_list(strchr(q2, '>'), "<a>"); - if (!slst2.empty()) { - slst = generate(cw, slst2); - uniqlist(slst); - return slst; - } - } - } - } - return slst; -} - -int Hunspell::spell(const char* word, int* info, char** root) { - std::string sroot; - bool ret = m_Impl->spell(word, info, root ? &sroot : NULL); - if (root) { - if (sroot.empty()) { - *root = NULL; - } else { - *root = mystrdup(sroot.c_str()); - } - } - return ret; -} - -namespace { - int munge_vector(char*** slst, const std::vector<std::string>& items) { - if (items.empty()) { - *slst = NULL; - return 0; - } else { - *slst = (char**)malloc(sizeof(char*) * items.size()); - if (!*slst) - return 0; - for (size_t i = 0; i < items.size(); ++i) - (*slst)[i] = mystrdup(items[i].c_str()); - } - return items.size(); - } -} - -void Hunspell::free_list(char*** slst, int n) { - Hunspell_free_list((Hunhandle*)(this), slst, n); -} - -int Hunspell::suggest(char*** slst, const char* word) { - return Hunspell_suggest((Hunhandle*)(this), slst, word); -} - -int Hunspell::suffix_suggest(char*** slst, const char* root_word) { - std::vector<std::string> stems = m_Impl->suffix_suggest(root_word); - return munge_vector(slst, stems); -} - -char* Hunspell::get_dic_encoding() { - return &(m_Impl->dic_encoding_vec[0]); -} - -int Hunspell::stem(char*** slst, char** desc, int n) { - return Hunspell_stem2((Hunhandle*)(this), slst, desc, n); -} - -int Hunspell::stem(char*** slst, const char* word) { - return Hunspell_stem((Hunhandle*)(this), slst, word); -} - -int Hunspell::analyze(char*** slst, const char* word) { - return Hunspell_analyze((Hunhandle*)(this), slst, word); -} - -int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) { - return Hunspell_generate2((Hunhandle*)(this), slst, word, pl, pln); -} - -int Hunspell::generate(char*** slst, const char* word, const char* pattern) { - return Hunspell_generate((Hunhandle*)(this), slst, word, pattern); -} - -Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { - return (Hunhandle*)(new Hunspell(affpath, dpath)); -} - -Hunhandle* Hunspell_create_key(const char* affpath, - const char* dpath, - const char* key) { - return reinterpret_cast<Hunhandle*>(new Hunspell(affpath, dpath, key)); -} - -void Hunspell_destroy(Hunhandle* pHunspell) { - delete reinterpret_cast<Hunspell*>(pHunspell); -} - -int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { - return reinterpret_cast<Hunspell*>(pHunspell)->add_dic(dpath); -} - -int Hunspell_spell(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->spell(std::string(word)); -} - -char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) { - return reinterpret_cast<Hunspell*>(pHunspell)->get_dic_encoding(); -} - -int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) { - std::vector<std::string> suggests = reinterpret_cast<Hunspell*>(pHunspell)->suggest(word); - return munge_vector(slst, suggests); -} - -int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) { - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->analyze(word); - return munge_vector(slst, stems); -} - -int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) { - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(word); - return munge_vector(slst, stems); -} - -int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) { - std::vector<std::string> morph; - for (int i = 0; i < n; ++i) - morph.push_back(desc[i]); - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->stem(morph); - return munge_vector(slst, stems); -} - -int Hunspell_generate(Hunhandle* pHunspell, - char*** slst, - const char* word, - const char* pattern) { - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, pattern); - return munge_vector(slst, stems); -} - -int Hunspell_generate2(Hunhandle* pHunspell, - char*** slst, - const char* word, - char** desc, - int n) { - std::vector<std::string> morph; - for (int i = 0; i < n; ++i) - morph.push_back(desc[i]); - - std::vector<std::string> stems = reinterpret_cast<Hunspell*>(pHunspell)->generate(word, morph); - return munge_vector(slst, stems); -} - -/* functions for run-time modification of the dictionary */ - -/* add word to the run-time dictionary */ - -int Hunspell_add(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->add(word); -} - -/* add word to the run-time dictionary with affix flags of - * the example (a dictionary word): Hunspell will recognize - * affixed forms of the new word, too. - */ - -int Hunspell_add_with_affix(Hunhandle* pHunspell, - const char* word, - const char* example) { - return reinterpret_cast<Hunspell*>(pHunspell)->add_with_affix(word, example); -} - -/* remove word from the run-time dictionary */ - -int Hunspell_remove(Hunhandle* pHunspell, const char* word) { - return reinterpret_cast<Hunspell*>(pHunspell)->remove(word); -} - -void Hunspell_free_list(Hunhandle*, char*** list, int n) { - if (list && *list) { - for (int i = 0; i < n; i++) - free((*list)[i]); - free(*list); - *list = NULL; - } -} - -std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) { - return m_Impl->suffix_suggest(root_word); -} - -std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) { - std::vector<std::string> slst; - struct hentry* he = NULL; - int len; - std::string w2; - const char* word; - const char* ignoredchars = pAMgr->get_ignore(); - if (ignoredchars != NULL) { - w2.assign(root_word); - if (utf8) { - const std::vector<w_char>& ignoredchars_utf16 = - pAMgr->get_ignore_utf16(); - remove_ignored_chars_utf(w2, ignoredchars_utf16); - } else { - remove_ignored_chars(w2, ignoredchars); - } - word = w2.c_str(); - } else - word = root_word.c_str(); - - len = strlen(word); - - if (!len) - return slst; - - for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) { - he = m_HMgrs[i]->lookup(word); - } - if (he) { - slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str()); - } - return slst; -} |