diff options
Diffstat (limited to 'plugins/SpellChecker/hunspell/affentry.cxx')
-rw-r--r-- | plugins/SpellChecker/hunspell/affentry.cxx | 962 |
1 files changed, 962 insertions, 0 deletions
diff --git a/plugins/SpellChecker/hunspell/affentry.cxx b/plugins/SpellChecker/hunspell/affentry.cxx new file mode 100644 index 0000000000..fef0cca5f5 --- /dev/null +++ b/plugins/SpellChecker/hunspell/affentry.cxx @@ -0,0 +1,962 @@ +#include "license.hunspell" +#include "license.myspell" + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "affentry.hxx" +#include "csutil.hxx" + +PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) +{ + // register affix manager + pmyMgr = pmgr; + + // set up its initial values + + aflag = dp->aflag; // flag + strip = dp->strip; // string to strip + appnd = dp->appnd; // string to append + stripl = dp->stripl; // length of strip string + appndl = dp->appndl; // length of append string + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag + // then copy over all of the conditions + if (opts & aeLONGCOND) { + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); + next = NULL; + nextne = NULL; + nexteq = NULL; + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; +} + + +PfxEntry::~PfxEntry() +{ + aflag = 0; + if (appnd) free(appnd); + if (strip) free(strip); + pmyMgr = NULL; + appnd = NULL; + strip = NULL; + if (opts & aeLONGCOND) free(c.l.conds2); + if (morphcode && !(opts & aeALIASM)) free(morphcode); + if (contclass && !(opts & aeALIASF)) free(contclass); +} + +// add prefix to this word assuming conditions hold +char * PfxEntry::add(const char * word, int len) +{ + char tword[MAXWORDUTF8LEN + 4]; + + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { + /* we have a match so add prefix */ + char * pp = tword; + if (appndl) { + strcpy(tword,appnd); + pp += appndl; + } + strcpy(pp, (word + stripl)); + return mystrdup(tword); + } + return NULL; +} + +inline char * PfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? p : NULL; + } + return NULL; +} + +inline int PfxEntry::test_condition(const char * st) +{ + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { + neg = false; + ingroup = false; + p = nextchar(p); + pos = st; break; + } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { + if ((neg && ingroup) || (!neg && !ingroup)) return 0; + pos = NULL; + p = nextchar(p); + // skip the next character + if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + default: { + if (*st == *p) { + st++; + p = nextchar(p); + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte + while (p && (*p & 0xc0) == 0x80) { // character + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + p = nextchar(p); + st++; + } + if (pos && st != pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { // group + p = nextchar(p); + } else return 0; + } + } + if (!p) return 1; + } +} + +// check if this prefix entry matches +struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry of root word or NULL + char tmpword[MAXWORDUTF8LEN + 4]; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) + return he; + he = he->next_homonym; // check homonyms + } while (he); + } + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + //if ((opts & aeXPRODUCT) && in_compound) { + if ((opts & aeXPRODUCT)) { + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL, + 0, NULL, FLAG_NULL, needflag, in_compound); + if (he) return he; + } + } + } + return NULL; +} + +// check if this prefix entry matches +struct hentry * PfxEntry::check_twosfx(const char * word, int len, + char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry of root word or NULL + char tmpword[MAXWORDUTF8LEN + 4]; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // cross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag); + if (he) return he; + } + } + } + return NULL; +} + +// check if this prefix entry matches +char * PfxEntry::check_twosfx_morph(const char * word, int len, + char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + char tmpword[MAXWORDUTF8LEN + 4]; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, + aeXPRODUCT, this, needflag); + } + } + } + return NULL; +} + +// check if this prefix entry matches +char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry of root word or NULL + char tmpword[MAXWORDUTF8LEN + 4]; + char result[MAXLNLEN]; + char * st; + + *result = '\0'; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { + if (morphcode) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, morphcode, MAXLNLEN); + } else mystrcat(result,getKey(), MAXLNLEN); + if (!HENTRY_FIND(he, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(he), MAXLNLEN); + } + // store the pointer of the hash entry + if (HENTRY_DATA(he)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); + } else { + // return with debug information + char * flag = pmyMgr->encode_flag(getFlag()); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_FLAG, MAXLNLEN); + mystrcat(result, flag, MAXLNLEN); + free(flag); + } + mystrcat(result, "\n", MAXLNLEN); + } + he = he->next_homonym; + } while (he); + } + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this, + FLAG_NULL, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } + } + } + } + + if (*result) return mystrdup(result); + return NULL; +} + +SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) +{ + // register affix manager + pmyMgr = pmgr; + + // set up its initial values + aflag = dp->aflag; // char flag + strip = dp->strip; // string to strip + appnd = dp->appnd; // string to append + stripl = dp->stripl; // length of strip string + appndl = dp->appndl; // length of append string + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag + + // then copy over all of the conditions + if (opts & aeLONGCOND) { + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); + + rappnd = myrevstrdup(appnd); + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; +} + + +SfxEntry::~SfxEntry() +{ + aflag = 0; + if (appnd) free(appnd); + if (rappnd) free(rappnd); + if (strip) free(strip); + pmyMgr = NULL; + appnd = NULL; + strip = NULL; + if (opts & aeLONGCOND) free(c.l.conds2); + if (morphcode && !(opts & aeALIASM)) free(morphcode); + if (contclass && !(opts & aeALIASF)) free(contclass); +} + +// add suffix to this word assuming conditions hold +char * SfxEntry::add(const char * word, int len) +{ + char tword[MAXWORDUTF8LEN + 4]; + + /* make sure all conditions match */ + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word + len, word) && + (!stripl || (strcmp(word + len - stripl, strip) == 0)) && + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { + /* we have a match so add suffix */ + strcpy(tword,word); + if (appndl) { + strcpy(tword + len - stripl, appnd); + } else { + *(tword + len - stripl) = '\0'; + } + return mystrdup(tword); + } + return NULL; +} + +inline char * SfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? p : NULL; + } + return NULL; +} + +inline int SfxEntry::test_condition(const char * st, const char * beg) +{ + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + st--; + int i = 1; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { p = nextchar(p); pos = st; break; } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { if (!neg && !ingroup) return 0; + i++; + // skip the next character + if (!ingroup) { + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + st--; + } + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + if (st < beg && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character + st--; + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } + } + break; + } + default: { + if (*st == *p) { + p = nextchar(p); + if ((opts & aeUTF8) && (*st & 0x80)) { + st--; + while (p && (st >= beg)) { + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + // first byte of the UTF-8 multibyte character + if ((*p & 0xc0) != 0x80) break; + p = nextchar(p); + st--; + } + if (pos && st != pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + st--; + } + if (p && *p != ']') p = nextchar(p); + } else if (pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); +// if (p && *p != ']') p = nextchar(p); + st--; + } + if (!pos) { + i++; + st--; + } + if (st < beg && p && *p != ']') return 0; // word <= condition + } else if (pos) { // group + p = nextchar(p); + } else return 0; + } + } + if (!p) return 1; + } +} + +// see if this suffix is present in the word +struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, + PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, + const FLAG badflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry pointer + unsigned char * cp; + char tmpword[MAXWORDUTF8LEN + 4]; + PfxEntry* ep = ppfx; + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + // the second condition is not enough for UTF-8 strings + // it checked in test_condition() + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + strcpy (tmpword, word); + cp = (unsigned char *)(tmpword + tmpl); + if (stripl) { + strcpy ((char *)cp, strip); + tmpl += stripl; + cp = (unsigned char *)(tmpword + tmpl); + } else *cp = '\0'; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition((char *) cp, (char *) tmpword)) { + +#ifdef SZOSZABLYA_POSSIBLE_ROOTS + fprintf(stdout,"%s %s %c\n", word, tmpword, aflag); +#endif + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + // check conditional suffix (enabled by prefix) + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || + (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) || + // enabled by prefix + ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen))) + ) && + // handle cont. class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) + ) && + // check only in compound homonyms (bad flags) + (!badflag || !TESTAFF(he->astr, badflag, he->alen) + ) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) + ) + ) return he; + he = he->next_homonym; // check homonyms + } while (he); + + // obsolote stemming code (used only by the + // experimental SuffixMgr:suggest_pos_stems) + // store resulting root in wlst + } else if (wlst && (*ns < maxSug)) { + int cwrd = 1; + for (int k=0; k < *ns; k++) + if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; + if (cwrd) { + wlst[*ns] = mystrdup(tmpword); + if (wlst[*ns] == NULL) { + for (int j=0; j<*ns; j++) free(wlst[j]); + *ns = -1; + return NULL; + } + (*ns)++; + } + } + } + } + return NULL; +} + +// see if two-level suffix is present in the word +struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, + PfxEntry* ppfx, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry pointer + unsigned char * cp; + char tmpword[MAXWORDUTF8LEN + 4]; + PfxEntry* ep = ppfx; + + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + strcpy (tmpword, word); + cp = (unsigned char *)(tmpword + tmpl); + if (stripl) { + strcpy ((char *)cp, strip); + tmpl += stripl; + cp = (unsigned char *)(tmpword + tmpl); + } else *cp = '\0'; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then recall suffix_check + + if (test_condition((char *) cp, (char *) tmpword)) { + if (ppfx) { + // handle conditional suffix + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); + else + he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); + } else { + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); + } + if (he) return he; + } + } + return NULL; +} + +// see if two-level suffix is present in the word +char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, + PfxEntry* ppfx, const FLAG needflag) +{ + int tmpl; // length of tmpword + unsigned char * cp; + char tmpword[MAXWORDUTF8LEN + 4]; + PfxEntry* ep = ppfx; + char * st; + + char result[MAXLNLEN]; + + *result = '\0'; + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { + + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + strcpy (tmpword, word); + cp = (unsigned char *)(tmpword + tmpl); + if (stripl) { + strcpy ((char *)cp, strip); + tmpl += stripl; + cp = (unsigned char *)(tmpword + tmpl); + } else *cp = '\0'; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then recall suffix_check + + if (test_condition((char *) cp, (char *) tmpword)) { + if (ppfx) { + // handle conditional suffix + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); + if (st) { + if (ppfx->getMorph()) { + mystrcat(result, ppfx->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } + mystrcat(result,st, MAXLNLEN); + free(st); + mychomp(result); + } + } else { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + mychomp(result); + } + } + } else { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + mychomp(result); + } + } + if (*result) return mystrdup(result); + } + } + return NULL; +} + +// get next homonym with same affix +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx, + const FLAG cclass, const FLAG needflag) +{ + PfxEntry* ep = ppfx; + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; + + while (he->next_homonym) { + he = he->next_homonym; + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || + TESTAFF(he->astr, eFlag, he->alen) || + // handle conditional suffix + ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) + ) && + // handle cont. class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) + ) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) + ) + ) return he; + } + return NULL; +} + + +#if 0 + +Appendix: Understanding Affix Code + + +An affix is either a prefix or a suffix attached to root words to make +other words. + +Basically a Prefix or a Suffix is set of AffEntry objects +which store information about the prefix or suffix along +with supporting routines to check if a word has a particular +prefix or suffix or a combination. + +The structure affentry is defined as follows: + +struct affentry +{ + unsigned short aflag; // ID used to represent the affix + char * strip; // string to strip before adding affix + char * appnd; // the affix string to add + unsigned char stripl; // length of the strip string + unsigned char appndl; // length of the affix string + char numconds; // the number of conditions that must be met + char opts; // flag: aeXPRODUCT- combine both prefix and suffix + char conds[SETSIZE]; // array which encodes the conditions to be met +}; + + +Here is a suffix borrowed from the en_US.aff file. This file +is whitespace delimited. + +SFX D Y 4 +SFX D 0 e d +SFX D y ied [^aeiou]y +SFX D 0 ed [^ey] +SFX D 0 ed [aeiou]y + +This information can be interpreted as follows: + +In the first line has 4 fields + +Field +----- +1 SFX - indicates this is a suffix +2 D - is the name of the character flag which represents this suffix +3 Y - indicates it can be combined with prefixes (cross product) +4 4 - indicates that sequence of 4 affentry structures are needed to + properly store the affix information + +The remaining lines describe the unique information for the 4 SfxEntry +objects that make up this affix. Each line can be interpreted +as follows: (note fields 1 and 2 are as a check against line 1 info) + +Field +----- +1 SFX - indicates this is a suffix +2 D - is the name of the character flag for this affix +3 y - the string of chars to strip off before adding affix + (a 0 here indicates the NULL string) +4 ied - the string of affix characters to add +5 [^aeiou]y - the conditions which must be met before the affix + can be applied + +Field 5 is interesting. Since this is a suffix, field 5 tells us that +there are 2 conditions that must be met. The first condition is that +the next to the last character in the word must *NOT* be any of the +following "a", "e", "i", "o" or "u". The second condition is that +the last character of the word must end in "y". + +So how can we encode this information concisely and be able to +test for both conditions in a fast manner? The answer is found +but studying the wonderful ispell code of Geoff Kuenning, et.al. +(now available under a normal BSD license). + +If we set up a conds array of 256 bytes indexed (0 to 255) and access it +using a character (cast to an unsigned char) of a string, we have 8 bits +of information we can store about that character. Specifically we +could use each bit to say if that character is allowed in any of the +last (or first for prefixes) 8 characters of the word. + +Basically, each character at one end of the word (up to the number +of conditions) is used to index into the conds array and the resulting +value found there says whether the that character is valid for a +specific character position in the word. + +For prefixes, it does this by setting bit 0 if that char is valid +in the first position, bit 1 if valid in the second position, and so on. + +If a bit is not set, then that char is not valid for that postion in the +word. + +If working with suffixes bit 0 is used for the character closest +to the front, bit 1 for the next character towards the end, ..., +with bit numconds-1 representing the last char at the end of the string. + +Note: since entries in the conds[] are 8 bits, only 8 conditions +(read that only 8 character positions) can be examined at one +end of a word (the beginning for prefixes and the end for suffixes. + +So to make this clearer, lets encode the conds array values for the +first two affentries for the suffix D described earlier. + + + For the first affentry: + numconds = 1 (only examine the last character) + + conds['e'] = (1 << 0) (the word must end in an E) + all others are all 0 + + For the second affentry: + numconds = 2 (only examine the last two characters) + + conds[X] = conds[X] | (1 << 0) (aeiou are not allowed) + where X is all characters *but* a, e, i, o, or u + + + conds['y'] = (1 << 1) (the last char must be a y) + all other bits for all other entries in the conds array are zero + + +#endif + |