michael@0: /******* BEGIN LICENSE BLOCK ******* michael@0: * Version: MPL 1.1/GPL 2.0/LGPL 2.1 michael@0: * michael@0: * The contents of this file are subject to the Mozilla Public License Version michael@0: * 1.1 (the "License"); you may not use this file except in compliance with michael@0: * the License. You may obtain a copy of the License at michael@0: * http://www.mozilla.org/MPL/ michael@0: * michael@0: * Software distributed under the License is distributed on an "AS IS" basis, michael@0: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License michael@0: * for the specific language governing rights and limitations under the michael@0: * License. michael@0: * michael@0: * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) michael@0: * and László Németh (Hunspell). Portions created by the Initial Developers michael@0: * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. michael@0: * michael@0: * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) michael@0: * David Einstein (deinst@world.std.com) michael@0: * László Németh (nemethl@gyorsposta.hu) michael@0: * Caolan McNamara (caolanm@redhat.com) michael@0: * Davide Prina michael@0: * Giuseppe Modugno michael@0: * Gianluca Turconi michael@0: * Simon Brouwer michael@0: * Noll Janos michael@0: * Biro Arpad michael@0: * Goldman Eleonora michael@0: * Sarlos Tamas michael@0: * Bencsath Boldizsar michael@0: * Halacsy Peter michael@0: * Dvornik Laszlo michael@0: * Gefferth Andras michael@0: * Nagy Viktor michael@0: * Varga Daniel michael@0: * Chris Halls michael@0: * Rene Engelhard michael@0: * Bram Moolenaar michael@0: * Dafydd Jones michael@0: * Harri Pitkanen michael@0: * Andras Timar michael@0: * Tor Lillqvist michael@0: * michael@0: * Alternatively, the contents of this file may be used under the terms of michael@0: * either the GNU General Public License Version 2 or later (the "GPL"), or michael@0: * the GNU Lesser General Public 
License Version 2.1 or later (the "LGPL"), michael@0: * in which case the provisions of the GPL or the LGPL are applicable instead michael@0: * of those above. If you wish to allow use of your version of this file only michael@0: * under the terms of either the GPL or the LGPL, and not to allow others to michael@0: * use your version of this file under the terms of the MPL, indicate your michael@0: * decision by deleting the provisions above and replace them with the notice michael@0: * and other provisions required by the GPL or the LGPL. If you do not delete michael@0: * the provisions above, a recipient may use your version of this file under michael@0: * the terms of any one of the MPL, the GPL or the LGPL. michael@0: * michael@0: ******* END LICENSE BLOCK *******/ michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: #include "affentry.hxx" michael@0: #include "csutil.hxx" michael@0: michael@0: PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) michael@0: { michael@0: // register affix manager michael@0: pmyMgr = pmgr; michael@0: michael@0: // set up its initial values michael@0: michael@0: aflag = dp->aflag; // flag michael@0: strip = dp->strip; // string to strip michael@0: appnd = dp->appnd; // string to append michael@0: stripl = dp->stripl; // length of strip string michael@0: appndl = dp->appndl; // length of append string michael@0: numconds = dp->numconds; // length of the condition michael@0: opts = dp->opts; // cross product flag michael@0: // then copy over all of the conditions michael@0: if (opts & aeLONGCOND) { michael@0: memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); michael@0: c.l.conds2 = dp->c.l.conds2; michael@0: } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); michael@0: next = NULL; michael@0: nextne = NULL; michael@0: nexteq = NULL; michael@0: morphcode = dp->morphcode; michael@0: contclass = dp->contclass; michael@0: contclasslen = dp->contclasslen; michael@0: } michael@0: 
michael@0: michael@0: PfxEntry::~PfxEntry() michael@0: { michael@0: aflag = 0; michael@0: if (appnd) free(appnd); michael@0: if (strip) free(strip); michael@0: pmyMgr = NULL; michael@0: appnd = NULL; michael@0: strip = NULL; michael@0: if (opts & aeLONGCOND) free(c.l.conds2); michael@0: if (morphcode && !(opts & aeALIASM)) free(morphcode); michael@0: if (contclass && !(opts & aeALIASF)) free(contclass); michael@0: } michael@0: michael@0: // add prefix to this word assuming conditions hold michael@0: char * PfxEntry::add(const char * word, int len) michael@0: { michael@0: char tword[MAXWORDUTF8LEN + 4]; michael@0: michael@0: if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && michael@0: (len >= numconds) && test_condition(word) && michael@0: (!stripl || (strncmp(word, strip, stripl) == 0)) && michael@0: ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { michael@0: /* we have a match so add prefix */ michael@0: char * pp = tword; michael@0: if (appndl) { michael@0: strcpy(tword,appnd); michael@0: pp += appndl; michael@0: } michael@0: strcpy(pp, (word + stripl)); michael@0: return mystrdup(tword); michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: inline char * PfxEntry::nextchar(char * p) { michael@0: if (p) { michael@0: p++; michael@0: if (opts & aeLONGCOND) { michael@0: // jump to the 2nd part of the condition michael@0: if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; michael@0: // end of the MAXCONDLEN length condition michael@0: } else if (p == c.conds + MAXCONDLEN) return NULL; michael@0: return *p ? 
p : NULL; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: inline int PfxEntry::test_condition(const char * st) michael@0: { michael@0: const char * pos = NULL; // group with pos input position michael@0: bool neg = false; // complementer michael@0: bool ingroup = false; // character in the group michael@0: if (numconds == 0) return 1; michael@0: char * p = c.conds; michael@0: while (1) { michael@0: switch (*p) { michael@0: case '\0': return 1; michael@0: case '[': { michael@0: neg = false; michael@0: ingroup = false; michael@0: p = nextchar(p); michael@0: pos = st; break; michael@0: } michael@0: case '^': { p = nextchar(p); neg = true; break; } michael@0: case ']': { michael@0: if ((neg && ingroup) || (!neg && !ingroup)) return 0; michael@0: pos = NULL; michael@0: p = nextchar(p); michael@0: // skip the next character michael@0: if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); michael@0: if (*st == '\0' && p) return 0; // word <= condition michael@0: break; michael@0: } michael@0: case '.': if (!pos) { // dots are not metacharacters in groups: [.] 
michael@0: p = nextchar(p); michael@0: // skip the next character michael@0: for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); michael@0: if (*st == '\0' && p) return 0; // word <= condition michael@0: break; michael@0: } michael@0: default: { michael@0: if (*st == *p) { michael@0: st++; michael@0: p = nextchar(p); michael@0: if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte michael@0: while (p && (*p & 0xc0) == 0x80) { // character michael@0: if (*p != *st) { michael@0: if (!pos) return 0; michael@0: st = pos; michael@0: break; michael@0: } michael@0: p = nextchar(p); michael@0: st++; michael@0: } michael@0: if (pos && st != pos) { michael@0: ingroup = true; michael@0: while (p && *p != ']' && (p = nextchar(p))); michael@0: } michael@0: } else if (pos) { michael@0: ingroup = true; michael@0: while (p && *p != ']' && (p = nextchar(p))); michael@0: } michael@0: } else if (pos) { // group michael@0: p = nextchar(p); michael@0: } else return 0; michael@0: } michael@0: } michael@0: if (!p) return 1; michael@0: } michael@0: } michael@0: michael@0: // check if this prefix entry matches michael@0: struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: struct hentry * he; // hash entry of root word or NULL michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: michael@0: // on entry prefix is 0 length or already matches the beginning of the word. 
michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { michael@0: michael@0: // generate new root word by removing prefix and adding michael@0: // back any characters that would have been stripped michael@0: michael@0: if (stripl) strcpy (tmpword, strip); michael@0: strcpy ((tmpword + stripl), (word + appndl)); michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then check if resulting michael@0: // root word in the dictionary michael@0: michael@0: if (test_condition(tmpword)) { michael@0: tmpl += stripl; michael@0: if ((he = pmyMgr->lookup(tmpword)) != NULL) { michael@0: do { michael@0: if (TESTAFF(he->astr, aflag, he->alen) && michael@0: // forbid single prefixes with needaffix flag michael@0: ! 
TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && michael@0: // needflag michael@0: ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || michael@0: (contclass && TESTAFF(contclass, needflag, contclasslen)))) michael@0: return he; michael@0: he = he->next_homonym; // check homonyms michael@0: } while (he); michael@0: } michael@0: michael@0: // prefix matched but no root word was found michael@0: // if aeXPRODUCT is allowed, try again but now michael@0: // ross checked combined with a suffix michael@0: michael@0: //if ((opts & aeXPRODUCT) && in_compound) { michael@0: if ((opts & aeXPRODUCT)) { michael@0: he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL, michael@0: 0, NULL, FLAG_NULL, needflag, in_compound); michael@0: if (he) return he; michael@0: } michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // check if this prefix entry matches michael@0: struct hentry * PfxEntry::check_twosfx(const char * word, int len, michael@0: char in_compound, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: struct hentry * he; // hash entry of root word or NULL michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: michael@0: // on entry prefix is 0 length or already matches the beginning of the word. 
michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing prefix and adding michael@0: // back any characters that would have been stripped michael@0: michael@0: if (stripl) strcpy (tmpword, strip); michael@0: strcpy ((tmpword + stripl), (word + appndl)); michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then check if resulting michael@0: // root word in the dictionary michael@0: michael@0: if (test_condition(tmpword)) { michael@0: tmpl += stripl; michael@0: michael@0: // prefix matched but no root word was found michael@0: // if aeXPRODUCT is allowed, try again but now michael@0: // cross checked combined with a suffix michael@0: michael@0: if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { michael@0: he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag); michael@0: if (he) return he; michael@0: } michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // check if this prefix entry matches michael@0: char * PfxEntry::check_twosfx_morph(const char * word, int len, michael@0: char in_compound, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: michael@0: // on entry prefix is 0 length or already matches the beginning of the word. 
michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing prefix and adding michael@0: // back any characters that would have been stripped michael@0: michael@0: if (stripl) strcpy (tmpword, strip); michael@0: strcpy ((tmpword + stripl), (word + appndl)); michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then check if resulting michael@0: // root word in the dictionary michael@0: michael@0: if (test_condition(tmpword)) { michael@0: tmpl += stripl; michael@0: michael@0: // prefix matched but no root word was found michael@0: // if aeXPRODUCT is allowed, try again but now michael@0: // ross checked combined with a suffix michael@0: michael@0: if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { michael@0: return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, michael@0: aeXPRODUCT, this, needflag); michael@0: } michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // check if this prefix entry matches michael@0: char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: struct hentry * he; // hash entry of root word or NULL michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: char result[MAXLNLEN]; michael@0: char * st; michael@0: michael@0: *result = '\0'; michael@0: michael@0: // on entry prefix is 0 length or already 
matches the beginning of the word. michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing prefix and adding michael@0: // back any characters that would have been stripped michael@0: michael@0: if (stripl) strcpy (tmpword, strip); michael@0: strcpy ((tmpword + stripl), (word + appndl)); michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then check if resulting michael@0: // root word in the dictionary michael@0: michael@0: if (test_condition(tmpword)) { michael@0: tmpl += stripl; michael@0: if ((he = pmyMgr->lookup(tmpword)) != NULL) { michael@0: do { michael@0: if (TESTAFF(he->astr, aflag, he->alen) && michael@0: // forbid single prefixes with needaffix flag michael@0: ! 
TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && michael@0: // needflag michael@0: ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || michael@0: (contclass && TESTAFF(contclass, needflag, contclasslen)))) { michael@0: if (morphcode) { michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, morphcode, MAXLNLEN); michael@0: } else mystrcat(result,getKey(), MAXLNLEN); michael@0: if (!HENTRY_FIND(he, MORPH_STEM)) { michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, MORPH_STEM, MAXLNLEN); michael@0: mystrcat(result, HENTRY_WORD(he), MAXLNLEN); michael@0: } michael@0: // store the pointer of the hash entry michael@0: if (HENTRY_DATA(he)) { michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); michael@0: } else { michael@0: // return with debug information michael@0: char * flag = pmyMgr->encode_flag(getFlag()); michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, MORPH_FLAG, MAXLNLEN); michael@0: mystrcat(result, flag, MAXLNLEN); michael@0: free(flag); michael@0: } michael@0: mystrcat(result, "\n", MAXLNLEN); michael@0: } michael@0: he = he->next_homonym; michael@0: } while (he); michael@0: } michael@0: michael@0: // prefix matched but no root word was found michael@0: // if aeXPRODUCT is allowed, try again but now michael@0: // ross checked combined with a suffix michael@0: michael@0: if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { michael@0: st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this, michael@0: FLAG_NULL, needflag); michael@0: if (st) { michael@0: mystrcat(result, st, MAXLNLEN); michael@0: free(st); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (*result) return mystrdup(result); michael@0: return NULL; michael@0: } michael@0: michael@0: SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) michael@0: { michael@0: // register affix manager michael@0: pmyMgr = pmgr; michael@0: michael@0: // 
set up its initial values michael@0: aflag = dp->aflag; // char flag michael@0: strip = dp->strip; // string to strip michael@0: appnd = dp->appnd; // string to append michael@0: stripl = dp->stripl; // length of strip string michael@0: appndl = dp->appndl; // length of append string michael@0: numconds = dp->numconds; // length of the condition michael@0: opts = dp->opts; // cross product flag michael@0: michael@0: // then copy over all of the conditions michael@0: if (opts & aeLONGCOND) { michael@0: memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); michael@0: c.l.conds2 = dp->c.l.conds2; michael@0: } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); michael@0: next = NULL; michael@0: nextne = NULL; michael@0: nexteq = NULL; michael@0: rappnd = myrevstrdup(appnd); michael@0: morphcode = dp->morphcode; michael@0: contclass = dp->contclass; michael@0: contclasslen = dp->contclasslen; michael@0: } michael@0: michael@0: michael@0: SfxEntry::~SfxEntry() michael@0: { michael@0: aflag = 0; michael@0: if (appnd) free(appnd); michael@0: if (rappnd) free(rappnd); michael@0: if (strip) free(strip); michael@0: pmyMgr = NULL; michael@0: appnd = NULL; michael@0: strip = NULL; michael@0: if (opts & aeLONGCOND) free(c.l.conds2); michael@0: if (morphcode && !(opts & aeALIASM)) free(morphcode); michael@0: if (contclass && !(opts & aeALIASF)) free(contclass); michael@0: } michael@0: michael@0: // add suffix to this word assuming conditions hold michael@0: char * SfxEntry::add(const char * word, int len) michael@0: { michael@0: char tword[MAXWORDUTF8LEN + 4]; michael@0: michael@0: /* make sure all conditions match */ michael@0: if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && michael@0: (len >= numconds) && test_condition(word + len, word) && michael@0: (!stripl || (strcmp(word + len - stripl, strip) == 0)) && michael@0: ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { michael@0: /* we have a match so add suffix */ michael@0: strcpy(tword,word); michael@0: if (appndl) { 
michael@0: strcpy(tword + len - stripl, appnd); michael@0: } else { michael@0: *(tword + len - stripl) = '\0'; michael@0: } michael@0: return mystrdup(tword); michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: inline char * SfxEntry::nextchar(char * p) { michael@0: if (p) { michael@0: p++; michael@0: if (opts & aeLONGCOND) { michael@0: // jump to the 2nd part of the condition michael@0: if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; michael@0: // end of the MAXCONDLEN length condition michael@0: } else if (p == c.conds + MAXCONDLEN) return NULL; michael@0: return *p ? p : NULL; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: inline int SfxEntry::test_condition(const char * st, const char * beg) michael@0: { michael@0: const char * pos = NULL; // group with pos input position michael@0: bool neg = false; // complementer michael@0: bool ingroup = false; // character in the group michael@0: if (numconds == 0) return 1; michael@0: char * p = c.conds; michael@0: st--; michael@0: int i = 1; michael@0: while (1) { michael@0: switch (*p) { michael@0: case '\0': return 1; michael@0: case '[': { p = nextchar(p); pos = st; break; } michael@0: case '^': { p = nextchar(p); neg = true; break; } michael@0: case ']': { if (!neg && !ingroup) return 0; michael@0: i++; michael@0: // skip the next character michael@0: if (!ingroup) { michael@0: for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); michael@0: st--; michael@0: } michael@0: pos = NULL; michael@0: neg = false; michael@0: ingroup = false; michael@0: p = nextchar(p); michael@0: if (st < beg && p) return 0; // word <= condition michael@0: break; michael@0: } michael@0: case '.': if (!pos) { // dots are not metacharacters in groups: [.] 
michael@0: p = nextchar(p); michael@0: // skip the next character michael@0: for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); michael@0: if (st < beg) { // word <= condition michael@0: if (p) return 0; else return 1; michael@0: } michael@0: if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character michael@0: st--; michael@0: if (st < beg) { // word <= condition michael@0: if (p) return 0; else return 1; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: default: { michael@0: if (*st == *p) { michael@0: p = nextchar(p); michael@0: if ((opts & aeUTF8) && (*st & 0x80)) { michael@0: st--; michael@0: while (p && (st >= beg)) { michael@0: if (*p != *st) { michael@0: if (!pos) return 0; michael@0: st = pos; michael@0: break; michael@0: } michael@0: // first byte of the UTF-8 multibyte character michael@0: if ((*p & 0xc0) != 0x80) break; michael@0: p = nextchar(p); michael@0: st--; michael@0: } michael@0: if (pos && st != pos) { michael@0: if (neg) return 0; michael@0: else if (i == numconds) return 1; michael@0: ingroup = true; michael@0: while (p && *p != ']' && (p = nextchar(p))); michael@0: st--; michael@0: } michael@0: if (p && *p != ']') p = nextchar(p); michael@0: } else if (pos) { michael@0: if (neg) return 0; michael@0: else if (i == numconds) return 1; michael@0: ingroup = true; michael@0: while (p && *p != ']' && (p = nextchar(p))) michael@0: ; michael@0: // if (p && *p != ']') p = nextchar(p); michael@0: st--; michael@0: } michael@0: if (!pos) { michael@0: i++; michael@0: st--; michael@0: } michael@0: if (st < beg && p && *p != ']') return 0; // word <= condition michael@0: } else if (pos) { // group michael@0: p = nextchar(p); michael@0: } else return 0; michael@0: } michael@0: } michael@0: if (!p) return 1; michael@0: } michael@0: } michael@0: michael@0: // see if this suffix is present in the word michael@0: struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, michael@0: PfxEntry* 
ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, michael@0: const FLAG badflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: struct hentry * he; // hash entry pointer michael@0: unsigned char * cp; michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: PfxEntry* ep = ppfx; michael@0: michael@0: // if this suffix is being cross checked with a prefix michael@0: // but it does not support cross products skip it michael@0: michael@0: if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) michael@0: return NULL; michael@0: michael@0: // upon entry suffix is 0 length or already matches the end of the word. michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: // the second condition is not enough for UTF-8 strings michael@0: // it checked in test_condition() michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing suffix and adding michael@0: // back any characters that would have been stripped or michael@0: // or null terminating the shorter string michael@0: michael@0: strcpy (tmpword, word); michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: if (stripl) { michael@0: strcpy ((char *)cp, strip); michael@0: tmpl += stripl; michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: } else *cp = '\0'; michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. 
Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then check if resulting michael@0: // root word in the dictionary michael@0: michael@0: if (test_condition((char *) cp, (char *) tmpword)) { michael@0: michael@0: #ifdef SZOSZABLYA_POSSIBLE_ROOTS michael@0: fprintf(stdout,"%s %s %c\n", word, tmpword, aflag); michael@0: #endif michael@0: if ((he = pmyMgr->lookup(tmpword)) != NULL) { michael@0: do { michael@0: // check conditional suffix (enabled by prefix) michael@0: if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && michael@0: TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && michael@0: (((optflags & aeXPRODUCT) == 0) || michael@0: (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) || michael@0: // enabled by prefix michael@0: ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen))) michael@0: ) && michael@0: // handle cont. class michael@0: ((!cclass) || michael@0: ((contclass) && TESTAFF(contclass, cclass, contclasslen)) michael@0: ) && michael@0: // check only in compound homonyms (bad flags) michael@0: (!badflag || !TESTAFF(he->astr, badflag, he->alen) michael@0: ) && michael@0: // handle required flag michael@0: ((!needflag) || michael@0: (TESTAFF(he->astr, needflag, he->alen) || michael@0: ((contclass) && TESTAFF(contclass, needflag, contclasslen))) michael@0: ) michael@0: ) return he; michael@0: he = he->next_homonym; // check homonyms michael@0: } while (he); michael@0: michael@0: // obsolote stemming code (used only by the michael@0: // experimental SuffixMgr:suggest_pos_stems) michael@0: // store resulting root in wlst michael@0: } else if (wlst && (*ns < maxSug)) { michael@0: int cwrd = 1; michael@0: for (int k=0; k < *ns; k++) michael@0: if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; michael@0: if (cwrd) { michael@0: wlst[*ns] = mystrdup(tmpword); michael@0: if (wlst[*ns] == NULL) { michael@0: for 
(int j=0; j<*ns; j++) free(wlst[j]); michael@0: *ns = -1; michael@0: return NULL; michael@0: } michael@0: (*ns)++; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // see if two-level suffix is present in the word michael@0: struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, michael@0: PfxEntry* ppfx, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: struct hentry * he; // hash entry pointer michael@0: unsigned char * cp; michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: PfxEntry* ep = ppfx; michael@0: michael@0: michael@0: // if this suffix is being cross checked with a prefix michael@0: // but it does not support cross products skip it michael@0: michael@0: if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) michael@0: return NULL; michael@0: michael@0: // upon entry suffix is 0 length or already matches the end of the word. michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing suffix and adding michael@0: // back any characters that would have been stripped or michael@0: // or null terminating the shorter string michael@0: michael@0: strcpy (tmpword, word); michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: if (stripl) { michael@0: strcpy ((char *)cp, strip); michael@0: tmpl += stripl; michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: } else *cp = '\0'; michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. 
Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then recall suffix_check michael@0: michael@0: if (test_condition((char *) cp, (char *) tmpword)) { michael@0: if (ppfx) { michael@0: // handle conditional suffix michael@0: if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) michael@0: he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); michael@0: else michael@0: he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); michael@0: } else { michael@0: he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); michael@0: } michael@0: if (he) return he; michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // see if two-level suffix is present in the word michael@0: char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, michael@0: PfxEntry* ppfx, const FLAG needflag) michael@0: { michael@0: int tmpl; // length of tmpword michael@0: unsigned char * cp; michael@0: char tmpword[MAXWORDUTF8LEN + 4]; michael@0: PfxEntry* ep = ppfx; michael@0: char * st; michael@0: michael@0: char result[MAXLNLEN]; michael@0: michael@0: *result = '\0'; michael@0: michael@0: // if this suffix is being cross checked with a prefix michael@0: // but it does not support cross products skip it michael@0: michael@0: if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) michael@0: return NULL; michael@0: michael@0: // upon entry suffix is 0 length or already matches the end of the word. 
michael@0: // So if the remaining root word has positive length michael@0: // and if there are enough chars in root word and added back strip chars michael@0: // to meet the number of characters conditions, then test it michael@0: michael@0: tmpl = len - appndl; michael@0: michael@0: if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && michael@0: (tmpl + stripl >= numconds)) { michael@0: michael@0: // generate new root word by removing suffix and adding michael@0: // back any characters that would have been stripped or michael@0: // or null terminating the shorter string michael@0: michael@0: strcpy (tmpword, word); michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: if (stripl) { michael@0: strcpy ((char *)cp, strip); michael@0: tmpl += stripl; michael@0: cp = (unsigned char *)(tmpword + tmpl); michael@0: } else *cp = '\0'; michael@0: michael@0: // now make sure all of the conditions on characters michael@0: // are met. Please see the appendix at the end of michael@0: // this file for more info on exactly what is being michael@0: // tested michael@0: michael@0: // if all conditions are met then recall suffix_check michael@0: michael@0: if (test_condition((char *) cp, (char *) tmpword)) { michael@0: if (ppfx) { michael@0: // handle conditional suffix michael@0: if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { michael@0: st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); michael@0: if (st) { michael@0: if (ppfx->getMorph()) { michael@0: mystrcat(result, ppfx->getMorph(), MAXLNLEN); michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: } michael@0: mystrcat(result,st, MAXLNLEN); michael@0: free(st); michael@0: mychomp(result); michael@0: } michael@0: } else { michael@0: st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); michael@0: if (st) { michael@0: mystrcat(result, st, MAXLNLEN); michael@0: free(st); michael@0: mychomp(result); michael@0: } michael@0: } michael@0: } else { 
michael@0: st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); michael@0: if (st) { michael@0: mystrcat(result, st, MAXLNLEN); michael@0: free(st); michael@0: mychomp(result); michael@0: } michael@0: } michael@0: if (*result) return mystrdup(result); michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // get next homonym with same affix michael@0: struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx, michael@0: const FLAG cclass, const FLAG needflag) michael@0: { michael@0: PfxEntry* ep = ppfx; michael@0: FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; michael@0: michael@0: while (he->next_homonym) { michael@0: he = he->next_homonym; michael@0: if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && michael@0: ((optflags & aeXPRODUCT) == 0 || michael@0: TESTAFF(he->astr, eFlag, he->alen) || michael@0: // handle conditional suffix michael@0: ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) michael@0: ) && michael@0: // handle cont. class michael@0: ((!cclass) || michael@0: ((contclass) && TESTAFF(contclass, cclass, contclasslen)) michael@0: ) && michael@0: // handle required flag michael@0: ((!needflag) || michael@0: (TESTAFF(he->astr, needflag, he->alen) || michael@0: ((contclass) && TESTAFF(contclass, needflag, contclasslen))) michael@0: ) michael@0: ) return he; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: michael@0: #if 0 michael@0: michael@0: Appendix: Understanding Affix Code michael@0: michael@0: michael@0: An affix is either a prefix or a suffix attached to root words to make michael@0: other words. michael@0: michael@0: Basically a Prefix or a Suffix is set of AffEntry objects michael@0: which store information about the prefix or suffix along michael@0: with supporting routines to check if a word has a particular michael@0: prefix or suffix or a combination. 
michael@0: michael@0: The structure affentry is defined as follows: michael@0: michael@0: struct affentry michael@0: { michael@0: unsigned short aflag; // ID used to represent the affix michael@0: char * strip; // string to strip before adding affix michael@0: char * appnd; // the affix string to add michael@0: unsigned char stripl; // length of the strip string michael@0: unsigned char appndl; // length of the affix string michael@0: char numconds; // the number of conditions that must be met michael@0: char opts; // flag: aeXPRODUCT- combine both prefix and suffix michael@0: char conds[SETSIZE]; // array which encodes the conditions to be met michael@0: }; michael@0: michael@0: michael@0: Here is a suffix borrowed from the en_US.aff file. This file michael@0: is whitespace delimited. michael@0: michael@0: SFX D Y 4 michael@0: SFX D 0 d e michael@0: SFX D y ied [^aeiou]y michael@0: SFX D 0 ed [^ey] michael@0: SFX D 0 ed [aeiou]y michael@0: michael@0: This information can be interpreted as follows: michael@0: michael@0: The first line has 4 fields michael@0: michael@0: Field michael@0: ----- michael@0: 1 SFX - indicates this is a suffix michael@0: 2 D - is the name of the character flag which represents this suffix michael@0: 3 Y - indicates it can be combined with prefixes (cross product) michael@0: 4 4 - indicates that a sequence of 4 affentry structures is needed to michael@0: properly store the affix information michael@0: michael@0: The remaining lines describe the unique information for the 4 SfxEntry michael@0: objects that make up this affix.
Each line can be interpreted michael@0: as follows: (note fields 1 and 2 serve as a check against line 1 info) michael@0: michael@0: Field michael@0: ----- michael@0: 1 SFX - indicates this is a suffix michael@0: 2 D - is the name of the character flag for this affix michael@0: 3 y - the string of chars to strip off before adding affix michael@0: (a 0 here indicates the NULL string) michael@0: 4 ied - the string of affix characters to add michael@0: 5 [^aeiou]y - the conditions which must be met before the affix michael@0: can be applied michael@0: michael@0: Field 5 is interesting. Since this is a suffix, field 5 tells us that michael@0: there are 2 conditions that must be met. The first condition is that michael@0: the next to the last character in the word must *NOT* be any of the michael@0: following "a", "e", "i", "o" or "u". The second condition is that michael@0: the last character of the word must be "y". michael@0: michael@0: So how can we encode this information concisely and be able to michael@0: test for both conditions in a fast manner? The answer is found michael@0: by studying the wonderful ispell code of Geoff Kuenning, et al. michael@0: (now available under a normal BSD license). michael@0: michael@0: If we set up a conds array of 256 bytes indexed (0 to 255) and access it michael@0: using a character (cast to an unsigned char) of a string, we have 8 bits michael@0: of information we can store about that character. Specifically we michael@0: could use each bit to say if that character is allowed in any of the michael@0: last (or first for prefixes) 8 characters of the word. michael@0: michael@0: Basically, each character at one end of the word (up to the number michael@0: of conditions) is used to index into the conds array and the resulting michael@0: value found there says whether that character is valid for a michael@0: specific character position in the word.
michael@0: michael@0: For prefixes, it does this by setting bit 0 if that char is valid michael@0: in the first position, bit 1 if valid in the second position, and so on. michael@0: michael@0: If a bit is not set, then that char is not valid for that position in the michael@0: word. michael@0: michael@0: If working with suffixes bit 0 is used for the character closest michael@0: to the front, bit 1 for the next character towards the end, ..., michael@0: with bit numconds-1 representing the last char at the end of the string. michael@0: michael@0: Note: since entries in the conds[] are 8 bits, only 8 conditions michael@0: (read that only 8 character positions) can be examined at one michael@0: end of a word (the beginning for prefixes and the end for suffixes). michael@0: michael@0: So to make this clearer, let us encode the conds array values for the michael@0: first two affentries for the suffix D described earlier. michael@0: michael@0: michael@0: For the first affentry: michael@0: numconds = 1 (only examine the last character) michael@0: michael@0: conds['e'] = (1 << 0) (the word must end in an E) michael@0: all other entries are 0 michael@0: michael@0: For the second affentry: michael@0: numconds = 2 (only examine the last two characters) michael@0: michael@0: conds[X] = conds[X] | (1 << 0) (aeiou are not allowed) michael@0: where X is all characters *but* a, e, i, o, or u michael@0: michael@0: michael@0: conds['y'] = (1 << 1) (the last char must be a y) michael@0: all other bits for all other entries in the conds array are zero michael@0: michael@0: michael@0: #endif michael@0: