michael@0: /******* BEGIN LICENSE BLOCK *******
michael@0:  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
michael@0:  * 
michael@0:  * The contents of this file are subject to the Mozilla Public License Version
michael@0:  * 1.1 (the "License"); you may not use this file except in compliance with
michael@0:  * the License. You may obtain a copy of the License at
michael@0:  * http://www.mozilla.org/MPL/
michael@0:  * 
michael@0:  * Software distributed under the License is distributed on an "AS IS" basis,
michael@0:  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
michael@0:  * for the specific language governing rights and limitations under the
michael@0:  * License.
michael@0:  * 
michael@0:  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
michael@0:  * and László Németh (Hunspell). Portions created by the Initial Developers
michael@0:  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
michael@0:  * 
michael@0:  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
michael@0:  *                 David Einstein (deinst@world.std.com)
michael@0:  *                 László Németh (nemethl@gyorsposta.hu)
michael@0:  *                 Caolan McNamara (caolanm@redhat.com)
michael@0:  *                 Davide Prina
michael@0:  *                 Giuseppe Modugno
michael@0:  *                 Gianluca Turconi
michael@0:  *                 Simon Brouwer
michael@0:  *                 Noll Janos
michael@0:  *                 Biro Arpad
michael@0:  *                 Goldman Eleonora
michael@0:  *                 Sarlos Tamas
michael@0:  *                 Bencsath Boldizsar
michael@0:  *                 Halacsy Peter
michael@0:  *                 Dvornik Laszlo
michael@0:  *                 Gefferth Andras
michael@0:  *                 Nagy Viktor
michael@0:  *                 Varga Daniel
michael@0:  *                 Chris Halls
michael@0:  *                 Rene Engelhard
michael@0:  *                 Bram Moolenaar
michael@0:  *                 Dafydd Jones
michael@0:  *                 Harri Pitkanen
michael@0:  *                 Andras Timar
michael@0:  *                 Tor Lillqvist
michael@0:  * 
michael@0:  * Alternatively, the contents of this file may be used under the terms of
michael@0:  * either the GNU General Public License Version 2 or later (the "GPL"), or
michael@0:  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
michael@0:  * in which case the provisions of the GPL or the LGPL are applicable instead
michael@0:  * of those above. If you wish to allow use of your version of this file only
michael@0:  * under the terms of either the GPL or the LGPL, and not to allow others to
michael@0:  * use your version of this file under the terms of the MPL, indicate your
michael@0:  * decision by deleting the provisions above and replace them with the notice
michael@0:  * and other provisions required by the GPL or the LGPL. If you do not delete
michael@0:  * the provisions above, a recipient may use your version of this file under
michael@0:  * the terms of any one of the MPL, the GPL or the LGPL.
michael@0:  *
michael@0:  ******* END LICENSE BLOCK *******/
michael@0: 
michael@0: #include <stdlib.h>
michael@0: #include <string.h>
michael@0: #include <stdio.h>
michael@0: #include <ctype.h>
michael@0: 
michael@0: #include "affentry.hxx"
michael@0: #include "csutil.hxx"
michael@0: 
michael@0: PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
michael@0: {
michael@0:   // register affix manager
michael@0:   pmyMgr = pmgr;
michael@0: 
michael@0:   // set up its initial values
michael@0: 
michael@0:   aflag = dp->aflag;         // flag
michael@0:   strip = dp->strip;         // string to strip
michael@0:   appnd = dp->appnd;         // string to append
michael@0:   stripl = dp->stripl;       // length of strip string
michael@0:   appndl = dp->appndl;       // length of append string
michael@0:   numconds = dp->numconds;   // length of the condition
michael@0:   opts = dp->opts;           // cross product flag
michael@0:   // then copy over all of the conditions
michael@0:   if (opts & aeLONGCOND) {
michael@0:     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
michael@0:     c.l.conds2 = dp->c.l.conds2;
michael@0:   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
michael@0:   next = NULL;
michael@0:   nextne = NULL;
michael@0:   nexteq = NULL;
michael@0:   morphcode = dp->morphcode;
michael@0:   contclass = dp->contclass;
michael@0:   contclasslen = dp->contclasslen;
michael@0: }
michael@0: 
michael@0: 
michael@0: PfxEntry::~PfxEntry()
michael@0: {
michael@0:     aflag = 0;
michael@0:     if (appnd) free(appnd);
michael@0:     if (strip) free(strip);
michael@0:     pmyMgr = NULL;
michael@0:     appnd = NULL;
michael@0:     strip = NULL;
michael@0:     if (opts & aeLONGCOND) free(c.l.conds2);
michael@0:     if (morphcode && !(opts & aeALIASM)) free(morphcode);
michael@0:     if (contclass && !(opts & aeALIASF)) free(contclass);
michael@0: }
michael@0: 
michael@0: // add prefix to this word assuming conditions hold
michael@0: char * PfxEntry::add(const char * word, int len)
michael@0: {
michael@0:     char tword[MAXWORDUTF8LEN + 4];
michael@0: 
michael@0:     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && 
michael@0:        (len >= numconds) && test_condition(word) &&
michael@0:        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
michael@0:        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
michael@0:     /* we have a match so add prefix */
michael@0:               char * pp = tword;
michael@0:               if (appndl) {
michael@0:                   strcpy(tword,appnd);
michael@0:                   pp += appndl;
michael@0:                }
michael@0:                strcpy(pp, (word + stripl));
michael@0:                return mystrdup(tword);
michael@0:      }
michael@0:      return NULL;
michael@0: }
michael@0: 
michael@0: inline char * PfxEntry::nextchar(char * p) {
michael@0:     if (p) {
michael@0:         p++;
michael@0:         if (opts & aeLONGCOND) {
michael@0:             // jump to the 2nd part of the condition
michael@0:             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
michael@0:         // end of the MAXCONDLEN length condition
michael@0:         } else if (p == c.conds + MAXCONDLEN) return NULL;
michael@0: 	return *p ? p : NULL;
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: inline int PfxEntry::test_condition(const char * st)
michael@0: {
michael@0:     const char * pos = NULL; // group with pos input position
michael@0:     bool neg = false;        // complementer
michael@0:     bool ingroup = false;    // character in the group
michael@0:     if (numconds == 0) return 1;
michael@0:     char * p = c.conds;
michael@0:     while (1) {
michael@0:       switch (*p) {
michael@0:         case '\0': return 1;
michael@0:         case '[': { 
michael@0:                 neg = false;
michael@0:                 ingroup = false;
michael@0:                 p = nextchar(p);
michael@0:                 pos = st; break;
michael@0:             }
michael@0:         case '^': { p = nextchar(p); neg = true; break; }
michael@0:         case ']': { 
michael@0:                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
michael@0:                 pos = NULL;
michael@0:                 p = nextchar(p);
michael@0:                 // skip the next character
michael@0:                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
michael@0:                 if (*st == '\0' && p) return 0; // word <= condition
michael@0:                 break;
michael@0:             }
michael@0:          case '.': if (!pos) { // dots are not metacharacters in groups: [.]
michael@0:                 p = nextchar(p);
michael@0:                 // skip the next character
michael@0:                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
michael@0:                 if (*st == '\0' && p) return 0; // word <= condition
michael@0:                 break;
michael@0:             }
michael@0:     default: {
michael@0:                 if (*st == *p) {
michael@0:                     st++;
michael@0:                     p = nextchar(p);
michael@0:                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
michael@0:                         while (p && (*p & 0xc0) == 0x80) {       // character
michael@0:                             if (*p != *st) {
michael@0:                                 if (!pos) return 0;
michael@0:                                 st = pos;
michael@0:                                 break;
michael@0:                             }
michael@0:                             p = nextchar(p);
michael@0:                             st++;
michael@0:                         }
michael@0:                         if (pos && st != pos) {
michael@0:                             ingroup = true;
michael@0:                             while (p && *p != ']' && (p = nextchar(p)));
michael@0:                         }
michael@0:                     } else if (pos) {
michael@0:                         ingroup = true;
michael@0:                         while (p && *p != ']' && (p = nextchar(p)));
michael@0:                     }
michael@0:                 } else if (pos) { // group
michael@0:                     p = nextchar(p);
michael@0:                 } else return 0;
michael@0:             }
michael@0:       }
michael@0:       if (!p) return 1;
michael@0:     }
michael@0: }
michael@0: 
michael@0: // check if this prefix entry matches
michael@0: struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;   // length of tmpword
michael@0:     struct hentry *     he;     // hash entry of root word or NULL
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0: 
michael@0:     // on entry prefix is 0 length or already matches the beginning of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:      tmpl = len - appndl;
michael@0: 
michael@0:      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
michael@0: 
michael@0:             // generate new root word by removing prefix and adding
michael@0:             // back any characters that would have been stripped
michael@0: 
michael@0:             if (stripl) strcpy (tmpword, strip);
michael@0:             strcpy ((tmpword + stripl), (word + appndl));
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then check if resulting
michael@0:             // root word in the dictionary
michael@0: 
michael@0:             if (test_condition(tmpword)) {
michael@0:                 tmpl += stripl;
michael@0:                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0:                    do {
michael@0:                       if (TESTAFF(he->astr, aflag, he->alen) &&
michael@0:                         // forbid single prefixes with needaffix flag
michael@0:                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
michael@0:                         // needflag
michael@0:                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
michael@0:                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
michael@0:                             return he;
michael@0:                       he = he->next_homonym; // check homonyms
michael@0:                    } while (he);
michael@0:                 }
michael@0: 
michael@0:                 // prefix matched but no root word was found
michael@0:                 // if aeXPRODUCT is allowed, try again but now
michael@0:                 // ross checked combined with a suffix
michael@0: 
michael@0:                 //if ((opts & aeXPRODUCT) && in_compound) {
michael@0:                 if ((opts & aeXPRODUCT)) {
michael@0:                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
michael@0:                         0, NULL, FLAG_NULL, needflag, in_compound);
michael@0:                    if (he) return he;
michael@0:                 }
michael@0:             }
michael@0:      }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // check if this prefix entry matches
michael@0: struct hentry * PfxEntry::check_twosfx(const char * word, int len,
michael@0:     char in_compound, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;   // length of tmpword
michael@0:     struct hentry *     he;     // hash entry of root word or NULL
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0: 
michael@0:     // on entry prefix is 0 length or already matches the beginning of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:      tmpl = len - appndl;
michael@0: 
michael@0:      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:         (tmpl + stripl >= numconds)) {
michael@0: 
michael@0:             // generate new root word by removing prefix and adding
michael@0:             // back any characters that would have been stripped
michael@0: 
michael@0:             if (stripl) strcpy (tmpword, strip);
michael@0:             strcpy ((tmpword + stripl), (word + appndl));
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then check if resulting
michael@0:             // root word in the dictionary
michael@0: 
michael@0:             if (test_condition(tmpword)) {
michael@0:                 tmpl += stripl;
michael@0: 
michael@0:                 // prefix matched but no root word was found
michael@0:                 // if aeXPRODUCT is allowed, try again but now
michael@0:                 // cross checked combined with a suffix
michael@0: 
michael@0:                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0:                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
michael@0:                    if (he) return he;
michael@0:                 }
michael@0:             }
michael@0:      }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // check if this prefix entry matches
michael@0: char * PfxEntry::check_twosfx_morph(const char * word, int len,
michael@0:          char in_compound, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;   // length of tmpword
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0: 
michael@0:     // on entry prefix is 0 length or already matches the beginning of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:      tmpl = len - appndl;
michael@0: 
michael@0:      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:         (tmpl + stripl >= numconds)) {
michael@0: 
michael@0:             // generate new root word by removing prefix and adding
michael@0:             // back any characters that would have been stripped
michael@0: 
michael@0:             if (stripl) strcpy (tmpword, strip);
michael@0:             strcpy ((tmpword + stripl), (word + appndl));
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then check if resulting
michael@0:             // root word in the dictionary
michael@0: 
michael@0:             if (test_condition(tmpword)) {
michael@0:                 tmpl += stripl;
michael@0: 
michael@0:                 // prefix matched but no root word was found
michael@0:                 // if aeXPRODUCT is allowed, try again but now
michael@0:                 // ross checked combined with a suffix
michael@0: 
michael@0:                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0:                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
michael@0:                              aeXPRODUCT, this, needflag);
michael@0:                 }
michael@0:             }
michael@0:      }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // check if this prefix entry matches
michael@0: char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;   // length of tmpword
michael@0:     struct hentry *     he;     // hash entry of root word or NULL
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0:     char                result[MAXLNLEN];
michael@0:     char * st;
michael@0: 
michael@0:     *result = '\0';
michael@0: 
michael@0:     // on entry prefix is 0 length or already matches the beginning of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:      tmpl = len - appndl;
michael@0: 
michael@0:      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:         (tmpl + stripl >= numconds)) {
michael@0: 
michael@0:             // generate new root word by removing prefix and adding
michael@0:             // back any characters that would have been stripped
michael@0: 
michael@0:             if (stripl) strcpy (tmpword, strip);
michael@0:             strcpy ((tmpword + stripl), (word + appndl));
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then check if resulting
michael@0:             // root word in the dictionary
michael@0: 
michael@0:             if (test_condition(tmpword)) {
michael@0:                 tmpl += stripl;
michael@0:                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0:                     do {
michael@0:                       if (TESTAFF(he->astr, aflag, he->alen) &&
michael@0:                         // forbid single prefixes with needaffix flag
michael@0:                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
michael@0:                         // needflag
michael@0:                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
michael@0:                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
michael@0:                             if (morphcode) {
michael@0:                                 mystrcat(result, " ", MAXLNLEN);
michael@0:                                 mystrcat(result, morphcode, MAXLNLEN);
michael@0:                             } else mystrcat(result,getKey(), MAXLNLEN);
michael@0:                             if (!HENTRY_FIND(he, MORPH_STEM)) {
michael@0:                                 mystrcat(result, " ", MAXLNLEN);
michael@0:                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
michael@0:                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
michael@0:                             }
michael@0:                             // store the pointer of the hash entry
michael@0:                             if (HENTRY_DATA(he)) {
michael@0:                                 mystrcat(result, " ", MAXLNLEN);
michael@0:                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
michael@0:                             } else {
michael@0:                                 // return with debug information
michael@0:                                 char * flag = pmyMgr->encode_flag(getFlag());
michael@0:                                 mystrcat(result, " ", MAXLNLEN);
michael@0:                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
michael@0:                                 mystrcat(result, flag, MAXLNLEN);
michael@0:                                 free(flag);
michael@0:                             }
michael@0:                             mystrcat(result, "\n", MAXLNLEN);
michael@0:                       }
michael@0:                       he = he->next_homonym;
michael@0:                     } while (he);
michael@0:                 }
michael@0: 
michael@0:                 // prefix matched but no root word was found
michael@0:                 // if aeXPRODUCT is allowed, try again but now
michael@0:                 // ross checked combined with a suffix
michael@0: 
michael@0:                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0:                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
michael@0:                      FLAG_NULL, needflag);
michael@0:                    if (st) {
michael@0:                         mystrcat(result, st, MAXLNLEN);
michael@0:                         free(st);
michael@0:                    }
michael@0:                 }
michael@0:             }
michael@0:      }
michael@0:     
michael@0:     if (*result) return mystrdup(result);
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
michael@0: {
michael@0:   // register affix manager
michael@0:   pmyMgr = pmgr;
michael@0: 
michael@0:   // set up its initial values
michael@0:   aflag = dp->aflag;         // char flag
michael@0:   strip = dp->strip;         // string to strip
michael@0:   appnd = dp->appnd;         // string to append
michael@0:   stripl = dp->stripl;       // length of strip string
michael@0:   appndl = dp->appndl;       // length of append string
michael@0:   numconds = dp->numconds;   // length of the condition
michael@0:   opts = dp->opts;           // cross product flag
michael@0: 
michael@0:   // then copy over all of the conditions
michael@0:   if (opts & aeLONGCOND) {
michael@0:     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
michael@0:     c.l.conds2 = dp->c.l.conds2;
michael@0:   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
michael@0:   next = NULL;
michael@0:   nextne = NULL;
michael@0:   nexteq = NULL;
michael@0:   rappnd = myrevstrdup(appnd);
michael@0:   morphcode = dp->morphcode;
michael@0:   contclass = dp->contclass;
michael@0:   contclasslen = dp->contclasslen;
michael@0: }
michael@0: 
michael@0: 
michael@0: SfxEntry::~SfxEntry()
michael@0: {
michael@0:     aflag = 0;
michael@0:     if (appnd) free(appnd);
michael@0:     if (rappnd) free(rappnd);
michael@0:     if (strip) free(strip);
michael@0:     pmyMgr = NULL;
michael@0:     appnd = NULL;
michael@0:     strip = NULL;
michael@0:     if (opts & aeLONGCOND) free(c.l.conds2);
michael@0:     if (morphcode && !(opts & aeALIASM)) free(morphcode);
michael@0:     if (contclass && !(opts & aeALIASF)) free(contclass);
michael@0: }
michael@0: 
michael@0: // add suffix to this word assuming conditions hold
michael@0: char * SfxEntry::add(const char * word, int len)
michael@0: {
michael@0:     char                tword[MAXWORDUTF8LEN + 4];
michael@0: 
michael@0:      /* make sure all conditions match */
michael@0:      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
michael@0:         (len >= numconds) && test_condition(word + len, word) &&
michael@0:         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
michael@0:         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
michael@0:               /* we have a match so add suffix */
michael@0:               strcpy(tword,word);
michael@0:               if (appndl) {
michael@0:                   strcpy(tword + len - stripl, appnd);
michael@0:               } else {
michael@0:                   *(tword + len - stripl) = '\0';
michael@0:               }
michael@0:               return mystrdup(tword);
michael@0:      }
michael@0:      return NULL;
michael@0: }
michael@0: 
michael@0: inline char * SfxEntry::nextchar(char * p) {
michael@0:     if (p) {
michael@0: 	p++;
michael@0: 	if (opts & aeLONGCOND) {
michael@0:     	    // jump to the 2nd part of the condition
michael@0:     	    if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
michael@0: 	// end of the MAXCONDLEN length condition
michael@0: 	} else if (p == c.conds + MAXCONDLEN) return NULL;
michael@0: 	return *p ? p : NULL;
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: inline int SfxEntry::test_condition(const char * st, const char * beg)
michael@0: {
michael@0:     const char * pos = NULL;    // group with pos input position
michael@0:     bool neg = false;           // complementer
michael@0:     bool ingroup = false;       // character in the group
michael@0:     if (numconds == 0) return 1;
michael@0:     char * p = c.conds;
michael@0:     st--;
michael@0:     int i = 1;
michael@0:     while (1) {
michael@0:       switch (*p) {
michael@0:         case '\0': return 1;
michael@0:         case '[': { p = nextchar(p); pos = st; break; }
michael@0:         case '^': { p = nextchar(p); neg = true; break; }
michael@0:         case ']': { if (!neg && !ingroup) return 0;
michael@0:                 i++;
michael@0:                 // skip the next character
michael@0:                 if (!ingroup) {
michael@0:                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
michael@0:                     st--;
michael@0:                 }                    
michael@0:                 pos = NULL;
michael@0:                 neg = false;
michael@0:                 ingroup = false;
michael@0:                 p = nextchar(p);
michael@0:                 if (st < beg && p) return 0; // word <= condition
michael@0:                 break;
michael@0:             }
michael@0:         case '.': if (!pos) { // dots are not metacharacters in groups: [.]
michael@0:                 p = nextchar(p);
michael@0:                 // skip the next character
michael@0:                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
michael@0:                 if (st < beg) { // word <= condition
michael@0: 		    if (p) return 0; else return 1;
michael@0: 		}
michael@0:                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
michael@0:                     st--;
michael@0:                     if (st < beg) { // word <= condition
michael@0: 			if (p) return 0; else return 1;
michael@0: 		    }
michael@0:                 }
michael@0:                 break;
michael@0:             }
michael@0:     default: {
michael@0:                 if (*st == *p) {
michael@0:                     p = nextchar(p);
michael@0:                     if ((opts & aeUTF8) && (*st & 0x80)) {
michael@0:                         st--;
michael@0:                         while (p && (st >= beg)) {
michael@0:                             if (*p != *st) {
michael@0:                                 if (!pos) return 0;
michael@0:                                 st = pos;
michael@0:                                 break;
michael@0:                             }
michael@0:                             // first byte of the UTF-8 multibyte character
michael@0:                             if ((*p & 0xc0) != 0x80) break;
michael@0:                             p = nextchar(p);
michael@0:                             st--;
michael@0:                         }
michael@0:                         if (pos && st != pos) {
michael@0:                             if (neg) return 0;
michael@0:                             else if (i == numconds) return 1;
michael@0:                             ingroup = true;
michael@0:                             while (p && *p != ']' && (p = nextchar(p)));
michael@0: 			    st--;
michael@0:                         }
michael@0:                         if (p && *p != ']') p = nextchar(p);
michael@0:                     } else if (pos) {
michael@0:                         if (neg) return 0;
michael@0:                         else if (i == numconds) return 1;
michael@0:                         ingroup = true;
michael@0: 			while (p && *p != ']' && (p = nextchar(p)))
michael@0:                           ;
michael@0: //			if (p && *p != ']') p = nextchar(p);
michael@0:                         st--;
michael@0:                     }
michael@0:                     if (!pos) {
michael@0:                         i++;
michael@0:                         st--;
michael@0:                     }
michael@0:                     if (st < beg && p && *p != ']') return 0; // word <= condition
michael@0:                 } else if (pos) { // group
michael@0:                     p = nextchar(p);
michael@0:                 } else return 0;
michael@0:             }
michael@0:       }
michael@0:       if (!p) return 1;
michael@0:     }
michael@0: }
michael@0: 
michael@0: // Check whether removing this suffix from the end of the word yields a root
michael@0: // that is in the dictionary and is allowed to take this suffix.
michael@0: //
michael@0: //   word, len        - candidate word and its length
michael@0: //   optflags         - aeXPRODUCT is set when cross checking with a prefix
michael@0: //   ppfx             - prefix entry used in the cross check (may be NULL)
michael@0: //   wlst, maxSug, ns - optional root list filled by the obsolete stemming code
michael@0: //   cclass           - if set, flag required in this suffix's contclass
michael@0: //   needflag         - if set, flag required on the root or in contclass
michael@0: //   badflag          - if set, homonyms carrying this flag are skipped
michael@0: //
michael@0: // Returns the first matching homonym or NULL.  If duplicating a root for wlst
michael@0: // fails, the entries collected so far are freed and *ns is set to -1.
michael@0: struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
michael@0:     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
michael@0:     const FLAG badflag)
michael@0: {
michael@0:     int                 tmpl;            // length of tmpword
michael@0:     struct hentry *     he;              // hash entry pointer
michael@0:     unsigned char *     cp;
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0:     PfxEntry* ep = ppfx;
michael@0: 
michael@0:     // if this suffix is being cross checked with a prefix
michael@0:     // but it does not support cross products skip it
michael@0: 
michael@0:     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
michael@0:         return NULL;
michael@0: 
michael@0:     // upon entry suffix is 0 length or already matches the end of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:     tmpl = len - appndl;
michael@0:     // the second condition is not enough for UTF-8 strings
michael@0:     // it checked in test_condition()
michael@0: 
michael@0:     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:         (tmpl + stripl >= numconds) &&
michael@0:         // the rebuilt root and its terminating NUL must fit into tmpword,
michael@0:         // otherwise the copies below would overflow the stack buffer
michael@0:         (tmpl + stripl < (int) sizeof(tmpword))) {
michael@0: 
michael@0:             // generate new root word by removing suffix and adding
michael@0:             // back any characters that would have been stripped or
michael@0:             // null terminating the shorter string
michael@0: 
michael@0:             // only the first tmpl bytes of the word are kept, so copy no more
michael@0:             strncpy (tmpword, word, (size_t) tmpl);
michael@0:             cp = (unsigned char *)(tmpword + tmpl);
michael@0:             if (stripl) {
michael@0:                 // NOTE(review): relies on strlen(strip) == stripl - confirm
michael@0:                 strcpy ((char *)cp, strip);
michael@0:                 tmpl += stripl;
michael@0:                 cp = (unsigned char *)(tmpword + tmpl);
michael@0:             } else *cp = '\0';
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then check if resulting
michael@0:             // root word in the dictionary
michael@0: 
michael@0:             if (test_condition((char *) cp, (char *) tmpword)) {
michael@0: 
michael@0: #ifdef SZOSZABLYA_POSSIBLE_ROOTS
michael@0:                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
michael@0: #endif
michael@0:                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0:                     do {
michael@0:                         // check conditional suffix (enabled by prefix)
michael@0:                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
michael@0:                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
michael@0:                             (((optflags & aeXPRODUCT) == 0) ||
michael@0:                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
michael@0:                              // enabled by prefix
michael@0:                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
michael@0:                             ) &&
michael@0:                             // handle cont. class
michael@0:                             ((!cclass) ||
michael@0:                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
michael@0:                             ) &&
michael@0:                             // check only in compound homonyms (bad flags)
michael@0:                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
michael@0:                             ) &&
michael@0:                             // handle required flag
michael@0:                             ((!needflag) ||
michael@0:                               (TESTAFF(he->astr, needflag, he->alen) ||
michael@0:                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
michael@0:                             )
michael@0:                         ) return he;
michael@0:                         he = he->next_homonym; // check homonyms
michael@0:                     } while (he);
michael@0: 
michael@0:                 // obsolete stemming code (used only by the
michael@0:                 // experimental SuffixMgr:suggest_pos_stems)
michael@0:                 // store resulting root in wlst
michael@0:                 } else if (wlst && (*ns < maxSug)) {
michael@0:                     int cwrd = 1;
michael@0:                     for (int k=0; k < *ns; k++) {
michael@0:                         if (strcmp(tmpword, wlst[k]) == 0) {
michael@0:                             cwrd = 0;   // root is already in the list
michael@0:                             break;
michael@0:                         }
michael@0:                     }
michael@0:                     if (cwrd) {
michael@0:                         wlst[*ns] = mystrdup(tmpword);
michael@0:                         if (wlst[*ns] == NULL) {
michael@0:                             // out of memory: drop the whole list and report it via *ns
michael@0:                             for (int j=0; j<*ns; j++) free(wlst[j]);
michael@0:                             *ns = -1;
michael@0:                             return NULL;
michael@0:                         }
michael@0:                         (*ns)++;
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // see if two-level suffix is present in the word
michael@0: struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
michael@0:     PfxEntry* ppfx, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;            // length of tmpword
michael@0:     struct hentry *     he;              // hash entry pointer
michael@0:     unsigned char *     cp;
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0:     PfxEntry* ep = ppfx;
michael@0: 
michael@0: 
michael@0:     // if this suffix is being cross checked with a prefix
michael@0:     // but it does not support cross products skip it
michael@0: 
michael@0:     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
michael@0:         return NULL;
michael@0: 
michael@0:     // upon entry suffix is 0 length or already matches the end of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:     tmpl = len - appndl;
michael@0: 
michael@0:     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:        (tmpl + stripl >= numconds)) {
michael@0: 
michael@0:             // generate new root word by removing suffix and adding
michael@0:             // back any characters that would have been stripped or
michael@0:             // or null terminating the shorter string
michael@0: 
michael@0:             strcpy (tmpword, word);
michael@0:             cp = (unsigned char *)(tmpword + tmpl);
michael@0:             if (stripl) {
michael@0:                 strcpy ((char *)cp, strip);
michael@0:                 tmpl += stripl;
michael@0:                 cp = (unsigned char *)(tmpword + tmpl);
michael@0:             } else *cp = '\0';
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then recall suffix_check
michael@0: 
michael@0:             if (test_condition((char *) cp, (char *) tmpword)) {
michael@0:                 if (ppfx) {
michael@0:                     // handle conditional suffix
michael@0:                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
michael@0:                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0:                     else
michael@0:                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0:                 } else {
michael@0:                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0:                 }
michael@0:                 if (he) return he;
michael@0:             }
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // Morphological variant of the two-level suffix check: strip this suffix,
michael@0: // restore the stripped characters and, if the character conditions hold,
michael@0: // collect the output of the manager's suffix_check_morph() for the
michael@0: // intermediate form.
michael@0: //
michael@0: //   word, len - candidate word and its length
michael@0: //   optflags  - aeXPRODUCT is set when cross checking with a prefix
michael@0: //   ppfx      - prefix entry used in the cross check (may be NULL)
michael@0: //   needflag  - passed through unchanged to suffix_check_morph()
michael@0: //
michael@0: // Returns a mystrdup() copy of the collected text, or NULL when nothing was
michael@0: // produced.
michael@0: char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
michael@0:     PfxEntry* ppfx, const FLAG needflag)
michael@0: {
michael@0:     int                 tmpl;            // length of tmpword
michael@0:     unsigned char *     cp;
michael@0:     char                tmpword[MAXWORDUTF8LEN + 4];
michael@0:     PfxEntry* ep = ppfx;
michael@0:     char * st;
michael@0: 
michael@0:     char result[MAXLNLEN];
michael@0: 
michael@0:     *result = '\0';
michael@0: 
michael@0:     // if this suffix is being cross checked with a prefix
michael@0:     // but it does not support cross products skip it
michael@0: 
michael@0:     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
michael@0:         return NULL;
michael@0: 
michael@0:     // upon entry suffix is 0 length or already matches the end of the word.
michael@0:     // So if the remaining root word has positive length
michael@0:     // and if there are enough chars in root word and added back strip chars
michael@0:     // to meet the number of characters conditions, then test it
michael@0: 
michael@0:     tmpl = len - appndl;
michael@0: 
michael@0:     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0:        (tmpl + stripl >= numconds) &&
michael@0:        // the rebuilt form and its terminating NUL must fit into tmpword,
michael@0:        // otherwise the copies below would overflow the stack buffer
michael@0:        (tmpl + stripl < (int) sizeof(tmpword))) {
michael@0: 
michael@0:             // generate new root word by removing suffix and adding
michael@0:             // back any characters that would have been stripped or
michael@0:             // null terminating the shorter string
michael@0: 
michael@0:             // only the first tmpl bytes of the word are kept, so copy no more
michael@0:             strncpy (tmpword, word, (size_t) tmpl);
michael@0:             cp = (unsigned char *)(tmpword + tmpl);
michael@0:             if (stripl) {
michael@0:                 // NOTE(review): relies on strlen(strip) == stripl - confirm
michael@0:                 strcpy ((char *)cp, strip);
michael@0:                 tmpl += stripl;
michael@0:                 cp = (unsigned char *)(tmpword + tmpl);
michael@0:             } else *cp = '\0';
michael@0: 
michael@0:             // now make sure all of the conditions on characters
michael@0:             // are met.  Please see the appendix at the end of
michael@0:             // this file for more info on exactly what is being
michael@0:             // tested
michael@0: 
michael@0:             // if all conditions are met then recall suffix_check
michael@0: 
michael@0:             if (test_condition((char *) cp, (char *) tmpword)) {
michael@0:                 if (ppfx) {
michael@0:                     // handle conditional suffix
michael@0:                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
michael@0:                         // the prefix flag is in this suffix's contclass: run
michael@0:                         // the inner check without the prefix and put the
michael@0:                         // prefix's own morph text in front of the result
michael@0:                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
michael@0:                         if (st) {
michael@0:                             if (ppfx->getMorph()) {
michael@0:                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
michael@0:                                 mystrcat(result, " ", MAXLNLEN);
michael@0:                             }
michael@0:                             mystrcat(result,st, MAXLNLEN);
michael@0:                             free(st);
michael@0:                             mychomp(result);
michael@0:                         }
michael@0:                     } else {
michael@0:                         // otherwise the prefix takes part in the inner check
michael@0:                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
michael@0:                         if (st) {
michael@0:                             mystrcat(result, st, MAXLNLEN);
michael@0:                             free(st);
michael@0:                             mychomp(result);
michael@0:                         }
michael@0:                     }
michael@0:                 } else {
michael@0:                         // no prefix involved
michael@0:                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
michael@0:                         if (st) {
michael@0:                             mystrcat(result, st, MAXLNLEN);
michael@0:                             free(st);
michael@0:                             mychomp(result);
michael@0:                         }
michael@0:                 }
michael@0:                 if (*result) return mystrdup(result);
michael@0:             }
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // Walk the homonym chain that follows `he` and return the first entry that
michael@0: // may take this suffix under the given constraints, or NULL if none does.
michael@0: //
michael@0: //   he        - current entry; the search starts at he->next_homonym
michael@0: //   optflags  - aeXPRODUCT is set when cross checking with a prefix
michael@0: //   ppfx      - prefix entry used in the cross check (may be NULL)
michael@0: //   cclass    - if set, flag required in this suffix's contclass
michael@0: //   needflag  - if set, flag required on the homonym or in contclass
michael@0: struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
michael@0:     const FLAG cclass, const FLAG needflag)
michael@0: {
michael@0:     const FLAG pfxFlag = ppfx ? ppfx->getFlag() : FLAG_NULL;
michael@0: 
michael@0:     for (he = he->next_homonym; he; he = he->next_homonym) {
michael@0:         // the suffix must be enabled by the homonym itself or by the
michael@0:         // continuation class of the prefix
michael@0:         if (!(TESTAFF(he->astr, aflag, he->alen) ||
michael@0:               (ppfx && ppfx->getCont() &&
michael@0:                TESTAFF(ppfx->getCont(), aflag, ppfx->getContLen()))))
michael@0:             continue;
michael@0: 
michael@0:         // in a cross product the prefix flag must be on the homonym or in
michael@0:         // this suffix's continuation class (conditional suffix)
michael@0:         if ((optflags & aeXPRODUCT) != 0 &&
michael@0:             !TESTAFF(he->astr, pfxFlag, he->alen) &&
michael@0:             !((contclass) && TESTAFF(contclass, pfxFlag, contclasslen)))
michael@0:             continue;
michael@0: 
michael@0:         // handle cont. class
michael@0:         if (cclass &&
michael@0:             !((contclass) && TESTAFF(contclass, cclass, contclasslen)))
michael@0:             continue;
michael@0: 
michael@0:         // handle required flag
michael@0:         if (needflag &&
michael@0:             !TESTAFF(he->astr, needflag, he->alen) &&
michael@0:             !((contclass) && TESTAFF(contclass, needflag, contclasslen)))
michael@0:             continue;
michael@0: 
michael@0:         return he;
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: 
michael@0: #if 0
michael@0: 
michael@0: Appendix:  Understanding Affix Code
michael@0: 
michael@0: 
michael@0: An affix is either a  prefix or a suffix attached to root words to make 
michael@0: other words.
michael@0: 
michael@0: Basically a Prefix or a Suffix is a set of AffEntry objects
michael@0: which store information about the prefix or suffix along 
michael@0: with supporting routines to check if a word has a particular 
michael@0: prefix or suffix or a combination.
michael@0: 
michael@0: The structure affentry is defined as follows:
michael@0: 
michael@0: struct affentry
michael@0: {
michael@0:    unsigned short aflag;    // ID used to represent the affix
michael@0:    char * strip;            // string to strip before adding affix
michael@0:    char * appnd;            // the affix string to add
michael@0:    unsigned char stripl;    // length of the strip string
michael@0:    unsigned char appndl;    // length of the affix string
michael@0:    char numconds;           // the number of conditions that must be met
michael@0:    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix 
michael@0:    char   conds[SETSIZE];   // array which encodes the conditions to be met
michael@0: };
michael@0: 
michael@0: 
michael@0: Here is a suffix borrowed from the en_US.aff file.  This file 
michael@0: is whitespace delimited.
michael@0: 
michael@0: SFX D Y 4 
michael@0: SFX D   0     e          d
michael@0: SFX D   y     ied        [^aeiou]y
michael@0: SFX D   0     ed         [^ey]
michael@0: SFX D   0     ed         [aeiou]y
michael@0: 
michael@0: This information can be interpreted as follows:
michael@0: 
michael@0: The first line has 4 fields
michael@0: 
michael@0: Field
michael@0: -----
michael@0: 1     SFX - indicates this is a suffix
michael@0: 2     D   - is the name of the character flag which represents this suffix
michael@0: 3     Y   - indicates it can be combined with prefixes (cross product)
michael@0: 4     4   - indicates that a sequence of 4 affentry structures is needed to
michael@0:                properly store the affix information
michael@0: 
michael@0: The remaining lines describe the unique information for the 4 SfxEntry 
michael@0: objects that make up this affix.  Each line can be interpreted
michael@0: as follows: (note fields 1 and 2 are as a check against line 1 info)
michael@0: 
michael@0: Field
michael@0: -----
michael@0: 1     SFX         - indicates this is a suffix
michael@0: 2     D           - is the name of the character flag for this affix
michael@0: 3     y           - the string of chars to strip off before adding affix
michael@0:                          (a 0 here indicates the NULL string)
michael@0: 4     ied         - the string of affix characters to add
michael@0: 5     [^aeiou]y   - the conditions which must be met before the affix
michael@0:                     can be applied
michael@0: 
michael@0: Field 5 is interesting.  Since this is a suffix, field 5 tells us that
michael@0: there are 2 conditions that must be met.  The first condition is that 
michael@0: the next to the last character in the word must *NOT* be any of the 
michael@0: following "a", "e", "i", "o" or "u".  The second condition is that
michael@0: the last character of the word must end in "y".
michael@0: 
michael@0: So how can we encode this information concisely and be able to 
michael@0: test for both conditions in a fast manner?  The answer is found
michael@0: by studying the wonderful ispell code of Geoff Kuenning, et al.
michael@0: (now available under a normal BSD license).
michael@0: 
michael@0: If we set up a conds array of 256 bytes indexed (0 to 255) and access it
michael@0: using a character (cast to an unsigned char) of a string, we have 8 bits
michael@0: of information we can store about that character.  Specifically we
michael@0: could use each bit to say if that character is allowed in any of the 
michael@0: last (or first for prefixes) 8 characters of the word.
michael@0: 
michael@0: Basically, each character at one end of the word (up to the number 
michael@0: of conditions) is used to index into the conds array and the resulting 
michael@0: value found there says whether that character is valid for a
michael@0: specific character position in the word.  
michael@0: 
michael@0: For prefixes, it does this by setting bit 0 if that char is valid 
michael@0: in the first position, bit 1 if valid in the second position, and so on. 
michael@0: 
michael@0: If a bit is not set, then that char is not valid for that position in the
michael@0: word.
michael@0: 
michael@0: If working with suffixes bit 0 is used for the character closest 
michael@0: to the front, bit 1 for the next character towards the end, ..., 
michael@0: with bit numconds-1 representing the last char at the end of the string. 
michael@0: 
michael@0: Note: since entries in the conds[] are 8 bits, only 8 conditions 
michael@0: (read that only 8 character positions) can be examined at one
michael@0: end of a word (the beginning for prefixes and the end for suffixes).
michael@0: 
michael@0: So to make this clearer, let's encode the conds array values for the
michael@0: first two affentries for the suffix D described earlier.
michael@0: 
michael@0: 
michael@0:   For the first affentry:    
michael@0:      numconds = 1             (only examine the last character)
michael@0: 
michael@0:      conds['e'] =  (1 << 0)   (the word must end in an E)
michael@0:      all others are all 0
michael@0: 
michael@0:   For the second affentry:
michael@0:      numconds = 2             (only examine the last two characters)     
michael@0: 
michael@0:      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
michael@0:          where X is all characters *but* a, e, i, o, or u
michael@0:          
michael@0: 
michael@0:      conds['y'] = (1 << 1)     (the last char must be a y)
michael@0:      all other bits for all other entries in the conds array are zero
michael@0: 
michael@0: 
michael@0: #endif
michael@0: