extensions/spellcheck/hunspell/src/affentry.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /******* BEGIN LICENSE BLOCK *******
     2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
     3  * 
     4  * The contents of this file are subject to the Mozilla Public License Version
     5  * 1.1 (the "License"); you may not use this file except in compliance with
     6  * the License. You may obtain a copy of the License at
     7  * http://www.mozilla.org/MPL/
     8  * 
     9  * Software distributed under the License is distributed on an "AS IS" basis,
    10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
    11  * for the specific language governing rights and limitations under the
    12  * License.
    13  * 
    14  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
    15  * and László Németh (Hunspell). Portions created by the Initial Developers
    16  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
    17  * 
    18  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
    19  *                 David Einstein (deinst@world.std.com)
    20  *                 László Németh (nemethl@gyorsposta.hu)
    21  *                 Caolan McNamara (caolanm@redhat.com)
    22  *                 Davide Prina
    23  *                 Giuseppe Modugno
    24  *                 Gianluca Turconi
    25  *                 Simon Brouwer
    26  *                 Noll Janos
    27  *                 Biro Arpad
    28  *                 Goldman Eleonora
    29  *                 Sarlos Tamas
    30  *                 Bencsath Boldizsar
    31  *                 Halacsy Peter
    32  *                 Dvornik Laszlo
    33  *                 Gefferth Andras
    34  *                 Nagy Viktor
    35  *                 Varga Daniel
    36  *                 Chris Halls
    37  *                 Rene Engelhard
    38  *                 Bram Moolenaar
    39  *                 Dafydd Jones
    40  *                 Harri Pitkanen
    41  *                 Andras Timar
    42  *                 Tor Lillqvist
    43  * 
    44  * Alternatively, the contents of this file may be used under the terms of
    45  * either the GNU General Public License Version 2 or later (the "GPL"), or
    46  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
    47  * in which case the provisions of the GPL or the LGPL are applicable instead
    48  * of those above. If you wish to allow use of your version of this file only
    49  * under the terms of either the GPL or the LGPL, and not to allow others to
    50  * use your version of this file under the terms of the MPL, indicate your
    51  * decision by deleting the provisions above and replace them with the notice
    52  * and other provisions required by the GPL or the LGPL. If you do not delete
    53  * the provisions above, a recipient may use your version of this file under
    54  * the terms of any one of the MPL, the GPL or the LGPL.
    55  *
    56  ******* END LICENSE BLOCK *******/
    58 #include <stdlib.h>
    59 #include <string.h>
    60 #include <stdio.h>
    61 #include <ctype.h>
    63 #include "affentry.hxx"
    64 #include "csutil.hxx"
    66 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
    67 {
    68   // register affix manager
    69   pmyMgr = pmgr;
    71   // set up its initial values
    73   aflag = dp->aflag;         // flag
    74   strip = dp->strip;         // string to strip
    75   appnd = dp->appnd;         // string to append
    76   stripl = dp->stripl;       // length of strip string
    77   appndl = dp->appndl;       // length of append string
    78   numconds = dp->numconds;   // length of the condition
    79   opts = dp->opts;           // cross product flag
    80   // then copy over all of the conditions
    81   if (opts & aeLONGCOND) {
    82     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
    83     c.l.conds2 = dp->c.l.conds2;
    84   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
    85   next = NULL;
    86   nextne = NULL;
    87   nexteq = NULL;
    88   morphcode = dp->morphcode;
    89   contclass = dp->contclass;
    90   contclasslen = dp->contclasslen;
    91 }
    94 PfxEntry::~PfxEntry()
    95 {
    96     aflag = 0;
    97     if (appnd) free(appnd);
    98     if (strip) free(strip);
    99     pmyMgr = NULL;
   100     appnd = NULL;
   101     strip = NULL;
   102     if (opts & aeLONGCOND) free(c.l.conds2);
   103     if (morphcode && !(opts & aeALIASM)) free(morphcode);
   104     if (contclass && !(opts & aeALIASF)) free(contclass);
   105 }
   107 // add prefix to this word assuming conditions hold
   108 char * PfxEntry::add(const char * word, int len)
   109 {
   110     char tword[MAXWORDUTF8LEN + 4];
   112     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && 
   113        (len >= numconds) && test_condition(word) &&
   114        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
   115        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
   116     /* we have a match so add prefix */
   117               char * pp = tword;
   118               if (appndl) {
   119                   strcpy(tword,appnd);
   120                   pp += appndl;
   121                }
   122                strcpy(pp, (word + stripl));
   123                return mystrdup(tword);
   124      }
   125      return NULL;
   126 }
   128 inline char * PfxEntry::nextchar(char * p) {
   129     if (p) {
   130         p++;
   131         if (opts & aeLONGCOND) {
   132             // jump to the 2nd part of the condition
   133             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
   134         // end of the MAXCONDLEN length condition
   135         } else if (p == c.conds + MAXCONDLEN) return NULL;
   136 	return *p ? p : NULL;
   137     }
   138     return NULL;
   139 }
   141 inline int PfxEntry::test_condition(const char * st)
   142 {
   143     const char * pos = NULL; // group with pos input position
   144     bool neg = false;        // complementer
   145     bool ingroup = false;    // character in the group
   146     if (numconds == 0) return 1;
   147     char * p = c.conds;
   148     while (1) {
   149       switch (*p) {
   150         case '\0': return 1;
   151         case '[': { 
   152                 neg = false;
   153                 ingroup = false;
   154                 p = nextchar(p);
   155                 pos = st; break;
   156             }
   157         case '^': { p = nextchar(p); neg = true; break; }
   158         case ']': { 
   159                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
   160                 pos = NULL;
   161                 p = nextchar(p);
   162                 // skip the next character
   163                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
   164                 if (*st == '\0' && p) return 0; // word <= condition
   165                 break;
   166             }
   167          case '.': if (!pos) { // dots are not metacharacters in groups: [.]
   168                 p = nextchar(p);
   169                 // skip the next character
   170                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
   171                 if (*st == '\0' && p) return 0; // word <= condition
   172                 break;
   173             }
   174     default: {
   175                 if (*st == *p) {
   176                     st++;
   177                     p = nextchar(p);
   178                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
   179                         while (p && (*p & 0xc0) == 0x80) {       // character
   180                             if (*p != *st) {
   181                                 if (!pos) return 0;
   182                                 st = pos;
   183                                 break;
   184                             }
   185                             p = nextchar(p);
   186                             st++;
   187                         }
   188                         if (pos && st != pos) {
   189                             ingroup = true;
   190                             while (p && *p != ']' && (p = nextchar(p)));
   191                         }
   192                     } else if (pos) {
   193                         ingroup = true;
   194                         while (p && *p != ']' && (p = nextchar(p)));
   195                     }
   196                 } else if (pos) { // group
   197                     p = nextchar(p);
   198                 } else return 0;
   199             }
   200       }
   201       if (!p) return 1;
   202     }
   203 }
   205 // check if this prefix entry matches
   206 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
   207 {
   208     int                 tmpl;   // length of tmpword
   209     struct hentry *     he;     // hash entry of root word or NULL
   210     char                tmpword[MAXWORDUTF8LEN + 4];
   212     // on entry prefix is 0 length or already matches the beginning of the word.
   213     // So if the remaining root word has positive length
   214     // and if there are enough chars in root word and added back strip chars
   215     // to meet the number of characters conditions, then test it
   217      tmpl = len - appndl;
   219      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
   221             // generate new root word by removing prefix and adding
   222             // back any characters that would have been stripped
   224             if (stripl) strcpy (tmpword, strip);
   225             strcpy ((tmpword + stripl), (word + appndl));
   227             // now make sure all of the conditions on characters
   228             // are met.  Please see the appendix at the end of
   229             // this file for more info on exactly what is being
   230             // tested
   232             // if all conditions are met then check if resulting
   233             // root word in the dictionary
   235             if (test_condition(tmpword)) {
   236                 tmpl += stripl;
   237                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
   238                    do {
   239                       if (TESTAFF(he->astr, aflag, he->alen) &&
   240                         // forbid single prefixes with needaffix flag
   241                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
   242                         // needflag
   243                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
   244                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
   245                             return he;
   246                       he = he->next_homonym; // check homonyms
   247                    } while (he);
   248                 }
   250                 // prefix matched but no root word was found
   251                 // if aeXPRODUCT is allowed, try again but now
   252                 // ross checked combined with a suffix
   254                 //if ((opts & aeXPRODUCT) && in_compound) {
   255                 if ((opts & aeXPRODUCT)) {
   256                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
   257                         0, NULL, FLAG_NULL, needflag, in_compound);
   258                    if (he) return he;
   259                 }
   260             }
   261      }
   262     return NULL;
   263 }
   265 // check if this prefix entry matches
   266 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
   267     char in_compound, const FLAG needflag)
   268 {
   269     int                 tmpl;   // length of tmpword
   270     struct hentry *     he;     // hash entry of root word or NULL
   271     char                tmpword[MAXWORDUTF8LEN + 4];
   273     // on entry prefix is 0 length or already matches the beginning of the word.
   274     // So if the remaining root word has positive length
   275     // and if there are enough chars in root word and added back strip chars
   276     // to meet the number of characters conditions, then test it
   278      tmpl = len - appndl;
   280      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   281         (tmpl + stripl >= numconds)) {
   283             // generate new root word by removing prefix and adding
   284             // back any characters that would have been stripped
   286             if (stripl) strcpy (tmpword, strip);
   287             strcpy ((tmpword + stripl), (word + appndl));
   289             // now make sure all of the conditions on characters
   290             // are met.  Please see the appendix at the end of
   291             // this file for more info on exactly what is being
   292             // tested
   294             // if all conditions are met then check if resulting
   295             // root word in the dictionary
   297             if (test_condition(tmpword)) {
   298                 tmpl += stripl;
   300                 // prefix matched but no root word was found
   301                 // if aeXPRODUCT is allowed, try again but now
   302                 // cross checked combined with a suffix
   304                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
   305                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
   306                    if (he) return he;
   307                 }
   308             }
   309      }
   310     return NULL;
   311 }
   313 // check if this prefix entry matches
   314 char * PfxEntry::check_twosfx_morph(const char * word, int len,
   315          char in_compound, const FLAG needflag)
   316 {
   317     int                 tmpl;   // length of tmpword
   318     char                tmpword[MAXWORDUTF8LEN + 4];
   320     // on entry prefix is 0 length or already matches the beginning of the word.
   321     // So if the remaining root word has positive length
   322     // and if there are enough chars in root word and added back strip chars
   323     // to meet the number of characters conditions, then test it
   325      tmpl = len - appndl;
   327      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   328         (tmpl + stripl >= numconds)) {
   330             // generate new root word by removing prefix and adding
   331             // back any characters that would have been stripped
   333             if (stripl) strcpy (tmpword, strip);
   334             strcpy ((tmpword + stripl), (word + appndl));
   336             // now make sure all of the conditions on characters
   337             // are met.  Please see the appendix at the end of
   338             // this file for more info on exactly what is being
   339             // tested
   341             // if all conditions are met then check if resulting
   342             // root word in the dictionary
   344             if (test_condition(tmpword)) {
   345                 tmpl += stripl;
   347                 // prefix matched but no root word was found
   348                 // if aeXPRODUCT is allowed, try again but now
   349                 // ross checked combined with a suffix
   351                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
   352                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
   353                              aeXPRODUCT, this, needflag);
   354                 }
   355             }
   356      }
   357     return NULL;
   358 }
   360 // check if this prefix entry matches
   361 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
   362 {
   363     int                 tmpl;   // length of tmpword
   364     struct hentry *     he;     // hash entry of root word or NULL
   365     char                tmpword[MAXWORDUTF8LEN + 4];
   366     char                result[MAXLNLEN];
   367     char * st;
   369     *result = '\0';
   371     // on entry prefix is 0 length or already matches the beginning of the word.
   372     // So if the remaining root word has positive length
   373     // and if there are enough chars in root word and added back strip chars
   374     // to meet the number of characters conditions, then test it
   376      tmpl = len - appndl;
   378      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   379         (tmpl + stripl >= numconds)) {
   381             // generate new root word by removing prefix and adding
   382             // back any characters that would have been stripped
   384             if (stripl) strcpy (tmpword, strip);
   385             strcpy ((tmpword + stripl), (word + appndl));
   387             // now make sure all of the conditions on characters
   388             // are met.  Please see the appendix at the end of
   389             // this file for more info on exactly what is being
   390             // tested
   392             // if all conditions are met then check if resulting
   393             // root word in the dictionary
   395             if (test_condition(tmpword)) {
   396                 tmpl += stripl;
   397                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
   398                     do {
   399                       if (TESTAFF(he->astr, aflag, he->alen) &&
   400                         // forbid single prefixes with needaffix flag
   401                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
   402                         // needflag
   403                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
   404                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
   405                             if (morphcode) {
   406                                 mystrcat(result, " ", MAXLNLEN);
   407                                 mystrcat(result, morphcode, MAXLNLEN);
   408                             } else mystrcat(result,getKey(), MAXLNLEN);
   409                             if (!HENTRY_FIND(he, MORPH_STEM)) {
   410                                 mystrcat(result, " ", MAXLNLEN);
   411                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
   412                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
   413                             }
   414                             // store the pointer of the hash entry
   415                             if (HENTRY_DATA(he)) {
   416                                 mystrcat(result, " ", MAXLNLEN);
   417                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
   418                             } else {
   419                                 // return with debug information
   420                                 char * flag = pmyMgr->encode_flag(getFlag());
   421                                 mystrcat(result, " ", MAXLNLEN);
   422                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
   423                                 mystrcat(result, flag, MAXLNLEN);
   424                                 free(flag);
   425                             }
   426                             mystrcat(result, "\n", MAXLNLEN);
   427                       }
   428                       he = he->next_homonym;
   429                     } while (he);
   430                 }
   432                 // prefix matched but no root word was found
   433                 // if aeXPRODUCT is allowed, try again but now
   434                 // ross checked combined with a suffix
   436                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
   437                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
   438                      FLAG_NULL, needflag);
   439                    if (st) {
   440                         mystrcat(result, st, MAXLNLEN);
   441                         free(st);
   442                    }
   443                 }
   444             }
   445      }
   447     if (*result) return mystrdup(result);
   448     return NULL;
   449 }
   451 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
   452 {
   453   // register affix manager
   454   pmyMgr = pmgr;
   456   // set up its initial values
   457   aflag = dp->aflag;         // char flag
   458   strip = dp->strip;         // string to strip
   459   appnd = dp->appnd;         // string to append
   460   stripl = dp->stripl;       // length of strip string
   461   appndl = dp->appndl;       // length of append string
   462   numconds = dp->numconds;   // length of the condition
   463   opts = dp->opts;           // cross product flag
   465   // then copy over all of the conditions
   466   if (opts & aeLONGCOND) {
   467     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
   468     c.l.conds2 = dp->c.l.conds2;
   469   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
   470   next = NULL;
   471   nextne = NULL;
   472   nexteq = NULL;
   473   rappnd = myrevstrdup(appnd);
   474   morphcode = dp->morphcode;
   475   contclass = dp->contclass;
   476   contclasslen = dp->contclasslen;
   477 }
   480 SfxEntry::~SfxEntry()
   481 {
   482     aflag = 0;
   483     if (appnd) free(appnd);
   484     if (rappnd) free(rappnd);
   485     if (strip) free(strip);
   486     pmyMgr = NULL;
   487     appnd = NULL;
   488     strip = NULL;
   489     if (opts & aeLONGCOND) free(c.l.conds2);
   490     if (morphcode && !(opts & aeALIASM)) free(morphcode);
   491     if (contclass && !(opts & aeALIASF)) free(contclass);
   492 }
   494 // add suffix to this word assuming conditions hold
   495 char * SfxEntry::add(const char * word, int len)
   496 {
   497     char                tword[MAXWORDUTF8LEN + 4];
   499      /* make sure all conditions match */
   500      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
   501         (len >= numconds) && test_condition(word + len, word) &&
   502         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
   503         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
   504               /* we have a match so add suffix */
   505               strcpy(tword,word);
   506               if (appndl) {
   507                   strcpy(tword + len - stripl, appnd);
   508               } else {
   509                   *(tword + len - stripl) = '\0';
   510               }
   511               return mystrdup(tword);
   512      }
   513      return NULL;
   514 }
   516 inline char * SfxEntry::nextchar(char * p) {
   517     if (p) {
   518 	p++;
   519 	if (opts & aeLONGCOND) {
   520     	    // jump to the 2nd part of the condition
   521     	    if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
   522 	// end of the MAXCONDLEN length condition
   523 	} else if (p == c.conds + MAXCONDLEN) return NULL;
   524 	return *p ? p : NULL;
   525     }
   526     return NULL;
   527 }
   529 inline int SfxEntry::test_condition(const char * st, const char * beg)
   530 {
   531     const char * pos = NULL;    // group with pos input position
   532     bool neg = false;           // complementer
   533     bool ingroup = false;       // character in the group
   534     if (numconds == 0) return 1;
   535     char * p = c.conds;
   536     st--;
   537     int i = 1;
   538     while (1) {
   539       switch (*p) {
   540         case '\0': return 1;
   541         case '[': { p = nextchar(p); pos = st; break; }
   542         case '^': { p = nextchar(p); neg = true; break; }
   543         case ']': { if (!neg && !ingroup) return 0;
   544                 i++;
   545                 // skip the next character
   546                 if (!ingroup) {
   547                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
   548                     st--;
   549                 }                    
   550                 pos = NULL;
   551                 neg = false;
   552                 ingroup = false;
   553                 p = nextchar(p);
   554                 if (st < beg && p) return 0; // word <= condition
   555                 break;
   556             }
   557         case '.': if (!pos) { // dots are not metacharacters in groups: [.]
   558                 p = nextchar(p);
   559                 // skip the next character
   560                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
   561                 if (st < beg) { // word <= condition
   562 		    if (p) return 0; else return 1;
   563 		}
   564                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
   565                     st--;
   566                     if (st < beg) { // word <= condition
   567 			if (p) return 0; else return 1;
   568 		    }
   569                 }
   570                 break;
   571             }
   572     default: {
   573                 if (*st == *p) {
   574                     p = nextchar(p);
   575                     if ((opts & aeUTF8) && (*st & 0x80)) {
   576                         st--;
   577                         while (p && (st >= beg)) {
   578                             if (*p != *st) {
   579                                 if (!pos) return 0;
   580                                 st = pos;
   581                                 break;
   582                             }
   583                             // first byte of the UTF-8 multibyte character
   584                             if ((*p & 0xc0) != 0x80) break;
   585                             p = nextchar(p);
   586                             st--;
   587                         }
   588                         if (pos && st != pos) {
   589                             if (neg) return 0;
   590                             else if (i == numconds) return 1;
   591                             ingroup = true;
   592                             while (p && *p != ']' && (p = nextchar(p)));
   593 			    st--;
   594                         }
   595                         if (p && *p != ']') p = nextchar(p);
   596                     } else if (pos) {
   597                         if (neg) return 0;
   598                         else if (i == numconds) return 1;
   599                         ingroup = true;
   600 			while (p && *p != ']' && (p = nextchar(p)))
   601                           ;
   602 //			if (p && *p != ']') p = nextchar(p);
   603                         st--;
   604                     }
   605                     if (!pos) {
   606                         i++;
   607                         st--;
   608                     }
   609                     if (st < beg && p && *p != ']') return 0; // word <= condition
   610                 } else if (pos) { // group
   611                     p = nextchar(p);
   612                 } else return 0;
   613             }
   614       }
   615       if (!p) return 1;
   616     }
   617 }
   619 // see if this suffix is present in the word
   620 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
   621     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
   622     const FLAG badflag)
   623 {
   624     int                 tmpl;            // length of tmpword
   625     struct hentry *     he;              // hash entry pointer
   626     unsigned char *     cp;
   627     char                tmpword[MAXWORDUTF8LEN + 4];
   628     PfxEntry* ep = ppfx;
   630     // if this suffix is being cross checked with a prefix
   631     // but it does not support cross products skip it
   633     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
   634         return NULL;
   636     // upon entry suffix is 0 length or already matches the end of the word.
   637     // So if the remaining root word has positive length
   638     // and if there are enough chars in root word and added back strip chars
   639     // to meet the number of characters conditions, then test it
   641     tmpl = len - appndl;
   642     // the second condition is not enough for UTF-8 strings
   643     // it checked in test_condition()
   645     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   646         (tmpl + stripl >= numconds)) {
   648             // generate new root word by removing suffix and adding
   649             // back any characters that would have been stripped or
   650             // or null terminating the shorter string
   652             strcpy (tmpword, word);
   653             cp = (unsigned char *)(tmpword + tmpl);
   654             if (stripl) {
   655                 strcpy ((char *)cp, strip);
   656                 tmpl += stripl;
   657                 cp = (unsigned char *)(tmpword + tmpl);
   658             } else *cp = '\0';
   660             // now make sure all of the conditions on characters
   661             // are met.  Please see the appendix at the end of
   662             // this file for more info on exactly what is being
   663             // tested
   665             // if all conditions are met then check if resulting
   666             // root word in the dictionary
   668             if (test_condition((char *) cp, (char *) tmpword)) {
   670 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
   671                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
   672 #endif
   673                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
   674                     do {
   675                         // check conditional suffix (enabled by prefix)
   676                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
   677                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
   678                             (((optflags & aeXPRODUCT) == 0) ||
   679                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
   680                              // enabled by prefix
   681                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
   682                             ) &&
   683                             // handle cont. class
   684                             ((!cclass) ||
   685                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
   686                             ) &&
   687                             // check only in compound homonyms (bad flags)
   688                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
   689                             ) &&
   690                             // handle required flag
   691                             ((!needflag) ||
   692                               (TESTAFF(he->astr, needflag, he->alen) ||
   693                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
   694                             )
   695                         ) return he;
   696                         he = he->next_homonym; // check homonyms
   697                     } while (he);
   699                 // obsolote stemming code (used only by the
   700                 // experimental SuffixMgr:suggest_pos_stems)
   701                 // store resulting root in wlst
   702                 } else if (wlst && (*ns < maxSug)) {
   703                     int cwrd = 1;
   704                     for (int k=0; k < *ns; k++)
   705                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
   706                     if (cwrd) {
   707                         wlst[*ns] = mystrdup(tmpword);
   708                         if (wlst[*ns] == NULL) {
   709                             for (int j=0; j<*ns; j++) free(wlst[j]);
   710                             *ns = -1;
   711                             return NULL;
   712                         }
   713                         (*ns)++;
   714                     }
   715                 }
   716             }
   717     }
   718     return NULL;
   719 }
   721 // see if two-level suffix is present in the word
   722 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
   723     PfxEntry* ppfx, const FLAG needflag)
   724 {
   725     int                 tmpl;            // length of tmpword
   726     struct hentry *     he;              // hash entry pointer
   727     unsigned char *     cp;
   728     char                tmpword[MAXWORDUTF8LEN + 4];
   729     PfxEntry* ep = ppfx;
   732     // if this suffix is being cross checked with a prefix
   733     // but it does not support cross products skip it
   735     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
   736         return NULL;
   738     // upon entry suffix is 0 length or already matches the end of the word.
   739     // So if the remaining root word has positive length
   740     // and if there are enough chars in root word and added back strip chars
   741     // to meet the number of characters conditions, then test it
   743     tmpl = len - appndl;
   745     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   746        (tmpl + stripl >= numconds)) {
   748             // generate new root word by removing suffix and adding
   749             // back any characters that would have been stripped or
   750             // or null terminating the shorter string
   752             strcpy (tmpword, word);
   753             cp = (unsigned char *)(tmpword + tmpl);
   754             if (stripl) {
   755                 strcpy ((char *)cp, strip);
   756                 tmpl += stripl;
   757                 cp = (unsigned char *)(tmpword + tmpl);
   758             } else *cp = '\0';
   760             // now make sure all of the conditions on characters
   761             // are met.  Please see the appendix at the end of
   762             // this file for more info on exactly what is being
   763             // tested
   765             // if all conditions are met then recall suffix_check
   767             if (test_condition((char *) cp, (char *) tmpword)) {
   768                 if (ppfx) {
   769                     // handle conditional suffix
   770                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
   771                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
   772                     else
   773                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
   774                 } else {
   775                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
   776                 }
   777                 if (he) return he;
   778             }
   779     }
   780     return NULL;
   781 }
   783 // see if two-level suffix is present in the word
   784 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
   785     PfxEntry* ppfx, const FLAG needflag)
   786 {
   787     int                 tmpl;            // length of tmpword
   788     unsigned char *     cp;
   789     char                tmpword[MAXWORDUTF8LEN + 4];
   790     PfxEntry* ep = ppfx;
   791     char * st;
   793     char result[MAXLNLEN];
   795     *result = '\0';
   797     // if this suffix is being cross checked with a prefix
   798     // but it does not support cross products skip it
   800     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
   801         return NULL;
   803     // upon entry suffix is 0 length or already matches the end of the word.
   804     // So if the remaining root word has positive length
   805     // and if there are enough chars in root word and added back strip chars
   806     // to meet the number of characters conditions, then test it
   808     tmpl = len - appndl;
   810     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
   811        (tmpl + stripl >= numconds)) {
   813             // generate new root word by removing suffix and adding
   814             // back any characters that would have been stripped or
   815             // or null terminating the shorter string
   817             strcpy (tmpword, word);
   818             cp = (unsigned char *)(tmpword + tmpl);
   819             if (stripl) {
   820                 strcpy ((char *)cp, strip);
   821                 tmpl += stripl;
   822                 cp = (unsigned char *)(tmpword + tmpl);
   823             } else *cp = '\0';
   825             // now make sure all of the conditions on characters
   826             // are met.  Please see the appendix at the end of
   827             // this file for more info on exactly what is being
   828             // tested
   830             // if all conditions are met then recall suffix_check
   832             if (test_condition((char *) cp, (char *) tmpword)) {
   833                 if (ppfx) {
   834                     // handle conditional suffix
   835                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
   836                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
   837                         if (st) {
   838                             if (ppfx->getMorph()) {
   839                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
   840                                 mystrcat(result, " ", MAXLNLEN);
   841                             }
   842                             mystrcat(result,st, MAXLNLEN);
   843                             free(st);
   844                             mychomp(result);
   845                         }
   846                     } else {
   847                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
   848                         if (st) {
   849                             mystrcat(result, st, MAXLNLEN);
   850                             free(st);
   851                             mychomp(result);
   852                         }
   853                     }
   854                 } else {
   855                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
   856                         if (st) {
   857                             mystrcat(result, st, MAXLNLEN);
   858                             free(st);
   859                             mychomp(result);
   860                         }
   861                 }
   862                 if (*result) return mystrdup(result);
   863             }
   864     }
   865     return NULL;
   866 }
   868 // get next homonym with same affix
   869 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
   870     const FLAG cclass, const FLAG needflag)
   871 {
   872     PfxEntry* ep = ppfx;
   873     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
   875     while (he->next_homonym) {
   876         he = he->next_homonym;
   877         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
   878                             ((optflags & aeXPRODUCT) == 0 ||
   879                             TESTAFF(he->astr, eFlag, he->alen) ||
   880                              // handle conditional suffix
   881                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
   882                             ) &&
   883                             // handle cont. class
   884                             ((!cclass) ||
   885                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
   886                             ) &&
   887                             // handle required flag
   888                             ((!needflag) ||
   889                               (TESTAFF(he->astr, needflag, he->alen) ||
   890                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
   891                             )
   892                         ) return he;
   893     }
   894     return NULL;
   895 }
   898 #if 0
   900 Appendix:  Understanding Affix Code
   903 An affix is either a  prefix or a suffix attached to root words to make 
   904 other words.
   906 Basically a Prefix or a Suffix is a set of AffEntry objects
   907 which store information about the prefix or suffix along 
   908 with supporting routines to check if a word has a particular 
   909 prefix or suffix or a combination.
   911 The structure affentry is defined as follows:
   913 struct affentry
   914 {
   915    unsigned short aflag;    // ID used to represent the affix
   916    char * strip;            // string to strip before adding affix
   917    char * appnd;            // the affix string to add
   918    unsigned char stripl;    // length of the strip string
   919    unsigned char appndl;    // length of the affix string
   920    char numconds;           // the number of conditions that must be met
   921    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix 
   922    char   conds[SETSIZE];   // array which encodes the conditions to be met
   923 };
   926 Here is a suffix borrowed from the en_US.aff file.  This file 
   927 is whitespace delimited.
   929 SFX D Y 4 
   930 SFX D   0     d          e
   931 SFX D   y     ied        [^aeiou]y
   932 SFX D   0     ed         [^ey]
   933 SFX D   0     ed         [aeiou]y
   935 This information can be interpreted as follows:
   937 The first line has 4 fields
   939 Field
   940 -----
   941 1     SFX - indicates this is a suffix
   942 2     D   - is the name of the character flag which represents this suffix
   943 3     Y   - indicates it can be combined with prefixes (cross product)
   944 4     4   - indicates that a sequence of 4 affentry structures is needed to
   945                properly store the affix information
   947 The remaining lines describe the unique information for the 4 SfxEntry 
   948 objects that make up this affix.  Each line can be interpreted
   949 as follows: (note fields 1 and 2 are as a check against line 1 info)
   951 Field
   952 -----
   953 1     SFX         - indicates this is a suffix
   954 2     D           - is the name of the character flag for this affix
   955 3     y           - the string of chars to strip off before adding affix
   956                          (a 0 here indicates the NULL string)
   957 4     ied         - the string of affix characters to add
   958 5     [^aeiou]y   - the conditions which must be met before the affix
   959                     can be applied
   961 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
   962 there are 2 conditions that must be met.  The first condition is that 
   963 the next to the last character in the word must *NOT* be any of the 
   964 following "a", "e", "i", "o" or "u".  The second condition is that
   965 the last character of the word must end in "y".
   967 So how can we encode this information concisely and be able to 
   968 test for both conditions in a fast manner?  The answer is found
   969 by studying the wonderful ispell code of Geoff Kuenning, et al.
   970 (now available under a normal BSD license).
   972 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
   973 using a character (cast to an unsigned char) of a string, we have 8 bits
   974 of information we can store about that character.  Specifically we
   975 could use each bit to say if that character is allowed in any of the 
   976 last (or first for prefixes) 8 characters of the word.
   978 Basically, each character at one end of the word (up to the number 
   979 of conditions) is used to index into the conds array and the resulting 
   980 value found there says whether that character is valid for a
   981 specific character position in the word.  
   983 For prefixes, it does this by setting bit 0 if that char is valid 
   984 in the first position, bit 1 if valid in the second position, and so on. 
   986 If a bit is not set, then that char is not valid for that position in the
   987 word.
   989 If working with suffixes bit 0 is used for the character closest 
   990 to the front, bit 1 for the next character towards the end, ..., 
   991 with bit numconds-1 representing the last char at the end of the string. 
   993 Note: since entries in the conds[] are 8 bits, only 8 conditions 
   994 (read that only 8 character positions) can be examined at one
   995 end of a word (the beginning for prefixes and the end for suffixes).
   997 So to make this clearer, let's encode the conds array values for the
   998 first two affentries for the suffix D described earlier.
  1001   For the first affentry:    
  1002      numconds = 1             (only examine the last character)
  1004      conds['e'] =  (1 << 0)   (the word must end in an E)
  1005      all others are all 0
  1007   For the second affentry:
  1008      numconds = 2             (only examine the last two characters)     
  1010      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
  1011          where X is all characters *but* a, e, i, o, or u
  1014      conds['y'] = (1 << 1)     (the last char must be a y)
  1015      all other bits for all other entries in the conds array are zero
  1018 #endif

mercurial