The Tor Browser: extensions/spellcheck/hunspell/src/affentry.cpp@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

     1 /******* BEGIN LICENSE BLOCK *******

     2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1

     3  *

     4  * The contents of this file are subject to the Mozilla Public License Version

     5  * 1.1 (the "License"); you may not use this file except in compliance with

     6  * the License. You may obtain a copy of the License at

     7  * http://www.mozilla.org/MPL/

     8  *

     9  * Software distributed under the License is distributed on an "AS IS" basis,

    10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

    11  * for the specific language governing rights and limitations under the

    12  * License.

    13  *

    14  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)

    15  * and László Németh (Hunspell). Portions created by the Initial Developers

    16  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.

    17  *

    18  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)

    19  *                 David Einstein (deinst@world.std.com)

    20  *                 László Németh (nemethl@gyorsposta.hu)

    21  *                 Caolan McNamara (caolanm@redhat.com)

    22  *                 Davide Prina

    23  *                 Giuseppe Modugno

    24  *                 Gianluca Turconi

    25  *                 Simon Brouwer

    26  *                 Noll Janos

    27  *                 Biro Arpad

    28  *                 Goldman Eleonora

    29  *                 Sarlos Tamas

    30  *                 Bencsath Boldizsar

    31  *                 Halacsy Peter

    32  *                 Dvornik Laszlo

    33  *                 Gefferth Andras

    34  *                 Nagy Viktor

    35  *                 Varga Daniel

    36  *                 Chris Halls

    37  *                 Rene Engelhard

    38  *                 Bram Moolenaar

    39  *                 Dafydd Jones

    40  *                 Harri Pitkanen

    41  *                 Andras Timar

    42  *                 Tor Lillqvist

    43  *

    44  * Alternatively, the contents of this file may be used under the terms of

    45  * either the GNU General Public License Version 2 or later (the "GPL"), or

    46  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

    47  * in which case the provisions of the GPL or the LGPL are applicable instead

    48  * of those above. If you wish to allow use of your version of this file only

    49  * under the terms of either the GPL or the LGPL, and not to allow others to

    50  * use your version of this file under the terms of the MPL, indicate your

    51  * decision by deleting the provisions above and replace them with the notice

    52  * and other provisions required by the GPL or the LGPL. If you do not delete

    53  * the provisions above, a recipient may use your version of this file under

    54  * the terms of any one of the MPL, the GPL or the LGPL.

    55  *

    56  ******* END LICENSE BLOCK *******/

    58 #include <stdlib.h>

    59 #include <string.h>

    60 #include <stdio.h>

    61 #include <ctype.h>

    63 #include "affentry.hxx"

    64 #include "csutil.hxx"

    66 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)

    67 {

    68   // register affix manager

    69   pmyMgr = pmgr;

    71   // set up its initial values

    73   aflag = dp->aflag;         // flag

    74   strip = dp->strip;         // string to strip

    75   appnd = dp->appnd;         // string to append

    76   stripl = dp->stripl;       // length of strip string

    77   appndl = dp->appndl;       // length of append string

    78   numconds = dp->numconds;   // length of the condition

    79   opts = dp->opts;           // cross product flag

    80   // then copy over all of the conditions

    81   if (opts & aeLONGCOND) {

    82     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);

    83     c.l.conds2 = dp->c.l.conds2;

    84   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);

    85   next = NULL;

    86   nextne = NULL;

    87   nexteq = NULL;

    88   morphcode = dp->morphcode;

    89   contclass = dp->contclass;

    90   contclasslen = dp->contclasslen;

    91 }

    94 PfxEntry::~PfxEntry()

    95 {

    96     aflag = 0;

    97     if (appnd) free(appnd);

    98     if (strip) free(strip);

    99     pmyMgr = NULL;

   100     appnd = NULL;

   101     strip = NULL;

   102     if (opts & aeLONGCOND) free(c.l.conds2);

   103     if (morphcode && !(opts & aeALIASM)) free(morphcode);

   104     if (contclass && !(opts & aeALIASF)) free(contclass);

   105 }

   107 // add prefix to this word assuming conditions hold

   108 char * PfxEntry::add(const char * word, int len)

   109 {

   110     char tword[MAXWORDUTF8LEN + 4];

   112     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&

   113        (len >= numconds) && test_condition(word) &&

   114        (!stripl || (strncmp(word, strip, stripl) == 0)) &&

   115        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

   116     /* we have a match so add prefix */

   117               char * pp = tword;

   118               if (appndl) {

   119                   strcpy(tword,appnd);

   120                   pp += appndl;

   121                }

   122                strcpy(pp, (word + stripl));

   123                return mystrdup(tword);

   124      }

   125      return NULL;

   126 }

   128 inline char * PfxEntry::nextchar(char * p) {

   129     if (p) {

   130         p++;

   131         if (opts & aeLONGCOND) {

   132             // jump to the 2nd part of the condition

   133             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;

   134         // end of the MAXCONDLEN length condition

   135         } else if (p == c.conds + MAXCONDLEN) return NULL;

   136 	return *p ? p : NULL;

   137     }

   138     return NULL;

   139 }

// Match the condition pattern held in c.conds against the beginning of `st`.

//   st: NUL-terminated candidate word, read left to right.

// Returns 1 when the pattern is satisfied (immediately when numconds == 0,

// and whenever the pattern runs out first), 0 on a mismatch or when the

// word ends while pattern bytes remain.

// Pattern bytes handled: '[' opens a character group, ']' closes it, '^'

// sets the negation flag, '.' outside a group accepts any character, and

// every other byte is compared literally. When aeUTF8 is set, continuation

// bytes (10xxxxxx) are consumed together with their lead byte.

   141 inline int PfxEntry::test_condition(const char * st)

   142 {

   143     const char * pos = NULL; // input position where the current group started (NULL outside a group)

   144     bool neg = false;        // true once '^' was seen for the current group

   145     bool ingroup = false;    // true once a member of the current group matched

   146     if (numconds == 0) return 1;

   147     char * p = c.conds;

   148     while (1) {

   149       switch (*p) {

   150         case '\0': return 1;

// group start: reset the per-group state and remember the input position

   151         case '[': {

   152                 neg = false;

   153                 ingroup = false;

   154                 p = nextchar(p);

   155                 pos = st; break;

   156             }

   157         case '^': { p = nextchar(p); neg = true; break; }

// group end: fail when a negated group matched or a plain group did not

   158         case ']': {

   159                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;

   160                 pos = NULL;

   161                 p = nextchar(p);

   162                 // skip the next character (only when no group member consumed it)

   163                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);

   164                 if (*st == '\0' && p) return 0; // word <= condition

   165                 break;

   166             }

// NOTE: when pos is set (inside a group) this case falls through to default

   167          case '.': if (!pos) { // dots are not metacharacters in groups: [.]

   168                 p = nextchar(p);

   169                 // skip the next character

   170                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);

   171                 if (*st == '\0' && p) return 0; // word <= condition

   172                 break;

   173             }

// literal byte: compare the pattern byte with the current input byte

   174     default: {

   175                 if (*st == *p) {

   176                     st++;

   177                     p = nextchar(p);

   178                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte

   179                         while (p && (*p & 0xc0) == 0x80) {       // character

   180                             if (*p != *st) {

// continuation byte differs: fail outside a group, otherwise rewind the

// input to the start of the group and keep trying other members

   181                                 if (!pos) return 0;

   182                                 st = pos;

   183                                 break;

   184                             }

   185                             p = nextchar(p);

   186                             st++;

   187                         }

   188                         if (pos && st != pos) {

   189                             ingroup = true;

// a group member matched: skip the remaining members up to ']'

   190                             while (p && *p != ']' && (p = nextchar(p)));

   191                         }

   192                     } else if (pos) {

   193                         ingroup = true;

// a group member matched: skip the remaining members up to ']'

   194                         while (p && *p != ']' && (p = nextchar(p)));

   195                     }

   196                 } else if (pos) { // group: this member did not match, try the next one

   197                     p = nextchar(p);

   198                 } else return 0;

   199             }

   200       }

// pattern exhausted without a mismatch

   201       if (!p) return 1;

   202     }

   203 }

   205 // check if this prefix entry matches

   206 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)

   207 {

   208     int                 tmpl;   // length of tmpword

   209     struct hentry *     he;     // hash entry of root word or NULL

   210     char                tmpword[MAXWORDUTF8LEN + 4];

   212     // on entry prefix is 0 length or already matches the beginning of the word.

   213     // So if the remaining root word has positive length

   214     // and if there are enough chars in root word and added back strip chars

   215     // to meet the number of characters conditions, then test it

   217      tmpl = len - appndl;

   219      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {

   221             // generate new root word by removing prefix and adding

   222             // back any characters that would have been stripped

   224             if (stripl) strcpy (tmpword, strip);

   225             strcpy ((tmpword + stripl), (word + appndl));

   227             // now make sure all of the conditions on characters

   228             // are met.  Please see the appendix at the end of

   229             // this file for more info on exactly what is being

   230             // tested

   232             // if all conditions are met then check if resulting

   233             // root word in the dictionary

   235             if (test_condition(tmpword)) {

   236                 tmpl += stripl;

   237                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

   238                    do {

   239                       if (TESTAFF(he->astr, aflag, he->alen) &&

   240                         // forbid single prefixes with needaffix flag

   241                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&

   242                         // needflag

   243                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||

   244                          (contclass && TESTAFF(contclass, needflag, contclasslen))))

   245                             return he;

   246                       he = he->next_homonym; // check homonyms

   247                    } while (he);

   248                 }

   250                 // prefix matched but no root word was found

   251                 // if aeXPRODUCT is allowed, try again but now

   252                 // ross checked combined with a suffix

   254                 //if ((opts & aeXPRODUCT) && in_compound) {

   255                 if ((opts & aeXPRODUCT)) {

   256                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,

   257                         0, NULL, FLAG_NULL, needflag, in_compound);

   258                    if (he) return he;

   259                 }

   260             }

   261      }

   262     return NULL;

   263 }

   265 // check if this prefix entry matches

   266 struct hentry * PfxEntry::check_twosfx(const char * word, int len,

   267     char in_compound, const FLAG needflag)

   268 {

   269     int                 tmpl;   // length of tmpword

   270     struct hentry *     he;     // hash entry of root word or NULL

   271     char                tmpword[MAXWORDUTF8LEN + 4];

   273     // on entry prefix is 0 length or already matches the beginning of the word.

   274     // So if the remaining root word has positive length

   275     // and if there are enough chars in root word and added back strip chars

   276     // to meet the number of characters conditions, then test it

   278      tmpl = len - appndl;

   280      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   281         (tmpl + stripl >= numconds)) {

   283             // generate new root word by removing prefix and adding

   284             // back any characters that would have been stripped

   286             if (stripl) strcpy (tmpword, strip);

   287             strcpy ((tmpword + stripl), (word + appndl));

   289             // now make sure all of the conditions on characters

   290             // are met.  Please see the appendix at the end of

   291             // this file for more info on exactly what is being

   292             // tested

   294             // if all conditions are met then check if resulting

   295             // root word in the dictionary

   297             if (test_condition(tmpword)) {

   298                 tmpl += stripl;

   300                 // prefix matched but no root word was found

   301                 // if aeXPRODUCT is allowed, try again but now

   302                 // cross checked combined with a suffix

   304                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

   305                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);

   306                    if (he) return he;

   307                 }

   308             }

   309      }

   310     return NULL;

   311 }

   313 // check if this prefix entry matches

   314 char * PfxEntry::check_twosfx_morph(const char * word, int len,

   315          char in_compound, const FLAG needflag)

   316 {

   317     int                 tmpl;   // length of tmpword

   318     char                tmpword[MAXWORDUTF8LEN + 4];

   320     // on entry prefix is 0 length or already matches the beginning of the word.

   321     // So if the remaining root word has positive length

   322     // and if there are enough chars in root word and added back strip chars

   323     // to meet the number of characters conditions, then test it

   325      tmpl = len - appndl;

   327      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   328         (tmpl + stripl >= numconds)) {

   330             // generate new root word by removing prefix and adding

   331             // back any characters that would have been stripped

   333             if (stripl) strcpy (tmpword, strip);

   334             strcpy ((tmpword + stripl), (word + appndl));

   336             // now make sure all of the conditions on characters

   337             // are met.  Please see the appendix at the end of

   338             // this file for more info on exactly what is being

   339             // tested

   341             // if all conditions are met then check if resulting

   342             // root word in the dictionary

   344             if (test_condition(tmpword)) {

   345                 tmpl += stripl;

   347                 // prefix matched but no root word was found

   348                 // if aeXPRODUCT is allowed, try again but now

   349                 // ross checked combined with a suffix

   351                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

   352                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,

   353                              aeXPRODUCT, this, needflag);

   354                 }

   355             }

   356      }

   357     return NULL;

   358 }

   360 // check if this prefix entry matches

   361 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)

   362 {

   363     int                 tmpl;   // length of tmpword

   364     struct hentry *     he;     // hash entry of root word or NULL

   365     char                tmpword[MAXWORDUTF8LEN + 4];

   366     char                result[MAXLNLEN];

   367     char * st;

   369     *result = '\0';

   371     // on entry prefix is 0 length or already matches the beginning of the word.

   372     // So if the remaining root word has positive length

   373     // and if there are enough chars in root word and added back strip chars

   374     // to meet the number of characters conditions, then test it

   376      tmpl = len - appndl;

   378      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   379         (tmpl + stripl >= numconds)) {

   381             // generate new root word by removing prefix and adding

   382             // back any characters that would have been stripped

   384             if (stripl) strcpy (tmpword, strip);

   385             strcpy ((tmpword + stripl), (word + appndl));

   387             // now make sure all of the conditions on characters

   388             // are met.  Please see the appendix at the end of

   389             // this file for more info on exactly what is being

   390             // tested

   392             // if all conditions are met then check if resulting

   393             // root word in the dictionary

   395             if (test_condition(tmpword)) {

   396                 tmpl += stripl;

   397                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

   398                     do {

   399                       if (TESTAFF(he->astr, aflag, he->alen) &&

   400                         // forbid single prefixes with needaffix flag

   401                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&

   402                         // needflag

   403                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||

   404                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {

   405                             if (morphcode) {

   406                                 mystrcat(result, " ", MAXLNLEN);

   407                                 mystrcat(result, morphcode, MAXLNLEN);

   408                             } else mystrcat(result,getKey(), MAXLNLEN);

   409                             if (!HENTRY_FIND(he, MORPH_STEM)) {

   410                                 mystrcat(result, " ", MAXLNLEN);

   411                                 mystrcat(result, MORPH_STEM, MAXLNLEN);

   412                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);

   413                             }

   414                             // store the pointer of the hash entry

   415                             if (HENTRY_DATA(he)) {

   416                                 mystrcat(result, " ", MAXLNLEN);

   417                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);

   418                             } else {

   419                                 // return with debug information

   420                                 char * flag = pmyMgr->encode_flag(getFlag());

   421                                 mystrcat(result, " ", MAXLNLEN);

   422                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);

   423                                 mystrcat(result, flag, MAXLNLEN);

   424                                 free(flag);

   425                             }

   426                             mystrcat(result, "\n", MAXLNLEN);

   427                       }

   428                       he = he->next_homonym;

   429                     } while (he);

   430                 }

   432                 // prefix matched but no root word was found

   433                 // if aeXPRODUCT is allowed, try again but now

   434                 // ross checked combined with a suffix

   436                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

   437                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,

   438                      FLAG_NULL, needflag);

   439                    if (st) {

   440                         mystrcat(result, st, MAXLNLEN);

   441                         free(st);

   442                    }

   443                 }

   444             }

   445      }

   447     if (*result) return mystrdup(result);

   448     return NULL;

   449 }

   451 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)

   452 {

   453   // register affix manager

   454   pmyMgr = pmgr;

   456   // set up its initial values

   457   aflag = dp->aflag;         // char flag

   458   strip = dp->strip;         // string to strip

   459   appnd = dp->appnd;         // string to append

   460   stripl = dp->stripl;       // length of strip string

   461   appndl = dp->appndl;       // length of append string

   462   numconds = dp->numconds;   // length of the condition

   463   opts = dp->opts;           // cross product flag

   465   // then copy over all of the conditions

   466   if (opts & aeLONGCOND) {

   467     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);

   468     c.l.conds2 = dp->c.l.conds2;

   469   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);

   470   next = NULL;

   471   nextne = NULL;

   472   nexteq = NULL;

   473   rappnd = myrevstrdup(appnd);

   474   morphcode = dp->morphcode;

   475   contclass = dp->contclass;

   476   contclasslen = dp->contclasslen;

   477 }

   480 SfxEntry::~SfxEntry()

   481 {

   482     aflag = 0;

   483     if (appnd) free(appnd);

   484     if (rappnd) free(rappnd);

   485     if (strip) free(strip);

   486     pmyMgr = NULL;

   487     appnd = NULL;

   488     strip = NULL;

   489     if (opts & aeLONGCOND) free(c.l.conds2);

   490     if (morphcode && !(opts & aeALIASM)) free(morphcode);

   491     if (contclass && !(opts & aeALIASF)) free(contclass);

   492 }

   494 // add suffix to this word assuming conditions hold

   495 char * SfxEntry::add(const char * word, int len)

   496 {

   497     char                tword[MAXWORDUTF8LEN + 4];

   499      /* make sure all conditions match */

   500      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&

   501         (len >= numconds) && test_condition(word + len, word) &&

   502         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&

   503         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

   504               /* we have a match so add suffix */

   505               strcpy(tword,word);

   506               if (appndl) {

   507                   strcpy(tword + len - stripl, appnd);

   508               } else {

   509                   *(tword + len - stripl) = '\0';

   510               }

   511               return mystrdup(tword);

   512      }

   513      return NULL;

   514 }

   516 inline char * SfxEntry::nextchar(char * p) {

   517     if (p) {

   518 	p++;

   519 	if (opts & aeLONGCOND) {

   520     	    // jump to the 2nd part of the condition

   521     	    if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;

   522 	// end of the MAXCONDLEN length condition

   523 	} else if (p == c.conds + MAXCONDLEN) return NULL;

   524 	return *p ? p : NULL;

   525     }

   526     return NULL;

   527 }

// Match the condition pattern held in c.conds against the END of a word.

//   st:  points just past the last character to test (it is decremented

//        before the first comparison)

//   beg: first character of the word; the scan stops once st drops below it

// The pattern is consumed forwards through nextchar() while the word is

// read backwards. NOTE(review): this suggests suffix conditions are stored

// reversed - confirm against the code that fills c.conds.

// Returns 1 when the pattern is satisfied (immediately when numconds == 0),

// 0 on a mismatch or when the word is used up while pattern bytes remain.

// Pattern bytes handled: '[' opens a character group, ']' closes it, '^'

// sets the negation flag, '.' outside a group accepts any character, and

// every other byte is compared literally. When aeUTF8 is set, continuation

// bytes (10xxxxxx) are stepped over together with their lead byte.

   529 inline int SfxEntry::test_condition(const char * st, const char * beg)

   530 {

   531     const char * pos = NULL;    // input position where the current group started (NULL outside a group)

   532     bool neg = false;           // true once '^' was seen for the current group

   533     bool ingroup = false;       // true once a member of the current group matched

   534     if (numconds == 0) return 1;

   535     char * p = c.conds;

   536     st--;

// i counts condition positions: it starts at 1 and grows on every ']' and on

// every literal matched outside a group; it is compared with numconds below

   537     int i = 1;

   538     while (1) {

   539       switch (*p) {

   540         case '\0': return 1;

   541         case '[': { p = nextchar(p); pos = st; break; }

   542         case '^': { p = nextchar(p); neg = true; break; }

// group end: a plain group with no matching member fails here (a matching

// member of a negated group already returned 0 in the default case)

   543         case ']': { if (!neg && !ingroup) return 0;

   544                 i++;

   545                 // skip the next character

   546                 if (!ingroup) {

   547                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);

   548                     st--;

   549                 }

   550                 pos = NULL;

   551                 neg = false;

   552                 ingroup = false;

   553                 p = nextchar(p);

   554                 if (st < beg && p) return 0; // word <= condition

   555                 break;

   556             }

// NOTE: when pos is set (inside a group) this case falls through to default

   557         case '.': if (!pos) { // dots are not metacharacters in groups: [.]

   558                 p = nextchar(p);

   559                 // skip the next character

   560                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);

   561                 if (st < beg) { // word <= condition

   562 		    if (p) return 0; else return 1;

   563 		}

   564                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character

   565                     st--;

   566                     if (st < beg) { // word <= condition

   567 			if (p) return 0; else return 1;

   568 		    }

   569                 }

   570                 break;

   571             }

// literal byte: compare the pattern byte with the current input byte

   572     default: {

   573                 if (*st == *p) {

   574                     p = nextchar(p);

   575                     if ((opts & aeUTF8) && (*st & 0x80)) {

   576                         st--;

   577                         while (p && (st >= beg)) {

   578                             if (*p != *st) {

// byte differs: fail outside a group, otherwise rewind the input to the

// start of the group and keep trying other members

   579                                 if (!pos) return 0;

   580                                 st = pos;

   581                                 break;

   582                             }

   583                             // first byte of the UTF-8 multibyte character

   584                             if ((*p & 0xc0) != 0x80) break;

   585                             p = nextchar(p);

   586                             st--;

   587                         }

   588                         if (pos && st != pos) {

// a group member matched: a negated group fails, and a match on the last

// condition position (i == numconds) ends the test successfully

   589                             if (neg) return 0;

   590                             else if (i == numconds) return 1;

   591                             ingroup = true;

// skip the remaining members up to ']'

   592                             while (p && *p != ']' && (p = nextchar(p)));

   593 			    st--;

   594                         }

   595                         if (p && *p != ']') p = nextchar(p);

   596                     } else if (pos) {

// single-byte group member matched: same handling as the multibyte branch

   597                         if (neg) return 0;

   598                         else if (i == numconds) return 1;

   599                         ingroup = true;

   600 			while (p && *p != ']' && (p = nextchar(p)))

   601                           ;

   602 //			if (p && *p != ']') p = nextchar(p);

   603                         st--;

   604                     }

   605                     if (!pos) {

   606                         i++;

   607                         st--;

   608                     }

   609                     if (st < beg && p && *p != ']') return 0; // word <= condition

   610                 } else if (pos) { // group: this member did not match, try the next one

   611                     p = nextchar(p);

   612                 } else return 0;

   613             }

   614       }

// pattern exhausted without a mismatch

   615       if (!p) return 1;

   616     }

   617 }

   619 // see if this suffix is present in the word

   620 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,

   621     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,

   622     const FLAG badflag)

   623 {

   624     int                 tmpl;            // length of tmpword

   625     struct hentry *     he;              // hash entry pointer

   626     unsigned char *     cp;

   627     char                tmpword[MAXWORDUTF8LEN + 4];

   628     PfxEntry* ep = ppfx;

   630     // if this suffix is being cross checked with a prefix

   631     // but it does not support cross products skip it

   633     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))

   634         return NULL;

   636     // upon entry suffix is 0 length or already matches the end of the word.

   637     // So if the remaining root word has positive length

   638     // and if there are enough chars in root word and added back strip chars

   639     // to meet the number of characters conditions, then test it

   641     tmpl = len - appndl;

   642     // the second condition is not enough for UTF-8 strings

   643     // it checked in test_condition()

   645     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   646         (tmpl + stripl >= numconds)) {

   648             // generate new root word by removing suffix and adding

   649             // back any characters that would have been stripped or

   650             // or null terminating the shorter string

   652             strcpy (tmpword, word);

   653             cp = (unsigned char *)(tmpword + tmpl);

   654             if (stripl) {

   655                 strcpy ((char *)cp, strip);

   656                 tmpl += stripl;

   657                 cp = (unsigned char *)(tmpword + tmpl);

   658             } else *cp = '\0';

   660             // now make sure all of the conditions on characters

   661             // are met.  Please see the appendix at the end of

   662             // this file for more info on exactly what is being

   663             // tested

   665             // if all conditions are met then check if resulting

   666             // root word in the dictionary

   668             if (test_condition((char *) cp, (char *) tmpword)) {

   670 #ifdef SZOSZABLYA_POSSIBLE_ROOTS

   671                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);

   672 #endif

   673                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {

   674                     do {

   675                         // check conditional suffix (enabled by prefix)

   676                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&

   677                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&

   678                             (((optflags & aeXPRODUCT) == 0) ||

   679                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||

   680                              // enabled by prefix

   681                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))

   682                             ) &&

   683                             // handle cont. class

   684                             ((!cclass) ||

   685                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))

   686                             ) &&

   687                             // check only in compound homonyms (bad flags)

   688                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)

   689                             ) &&

   690                             // handle required flag

   691                             ((!needflag) ||

   692                               (TESTAFF(he->astr, needflag, he->alen) ||

   693                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))

   694                             )

   695                         ) return he;

   696                         he = he->next_homonym; // check homonyms

   697                     } while (he);

   699                 // obsolote stemming code (used only by the

   700                 // experimental SuffixMgr:suggest_pos_stems)

   701                 // store resulting root in wlst

   702                 } else if (wlst && (*ns < maxSug)) {

   703                     int cwrd = 1;

   704                     for (int k=0; k < *ns; k++)

   705                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;

   706                     if (cwrd) {

   707                         wlst[*ns] = mystrdup(tmpword);

   708                         if (wlst[*ns] == NULL) {

   709                             for (int j=0; j<*ns; j++) free(wlst[j]);

   710                             *ns = -1;

   711                             return NULL;

   712                         }

   713                         (*ns)++;

   714                     }

   715                 }

   716             }

   717     }

   718     return NULL;

   719 }

   721 // see if two-level suffix is present in the word

   722 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,

   723     PfxEntry* ppfx, const FLAG needflag)

   724 {

   725     int                 tmpl;            // length of tmpword

   726     struct hentry *     he;              // hash entry pointer

   727     unsigned char *     cp;

   728     char                tmpword[MAXWORDUTF8LEN + 4];

   729     PfxEntry* ep = ppfx;

   732     // if this suffix is being cross checked with a prefix

   733     // but it does not support cross products skip it

   735     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)

   736         return NULL;

   738     // upon entry suffix is 0 length or already matches the end of the word.

   739     // So if the remaining root word has positive length

   740     // and if there are enough chars in root word and added back strip chars

   741     // to meet the number of characters conditions, then test it

   743     tmpl = len - appndl;

   745     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   746        (tmpl + stripl >= numconds)) {

   748             // generate new root word by removing suffix and adding

   749             // back any characters that would have been stripped or

   750             // or null terminating the shorter string

   752             strcpy (tmpword, word);

   753             cp = (unsigned char *)(tmpword + tmpl);

   754             if (stripl) {

   755                 strcpy ((char *)cp, strip);

   756                 tmpl += stripl;

   757                 cp = (unsigned char *)(tmpword + tmpl);

   758             } else *cp = '\0';

   760             // now make sure all of the conditions on characters

   761             // are met.  Please see the appendix at the end of

   762             // this file for more info on exactly what is being

   763             // tested

   765             // if all conditions are met then recall suffix_check

   767             if (test_condition((char *) cp, (char *) tmpword)) {

   768                 if (ppfx) {

   769                     // handle conditional suffix

   770                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))

   771                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);

   772                     else

   773                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);

   774                 } else {

   775                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);

   776                 }

   777                 if (he) return he;

   778             }

   779     }

   780     return NULL;

   781 }

   783 // see if two-level suffix is present in the word

   784 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,

   785     PfxEntry* ppfx, const FLAG needflag)

   786 {

   787     int                 tmpl;            // length of tmpword

   788     unsigned char *     cp;

   789     char                tmpword[MAXWORDUTF8LEN + 4];

   790     PfxEntry* ep = ppfx;

   791     char * st;

   793     char result[MAXLNLEN];

   795     *result = '\0';

   797     // if this suffix is being cross checked with a prefix

   798     // but it does not support cross products skip it

   800     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)

   801         return NULL;

   803     // upon entry suffix is 0 length or already matches the end of the word.

   804     // So if the remaining root word has positive length

   805     // and if there are enough chars in root word and added back strip chars

   806     // to meet the number of characters conditions, then test it

   808     tmpl = len - appndl;

   810     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

   811        (tmpl + stripl >= numconds)) {

   813             // generate new root word by removing suffix and adding

   814             // back any characters that would have been stripped or

   815             // or null terminating the shorter string

   817             strcpy (tmpword, word);

   818             cp = (unsigned char *)(tmpword + tmpl);

   819             if (stripl) {

   820                 strcpy ((char *)cp, strip);

   821                 tmpl += stripl;

   822                 cp = (unsigned char *)(tmpword + tmpl);

   823             } else *cp = '\0';

   825             // now make sure all of the conditions on characters

   826             // are met.  Please see the appendix at the end of

   827             // this file for more info on exactly what is being

   828             // tested

   830             // if all conditions are met then recall suffix_check

   832             if (test_condition((char *) cp, (char *) tmpword)) {

   833                 if (ppfx) {

   834                     // handle conditional suffix

   835                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {

   836                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

   837                         if (st) {

   838                             if (ppfx->getMorph()) {

   839                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);

   840                                 mystrcat(result, " ", MAXLNLEN);

   841                             }

   842                             mystrcat(result,st, MAXLNLEN);

   843                             free(st);

   844                             mychomp(result);

   845                         }

   846                     } else {

   847                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);

   848                         if (st) {

   849                             mystrcat(result, st, MAXLNLEN);

   850                             free(st);

   851                             mychomp(result);

   852                         }

   853                     }

   854                 } else {

   855                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

   856                         if (st) {

   857                             mystrcat(result, st, MAXLNLEN);

   858                             free(st);

   859                             mychomp(result);

   860                         }

   861                 }

   862                 if (*result) return mystrdup(result);

   863             }

   864     }

   865     return NULL;

   866 }

   868 // get next homonym with same affix

   869 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,

   870     const FLAG cclass, const FLAG needflag)

   871 {

   872     PfxEntry* ep = ppfx;

   873     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;

   875     while (he->next_homonym) {

   876         he = he->next_homonym;

   877         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&

   878                             ((optflags & aeXPRODUCT) == 0 ||

   879                             TESTAFF(he->astr, eFlag, he->alen) ||

   880                              // handle conditional suffix

   881                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))

   882                             ) &&

   883                             // handle cont. class

   884                             ((!cclass) ||

   885                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))

   886                             ) &&

   887                             // handle required flag

   888                             ((!needflag) ||

   889                               (TESTAFF(he->astr, needflag, he->alen) ||

   890                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))

   891                             )

   892                         ) return he;

   893     }

   894     return NULL;

   895 }

   898 #if 0

   900 Appendix:  Understanding Affix Code

   903 An affix is either a  prefix or a suffix attached to root words to make

   904 other words.

   906 Basically a Prefix or a Suffix is a set of AffEntry objects

   907 which store information about the prefix or suffix along

   908 with supporting routines to check if a word has a particular

   909 prefix or suffix or a combination.

   911 The structure affentry is defined as follows:

   913 struct affentry

   914 {

   915    unsigned short aflag;    // ID used to represent the affix

   916    char * strip;            // string to strip before adding affix

   917    char * appnd;            // the affix string to add

   918    unsigned char stripl;    // length of the strip string

   919    unsigned char appndl;    // length of the affix string

   920    char numconds;           // the number of conditions that must be met

   921    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix

   922    char   conds[SETSIZE];   // array which encodes the conditions to be met

   923 };

   926 Here is a suffix borrowed from the en_US.aff file.  This file

   927 is whitespace delimited.

   929 SFX D Y 4

   930 SFX D   0     e          d

   931 SFX D   y     ied        [^aeiou]y

   932 SFX D   0     ed         [^ey]

   933 SFX D   0     ed         [aeiou]y

   935 This information can be interpreted as follows:

   937 The first line has 4 fields

   939 Field

   940 -----

   941 1     SFX - indicates this is a suffix

   942 2     D   - is the name of the character flag which represents this suffix

   943 3     Y   - indicates it can be combined with prefixes (cross product)

   944 4     4   - indicates that a sequence of 4 affentry structures is needed to

   945                properly store the affix information

   947 The remaining lines describe the unique information for the 4 SfxEntry

   948 objects that make up this affix.  Each line can be interpreted

   949 as follows: (note fields 1 and 2 are as a check against line 1 info)

   951 Field

   952 -----

   953 1     SFX         - indicates this is a suffix

   954 2     D           - is the name of the character flag for this affix

   955 3     y           - the string of chars to strip off before adding affix

   956                          (a 0 here indicates the NULL string)

   957 4     ied         - the string of affix characters to add

   958 5     [^aeiou]y   - the conditions which must be met before the affix

   959                     can be applied

   961 Field 5 is interesting.  Since this is a suffix, field 5 tells us that

   962 there are 2 conditions that must be met.  The first condition is that

   963 the next to the last character in the word must *NOT* be any of the

   964 following "a", "e", "i", "o" or "u".  The second condition is that

   965 the last character of the word must end in "y".

   967 So how can we encode this information concisely and be able to

   968 test for both conditions in a fast manner?  The answer is found

   969 by studying the wonderful ispell code of Geoff Kuenning, et al.

   970 (now available under a normal BSD license).

   972 If we set up a conds array of 256 bytes indexed (0 to 255) and access it

   973 using a character (cast to an unsigned char) of a string, we have 8 bits

   974 of information we can store about that character.  Specifically we

   975 could use each bit to say if that character is allowed in any of the

   976 last (or first for prefixes) 8 characters of the word.

   978 Basically, each character at one end of the word (up to the number

   979 of conditions) is used to index into the conds array and the resulting

   980 value found there says whether that character is valid for a

   981 specific character position in the word.

   983 For prefixes, it does this by setting bit 0 if that char is valid

   984 in the first position, bit 1 if valid in the second position, and so on.

   986 If a bit is not set, then that char is not valid for that position in the

   987 word.

   989 If working with suffixes bit 0 is used for the character closest

   990 to the front, bit 1 for the next character towards the end, ...,

   991 with bit numconds-1 representing the last char at the end of the string.

   993 Note: since entries in the conds[] are 8 bits, only 8 conditions

   994 (read that only 8 character positions) can be examined at one

   995 end of a word (the beginning for prefixes and the end for suffixes).

   997 So to make this clearer, let's encode the conds array values for the

   998 first two affentries for the suffix D described earlier.

  1001   For the first affentry:

  1002      numconds = 1             (only examine the last character)

  1004      conds['e'] =  (1 << 0)   (the word must end in an E)

  1005      all others are all 0

  1007   For the second affentry:

  1008      numconds = 2             (only examine the last two characters)

  1010      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)

  1011          where X is all characters *but* a, e, i, o, or u

  1014      conds['y'] = (1 << 1)     (the last char must be a y)

  1015      all other bits for all other entries in the conds array are zero

  1018 #endif

The Tor Browser / file revision

extensions/spellcheck/hunspell/src/affentry.cpp@6474c204b198

extensions/spellcheck/hunspell/src/affentry.cpp