The Tor Browser: extensions/spellcheck/hunspell/src/affixmgr.cpp@6474c204b198

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /******* BEGIN LICENSE BLOCK *******

     2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1

     3  *

     4  * The contents of this file are subject to the Mozilla Public License Version

     5  * 1.1 (the "License"); you may not use this file except in compliance with

     6  * the License. You may obtain a copy of the License at

     7  * http://www.mozilla.org/MPL/

     8  *

     9  * Software distributed under the License is distributed on an "AS IS" basis,

    10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

    11  * for the specific language governing rights and limitations under the

    12  * License.

    13  *

    14  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)

    15  * and László Németh (Hunspell). Portions created by the Initial Developers

    16  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.

    17  *

    18  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)

    19  *                 David Einstein (deinst@world.std.com)

    20  *                 László Németh (nemethl@gyorsposta.hu)

    21  *                 Caolan McNamara (caolanm@redhat.com)

    22  *                 Davide Prina

    23  *                 Giuseppe Modugno

    24  *                 Gianluca Turconi

    25  *                 Simon Brouwer

    26  *                 Noll Janos

    27  *                 Biro Arpad

    28  *                 Goldman Eleonora

    29  *                 Sarlos Tamas

    30  *                 Bencsath Boldizsar

    31  *                 Halacsy Peter

    32  *                 Dvornik Laszlo

    33  *                 Gefferth Andras

    34  *                 Nagy Viktor

    35  *                 Varga Daniel

    36  *                 Chris Halls

    37  *                 Rene Engelhard

    38  *                 Bram Moolenaar

    39  *                 Dafydd Jones

    40  *                 Harri Pitkanen

    41  *                 Andras Timar

    42  *                 Tor Lillqvist

    43  *

    44  * Alternatively, the contents of this file may be used under the terms of

    45  * either the GNU General Public License Version 2 or later (the "GPL"), or

    46  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

    47  * in which case the provisions of the GPL or the LGPL are applicable instead

    48  * of those above. If you wish to allow use of your version of this file only

    49  * under the terms of either the GPL or the LGPL, and not to allow others to

    50  * use your version of this file under the terms of the MPL, indicate your

    51  * decision by deleting the provisions above and replace them with the notice

    52  * and other provisions required by the GPL or the LGPL. If you do not delete

    53  * the provisions above, a recipient may use your version of this file under

    54  * the terms of any one of the MPL, the GPL or the LGPL.

    55  *

    56  ******* END LICENSE BLOCK *******/

    58 #include <stdlib.h>

    59 #include <string.h>

    60 #include <stdio.h>

    61 #include <ctype.h>

    63 #include <vector>

    65 #include "affixmgr.hxx"

    66 #include "affentry.hxx"

    67 #include "langnum.hxx"

    69 #include "csutil.hxx"

    71 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)

    72 {

    73   // register hash manager and load affix data from aff file

    74   pHMgr = ptr[0];

    75   alldic = ptr;

    76   maxdic = md;

    77   keystring = NULL;

    78   trystring = NULL;

    79   encoding=NULL;

    80   csconv=NULL;

    81   utf8 = 0;

    82   complexprefixes = 0;

    83   maptable = NULL;

    84   nummap = 0;

    85   breaktable = NULL;

    86   numbreak = -1;

    87   reptable = NULL;

    88   numrep = 0;

    89   iconvtable = NULL;

    90   oconvtable = NULL;

    91   checkcpdtable = NULL;

    92   // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)

    93   simplifiedcpd = 0;

    94   numcheckcpd = 0;

    95   defcpdtable = NULL;

    96   numdefcpd = 0;

    97   phone = NULL;

    98   compoundflag = FLAG_NULL; // permits word in compound forms

    99   compoundbegin = FLAG_NULL; // may be first word in compound forms

   100   compoundmiddle = FLAG_NULL; // may be middle word in compound forms

   101   compoundend = FLAG_NULL; // may be last word in compound forms

   102   compoundroot = FLAG_NULL; // compound word signing flag

   103   compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word

   104   compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word

   105   checkcompounddup = 0; // forbid double words in compounds

   106   checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)

   107   checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds

   108   checkcompoundtriple = 0; // forbid compounds with triple letters

   109   simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)

   110   forbiddenword = FORBIDDENWORD; // forbidden word signing flag

   111   nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag

   112   nongramsuggest = FLAG_NULL;

   113   lang = NULL; // language

   114   langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)

   115   needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes

   116   cpdwordmax = -1; // default: unlimited wordcount in compound words

   117   cpdmin = -1;  // undefined

   118   cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words

   119   cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)

   120   cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)

   121   cpdvowels_utf16_len=0; // vowels

   122   pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG

   123   sfxappnd=NULL; // previous suffix for counting a special syllables BUG

   124   cpdsyllablenum=NULL; // syllable count incrementing flag

   125   checknum=0; // checking numbers, and word with numbers

   126   wordchars=NULL; // letters + spec. word characters

   127   wordchars_utf16=NULL; // letters + spec. word characters

   128   wordchars_utf16_len=0; // letters + spec. word characters

   129   ignorechars=NULL; // letters + spec. word characters

   130   ignorechars_utf16=NULL; // letters + spec. word characters

   131   ignorechars_utf16_len=0; // letters + spec. word characters

   132   version=NULL; // affix and dictionary file version string

   133   havecontclass=0; // flags of possible continuing classes (double affix)

   134   // LEMMA_PRESENT: not put root into the morphological output. Lemma presents

   135   // in morhological description in dictionary file. It's often combined with PSEUDOROOT.

   136   lemma_present = FLAG_NULL;

   137   circumfix = FLAG_NULL;

   138   onlyincompound = FLAG_NULL;

   139   maxngramsugs = -1; // undefined

   140   maxdiff = -1; // undefined

   141   onlymaxdiff = 0;

   142   maxcpdsugs = -1; // undefined

   143   nosplitsugs = 0;

   144   sugswithdots = 0;

   145   keepcase = 0;

   146   forceucase = 0;

   147   warn = 0;

   148   forbidwarn = 0;

   149   checksharps = 0;

   150   substandard = FLAG_NULL;

   151   fullstrip = 0;

   153   sfx = NULL;

   154   pfx = NULL;

   156   for (int i=0; i < SETSIZE; i++) {

   157      pStart[i] = NULL;

   158      sStart[i] = NULL;

   159      pFlag[i] = NULL;

   160      sFlag[i] = NULL;

   161   }

   163   for (int j=0; j < CONTSIZE; j++) {

   164     contclasses[j] = 0;

   165   }

   167   if (parse_file(affpath, key)) {

   168      HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);

   169   }

   171   if (cpdmin == -1) cpdmin = MINCPDLEN;

   173 }

   176 AffixMgr::~AffixMgr()

   177 {

   178   // pass through linked prefix entries and clean up

   179   for (int i=0; i < SETSIZE ;i++) {

   180        pFlag[i] = NULL;

   181        PfxEntry * ptr = pStart[i];

   182        PfxEntry * nptr = NULL;

   183        while (ptr) {

   184             nptr = ptr->getNext();

   185             delete(ptr);

   186             ptr = nptr;

   187             nptr = NULL;

   188        }

   189   }

   191   // pass through linked suffix entries and clean up

   192   for (int j=0; j < SETSIZE ; j++) {

   193        sFlag[j] = NULL;

   194        SfxEntry * ptr = sStart[j];

   195        SfxEntry * nptr = NULL;

   196        while (ptr) {

   197             nptr = ptr->getNext();

   198             delete(ptr);

   199             ptr = nptr;

   200             nptr = NULL;

   201        }

   202        sStart[j] = NULL;

   203   }

   205   if (keystring) free(keystring);

   206   keystring=NULL;

   207   if (trystring) free(trystring);

   208   trystring=NULL;

   209   if (encoding) free(encoding);

   210   encoding=NULL;

   211   if (maptable) {

   212      for (int j=0; j < nummap; j++) {

   213         for (int k=0; k < maptable[j].len; k++) {

   214            if (maptable[j].set[k]) free(maptable[j].set[k]);

   215         }

   216         free(maptable[j].set);

   217         maptable[j].set = NULL;

   218         maptable[j].len = 0;

   219      }

   220      free(maptable);

   221      maptable = NULL;

   222   }

   223   nummap = 0;

   224   if (breaktable) {

   225      for (int j=0; j < numbreak; j++) {

   226         if (breaktable[j]) free(breaktable[j]);

   227         breaktable[j] = NULL;

   228      }

   229      free(breaktable);

   230      breaktable = NULL;

   231   }

   232   numbreak = 0;

   233   if (reptable) {

   234      for (int j=0; j < numrep; j++) {

   235         free(reptable[j].pattern);

   236         free(reptable[j].pattern2);

   237      }

   238      free(reptable);

   239      reptable = NULL;

   240   }

   241   if (iconvtable) delete iconvtable;

   242   if (oconvtable) delete oconvtable;

   243   if (phone && phone->rules) {

   244      for (int j=0; j < phone->num + 1; j++) {

   245         free(phone->rules[j * 2]);

   246         free(phone->rules[j * 2 + 1]);

   247      }

   248      free(phone->rules);

   249      free(phone);

   250      phone = NULL;

   251   }

   253   if (defcpdtable) {

   254      for (int j=0; j < numdefcpd; j++) {

   255         free(defcpdtable[j].def);

   256         defcpdtable[j].def = NULL;

   257      }

   258      free(defcpdtable);

   259      defcpdtable = NULL;

   260   }

   261   numrep = 0;

   262   if (checkcpdtable) {

   263      for (int j=0; j < numcheckcpd; j++) {

   264         free(checkcpdtable[j].pattern);

   265         free(checkcpdtable[j].pattern2);

   266         free(checkcpdtable[j].pattern3);

   267         checkcpdtable[j].pattern = NULL;

   268         checkcpdtable[j].pattern2 = NULL;

   269         checkcpdtable[j].pattern3 = NULL;

   270      }

   271      free(checkcpdtable);

   272      checkcpdtable = NULL;

   273   }

   274   numcheckcpd = 0;

   275   FREE_FLAG(compoundflag);

   276   FREE_FLAG(compoundbegin);

   277   FREE_FLAG(compoundmiddle);

   278   FREE_FLAG(compoundend);

   279   FREE_FLAG(compoundpermitflag);

   280   FREE_FLAG(compoundforbidflag);

   281   FREE_FLAG(compoundroot);

   282   FREE_FLAG(forbiddenword);

   283   FREE_FLAG(nosuggest);

   284   FREE_FLAG(nongramsuggest);

   285   FREE_FLAG(needaffix);

   286   FREE_FLAG(lemma_present);

   287   FREE_FLAG(circumfix);

   288   FREE_FLAG(onlyincompound);

   290   cpdwordmax = 0;

   291   pHMgr = NULL;

   292   cpdmin = 0;

   293   cpdmaxsyllable = 0;

   294   if (cpdvowels) free(cpdvowels);

   295   if (cpdvowels_utf16) free(cpdvowels_utf16);

   296   if (cpdsyllablenum) free(cpdsyllablenum);

   297   free_utf_tbl();

   298   if (lang) free(lang);

   299   if (wordchars) free(wordchars);

   300   if (wordchars_utf16) free(wordchars_utf16);

   301   if (ignorechars) free(ignorechars);

   302   if (ignorechars_utf16) free(ignorechars_utf16);

   303   if (version) free(version);

   304   checknum=0;

   305 #ifdef MOZILLA_CLIENT

   306   delete [] csconv;

   307 #endif

   308 }

   311 // read in aff file and build up prefix and suffix entry objects

   312 int  AffixMgr::parse_file(const char * affpath, const char * key)

   313 {

   314   char * line; // io buffers

   315   char ft;     // affix type

   317   // checking flag duplication

   318   char dupflags[CONTSIZE];

   319   char dupflags_ini = 1;

   321   // first line indicator for removing byte order mark

   322   int firstline = 1;

   324   // open the affix file

   325   FileMgr * afflst = new FileMgr(affpath, key);

   326   if (!afflst) {

   327     HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);

   328     return 1;

   329   }

   331   // step one is to parse the affix file building up the internal

   332   // affix data structures

   334     // read in each line ignoring any that do not

   335     // start with a known line type indicator

   336     while ((line = afflst->getline())) {

   337        mychomp(line);

   339        /* remove byte order mark */

   340        if (firstline) {

   341          firstline = 0;

   342          // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions

   343          if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {

   344             memmove(line, line+3, strlen(line+3)+1);

   345          }

   346        }

   348        /* parse in the keyboard string */

   349        if (strncmp(line,"KEY",3) == 0) {

   350           if (parse_string(line, &keystring, afflst->getlinenum())) {

   351              delete afflst;

   352              return 1;

   353           }

   354        }

   356        /* parse in the try string */

   357        if (strncmp(line,"TRY",3) == 0) {

   358           if (parse_string(line, &trystring, afflst->getlinenum())) {

   359              delete afflst;

   360              return 1;

   361           }

   362        }

   364        /* parse in the name of the character set used by the .dict and .aff */

   365        if (strncmp(line,"SET",3) == 0) {

   366           if (parse_string(line, &encoding, afflst->getlinenum())) {

   367              delete afflst;

   368              return 1;

   369           }

   370           if (strcmp(encoding, "UTF-8") == 0) {

   371              utf8 = 1;

   372 #ifndef OPENOFFICEORG

   373 #ifndef MOZILLA_CLIENT

   374              if (initialize_utf_tbl()) return 1;

   375 #endif

   376 #endif

   377           }

   378        }

   380        /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */

   381        if (strncmp(line,"COMPLEXPREFIXES",15) == 0)

   382                    complexprefixes = 1;

   384        /* parse in the flag used by the controlled compound words */

   385        if (strncmp(line,"COMPOUNDFLAG",12) == 0) {

   386           if (parse_flag(line, &compoundflag, afflst)) {

   387              delete afflst;

   388              return 1;

   389           }

   390        }

   392        /* parse in the flag used by compound words */

   393        if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {

   394           if (complexprefixes) {

   395             if (parse_flag(line, &compoundend, afflst)) {

   396               delete afflst;

   397               return 1;

   398             }

   399           } else {

   400             if (parse_flag(line, &compoundbegin, afflst)) {

   401               delete afflst;

   402               return 1;

   403             }

   404           }

   405        }

   407        /* parse in the flag used by compound words */

   408        if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {

   409           if (parse_flag(line, &compoundmiddle, afflst)) {

   410              delete afflst;

   411              return 1;

   412           }

   413        }

   414        /* parse in the flag used by compound words */

   415        if (strncmp(line,"COMPOUNDEND",11) == 0) {

   416           if (complexprefixes) {

   417             if (parse_flag(line, &compoundbegin, afflst)) {

   418               delete afflst;

   419               return 1;

   420             }

   421           } else {

   422             if (parse_flag(line, &compoundend, afflst)) {

   423               delete afflst;

   424               return 1;

   425             }

   426           }

   427        }

   429        /* parse in the data used by compound_check() method */

   430        if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {

   431           if (parse_num(line, &cpdwordmax, afflst)) {

   432              delete afflst;

   433              return 1;

   434           }

   435        }

   437        /* parse in the flag sign compounds in dictionary */

   438        if (strncmp(line,"COMPOUNDROOT",12) == 0) {

   439           if (parse_flag(line, &compoundroot, afflst)) {

   440              delete afflst;

   441              return 1;

   442           }

   443        }

   445        /* parse in the flag used by compound_check() method */

   446        if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {

   447           if (parse_flag(line, &compoundpermitflag, afflst)) {

   448              delete afflst;

   449              return 1;

   450           }

   451        }

   453        /* parse in the flag used by compound_check() method */

   454        if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {

   455           if (parse_flag(line, &compoundforbidflag, afflst)) {

   456              delete afflst;

   457              return 1;

   458           }

   459        }

   461        if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {

   462                    checkcompounddup = 1;

   463        }

   465        if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {

   466                    checkcompoundrep = 1;

   467        }

   469        if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {

   470                    checkcompoundtriple = 1;

   471        }

   473        if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {

   474                    simplifiedtriple = 1;

   475        }

   477        if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {

   478                    checkcompoundcase = 1;

   479        }

   481        if (strncmp(line,"NOSUGGEST",9) == 0) {

   482           if (parse_flag(line, &nosuggest, afflst)) {

   483              delete afflst;

   484              return 1;

   485           }

   486        }

   488        if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {

   489           if (parse_flag(line, &nongramsuggest, afflst)) {

   490              delete afflst;

   491              return 1;

   492           }

   493        }

   495        /* parse in the flag used by forbidden words */

   496        if (strncmp(line,"FORBIDDENWORD",13) == 0) {

   497           if (parse_flag(line, &forbiddenword, afflst)) {

   498              delete afflst;

   499              return 1;

   500           }

   501        }

   503        /* parse in the flag used by forbidden words */

   504        if (strncmp(line,"LEMMA_PRESENT",13) == 0) {

   505           if (parse_flag(line, &lemma_present, afflst)) {

   506              delete afflst;

   507              return 1;

   508           }

   509        }

   511        /* parse in the flag used by circumfixes */

   512        if (strncmp(line,"CIRCUMFIX",9) == 0) {

   513           if (parse_flag(line, &circumfix, afflst)) {

   514              delete afflst;

   515              return 1;

   516           }

   517        }

   519        /* parse in the flag used by fogemorphemes */

   520        if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {

   521           if (parse_flag(line, &onlyincompound, afflst)) {

   522              delete afflst;

   523              return 1;

   524           }

   525        }

   527        /* parse in the flag used by `needaffixs' */

   528        if (strncmp(line,"PSEUDOROOT",10) == 0) {

   529           if (parse_flag(line, &needaffix, afflst)) {

   530              delete afflst;

   531              return 1;

   532           }

   533        }

   535        /* parse in the flag used by `needaffixs' */

   536        if (strncmp(line,"NEEDAFFIX",9) == 0) {

   537           if (parse_flag(line, &needaffix, afflst)) {

   538              delete afflst;

   539              return 1;

   540           }

   541        }

   543        /* parse in the minimal length for words in compounds */

   544        if (strncmp(line,"COMPOUNDMIN",11) == 0) {

   545           if (parse_num(line, &cpdmin, afflst)) {

   546              delete afflst;

   547              return 1;

   548           }

   549           if (cpdmin < 1) cpdmin = 1;

   550        }

   552        /* parse in the max. words and syllables in compounds */

   553        if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {

   554           if (parse_cpdsyllable(line, afflst)) {

   555              delete afflst;

   556              return 1;

   557           }

   558        }

   560        /* parse in the flag used by compound_check() method */

   561        if (strncmp(line,"SYLLABLENUM",11) == 0) {

   562           if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {

   563              delete afflst;

   564              return 1;

   565           }

   566        }

   568        /* parse in the flag used by the controlled compound words */

   569        if (strncmp(line,"CHECKNUM",8) == 0) {

   570            checknum=1;

   571        }

   573        /* parse in the extra word characters */

   574        if (strncmp(line,"WORDCHARS",9) == 0) {

   575           if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {

   576              delete afflst;

   577              return 1;

   578           }

   579        }

   581        /* parse in the ignored characters (for example, Arabic optional diacretics charachters */

   582        if (strncmp(line,"IGNORE",6) == 0) {

   583           if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {

   584              delete afflst;

   585              return 1;

   586           }

   587        }

   589        /* parse in the typical fault correcting table */

   590        if (strncmp(line,"REP",3) == 0) {

   591           if (parse_reptable(line, afflst)) {

   592              delete afflst;

   593              return 1;

   594           }

   595        }

   597        /* parse in the input conversion table */

   598        if (strncmp(line,"ICONV",5) == 0) {

   599           if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {

   600              delete afflst;

   601              return 1;

   602           }

   603        }

   605        /* parse in the input conversion table */

   606        if (strncmp(line,"OCONV",5) == 0) {

   607           if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {

   608              delete afflst;

   609              return 1;

   610           }

   611        }

   613        /* parse in the phonetic translation table */

   614        if (strncmp(line,"PHONE",5) == 0) {

   615           if (parse_phonetable(line, afflst)) {

   616              delete afflst;

   617              return 1;

   618           }

   619        }

   621        /* parse in the checkcompoundpattern table */

   622        if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {

   623           if (parse_checkcpdtable(line, afflst)) {

   624              delete afflst;

   625              return 1;

   626           }

   627        }

   629        /* parse in the defcompound table */

   630        if (strncmp(line,"COMPOUNDRULE",12) == 0) {

   631           if (parse_defcpdtable(line, afflst)) {

   632              delete afflst;

   633              return 1;

   634           }

   635        }

   637        /* parse in the related character map table */

   638        if (strncmp(line,"MAP",3) == 0) {

   639           if (parse_maptable(line, afflst)) {

   640              delete afflst;

   641              return 1;

   642           }

   643        }

   645        /* parse in the word breakpoints table */

   646        if (strncmp(line,"BREAK",5) == 0) {

   647           if (parse_breaktable(line, afflst)) {

   648              delete afflst;

   649              return 1;

   650           }

   651        }

   653        /* parse in the language for language specific codes */

   654        if (strncmp(line,"LANG",4) == 0) {

   655           if (parse_string(line, &lang, afflst->getlinenum())) {

   656              delete afflst;

   657              return 1;

   658           }

   659           langnum = get_lang_num(lang);

   660        }

   662        if (strncmp(line,"VERSION",7) == 0) {

   663           for(line = line + 7; *line == ' ' || *line == '\t'; line++);

   664           version = mystrdup(line);

   665        }

   667        if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {

   668           if (parse_num(line, &maxngramsugs, afflst)) {

   669              delete afflst;

   670              return 1;

   671           }

   672        }

   674        if (strncmp(line,"ONLYMAXDIFF", 11) == 0)

   675                    onlymaxdiff = 1;

   677        if (strncmp(line,"MAXDIFF",7) == 0) {

   678           if (parse_num(line, &maxdiff, afflst)) {

   679              delete afflst;

   680              return 1;

   681           }

   682        }

   684        if (strncmp(line,"MAXCPDSUGS",10) == 0) {

   685           if (parse_num(line, &maxcpdsugs, afflst)) {

   686              delete afflst;

   687              return 1;

   688           }

   689        }

   691        if (strncmp(line,"NOSPLITSUGS",11) == 0) {

   692                    nosplitsugs=1;

   693        }

   695        if (strncmp(line,"FULLSTRIP",9) == 0) {

   696                    fullstrip=1;

   697        }

   699        if (strncmp(line,"SUGSWITHDOTS",12) == 0) {

   700                    sugswithdots=1;

   701        }

   703        /* parse in the flag used by forbidden words */

   704        if (strncmp(line,"KEEPCASE",8) == 0) {

   705           if (parse_flag(line, &keepcase, afflst)) {

   706              delete afflst;

   707              return 1;

   708           }

   709        }

   711        /* parse in the flag used by `forceucase' */

   712        if (strncmp(line,"FORCEUCASE",10) == 0) {

   713           if (parse_flag(line, &forceucase, afflst)) {

   714              delete afflst;

   715              return 1;

   716           }

   717        }

   719        /* parse in the flag used by `warn' */

   720        if (strncmp(line,"WARN",4) == 0) {

   721           if (parse_flag(line, &warn, afflst)) {

   722              delete afflst;

   723              return 1;

   724           }

   725        }

   727        if (strncmp(line,"FORBIDWARN",10) == 0) {

   728                    forbidwarn=1;

   729        }

   731        /* parse in the flag used by the affix generator */

   732        if (strncmp(line,"SUBSTANDARD",11) == 0) {

   733           if (parse_flag(line, &substandard, afflst)) {

   734              delete afflst;

   735              return 1;

   736           }

   737        }

   739        if (strncmp(line,"CHECKSHARPS",11) == 0) {

   740                    checksharps=1;

   741        }

   743        /* parse this affix: P - prefix, S - suffix */

   744        ft = ' ';

   745        if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';

   746        if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';

   747        if (ft != ' ') {

   748           if (dupflags_ini) {

   749             memset(dupflags, 0, sizeof(dupflags));

   750             dupflags_ini = 0;

   751           }

   752           if (parse_affix(line, ft, afflst, dupflags)) {

   753              delete afflst;

   754              process_pfx_tree_to_list();

   755              process_sfx_tree_to_list();

   756              return 1;

   757           }

   758        }

   760     }

   761     delete afflst;

   763     // convert affix trees to sorted list

   764     process_pfx_tree_to_list();

   765     process_sfx_tree_to_list();

   767     // now we can speed up performance greatly taking advantage of the

   768     // relationship between the affixes and the idea of "subsets".

   770     // View each prefix as a potential leading subset of another and view

   771     // each suffix (reversed) as a potential trailing subset of another.

   773     // To illustrate this relationship if we know the prefix "ab" is found in the

   774     // word to examine, only prefixes that "ab" is a leading subset of need be examined.

   775     // Furthermore is "ab" is not present then none of the prefixes that "ab" is

   776     // is a subset need be examined.

   777     // The same argument goes for suffix string that are reversed.

   779     // Then to top this off why not examine the first char of the word to quickly

   780     // limit the set of prefixes to examine (i.e. the prefixes to examine must

   781     // be leading supersets of the first character of the word (if they exist)

   783     // To take advantage of this "subset" relationship, we need to add two links

   784     // from entry.  One to take next if the current prefix is found (call it nexteq)

   785     // and one to take next if the current prefix is not found (call it nextne).

   787     // Since we have built ordered lists, all that remains is to properly initialize

   788     // the nextne and nexteq pointers that relate them

   790     process_pfx_order();

   791     process_sfx_order();

   793     /* get encoding for CHECKCOMPOUNDCASE */

   794     if (!utf8) {

   795     char * enc = get_encoding();

   796     csconv = get_current_cs(enc);

   797     free(enc);

   798     enc = NULL;

   800     char expw[MAXLNLEN];

   801     if (wordchars) {

   802         strcpy(expw, wordchars);

   803         free(wordchars);

   804     } else *expw = '\0';

   806     for (int i = 0; i <= 255; i++) {

   807         if ( (csconv[i].cupper != csconv[i].clower) &&

   808             (! strchr(expw, (char) i))) {

   809                 *(expw + strlen(expw) + 1) = '\0';

   810                 *(expw + strlen(expw)) = (char) i;

   811         }

   812     }

   814     wordchars = mystrdup(expw);

   815     }

   817     // default BREAK definition

   818     if (numbreak == -1) {

   819         breaktable = (char **) malloc(sizeof(char *) * 3);

   820         if (!breaktable) return 1;

   821         breaktable[0] = mystrdup("-");

   822         breaktable[1] = mystrdup("^-");

   823         breaktable[2] = mystrdup("-$");

   824         if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;

   825     }

   826     return 0;

   827 }

   830 // we want to be able to quickly access prefix information

   831 // both by prefix flag, and sorted by prefix string itself

   832 // so we need to set up two indexes

   834 int AffixMgr::build_pfxtree(PfxEntry* pfxptr)

   835 {

   836   PfxEntry * ptr;

   837   PfxEntry * pptr;

   838   PfxEntry * ep = pfxptr;

   840   // get the right starting points

   841   const char * key = ep->getKey();

   842   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);

   844   // first index by flag which must exist

   845   ptr = pFlag[flg];

   846   ep->setFlgNxt(ptr);

   847   pFlag[flg] = ep;

   850   // handle the special case of null affix string

   851   if (strlen(key) == 0) {

   852     // always inset them at head of list at element 0

   853      ptr = pStart[0];

   854      ep->setNext(ptr);

   855      pStart[0] = ep;

   856      return 0;

   857   }

   859   // now handle the normal case

   860   ep->setNextEQ(NULL);

   861   ep->setNextNE(NULL);

   863   unsigned char sp = *((const unsigned char *)key);

   864   ptr = pStart[sp];

   866   // handle the first insert

   867   if (!ptr) {

   868      pStart[sp] = ep;

   869      return 0;

   870   }

   873   // otherwise use binary tree insertion so that a sorted

   874   // list can easily be generated later

   875   pptr = NULL;

   876   for (;;) {

   877     pptr = ptr;

   878     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {

   879        ptr = ptr->getNextEQ();

   880        if (!ptr) {

   881           pptr->setNextEQ(ep);

   882           break;

   883        }

   884     } else {

   885        ptr = ptr->getNextNE();

   886        if (!ptr) {

   887           pptr->setNextNE(ep);

   888           break;

   889        }

   890     }

   891   }

   892   return 0;

   893 }

   895 // we want to be able to quickly access suffix information

   896 // both by suffix flag, and sorted by the reverse of the

   897 // suffix string itself; so we need to set up two indexes

   898 int AffixMgr::build_sfxtree(SfxEntry* sfxptr)

   899 {

   900   SfxEntry * ptr;

   901   SfxEntry * pptr;

   902   SfxEntry * ep = sfxptr;

   904   /* get the right starting point */

   905   const char * key = ep->getKey();

   906   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);

   908   // first index by flag which must exist

   909   ptr = sFlag[flg];

   910   ep->setFlgNxt(ptr);

   911   sFlag[flg] = ep;

   913   // next index by affix string

   915   // handle the special case of null affix string

   916   if (strlen(key) == 0) {

   917     // always inset them at head of list at element 0

   918      ptr = sStart[0];

   919      ep->setNext(ptr);

   920      sStart[0] = ep;

   921      return 0;

   922   }

   924   // now handle the normal case

   925   ep->setNextEQ(NULL);

   926   ep->setNextNE(NULL);

   928   unsigned char sp = *((const unsigned char *)key);

   929   ptr = sStart[sp];

   931   // handle the first insert

   932   if (!ptr) {

   933      sStart[sp] = ep;

   934      return 0;

   935   }

   937   // otherwise use binary tree insertion so that a sorted

   938   // list can easily be generated later

   939   pptr = NULL;

   940   for (;;) {

   941     pptr = ptr;

   942     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {

   943        ptr = ptr->getNextEQ();

   944        if (!ptr) {

   945           pptr->setNextEQ(ep);

   946           break;

   947        }

   948     } else {

   949        ptr = ptr->getNextNE();

   950        if (!ptr) {

   951           pptr->setNextNE(ep);

   952           break;

   953        }

   954     }

   955   }

   956   return 0;

   957 }

   959 // convert from binary tree to sorted list

   960 int AffixMgr::process_pfx_tree_to_list()

   961 {

   962   for (int i=1; i< SETSIZE; i++) {

   963     pStart[i] = process_pfx_in_order(pStart[i],NULL);

   964   }

   965   return 0;

   966 }

   969 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)

   970 {

   971   if (ptr) {

   972     nptr = process_pfx_in_order(ptr->getNextNE(), nptr);

   973     ptr->setNext(nptr);

   974     nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);

   975   }

   976   return nptr;

   977 }

   980 // convert from binary tree to sorted list

   981 int AffixMgr:: process_sfx_tree_to_list()

   982 {

   983   for (int i=1; i< SETSIZE; i++) {

   984     sStart[i] = process_sfx_in_order(sStart[i],NULL);

   985   }

   986   return 0;

   987 }

   989 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)

   990 {

   991   if (ptr) {

   992     nptr = process_sfx_in_order(ptr->getNextNE(), nptr);

   993     ptr->setNext(nptr);

   994     nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);

   995   }

   996   return nptr;

   997 }

  1000 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching

  1001 // using the idea of leading subsets this time

  1002 int AffixMgr::process_pfx_order()

  1003 {

  1004     PfxEntry* ptr;

  1006     // loop through each prefix list starting point

  1007     for (int i=1; i < SETSIZE; i++) {

  1009          ptr = pStart[i];

  1011          // look through the remainder of the list

  1012          //  and find next entry with affix that

  1013          // the current one is not a subset of

  1014          // mark that as destination for NextNE

  1015          // use next in list that you are a subset

  1016          // of as NextEQ

  1018          for (; ptr != NULL; ptr = ptr->getNext()) {

  1020              PfxEntry * nptr = ptr->getNext();

  1021              for (; nptr != NULL; nptr = nptr->getNext()) {

  1022                  if (! isSubset( ptr->getKey() , nptr->getKey() )) break;

  1023              }

  1024              ptr->setNextNE(nptr);

  1025              ptr->setNextEQ(NULL);

  1026              if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))

  1027                  ptr->setNextEQ(ptr->getNext());

  1028          }

  1030          // now clean up by adding smart search termination strings:

  1031          // if you are already a superset of the previous prefix

  1032          // but not a subset of the next, search can end here

  1033          // so set NextNE properly

  1035          ptr = pStart[i];

  1036          for (; ptr != NULL; ptr = ptr->getNext()) {

  1037              PfxEntry * nptr = ptr->getNext();

  1038              PfxEntry * mptr = NULL;

  1039              for (; nptr != NULL; nptr = nptr->getNext()) {

  1040                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;

  1041                  mptr = nptr;

  1042              }

  1043              if (mptr) mptr->setNextNE(NULL);

  1044          }

  1045     }

  1046     return 0;

  1047 }

  1049 // initialize the SfxEntry links NextEQ and NextNE to speed searching

  1050 // using the idea of leading subsets this time

  1051 int AffixMgr::process_sfx_order()

  1052 {

  1053     SfxEntry* ptr;

  1055     // loop through each prefix list starting point

  1056     for (int i=1; i < SETSIZE; i++) {

  1058          ptr = sStart[i];

  1060          // look through the remainder of the list

  1061          //  and find next entry with affix that

  1062          // the current one is not a subset of

  1063          // mark that as destination for NextNE

  1064          // use next in list that you are a subset

  1065          // of as NextEQ

  1067          for (; ptr != NULL; ptr = ptr->getNext()) {

  1068              SfxEntry * nptr = ptr->getNext();

  1069              for (; nptr != NULL; nptr = nptr->getNext()) {

  1070                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;

  1071              }

  1072              ptr->setNextNE(nptr);

  1073              ptr->setNextEQ(NULL);

  1074              if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))

  1075                  ptr->setNextEQ(ptr->getNext());

  1076          }

  1079          // now clean up by adding smart search termination strings:

  1080          // if you are already a superset of the previous suffix

  1081          // but not a subset of the next, search can end here

  1082          // so set NextNE properly

  1084          ptr = sStart[i];

  1085          for (; ptr != NULL; ptr = ptr->getNext()) {

  1086              SfxEntry * nptr = ptr->getNext();

  1087              SfxEntry * mptr = NULL;

  1088              for (; nptr != NULL; nptr = nptr->getNext()) {

  1089                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;

  1090                  mptr = nptr;

  1091              }

  1092              if (mptr) mptr->setNextNE(NULL);

  1093          }

  1094     }

  1095     return 0;

  1096 }

  1098 // add flags to the result for dictionary debugging

  1099 void AffixMgr::debugflag(char * result, unsigned short flag) {

  1100     char * st = encode_flag(flag);

  1101     mystrcat(result, " ", MAXLNLEN);

  1102     mystrcat(result, MORPH_FLAG, MAXLNLEN);

  1103     if (st) {

  1104         mystrcat(result, st, MAXLNLEN);

  1105         free(st);

  1106     }

  1107 }

  1109 // calculate the character length of the condition

  1110 int AffixMgr::condlen(char * st)

  1111 {

  1112   int l = 0;

  1113   bool group = false;

  1114   for(; *st; st++) {

  1115     if (*st == '[') {

  1116         group = true;

  1117         l++;

  1118     } else if (*st == ']') group = false;

  1119     else if (!group && (!utf8 ||

  1120         (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;

  1121   }

  1122   return l;

  1123 }

  1125 int AffixMgr::encodeit(affentry &entry, char * cs)

  1126 {

  1127   if (strcmp(cs,".") != 0) {

  1128     entry.numconds = (char) condlen(cs);

  1129     strncpy(entry.c.conds, cs, MAXCONDLEN);

  1130     // long condition (end of conds padded by strncpy)

  1131     if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {

  1132       entry.opts += aeLONGCOND;

  1133       entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);

  1134       if (!entry.c.l.conds2) return 1;

  1135     }

  1136   } else {

  1137     entry.numconds = 0;

  1138     entry.c.conds[0] = '\0';

  1139   }

  1140   return 0;

  1141 }

  1143 // return 1 if s1 is a leading subset of s2 (dots are for infixes)

  1144 inline int AffixMgr::isSubset(const char * s1, const char * s2)

  1145  {

  1146     while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {

  1147         s1++;

  1148         s2++;

  1149     }

  1150     return (*s1 == '\0');

  1151  }

  1154 // check word for prefixes

  1155 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,

  1156     const FLAG needflag)

  1157 {

  1158     struct hentry * rv= NULL;

  1160     pfx = NULL;

  1161     pfxappnd = NULL;

  1162     sfxappnd = NULL;

  1164     // first handle the special case of 0 length prefixes

  1165     PfxEntry * pe = pStart[0];

  1166     while (pe) {

  1167         if (

  1168             // fogemorpheme

  1169               ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&

  1170                   (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&

  1171             // permit prefixes in compounds

  1172               ((in_compound != IN_CPD_END) || (pe->getCont() &&

  1173                   (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))

  1174               ) {

  1175                     // check prefix

  1176                     rv = pe->checkword(word, len, in_compound, needflag);

  1177                     if (rv) {

  1178                         pfx=pe; // BUG: pfx not stateless

  1179                         return rv;

  1180                     }

  1181              }

  1182        pe = pe->getNext();

  1183     }

  1185     // now handle the general case

  1186     unsigned char sp = *((const unsigned char *)word);

  1187     PfxEntry * pptr = pStart[sp];

  1189     while (pptr) {

  1190         if (isSubset(pptr->getKey(),word)) {

  1191              if (

  1192             // fogemorpheme

  1193               ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&

  1194                   (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&

  1195             // permit prefixes in compounds

  1196               ((in_compound != IN_CPD_END) || (pptr->getCont() &&

  1197                   (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))

  1198               ) {

  1199             // check prefix

  1200                   rv = pptr->checkword(word, len, in_compound, needflag);

  1201                   if (rv) {

  1202                     pfx=pptr; // BUG: pfx not stateless

  1203                     return rv;

  1204                   }

  1205              }

  1206              pptr = pptr->getNextEQ();

  1207         } else {

  1208              pptr = pptr->getNextNE();

  1209         }

  1210     }

  1212     return NULL;

  1213 }

  1215 // check word for prefixes

  1216 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,

  1217     char in_compound, const FLAG needflag)

  1218 {

  1219     struct hentry * rv= NULL;

  1221     pfx = NULL;

  1222     sfxappnd = NULL;

  1224     // first handle the special case of 0 length prefixes

  1225     PfxEntry * pe = pStart[0];

  1227     while (pe) {

  1228         rv = pe->check_twosfx(word, len, in_compound, needflag);

  1229         if (rv) return rv;

  1230         pe = pe->getNext();

  1231     }

  1233     // now handle the general case

  1234     unsigned char sp = *((const unsigned char *)word);

  1235     PfxEntry * pptr = pStart[sp];

  1237     while (pptr) {

  1238         if (isSubset(pptr->getKey(),word)) {

  1239             rv = pptr->check_twosfx(word, len, in_compound, needflag);

  1240             if (rv) {

  1241                 pfx = pptr;

  1242                 return rv;

  1243             }

  1244             pptr = pptr->getNextEQ();

  1245         } else {

  1246              pptr = pptr->getNextNE();

  1247         }

  1248     }

  1250     return NULL;

  1251 }

  1253 // check word for prefixes

  1254 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,

  1255     const FLAG needflag)

  1256 {

  1257     char * st;

  1259     char result[MAXLNLEN];

  1260     result[0] = '\0';

  1262     pfx = NULL;

  1263     sfxappnd = NULL;

  1265     // first handle the special case of 0 length prefixes

  1266     PfxEntry * pe = pStart[0];

  1267     while (pe) {

  1268        st = pe->check_morph(word,len,in_compound, needflag);

  1269        if (st) {

  1270             mystrcat(result, st, MAXLNLEN);

  1271             free(st);

  1272        }

  1273        // if (rv) return rv;

  1274        pe = pe->getNext();

  1275     }

  1277     // now handle the general case

  1278     unsigned char sp = *((const unsigned char *)word);

  1279     PfxEntry * pptr = pStart[sp];

  1281     while (pptr) {

  1282         if (isSubset(pptr->getKey(),word)) {

  1283             st = pptr->check_morph(word,len,in_compound, needflag);

  1284             if (st) {

  1285               // fogemorpheme

  1286               if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&

  1287                         (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {

  1288                     mystrcat(result, st, MAXLNLEN);

  1289                     pfx = pptr;

  1290                 }

  1291                 free(st);

  1292             }

  1293             pptr = pptr->getNextEQ();

  1294         } else {

  1295             pptr = pptr->getNextNE();

  1296         }

  1297     }

  1299     if (*result) return mystrdup(result);

  1300     return NULL;

  1301 }

  1304 // check word for prefixes

  1305 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,

  1306     char in_compound, const FLAG needflag)

  1307 {

  1308     char * st;

  1310     char result[MAXLNLEN];

  1311     result[0] = '\0';

  1313     pfx = NULL;

  1314     sfxappnd = NULL;

  1316     // first handle the special case of 0 length prefixes

  1317     PfxEntry * pe = pStart[0];

  1318     while (pe) {

  1319         st = pe->check_twosfx_morph(word,len,in_compound, needflag);

  1320         if (st) {

  1321             mystrcat(result, st, MAXLNLEN);

  1322             free(st);

  1323         }

  1324         pe = pe->getNext();

  1325     }

  1327     // now handle the general case

  1328     unsigned char sp = *((const unsigned char *)word);

  1329     PfxEntry * pptr = pStart[sp];

  1331     while (pptr) {

  1332         if (isSubset(pptr->getKey(),word)) {

  1333             st = pptr->check_twosfx_morph(word, len, in_compound, needflag);

  1334             if (st) {

  1335                 mystrcat(result, st, MAXLNLEN);

  1336                 free(st);

  1337                 pfx = pptr;

  1338             }

  1339             pptr = pptr->getNextEQ();

  1340         } else {

  1341             pptr = pptr->getNextNE();

  1342         }

  1343     }

  1345     if (*result) return mystrdup(result);

  1346     return NULL;

  1347 }

  1349 // Is word a non compound with a REP substitution (see checkcompoundrep)?

  1350 int AffixMgr::cpdrep_check(const char * word, int wl)

  1351 {

  1352   char candidate[MAXLNLEN];

  1353   const char * r;

  1354   int lenr, lenp;

  1356   if ((wl < 2) || !numrep) return 0;

  1358   for (int i=0; i < numrep; i++ ) {

  1359       r = word;

  1360       lenr = strlen(reptable[i].pattern2);

  1361       lenp = strlen(reptable[i].pattern);

  1362       // search every occurence of the pattern in the word

  1363       while ((r=strstr(r, reptable[i].pattern)) != NULL) {

  1364           strcpy(candidate, word);

  1365           if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;

  1366           strcpy(candidate+(r-word),reptable[i].pattern2);

  1367           strcpy(candidate+(r-word)+lenr, r+lenp);

  1368           if (candidate_check(candidate,strlen(candidate))) return 1;

  1369           r++; // search for the next letter

  1370       }

  1371    }

  1372    return 0;

  1373 }

  1375 // forbid compoundings when there are special patterns at word bound

  1376 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed)

  1377 {

  1378   int len;

  1379   for (int i = 0; i < numcheckcpd; i++) {

  1380       if (isSubset(checkcpdtable[i].pattern2, word + pos) &&

  1381         (!r1 || !checkcpdtable[i].cond ||

  1382           (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&

  1383         (!r2 || !checkcpdtable[i].cond2 ||

  1384           (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&

  1385         // zero length pattern => only TESTAFF

  1386         // zero pattern (0/flag) => unmodified stem (zero affixes allowed)

  1387         (!*(checkcpdtable[i].pattern) || (

  1388             (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||

  1389             (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) &&

  1390                 strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {

  1391             return 1;

  1392         }

  1393   }

  1394   return 0;

  1395 }

  1397 // forbid compounding with neighbouring upper and lower case characters at word bounds

  1398 int AffixMgr::cpdcase_check(const char * word, int pos)

  1399 {

  1400   if (utf8) {

  1401       w_char u, w;

  1402       const char * p;

  1403       u8_u16(&u, 1, word + pos);

  1404       for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);

  1405       u8_u16(&w, 1, p);

  1406       unsigned short a = (u.h << 8) + u.l;

  1407       unsigned short b = (w.h << 8) + w.l;

  1408       if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&

  1409           (a != '-') && (b != '-')) return 1;

  1410   } else {

  1411       unsigned char a = *(word + pos - 1);

  1412       unsigned char b = *(word + pos);

  1413       if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;

  1414   }

  1415   return 0;

  1416 }

  1418 // check compound patterns

  // Check whether the word sequence (*words)[0..wnum], with rv stored as its
  // last element, matches one of the flag patterns in defcpdtable (the
  // existing comments call the flags involved COMPOUNDRULE flags).
  //   words - in/out: pointer to the array of matched words; when *words is
  //           NULL the buffer def is used in its place
  //   wnum  - index at which rv is stored
  //   rv    - dictionary entry of the newest word
  //   def   - fallback buffer used when *words is NULL
  //   all   - when zero, the check after the backtracking loop also accepts a
  //           match that has not consumed the whole pattern
  // Returns 1 on a match (rv then stays in (*words)[wnum]).  Otherwise the
  // slot is reset to NULL, a borrowed def buffer is detached from *words
  // again, and 0 is returned.
  1419 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)

  1420 {

  1421   signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking

  1422   signed short btwp[MAXWORDLEN]; // word positions for metacharacters

  1423   int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions

         // bt: number of entries currently on the backtracking stack above
         // NOTE(review): bt is not checked against MAXWORDLEN in this function
         // - confirm that callers bound wnum accordingly
  1424   short bt = 0;

  1425   int i, j;

  1426   int ok;

         // w: set when *words was NULL on entry and def had to be borrowed
  1427   int w = 0;

  1429   if (!*words) {

  1430     w = 1;

  1431     *words = def;

  1432   }

         // neither a caller array nor a fallback buffer: nothing to match in
  1434   if (!*words) {

  1435     return 0;

  1436   }

  1438   (*words)[wnum] = rv;

  1440   // has the last word COMPOUNDRULE flag?

         // a word without any flags cannot take part in a pattern
  1441   if (rv->alen == 0) {

  1442     (*words)[wnum] = NULL;

  1443     if (w) *words = NULL;

  1444     return 0;

  1445   }

         // quick rejection: rv must carry at least one flag that occurs as a
         // literal element (not '*' or '?') in some pattern
  1446   ok = 0;

  1447   for (i = 0; i < numdefcpd; i++) {

  1448     for (j = 0; j < defcpdtable[i].len; j++) {

  1449        if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&

  1450           TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;

  1451     }

  1452   }

  1453   if (ok == 0) {

  1454     (*words)[wnum] = NULL;

  1455     if (w) *words = NULL;

  1456     return 0;

  1457   }

         // try every pattern in turn
  1459   for (i = 0; i < numdefcpd; i++) {

  1460     signed short pp = 0; // pattern position

  1461     signed short wp = 0; // "words" position

           // ok  : cleared when a literal pattern element fails
           // ok2 : cleared when a '*' / '?' element fails or leaves words over
  1462     int ok2;

  1463     ok = 1;

  1464     ok2 = 1;

  1465     do {

  1466       while ((pp < defcpdtable[i].len) && (wp <= wnum)) {

               // element followed by '*' or '?': consume as many words as
               // allowed and remember the position for backtracking
  1467         if (((pp+1) < defcpdtable[i].len) &&

  1468           ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {

                   // '?' may take the word at wp only, '*' any up to wnum
  1469             int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;

  1470             ok2 = 1;

  1471             pp+=2;

  1472             btpp[bt] = pp;

  1473             btwp[bt] = wp;

  1474             while (wp <= wend) {

                       // NOTE(review): unlike the literal branch below,
                       // (*words)[wp] is not tested for NULL here
  1475                 if (!(*words)[wp]->alen ||

  1476                   !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {

  1477                     ok2 = 0;

  1478                     break;

  1479                 }

  1480                 wp++;

  1481             }

                   // words left over after this element: not a full match yet
  1482             if (wp <= wnum) ok2 = 0;

                   // number of words this element consumed; only kept on the
                   // stack when it consumed at least one
  1483             btnum[bt] = wp - btwp[bt];

  1484             if (btnum[bt] > 0) bt++;

  1485             if (ok2) break;

  1486         } else {

                   // literal element: the word at wp must carry this flag
  1487             ok2 = 1;

  1488             if (!(*words)[wp] || !(*words)[wp]->alen ||

  1489               !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {

  1490                 ok = 0;

  1491                 break;

  1492             }

  1493             pp++;

  1494             wp++;

                   // pattern used up while words remain: mismatch
  1495             if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;

  1496         }

  1497       }

           // matched so far: skip trailing "x*" / "x?" elements (they may match
           // nothing) and accept if that exhausts the pattern
  1498     if (ok && ok2) {

  1499         int r = pp;

  1500         while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&

  1501             ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;

  1502         if (defcpdtable[i].len <= r) return 1;

  1503     }

  1504     // backtrack

           // give back one word from the most recent '*' / '?' element and
           // resume behind it; elements whose count drops below zero are popped
  1505     if (bt) do {

  1506         ok = 1;

  1507         btnum[bt - 1]--;

  1508         pp = btpp[bt - 1];

  1509         wp = btwp[bt - 1] + (signed short) btnum[bt - 1];

  1510     } while ((btnum[bt - 1] < 0) && --bt);

  1511   } while (bt);

         // backtracking exhausted for this pattern; with all == 0 the pattern
         // need not be consumed completely
  1513   if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;

  1515   // check zero ending

  1516   while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&

  1517     ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;

  1518   if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;

  1519   }

         // no pattern matched: undo the registration of rv
  1520   (*words)[wnum] = NULL;

  1521   if (w) *words = NULL;

  1522   return 0;

  1523 }

  1525 inline int AffixMgr::candidate_check(const char * word, int len)

  1526 {

  1527   struct hentry * rv=NULL;

  1529   rv = lookup(word);

  1530   if (rv) return 1;

  1532 //  rv = prefix_check(word,len,1);

  1533 //  if (rv) return 1;

  1535   rv = affix_check(word,len);

  1536   if (rv) return 1;

  1537   return 0;

  1538 }

  1540 // calculate number of syllable for compound-checking

  1541 short AffixMgr::get_syllable(const char * word, int wlen)

  1542 {

  1543     if (cpdmaxsyllable==0) return 0;

  1545     short num=0;

  1547     if (!utf8) {

  1548         for (int i=0; i<wlen; i++) {

  1549             if (strchr(cpdvowels, word[i])) num++;

  1550         }

  1551     } else if (cpdvowels_utf16) {

  1552         w_char w[MAXWORDUTF8LEN];

  1553         int i = u8_u16(w, MAXWORDUTF8LEN, word);

  1554         for (; i > 0; i--) {

  1555             if (flag_bsearch((unsigned short *) cpdvowels_utf16,

  1556                 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;

  1557         }

  1558     }

  1559     return num;

  1560 }

  1562 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {

  1563     if (utf8) {

  1564         int i;

  1565         for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {

  1566           for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);

  1567         }

  1568         for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {

  1569           for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);

  1570         }

  1571     } else {

  1572         *cmin = cpdmin;

  1573         *cmax = len - cpdmin + 1;

  1574     }

  1575 }

  1578 // check if compound word is correctly spelled

  1579 // hu_mov_rule = spec. Hungarian rule (XXX)

  1580 struct hentry * AffixMgr::compound_check(const char * word, int len,

  1581     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,

  1582     char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)

  1583 {

  1584     int i;

  1585     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;

  1586     struct hentry * rv = NULL;

  1587     struct hentry * rv_first;

  1588     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking

  1589     char st [MAXWORDUTF8LEN + 4];

  1590     char ch = '\0';

  1591     int cmin;

  1592     int cmax;

  1593     int striple = 0;

  1594     int scpd = 0;

  1595     int soldi = 0;

  1596     int oldcmin = 0;

  1597     int oldcmax = 0;

  1598     int oldlen = 0;

  1599     int checkedstriple = 0;

  1600     int onlycpdrule;

  1601     int affixed = 0;

  1602     hentry ** oldwords = words;

  1604     int checked_prefix;

  1606     setcminmax(&cmin, &cmax, word, len);

  1608     strcpy(st, word);

  1610     for (i = cmin; i < cmax; i++) {

  1611         // go to end of the UTF-8 character

  1612         if (utf8) {

  1613             for (; (st[i] & 0xc0) == 0x80; i++);

  1614             if (i >= cmax) return NULL;

  1615         }

  1617         words = oldwords;

  1618         onlycpdrule = (words) ? 1 : 0;

  1620         do { // onlycpdrule loop

  1622         oldnumsyllable = numsyllable;

  1623         oldwordnum = wordnum;

  1624         checked_prefix = 0;

  1627         do { // simplified checkcompoundpattern loop

  1629         if (scpd > 0) {

  1630           for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||

  1631             strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);

  1633           if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop

  1634           strcpy(st + i, checkcpdtable[scpd-1].pattern);

  1635           soldi = i;

  1636           i += strlen(checkcpdtable[scpd-1].pattern);

  1637           strcpy(st + i, checkcpdtable[scpd-1].pattern2);

  1638           strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));

  1640           oldlen = len;

  1641           len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);

  1642           oldcmin = cmin;

  1643           oldcmax = cmax;

  1644           setcminmax(&cmin, &cmax, st, len);

  1646           cmax = len - cpdmin + 1;

  1647         }

  1649         ch = st[i];

  1650         st[i] = '\0';

  1652         sfx = NULL;

  1653         pfx = NULL;

  1655         // FIRST WORD

  1657         affixed = 1;

  1658         rv = lookup(st); // perhaps without prefix

  1660         // search homonym with compound flag

  1661         while ((rv) && !hu_mov_rule &&

  1662             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||

  1663                 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  1664                   (compoundbegin && !wordnum && !onlycpdrule &&

  1665                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||

  1666                   (compoundmiddle && wordnum && !words && !onlycpdrule &&

  1667                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||

  1668                   (numdefcpd && onlycpdrule &&

  1669                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||

  1670                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||

  1671                   (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&

  1672                     !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))

  1673                   ) {

  1674             rv = rv->next_homonym;

  1675         }

  1677         if (rv) affixed = 0;

  1679         if (!rv) {

  1680             if (onlycpdrule) break;

  1681             if (compoundflag &&

  1682              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {

  1683                 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,

  1684                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&

  1685                     sfx->getCont() &&

  1686                         ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,

  1687                             sfx->getContLen())) || (compoundend &&

  1688                         TESTAFF(sfx->getCont(), compoundend,

  1689                             sfx->getContLen())))) {

  1690                         rv = NULL;

  1691                 }

  1692             }

  1694             if (rv ||

  1695               (((wordnum == 0) && compoundbegin &&

  1696                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||

  1697                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||

  1698               ((wordnum > 0) && compoundmiddle &&

  1699                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||

  1700                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))

  1701               ) checked_prefix = 1;

  1702         // else check forbiddenwords and needaffix

  1703         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  1704             TESTAFF(rv->astr, needaffix, rv->alen) ||

  1705             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

  1706             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))

  1707              )) {

  1708                 st[i] = ch;

  1709                 //continue;

  1710                 break;

  1711         }

  1713             // check non_compound flag in suffix and prefix

  1714             if ((rv) && !hu_mov_rule &&

  1715                 ((pfx && pfx->getCont() &&

  1716                     TESTAFF(pfx->getCont(), compoundforbidflag,

  1717                         pfx->getContLen())) ||

  1718                 (sfx && sfx->getCont() &&

  1719                     TESTAFF(sfx->getCont(), compoundforbidflag,

  1720                         sfx->getContLen())))) {

  1721                     rv = NULL;

  1722             }

  1724             // check compoundend flag in suffix and prefix

  1725             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&

  1726                 ((pfx && pfx->getCont() &&

  1727                     TESTAFF(pfx->getCont(), compoundend,

  1728                         pfx->getContLen())) ||

  1729                 (sfx && sfx->getCont() &&

  1730                     TESTAFF(sfx->getCont(), compoundend,

  1731                         sfx->getContLen())))) {

  1732                     rv = NULL;

  1733             }

  1735             // check compoundmiddle flag in suffix and prefix

  1736             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&

  1737                 ((pfx && pfx->getCont() &&

  1738                     TESTAFF(pfx->getCont(), compoundmiddle,

  1739                         pfx->getContLen())) ||

  1740                 (sfx && sfx->getCont() &&

  1741                     TESTAFF(sfx->getCont(), compoundmiddle,

  1742                         sfx->getContLen())))) {

  1743                     rv = NULL;

  1744             }

  1746         // check forbiddenwords

  1747         if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  1748             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

  1749             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {

  1750                 return NULL;

  1751             }

  1753         // increment word number, if the second root has a compoundroot flag

  1754         if ((rv) && compoundroot &&

  1755             (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  1756                 wordnum++;

  1757         }

  1759         // first word is acceptable in compound words?

  1760         if (((rv) &&

  1761           ( checked_prefix || (words && words[wnum]) ||

  1762             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  1763             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||

  1764             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||

  1765 //            (numdefcpd && )

  1767 // LANG_hu section: spec. Hungarian rule

  1768             || ((langnum == LANG_hu) && hu_mov_rule && (

  1769                     TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes

  1770                     TESTAFF(rv->astr, 'G', rv->alen) ||

  1771                     TESTAFF(rv->astr, 'H', rv->alen)

  1772                 )

  1773               )

  1774 // END of LANG_hu section

  1775           ) &&

  1776           (

  1777              // test CHECKCOMPOUNDPATTERN conditions

  1778              scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||

  1779                 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)

  1780           )

  1781           && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters

  1782                    (word[i-1]==word[i]) && (

  1783                       ((i>1) && (word[i-1]==word[i-2])) ||

  1784                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'

  1785                    )

  1786                ) ||

  1787                (

  1788                  checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)

  1789                ))

  1790          )

  1791 // LANG_hu section: spec. Hungarian rule

  1792          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&

  1793               (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes

  1794                         TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||

  1795                         TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())

  1796                     )

  1797                )

  1798              )

  1799          ) { // first word is ok condition

  1801 // LANG_hu section: spec. Hungarian rule

  1802             if (langnum == LANG_hu) {

  1803                 // calculate syllable number of the word

  1804                 numsyllable += get_syllable(st, i);

  1805                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)

  1806                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;

  1807             }

  1808 // END of LANG_hu section

  1810             // NEXT WORD(S)

  1811             rv_first = rv;

  1812             st[i] = ch;

  1814         do { // striple loop

  1816             // check simplifiedtriple

  1817             if (simplifiedtriple) {

  1818               if (striple) {

  1819                 checkedstriple = 1;

  1820                 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"

  1821               } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;

  1822             }

  1824             rv = lookup((st+i)); // perhaps without prefix

  1826         // search homonym with compound flag

  1827         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||

  1828                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  1829                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||

  1830                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||

  1831                              (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&

  1832                                 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))

  1833                            )) {

  1834             rv = rv->next_homonym;

  1835         }

  1837             // check FORCEUCASE

  1838             if (rv && forceucase && (rv) &&

  1839                 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;

  1841             if (rv && words && words[wnum + 1]) return rv_first;

  1843             oldnumsyllable2 = numsyllable;

  1844             oldwordnum2 = wordnum;

  1847 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code

  1848             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {

  1849                 numsyllable--;

  1850             }

  1851 // END of LANG_hu section

  1853             // increment word number, if the second root has a compoundroot flag

  1854             if ((rv) && (compoundroot) &&

  1855                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  1856                     wordnum++;

  1857             }

  1859             // check forbiddenwords

  1860             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  1861                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

  1862                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;

  1864             // second word is acceptable, as a root?

  1865             // hungarian conventions: compounding is acceptable,

  1866             // when compound forms consist of 2 words, or if more,

  1867             // then the syllable number of root words must be 6, or lesser.

  1869             if ((rv) && (

  1870                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  1871                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))

  1872                     )

  1873                 && (

  1874                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||

  1875                       ((cpdmaxsyllable!=0) &&

  1876                           (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))

  1877                     ) &&

  1878                (

  1879                  // test CHECKCOMPOUNDPATTERN

  1880                  !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0)

  1881                ) &&

  1882                 (

  1883                      (!checkcompounddup || (rv != rv_first))

  1884                    )

  1885             // test CHECKCOMPOUNDPATTERN conditions

  1886                 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||

  1887                       TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))

  1888                 )

  1889                  {

  1890                       // forbid compound word, if it is a non compound word with typical fault

  1891                       if (checkcompoundrep && cpdrep_check(word,len)) return NULL;

  1892                       return rv_first;

  1893             }

  1895             numsyllable = oldnumsyllable2;

  1896             wordnum = oldwordnum2;

  1898             // perhaps second word has prefix or/and suffix

  1899             sfx = NULL;

  1900             sfxflag = FLAG_NULL;

  1901             rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;

  1902             if (!rv && compoundend && !onlycpdrule) {

  1903                 sfx = NULL;

  1904                 pfx = NULL;

  1905                 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);

  1906             }

  1908             if (!rv && numdefcpd && words) {

  1909                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);

  1910                 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;

  1911                 rv = NULL;

  1912             }

  1914             // test CHECKCOMPOUNDPATTERN conditions (allowed forms)

  1915             if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||

  1916                 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;

  1918             // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)

  1919             if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL;

  1921             // check non_compound flag in suffix and prefix

  1922             if ((rv) &&

  1923                 ((pfx && pfx->getCont() &&

  1924                     TESTAFF(pfx->getCont(), compoundforbidflag,

  1925                         pfx->getContLen())) ||

  1926                 (sfx && sfx->getCont() &&

  1927                     TESTAFF(sfx->getCont(), compoundforbidflag,

  1928                         sfx->getContLen())))) {

  1929                     rv = NULL;

  1930             }

  1932             // check FORCEUCASE

  1933             if (rv && forceucase && (rv) &&

  1934                 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;

  1936             // check forbiddenwords

  1937             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  1938                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

  1939                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;

  1941             // pfxappnd = prefix of word+i, or NULL

  1942             // calculate syllable number of prefix.

  1943             // hungarian convention: when syllable number of prefix is more,

  1944             // than 1, the prefix+word counts as two words.

  1946             if (langnum == LANG_hu) {

  1947                 // calculate syllable number of the word

  1948                 numsyllable += get_syllable(word + i, strlen(word + i));

  1950                 // - affix syllable num.

  1951                 // XXX only second suffix (inflections, not derivations)

  1952                 if (sfxappnd) {

  1953                     char * tmp = myrevstrdup(sfxappnd);

  1954                     numsyllable -= get_syllable(tmp, strlen(tmp));

  1955                     free(tmp);

  1956                 }

  1958                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)

  1959                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;

  1961                 // increment syllable num, if last word has a SYLLABLENUM flag

  1962                 // and the suffix is beginning `s'

  1964                 if (cpdsyllablenum) {

  1965                     switch (sfxflag) {

  1966                         case 'c': { numsyllable+=2; break; }

  1967                         case 'J': { numsyllable += 1; break; }

  1968                         case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }

  1969                     }

  1970                 }

  1971             }

  1973             // increment word number, if the second word has a compoundroot flag

  1974             if ((rv) && (compoundroot) &&

  1975                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  1976                     wordnum++;

  1977             }

  1979             // second word is acceptable, as a word with prefix or/and suffix?

  1980             // hungarian conventions: compounding is acceptable,

  1981             // when compound forms consist 2 word, otherwise

  1982             // the syllable number of root words is 6, or lesser.

  1983             if ((rv) &&

  1984                     (

  1985                       ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||

  1986                       ((cpdmaxsyllable != 0) &&

  1987                           (numsyllable <= cpdmaxsyllable))

  1988                     )

  1989                 && (

  1990                    (!checkcompounddup || (rv != rv_first))

  1991                    )) {

  1992                     // forbid compound word, if it is a non compound word with typical fault

  1993                     if (checkcompoundrep && cpdrep_check(word, len)) return NULL;

  1994                     return rv_first;

  1995             }

  1997             numsyllable = oldnumsyllable2;

  1998             wordnum = oldwordnum2;

  2000             // perhaps second word is a compound word (recursive call)

  2001             if (wordnum < maxwordnum) {

  2002                 rv = compound_check((st+i),strlen(st+i), wordnum+1,

  2003                      numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info);

  2005                 if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) ||

  2006                    (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL;

  2007             } else {

  2008                 rv=NULL;

  2009             }

  2010             if (rv) {

  2011                 // forbid compound word, if it is a non compound word with typical fault

  2012                 if (checkcompoundrep || forbiddenword) {

  2013                     struct hentry * rv2 = NULL;

  2015                     if (checkcompoundrep && cpdrep_check(word, len)) return NULL;

  2017                     // check first part

  2018                     if (strncmp(rv->word, word + i, rv->blen) == 0) {

  2019                         char r = *(st + i + rv->blen);

  2020                         *(st + i + rv->blen) = '\0';

  2022                         if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {

  2023                             *(st + i + rv->blen) = r;

  2024                             continue;

  2025                         }

  2027                         if (forbiddenword) {

  2028                             rv2 = lookup(word);

  2029                             if (!rv2) rv2 = affix_check(word, len);

  2030                             if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&

  2031                                 (strncmp(rv2->word, st, i + rv->blen) == 0)) {

  2032                                     return NULL;

  2033                             }

  2034                         }

  2035                         *(st + i + rv->blen) = r;

  2036                     }

  2037                 }

  2038                 return rv_first;

  2039             }

  2040           } while (striple && !checkedstriple); // end of striple loop

  2042           if (checkedstriple) {

  2043             i++;

  2044             checkedstriple = 0;

  2045             striple = 0;

  2046           }

  2048         } // first word is ok condition

  2050         if (soldi != 0) {

  2051           i = soldi;

  2052           soldi = 0;

  2053           len = oldlen;

  2054           cmin = oldcmin;

  2055           cmax = oldcmax;

  2056         }

  2057         scpd++;

  2060         } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop

  2062         scpd = 0;

  2063         wordnum = oldwordnum;

  2064         numsyllable = oldnumsyllable;

  2066         if (soldi != 0) {

  2067           i = soldi;

  2068           strcpy(st, word); // XXX add more optim.

  2069           soldi = 0;

  2070         } else st[i] = ch;

  2072         } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop

  2074     }

  2076     return NULL;

  2077 }

  2079 // check if compound word is correctly spelled, appending the morphological description of its parts to *result

  2080 // hu_mov_rule = special Hungarian rule (XXX)

  2081 int AffixMgr::compound_check_morph(const char * word, int len,

  2082     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,

  2083     char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)

  2084 {

  2085     int i;

  2086     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;

  2087     int ok = 0;

  2089     struct hentry * rv = NULL;

  2090     struct hentry * rv_first;

  2091     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking

  2092     char st [MAXWORDUTF8LEN + 4];

  2093     char ch;

  2095     int checked_prefix;

  2096     char presult[MAXLNLEN];

  2098     int cmin;

  2099     int cmax;

  2101     int onlycpdrule;

  2102     int affixed = 0;

  2103     hentry ** oldwords = words;

  2105     setcminmax(&cmin, &cmax, word, len);

  2107     strcpy(st, word);

  2109     for (i = cmin; i < cmax; i++) {

  2110         oldnumsyllable = numsyllable;

  2111         oldwordnum = wordnum;

  2112         checked_prefix = 0;

  2114         // go to end of the UTF-8 character

  2115         if (utf8) {

  2116             for (; (st[i] & 0xc0) == 0x80; i++);

  2117             if (i >= cmax) return 0;

  2118         }

  2120         words = oldwords;

  2121         onlycpdrule = (words) ? 1 : 0;

  2123         do { // onlycpdrule loop

  2125         oldnumsyllable = numsyllable;

  2126         oldwordnum = wordnum;

  2127         checked_prefix = 0;

  2129         ch = st[i];

  2130         st[i] = '\0';

  2131         sfx = NULL;

  2133         // FIRST WORD

  2135         affixed = 1;

  2137         *presult = '\0';

  2138         if (partresult) mystrcat(presult, partresult, MAXLNLEN);

  2140         rv = lookup(st); // perhaps without prefix

  2142         // search homonym with compound flag

  2143         while ((rv) && !hu_mov_rule &&

  2144             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||

  2145                 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  2146                 (compoundbegin && !wordnum && !onlycpdrule &&

  2147                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||

  2148                 (compoundmiddle && wordnum && !words && !onlycpdrule &&

  2149                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||

  2150                   (numdefcpd && onlycpdrule &&

  2151                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||

  2152                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))

  2153                   ))) {

  2154             rv = rv->next_homonym;

  2155         }

  2157         if (rv) affixed = 0;

  2159         if (rv)  {

  2160             sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);

  2161             if (!HENTRY_FIND(rv, MORPH_STEM)) {

  2162                 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);

  2163             }

  2164             // store the pointer of the hash entry

  2165 //            sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);

  2166             if (HENTRY_DATA(rv)) {

  2167                 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));

  2168             }

  2169         }

  2171         if (!rv) {

  2172             if (onlycpdrule) break;

  2173             if (compoundflag &&

  2174              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {

  2175                 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,

  2176                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&

  2177                     sfx->getCont() &&

  2178                         ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,

  2179                             sfx->getContLen())) || (compoundend &&

  2180                         TESTAFF(sfx->getCont(), compoundend,

  2181                             sfx->getContLen())))) {

  2182                         rv = NULL;

  2183                 }

  2184             }

  2186             if (rv ||

  2187               (((wordnum == 0) && compoundbegin &&

  2188                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||

  2189                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||

  2190               ((wordnum > 0) && compoundmiddle &&

  2191                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||

  2192                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))

  2193               ) {

  2194                 // char * p = prefix_check_morph(st, i, 0, compound);

  2195                 char * p = NULL;

  2196                 if (compoundflag) p = affix_check_morph(st, i, compoundflag);

  2197                 if (!p || (*p == '\0')) {

  2198                    if (p) free(p);

  2199                    p = NULL;

  2200                    if ((wordnum == 0) && compoundbegin) {

  2201                      p = affix_check_morph(st, i, compoundbegin);

  2202                    } else if ((wordnum > 0) && compoundmiddle) {

  2203                      p = affix_check_morph(st, i, compoundmiddle);

  2204                    }

  2205                 }

  2206                 if (p && (*p != '\0')) {

  2207                     sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,

  2208                         MORPH_PART, st, line_uniq_app(&p, MSEP_REC));

  2209                 }

  2210                 if (p) free(p);

  2211                 checked_prefix = 1;

  2212             }

  2213         // else check forbiddenwords

  2214         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  2215             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||

  2216             TESTAFF(rv->astr, needaffix, rv->alen))) {

  2217                 st[i] = ch;

  2218                 continue;

  2219         }

  2221             // check non_compound flag in suffix and prefix

  2222             if ((rv) && !hu_mov_rule &&

  2223                 ((pfx && pfx->getCont() &&

  2224                     TESTAFF(pfx->getCont(), compoundforbidflag,

  2225                         pfx->getContLen())) ||

  2226                 (sfx && sfx->getCont() &&

  2227                     TESTAFF(sfx->getCont(), compoundforbidflag,

  2228                         sfx->getContLen())))) {

  2229                     continue;

  2230             }

  2232             // check compoundend flag in suffix and prefix

  2233             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&

  2234                 ((pfx && pfx->getCont() &&

  2235                     TESTAFF(pfx->getCont(), compoundend,

  2236                         pfx->getContLen())) ||

  2237                 (sfx && sfx->getCont() &&

  2238                     TESTAFF(sfx->getCont(), compoundend,

  2239                         sfx->getContLen())))) {

  2240                     continue;

  2241             }

  2243             // check compoundmiddle flag in suffix and prefix

  2244             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&

  2245                 ((pfx && pfx->getCont() &&

  2246                     TESTAFF(pfx->getCont(), compoundmiddle,

  2247                         pfx->getContLen())) ||

  2248                 (sfx && sfx->getCont() &&

  2249                     TESTAFF(sfx->getCont(), compoundmiddle,

  2250                         sfx->getContLen())))) {

  2251                     rv = NULL;

  2252             }

  2254         // check forbiddenwords

  2255         if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)

  2256             || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue;

  2258         // increment word number, if the second root has a compoundroot flag

  2259         if ((rv) && (compoundroot) &&

  2260             (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  2261                 wordnum++;

  2262         }

  2264         // first word is acceptable in compound words?

  2265         if (((rv) &&

  2266           ( checked_prefix || (words && words[wnum]) ||

  2267             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  2268             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||

  2269             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))

  2270 // LANG_hu section: spec. Hungarian rule

  2271             || ((langnum == LANG_hu) && // hu_mov_rule

  2272                 hu_mov_rule && (

  2273                     TESTAFF(rv->astr, 'F', rv->alen) ||

  2274                     TESTAFF(rv->astr, 'G', rv->alen) ||

  2275                     TESTAFF(rv->astr, 'H', rv->alen)

  2276                 )

  2277               )

  2278 // END of LANG_hu section

  2279           )

  2280           && ! (( checkcompoundtriple && !words && // test triple letters

  2281                    (word[i-1]==word[i]) && (

  2282                       ((i>1) && (word[i-1]==word[i-2])) ||

  2283                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'

  2284                    )

  2285                ) ||

  2286                (

  2287                    // test CHECKCOMPOUNDPATTERN

  2288                    numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed)

  2289                ) ||

  2290                (

  2291                  checkcompoundcase && !words && cpdcase_check(word, i)

  2292                ))

  2293          )

  2294 // LANG_hu section: spec. Hungarian rule

  2295          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&

  2296               (sfx && sfx->getCont() && (

  2297                         TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||

  2298                         TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())

  2299                     )

  2300                )

  2301              )

  2302 // END of LANG_hu section

  2303          ) {

  2305 // LANG_hu section: spec. Hungarian rule

  2306             if (langnum == LANG_hu) {

  2307                 // calculate syllable number of the word

  2308                 numsyllable += get_syllable(st, i);

  2310                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)

  2311                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;

  2312             }

  2313 // END of LANG_hu section

  2315             // NEXT WORD(S)

  2316             rv_first = rv;

  2317             rv = lookup((word+i)); // perhaps without prefix

  2319         // search homonym with compound flag

  2320         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||

  2321                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  2322                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||

  2323                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {

  2324             rv = rv->next_homonym;

  2325         }

  2327             if (rv && words && words[wnum + 1]) {

  2328                   mystrcat(*result, presult, MAXLNLEN);

  2329                   mystrcat(*result, " ", MAXLNLEN);

  2330                   mystrcat(*result, MORPH_PART, MAXLNLEN);

  2331                   mystrcat(*result, word+i, MAXLNLEN);

  2332                   if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);

  2333                   if (!HENTRY_FIND(rv, MORPH_STEM)) {

  2334                     mystrcat(*result, " ", MAXLNLEN);

  2335                     mystrcat(*result, MORPH_STEM, MAXLNLEN);

  2336                     mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);

  2337                   }

  2338                   // store the pointer of the hash entry

  2339 //                  sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);

  2340                   if (!complexprefixes && HENTRY_DATA(rv)) {

  2341                     mystrcat(*result, " ", MAXLNLEN);

  2342                     mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);

  2343                   }

  2344                   mystrcat(*result, "\n", MAXLNLEN);

  2345                   ok = 1;

  2346                   return 0;

  2347             }

  2349             oldnumsyllable2 = numsyllable;

  2350             oldwordnum2 = wordnum;

  2352 // LANG_hu section: spec. Hungarian rule

  2353             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {

  2354                 numsyllable--;

  2355             }

  2356 // END of LANG_hu section

  2357             // increment word number, if the second root has a compoundroot flag

  2358             if ((rv) && (compoundroot) &&

  2359                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  2360                     wordnum++;

  2361             }

  2363             // check forbiddenwords

  2364             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||

  2365                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {

  2366                 st[i] = ch;

  2367                 continue;

  2368             }

  2370             // second word is acceptable, as a root?

  2371             // hungarian conventions: compounding is acceptable,

  2372             // when compound forms consist of 2 words, or if more,

  2373             // then the syllable number of root words must be 6, or lesser.

  2374             if ((rv) && (

  2375                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

  2376                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))

  2377                     )

  2378                 && (

  2379                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||

  2380                       ((cpdmaxsyllable!=0) &&

  2381                           (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))

  2382                     )

  2383                 && (

  2384                      (!checkcompounddup || (rv != rv_first))

  2385                    )

  2386                 )

  2387                  {

  2388                       // bad compound word

  2389                       mystrcat(*result, presult, MAXLNLEN);

  2390                       mystrcat(*result, " ", MAXLNLEN);

  2391                       mystrcat(*result, MORPH_PART, MAXLNLEN);

  2392                       mystrcat(*result, word+i, MAXLNLEN);

  2394                       if (HENTRY_DATA(rv)) {

  2395                         if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);

  2396                         if (! HENTRY_FIND(rv, MORPH_STEM)) {

  2397                            mystrcat(*result, " ", MAXLNLEN);

  2398                            mystrcat(*result, MORPH_STEM, MAXLNLEN);

  2399                            mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);

  2400                         }

  2401                         // store the pointer of the hash entry

  2402 //                        sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);

  2403                         if (!complexprefixes) {

  2404                             mystrcat(*result, " ", MAXLNLEN);

  2405                             mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);

  2406                         }

  2407                       }

  2408                       mystrcat(*result, "\n", MAXLNLEN);

  2409                               ok = 1;

  2410             }

  2412             numsyllable = oldnumsyllable2 ;

  2413             wordnum = oldwordnum2;

  2415             // perhaps second word has prefix or/and suffix

  2416             sfx = NULL;

  2417             sfxflag = FLAG_NULL;

  2419             if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;

  2421             if (!rv && compoundend && !onlycpdrule) {

  2422                 sfx = NULL;

  2423                 pfx = NULL;

  2424                 rv = affix_check((word+i),strlen(word+i), compoundend);

  2425             }

  2427             if (!rv && numdefcpd && words) {

  2428                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);

  2429                 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {

  2430                       char * m = NULL;

  2431                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);

  2432                       if ((!m || *m == '\0') && compoundend) {

  2433                             if (m) free(m);

  2434                             m = affix_check_morph((word+i),strlen(word+i), compoundend);

  2435                       }

  2436                       mystrcat(*result, presult, MAXLNLEN);

  2437                       if (m || (*m != '\0')) {

  2438                         sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,

  2439                             MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));

  2440                       }

  2441                       if (m) free(m);

  2442                       mystrcat(*result, "\n", MAXLNLEN);

  2443                       ok = 1;

  2444                 }

  2445             }

  2447             // check non_compound flag in suffix and prefix

  2448             if ((rv) &&

  2449                 ((pfx && pfx->getCont() &&

  2450                     TESTAFF(pfx->getCont(), compoundforbidflag,

  2451                         pfx->getContLen())) ||

  2452                 (sfx && sfx->getCont() &&

  2453                     TESTAFF(sfx->getCont(), compoundforbidflag,

  2454                         sfx->getContLen())))) {

  2455                     rv = NULL;

  2456             }

  2458             // check forbiddenwords

  2459             if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) ||

  2460                     TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))

  2461                     && (! TESTAFF(rv->astr, needaffix, rv->alen))) {

  2462                         st[i] = ch;

  2463                         continue;

  2464                     }

  2466             if (langnum == LANG_hu) {

  2467                 // calculate syllable number of the word

  2468                 numsyllable += get_syllable(word + i, strlen(word + i));

  2470                 // - affix syllable num.

  2471                 // XXX only second suffix (inflections, not derivations)

  2472                 if (sfxappnd) {

  2473                     char * tmp = myrevstrdup(sfxappnd);

  2474                     numsyllable -= get_syllable(tmp, strlen(tmp));

  2475                     free(tmp);

  2476                 }

  2478                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)

  2479                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;

  2481                 // increment syllable num, if last word has a SYLLABLENUM flag

  2482                 // and the suffix is beginning `s'

  2484                 if (cpdsyllablenum) {

  2485                     switch (sfxflag) {

  2486                         case 'c': { numsyllable+=2; break; }

  2487                         case 'J': { numsyllable += 1; break; }

  2488                         case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }

  2489                     }

  2490                 }

  2491             }

  2493             // increment word number, if the second word has a compoundroot flag

  2494             if ((rv) && (compoundroot) &&

  2495                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {

  2496                     wordnum++;

  2497             }

  2498             // second word is acceptable, as a word with prefix or/and suffix?

  2499             // hungarian conventions: compounding is acceptable,

  2500             // when compound forms consist 2 word, otherwise

  2501             // the syllable number of root words is 6, or lesser.

  2502             if ((rv) &&

  2503                     (

  2504                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||

  2505                       ((cpdmaxsyllable!=0) &&

  2506                           (numsyllable <= cpdmaxsyllable))

  2507                     )

  2508                 && (

  2509                    (!checkcompounddup || (rv != rv_first))

  2510                    )) {

  2511                       char * m = NULL;

  2512                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);

  2513                       if ((!m || *m == '\0') && compoundend) {

  2514                             if (m) free(m);

  2515                             m = affix_check_morph((word+i),strlen(word+i), compoundend);

  2516                       }

  2517                       mystrcat(*result, presult, MAXLNLEN);

  2518                       if (m && (*m != '\0')) {

  2519                         sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,

  2520                             MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));

  2521                       }

  2522                       if (m) free(m);

  2523                       sprintf(*result + strlen(*result), "%c", MSEP_REC);

  2524                       ok = 1;

  2525             }

  2527             numsyllable = oldnumsyllable2;

  2528             wordnum = oldwordnum2;

  2530             // perhaps second word is a compound word (recursive call)

  2531             if ((wordnum < maxwordnum) && (ok == 0)) {

  2532                         compound_check_morph((word+i),strlen(word+i), wordnum+1,

  2533                              numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);

  2534             } else {

  2535                 rv=NULL;

  2536             }

  2537         }

  2538         st[i] = ch;

  2539         wordnum = oldwordnum;

  2540         numsyllable = oldnumsyllable;

  2542         } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop

  2544     }

  2545     return 0;

  2546 }

  2548  // return 1 if s1 (reversed) is a leading subset of end of s2

  2549 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)

  2550  {

  2551     while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {

  2552         s1++;

  2553         end_of_s2--;

  2554         len--;

  2555     }

  2556     return (*s1 == '\0');

  2557  }

  2558  */

  2560 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)

  2561  {

  2562     while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {

  2563         s1++;

  2564         end_of_s2--;

  2565         len--;

  2566     }

  2567     return (*s1 == '\0');

  2568  }

  2570 // check word for suffixes
  //
  // Returns the hentry reported by the first suffix entry whose checkword()
  // accepts `word`, or NULL when no entry does. Zero-length suffixes
  // (the list at sStart[0]) are tried first, then the list at
  // sStart[<last byte of word>].
  //
  //   word, len   - word to test and its length
  //   sfxopts, wlst, maxSug, ns, needflag
  //               - handed unchanged to SfxEntry::checkword()
  //   ppfx        - prefix entry to combine with (may be NULL); its
  //                 continuation flags are tested for circumfix / needaffix
  //   cclass      - when non-zero, zero-length suffixes without continuation
  //                 flags are skipped and the needaffix test is bypassed
  //   in_compound - compared with IN_CPD_BEGIN / IN_CPD_END; when it is zero
  //                 checkword() additionally receives onlyincompound
  //
  // Side effects: a successful zero-length match overwrites the member sfx;
  // a successful general match overwrites sfx and sfxflag, and sfxappnd when
  // the matching entry has no continuation flags.

  2572 struct hentry * AffixMgr::suffix_check (const char * word, int len,

  2573        int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns,

  2574        const FLAG cclass, const FLAG needflag, char in_compound)

  2575 {

  2576     struct hentry * rv = NULL;

       // ep is only dereferenced below after ppfx has been tested non-NULL
  2577     PfxEntry* ep = ppfx;

  2579     // first handle the special case of 0 length suffixes

  2580     SfxEntry * se = sStart[0];

  2582     while (se) {

           // with a continuation class requested, only entries that carry
           // continuation flags are candidates
  2583         if (!cclass || se->getCont()) {

               // The test below is a conjunction of four groups:
               //   1. compound position  2. circumfix pairing
               //   3. onlyincompound     4. needaffix

  2584             // suffixes are not allowed in beginning of compounds

  2585             if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass

  2586              // except when signed with compoundpermitflag flag

  2587              (se->getCont() && compoundpermitflag &&

  2588                 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||

  2589               // no circumfix flag in prefix and suffix

  2590               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),

  2591                    circumfix, ep->getContLen())) &&

  2592                (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||

  2593               // circumfix flag in prefix AND suffix

  2594               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),

  2595                    circumfix, ep->getContLen())) &&

  2596                (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&

  2597             // fogemorpheme

               // (outside compounds, reject suffixes flagged onlyincompound)
  2598               (in_compound ||

  2599                  !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&

  2600             // needaffix on prefix or first suffix

               // (accepted when cclass is set, when the suffix lacks needaffix,
               //  or when a prefix is present that itself lacks needaffix)
  2601               (cclass ||

  2602                    !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||

  2603                    (ppfx && !((ep->getCont()) &&

  2604                      TESTAFF(ep->getCont(), needaffix,

  2605                        ep->getContLen())))

  2606               )) {

  2607                 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,

  2608                     needflag, (in_compound ? 0 : onlyincompound));

  2609                 if (rv) {

                       // note: sfxflag and sfxappnd are left as they were here
  2610                     sfx=se; // BUG: sfx not stateless

  2611                     return rv;

  2612                 }

  2613             }

  2614         }

  2615        se = se->getNext();

  2616     }

  2618     // now handle the general case

  2619     if (len == 0) return NULL; // FULLSTRIP

       // candidate list is selected by the last byte of the word
  2620     unsigned char sp= *((const unsigned char *)(word + len - 1));

  2621     SfxEntry * sptr = sStart[sp];

  2623     while (sptr) {

           // key matches the end of the word: test the entry, then follow
           // getNextEQ(); otherwise follow getNextNE()
  2624         if (isRevSubset(sptr->getKey(), word + len - 1, len)

  2625         ) {

  2626             // suffixes are not allowed in beginning of compounds

  2627             if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass

  2628              // except when signed with compoundpermitflag flag

  2629              (sptr->getCont() && compoundpermitflag &&

  2630                 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||

  2631               // no circumfix flag in prefix and suffix

  2632               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),

  2633                    circumfix, ep->getContLen())) &&

  2634                (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||

  2635               // circumfix flag in prefix AND suffix

  2636               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),

  2637                    circumfix, ep->getContLen())) &&

  2638                (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&

  2639             // fogemorpheme

  2640               (in_compound ||

  2641                  !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&

  2642             // needaffix on prefix or first suffix

  2643               (cclass ||

  2644                   !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||

  2645                   (ppfx && !((ep->getCont()) &&

  2646                      TESTAFF(ep->getCont(), needaffix,

  2647                        ep->getContLen())))

  2648               )

               // second, nested test (general case only): at the end of a
               // compound and without a prefix, an onlyincompound suffix is
               // not tried
  2649             ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {

  2650                 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,

  2651                     maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));

  2652                 if (rv) {

  2653                     sfx=sptr; // BUG: sfx not stateless

  2654                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless

                       // the key is remembered only for entries without
                       // continuation flags
  2655                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless

  2656                     return rv;

  2657                 }

  2658              }

  2659              sptr = sptr->getNextEQ();

  2660         } else {

  2661              sptr = sptr->getNextNE();

  2662         }

  2663     }

  2665     return NULL;

  2666 }

  2668 // check word for two-level suffixes

  2670 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,

  2671        int sfxopts, PfxEntry * ppfx, const FLAG needflag)

  2672 {

  2673     struct hentry * rv = NULL;

  2675     // first handle the special case of 0 length suffixes

  2676     SfxEntry * se = sStart[0];

  2677     while (se) {

  2678         if (contclasses[se->getFlag()])

  2679         {

  2680             rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);

  2681             if (rv) return rv;

  2682         }

  2683         se = se->getNext();

  2684     }

  2686     // now handle the general case

  2687     if (len == 0) return NULL; // FULLSTRIP

  2688     unsigned char sp = *((const unsigned char *)(word + len - 1));

  2689     SfxEntry * sptr = sStart[sp];

  2691     while (sptr) {

  2692         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {

  2693             if (contclasses[sptr->getFlag()])

  2694             {

  2695                 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);

  2696                 if (rv) {

  2697                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless

  2698                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless

  2699                     return rv;

  2700                 }

  2701             }

  2702             sptr = sptr->getNextEQ();

  2703         } else {

  2704              sptr = sptr->getNextNE();

  2705         }

  2706     }

  2708     return NULL;

  2709 }

  2711 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,

  2712        int sfxopts, PfxEntry * ppfx, const FLAG needflag)

  2713 {

  2714     char result[MAXLNLEN];

  2715     char result2[MAXLNLEN];

  2716     char result3[MAXLNLEN];

  2718     char * st;

  2720     result[0] = '\0';

  2721     result2[0] = '\0';

  2722     result3[0] = '\0';

  2724     // first handle the special case of 0 length suffixes

  2725     SfxEntry * se = sStart[0];

  2726     while (se) {

  2727         if (contclasses[se->getFlag()])

  2728         {

  2729             st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);

  2730             if (st) {

  2731                 if (ppfx) {

  2732                     if (ppfx->getMorph()) {

  2733                         mystrcat(result, ppfx->getMorph(), MAXLNLEN);

  2734                         mystrcat(result, " ", MAXLNLEN);

  2735                     } else debugflag(result, ppfx->getFlag());

  2736                 }

  2737                 mystrcat(result, st, MAXLNLEN);

  2738                 free(st);

  2739                 if (se->getMorph()) {

  2740                     mystrcat(result, " ", MAXLNLEN);

  2741                     mystrcat(result, se->getMorph(), MAXLNLEN);

  2742                 } else debugflag(result, se->getFlag());

  2743                 mystrcat(result, "\n", MAXLNLEN);

  2744             }

  2745         }

  2746         se = se->getNext();

  2747     }

  2749     // now handle the general case

  2750     if (len == 0) return NULL; // FULLSTRIP

  2751     unsigned char sp = *((const unsigned char *)(word + len - 1));

  2752     SfxEntry * sptr = sStart[sp];

  2754     while (sptr) {

  2755         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {

  2756             if (contclasses[sptr->getFlag()])

  2757             {

  2758                 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);

  2759                 if (st) {

  2760                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless

  2761                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless

  2762                     strcpy(result2, st);

  2763                     free(st);

  2765                 result3[0] = '\0';

  2767                 if (sptr->getMorph()) {

  2768                     mystrcat(result3, " ", MAXLNLEN);

  2769                     mystrcat(result3, sptr->getMorph(), MAXLNLEN);

  2770                 } else debugflag(result3, sptr->getFlag());

  2771                 strlinecat(result2, result3);

  2772                 mystrcat(result2, "\n", MAXLNLEN);

  2773                 mystrcat(result,  result2, MAXLNLEN);

  2774                 }

  2775             }

  2776             sptr = sptr->getNextEQ();

  2777         } else {

  2778              sptr = sptr->getNextNE();

  2779         }

  2780     }

  2781     if (*result) return mystrdup(result);

  2782     return NULL;

  2783 }

  // Collect the morphological descriptions of the suffix analyses of `word`.
  //
  // Zero-length suffixes (sStart[0]) are tried first, then the list at
  // sStart[<last byte of word>]. For each entry that passes the flag tests,
  // checkword() and then get_next_homonym() are used to visit every matching
  // hentry, and one '\n'-terminated line per hentry is appended to a local
  // MAXLNLEN buffer in this order:
  //   [prefix morph + " " | debugflag(prefix flag)]   (only when ppfx is set)
  //   [HENTRY_DATA2(rv)]                              (complexprefixes only)
  //   [" " MORPH_STEM word]                           (when rv has no stem field)
  //   [" " HENTRY_DATA2(rv)]                          (!complexprefixes only)
  //   [" " suffix morph | debugflag(suffix flag)]
  //
  //   sfxopts, ppfx, cclass, needflag - forwarded to checkword() /
  //                                     get_next_homonym()
  //   in_compound                     - used only in the flag tests below
  //
  // Returns a mystrdup() copy of the collected text, or NULL when the buffer
  // is empty (and also when len == 0, see the note further down).
  2785 char * AffixMgr::suffix_check_morph(const char * word, int len,

  2786        int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)

  2787 {

  2788     char result[MAXLNLEN];

  2790     struct hentry * rv = NULL;

  2792     result[0] = '\0';

       // ep is only dereferenced below after ppfx has been tested non-NULL
  2794     PfxEntry* ep = ppfx;

  2796     // first handle the special case of 0 length suffixes

  2797     SfxEntry * se = sStart[0];

  2798     while (se) {

  2799         if (!cclass || se->getCont()) {

               // conjunction of four groups: compound position, circumfix
               // pairing, onlyincompound, needaffix

  2800             // suffixes are not allowed in beginning of compounds

  2801             if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass

  2802              // except when signed with compoundpermitflag flag

  2803              (se->getCont() && compoundpermitflag &&

  2804                 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||

  2805               // no circumfix flag in prefix and suffix

  2806               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),

  2807                    circumfix, ep->getContLen())) &&

  2808                (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||

  2809               // circumfix flag in prefix AND suffix

  2810               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),

  2811                    circumfix, ep->getContLen())) &&

  2812                (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&

  2813             // fogemorpheme

  2814               (in_compound ||

  2815                  !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&

  2816             // needaffix on prefix or first suffix

  2817               (cclass ||

  2818                    !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||

  2819                    (ppfx && !((ep->getCont()) &&

  2820                      TESTAFF(ep->getCont(), needaffix,

  2821                        ep->getContLen())))

  2822               )

  2823             ))

               // the `if` above guards only this assignment; wlst/maxSug/ns
               // are passed as NULL/0/0 here
  2824             rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);

            // one output line per homonym; the loop has no break, so rv is
            // NULL again when it ends and a later failed test cannot reuse
            // a stale entry
  2825          while (rv) {

  2826            if (ppfx) {

  2827                 if (ppfx->getMorph()) {

  2828                     mystrcat(result, ppfx->getMorph(), MAXLNLEN);

  2829                     mystrcat(result, " ", MAXLNLEN);

  2830                 } else debugflag(result, ppfx->getFlag());

  2831             }

  2832             if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

               // add an explicit stem field when the entry does not have one
  2833             if (! HENTRY_FIND(rv, MORPH_STEM)) {

  2834                 mystrcat(result, " ", MAXLNLEN);

  2835                 mystrcat(result, MORPH_STEM, MAXLNLEN);

  2836                 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);

  2837             }

  2838             // store the pointer of the hash entry

  2839 //            sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);

  2841             if (!complexprefixes && HENTRY_DATA(rv)) {

  2842                     mystrcat(result, " ", MAXLNLEN);

  2843                     mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

  2844             }

  2845             if (se->getMorph()) {

  2846                 mystrcat(result, " ", MAXLNLEN);

  2847                 mystrcat(result, se->getMorph(), MAXLNLEN);

  2848             } else debugflag(result, se->getFlag());

  2849             mystrcat(result, "\n", MAXLNLEN);

  2850             rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);

  2851          }

  2852        }

  2853        se = se->getNext();

  2854     }

  2856     // now handle the general case

       // NOTE(review): this returns NULL even when the pass above appended
       // analyses to result - confirm that dropping them is intended.
  2857     if (len == 0) return NULL; // FULLSTRIP

  2858     unsigned char sp = *((const unsigned char *)(word + len - 1));

  2859     SfxEntry * sptr = sStart[sp];

  2861     while (sptr) {

  2862         if (isRevSubset(sptr->getKey(), word + len - 1, len)

  2863         ) {

  2864             // suffixes are not allowed in beginning of compounds

  2865             if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass

  2866              // except when signed with compoundpermitflag flag

  2867              (sptr->getCont() && compoundpermitflag &&

  2868                 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||

  2869               // no circumfix flag in prefix and suffix

  2870               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),

  2871                    circumfix, ep->getContLen())) &&

  2872                (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||

  2873               // circumfix flag in prefix AND suffix

  2874               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),

  2875                    circumfix, ep->getContLen())) &&

  2876                (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&

  2877             // fogemorpheme

  2878               (in_compound ||

  2879                  !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&

  2880             // needaffix on first suffix

               // NOTE(review): unlike the zero-length pass, this test has no
               // clause for a prefix without needaffix - confirm intended.
  2881               (cclass || !(sptr->getCont() &&

  2882                    TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))

  2883             )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);

               // same per-homonym line layout as in the zero-length pass
  2884             while (rv) {

  2885                     if (ppfx) {

  2886                         if (ppfx->getMorph()) {

  2887                             mystrcat(result, ppfx->getMorph(), MAXLNLEN);

  2888                             mystrcat(result, " ", MAXLNLEN);

  2889                         } else debugflag(result, ppfx->getFlag());

  2890                     }

  2891                     if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

  2892                     if (! HENTRY_FIND(rv, MORPH_STEM)) {

  2893                             mystrcat(result, " ", MAXLNLEN);

  2894                             mystrcat(result, MORPH_STEM, MAXLNLEN);

  2895                             mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);

  2896                     }

  2897                     // store the pointer of the hash entry

  2898 //                    sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);

  2900                     if (!complexprefixes && HENTRY_DATA(rv)) {

  2901                         mystrcat(result, " ", MAXLNLEN);

  2902                         mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);

  2903                     }

  2905                 if (sptr->getMorph()) {

  2906                     mystrcat(result, " ", MAXLNLEN);

  2907                     mystrcat(result, sptr->getMorph(), MAXLNLEN);

  2908                 } else debugflag(result, sptr->getFlag());

  2909                 mystrcat(result, "\n", MAXLNLEN);

  2910                 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);

  2911             }

  2912              sptr = sptr->getNextEQ();

  2913         } else {

  2914              sptr = sptr->getNextNE();

  2915         }

  2916     }

       // an empty buffer is reported as NULL rather than as an empty string
  2918     if (*result) return mystrdup(result);

  2919     return NULL;

  2920 }

  2922 // check if word with affixes is correctly spelled

  2923 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)

  2924 {

  2925     struct hentry * rv= NULL;

  2927     // check all prefixes (also crossed with suffixes if allowed)

  2928     rv = prefix_check(word, len, in_compound, needflag);

  2929     if (rv) return rv;

  2931     // if still not found check all suffixes

  2932     rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);

  2934     if (havecontclass) {

  2935         sfx = NULL;

  2936         pfx = NULL;

  2938         if (rv) return rv;

  2939         // if still not found check all two-level suffixes

  2940         rv = suffix_check_twosfx(word, len, 0, NULL, needflag);

  2942         if (rv) return rv;

  2943         // if still not found check all two-level suffixes

  2944         rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);

  2945     }

  2947     return rv;

  2948 }

  2950 // check if word with affixes is correctly spelled

  2951 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)

  2952 {

  2953     char result[MAXLNLEN];

  2954     char * st = NULL;

  2956     *result = '\0';

  2958     // check all prefixes (also crossed with suffixes if allowed)

  2959     st = prefix_check_morph(word, len, in_compound);

  2960     if (st) {

  2961         mystrcat(result, st, MAXLNLEN);

  2962         free(st);

  2963     }

  2965     // if still not found check all suffixes

  2966     st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);

  2967     if (st) {

  2968         mystrcat(result, st, MAXLNLEN);

  2969         free(st);

  2970     }

  2972     if (havecontclass) {

  2973         sfx = NULL;

  2974         pfx = NULL;

  2975         // if still not found check all two-level suffixes

  2976         st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);

  2977         if (st) {

  2978             mystrcat(result, st, MAXLNLEN);

  2979             free(st);

  2980         }

  2982         // if still not found check all two-level suffixes

  2983         st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);

  2984         if (st) {

  2985             mystrcat(result, st, MAXLNLEN);

  2986             free(st);

  2987         }

  2988     }

  2990     return mystrdup(result);

  2991 }

  2993 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,

  2994     unsigned short al, char * morph, char * targetmorph, int level)

  2995 {

  2996     // handle suffixes

  2997     char * stemmorph;

  2998     char * stemmorphcatpos;

  2999     char mymorph[MAXLNLEN];

  3001     if (!morph) return NULL;

  3003     // check substandard flag

  3004     if (TESTAFF(ap, substandard, al)) return NULL;

  3006     if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);

  3008 //    int targetcount = get_sfxcount(targetmorph);

  3010     // use input suffix fields, if exist

  3011     if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {

  3012         stemmorph = mymorph;

  3013         strcpy(stemmorph, morph);

  3014         mystrcat(stemmorph, " ", MAXLNLEN);

  3015         stemmorphcatpos = stemmorph + strlen(stemmorph);

  3016     } else {

  3017         stemmorph = morph;

  3018         stemmorphcatpos = NULL;

  3019     }

  3021     for (int i = 0; i < al; i++) {

  3022         const unsigned char c = (unsigned char) (ap[i] & 0x00FF);

  3023         SfxEntry * sptr = sFlag[c];

  3024         while (sptr) {

  3025             if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||

  3026                 // don't generate forms with substandard affixes

  3027                 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {

  3029                 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());

  3030                 else stemmorph = (char *) sptr->getMorph();

  3032                 int cmp = morphcmp(stemmorph, targetmorph);

  3034                 if (cmp == 0) {

  3035                     char * newword = sptr->add(ts, wl);

  3036                     if (newword) {

  3037                         hentry * check = pHMgr->lookup(newword); // XXX extra dic

  3038                         if (!check || !check->astr ||

  3039                             !(TESTAFF(check->astr, forbiddenword, check->alen) ||

  3040                               TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {

  3041                                 return newword;

  3042                         }

  3043                         free(newword);

  3044                     }

  3045                 }

  3047                 // recursive call for secondary suffixes

  3048                 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&

  3049 //                    (get_sfxcount(stemmorph) < targetcount) &&

  3050                     !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {

  3051                     char * newword = sptr->add(ts, wl);

  3052                     if (newword) {

  3053                         char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),

  3054                             sptr->getContLen(), stemmorph, targetmorph, 1);

  3056                         if (newword2) {

  3057                             free(newword);

  3058                             return newword2;

  3059                         }

  3060                         free(newword);

  3061                         newword = NULL;

  3062                     }

  3063                 }

  3064             }

  3065             sptr = sptr->getFlgNxt();

  3066         }

  3067     }

  3068    return NULL;

  3069 }

  3072 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,

  3073     int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,

  3074     char * phon)

  3075 {

  3076     int nh=0;

  3077     // first add root word to list

  3078     if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||

  3079          (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {

  3080        wlst[nh].word = mystrdup(ts);

  3081        if (!wlst[nh].word) return 0;

  3082        wlst[nh].allow = (1 == 0);

  3083        wlst[nh].orig = NULL;

  3084        nh++;

  3085        // add special phonetic version

  3086        if (phon && (nh < maxn)) {

  3087     	    wlst[nh].word = mystrdup(phon);

  3088             if (!wlst[nh].word) return nh - 1;

  3089     	    wlst[nh].allow = (1 == 0);

  3090     	    wlst[nh].orig = mystrdup(ts);

  3091             if (!wlst[nh].orig) return nh - 1;

  3092     	    nh++;

  3093        }

  3094     }

  3096     // handle suffixes

  3097     for (int i = 0; i < al; i++) {

  3098        const unsigned char c = (unsigned char) (ap[i] & 0x00FF);

  3099        SfxEntry * sptr = sFlag[c];

  3100        while (sptr) {

  3101          if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&

  3102                 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&

  3103                 // check needaffix flag

  3104                 !(sptr->getCont() && ((needaffix &&

  3105                       TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||

  3106                   (circumfix &&

  3107                       TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||

  3108                   (onlyincompound &&

  3109                       TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))

  3110                 ) {

  3111             char * newword = sptr->add(ts, wl);

  3112             if (newword) {

  3113                 if (nh < maxn) {

  3114                     wlst[nh].word = newword;

  3115                     wlst[nh].allow = sptr->allowCross();

  3116                     wlst[nh].orig = NULL;

  3117                     nh++;

  3118                     // add special phonetic version

  3119     		    if (phon && (nh < maxn)) {

  3120     			char st[MAXWORDUTF8LEN];

  3121     			strcpy(st, phon);

  3122     			strcat(st, sptr->getKey());

  3123     			reverseword(st + strlen(phon));

  3124     			wlst[nh].word = mystrdup(st);

  3125     			if (!wlst[nh].word) return nh - 1;

  3126     			wlst[nh].allow = (1 == 0);

  3127     			wlst[nh].orig = mystrdup(newword);

  3128                         if (!wlst[nh].orig) return nh - 1;

  3129     			nh++;

  3130     		    }

  3131                 } else {

  3132                     free(newword);

  3133                 }

  3134             }

  3135          }

  3136          sptr = sptr->getFlgNxt();

  3137        }

  3138     }

  3140     int n = nh;

  3142     // handle cross products of prefixes and suffixes

  3143     for (int j=1;j<n ;j++)

  3144        if (wlst[j].allow) {

  3145           for (int k = 0; k < al; k++) {

  3146              const unsigned char c = (unsigned char) (ap[k] & 0x00FF);

  3147              PfxEntry * cptr = pFlag[c];

  3148              while (cptr) {

  3149                 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&

  3150                         (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {

  3151                     int l1 = strlen(wlst[j].word);

  3152                     char * newword = cptr->add(wlst[j].word, l1);

  3153                     if (newword) {

  3154                        if (nh < maxn) {

  3155                           wlst[nh].word = newword;

  3156                           wlst[nh].allow = cptr->allowCross();

  3157                           wlst[nh].orig = NULL;

  3158                           nh++;

  3159                        } else {

  3160                           free(newword);

  3161                        }

  3162                     }

  3163                 }

  3164                 cptr = cptr->getFlgNxt();

  3165              }

  3166           }

  3167        }

  3170     // now handle pure prefixes

  3171     for (int m = 0; m < al; m ++) {

  3172        const unsigned char c = (unsigned char) (ap[m] & 0x00FF);

  3173        PfxEntry * ptr = pFlag[c];

  3174        while (ptr) {

  3175          if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&

  3176                 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&

  3177                 // check needaffix flag

  3178                 !(ptr->getCont() && ((needaffix &&

  3179                       TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||

  3180                      (circumfix &&

  3181                       TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||

  3182                   (onlyincompound &&

  3183                       TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))

  3184                 ) {

  3185             char * newword = ptr->add(ts, wl);

  3186             if (newword) {

  3187                 if (nh < maxn) {

  3188                     wlst[nh].word = newword;

  3189                     wlst[nh].allow = ptr->allowCross();

  3190                     wlst[nh].orig = NULL;

  3191                     nh++;

  3192                 } else {

  3193                     free(newword);

  3194                 }

  3195             }

  3196          }

  3197          ptr = ptr->getFlgNxt();

  3198        }

  3199     }

  3201     return nh;

  3202 }

  3204 // return length of replacing table

  3205 int AffixMgr::get_numrep() const

  3206 {

  3207   return numrep;

  3208 }

  3210 // return replacing table

  3211 struct replentry * AffixMgr::get_reptable() const

  3212 {

  3213   if (! reptable ) return NULL;

  3214   return reptable;

  3215 }

  3217 // return iconv table

  3218 RepList * AffixMgr::get_iconvtable() const

  3219 {

  3220   if (! iconvtable ) return NULL;

  3221   return iconvtable;

  3222 }

  3224 // return oconv table

  3225 RepList * AffixMgr::get_oconvtable() const

  3226 {

  3227   if (! oconvtable ) return NULL;

  3228   return oconvtable;

  3229 }

  3231 // return replacing table

  3232 struct phonetable * AffixMgr::get_phonetable() const

  3233 {

  3234   if (! phone ) return NULL;

  3235   return phone;

  3236 }

  3238 // return length of character map table

  3239 int AffixMgr::get_nummap() const

  3240 {

  3241   return nummap;

  3242 }

  3244 // return character map table

  3245 struct mapentry * AffixMgr::get_maptable() const

  3246 {

  3247   if (! maptable ) return NULL;

  3248   return maptable;

  3249 }

  3251 // return length of word break table

  3252 int AffixMgr::get_numbreak() const

  3253 {

  3254   return numbreak;

  3255 }

  3257 // return character map table

  3258 char ** AffixMgr::get_breaktable() const

  3259 {

  3260   if (! breaktable ) return NULL;

  3261   return breaktable;

  3262 }

  3264 // return text encoding of dictionary

  3265 char * AffixMgr::get_encoding()

  3266 {

  3267   if (! encoding ) encoding = mystrdup(SPELL_ENCODING);

  3268   return mystrdup(encoding);

  3269 }

  3271 // return text encoding of dictionary

  3272 int AffixMgr::get_langnum() const

  3273 {

  3274   return langnum;

  3275 }

  3277 // return double prefix option

  3278 int AffixMgr::get_complexprefixes() const

  3279 {

  3280   return complexprefixes;

  3281 }

  3283 // return FULLSTRIP option

  3284 int AffixMgr::get_fullstrip() const

  3285 {

  3286   return fullstrip;

  3287 }

  3289 FLAG AffixMgr::get_keepcase() const

  3290 {

  3291   return keepcase;

  3292 }

  3294 FLAG AffixMgr::get_forceucase() const

  3295 {

  3296   return forceucase;

  3297 }

  3299 FLAG AffixMgr::get_warn() const

  3300 {

  3301   return warn;

  3302 }

  3304 int AffixMgr::get_forbidwarn() const

  3305 {

  3306   return forbidwarn;

  3307 }

  3309 int AffixMgr::get_checksharps() const

  3310 {

  3311   return checksharps;

  3312 }

  3314 char * AffixMgr::encode_flag(unsigned short aflag) const

  3315 {

  3316   return pHMgr->encode_flag(aflag);

  3317 }

  3320 // return the preferred ignore string for suggestions

  3321 char * AffixMgr::get_ignore() const

  3322 {

  3323   if (!ignorechars) return NULL;

  3324   return ignorechars;

  3325 }

  3327 // return the preferred ignore string for suggestions

  3328 unsigned short * AffixMgr::get_ignore_utf16(int * len) const

  3329 {

  3330   *len = ignorechars_utf16_len;

  3331   return ignorechars_utf16;

  3332 }

  3334 // return the keyboard string for suggestions

  3335 char * AffixMgr::get_key_string()

  3336 {

  3337   if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);

  3338   return mystrdup(keystring);

  3339 }

  3341 // return the preferred try string for suggestions

  3342 char * AffixMgr::get_try_string() const

  3343 {

  3344   if (! trystring ) return NULL;

  3345   return mystrdup(trystring);

  3346 }

  3348 // return the preferred try string for suggestions

  3349 const char * AffixMgr::get_wordchars() const

  3350 {

  3351   return wordchars;

  3352 }

  3354 unsigned short * AffixMgr::get_wordchars_utf16(int * len) const

  3355 {

  3356   *len = wordchars_utf16_len;

  3357   return wordchars_utf16;

  3358 }

  3360 // is there compounding?

  3361 int AffixMgr::get_compound() const

  3362 {

  3363   return compoundflag || compoundbegin || numdefcpd;

  3364 }

  3366 // return the compound words control flag

  3367 FLAG AffixMgr::get_compoundflag() const

  3368 {

  3369   return compoundflag;

  3370 }

  3372 // return the forbidden words control flag

  3373 FLAG AffixMgr::get_forbiddenword() const

  3374 {

  3375   return forbiddenword;

  3376 }

  3378 // return the forbidden words control flag

  3379 FLAG AffixMgr::get_nosuggest() const

  3380 {

  3381   return nosuggest;

  3382 }

  3384 // return the forbidden words control flag

  3385 FLAG AffixMgr::get_nongramsuggest() const

  3386 {

  3387   return nongramsuggest;

  3388 }

  3390 // return the forbidden words flag modify flag

  3391 FLAG AffixMgr::get_needaffix() const

  3392 {

  3393   return needaffix;

  3394 }

  3396 // return the onlyincompound flag

  3397 FLAG AffixMgr::get_onlyincompound() const

  3398 {

  3399   return onlyincompound;

  3400 }

  3402 // return the compound word signal flag

  3403 FLAG AffixMgr::get_compoundroot() const

  3404 {

  3405   return compoundroot;

  3406 }

  3408 // return the compound begin signal flag

  3409 FLAG AffixMgr::get_compoundbegin() const

  3410 {

  3411   return compoundbegin;

  3412 }

  3414 // return the value of checknum

  3415 int AffixMgr::get_checknum() const

  3416 {

  3417   return checknum;

  3418 }

  3420 // return the value of prefix

  3421 const char * AffixMgr::get_prefix() const

  3422 {

  3423   if (pfx) return pfx->getKey();

  3424   return NULL;

  3425 }

  3427 // return the value of suffix

  3428 const char * AffixMgr::get_suffix() const

  3429 {

  3430   return sfxappnd;

  3431 }

  3433 // return the value of suffix

  3434 const char * AffixMgr::get_version() const

  3435 {

  3436   return version;

  3437 }

  3439 // return lemma_present flag

  3440 FLAG AffixMgr::get_lemma_present() const

  3441 {

  3442   return lemma_present;

  3443 }

  3445 // utility method to look up root words in hash table

  3446 struct hentry * AffixMgr::lookup(const char * word)

  3447 {

  3448   int i;

  3449   struct hentry * he = NULL;

  3450   for (i = 0; i < *maxdic && !he; i++) {

  3451     he = (alldic[i])->lookup(word);

  3452   }

  3453   return he;

  3454 }

  3456 // return the value of suffix

  3457 int AffixMgr::have_contclass() const

  3458 {

  3459   return havecontclass;

  3460 }

  3462 // return utf8

  3463 int AffixMgr::get_utf8() const

  3464 {

  3465   return utf8;

  3466 }

  3468 int AffixMgr::get_maxngramsugs(void) const

  3469 {

  3470   return maxngramsugs;

  3471 }

  3473 int AffixMgr::get_maxcpdsugs(void) const

  3474 {

  3475   return maxcpdsugs;

  3476 }

  3478 int AffixMgr::get_maxdiff(void) const

  3479 {

  3480   return maxdiff;

  3481 }

  3483 int AffixMgr::get_onlymaxdiff(void) const

  3484 {

  3485   return onlymaxdiff;

  3486 }

  3488 // return nosplitsugs

  3489 int AffixMgr::get_nosplitsugs(void) const

  3490 {

  3491   return nosplitsugs;

  3492 }

  3494 // return sugswithdots

  3495 int AffixMgr::get_sugswithdots(void) const

  3496 {

  3497   return sugswithdots;

  3498 }

  3500 /* parse flag */

  3501 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {

  3502    char * s = NULL;

  3503    if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {

  3504       HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());

  3505       return 1;

  3506    }

  3507    if (parse_string(line, &s, af->getlinenum())) return 1;

  3508    *out = pHMgr->decode_flag(s);

  3509    free(s);

  3510    return 0;

  3511 }

  3513 /* parse num */

  3514 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {

  3515    char * s = NULL;

  3516    if (*out != -1) {

  3517       HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());

  3518       return 1;

  3519    }

  3520    if (parse_string(line, &s, af->getlinenum())) return 1;

  3521    *out = atoi(s);

  3522    free(s);

  3523    return 0;

  3524 }

  3526 /* parse in the max syllablecount of compound words and  */

  3527 int  AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)

  3528 {

  3529    char * tp = line;

  3530    char * piece;

  3531    int i = 0;

  3532    int np = 0;

  3533    w_char w[MAXWORDLEN];

  3534    piece = mystrsep(&tp, 0);

  3535    while (piece) {

  3536       if (*piece != '\0') {

  3537           switch(i) {

  3538              case 0: { np++; break; }

  3539              case 1: { cpdmaxsyllable = atoi(piece); np++; break; }

  3540              case 2: {

  3541                 if (!utf8) {

  3542                     cpdvowels = mystrdup(piece);

  3543                 } else {

  3544                     int n = u8_u16(w, MAXWORDLEN, piece);

  3545                     if (n > 0) {

  3546                         flag_qsort((unsigned short *) w, 0, n);

  3547                         cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));

  3548                         if (!cpdvowels_utf16) return 1;

  3549                         memcpy(cpdvowels_utf16, w, n * sizeof(w_char));

  3550                     }

  3551                     cpdvowels_utf16_len = n;

  3552                 }

  3553                 np++;

  3554                 break;

  3555              }

  3556              default: break;

  3557           }

  3558           i++;

  3559       }

  3560       piece = mystrsep(&tp, 0);

  3561    }

  3562    if (np < 2) {

  3563       HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());

  3564       return 1;

  3565    }

  3566    if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");

  3567    return 0;

  3568 }

  3570 /* parse in the typical fault correcting table */

  3571 int  AffixMgr::parse_reptable(char * line, FileMgr * af)

  3572 {

  3573    if (numrep != 0) {

  3574       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  3575       return 1;

  3576    }

  3577    char * tp = line;

  3578    char * piece;

  3579    int i = 0;

  3580    int np = 0;

  3581    piece = mystrsep(&tp, 0);

  3582    while (piece) {

  3583        if (*piece != '\0') {

  3584           switch(i) {

  3585              case 0: { np++; break; }

  3586              case 1: {

  3587                        numrep = atoi(piece);

  3588                        if (numrep < 1) {

  3589                           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());

  3590                           return 1;

  3591                        }

  3592                        reptable = (replentry *) malloc(numrep * sizeof(struct replentry));

  3593                        if (!reptable) return 1;

  3594                        np++;

  3595                        break;

  3596                      }

  3597              default: break;

  3598           }

  3599           i++;

  3600        }

  3601        piece = mystrsep(&tp, 0);

  3602    }

  3603    if (np != 2) {

  3604       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  3605       return 1;

  3606    }

  3608    /* now parse the numrep lines to read in the remainder of the table */

  3609    char * nl;

  3610    for (int j=0; j < numrep; j++) {

  3611         if (!(nl = af->getline())) return 1;

  3612         mychomp(nl);

  3613         tp = nl;

  3614         i = 0;

  3615         reptable[j].pattern = NULL;

  3616         reptable[j].pattern2 = NULL;

  3617         piece = mystrsep(&tp, 0);

  3618         while (piece) {

  3619            if (*piece != '\0') {

  3620                switch(i) {

  3621                   case 0: {

  3622                              if (strncmp(piece,"REP",3) != 0) {

  3623                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3624                                  numrep = 0;

  3625                                  return 1;

  3626                              }

  3627                              break;

  3628                           }

  3629                   case 1: {

  3630                             if (*piece == '^') reptable[j].start = true; else reptable[j].start = false;

  3631                             reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," ");

  3632                             int lr = strlen(reptable[j].pattern) - 1;

  3633                             if (reptable[j].pattern[lr] == '$') {

  3634                                 reptable[j].end = true;

  3635                                 reptable[j].pattern[lr] = '\0';

  3636                             } else reptable[j].end = false;

  3637                             break;

  3638                           }

  3639                   case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }

  3640                   default: break;

  3641                }

  3642                i++;

  3643            }

  3644            piece = mystrsep(&tp, 0);

  3645         }

  3646         if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {

  3647              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3648              numrep = 0;

  3649              return 1;

  3650         }

  3651    }

  3652    return 0;

  3653 }

  3655 /* parse in the typical fault correcting table */

  3656 int  AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)

  3657 {

  3658    if (*rl) {

  3659       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  3660       return 1;

  3661    }

  3662    char * tp = line;

  3663    char * piece;

  3664    int i = 0;

  3665    int np = 0;

  3666    int numrl = 0;

  3667    piece = mystrsep(&tp, 0);

  3668    while (piece) {

  3669        if (*piece != '\0') {

  3670           switch(i) {

  3671              case 0: { np++; break; }

  3672              case 1: {

  3673                        numrl = atoi(piece);

  3674                        if (numrl < 1) {

  3675                           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());

  3676                           return 1;

  3677                        }

  3678                        *rl = new RepList(numrl);

  3679                        if (!*rl) return 1;

  3680                        np++;

  3681                        break;

  3682                      }

  3683              default: break;

  3684           }

  3685           i++;

  3686        }

  3687        piece = mystrsep(&tp, 0);

  3688    }

  3689    if (np != 2) {

  3690       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  3691       return 1;

  3692    }

  3694    /* now parse the num lines to read in the remainder of the table */

  3695    char * nl;

  3696    for (int j=0; j < numrl; j++) {

  3697         if (!(nl = af->getline())) return 1;

  3698         mychomp(nl);

  3699         tp = nl;

  3700         i = 0;

  3701         char * pattern = NULL;

  3702         char * pattern2 = NULL;

  3703         piece = mystrsep(&tp, 0);

  3704         while (piece) {

  3705            if (*piece != '\0') {

  3706                switch(i) {

  3707                   case 0: {

  3708                              if (strncmp(piece, keyword, strlen(keyword)) != 0) {

  3709                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3710                                  delete *rl;

  3711                                  *rl = NULL;

  3712                                  return 1;

  3713                              }

  3714                              break;

  3715                           }

  3716                   case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }

  3717                   case 2: {

  3718                     pattern2 = mystrrep(mystrdup(piece),"_"," ");

  3719                     break;

  3720                   }

  3721                   default: break;

  3722                }

  3723                i++;

  3724            }

  3725            piece = mystrsep(&tp, 0);

  3726         }

  3727         if (!pattern || !pattern2) {

  3728             if (pattern)

  3729                 free(pattern);

  3730             if (pattern2)

  3731                 free(pattern2);

  3732             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3733             return 1;

  3734         }

  3735         (*rl)->add(pattern, pattern2);

  3736    }

  3737    return 0;

  3738 }

  3741 /* parse in the typical fault correcting table */

  3742 int  AffixMgr::parse_phonetable(char * line, FileMgr * af)

  3743 {

  3744    if (phone) {

  3745       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  3746       return 1;

  3747    }

  3748    char * tp = line;

  3749    char * piece;

  3750    int i = 0;

  3751    int np = 0;

  3752    piece = mystrsep(&tp, 0);

  3753    while (piece) {

  3754        if (*piece != '\0') {

  3755           switch(i) {

  3756              case 0: { np++; break; }

  3757              case 1: {

  3758                        phone = (phonetable *) malloc(sizeof(struct phonetable));

  3759                        if (!phone) return 1;

  3760                        phone->num = atoi(piece);

  3761                        phone->rules = NULL;

  3762                        phone->utf8 = (char) utf8;

  3763                        if (phone->num < 1) {

  3764                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());

  3765                           return 1;

  3766                        }

  3767                        phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));

  3768                        if (!phone->rules) {

  3769                           free(phone);

  3770                           phone = NULL;

  3771                           return 1;

  3772                        }

  3773                        np++;

  3774                        break;

  3775                      }

  3776              default: break;

  3777           }

  3778           i++;

  3779        }

  3780        piece = mystrsep(&tp, 0);

  3781    }

  3782    if (np != 2) {

  3783       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  3784       return 1;

  3785    }

  3787    /* now parse the phone->num lines to read in the remainder of the table */

  3788    char * nl;

  3789    for (int j=0; j < phone->num; j++) {

  3790         if (!(nl = af->getline())) return 1;

  3791         mychomp(nl);

  3792         tp = nl;

  3793         i = 0;

  3794         phone->rules[j * 2] = NULL;

  3795         phone->rules[j * 2 + 1] = NULL;

  3796         piece = mystrsep(&tp, 0);

  3797         while (piece) {

  3798            if (*piece != '\0') {

  3799                switch(i) {

  3800                   case 0: {

  3801                              if (strncmp(piece,"PHONE",5) != 0) {

  3802                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3803                                  phone->num = 0;

  3804                                  return 1;

  3805                              }

  3806                              break;

  3807                           }

  3808                   case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }

  3809                   case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }

  3810                   default: break;

  3811                }

  3812                i++;

  3813            }

  3814            piece = mystrsep(&tp, 0);

  3815         }

  3816         if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {

  3817              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3818              phone->num = 0;

  3819              return 1;

  3820         }

  3821    }

  3822    phone->rules[phone->num * 2] = mystrdup("");

  3823    phone->rules[phone->num * 2 + 1] = mystrdup("");

  3824    init_phonet_hash(*phone);

  3825    return 0;

  3826 }

  3828 /* parse in the checkcompoundpattern table */

  3829 int  AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)

  3830 {

  3831    if (numcheckcpd != 0) {

  3832       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  3833       return 1;

  3834    }

  3835    char * tp = line;

  3836    char * piece;

  3837    int i = 0;

  3838    int np = 0;

  3839    piece = mystrsep(&tp, 0);

  3840    while (piece) {

  3841        if (*piece != '\0') {

  3842           switch(i) {

  3843              case 0: { np++; break; }

  3844              case 1: {

  3845                        numcheckcpd = atoi(piece);

  3846                        if (numcheckcpd < 1) {

  3847                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());

  3848                           return 1;

  3849                        }

  3850                        checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));

  3851                        if (!checkcpdtable) return 1;

  3852                        np++;

  3853                        break;

  3854                      }

  3855              default: break;

  3856           }

  3857           i++;

  3858        }

  3859        piece = mystrsep(&tp, 0);

  3860    }

  3861    if (np != 2) {

  3862       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",  af->getlinenum());

  3863       return 1;

  3864    }

  3866    /* now parse the numcheckcpd lines to read in the remainder of the table */

  3867    char * nl;

  3868    for (int j=0; j < numcheckcpd; j++) {

  3869         if (!(nl = af->getline())) return 1;

  3870         mychomp(nl);

  3871         tp = nl;

  3872         i = 0;

  3873         checkcpdtable[j].pattern = NULL;

  3874         checkcpdtable[j].pattern2 = NULL;

  3875         checkcpdtable[j].pattern3 = NULL;

  3876         checkcpdtable[j].cond = FLAG_NULL;

  3877         checkcpdtable[j].cond2 = FLAG_NULL;

  3878         piece = mystrsep(&tp, 0);

  3879         while (piece) {

  3880            if (*piece != '\0') {

  3881                switch(i) {

  3882                   case 0: {

  3883                              if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {

  3884                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3885                                  numcheckcpd = 0;

  3886                                  return 1;

  3887                              }

  3888                              break;

  3889                           }

  3890                   case 1: {

  3891                     checkcpdtable[j].pattern = mystrdup(piece);

  3892                     char * p = strchr(checkcpdtable[j].pattern, '/');

  3893                     if (p) {

  3894                       *p = '\0';

  3895                     checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);

  3896                     }

  3897                     break; }

  3898                   case 2: {

  3899                     checkcpdtable[j].pattern2 = mystrdup(piece);

  3900                     char * p = strchr(checkcpdtable[j].pattern2, '/');

  3901                     if (p) {

  3902                       *p = '\0';

  3903                       checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);

  3904                     }

  3905                     break;

  3906                     }

  3907                   case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }

  3908                   default: break;

  3909                }

  3910                i++;

  3911            }

  3912            piece = mystrsep(&tp, 0);

  3913         }

  3914         if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {

  3915              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3916              numcheckcpd = 0;

  3917              return 1;

  3918         }

  3919    }

  3920    return 0;

  3921 }

  3923 /* parse in the compound rule table */

  3924 int  AffixMgr::parse_defcpdtable(char * line, FileMgr * af)

  3925 {

  3926    if (numdefcpd != 0) {

  3927       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  3928       return 1;

  3929    }

  3930    char * tp = line;

  3931    char * piece;

  3932    int i = 0;

  3933    int np = 0;

  3934    piece = mystrsep(&tp, 0);

  3935    while (piece) {

  3936        if (*piece != '\0') {

  3937           switch(i) {

  3938              case 0: { np++; break; }

  3939              case 1: {

  3940                        numdefcpd = atoi(piece);

  3941                        if (numdefcpd < 1) {

  3942                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());

  3943                           return 1;

  3944                        }

  3945                        defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));

  3946                        if (!defcpdtable) return 1;

  3947                        np++;

  3948                        break;

  3949                      }

  3950              default: break;

  3951           }

  3952           i++;

  3953        }

  3954        piece = mystrsep(&tp, 0);

  3955    }

  3956    if (np != 2) {

  3957       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  3958       return 1;

  3959    }

  3961    /* now parse the numdefcpd lines to read in the remainder of the table */

  3962    char * nl;

  3963    for (int j=0; j < numdefcpd; j++) {

  3964         if (!(nl = af->getline())) return 1;

  3965         mychomp(nl);

  3966         tp = nl;

  3967         i = 0;

  3968         defcpdtable[j].def = NULL;

  3969         piece = mystrsep(&tp, 0);

  3970         while (piece) {

  3971            if (*piece != '\0') {

  3972                switch(i) {

  3973                   case 0: {

  3974                              if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {

  3975                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  3976                                  numdefcpd = 0;

  3977                                  return 1;

  3978                              }

  3979                              break;

  3980                           }

  3981                   case 1: { // handle parenthesized flags

  3982                             if (strchr(piece, '(')) {

  3983                                 defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG));

  3984                                 defcpdtable[j].len = 0;

  3985                                 int end = 0;

  3986                                 FLAG * conv;

  3987                                 while (!end) {

  3988                                     char * par = piece + 1;

  3989                                     while (*par != '(' && *par != ')' && *par != '\0') par++;

  3990                                     if (*par == '\0') end = 1; else *par = '\0';

  3991                                     if (*piece == '(') piece++;

  3992                                     if (*piece == '*' || *piece == '?') {

  3993                                         defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;

  3994                                     } else if (*piece != '\0') {

  3995                                         int l = pHMgr->decode_flags(&conv, piece, af);

  3996                                         for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];

  3997                                         free(conv);

  3998                                     }

  3999                                     piece = par + 1;

  4000                                 }

  4001                             } else {

  4002                                 defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);

  4003                             }

  4004                             break;

  4005                            }

  4006                   default: break;

  4007                }

  4008                i++;

  4009            }

  4010            piece = mystrsep(&tp, 0);

  4011         }

  4012         if (!defcpdtable[j].len) {

  4013              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  4014              numdefcpd = 0;

  4015              return 1;

  4016         }

  4017    }

  4018    return 0;

  4019 }

  4022 /* parse in the character map table */

  4023 int  AffixMgr::parse_maptable(char * line, FileMgr * af)

  4024 {

  4025    if (nummap != 0) {

  4026       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  4027       return 1;

  4028    }

  4029    char * tp = line;

  4030    char * piece;

  4031    int i = 0;

  4032    int np = 0;

  4033    piece = mystrsep(&tp, 0);

  4034    while (piece) {

  4035        if (*piece != '\0') {

  4036           switch(i) {

  4037              case 0: { np++; break; }

  4038              case 1: {

  4039                        nummap = atoi(piece);

  4040                        if (nummap < 1) {

  4041                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());

  4042                           return 1;

  4043                        }

  4044                        maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));

  4045                        if (!maptable) return 1;

  4046                        np++;

  4047                        break;

  4048                      }

  4049              default: break;

  4050           }

  4051           i++;

  4052        }

  4053        piece = mystrsep(&tp, 0);

  4054    }

  4055    if (np != 2) {

  4056       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  4057       return 1;

  4058    }

  4060    /* now parse the nummap lines to read in the remainder of the table */

  4061    char * nl;

  4062    for (int j=0; j < nummap; j++) {

  4063         if (!(nl = af->getline())) return 1;

  4064         mychomp(nl);

  4065         tp = nl;

  4066         i = 0;

  4067         maptable[j].set = NULL;

  4068         maptable[j].len = 0;

  4069         piece = mystrsep(&tp, 0);

  4070         while (piece) {

  4071            if (*piece != '\0') {

  4072                switch(i) {

  4073                   case 0: {

  4074                              if (strncmp(piece,"MAP",3) != 0) {

  4075                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  4076                                  nummap = 0;

  4077                                  return 1;

  4078                              }

  4079                              break;

  4080                           }

  4081                   case 1: {

  4082 			    int setn = 0;

  4083                             maptable[j].len = strlen(piece);

  4084                             maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*));

  4085                             if (!maptable[j].set) return 1;

  4086 			    for (int k = 0; k < maptable[j].len; k++) {

  4087 				int chl = 1;

  4088 				int chb = k;

  4089 			        if (piece[k] == '(') {

  4090 				    char * parpos = strchr(piece + k, ')');

  4091 				    if (parpos != NULL) {

  4092 					chb = k + 1;

  4093 					chl = (int)(parpos - piece) - k - 1;

  4094 					k = k + chl + 1;

  4095 				    }

  4096 				} else {

  4097 				    if (utf8 && (piece[k] & 0xc0) == 0xc0) {

  4098 					for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++);

  4099 					chl = k - chb;

  4100 					k--;

  4101 				    }

  4102 				}

  4103 				maptable[j].set[setn] = (char *) malloc(chl + 1);

  4104 				if (!maptable[j].set[setn]) return 1;

  4105 				strncpy(maptable[j].set[setn], piece + chb, chl);

  4106 				maptable[j].set[setn][chl] = '\0';

  4107 				setn++;

  4108 			    }

  4109                             maptable[j].len = setn;

  4110                             break; }

  4111                   default: break;

  4112                }

  4113                i++;

  4114            }

  4115            piece = mystrsep(&tp, 0);

  4116         }

  4117         if (!maptable[j].set || !maptable[j].len) {

  4118              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  4119              nummap = 0;

  4120              return 1;

  4121         }

  4122    }

  4123    return 0;

  4124 }

  4126 /* parse in the word breakpoint table */

  4127 int  AffixMgr::parse_breaktable(char * line, FileMgr * af)

  4128 {

  4129    if (numbreak > -1) {

  4130       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());

  4131       return 1;

  4132    }

  4133    char * tp = line;

  4134    char * piece;

  4135    int i = 0;

  4136    int np = 0;

  4137    piece = mystrsep(&tp, 0);

  4138    while (piece) {

  4139        if (*piece != '\0') {

  4140           switch(i) {

  4141              case 0: { np++; break; }

  4142              case 1: {

  4143                        numbreak = atoi(piece);

  4144                        if (numbreak < 0) {

  4145                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());

  4146                           return 1;

  4147                        }

  4148                        if (numbreak == 0) return 0;

  4149                        breaktable = (char **) malloc(numbreak * sizeof(char *));

  4150                        if (!breaktable) return 1;

  4151                        np++;

  4152                        break;

  4153                      }

  4154              default: break;

  4155           }

  4156           i++;

  4157        }

  4158        piece = mystrsep(&tp, 0);

  4159    }

  4160    if (np != 2) {

  4161       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  4162       return 1;

  4163    }

  4165    /* now parse the numbreak lines to read in the remainder of the table */

  4166    char * nl;

  4167    for (int j=0; j < numbreak; j++) {

  4168         if (!(nl = af->getline())) return 1;

  4169         mychomp(nl);

  4170         tp = nl;

  4171         i = 0;

  4172         piece = mystrsep(&tp, 0);

  4173         while (piece) {

  4174            if (*piece != '\0') {

  4175                switch(i) {

  4176                   case 0: {

  4177                              if (strncmp(piece,"BREAK",5) != 0) {

  4178                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  4179                                  numbreak = 0;

  4180                                  return 1;

  4181                              }

  4182                              break;

  4183                           }

  4184                   case 1: {

  4185                             breaktable[j] = mystrdup(piece);

  4186                             break;

  4187                           }

  4188                   default: break;

  4189                }

  4190                i++;

  4191            }

  4192            piece = mystrsep(&tp, 0);

  4193         }

  4194         if (!breaktable) {

  4195              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());

  4196              numbreak = 0;

  4197              return 1;

  4198         }

  4199    }

  4200    return 0;

  4201 }

  4203 void AffixMgr::reverse_condition(char * piece) {

  4204     int neg = 0;

  4205     for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {

  4206         switch(*k) {

  4207           case '[': {

  4208                 if (neg) *(k+1) = '['; else *k = ']';

  4209                     break;

  4210             }

  4211           case ']': {

  4212                 *k = '[';

  4213                 if (neg) *(k+1) = '^';

  4214                 neg = 0;

  4215                 break;

  4216             }

  4217           case '^': {

  4218                if (*(k+1) == ']') neg = 1; else *(k+1) = *k;

  4219                break;

  4220                 }

  4221           default: {

  4222             if (neg) *(k+1) = *k;

  4223           }

  4224        }

  4225     }

  4226 }

  4228 int  AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)

  4229 {

  4230    int numents = 0;      // number of affentry structures to parse

  4232    unsigned short aflag = 0;      // affix char identifier

  4234    char ff=0;

  4235    std::vector<affentry> affentries;

  4237    char * tp = line;

  4238    char * nl = line;

  4239    char * piece;

  4240    int i = 0;

  4242    // checking lines with bad syntax

  4243 #ifdef DEBUG

  4244    int basefieldnum = 0;

  4245 #endif

  4247    // split affix header line into pieces

  4249    int np = 0;

  4251    piece = mystrsep(&tp, 0);

  4252    while (piece) {

  4253       if (*piece != '\0') {

  4254           switch(i) {

  4255              // piece 1 - is type of affix

  4256              case 0: { np++; break; }

  4258              // piece 2 - is affix char

  4259              case 1: {

  4260                     np++;

  4261                     aflag = pHMgr->decode_flag(piece);

  4262                     if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||

  4263                         ((at == 'P') && (dupflags[aflag] & dupPFX))) {

  4264                         HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",

  4265                             af->getlinenum());

  4266                         // return 1; XXX permissive mode for bad dictionaries

  4267                     }

  4268                     dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);

  4269                     break;

  4270                     }

  4271              // piece 3 - is cross product indicator

  4272              case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }

  4274              // piece 4 - is number of affentries

  4275              case 3: {

  4276                        np++;

  4277                        numents = atoi(piece);

  4278                        if (numents == 0) {

  4279                            char * err = pHMgr->encode_flag(aflag);

  4280                            if (err) {

  4281                                 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",

  4282                                    af->getlinenum());

  4283                                 free(err);

  4284                            }

  4285                            return 1;

  4286                        }

  4287                        affentries.resize(numents);

  4288                        affentries[0].opts = ff;

  4289                        if (utf8) affentries[0].opts += aeUTF8;

  4290                        if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF;

  4291                        if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM;

  4292                        affentries[0].aflag = aflag;

  4293                      }

  4295              default: break;

  4296           }

  4297           i++;

  4298       }

  4299       piece = mystrsep(&tp, 0);

  4300    }

  4301    // check to make sure we parsed enough pieces

  4302    if (np != 4) {

  4303        char * err = pHMgr->encode_flag(aflag);

  4304        if (err) {

  4305             HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());

  4306             free(err);

  4307        }

  4308        return 1;

  4309    }

  4311    // now parse numents affentries for this affix

  4312    std::vector<affentry>::iterator start = affentries.begin();

  4313    std::vector<affentry>::iterator end = affentries.end();

  4314    for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {

  4315       if (!(nl = af->getline())) return 1;

  4316       mychomp(nl);

  4317       tp = nl;

  4318       i = 0;

  4319       np = 0;

  4321       // split line into pieces

  4322       piece = mystrsep(&tp, 0);

  4323       while (piece) {

  4324          if (*piece != '\0') {

  4325              switch(i) {

  4326                 // piece 1 - is type

  4327                 case 0: {

  4328                           np++;

  4329                           if (entry != start) entry->opts = start->opts &

  4330                              (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);

  4331                           break;

  4332                         }

  4334                 // piece 2 - is affix char

  4335                 case 1: {

  4336                           np++;

  4337                           if (pHMgr->decode_flag(piece) != aflag) {

  4338                               char * err = pHMgr->encode_flag(aflag);

  4339                               if (err) {

  4340                                 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",

  4341                                     af->getlinenum(), err);

  4342                                 free(err);

  4343                               }

  4344                               return 1;

  4345                           }

  4347                           if (entry != start) entry->aflag = start->aflag;

  4348                           break;

  4349                         }

  4351                 // piece 3 - is string to strip or 0 for null

  4352                 case 2: {

  4353                           np++;

  4354                           if (complexprefixes) {

  4355                             if (utf8) reverseword_utf(piece); else reverseword(piece);

  4356                           }

  4357                           entry->strip = mystrdup(piece);

  4358                           entry->stripl = (unsigned char) strlen(entry->strip);

  4359                           if (strcmp(entry->strip,"0") == 0) {

  4360                               free(entry->strip);

  4361                               entry->strip=mystrdup("");

  4362                               entry->stripl = 0;

  4363                           }

  4364                           break;

  4365                         }

  4367                 // piece 4 - is affix string or 0 for null

  4368                 case 3: {

  4369                           char * dash;

  4370                           entry->morphcode = NULL;

  4371                           entry->contclass = NULL;

  4372                           entry->contclasslen = 0;

  4373                           np++;

  4374                           dash = strchr(piece, '/');

  4375                           if (dash) {

  4376                             *dash = '\0';

  4378                             if (ignorechars) {

  4379                               if (utf8) {

  4380                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);

  4381                               } else {

  4382                                 remove_ignored_chars(piece,ignorechars);

  4383                               }

  4384                             }

  4386                             if (complexprefixes) {

  4387                                 if (utf8) reverseword_utf(piece); else reverseword(piece);

  4388                             }

  4389                             entry->appnd = mystrdup(piece);

  4391                             if (pHMgr->is_aliasf()) {

  4392                                 int index = atoi(dash + 1);

  4393                                 entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af);

  4394                                 if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);

  4395                             } else {

  4396                                 entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af);

  4397                                 flag_qsort(entry->contclass, 0, entry->contclasslen);

  4398                             }

  4399                             *dash = '/';

  4401                             havecontclass = 1;

  4402                             for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {

  4403                               contclasses[(entry->contclass)[_i]] = 1;

  4404                             }

  4405                           } else {

  4406                             if (ignorechars) {

  4407                               if (utf8) {

  4408                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);

  4409                               } else {

  4410                                 remove_ignored_chars(piece,ignorechars);

  4411                               }

  4412                             }

  4414                             if (complexprefixes) {

  4415                                 if (utf8) reverseword_utf(piece); else reverseword(piece);

  4416                             }

  4417                             entry->appnd = mystrdup(piece);

  4418                           }

  4420                           entry->appndl = (unsigned char) strlen(entry->appnd);

  4421                           if (strcmp(entry->appnd,"0") == 0) {

  4422                               free(entry->appnd);

  4423                               entry->appnd=mystrdup("");

  4424                               entry->appndl = 0;

  4425                           }

  4426                           break;

  4427                         }

  4429                 // piece 5 - is the conditions descriptions

  4430                 case 4: {

  4431                           np++;

  4432                           if (complexprefixes) {

  4433                             if (utf8) reverseword_utf(piece); else reverseword(piece);

  4434                             reverse_condition(piece);

  4435                           }

  4436                           if (entry->stripl && (strcmp(piece, ".") != 0) &&

  4437                             redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum()))

  4438                                 strcpy(piece, ".");

  4439                           if (at == 'S') {

  4440                             reverseword(piece);

  4441                             reverse_condition(piece);

  4442                           }

  4443                           if (encodeit(*entry, piece)) return 1;

  4444                          break;

  4445                 }

  4447                 case 5: {

  4448                           np++;

  4449                           if (pHMgr->is_aliasm()) {

  4450                             int index = atoi(piece);

  4451                             entry->morphcode = pHMgr->get_aliasm(index);

  4452                           } else {

  4453                             if (complexprefixes) { // XXX - fix me for morph. gen.

  4454                                 if (utf8) reverseword_utf(piece); else reverseword(piece);

  4455                             }

  4456                             // add the remaining of the line

  4457                             if (*tp) {

  4458                                 *(tp - 1) = ' ';

  4459                                 tp = tp + strlen(tp);

  4460                             }

  4461                             entry->morphcode = mystrdup(piece);

  4462                             if (!entry->morphcode) return 1;

  4463                           }

  4464                           break;

  4465                 }

  4466                 default: break;

  4467              }

  4468              i++;

  4469          }

  4470          piece = mystrsep(&tp, 0);

  4471       }

  4472       // check to make sure we parsed enough pieces

  4473       if (np < 4) {

  4474           char * err = pHMgr->encode_flag(aflag);

  4475           if (err) {

  4476             HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",

  4477                 af->getlinenum(), err);

  4478             free(err);

  4479           }

  4480           return 1;

  4481       }

  4483 #ifdef DEBUG

  4484       // detect unnecessary fields, excepting comments

  4485       if (basefieldnum) {

  4486         int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);

  4487           if (fieldnum != basefieldnum)

  4488             HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());

  4489       } else {

  4490         basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);

  4491       }

  4492 #endif

  4493    }

  4495    // now create SfxEntry or PfxEntry objects and use links to

  4496    // build an ordered (sorted by affix string) list

  4497    for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {

  4498       if (at == 'P') {

  4499           PfxEntry * pfxptr = new PfxEntry(this,&(*entry));

  4500           build_pfxtree(pfxptr);

  4501       } else {

  4502           SfxEntry * sfxptr = new SfxEntry(this,&(*entry));

  4503           build_sfxtree(sfxptr);

  4504       }

  4505    }

  4506    return 0;

  4507 }

  4509 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {

  4510   int condl = strlen(cond);

  4511   int i;

  4512   int j;

  4513   int neg;

  4514   int in;

  4515   if (ft == 'P') { // prefix

  4516     if (strncmp(strip, cond, condl) == 0) return 1;

  4517     if (utf8) {

  4518     } else {

  4519       for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {

  4520         if (cond[j] != '[') {

  4521           if (cond[j] != strip[i]) {

  4522             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);

  4523             return 0;

  4524           }

  4525         } else {

  4526           neg = (cond[j+1] == '^') ? 1 : 0;

  4527           in = 0;

  4528           do {

  4529             j++;

  4530             if (strip[i] == cond[j]) in = 1;

  4531           } while ((j < (condl - 1)) && (cond[j] != ']'));

  4532           if (j == (condl - 1) && (cond[j] != ']')) {

  4533             HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond);

  4534             return 0;

  4535           }

  4536           if ((!neg && !in) || (neg && in)) {

  4537             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);

  4538             return 0;

  4539           }

  4540         }

  4541       }

  4542       if (j >= condl) return 1;

  4543     }

  4544   } else { // suffix

  4545     if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;

  4546     if (utf8) {

  4547     } else {

  4548       for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {

  4549         if (cond[j] != ']') {

  4550           if (cond[j] != strip[i]) {

  4551             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);

  4552             return 0;

  4553           }

  4554         } else {

  4555           in = 0;

  4556           do {

  4557             j--;

  4558             if (strip[i] == cond[j]) in = 1;

  4559           } while ((j > 0) && (cond[j] != '['));

  4560           if ((j == 0) && (cond[j] != '[')) {

  4561             HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond);

  4562             return 0;

  4563           }

  4564           neg = (cond[j+1] == '^') ? 1 : 0;

  4565           if ((!neg && !in) || (neg && in)) {

  4566             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);

  4567             return 0;

  4568           }

  4569         }

  4570       }

  4571       if (j < 0) return 1;

  4572     }

  4573   }

  4574   return 0;

  4575 }

The Tor Browser / file revision

extensions/spellcheck/hunspell/src/affixmgr.cpp@6474c204b198

extensions/spellcheck/hunspell/src/affixmgr.cpp