1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/hunspell/src/affixmgr.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,4575 @@ 1.4 +/******* BEGIN LICENSE BLOCK ******* 1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 1.6 + * 1.7 + * The contents of this file are subject to the Mozilla Public License Version 1.8 + * 1.1 (the "License"); you may not use this file except in compliance with 1.9 + * the License. You may obtain a copy of the License at 1.10 + * http://www.mozilla.org/MPL/ 1.11 + * 1.12 + * Software distributed under the License is distributed on an "AS IS" basis, 1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 1.14 + * for the specific language governing rights and limitations under the 1.15 + * License. 1.16 + * 1.17 + * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) 1.18 + * and László Németh (Hunspell). Portions created by the Initial Developers 1.19 + * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. 1.20 + * 1.21 + * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) 1.22 + * David Einstein (deinst@world.std.com) 1.23 + * László Németh (nemethl@gyorsposta.hu) 1.24 + * Caolan McNamara (caolanm@redhat.com) 1.25 + * Davide Prina 1.26 + * Giuseppe Modugno 1.27 + * Gianluca Turconi 1.28 + * Simon Brouwer 1.29 + * Noll Janos 1.30 + * Biro Arpad 1.31 + * Goldman Eleonora 1.32 + * Sarlos Tamas 1.33 + * Bencsath Boldizsar 1.34 + * Halacsy Peter 1.35 + * Dvornik Laszlo 1.36 + * Gefferth Andras 1.37 + * Nagy Viktor 1.38 + * Varga Daniel 1.39 + * Chris Halls 1.40 + * Rene Engelhard 1.41 + * Bram Moolenaar 1.42 + * Dafydd Jones 1.43 + * Harri Pitkanen 1.44 + * Andras Timar 1.45 + * Tor Lillqvist 1.46 + * 1.47 + * Alternatively, the contents of this file may be used under the terms of 1.48 + * either the GNU General Public License Version 2 or later (the "GPL"), or 1.49 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 1.50 + * in which case the provisions of the GPL or the LGPL are applicable instead 1.51 + * of those above. If you wish to allow use of your version of this file only 1.52 + * under the terms of either the GPL or the LGPL, and not to allow others to 1.53 + * use your version of this file under the terms of the MPL, indicate your 1.54 + * decision by deleting the provisions above and replace them with the notice 1.55 + * and other provisions required by the GPL or the LGPL. If you do not delete 1.56 + * the provisions above, a recipient may use your version of this file under 1.57 + * the terms of any one of the MPL, the GPL or the LGPL. 1.58 + * 1.59 + ******* END LICENSE BLOCK *******/ 1.60 + 1.61 +#include <stdlib.h> 1.62 +#include <string.h> 1.63 +#include <stdio.h> 1.64 +#include <ctype.h> 1.65 + 1.66 +#include <vector> 1.67 + 1.68 +#include "affixmgr.hxx" 1.69 +#include "affentry.hxx" 1.70 +#include "langnum.hxx" 1.71 + 1.72 +#include "csutil.hxx" 1.73 + 1.74 +AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key) 1.75 +{ 1.76 + // register hash manager and load affix data from aff file 1.77 + pHMgr = ptr[0]; 1.78 + alldic = ptr; 1.79 + maxdic = md; 1.80 + keystring = NULL; 1.81 + trystring = NULL; 1.82 + encoding=NULL; 1.83 + csconv=NULL; 1.84 + utf8 = 0; 1.85 + complexprefixes = 0; 1.86 + maptable = NULL; 1.87 + nummap = 0; 1.88 + breaktable = NULL; 1.89 + numbreak = -1; 1.90 + reptable = NULL; 1.91 + numrep = 0; 1.92 + iconvtable = NULL; 1.93 + oconvtable = NULL; 1.94 + checkcpdtable = NULL; 1.95 + // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) 1.96 + simplifiedcpd = 0; 1.97 + numcheckcpd = 0; 1.98 + defcpdtable = NULL; 1.99 + numdefcpd = 0; 1.100 + phone = NULL; 1.101 + compoundflag = FLAG_NULL; // permits word in compound forms 1.102 + compoundbegin = FLAG_NULL; // may be first word in compound forms 1.103 + compoundmiddle = FLAG_NULL; // may be middle word in compound forms 1.104 + compoundend = FLAG_NULL; // may be last word in compound forms 1.105 + compoundroot = FLAG_NULL; // compound word signing flag 1.106 + compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word 1.107 + compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word 1.108 + checkcompounddup = 0; // forbid double words in compounds 1.109 + checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) 1.110 + checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds 1.111 + checkcompoundtriple = 0; // forbid compounds with triple letters 1.112 + simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt) 1.113 + forbiddenword = FORBIDDENWORD; // forbidden word signing flag 1.114 + nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag 1.115 + nongramsuggest = FLAG_NULL; 1.116 + lang = NULL; // language 1.117 + langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) 1.118 + needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes 1.119 + cpdwordmax = -1; // default: unlimited wordcount in compound words 1.120 + cpdmin = -1; // undefined 1.121 + cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words 1.122 + cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX) 1.123 + cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search) 1.124 + cpdvowels_utf16_len=0; // vowels 1.125 + pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG 1.126 + sfxappnd=NULL; // previous suffix for counting a special syllables BUG 1.127 + cpdsyllablenum=NULL; // syllable count incrementing flag 1.128 + checknum=0; // checking numbers, and word with numbers 1.129 + wordchars=NULL; // letters + spec. word characters 1.130 + wordchars_utf16=NULL; // letters + spec. word characters 1.131 + wordchars_utf16_len=0; // letters + spec. word characters 1.132 + ignorechars=NULL; // letters + spec. word characters 1.133 + ignorechars_utf16=NULL; // letters + spec. word characters 1.134 + ignorechars_utf16_len=0; // letters + spec. word characters 1.135 + version=NULL; // affix and dictionary file version string 1.136 + havecontclass=0; // flags of possible continuing classes (double affix) 1.137 + // LEMMA_PRESENT: not put root into the morphological output. Lemma presents 1.138 + // in morhological description in dictionary file. It's often combined with PSEUDOROOT. 1.139 + lemma_present = FLAG_NULL; 1.140 + circumfix = FLAG_NULL; 1.141 + onlyincompound = FLAG_NULL; 1.142 + maxngramsugs = -1; // undefined 1.143 + maxdiff = -1; // undefined 1.144 + onlymaxdiff = 0; 1.145 + maxcpdsugs = -1; // undefined 1.146 + nosplitsugs = 0; 1.147 + sugswithdots = 0; 1.148 + keepcase = 0; 1.149 + forceucase = 0; 1.150 + warn = 0; 1.151 + forbidwarn = 0; 1.152 + checksharps = 0; 1.153 + substandard = FLAG_NULL; 1.154 + fullstrip = 0; 1.155 + 1.156 + sfx = NULL; 1.157 + pfx = NULL; 1.158 + 1.159 + for (int i=0; i < SETSIZE; i++) { 1.160 + pStart[i] = NULL; 1.161 + sStart[i] = NULL; 1.162 + pFlag[i] = NULL; 1.163 + sFlag[i] = NULL; 1.164 + } 1.165 + 1.166 + for (int j=0; j < CONTSIZE; j++) { 1.167 + contclasses[j] = 0; 1.168 + } 1.169 + 1.170 + if (parse_file(affpath, key)) { 1.171 + HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); 1.172 + } 1.173 + 1.174 + if (cpdmin == -1) cpdmin = MINCPDLEN; 1.175 + 1.176 +} 1.177 + 1.178 + 1.179 +AffixMgr::~AffixMgr() 1.180 +{ 1.181 + // pass through linked prefix entries and clean up 1.182 + for (int i=0; i < SETSIZE ;i++) { 1.183 + pFlag[i] = NULL; 1.184 + PfxEntry * ptr = pStart[i]; 1.185 + PfxEntry * nptr = NULL; 1.186 + while (ptr) { 1.187 + nptr = ptr->getNext(); 1.188 + delete(ptr); 1.189 + ptr = nptr; 1.190 + nptr = NULL; 1.191 + } 1.192 + } 1.193 + 1.194 + // pass through linked suffix entries and clean up 1.195 + for (int j=0; j < SETSIZE ; j++) { 1.196 + sFlag[j] = NULL; 1.197 + SfxEntry * ptr = sStart[j]; 1.198 + SfxEntry * nptr = NULL; 1.199 + while (ptr) { 1.200 + nptr = ptr->getNext(); 1.201 + delete(ptr); 1.202 + ptr = nptr; 1.203 + nptr = NULL; 1.204 + } 1.205 + sStart[j] = NULL; 1.206 + } 1.207 + 1.208 + if (keystring) free(keystring); 1.209 + keystring=NULL; 1.210 + if (trystring) free(trystring); 1.211 + trystring=NULL; 1.212 + if (encoding) free(encoding); 1.213 + encoding=NULL; 1.214 + if (maptable) { 1.215 + for (int j=0; j < nummap; j++) { 1.216 + for (int k=0; k < maptable[j].len; k++) { 1.217 + if (maptable[j].set[k]) free(maptable[j].set[k]); 1.218 + } 1.219 + free(maptable[j].set); 1.220 + maptable[j].set = NULL; 1.221 + maptable[j].len = 0; 1.222 + } 1.223 + free(maptable); 1.224 + maptable = NULL; 1.225 + } 1.226 + nummap = 0; 1.227 + if (breaktable) { 1.228 + for (int j=0; j < numbreak; j++) { 1.229 + if (breaktable[j]) free(breaktable[j]); 1.230 + breaktable[j] = NULL; 1.231 + } 1.232 + free(breaktable); 1.233 + breaktable = NULL; 1.234 + } 1.235 + numbreak = 0; 1.236 + if (reptable) { 1.237 + for (int j=0; j < numrep; j++) { 1.238 + free(reptable[j].pattern); 1.239 + free(reptable[j].pattern2); 1.240 + } 1.241 + free(reptable); 1.242 + reptable = NULL; 1.243 + } 1.244 + if (iconvtable) delete iconvtable; 1.245 + if (oconvtable) delete oconvtable; 1.246 + if (phone && phone->rules) { 1.247 + for (int j=0; j < phone->num + 1; j++) { 1.248 + free(phone->rules[j * 2]); 1.249 + free(phone->rules[j * 2 + 1]); 1.250 + } 1.251 + free(phone->rules); 1.252 + free(phone); 1.253 + phone = NULL; 1.254 + } 1.255 + 1.256 + if (defcpdtable) { 1.257 + for (int j=0; j < numdefcpd; j++) { 1.258 + free(defcpdtable[j].def); 1.259 + defcpdtable[j].def = NULL; 1.260 + } 1.261 + free(defcpdtable); 1.262 + defcpdtable = NULL; 1.263 + } 1.264 + numrep = 0; 1.265 + if (checkcpdtable) { 1.266 + for (int j=0; j < numcheckcpd; j++) { 1.267 + free(checkcpdtable[j].pattern); 1.268 + free(checkcpdtable[j].pattern2); 1.269 + free(checkcpdtable[j].pattern3); 1.270 + checkcpdtable[j].pattern = NULL; 1.271 + checkcpdtable[j].pattern2 = NULL; 1.272 + checkcpdtable[j].pattern3 = NULL; 1.273 + } 1.274 + free(checkcpdtable); 1.275 + checkcpdtable = NULL; 1.276 + } 1.277 + numcheckcpd = 0; 1.278 + FREE_FLAG(compoundflag); 1.279 + FREE_FLAG(compoundbegin); 1.280 + FREE_FLAG(compoundmiddle); 1.281 + FREE_FLAG(compoundend); 1.282 + FREE_FLAG(compoundpermitflag); 1.283 + FREE_FLAG(compoundforbidflag); 1.284 + FREE_FLAG(compoundroot); 1.285 + FREE_FLAG(forbiddenword); 1.286 + FREE_FLAG(nosuggest); 1.287 + FREE_FLAG(nongramsuggest); 1.288 + FREE_FLAG(needaffix); 1.289 + FREE_FLAG(lemma_present); 1.290 + FREE_FLAG(circumfix); 1.291 + FREE_FLAG(onlyincompound); 1.292 + 1.293 + cpdwordmax = 0; 1.294 + pHMgr = NULL; 1.295 + cpdmin = 0; 1.296 + cpdmaxsyllable = 0; 1.297 + if (cpdvowels) free(cpdvowels); 1.298 + if (cpdvowels_utf16) free(cpdvowels_utf16); 1.299 + if (cpdsyllablenum) free(cpdsyllablenum); 1.300 + free_utf_tbl(); 1.301 + if (lang) free(lang); 1.302 + if (wordchars) free(wordchars); 1.303 + if (wordchars_utf16) free(wordchars_utf16); 1.304 + if (ignorechars) free(ignorechars); 1.305 + if (ignorechars_utf16) free(ignorechars_utf16); 1.306 + if (version) free(version); 1.307 + checknum=0; 1.308 +#ifdef MOZILLA_CLIENT 1.309 + delete [] csconv; 1.310 +#endif 1.311 +} 1.312 + 1.313 + 1.314 +// read in aff file and build up prefix and suffix entry objects 1.315 +int AffixMgr::parse_file(const char * affpath, const char * key) 1.316 +{ 1.317 + char * line; // io buffers 1.318 + char ft; // affix type 1.319 + 1.320 + // checking flag duplication 1.321 + char dupflags[CONTSIZE]; 1.322 + char dupflags_ini = 1; 1.323 + 1.324 + // first line indicator for removing byte order mark 1.325 + int firstline = 1; 1.326 + 1.327 + // open the affix file 1.328 + FileMgr * afflst = new FileMgr(affpath, key); 1.329 + if (!afflst) { 1.330 + HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); 1.331 + return 1; 1.332 + } 1.333 + 1.334 + // step one is to parse the affix file building up the internal 1.335 + // affix data structures 1.336 + 1.337 + // read in each line ignoring any that do not 1.338 + // start with a known line type indicator 1.339 + while ((line = afflst->getline())) { 1.340 + mychomp(line); 1.341 + 1.342 + /* remove byte order mark */ 1.343 + if (firstline) { 1.344 + firstline = 0; 1.345 + // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions 1.346 + if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { 1.347 + memmove(line, line+3, strlen(line+3)+1); 1.348 + } 1.349 + } 1.350 + 1.351 + /* parse in the keyboard string */ 1.352 + if (strncmp(line,"KEY",3) == 0) { 1.353 + if (parse_string(line, &keystring, afflst->getlinenum())) { 1.354 + delete afflst; 1.355 + return 1; 1.356 + } 1.357 + } 1.358 + 1.359 + /* parse in the try string */ 1.360 + if (strncmp(line,"TRY",3) == 0) { 1.361 + if (parse_string(line, &trystring, afflst->getlinenum())) { 1.362 + delete afflst; 1.363 + return 1; 1.364 + } 1.365 + } 1.366 + 1.367 + /* parse in the name of the character set used by the .dict and .aff */ 1.368 + if (strncmp(line,"SET",3) == 0) { 1.369 + if (parse_string(line, &encoding, afflst->getlinenum())) { 1.370 + delete afflst; 1.371 + return 1; 1.372 + } 1.373 + if (strcmp(encoding, "UTF-8") == 0) { 1.374 + utf8 = 1; 1.375 +#ifndef OPENOFFICEORG 1.376 +#ifndef MOZILLA_CLIENT 1.377 + if (initialize_utf_tbl()) return 1; 1.378 +#endif 1.379 +#endif 1.380 + } 1.381 + } 1.382 + 1.383 + /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */ 1.384 + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) 1.385 + complexprefixes = 1; 1.386 + 1.387 + /* parse in the flag used by the controlled compound words */ 1.388 + if (strncmp(line,"COMPOUNDFLAG",12) == 0) { 1.389 + if (parse_flag(line, &compoundflag, afflst)) { 1.390 + delete afflst; 1.391 + return 1; 1.392 + } 1.393 + } 1.394 + 1.395 + /* parse in the flag used by compound words */ 1.396 + if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { 1.397 + if (complexprefixes) { 1.398 + if (parse_flag(line, &compoundend, afflst)) { 1.399 + delete afflst; 1.400 + return 1; 1.401 + } 1.402 + } else { 1.403 + if (parse_flag(line, &compoundbegin, afflst)) { 1.404 + delete afflst; 1.405 + return 1; 1.406 + } 1.407 + } 1.408 + } 1.409 + 1.410 + /* parse in the flag used by compound words */ 1.411 + if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { 1.412 + if (parse_flag(line, &compoundmiddle, afflst)) { 1.413 + delete afflst; 1.414 + return 1; 1.415 + } 1.416 + } 1.417 + /* parse in the flag used by compound words */ 1.418 + if (strncmp(line,"COMPOUNDEND",11) == 0) { 1.419 + if (complexprefixes) { 1.420 + if (parse_flag(line, &compoundbegin, afflst)) { 1.421 + delete afflst; 1.422 + return 1; 1.423 + } 1.424 + } else { 1.425 + if (parse_flag(line, &compoundend, afflst)) { 1.426 + delete afflst; 1.427 + return 1; 1.428 + } 1.429 + } 1.430 + } 1.431 + 1.432 + /* parse in the data used by compound_check() method */ 1.433 + if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { 1.434 + if (parse_num(line, &cpdwordmax, afflst)) { 1.435 + delete afflst; 1.436 + return 1; 1.437 + } 1.438 + } 1.439 + 1.440 + /* parse in the flag sign compounds in dictionary */ 1.441 + if (strncmp(line,"COMPOUNDROOT",12) == 0) { 1.442 + if (parse_flag(line, &compoundroot, afflst)) { 1.443 + delete afflst; 1.444 + return 1; 1.445 + } 1.446 + } 1.447 + 1.448 + /* parse in the flag used by compound_check() method */ 1.449 + if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { 1.450 + if (parse_flag(line, &compoundpermitflag, afflst)) { 1.451 + delete afflst; 1.452 + return 1; 1.453 + } 1.454 + } 1.455 + 1.456 + /* parse in the flag used by compound_check() method */ 1.457 + if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { 1.458 + if (parse_flag(line, &compoundforbidflag, afflst)) { 1.459 + delete afflst; 1.460 + return 1; 1.461 + } 1.462 + } 1.463 + 1.464 + if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) { 1.465 + checkcompounddup = 1; 1.466 + } 1.467 + 1.468 + if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) { 1.469 + checkcompoundrep = 1; 1.470 + } 1.471 + 1.472 + if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) { 1.473 + checkcompoundtriple = 1; 1.474 + } 1.475 + 1.476 + if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { 1.477 + simplifiedtriple = 1; 1.478 + } 1.479 + 1.480 + if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { 1.481 + checkcompoundcase = 1; 1.482 + } 1.483 + 1.484 + if (strncmp(line,"NOSUGGEST",9) == 0) { 1.485 + if (parse_flag(line, &nosuggest, afflst)) { 1.486 + delete afflst; 1.487 + return 1; 1.488 + } 1.489 + } 1.490 + 1.491 + if (strncmp(line,"NONGRAMSUGGEST",14) == 0) { 1.492 + if (parse_flag(line, &nongramsuggest, afflst)) { 1.493 + delete afflst; 1.494 + return 1; 1.495 + } 1.496 + } 1.497 + 1.498 + /* parse in the flag used by forbidden words */ 1.499 + if (strncmp(line,"FORBIDDENWORD",13) == 0) { 1.500 + if (parse_flag(line, &forbiddenword, afflst)) { 1.501 + delete afflst; 1.502 + return 1; 1.503 + } 1.504 + } 1.505 + 1.506 + /* parse in the flag used by forbidden words */ 1.507 + if (strncmp(line,"LEMMA_PRESENT",13) == 0) { 1.508 + if (parse_flag(line, &lemma_present, afflst)) { 1.509 + delete afflst; 1.510 + return 1; 1.511 + } 1.512 + } 1.513 + 1.514 + /* parse in the flag used by circumfixes */ 1.515 + if (strncmp(line,"CIRCUMFIX",9) == 0) { 1.516 + if (parse_flag(line, &circumfix, afflst)) { 1.517 + delete afflst; 1.518 + return 1; 1.519 + } 1.520 + } 1.521 + 1.522 + /* parse in the flag used by fogemorphemes */ 1.523 + if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { 1.524 + if (parse_flag(line, &onlyincompound, afflst)) { 1.525 + delete afflst; 1.526 + return 1; 1.527 + } 1.528 + } 1.529 + 1.530 + /* parse in the flag used by `needaffixs' */ 1.531 + if (strncmp(line,"PSEUDOROOT",10) == 0) { 1.532 + if (parse_flag(line, &needaffix, afflst)) { 1.533 + delete afflst; 1.534 + return 1; 1.535 + } 1.536 + } 1.537 + 1.538 + /* parse in the flag used by `needaffixs' */ 1.539 + if (strncmp(line,"NEEDAFFIX",9) == 0) { 1.540 + if (parse_flag(line, &needaffix, afflst)) { 1.541 + delete afflst; 1.542 + return 1; 1.543 + } 1.544 + } 1.545 + 1.546 + /* parse in the minimal length for words in compounds */ 1.547 + if (strncmp(line,"COMPOUNDMIN",11) == 0) { 1.548 + if (parse_num(line, &cpdmin, afflst)) { 1.549 + delete afflst; 1.550 + return 1; 1.551 + } 1.552 + if (cpdmin < 1) cpdmin = 1; 1.553 + } 1.554 + 1.555 + /* parse in the max. words and syllables in compounds */ 1.556 + if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { 1.557 + if (parse_cpdsyllable(line, afflst)) { 1.558 + delete afflst; 1.559 + return 1; 1.560 + } 1.561 + } 1.562 + 1.563 + /* parse in the flag used by compound_check() method */ 1.564 + if (strncmp(line,"SYLLABLENUM",11) == 0) { 1.565 + if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { 1.566 + delete afflst; 1.567 + return 1; 1.568 + } 1.569 + } 1.570 + 1.571 + /* parse in the flag used by the controlled compound words */ 1.572 + if (strncmp(line,"CHECKNUM",8) == 0) { 1.573 + checknum=1; 1.574 + } 1.575 + 1.576 + /* parse in the extra word characters */ 1.577 + if (strncmp(line,"WORDCHARS",9) == 0) { 1.578 + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) { 1.579 + delete afflst; 1.580 + return 1; 1.581 + } 1.582 + } 1.583 + 1.584 + /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ 1.585 + if (strncmp(line,"IGNORE",6) == 0) { 1.586 + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { 1.587 + delete afflst; 1.588 + return 1; 1.589 + } 1.590 + } 1.591 + 1.592 + /* parse in the typical fault correcting table */ 1.593 + if (strncmp(line,"REP",3) == 0) { 1.594 + if (parse_reptable(line, afflst)) { 1.595 + delete afflst; 1.596 + return 1; 1.597 + } 1.598 + } 1.599 + 1.600 + /* parse in the input conversion table */ 1.601 + if (strncmp(line,"ICONV",5) == 0) { 1.602 + if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { 1.603 + delete afflst; 1.604 + return 1; 1.605 + } 1.606 + } 1.607 + 1.608 + /* parse in the input conversion table */ 1.609 + if (strncmp(line,"OCONV",5) == 0) { 1.610 + if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { 1.611 + delete afflst; 1.612 + return 1; 1.613 + } 1.614 + } 1.615 + 1.616 + /* parse in the phonetic translation table */ 1.617 + if (strncmp(line,"PHONE",5) == 0) { 1.618 + if (parse_phonetable(line, afflst)) { 1.619 + delete afflst; 1.620 + return 1; 1.621 + } 1.622 + } 1.623 + 1.624 + /* parse in the checkcompoundpattern table */ 1.625 + if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { 1.626 + if (parse_checkcpdtable(line, afflst)) { 1.627 + delete afflst; 1.628 + return 1; 1.629 + } 1.630 + } 1.631 + 1.632 + /* parse in the defcompound table */ 1.633 + if (strncmp(line,"COMPOUNDRULE",12) == 0) { 1.634 + if (parse_defcpdtable(line, afflst)) { 1.635 + delete afflst; 1.636 + return 1; 1.637 + } 1.638 + } 1.639 + 1.640 + /* parse in the related character map table */ 1.641 + if (strncmp(line,"MAP",3) == 0) { 1.642 + if (parse_maptable(line, afflst)) { 1.643 + delete afflst; 1.644 + return 1; 1.645 + } 1.646 + } 1.647 + 1.648 + /* parse in the word breakpoints table */ 1.649 + if (strncmp(line,"BREAK",5) == 0) { 1.650 + if (parse_breaktable(line, afflst)) { 1.651 + delete afflst; 1.652 + return 1; 1.653 + } 1.654 + } 1.655 + 1.656 + /* parse in the language for language specific codes */ 1.657 + if (strncmp(line,"LANG",4) == 0) { 1.658 + if (parse_string(line, &lang, afflst->getlinenum())) { 1.659 + delete afflst; 1.660 + return 1; 1.661 + } 1.662 + langnum = get_lang_num(lang); 1.663 + } 1.664 + 1.665 + if (strncmp(line,"VERSION",7) == 0) { 1.666 + for(line = line + 7; *line == ' ' || *line == '\t'; line++); 1.667 + version = mystrdup(line); 1.668 + } 1.669 + 1.670 + if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { 1.671 + if (parse_num(line, &maxngramsugs, afflst)) { 1.672 + delete afflst; 1.673 + return 1; 1.674 + } 1.675 + } 1.676 + 1.677 + if (strncmp(line,"ONLYMAXDIFF", 11) == 0) 1.678 + onlymaxdiff = 1; 1.679 + 1.680 + if (strncmp(line,"MAXDIFF",7) == 0) { 1.681 + if (parse_num(line, &maxdiff, afflst)) { 1.682 + delete afflst; 1.683 + return 1; 1.684 + } 1.685 + } 1.686 + 1.687 + if (strncmp(line,"MAXCPDSUGS",10) == 0) { 1.688 + if (parse_num(line, &maxcpdsugs, afflst)) { 1.689 + delete afflst; 1.690 + return 1; 1.691 + } 1.692 + } 1.693 + 1.694 + if (strncmp(line,"NOSPLITSUGS",11) == 0) { 1.695 + nosplitsugs=1; 1.696 + } 1.697 + 1.698 + if (strncmp(line,"FULLSTRIP",9) == 0) { 1.699 + fullstrip=1; 1.700 + } 1.701 + 1.702 + if (strncmp(line,"SUGSWITHDOTS",12) == 0) { 1.703 + sugswithdots=1; 1.704 + } 1.705 + 1.706 + /* parse in the flag used by forbidden words */ 1.707 + if (strncmp(line,"KEEPCASE",8) == 0) { 1.708 + if (parse_flag(line, &keepcase, afflst)) { 1.709 + delete afflst; 1.710 + return 1; 1.711 + } 1.712 + } 1.713 + 1.714 + /* parse in the flag used by `forceucase' */ 1.715 + if (strncmp(line,"FORCEUCASE",10) == 0) { 1.716 + if (parse_flag(line, &forceucase, afflst)) { 1.717 + delete afflst; 1.718 + return 1; 1.719 + } 1.720 + } 1.721 + 1.722 + /* parse in the flag used by `warn' */ 1.723 + if (strncmp(line,"WARN",4) == 0) { 1.724 + if (parse_flag(line, &warn, afflst)) { 1.725 + delete afflst; 1.726 + return 1; 1.727 + } 1.728 + } 1.729 + 1.730 + if (strncmp(line,"FORBIDWARN",10) == 0) { 1.731 + forbidwarn=1; 1.732 + } 1.733 + 1.734 + /* parse in the flag used by the affix generator */ 1.735 + if (strncmp(line,"SUBSTANDARD",11) == 0) { 1.736 + if (parse_flag(line, &substandard, afflst)) { 1.737 + delete afflst; 1.738 + return 1; 1.739 + } 1.740 + } 1.741 + 1.742 + if (strncmp(line,"CHECKSHARPS",11) == 0) { 1.743 + checksharps=1; 1.744 + } 1.745 + 1.746 + /* parse this affix: P - prefix, S - suffix */ 1.747 + ft = ' '; 1.748 + if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; 1.749 + if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; 1.750 + if (ft != ' ') { 1.751 + if (dupflags_ini) { 1.752 + memset(dupflags, 0, sizeof(dupflags)); 1.753 + dupflags_ini = 0; 1.754 + } 1.755 + if (parse_affix(line, ft, afflst, dupflags)) { 1.756 + delete afflst; 1.757 + process_pfx_tree_to_list(); 1.758 + process_sfx_tree_to_list(); 1.759 + return 1; 1.760 + } 1.761 + } 1.762 + 1.763 + } 1.764 + delete afflst; 1.765 + 1.766 + // convert affix trees to sorted list 1.767 + process_pfx_tree_to_list(); 1.768 + process_sfx_tree_to_list(); 1.769 + 1.770 + // now we can speed up performance greatly taking advantage of the 1.771 + // relationship between the affixes and the idea of "subsets". 1.772 + 1.773 + // View each prefix as a potential leading subset of another and view 1.774 + // each suffix (reversed) as a potential trailing subset of another. 1.775 + 1.776 + // To illustrate this relationship if we know the prefix "ab" is found in the 1.777 + // word to examine, only prefixes that "ab" is a leading subset of need be examined. 1.778 + // Furthermore is "ab" is not present then none of the prefixes that "ab" is 1.779 + // is a subset need be examined. 1.780 + // The same argument goes for suffix string that are reversed. 1.781 + 1.782 + // Then to top this off why not examine the first char of the word to quickly 1.783 + // limit the set of prefixes to examine (i.e. the prefixes to examine must 1.784 + // be leading supersets of the first character of the word (if they exist) 1.785 + 1.786 + // To take advantage of this "subset" relationship, we need to add two links 1.787 + // from entry. One to take next if the current prefix is found (call it nexteq) 1.788 + // and one to take next if the current prefix is not found (call it nextne). 1.789 + 1.790 + // Since we have built ordered lists, all that remains is to properly initialize 1.791 + // the nextne and nexteq pointers that relate them 1.792 + 1.793 + process_pfx_order(); 1.794 + process_sfx_order(); 1.795 + 1.796 + /* get encoding for CHECKCOMPOUNDCASE */ 1.797 + if (!utf8) { 1.798 + char * enc = get_encoding(); 1.799 + csconv = get_current_cs(enc); 1.800 + free(enc); 1.801 + enc = NULL; 1.802 + 1.803 + char expw[MAXLNLEN]; 1.804 + if (wordchars) { 1.805 + strcpy(expw, wordchars); 1.806 + free(wordchars); 1.807 + } else *expw = '\0'; 1.808 + 1.809 + for (int i = 0; i <= 255; i++) { 1.810 + if ( (csconv[i].cupper != csconv[i].clower) && 1.811 + (! strchr(expw, (char) i))) { 1.812 + *(expw + strlen(expw) + 1) = '\0'; 1.813 + *(expw + strlen(expw)) = (char) i; 1.814 + } 1.815 + } 1.816 + 1.817 + wordchars = mystrdup(expw); 1.818 + } 1.819 + 1.820 + // default BREAK definition 1.821 + if (numbreak == -1) { 1.822 + breaktable = (char **) malloc(sizeof(char *) * 3); 1.823 + if (!breaktable) return 1; 1.824 + breaktable[0] = mystrdup("-"); 1.825 + breaktable[1] = mystrdup("^-"); 1.826 + breaktable[2] = mystrdup("-$"); 1.827 + if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; 1.828 + } 1.829 + return 0; 1.830 +} 1.831 + 1.832 + 1.833 +// we want to be able to quickly access prefix information 1.834 +// both by prefix flag, and sorted by prefix string itself 1.835 +// so we need to set up two indexes 1.836 + 1.837 +int AffixMgr::build_pfxtree(PfxEntry* pfxptr) 1.838 +{ 1.839 + PfxEntry * ptr; 1.840 + PfxEntry * pptr; 1.841 + PfxEntry * ep = pfxptr; 1.842 + 1.843 + // get the right starting points 1.844 + const char * key = ep->getKey(); 1.845 + const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); 1.846 + 1.847 + // first index by flag which must exist 1.848 + ptr = pFlag[flg]; 1.849 + ep->setFlgNxt(ptr); 1.850 + pFlag[flg] = ep; 1.851 + 1.852 + 1.853 + // handle the special case of null affix string 1.854 + if (strlen(key) == 0) { 1.855 + // always inset them at head of list at element 0 1.856 + ptr = pStart[0]; 1.857 + ep->setNext(ptr); 1.858 + pStart[0] = ep; 1.859 + return 0; 1.860 + } 1.861 + 1.862 + // now handle the normal case 1.863 + ep->setNextEQ(NULL); 1.864 + ep->setNextNE(NULL); 1.865 + 1.866 + unsigned char sp = *((const unsigned char *)key); 1.867 + ptr = pStart[sp]; 1.868 + 1.869 + // handle the first insert 1.870 + if (!ptr) { 1.871 + pStart[sp] = ep; 1.872 + return 0; 1.873 + } 1.874 + 1.875 + 1.876 + // otherwise use binary tree insertion so that a sorted 1.877 + // list can easily be generated later 1.878 + pptr = NULL; 1.879 + for (;;) { 1.880 + pptr = ptr; 1.881 + if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { 1.882 + ptr = ptr->getNextEQ(); 1.883 + if (!ptr) { 1.884 + pptr->setNextEQ(ep); 1.885 + break; 1.886 + } 1.887 + } else { 1.888 + ptr = ptr->getNextNE(); 1.889 + if (!ptr) { 1.890 + pptr->setNextNE(ep); 1.891 + break; 1.892 + } 1.893 + } 1.894 + } 1.895 + return 0; 1.896 +} 1.897 + 1.898 +// we want to be able to quickly access suffix information 1.899 +// both by suffix flag, and sorted by the reverse of the 1.900 +// suffix string itself; so we need to set up two indexes 1.901 +int AffixMgr::build_sfxtree(SfxEntry* sfxptr) 1.902 +{ 1.903 + SfxEntry * ptr; 1.904 + SfxEntry * pptr; 1.905 + SfxEntry * ep = sfxptr; 1.906 + 1.907 + /* get the right starting point */ 1.908 + const char * key = ep->getKey(); 1.909 + const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); 1.910 + 1.911 + // first index by flag which must exist 1.912 + ptr = sFlag[flg]; 1.913 + ep->setFlgNxt(ptr); 1.914 + sFlag[flg] = ep; 1.915 + 1.916 + // next index by affix string 1.917 + 1.918 + // handle the special case of null affix string 1.919 + if (strlen(key) == 0) { 1.920 + // always inset them at head of list at element 0 1.921 + ptr = sStart[0]; 1.922 + ep->setNext(ptr); 1.923 + sStart[0] = ep; 1.924 + return 0; 1.925 + } 1.926 + 1.927 + // now handle the normal case 1.928 + ep->setNextEQ(NULL); 1.929 + ep->setNextNE(NULL); 1.930 + 1.931 + unsigned char sp = *((const unsigned char *)key); 1.932 + ptr = sStart[sp]; 1.933 + 1.934 + // handle the first insert 1.935 + if (!ptr) { 1.936 + sStart[sp] = ep; 1.937 + return 0; 1.938 + } 1.939 + 1.940 + // otherwise use binary tree insertion so that a sorted 1.941 + // list can easily be generated later 1.942 + pptr = NULL; 1.943 + for (;;) { 1.944 + pptr = ptr; 1.945 + if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { 1.946 + ptr = ptr->getNextEQ(); 1.947 + if (!ptr) { 1.948 + pptr->setNextEQ(ep); 1.949 + break; 1.950 + } 1.951 + } else { 1.952 + ptr = ptr->getNextNE(); 1.953 + if (!ptr) { 1.954 + pptr->setNextNE(ep); 1.955 + break; 1.956 + } 1.957 + } 1.958 + } 1.959 + return 0; 1.960 +} 1.961 + 1.962 +// convert from binary tree to sorted list 1.963 +int AffixMgr::process_pfx_tree_to_list() 1.964 +{ 1.965 + for (int i=1; i< SETSIZE; i++) { 1.966 + pStart[i] = process_pfx_in_order(pStart[i],NULL); 1.967 + } 1.968 + return 0; 1.969 +} 1.970 + 1.971 + 1.972 +PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) 1.973 +{ 1.974 + if (ptr) { 1.975 + nptr = process_pfx_in_order(ptr->getNextNE(), nptr); 1.976 + ptr->setNext(nptr); 1.977 + nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); 1.978 + } 1.979 + return nptr; 1.980 +} 1.981 + 1.982 + 1.983 +// convert from binary tree to sorted list 1.984 +int AffixMgr:: process_sfx_tree_to_list() 1.985 +{ 1.986 + for (int i=1; i< SETSIZE; i++) { 1.987 + sStart[i] = process_sfx_in_order(sStart[i],NULL); 1.988 + } 1.989 + return 0; 1.990 +} 1.991 + 1.992 +SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) 1.993 +{ 1.994 + if (ptr) { 1.995 + nptr = process_sfx_in_order(ptr->getNextNE(), nptr); 1.996 + ptr->setNext(nptr); 1.997 + nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); 1.998 + } 1.999 + return nptr; 1.1000 +} 1.1001 + 1.1002 + 1.1003 +// reinitialize the PfxEntry links NextEQ and NextNE to speed searching 1.1004 +// using the idea of leading subsets this time 1.1005 +int AffixMgr::process_pfx_order() 1.1006 +{ 1.1007 + PfxEntry* ptr; 1.1008 + 1.1009 + // loop through each prefix list starting point 1.1010 + for (int i=1; i < SETSIZE; i++) { 1.1011 + 1.1012 + ptr = pStart[i]; 1.1013 + 1.1014 + // look through the remainder of the list 1.1015 + // and find next entry with affix that 1.1016 + // the current one is not a subset of 1.1017 + // mark that as destination for NextNE 1.1018 + // use next in list that you are a subset 1.1019 + // of as NextEQ 1.1020 + 1.1021 + for (; ptr != NULL; ptr = ptr->getNext()) { 1.1022 + 1.1023 + PfxEntry * nptr = ptr->getNext(); 1.1024 + for (; nptr != NULL; nptr = nptr->getNext()) { 1.1025 + if (! isSubset( ptr->getKey() , nptr->getKey() )) break; 1.1026 + } 1.1027 + ptr->setNextNE(nptr); 1.1028 + ptr->setNextEQ(NULL); 1.1029 + if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey())) 1.1030 + ptr->setNextEQ(ptr->getNext()); 1.1031 + } 1.1032 + 1.1033 + // now clean up by adding smart search termination strings: 1.1034 + // if you are already a superset of the previous prefix 1.1035 + // but not a subset of the next, search can end here 1.1036 + // so set NextNE properly 1.1037 + 1.1038 + ptr = pStart[i]; 1.1039 + for (; ptr != NULL; ptr = ptr->getNext()) { 1.1040 + PfxEntry * nptr = ptr->getNext(); 1.1041 + PfxEntry * mptr = NULL; 1.1042 + for (; nptr != NULL; nptr = nptr->getNext()) { 1.1043 + if (! isSubset(ptr->getKey(),nptr->getKey())) break; 1.1044 + mptr = nptr; 1.1045 + } 1.1046 + if (mptr) mptr->setNextNE(NULL); 1.1047 + } 1.1048 + } 1.1049 + return 0; 1.1050 +} 1.1051 + 1.1052 +// initialize the SfxEntry links NextEQ and NextNE to speed searching 1.1053 +// using the idea of leading subsets this time 1.1054 +int AffixMgr::process_sfx_order() 1.1055 +{ 1.1056 + SfxEntry* ptr; 1.1057 + 1.1058 + // loop through each prefix list starting point 1.1059 + for (int i=1; i < SETSIZE; i++) { 1.1060 + 1.1061 + ptr = sStart[i]; 1.1062 + 1.1063 + // look through the remainder of the list 1.1064 + // and find next entry with affix that 1.1065 + // the current one is not a subset of 1.1066 + // mark that as destination for NextNE 1.1067 + // use next in list that you are a subset 1.1068 + // of as NextEQ 1.1069 + 1.1070 + for (; ptr != NULL; ptr = ptr->getNext()) { 1.1071 + SfxEntry * nptr = ptr->getNext(); 1.1072 + for (; nptr != NULL; nptr = nptr->getNext()) { 1.1073 + if (! isSubset(ptr->getKey(),nptr->getKey())) break; 1.1074 + } 1.1075 + ptr->setNextNE(nptr); 1.1076 + ptr->setNextEQ(NULL); 1.1077 + if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey())) 1.1078 + ptr->setNextEQ(ptr->getNext()); 1.1079 + } 1.1080 + 1.1081 + 1.1082 + // now clean up by adding smart search termination strings: 1.1083 + // if you are already a superset of the previous suffix 1.1084 + // but not a subset of the next, search can end here 1.1085 + // so set NextNE properly 1.1086 + 1.1087 + ptr = sStart[i]; 1.1088 + for (; ptr != NULL; ptr = ptr->getNext()) { 1.1089 + SfxEntry * nptr = ptr->getNext(); 1.1090 + SfxEntry * mptr = NULL; 1.1091 + for (; nptr != NULL; nptr = nptr->getNext()) { 1.1092 + if (! isSubset(ptr->getKey(),nptr->getKey())) break; 1.1093 + mptr = nptr; 1.1094 + } 1.1095 + if (mptr) mptr->setNextNE(NULL); 1.1096 + } 1.1097 + } 1.1098 + return 0; 1.1099 +} 1.1100 + 1.1101 +// add flags to the result for dictionary debugging 1.1102 +void AffixMgr::debugflag(char * result, unsigned short flag) { 1.1103 + char * st = encode_flag(flag); 1.1104 + mystrcat(result, " ", MAXLNLEN); 1.1105 + mystrcat(result, MORPH_FLAG, MAXLNLEN); 1.1106 + if (st) { 1.1107 + mystrcat(result, st, MAXLNLEN); 1.1108 + free(st); 1.1109 + } 1.1110 +} 1.1111 + 1.1112 +// calculate the character length of the condition 1.1113 +int AffixMgr::condlen(char * st) 1.1114 +{ 1.1115 + int l = 0; 1.1116 + bool group = false; 1.1117 + for(; *st; st++) { 1.1118 + if (*st == '[') { 1.1119 + group = true; 1.1120 + l++; 1.1121 + } else if (*st == ']') group = false; 1.1122 + else if (!group && (!utf8 || 1.1123 + (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; 1.1124 + } 1.1125 + return l; 1.1126 +} 1.1127 + 1.1128 +int AffixMgr::encodeit(affentry &entry, char * cs) 1.1129 +{ 1.1130 + if (strcmp(cs,".") != 0) { 1.1131 + entry.numconds = (char) condlen(cs); 1.1132 + strncpy(entry.c.conds, cs, MAXCONDLEN); 1.1133 + // long condition (end of conds padded by strncpy) 1.1134 + if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { 1.1135 + entry.opts += aeLONGCOND; 1.1136 + entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); 1.1137 + if (!entry.c.l.conds2) return 1; 1.1138 + } 1.1139 + } else { 1.1140 + entry.numconds = 0; 1.1141 + entry.c.conds[0] = '\0'; 1.1142 + } 1.1143 + return 0; 1.1144 +} 1.1145 + 1.1146 +// return 1 if s1 is a leading subset of s2 (dots are for infixes) 1.1147 +inline int AffixMgr::isSubset(const char * s1, const char * s2) 1.1148 + { 1.1149 + while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { 1.1150 + s1++; 1.1151 + s2++; 1.1152 + } 1.1153 + return (*s1 == '\0'); 1.1154 + } 1.1155 + 1.1156 + 1.1157 +// check word for prefixes 1.1158 +struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound, 1.1159 + const FLAG needflag) 1.1160 +{ 1.1161 + struct hentry * rv= NULL; 1.1162 + 1.1163 + pfx = NULL; 1.1164 + pfxappnd = NULL; 1.1165 + sfxappnd = NULL; 1.1166 + 1.1167 + // first handle the special case of 0 length prefixes 1.1168 + PfxEntry * pe = pStart[0]; 1.1169 + while (pe) { 1.1170 + if ( 1.1171 + // fogemorpheme 1.1172 + ((in_compound != IN_CPD_NOT) || !(pe->getCont() && 1.1173 + (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && 1.1174 + // permit prefixes in compounds 1.1175 + ((in_compound != IN_CPD_END) || (pe->getCont() && 1.1176 + (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen())))) 1.1177 + ) { 1.1178 + // check prefix 1.1179 + rv = pe->checkword(word, len, in_compound, needflag); 1.1180 + if (rv) { 1.1181 + pfx=pe; // BUG: pfx not stateless 1.1182 + return rv; 1.1183 + } 1.1184 + } 1.1185 + pe = pe->getNext(); 1.1186 + } 1.1187 + 1.1188 + // now handle the general case 1.1189 + unsigned char sp = *((const unsigned char *)word); 1.1190 + PfxEntry * pptr = pStart[sp]; 1.1191 + 1.1192 + while (pptr) { 1.1193 + if (isSubset(pptr->getKey(),word)) { 1.1194 + if ( 1.1195 + // fogemorpheme 1.1196 + ((in_compound != IN_CPD_NOT) || !(pptr->getCont() && 1.1197 + (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && 1.1198 + // permit prefixes in compounds 1.1199 + ((in_compound != IN_CPD_END) || (pptr->getCont() && 1.1200 + (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen())))) 1.1201 + ) { 1.1202 + // check prefix 1.1203 + rv = pptr->checkword(word, len, in_compound, needflag); 1.1204 + if (rv) { 1.1205 + pfx=pptr; // BUG: pfx not stateless 1.1206 + return rv; 1.1207 + } 1.1208 + } 1.1209 + pptr = pptr->getNextEQ(); 1.1210 + } else { 1.1211 + pptr = pptr->getNextNE(); 1.1212 + } 1.1213 + } 1.1214 + 1.1215 + return NULL; 1.1216 +} 1.1217 + 1.1218 +// check word for prefixes 1.1219 +struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, 1.1220 + char in_compound, const FLAG needflag) 1.1221 +{ 1.1222 + struct hentry * rv= NULL; 1.1223 + 1.1224 + pfx = NULL; 1.1225 + sfxappnd = NULL; 1.1226 + 1.1227 + // first handle the special case of 0 length prefixes 1.1228 + PfxEntry * pe = pStart[0]; 1.1229 + 1.1230 + while (pe) { 1.1231 + rv = pe->check_twosfx(word, len, in_compound, needflag); 1.1232 + if (rv) return rv; 1.1233 + pe = pe->getNext(); 1.1234 + } 1.1235 + 1.1236 + // now handle the general case 1.1237 + unsigned char sp = *((const unsigned char *)word); 1.1238 + PfxEntry * pptr = pStart[sp]; 1.1239 + 1.1240 + while (pptr) { 1.1241 + if (isSubset(pptr->getKey(),word)) { 1.1242 + rv = pptr->check_twosfx(word, len, in_compound, needflag); 1.1243 + if (rv) { 1.1244 + pfx = pptr; 1.1245 + return rv; 1.1246 + } 1.1247 + pptr = pptr->getNextEQ(); 1.1248 + } else { 1.1249 + pptr = pptr->getNextNE(); 1.1250 + } 1.1251 + } 1.1252 + 1.1253 + return NULL; 1.1254 +} 1.1255 + 1.1256 +// check word for prefixes 1.1257 +char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, 1.1258 + const FLAG needflag) 1.1259 +{ 1.1260 + char * st; 1.1261 + 1.1262 + char result[MAXLNLEN]; 1.1263 + result[0] = '\0'; 1.1264 + 1.1265 + pfx = NULL; 1.1266 + sfxappnd = NULL; 1.1267 + 1.1268 + // first handle the special case of 0 length prefixes 1.1269 + PfxEntry * pe = pStart[0]; 1.1270 + while (pe) { 1.1271 + st = pe->check_morph(word,len,in_compound, needflag); 1.1272 + if (st) { 1.1273 + mystrcat(result, st, MAXLNLEN); 1.1274 + free(st); 1.1275 + } 1.1276 + // if (rv) return rv; 1.1277 + pe = pe->getNext(); 1.1278 + } 1.1279 + 1.1280 + // now handle the general case 1.1281 + unsigned char sp = *((const unsigned char *)word); 1.1282 + PfxEntry * pptr = pStart[sp]; 1.1283 + 1.1284 + while (pptr) { 1.1285 + if (isSubset(pptr->getKey(),word)) { 1.1286 + st = pptr->check_morph(word,len,in_compound, needflag); 1.1287 + if (st) { 1.1288 + // fogemorpheme 1.1289 + if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && 1.1290 + (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { 1.1291 + mystrcat(result, st, MAXLNLEN); 1.1292 + pfx = pptr; 1.1293 + } 1.1294 + free(st); 1.1295 + } 1.1296 + pptr = pptr->getNextEQ(); 1.1297 + } else { 1.1298 + pptr = pptr->getNextNE(); 1.1299 + } 1.1300 + } 1.1301 + 1.1302 + if (*result) return mystrdup(result); 1.1303 + return NULL; 1.1304 +} 1.1305 + 1.1306 + 1.1307 +// check word for prefixes 1.1308 +char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, 1.1309 + char in_compound, const FLAG needflag) 1.1310 +{ 1.1311 + char * st; 1.1312 + 1.1313 + char result[MAXLNLEN]; 1.1314 + result[0] = '\0'; 1.1315 + 1.1316 + pfx = NULL; 1.1317 + sfxappnd = NULL; 1.1318 + 1.1319 + // first handle the special case of 0 length prefixes 1.1320 + PfxEntry * pe = pStart[0]; 1.1321 + while (pe) { 1.1322 + st = pe->check_twosfx_morph(word,len,in_compound, needflag); 1.1323 + if (st) { 1.1324 + mystrcat(result, st, MAXLNLEN); 1.1325 + free(st); 1.1326 + } 1.1327 + pe = pe->getNext(); 1.1328 + } 1.1329 + 1.1330 + // now handle the general case 1.1331 + unsigned char sp = *((const unsigned char *)word); 1.1332 + PfxEntry * pptr = pStart[sp]; 1.1333 + 1.1334 + while (pptr) { 1.1335 + if (isSubset(pptr->getKey(),word)) { 1.1336 + st = pptr->check_twosfx_morph(word, len, in_compound, needflag); 1.1337 + if (st) { 1.1338 + mystrcat(result, st, MAXLNLEN); 1.1339 + free(st); 1.1340 + pfx = pptr; 1.1341 + } 1.1342 + pptr = pptr->getNextEQ(); 1.1343 + } else { 1.1344 + pptr = pptr->getNextNE(); 1.1345 + } 1.1346 + } 1.1347 + 1.1348 + if (*result) return mystrdup(result); 1.1349 + return NULL; 1.1350 +} 1.1351 + 1.1352 +// Is word a non compound with a REP substitution (see checkcompoundrep)? 1.1353 +int AffixMgr::cpdrep_check(const char * word, int wl) 1.1354 +{ 1.1355 + char candidate[MAXLNLEN]; 1.1356 + const char * r; 1.1357 + int lenr, lenp; 1.1358 + 1.1359 + if ((wl < 2) || !numrep) return 0; 1.1360 + 1.1361 + for (int i=0; i < numrep; i++ ) { 1.1362 + r = word; 1.1363 + lenr = strlen(reptable[i].pattern2); 1.1364 + lenp = strlen(reptable[i].pattern); 1.1365 + // search every occurence of the pattern in the word 1.1366 + while ((r=strstr(r, reptable[i].pattern)) != NULL) { 1.1367 + strcpy(candidate, word); 1.1368 + if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; 1.1369 + strcpy(candidate+(r-word),reptable[i].pattern2); 1.1370 + strcpy(candidate+(r-word)+lenr, r+lenp); 1.1371 + if (candidate_check(candidate,strlen(candidate))) return 1; 1.1372 + r++; // search for the next letter 1.1373 + } 1.1374 + } 1.1375 + return 0; 1.1376 +} 1.1377 + 1.1378 +// forbid compoundings when there are special patterns at word bound 1.1379 +int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed) 1.1380 +{ 1.1381 + int len; 1.1382 + for (int i = 0; i < numcheckcpd; i++) { 1.1383 + if (isSubset(checkcpdtable[i].pattern2, word + pos) && 1.1384 + (!r1 || !checkcpdtable[i].cond || 1.1385 + (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && 1.1386 + (!r2 || !checkcpdtable[i].cond2 || 1.1387 + (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && 1.1388 + // zero length pattern => only TESTAFF 1.1389 + // zero pattern (0/flag) => unmodified stem (zero affixes allowed) 1.1390 + (!*(checkcpdtable[i].pattern) || ( 1.1391 + (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) || 1.1392 + (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) && 1.1393 + strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) { 1.1394 + return 1; 1.1395 + } 1.1396 + } 1.1397 + return 0; 1.1398 +} 1.1399 + 1.1400 +// forbid compounding with neighbouring upper and lower case characters at word bounds 1.1401 +int AffixMgr::cpdcase_check(const char * word, int pos) 1.1402 +{ 1.1403 + if (utf8) { 1.1404 + w_char u, w; 1.1405 + const char * p; 1.1406 + u8_u16(&u, 1, word + pos); 1.1407 + for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--); 1.1408 + u8_u16(&w, 1, p); 1.1409 + unsigned short a = (u.h << 8) + u.l; 1.1410 + unsigned short b = (w.h << 8) + w.l; 1.1411 + if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && 1.1412 + (a != '-') && (b != '-')) return 1; 1.1413 + } else { 1.1414 + unsigned char a = *(word + pos - 1); 1.1415 + unsigned char b = *(word + pos); 1.1416 + if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1; 1.1417 + } 1.1418 + return 0; 1.1419 +} 1.1420 + 1.1421 +// check compound patterns 1.1422 +int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all) 1.1423 +{ 1.1424 + signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking 1.1425 + signed short btwp[MAXWORDLEN]; // word positions for metacharacters 1.1426 + int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions 1.1427 + short bt = 0; 1.1428 + int i, j; 1.1429 + int ok; 1.1430 + int w = 0; 1.1431 + 1.1432 + if (!*words) { 1.1433 + w = 1; 1.1434 + *words = def; 1.1435 + } 1.1436 + 1.1437 + if (!*words) { 1.1438 + return 0; 1.1439 + } 1.1440 + 1.1441 + (*words)[wnum] = rv; 1.1442 + 1.1443 + // has the last word COMPOUNDRULE flag? 1.1444 + if (rv->alen == 0) { 1.1445 + (*words)[wnum] = NULL; 1.1446 + if (w) *words = NULL; 1.1447 + return 0; 1.1448 + } 1.1449 + ok = 0; 1.1450 + for (i = 0; i < numdefcpd; i++) { 1.1451 + for (j = 0; j < defcpdtable[i].len; j++) { 1.1452 + if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && 1.1453 + TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; 1.1454 + } 1.1455 + } 1.1456 + if (ok == 0) { 1.1457 + (*words)[wnum] = NULL; 1.1458 + if (w) *words = NULL; 1.1459 + return 0; 1.1460 + } 1.1461 + 1.1462 + for (i = 0; i < numdefcpd; i++) { 1.1463 + signed short pp = 0; // pattern position 1.1464 + signed short wp = 0; // "words" position 1.1465 + int ok2; 1.1466 + ok = 1; 1.1467 + ok2 = 1; 1.1468 + do { 1.1469 + while ((pp < defcpdtable[i].len) && (wp <= wnum)) { 1.1470 + if (((pp+1) < defcpdtable[i].len) && 1.1471 + ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) { 1.1472 + int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum; 1.1473 + ok2 = 1; 1.1474 + pp+=2; 1.1475 + btpp[bt] = pp; 1.1476 + btwp[bt] = wp; 1.1477 + while (wp <= wend) { 1.1478 + if (!(*words)[wp]->alen || 1.1479 + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) { 1.1480 + ok2 = 0; 1.1481 + break; 1.1482 + } 1.1483 + wp++; 1.1484 + } 1.1485 + if (wp <= wnum) ok2 = 0; 1.1486 + btnum[bt] = wp - btwp[bt]; 1.1487 + if (btnum[bt] > 0) bt++; 1.1488 + if (ok2) break; 1.1489 + } else { 1.1490 + ok2 = 1; 1.1491 + if (!(*words)[wp] || !(*words)[wp]->alen || 1.1492 + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) { 1.1493 + ok = 0; 1.1494 + break; 1.1495 + } 1.1496 + pp++; 1.1497 + wp++; 1.1498 + if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0; 1.1499 + } 1.1500 + } 1.1501 + if (ok && ok2) { 1.1502 + int r = pp; 1.1503 + while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && 1.1504 + ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; 1.1505 + if (defcpdtable[i].len <= r) return 1; 1.1506 + } 1.1507 + // backtrack 1.1508 + if (bt) do { 1.1509 + ok = 1; 1.1510 + btnum[bt - 1]--; 1.1511 + pp = btpp[bt - 1]; 1.1512 + wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; 1.1513 + } while ((btnum[bt - 1] < 0) && --bt); 1.1514 + } while (bt); 1.1515 + 1.1516 + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; 1.1517 + 1.1518 + // check zero ending 1.1519 + while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && 1.1520 + ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; 1.1521 + if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1; 1.1522 + } 1.1523 + (*words)[wnum] = NULL; 1.1524 + if (w) *words = NULL; 1.1525 + return 0; 1.1526 +} 1.1527 + 1.1528 +inline int AffixMgr::candidate_check(const char * word, int len) 1.1529 +{ 1.1530 + struct hentry * rv=NULL; 1.1531 + 1.1532 + rv = lookup(word); 1.1533 + if (rv) return 1; 1.1534 + 1.1535 +// rv = prefix_check(word,len,1); 1.1536 +// if (rv) return 1; 1.1537 + 1.1538 + rv = affix_check(word,len); 1.1539 + if (rv) return 1; 1.1540 + return 0; 1.1541 +} 1.1542 + 1.1543 +// calculate number of syllable for compound-checking 1.1544 +short AffixMgr::get_syllable(const char * word, int wlen) 1.1545 +{ 1.1546 + if (cpdmaxsyllable==0) return 0; 1.1547 + 1.1548 + short num=0; 1.1549 + 1.1550 + if (!utf8) { 1.1551 + for (int i=0; i<wlen; i++) { 1.1552 + if (strchr(cpdvowels, word[i])) num++; 1.1553 + } 1.1554 + } else if (cpdvowels_utf16) { 1.1555 + w_char w[MAXWORDUTF8LEN]; 1.1556 + int i = u8_u16(w, MAXWORDUTF8LEN, word); 1.1557 + for (; i > 0; i--) { 1.1558 + if (flag_bsearch((unsigned short *) cpdvowels_utf16, 1.1559 + ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; 1.1560 + } 1.1561 + } 1.1562 + return num; 1.1563 +} 1.1564 + 1.1565 +void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { 1.1566 + if (utf8) { 1.1567 + int i; 1.1568 + for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { 1.1569 + for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++); 1.1570 + } 1.1571 + for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { 1.1572 + for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); 1.1573 + } 1.1574 + } else { 1.1575 + *cmin = cpdmin; 1.1576 + *cmax = len - cpdmin + 1; 1.1577 + } 1.1578 +} 1.1579 + 1.1580 + 1.1581 +// check if compound word is correctly spelled 1.1582 +// hu_mov_rule = spec. Hungarian rule (XXX) 1.1583 +struct hentry * AffixMgr::compound_check(const char * word, int len, 1.1584 + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, 1.1585 + char hu_mov_rule = 0, char is_sug = 0, int * info = NULL) 1.1586 +{ 1.1587 + int i; 1.1588 + short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; 1.1589 + struct hentry * rv = NULL; 1.1590 + struct hentry * rv_first; 1.1591 + struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking 1.1592 + char st [MAXWORDUTF8LEN + 4]; 1.1593 + char ch = '\0'; 1.1594 + int cmin; 1.1595 + int cmax; 1.1596 + int striple = 0; 1.1597 + int scpd = 0; 1.1598 + int soldi = 0; 1.1599 + int oldcmin = 0; 1.1600 + int oldcmax = 0; 1.1601 + int oldlen = 0; 1.1602 + int checkedstriple = 0; 1.1603 + int onlycpdrule; 1.1604 + int affixed = 0; 1.1605 + hentry ** oldwords = words; 1.1606 + 1.1607 + int checked_prefix; 1.1608 + 1.1609 + setcminmax(&cmin, &cmax, word, len); 1.1610 + 1.1611 + strcpy(st, word); 1.1612 + 1.1613 + for (i = cmin; i < cmax; i++) { 1.1614 + // go to end of the UTF-8 character 1.1615 + if (utf8) { 1.1616 + for (; (st[i] & 0xc0) == 0x80; i++); 1.1617 + if (i >= cmax) return NULL; 1.1618 + } 1.1619 + 1.1620 + words = oldwords; 1.1621 + onlycpdrule = (words) ? 1 : 0; 1.1622 + 1.1623 + do { // onlycpdrule loop 1.1624 + 1.1625 + oldnumsyllable = numsyllable; 1.1626 + oldwordnum = wordnum; 1.1627 + checked_prefix = 0; 1.1628 + 1.1629 + 1.1630 + do { // simplified checkcompoundpattern loop 1.1631 + 1.1632 + if (scpd > 0) { 1.1633 + for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || 1.1634 + strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++); 1.1635 + 1.1636 + if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop 1.1637 + strcpy(st + i, checkcpdtable[scpd-1].pattern); 1.1638 + soldi = i; 1.1639 + i += strlen(checkcpdtable[scpd-1].pattern); 1.1640 + strcpy(st + i, checkcpdtable[scpd-1].pattern2); 1.1641 + strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3)); 1.1642 + 1.1643 + oldlen = len; 1.1644 + len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); 1.1645 + oldcmin = cmin; 1.1646 + oldcmax = cmax; 1.1647 + setcminmax(&cmin, &cmax, st, len); 1.1648 + 1.1649 + cmax = len - cpdmin + 1; 1.1650 + } 1.1651 + 1.1652 + ch = st[i]; 1.1653 + st[i] = '\0'; 1.1654 + 1.1655 + sfx = NULL; 1.1656 + pfx = NULL; 1.1657 + 1.1658 + // FIRST WORD 1.1659 + 1.1660 + affixed = 1; 1.1661 + rv = lookup(st); // perhaps without prefix 1.1662 + 1.1663 + // search homonym with compound flag 1.1664 + while ((rv) && !hu_mov_rule && 1.1665 + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1.1666 + !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.1667 + (compoundbegin && !wordnum && !onlycpdrule && 1.1668 + TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1.1669 + (compoundmiddle && wordnum && !words && !onlycpdrule && 1.1670 + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || 1.1671 + (numdefcpd && onlycpdrule && 1.1672 + ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || 1.1673 + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) || 1.1674 + (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && 1.1675 + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) 1.1676 + ) { 1.1677 + rv = rv->next_homonym; 1.1678 + } 1.1679 + 1.1680 + if (rv) affixed = 0; 1.1681 + 1.1682 + if (!rv) { 1.1683 + if (onlycpdrule) break; 1.1684 + if (compoundflag && 1.1685 + !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { 1.1686 + if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, 1.1687 + FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && 1.1688 + sfx->getCont() && 1.1689 + ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag, 1.1690 + sfx->getContLen())) || (compoundend && 1.1691 + TESTAFF(sfx->getCont(), compoundend, 1.1692 + sfx->getContLen())))) { 1.1693 + rv = NULL; 1.1694 + } 1.1695 + } 1.1696 + 1.1697 + if (rv || 1.1698 + (((wordnum == 0) && compoundbegin && 1.1699 + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1.1700 + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || 1.1701 + ((wordnum > 0) && compoundmiddle && 1.1702 + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1.1703 + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) 1.1704 + ) checked_prefix = 1; 1.1705 + // else check forbiddenwords and needaffix 1.1706 + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.1707 + TESTAFF(rv->astr, needaffix, rv->alen) || 1.1708 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1.1709 + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) 1.1710 + )) { 1.1711 + st[i] = ch; 1.1712 + //continue; 1.1713 + break; 1.1714 + } 1.1715 + 1.1716 + // check non_compound flag in suffix and prefix 1.1717 + if ((rv) && !hu_mov_rule && 1.1718 + ((pfx && pfx->getCont() && 1.1719 + TESTAFF(pfx->getCont(), compoundforbidflag, 1.1720 + pfx->getContLen())) || 1.1721 + (sfx && sfx->getCont() && 1.1722 + TESTAFF(sfx->getCont(), compoundforbidflag, 1.1723 + sfx->getContLen())))) { 1.1724 + rv = NULL; 1.1725 + } 1.1726 + 1.1727 + // check compoundend flag in suffix and prefix 1.1728 + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && 1.1729 + ((pfx && pfx->getCont() && 1.1730 + TESTAFF(pfx->getCont(), compoundend, 1.1731 + pfx->getContLen())) || 1.1732 + (sfx && sfx->getCont() && 1.1733 + TESTAFF(sfx->getCont(), compoundend, 1.1734 + sfx->getContLen())))) { 1.1735 + rv = NULL; 1.1736 + } 1.1737 + 1.1738 + // check compoundmiddle flag in suffix and prefix 1.1739 + if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && 1.1740 + ((pfx && pfx->getCont() && 1.1741 + TESTAFF(pfx->getCont(), compoundmiddle, 1.1742 + pfx->getContLen())) || 1.1743 + (sfx && sfx->getCont() && 1.1744 + TESTAFF(sfx->getCont(), compoundmiddle, 1.1745 + sfx->getContLen())))) { 1.1746 + rv = NULL; 1.1747 + } 1.1748 + 1.1749 + // check forbiddenwords 1.1750 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.1751 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1.1752 + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { 1.1753 + return NULL; 1.1754 + } 1.1755 + 1.1756 + // increment word number, if the second root has a compoundroot flag 1.1757 + if ((rv) && compoundroot && 1.1758 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.1759 + wordnum++; 1.1760 + } 1.1761 + 1.1762 + // first word is acceptable in compound words? 1.1763 + if (((rv) && 1.1764 + ( checked_prefix || (words && words[wnum]) || 1.1765 + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.1766 + ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1.1767 + ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// || 1.1768 +// (numdefcpd && ) 1.1769 + 1.1770 +// LANG_hu section: spec. Hungarian rule 1.1771 + || ((langnum == LANG_hu) && hu_mov_rule && ( 1.1772 + TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes 1.1773 + TESTAFF(rv->astr, 'G', rv->alen) || 1.1774 + TESTAFF(rv->astr, 'H', rv->alen) 1.1775 + ) 1.1776 + ) 1.1777 +// END of LANG_hu section 1.1778 + ) && 1.1779 + ( 1.1780 + // test CHECKCOMPOUNDPATTERN conditions 1.1781 + scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || 1.1782 + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) 1.1783 + ) 1.1784 + && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters 1.1785 + (word[i-1]==word[i]) && ( 1.1786 + ((i>1) && (word[i-1]==word[i-2])) || 1.1787 + ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' 1.1788 + ) 1.1789 + ) || 1.1790 + ( 1.1791 + checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i) 1.1792 + )) 1.1793 + ) 1.1794 +// LANG_hu section: spec. Hungarian rule 1.1795 + || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && 1.1796 + (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes 1.1797 + TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) || 1.1798 + TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen()) 1.1799 + ) 1.1800 + ) 1.1801 + ) 1.1802 + ) { // first word is ok condition 1.1803 + 1.1804 +// LANG_hu section: spec. Hungarian rule 1.1805 + if (langnum == LANG_hu) { 1.1806 + // calculate syllable number of the word 1.1807 + numsyllable += get_syllable(st, i); 1.1808 + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) 1.1809 + if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; 1.1810 + } 1.1811 +// END of LANG_hu section 1.1812 + 1.1813 + // NEXT WORD(S) 1.1814 + rv_first = rv; 1.1815 + st[i] = ch; 1.1816 + 1.1817 + do { // striple loop 1.1818 + 1.1819 + // check simplifiedtriple 1.1820 + if (simplifiedtriple) { 1.1821 + if (striple) { 1.1822 + checkedstriple = 1; 1.1823 + i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" 1.1824 + } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; 1.1825 + } 1.1826 + 1.1827 + rv = lookup((st+i)); // perhaps without prefix 1.1828 + 1.1829 + // search homonym with compound flag 1.1830 + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1.1831 + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.1832 + (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || 1.1833 + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) || 1.1834 + (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL && 1.1835 + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) 1.1836 + )) { 1.1837 + rv = rv->next_homonym; 1.1838 + } 1.1839 + 1.1840 + // check FORCEUCASE 1.1841 + if (rv && forceucase && (rv) && 1.1842 + (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; 1.1843 + 1.1844 + if (rv && words && words[wnum + 1]) return rv_first; 1.1845 + 1.1846 + oldnumsyllable2 = numsyllable; 1.1847 + oldwordnum2 = wordnum; 1.1848 + 1.1849 + 1.1850 +// LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code 1.1851 + if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { 1.1852 + numsyllable--; 1.1853 + } 1.1854 +// END of LANG_hu section 1.1855 + 1.1856 + // increment word number, if the second root has a compoundroot flag 1.1857 + if ((rv) && (compoundroot) && 1.1858 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.1859 + wordnum++; 1.1860 + } 1.1861 + 1.1862 + // check forbiddenwords 1.1863 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.1864 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1.1865 + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; 1.1866 + 1.1867 + // second word is acceptable, as a root? 1.1868 + // hungarian conventions: compounding is acceptable, 1.1869 + // when compound forms consist of 2 words, or if more, 1.1870 + // then the syllable number of root words must be 6, or lesser. 1.1871 + 1.1872 + if ((rv) && ( 1.1873 + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.1874 + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) 1.1875 + ) 1.1876 + && ( 1.1877 + ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || 1.1878 + ((cpdmaxsyllable!=0) && 1.1879 + (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) 1.1880 + ) && 1.1881 + ( 1.1882 + // test CHECKCOMPOUNDPATTERN 1.1883 + !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0) 1.1884 + ) && 1.1885 + ( 1.1886 + (!checkcompounddup || (rv != rv_first)) 1.1887 + ) 1.1888 + // test CHECKCOMPOUNDPATTERN conditions 1.1889 + && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || 1.1890 + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) 1.1891 + ) 1.1892 + { 1.1893 + // forbid compound word, if it is a non compound word with typical fault 1.1894 + if (checkcompoundrep && cpdrep_check(word,len)) return NULL; 1.1895 + return rv_first; 1.1896 + } 1.1897 + 1.1898 + numsyllable = oldnumsyllable2; 1.1899 + wordnum = oldwordnum2; 1.1900 + 1.1901 + // perhaps second word has prefix or/and suffix 1.1902 + sfx = NULL; 1.1903 + sfxflag = FLAG_NULL; 1.1904 + rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL; 1.1905 + if (!rv && compoundend && !onlycpdrule) { 1.1906 + sfx = NULL; 1.1907 + pfx = NULL; 1.1908 + rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); 1.1909 + } 1.1910 + 1.1911 + if (!rv && numdefcpd && words) { 1.1912 + rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); 1.1913 + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; 1.1914 + rv = NULL; 1.1915 + } 1.1916 + 1.1917 + // test CHECKCOMPOUNDPATTERN conditions (allowed forms) 1.1918 + if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || 1.1919 + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL; 1.1920 + 1.1921 + // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) 1.1922 + if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL; 1.1923 + 1.1924 + // check non_compound flag in suffix and prefix 1.1925 + if ((rv) && 1.1926 + ((pfx && pfx->getCont() && 1.1927 + TESTAFF(pfx->getCont(), compoundforbidflag, 1.1928 + pfx->getContLen())) || 1.1929 + (sfx && sfx->getCont() && 1.1930 + TESTAFF(sfx->getCont(), compoundforbidflag, 1.1931 + sfx->getContLen())))) { 1.1932 + rv = NULL; 1.1933 + } 1.1934 + 1.1935 + // check FORCEUCASE 1.1936 + if (rv && forceucase && (rv) && 1.1937 + (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; 1.1938 + 1.1939 + // check forbiddenwords 1.1940 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.1941 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1.1942 + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; 1.1943 + 1.1944 + // pfxappnd = prefix of word+i, or NULL 1.1945 + // calculate syllable number of prefix. 1.1946 + // hungarian convention: when syllable number of prefix is more, 1.1947 + // than 1, the prefix+word counts as two words. 1.1948 + 1.1949 + if (langnum == LANG_hu) { 1.1950 + // calculate syllable number of the word 1.1951 + numsyllable += get_syllable(word + i, strlen(word + i)); 1.1952 + 1.1953 + // - affix syllable num. 1.1954 + // XXX only second suffix (inflections, not derivations) 1.1955 + if (sfxappnd) { 1.1956 + char * tmp = myrevstrdup(sfxappnd); 1.1957 + numsyllable -= get_syllable(tmp, strlen(tmp)); 1.1958 + free(tmp); 1.1959 + } 1.1960 + 1.1961 + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) 1.1962 + if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; 1.1963 + 1.1964 + // increment syllable num, if last word has a SYLLABLENUM flag 1.1965 + // and the suffix is beginning `s' 1.1966 + 1.1967 + if (cpdsyllablenum) { 1.1968 + switch (sfxflag) { 1.1969 + case 'c': { numsyllable+=2; break; } 1.1970 + case 'J': { numsyllable += 1; break; } 1.1971 + case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } 1.1972 + } 1.1973 + } 1.1974 + } 1.1975 + 1.1976 + // increment word number, if the second word has a compoundroot flag 1.1977 + if ((rv) && (compoundroot) && 1.1978 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.1979 + wordnum++; 1.1980 + } 1.1981 + 1.1982 + // second word is acceptable, as a word with prefix or/and suffix? 1.1983 + // hungarian conventions: compounding is acceptable, 1.1984 + // when compound forms consist 2 word, otherwise 1.1985 + // the syllable number of root words is 6, or lesser. 1.1986 + if ((rv) && 1.1987 + ( 1.1988 + ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || 1.1989 + ((cpdmaxsyllable != 0) && 1.1990 + (numsyllable <= cpdmaxsyllable)) 1.1991 + ) 1.1992 + && ( 1.1993 + (!checkcompounddup || (rv != rv_first)) 1.1994 + )) { 1.1995 + // forbid compound word, if it is a non compound word with typical fault 1.1996 + if (checkcompoundrep && cpdrep_check(word, len)) return NULL; 1.1997 + return rv_first; 1.1998 + } 1.1999 + 1.2000 + numsyllable = oldnumsyllable2; 1.2001 + wordnum = oldwordnum2; 1.2002 + 1.2003 + // perhaps second word is a compound word (recursive call) 1.2004 + if (wordnum < maxwordnum) { 1.2005 + rv = compound_check((st+i),strlen(st+i), wordnum+1, 1.2006 + numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info); 1.2007 + 1.2008 + if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) || 1.2009 + (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL; 1.2010 + } else { 1.2011 + rv=NULL; 1.2012 + } 1.2013 + if (rv) { 1.2014 + // forbid compound word, if it is a non compound word with typical fault 1.2015 + if (checkcompoundrep || forbiddenword) { 1.2016 + struct hentry * rv2 = NULL; 1.2017 + 1.2018 + if (checkcompoundrep && cpdrep_check(word, len)) return NULL; 1.2019 + 1.2020 + // check first part 1.2021 + if (strncmp(rv->word, word + i, rv->blen) == 0) { 1.2022 + char r = *(st + i + rv->blen); 1.2023 + *(st + i + rv->blen) = '\0'; 1.2024 + 1.2025 + if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) { 1.2026 + *(st + i + rv->blen) = r; 1.2027 + continue; 1.2028 + } 1.2029 + 1.2030 + if (forbiddenword) { 1.2031 + rv2 = lookup(word); 1.2032 + if (!rv2) rv2 = affix_check(word, len); 1.2033 + if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) && 1.2034 + (strncmp(rv2->word, st, i + rv->blen) == 0)) { 1.2035 + return NULL; 1.2036 + } 1.2037 + } 1.2038 + *(st + i + rv->blen) = r; 1.2039 + } 1.2040 + } 1.2041 + return rv_first; 1.2042 + } 1.2043 + } while (striple && !checkedstriple); // end of striple loop 1.2044 + 1.2045 + if (checkedstriple) { 1.2046 + i++; 1.2047 + checkedstriple = 0; 1.2048 + striple = 0; 1.2049 + } 1.2050 + 1.2051 + } // first word is ok condition 1.2052 + 1.2053 + if (soldi != 0) { 1.2054 + i = soldi; 1.2055 + soldi = 0; 1.2056 + len = oldlen; 1.2057 + cmin = oldcmin; 1.2058 + cmax = oldcmax; 1.2059 + } 1.2060 + scpd++; 1.2061 + 1.2062 + 1.2063 + } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop 1.2064 + 1.2065 + scpd = 0; 1.2066 + wordnum = oldwordnum; 1.2067 + numsyllable = oldnumsyllable; 1.2068 + 1.2069 + if (soldi != 0) { 1.2070 + i = soldi; 1.2071 + strcpy(st, word); // XXX add more optim. 1.2072 + soldi = 0; 1.2073 + } else st[i] = ch; 1.2074 + 1.2075 + } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop 1.2076 + 1.2077 + } 1.2078 + 1.2079 + return NULL; 1.2080 +} 1.2081 + 1.2082 +// check if compound word is correctly spelled 1.2083 +// hu_mov_rule = spec. Hungarian rule (XXX) 1.2084 +int AffixMgr::compound_check_morph(const char * word, int len, 1.2085 + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, 1.2086 + char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL) 1.2087 +{ 1.2088 + int i; 1.2089 + short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; 1.2090 + int ok = 0; 1.2091 + 1.2092 + struct hentry * rv = NULL; 1.2093 + struct hentry * rv_first; 1.2094 + struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking 1.2095 + char st [MAXWORDUTF8LEN + 4]; 1.2096 + char ch; 1.2097 + 1.2098 + int checked_prefix; 1.2099 + char presult[MAXLNLEN]; 1.2100 + 1.2101 + int cmin; 1.2102 + int cmax; 1.2103 + 1.2104 + int onlycpdrule; 1.2105 + int affixed = 0; 1.2106 + hentry ** oldwords = words; 1.2107 + 1.2108 + setcminmax(&cmin, &cmax, word, len); 1.2109 + 1.2110 + strcpy(st, word); 1.2111 + 1.2112 + for (i = cmin; i < cmax; i++) { 1.2113 + oldnumsyllable = numsyllable; 1.2114 + oldwordnum = wordnum; 1.2115 + checked_prefix = 0; 1.2116 + 1.2117 + // go to end of the UTF-8 character 1.2118 + if (utf8) { 1.2119 + for (; (st[i] & 0xc0) == 0x80; i++); 1.2120 + if (i >= cmax) return 0; 1.2121 + } 1.2122 + 1.2123 + words = oldwords; 1.2124 + onlycpdrule = (words) ? 1 : 0; 1.2125 + 1.2126 + do { // onlycpdrule loop 1.2127 + 1.2128 + oldnumsyllable = numsyllable; 1.2129 + oldwordnum = wordnum; 1.2130 + checked_prefix = 0; 1.2131 + 1.2132 + ch = st[i]; 1.2133 + st[i] = '\0'; 1.2134 + sfx = NULL; 1.2135 + 1.2136 + // FIRST WORD 1.2137 + 1.2138 + affixed = 1; 1.2139 + 1.2140 + *presult = '\0'; 1.2141 + if (partresult) mystrcat(presult, partresult, MAXLNLEN); 1.2142 + 1.2143 + rv = lookup(st); // perhaps without prefix 1.2144 + 1.2145 + // search homonym with compound flag 1.2146 + while ((rv) && !hu_mov_rule && 1.2147 + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1.2148 + !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.2149 + (compoundbegin && !wordnum && !onlycpdrule && 1.2150 + TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1.2151 + (compoundmiddle && wordnum && !words && !onlycpdrule && 1.2152 + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || 1.2153 + (numdefcpd && onlycpdrule && 1.2154 + ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || 1.2155 + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) 1.2156 + ))) { 1.2157 + rv = rv->next_homonym; 1.2158 + } 1.2159 + 1.2160 + if (rv) affixed = 0; 1.2161 + 1.2162 + if (rv) { 1.2163 + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); 1.2164 + if (!HENTRY_FIND(rv, MORPH_STEM)) { 1.2165 + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); 1.2166 + } 1.2167 + // store the pointer of the hash entry 1.2168 +// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); 1.2169 + if (HENTRY_DATA(rv)) { 1.2170 + sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv)); 1.2171 + } 1.2172 + } 1.2173 + 1.2174 + if (!rv) { 1.2175 + if (onlycpdrule) break; 1.2176 + if (compoundflag && 1.2177 + !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { 1.2178 + if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, 1.2179 + FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && 1.2180 + sfx->getCont() && 1.2181 + ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag, 1.2182 + sfx->getContLen())) || (compoundend && 1.2183 + TESTAFF(sfx->getCont(), compoundend, 1.2184 + sfx->getContLen())))) { 1.2185 + rv = NULL; 1.2186 + } 1.2187 + } 1.2188 + 1.2189 + if (rv || 1.2190 + (((wordnum == 0) && compoundbegin && 1.2191 + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1.2192 + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || 1.2193 + ((wordnum > 0) && compoundmiddle && 1.2194 + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || 1.2195 + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) 1.2196 + ) { 1.2197 + // char * p = prefix_check_morph(st, i, 0, compound); 1.2198 + char * p = NULL; 1.2199 + if (compoundflag) p = affix_check_morph(st, i, compoundflag); 1.2200 + if (!p || (*p == '\0')) { 1.2201 + if (p) free(p); 1.2202 + p = NULL; 1.2203 + if ((wordnum == 0) && compoundbegin) { 1.2204 + p = affix_check_morph(st, i, compoundbegin); 1.2205 + } else if ((wordnum > 0) && compoundmiddle) { 1.2206 + p = affix_check_morph(st, i, compoundmiddle); 1.2207 + } 1.2208 + } 1.2209 + if (p && (*p != '\0')) { 1.2210 + sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, 1.2211 + MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); 1.2212 + } 1.2213 + if (p) free(p); 1.2214 + checked_prefix = 1; 1.2215 + } 1.2216 + // else check forbiddenwords 1.2217 + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.2218 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || 1.2219 + TESTAFF(rv->astr, needaffix, rv->alen))) { 1.2220 + st[i] = ch; 1.2221 + continue; 1.2222 + } 1.2223 + 1.2224 + // check non_compound flag in suffix and prefix 1.2225 + if ((rv) && !hu_mov_rule && 1.2226 + ((pfx && pfx->getCont() && 1.2227 + TESTAFF(pfx->getCont(), compoundforbidflag, 1.2228 + pfx->getContLen())) || 1.2229 + (sfx && sfx->getCont() && 1.2230 + TESTAFF(sfx->getCont(), compoundforbidflag, 1.2231 + sfx->getContLen())))) { 1.2232 + continue; 1.2233 + } 1.2234 + 1.2235 + // check compoundend flag in suffix and prefix 1.2236 + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && 1.2237 + ((pfx && pfx->getCont() && 1.2238 + TESTAFF(pfx->getCont(), compoundend, 1.2239 + pfx->getContLen())) || 1.2240 + (sfx && sfx->getCont() && 1.2241 + TESTAFF(sfx->getCont(), compoundend, 1.2242 + sfx->getContLen())))) { 1.2243 + continue; 1.2244 + } 1.2245 + 1.2246 + // check compoundmiddle flag in suffix and prefix 1.2247 + if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && 1.2248 + ((pfx && pfx->getCont() && 1.2249 + TESTAFF(pfx->getCont(), compoundmiddle, 1.2250 + pfx->getContLen())) || 1.2251 + (sfx && sfx->getCont() && 1.2252 + TESTAFF(sfx->getCont(), compoundmiddle, 1.2253 + sfx->getContLen())))) { 1.2254 + rv = NULL; 1.2255 + } 1.2256 + 1.2257 + // check forbiddenwords 1.2258 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) 1.2259 + || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue; 1.2260 + 1.2261 + // increment word number, if the second root has a compoundroot flag 1.2262 + if ((rv) && (compoundroot) && 1.2263 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.2264 + wordnum++; 1.2265 + } 1.2266 + 1.2267 + // first word is acceptable in compound words? 1.2268 + if (((rv) && 1.2269 + ( checked_prefix || (words && words[wnum]) || 1.2270 + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.2271 + ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || 1.2272 + ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen)) 1.2273 +// LANG_hu section: spec. Hungarian rule 1.2274 + || ((langnum == LANG_hu) && // hu_mov_rule 1.2275 + hu_mov_rule && ( 1.2276 + TESTAFF(rv->astr, 'F', rv->alen) || 1.2277 + TESTAFF(rv->astr, 'G', rv->alen) || 1.2278 + TESTAFF(rv->astr, 'H', rv->alen) 1.2279 + ) 1.2280 + ) 1.2281 +// END of LANG_hu section 1.2282 + ) 1.2283 + && ! (( checkcompoundtriple && !words && // test triple letters 1.2284 + (word[i-1]==word[i]) && ( 1.2285 + ((i>1) && (word[i-1]==word[i-2])) || 1.2286 + ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' 1.2287 + ) 1.2288 + ) || 1.2289 + ( 1.2290 + // test CHECKCOMPOUNDPATTERN 1.2291 + numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed) 1.2292 + ) || 1.2293 + ( 1.2294 + checkcompoundcase && !words && cpdcase_check(word, i) 1.2295 + )) 1.2296 + ) 1.2297 +// LANG_hu section: spec. Hungarian rule 1.2298 + || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && 1.2299 + (sfx && sfx->getCont() && ( 1.2300 + TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) || 1.2301 + TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen()) 1.2302 + ) 1.2303 + ) 1.2304 + ) 1.2305 +// END of LANG_hu section 1.2306 + ) { 1.2307 + 1.2308 +// LANG_hu section: spec. Hungarian rule 1.2309 + if (langnum == LANG_hu) { 1.2310 + // calculate syllable number of the word 1.2311 + numsyllable += get_syllable(st, i); 1.2312 + 1.2313 + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) 1.2314 + if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; 1.2315 + } 1.2316 +// END of LANG_hu section 1.2317 + 1.2318 + // NEXT WORD(S) 1.2319 + rv_first = rv; 1.2320 + rv = lookup((word+i)); // perhaps without prefix 1.2321 + 1.2322 + // search homonym with compound flag 1.2323 + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || 1.2324 + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.2325 + (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || 1.2326 + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { 1.2327 + rv = rv->next_homonym; 1.2328 + } 1.2329 + 1.2330 + if (rv && words && words[wnum + 1]) { 1.2331 + mystrcat(*result, presult, MAXLNLEN); 1.2332 + mystrcat(*result, " ", MAXLNLEN); 1.2333 + mystrcat(*result, MORPH_PART, MAXLNLEN); 1.2334 + mystrcat(*result, word+i, MAXLNLEN); 1.2335 + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); 1.2336 + if (!HENTRY_FIND(rv, MORPH_STEM)) { 1.2337 + mystrcat(*result, " ", MAXLNLEN); 1.2338 + mystrcat(*result, MORPH_STEM, MAXLNLEN); 1.2339 + mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); 1.2340 + } 1.2341 + // store the pointer of the hash entry 1.2342 +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); 1.2343 + if (!complexprefixes && HENTRY_DATA(rv)) { 1.2344 + mystrcat(*result, " ", MAXLNLEN); 1.2345 + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); 1.2346 + } 1.2347 + mystrcat(*result, "\n", MAXLNLEN); 1.2348 + ok = 1; 1.2349 + return 0; 1.2350 + } 1.2351 + 1.2352 + oldnumsyllable2 = numsyllable; 1.2353 + oldwordnum2 = wordnum; 1.2354 + 1.2355 +// LANG_hu section: spec. Hungarian rule 1.2356 + if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { 1.2357 + numsyllable--; 1.2358 + } 1.2359 +// END of LANG_hu section 1.2360 + // increment word number, if the second root has a compoundroot flag 1.2361 + if ((rv) && (compoundroot) && 1.2362 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.2363 + wordnum++; 1.2364 + } 1.2365 + 1.2366 + // check forbiddenwords 1.2367 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || 1.2368 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { 1.2369 + st[i] = ch; 1.2370 + continue; 1.2371 + } 1.2372 + 1.2373 + // second word is acceptable, as a root? 1.2374 + // hungarian conventions: compounding is acceptable, 1.2375 + // when compound forms consist of 2 words, or if more, 1.2376 + // then the syllable number of root words must be 6, or lesser. 1.2377 + if ((rv) && ( 1.2378 + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || 1.2379 + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) 1.2380 + ) 1.2381 + && ( 1.2382 + ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || 1.2383 + ((cpdmaxsyllable!=0) && 1.2384 + (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) 1.2385 + ) 1.2386 + && ( 1.2387 + (!checkcompounddup || (rv != rv_first)) 1.2388 + ) 1.2389 + ) 1.2390 + { 1.2391 + // bad compound word 1.2392 + mystrcat(*result, presult, MAXLNLEN); 1.2393 + mystrcat(*result, " ", MAXLNLEN); 1.2394 + mystrcat(*result, MORPH_PART, MAXLNLEN); 1.2395 + mystrcat(*result, word+i, MAXLNLEN); 1.2396 + 1.2397 + if (HENTRY_DATA(rv)) { 1.2398 + if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); 1.2399 + if (! HENTRY_FIND(rv, MORPH_STEM)) { 1.2400 + mystrcat(*result, " ", MAXLNLEN); 1.2401 + mystrcat(*result, MORPH_STEM, MAXLNLEN); 1.2402 + mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); 1.2403 + } 1.2404 + // store the pointer of the hash entry 1.2405 +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); 1.2406 + if (!complexprefixes) { 1.2407 + mystrcat(*result, " ", MAXLNLEN); 1.2408 + mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); 1.2409 + } 1.2410 + } 1.2411 + mystrcat(*result, "\n", MAXLNLEN); 1.2412 + ok = 1; 1.2413 + } 1.2414 + 1.2415 + numsyllable = oldnumsyllable2 ; 1.2416 + wordnum = oldwordnum2; 1.2417 + 1.2418 + // perhaps second word has prefix or/and suffix 1.2419 + sfx = NULL; 1.2420 + sfxflag = FLAG_NULL; 1.2421 + 1.2422 + if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL; 1.2423 + 1.2424 + if (!rv && compoundend && !onlycpdrule) { 1.2425 + sfx = NULL; 1.2426 + pfx = NULL; 1.2427 + rv = affix_check((word+i),strlen(word+i), compoundend); 1.2428 + } 1.2429 + 1.2430 + if (!rv && numdefcpd && words) { 1.2431 + rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); 1.2432 + if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { 1.2433 + char * m = NULL; 1.2434 + if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); 1.2435 + if ((!m || *m == '\0') && compoundend) { 1.2436 + if (m) free(m); 1.2437 + m = affix_check_morph((word+i),strlen(word+i), compoundend); 1.2438 + } 1.2439 + mystrcat(*result, presult, MAXLNLEN); 1.2440 + if (m || (*m != '\0')) { 1.2441 + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, 1.2442 + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); 1.2443 + } 1.2444 + if (m) free(m); 1.2445 + mystrcat(*result, "\n", MAXLNLEN); 1.2446 + ok = 1; 1.2447 + } 1.2448 + } 1.2449 + 1.2450 + // check non_compound flag in suffix and prefix 1.2451 + if ((rv) && 1.2452 + ((pfx && pfx->getCont() && 1.2453 + TESTAFF(pfx->getCont(), compoundforbidflag, 1.2454 + pfx->getContLen())) || 1.2455 + (sfx && sfx->getCont() && 1.2456 + TESTAFF(sfx->getCont(), compoundforbidflag, 1.2457 + sfx->getContLen())))) { 1.2458 + rv = NULL; 1.2459 + } 1.2460 + 1.2461 + // check forbiddenwords 1.2462 + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) || 1.2463 + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) 1.2464 + && (! TESTAFF(rv->astr, needaffix, rv->alen))) { 1.2465 + st[i] = ch; 1.2466 + continue; 1.2467 + } 1.2468 + 1.2469 + if (langnum == LANG_hu) { 1.2470 + // calculate syllable number of the word 1.2471 + numsyllable += get_syllable(word + i, strlen(word + i)); 1.2472 + 1.2473 + // - affix syllable num. 1.2474 + // XXX only second suffix (inflections, not derivations) 1.2475 + if (sfxappnd) { 1.2476 + char * tmp = myrevstrdup(sfxappnd); 1.2477 + numsyllable -= get_syllable(tmp, strlen(tmp)); 1.2478 + free(tmp); 1.2479 + } 1.2480 + 1.2481 + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) 1.2482 + if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; 1.2483 + 1.2484 + // increment syllable num, if last word has a SYLLABLENUM flag 1.2485 + // and the suffix is beginning `s' 1.2486 + 1.2487 + if (cpdsyllablenum) { 1.2488 + switch (sfxflag) { 1.2489 + case 'c': { numsyllable+=2; break; } 1.2490 + case 'J': { numsyllable += 1; break; } 1.2491 + case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } 1.2492 + } 1.2493 + } 1.2494 + } 1.2495 + 1.2496 + // increment word number, if the second word has a compoundroot flag 1.2497 + if ((rv) && (compoundroot) && 1.2498 + (TESTAFF(rv->astr, compoundroot, rv->alen))) { 1.2499 + wordnum++; 1.2500 + } 1.2501 + // second word is acceptable, as a word with prefix or/and suffix? 1.2502 + // hungarian conventions: compounding is acceptable, 1.2503 + // when compound forms consist 2 word, otherwise 1.2504 + // the syllable number of root words is 6, or lesser. 1.2505 + if ((rv) && 1.2506 + ( 1.2507 + ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || 1.2508 + ((cpdmaxsyllable!=0) && 1.2509 + (numsyllable <= cpdmaxsyllable)) 1.2510 + ) 1.2511 + && ( 1.2512 + (!checkcompounddup || (rv != rv_first)) 1.2513 + )) { 1.2514 + char * m = NULL; 1.2515 + if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); 1.2516 + if ((!m || *m == '\0') && compoundend) { 1.2517 + if (m) free(m); 1.2518 + m = affix_check_morph((word+i),strlen(word+i), compoundend); 1.2519 + } 1.2520 + mystrcat(*result, presult, MAXLNLEN); 1.2521 + if (m && (*m != '\0')) { 1.2522 + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, 1.2523 + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); 1.2524 + } 1.2525 + if (m) free(m); 1.2526 + sprintf(*result + strlen(*result), "%c", MSEP_REC); 1.2527 + ok = 1; 1.2528 + } 1.2529 + 1.2530 + numsyllable = oldnumsyllable2; 1.2531 + wordnum = oldwordnum2; 1.2532 + 1.2533 + // perhaps second word is a compound word (recursive call) 1.2534 + if ((wordnum < maxwordnum) && (ok == 0)) { 1.2535 + compound_check_morph((word+i),strlen(word+i), wordnum+1, 1.2536 + numsyllable, maxwordnum, wnum + 1, words, 0, result, presult); 1.2537 + } else { 1.2538 + rv=NULL; 1.2539 + } 1.2540 + } 1.2541 + st[i] = ch; 1.2542 + wordnum = oldwordnum; 1.2543 + numsyllable = oldnumsyllable; 1.2544 + 1.2545 + } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop 1.2546 + 1.2547 + } 1.2548 + return 0; 1.2549 +} 1.2550 + 1.2551 + // return 1 if s1 (reversed) is a leading subset of end of s2 1.2552 +/* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) 1.2553 + { 1.2554 + while ((len > 0) && *s1 && (*s1 == *end_of_s2)) { 1.2555 + s1++; 1.2556 + end_of_s2--; 1.2557 + len--; 1.2558 + } 1.2559 + return (*s1 == '\0'); 1.2560 + } 1.2561 + */ 1.2562 + 1.2563 +inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) 1.2564 + { 1.2565 + while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) { 1.2566 + s1++; 1.2567 + end_of_s2--; 1.2568 + len--; 1.2569 + } 1.2570 + return (*s1 == '\0'); 1.2571 + } 1.2572 + 1.2573 +// check word for suffixes 1.2574 + 1.2575 +struct hentry * AffixMgr::suffix_check (const char * word, int len, 1.2576 + int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns, 1.2577 + const FLAG cclass, const FLAG needflag, char in_compound) 1.2578 +{ 1.2579 + struct hentry * rv = NULL; 1.2580 + PfxEntry* ep = ppfx; 1.2581 + 1.2582 + // first handle the special case of 0 length suffixes 1.2583 + SfxEntry * se = sStart[0]; 1.2584 + 1.2585 + while (se) { 1.2586 + if (!cclass || se->getCont()) { 1.2587 + // suffixes are not allowed in beginning of compounds 1.2588 + if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass 1.2589 + // except when signed with compoundpermitflag flag 1.2590 + (se->getCont() && compoundpermitflag && 1.2591 + TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || 1.2592 + // no circumfix flag in prefix and suffix 1.2593 + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), 1.2594 + circumfix, ep->getContLen())) && 1.2595 + (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || 1.2596 + // circumfix flag in prefix AND suffix 1.2597 + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), 1.2598 + circumfix, ep->getContLen())) && 1.2599 + (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && 1.2600 + // fogemorpheme 1.2601 + (in_compound || 1.2602 + !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) && 1.2603 + // needaffix on prefix or first suffix 1.2604 + (cclass || 1.2605 + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || 1.2606 + (ppfx && !((ep->getCont()) && 1.2607 + TESTAFF(ep->getCont(), needaffix, 1.2608 + ep->getContLen()))) 1.2609 + )) { 1.2610 + rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass, 1.2611 + needflag, (in_compound ? 0 : onlyincompound)); 1.2612 + if (rv) { 1.2613 + sfx=se; // BUG: sfx not stateless 1.2614 + return rv; 1.2615 + } 1.2616 + } 1.2617 + } 1.2618 + se = se->getNext(); 1.2619 + } 1.2620 + 1.2621 + // now handle the general case 1.2622 + if (len == 0) return NULL; // FULLSTRIP 1.2623 + unsigned char sp= *((const unsigned char *)(word + len - 1)); 1.2624 + SfxEntry * sptr = sStart[sp]; 1.2625 + 1.2626 + while (sptr) { 1.2627 + if (isRevSubset(sptr->getKey(), word + len - 1, len) 1.2628 + ) { 1.2629 + // suffixes are not allowed in beginning of compounds 1.2630 + if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass 1.2631 + // except when signed with compoundpermitflag flag 1.2632 + (sptr->getCont() && compoundpermitflag && 1.2633 + TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || 1.2634 + // no circumfix flag in prefix and suffix 1.2635 + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), 1.2636 + circumfix, ep->getContLen())) && 1.2637 + (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || 1.2638 + // circumfix flag in prefix AND suffix 1.2639 + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), 1.2640 + circumfix, ep->getContLen())) && 1.2641 + (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && 1.2642 + // fogemorpheme 1.2643 + (in_compound || 1.2644 + !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && 1.2645 + // needaffix on prefix or first suffix 1.2646 + (cclass || 1.2647 + !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || 1.2648 + (ppfx && !((ep->getCont()) && 1.2649 + TESTAFF(ep->getCont(), needaffix, 1.2650 + ep->getContLen()))) 1.2651 + ) 1.2652 + ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { 1.2653 + rv = sptr->checkword(word,len, sfxopts, ppfx, wlst, 1.2654 + maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound)); 1.2655 + if (rv) { 1.2656 + sfx=sptr; // BUG: sfx not stateless 1.2657 + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 1.2658 + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless 1.2659 + return rv; 1.2660 + } 1.2661 + } 1.2662 + sptr = sptr->getNextEQ(); 1.2663 + } else { 1.2664 + sptr = sptr->getNextNE(); 1.2665 + } 1.2666 + } 1.2667 + 1.2668 + return NULL; 1.2669 +} 1.2670 + 1.2671 +// check word for two-level suffixes 1.2672 + 1.2673 +struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, 1.2674 + int sfxopts, PfxEntry * ppfx, const FLAG needflag) 1.2675 +{ 1.2676 + struct hentry * rv = NULL; 1.2677 + 1.2678 + // first handle the special case of 0 length suffixes 1.2679 + SfxEntry * se = sStart[0]; 1.2680 + while (se) { 1.2681 + if (contclasses[se->getFlag()]) 1.2682 + { 1.2683 + rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag); 1.2684 + if (rv) return rv; 1.2685 + } 1.2686 + se = se->getNext(); 1.2687 + } 1.2688 + 1.2689 + // now handle the general case 1.2690 + if (len == 0) return NULL; // FULLSTRIP 1.2691 + unsigned char sp = *((const unsigned char *)(word + len - 1)); 1.2692 + SfxEntry * sptr = sStart[sp]; 1.2693 + 1.2694 + while (sptr) { 1.2695 + if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 1.2696 + if (contclasses[sptr->getFlag()]) 1.2697 + { 1.2698 + rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag); 1.2699 + if (rv) { 1.2700 + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 1.2701 + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless 1.2702 + return rv; 1.2703 + } 1.2704 + } 1.2705 + sptr = sptr->getNextEQ(); 1.2706 + } else { 1.2707 + sptr = sptr->getNextNE(); 1.2708 + } 1.2709 + } 1.2710 + 1.2711 + return NULL; 1.2712 +} 1.2713 + 1.2714 +char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, 1.2715 + int sfxopts, PfxEntry * ppfx, const FLAG needflag) 1.2716 +{ 1.2717 + char result[MAXLNLEN]; 1.2718 + char result2[MAXLNLEN]; 1.2719 + char result3[MAXLNLEN]; 1.2720 + 1.2721 + char * st; 1.2722 + 1.2723 + result[0] = '\0'; 1.2724 + result2[0] = '\0'; 1.2725 + result3[0] = '\0'; 1.2726 + 1.2727 + // first handle the special case of 0 length suffixes 1.2728 + SfxEntry * se = sStart[0]; 1.2729 + while (se) { 1.2730 + if (contclasses[se->getFlag()]) 1.2731 + { 1.2732 + st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); 1.2733 + if (st) { 1.2734 + if (ppfx) { 1.2735 + if (ppfx->getMorph()) { 1.2736 + mystrcat(result, ppfx->getMorph(), MAXLNLEN); 1.2737 + mystrcat(result, " ", MAXLNLEN); 1.2738 + } else debugflag(result, ppfx->getFlag()); 1.2739 + } 1.2740 + mystrcat(result, st, MAXLNLEN); 1.2741 + free(st); 1.2742 + if (se->getMorph()) { 1.2743 + mystrcat(result, " ", MAXLNLEN); 1.2744 + mystrcat(result, se->getMorph(), MAXLNLEN); 1.2745 + } else debugflag(result, se->getFlag()); 1.2746 + mystrcat(result, "\n", MAXLNLEN); 1.2747 + } 1.2748 + } 1.2749 + se = se->getNext(); 1.2750 + } 1.2751 + 1.2752 + // now handle the general case 1.2753 + if (len == 0) return NULL; // FULLSTRIP 1.2754 + unsigned char sp = *((const unsigned char *)(word + len - 1)); 1.2755 + SfxEntry * sptr = sStart[sp]; 1.2756 + 1.2757 + while (sptr) { 1.2758 + if (isRevSubset(sptr->getKey(), word + len - 1, len)) { 1.2759 + if (contclasses[sptr->getFlag()]) 1.2760 + { 1.2761 + st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); 1.2762 + if (st) { 1.2763 + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless 1.2764 + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless 1.2765 + strcpy(result2, st); 1.2766 + free(st); 1.2767 + 1.2768 + result3[0] = '\0'; 1.2769 + 1.2770 + if (sptr->getMorph()) { 1.2771 + mystrcat(result3, " ", MAXLNLEN); 1.2772 + mystrcat(result3, sptr->getMorph(), MAXLNLEN); 1.2773 + } else debugflag(result3, sptr->getFlag()); 1.2774 + strlinecat(result2, result3); 1.2775 + mystrcat(result2, "\n", MAXLNLEN); 1.2776 + mystrcat(result, result2, MAXLNLEN); 1.2777 + } 1.2778 + } 1.2779 + sptr = sptr->getNextEQ(); 1.2780 + } else { 1.2781 + sptr = sptr->getNextNE(); 1.2782 + } 1.2783 + } 1.2784 + if (*result) return mystrdup(result); 1.2785 + return NULL; 1.2786 +} 1.2787 + 1.2788 +char * AffixMgr::suffix_check_morph(const char * word, int len, 1.2789 + int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound) 1.2790 +{ 1.2791 + char result[MAXLNLEN]; 1.2792 + 1.2793 + struct hentry * rv = NULL; 1.2794 + 1.2795 + result[0] = '\0'; 1.2796 + 1.2797 + PfxEntry* ep = ppfx; 1.2798 + 1.2799 + // first handle the special case of 0 length suffixes 1.2800 + SfxEntry * se = sStart[0]; 1.2801 + while (se) { 1.2802 + if (!cclass || se->getCont()) { 1.2803 + // suffixes are not allowed in beginning of compounds 1.2804 + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass 1.2805 + // except when signed with compoundpermitflag flag 1.2806 + (se->getCont() && compoundpermitflag && 1.2807 + TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || 1.2808 + // no circumfix flag in prefix and suffix 1.2809 + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), 1.2810 + circumfix, ep->getContLen())) && 1.2811 + (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || 1.2812 + // circumfix flag in prefix AND suffix 1.2813 + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), 1.2814 + circumfix, ep->getContLen())) && 1.2815 + (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && 1.2816 + // fogemorpheme 1.2817 + (in_compound || 1.2818 + !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && 1.2819 + // needaffix on prefix or first suffix 1.2820 + (cclass || 1.2821 + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || 1.2822 + (ppfx && !((ep->getCont()) && 1.2823 + TESTAFF(ep->getCont(), needaffix, 1.2824 + ep->getContLen()))) 1.2825 + ) 1.2826 + )) 1.2827 + rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); 1.2828 + while (rv) { 1.2829 + if (ppfx) { 1.2830 + if (ppfx->getMorph()) { 1.2831 + mystrcat(result, ppfx->getMorph(), MAXLNLEN); 1.2832 + mystrcat(result, " ", MAXLNLEN); 1.2833 + } else debugflag(result, ppfx->getFlag()); 1.2834 + } 1.2835 + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); 1.2836 + if (! HENTRY_FIND(rv, MORPH_STEM)) { 1.2837 + mystrcat(result, " ", MAXLNLEN); 1.2838 + mystrcat(result, MORPH_STEM, MAXLNLEN); 1.2839 + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); 1.2840 + } 1.2841 + // store the pointer of the hash entry 1.2842 +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); 1.2843 + 1.2844 + if (!complexprefixes && HENTRY_DATA(rv)) { 1.2845 + mystrcat(result, " ", MAXLNLEN); 1.2846 + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); 1.2847 + } 1.2848 + if (se->getMorph()) { 1.2849 + mystrcat(result, " ", MAXLNLEN); 1.2850 + mystrcat(result, se->getMorph(), MAXLNLEN); 1.2851 + } else debugflag(result, se->getFlag()); 1.2852 + mystrcat(result, "\n", MAXLNLEN); 1.2853 + rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); 1.2854 + } 1.2855 + } 1.2856 + se = se->getNext(); 1.2857 + } 1.2858 + 1.2859 + // now handle the general case 1.2860 + if (len == 0) return NULL; // FULLSTRIP 1.2861 + unsigned char sp = *((const unsigned char *)(word + len - 1)); 1.2862 + SfxEntry * sptr = sStart[sp]; 1.2863 + 1.2864 + while (sptr) { 1.2865 + if (isRevSubset(sptr->getKey(), word + len - 1, len) 1.2866 + ) { 1.2867 + // suffixes are not allowed in beginning of compounds 1.2868 + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass 1.2869 + // except when signed with compoundpermitflag flag 1.2870 + (sptr->getCont() && compoundpermitflag && 1.2871 + TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || 1.2872 + // no circumfix flag in prefix and suffix 1.2873 + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), 1.2874 + circumfix, ep->getContLen())) && 1.2875 + (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || 1.2876 + // circumfix flag in prefix AND suffix 1.2877 + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), 1.2878 + circumfix, ep->getContLen())) && 1.2879 + (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && 1.2880 + // fogemorpheme 1.2881 + (in_compound || 1.2882 + !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && 1.2883 + // needaffix on first suffix 1.2884 + (cclass || !(sptr->getCont() && 1.2885 + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) 1.2886 + )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); 1.2887 + while (rv) { 1.2888 + if (ppfx) { 1.2889 + if (ppfx->getMorph()) { 1.2890 + mystrcat(result, ppfx->getMorph(), MAXLNLEN); 1.2891 + mystrcat(result, " ", MAXLNLEN); 1.2892 + } else debugflag(result, ppfx->getFlag()); 1.2893 + } 1.2894 + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); 1.2895 + if (! HENTRY_FIND(rv, MORPH_STEM)) { 1.2896 + mystrcat(result, " ", MAXLNLEN); 1.2897 + mystrcat(result, MORPH_STEM, MAXLNLEN); 1.2898 + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); 1.2899 + } 1.2900 + // store the pointer of the hash entry 1.2901 +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); 1.2902 + 1.2903 + if (!complexprefixes && HENTRY_DATA(rv)) { 1.2904 + mystrcat(result, " ", MAXLNLEN); 1.2905 + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); 1.2906 + } 1.2907 + 1.2908 + if (sptr->getMorph()) { 1.2909 + mystrcat(result, " ", MAXLNLEN); 1.2910 + mystrcat(result, sptr->getMorph(), MAXLNLEN); 1.2911 + } else debugflag(result, sptr->getFlag()); 1.2912 + mystrcat(result, "\n", MAXLNLEN); 1.2913 + rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); 1.2914 + } 1.2915 + sptr = sptr->getNextEQ(); 1.2916 + } else { 1.2917 + sptr = sptr->getNextNE(); 1.2918 + } 1.2919 + } 1.2920 + 1.2921 + if (*result) return mystrdup(result); 1.2922 + return NULL; 1.2923 +} 1.2924 + 1.2925 +// check if word with affixes is correctly spelled 1.2926 +struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) 1.2927 +{ 1.2928 + struct hentry * rv= NULL; 1.2929 + 1.2930 + // check all prefixes (also crossed with suffixes if allowed) 1.2931 + rv = prefix_check(word, len, in_compound, needflag); 1.2932 + if (rv) return rv; 1.2933 + 1.2934 + // if still not found check all suffixes 1.2935 + rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); 1.2936 + 1.2937 + if (havecontclass) { 1.2938 + sfx = NULL; 1.2939 + pfx = NULL; 1.2940 + 1.2941 + if (rv) return rv; 1.2942 + // if still not found check all two-level suffixes 1.2943 + rv = suffix_check_twosfx(word, len, 0, NULL, needflag); 1.2944 + 1.2945 + if (rv) return rv; 1.2946 + // if still not found check all two-level suffixes 1.2947 + rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag); 1.2948 + } 1.2949 + 1.2950 + return rv; 1.2951 +} 1.2952 + 1.2953 +// check if word with affixes is correctly spelled 1.2954 +char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) 1.2955 +{ 1.2956 + char result[MAXLNLEN]; 1.2957 + char * st = NULL; 1.2958 + 1.2959 + *result = '\0'; 1.2960 + 1.2961 + // check all prefixes (also crossed with suffixes if allowed) 1.2962 + st = prefix_check_morph(word, len, in_compound); 1.2963 + if (st) { 1.2964 + mystrcat(result, st, MAXLNLEN); 1.2965 + free(st); 1.2966 + } 1.2967 + 1.2968 + // if still not found check all suffixes 1.2969 + st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); 1.2970 + if (st) { 1.2971 + mystrcat(result, st, MAXLNLEN); 1.2972 + free(st); 1.2973 + } 1.2974 + 1.2975 + if (havecontclass) { 1.2976 + sfx = NULL; 1.2977 + pfx = NULL; 1.2978 + // if still not found check all two-level suffixes 1.2979 + st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); 1.2980 + if (st) { 1.2981 + mystrcat(result, st, MAXLNLEN); 1.2982 + free(st); 1.2983 + } 1.2984 + 1.2985 + // if still not found check all two-level suffixes 1.2986 + st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); 1.2987 + if (st) { 1.2988 + mystrcat(result, st, MAXLNLEN); 1.2989 + free(st); 1.2990 + } 1.2991 + } 1.2992 + 1.2993 + return mystrdup(result); 1.2994 +} 1.2995 + 1.2996 +char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, 1.2997 + unsigned short al, char * morph, char * targetmorph, int level) 1.2998 +{ 1.2999 + // handle suffixes 1.3000 + char * stemmorph; 1.3001 + char * stemmorphcatpos; 1.3002 + char mymorph[MAXLNLEN]; 1.3003 + 1.3004 + if (!morph) return NULL; 1.3005 + 1.3006 + // check substandard flag 1.3007 + if (TESTAFF(ap, substandard, al)) return NULL; 1.3008 + 1.3009 + if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); 1.3010 + 1.3011 +// int targetcount = get_sfxcount(targetmorph); 1.3012 + 1.3013 + // use input suffix fields, if exist 1.3014 + if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { 1.3015 + stemmorph = mymorph; 1.3016 + strcpy(stemmorph, morph); 1.3017 + mystrcat(stemmorph, " ", MAXLNLEN); 1.3018 + stemmorphcatpos = stemmorph + strlen(stemmorph); 1.3019 + } else { 1.3020 + stemmorph = morph; 1.3021 + stemmorphcatpos = NULL; 1.3022 + } 1.3023 + 1.3024 + for (int i = 0; i < al; i++) { 1.3025 + const unsigned char c = (unsigned char) (ap[i] & 0x00FF); 1.3026 + SfxEntry * sptr = sFlag[c]; 1.3027 + while (sptr) { 1.3028 + if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) || 1.3029 + // don't generate forms with substandard affixes 1.3030 + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { 1.3031 + 1.3032 + if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); 1.3033 + else stemmorph = (char *) sptr->getMorph(); 1.3034 + 1.3035 + int cmp = morphcmp(stemmorph, targetmorph); 1.3036 + 1.3037 + if (cmp == 0) { 1.3038 + char * newword = sptr->add(ts, wl); 1.3039 + if (newword) { 1.3040 + hentry * check = pHMgr->lookup(newword); // XXX extra dic 1.3041 + if (!check || !check->astr || 1.3042 + !(TESTAFF(check->astr, forbiddenword, check->alen) || 1.3043 + TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { 1.3044 + return newword; 1.3045 + } 1.3046 + free(newword); 1.3047 + } 1.3048 + } 1.3049 + 1.3050 + // recursive call for secondary suffixes 1.3051 + if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && 1.3052 +// (get_sfxcount(stemmorph) < targetcount) && 1.3053 + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { 1.3054 + char * newword = sptr->add(ts, wl); 1.3055 + if (newword) { 1.3056 + char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), 1.3057 + sptr->getContLen(), stemmorph, targetmorph, 1); 1.3058 + 1.3059 + if (newword2) { 1.3060 + free(newword); 1.3061 + return newword2; 1.3062 + } 1.3063 + free(newword); 1.3064 + newword = NULL; 1.3065 + } 1.3066 + } 1.3067 + } 1.3068 + sptr = sptr->getFlgNxt(); 1.3069 + } 1.3070 + } 1.3071 + return NULL; 1.3072 +} 1.3073 + 1.3074 + 1.3075 +int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, 1.3076 + int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, 1.3077 + char * phon) 1.3078 +{ 1.3079 + int nh=0; 1.3080 + // first add root word to list 1.3081 + if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || 1.3082 + (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { 1.3083 + wlst[nh].word = mystrdup(ts); 1.3084 + if (!wlst[nh].word) return 0; 1.3085 + wlst[nh].allow = (1 == 0); 1.3086 + wlst[nh].orig = NULL; 1.3087 + nh++; 1.3088 + // add special phonetic version 1.3089 + if (phon && (nh < maxn)) { 1.3090 + wlst[nh].word = mystrdup(phon); 1.3091 + if (!wlst[nh].word) return nh - 1; 1.3092 + wlst[nh].allow = (1 == 0); 1.3093 + wlst[nh].orig = mystrdup(ts); 1.3094 + if (!wlst[nh].orig) return nh - 1; 1.3095 + nh++; 1.3096 + } 1.3097 + } 1.3098 + 1.3099 + // handle suffixes 1.3100 + for (int i = 0; i < al; i++) { 1.3101 + const unsigned char c = (unsigned char) (ap[i] & 0x00FF); 1.3102 + SfxEntry * sptr = sFlag[c]; 1.3103 + while (sptr) { 1.3104 + if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && 1.3105 + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && 1.3106 + // check needaffix flag 1.3107 + !(sptr->getCont() && ((needaffix && 1.3108 + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || 1.3109 + (circumfix && 1.3110 + TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || 1.3111 + (onlyincompound && 1.3112 + TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen())))) 1.3113 + ) { 1.3114 + char * newword = sptr->add(ts, wl); 1.3115 + if (newword) { 1.3116 + if (nh < maxn) { 1.3117 + wlst[nh].word = newword; 1.3118 + wlst[nh].allow = sptr->allowCross(); 1.3119 + wlst[nh].orig = NULL; 1.3120 + nh++; 1.3121 + // add special phonetic version 1.3122 + if (phon && (nh < maxn)) { 1.3123 + char st[MAXWORDUTF8LEN]; 1.3124 + strcpy(st, phon); 1.3125 + strcat(st, sptr->getKey()); 1.3126 + reverseword(st + strlen(phon)); 1.3127 + wlst[nh].word = mystrdup(st); 1.3128 + if (!wlst[nh].word) return nh - 1; 1.3129 + wlst[nh].allow = (1 == 0); 1.3130 + wlst[nh].orig = mystrdup(newword); 1.3131 + if (!wlst[nh].orig) return nh - 1; 1.3132 + nh++; 1.3133 + } 1.3134 + } else { 1.3135 + free(newword); 1.3136 + } 1.3137 + } 1.3138 + } 1.3139 + sptr = sptr->getFlgNxt(); 1.3140 + } 1.3141 + } 1.3142 + 1.3143 + int n = nh; 1.3144 + 1.3145 + // handle cross products of prefixes and suffixes 1.3146 + for (int j=1;j<n ;j++) 1.3147 + if (wlst[j].allow) { 1.3148 + for (int k = 0; k < al; k++) { 1.3149 + const unsigned char c = (unsigned char) (ap[k] & 0x00FF); 1.3150 + PfxEntry * cptr = pFlag[c]; 1.3151 + while (cptr) { 1.3152 + if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && 1.3153 + (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { 1.3154 + int l1 = strlen(wlst[j].word); 1.3155 + char * newword = cptr->add(wlst[j].word, l1); 1.3156 + if (newword) { 1.3157 + if (nh < maxn) { 1.3158 + wlst[nh].word = newword; 1.3159 + wlst[nh].allow = cptr->allowCross(); 1.3160 + wlst[nh].orig = NULL; 1.3161 + nh++; 1.3162 + } else { 1.3163 + free(newword); 1.3164 + } 1.3165 + } 1.3166 + } 1.3167 + cptr = cptr->getFlgNxt(); 1.3168 + } 1.3169 + } 1.3170 + } 1.3171 + 1.3172 + 1.3173 + // now handle pure prefixes 1.3174 + for (int m = 0; m < al; m ++) { 1.3175 + const unsigned char c = (unsigned char) (ap[m] & 0x00FF); 1.3176 + PfxEntry * ptr = pFlag[c]; 1.3177 + while (ptr) { 1.3178 + if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && 1.3179 + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && 1.3180 + // check needaffix flag 1.3181 + !(ptr->getCont() && ((needaffix && 1.3182 + TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || 1.3183 + (circumfix && 1.3184 + TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || 1.3185 + (onlyincompound && 1.3186 + TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen())))) 1.3187 + ) { 1.3188 + char * newword = ptr->add(ts, wl); 1.3189 + if (newword) { 1.3190 + if (nh < maxn) { 1.3191 + wlst[nh].word = newword; 1.3192 + wlst[nh].allow = ptr->allowCross(); 1.3193 + wlst[nh].orig = NULL; 1.3194 + nh++; 1.3195 + } else { 1.3196 + free(newword); 1.3197 + } 1.3198 + } 1.3199 + } 1.3200 + ptr = ptr->getFlgNxt(); 1.3201 + } 1.3202 + } 1.3203 + 1.3204 + return nh; 1.3205 +} 1.3206 + 1.3207 +// return length of replacing table 1.3208 +int AffixMgr::get_numrep() const 1.3209 +{ 1.3210 + return numrep; 1.3211 +} 1.3212 + 1.3213 +// return replacing table 1.3214 +struct replentry * AffixMgr::get_reptable() const 1.3215 +{ 1.3216 + if (! reptable ) return NULL; 1.3217 + return reptable; 1.3218 +} 1.3219 + 1.3220 +// return iconv table 1.3221 +RepList * AffixMgr::get_iconvtable() const 1.3222 +{ 1.3223 + if (! iconvtable ) return NULL; 1.3224 + return iconvtable; 1.3225 +} 1.3226 + 1.3227 +// return oconv table 1.3228 +RepList * AffixMgr::get_oconvtable() const 1.3229 +{ 1.3230 + if (! oconvtable ) return NULL; 1.3231 + return oconvtable; 1.3232 +} 1.3233 + 1.3234 +// return replacing table 1.3235 +struct phonetable * AffixMgr::get_phonetable() const 1.3236 +{ 1.3237 + if (! phone ) return NULL; 1.3238 + return phone; 1.3239 +} 1.3240 + 1.3241 +// return length of character map table 1.3242 +int AffixMgr::get_nummap() const 1.3243 +{ 1.3244 + return nummap; 1.3245 +} 1.3246 + 1.3247 +// return character map table 1.3248 +struct mapentry * AffixMgr::get_maptable() const 1.3249 +{ 1.3250 + if (! maptable ) return NULL; 1.3251 + return maptable; 1.3252 +} 1.3253 + 1.3254 +// return length of word break table 1.3255 +int AffixMgr::get_numbreak() const 1.3256 +{ 1.3257 + return numbreak; 1.3258 +} 1.3259 + 1.3260 +// return character map table 1.3261 +char ** AffixMgr::get_breaktable() const 1.3262 +{ 1.3263 + if (! breaktable ) return NULL; 1.3264 + return breaktable; 1.3265 +} 1.3266 + 1.3267 +// return text encoding of dictionary 1.3268 +char * AffixMgr::get_encoding() 1.3269 +{ 1.3270 + if (! encoding ) encoding = mystrdup(SPELL_ENCODING); 1.3271 + return mystrdup(encoding); 1.3272 +} 1.3273 + 1.3274 +// return text encoding of dictionary 1.3275 +int AffixMgr::get_langnum() const 1.3276 +{ 1.3277 + return langnum; 1.3278 +} 1.3279 + 1.3280 +// return double prefix option 1.3281 +int AffixMgr::get_complexprefixes() const 1.3282 +{ 1.3283 + return complexprefixes; 1.3284 +} 1.3285 + 1.3286 +// return FULLSTRIP option 1.3287 +int AffixMgr::get_fullstrip() const 1.3288 +{ 1.3289 + return fullstrip; 1.3290 +} 1.3291 + 1.3292 +FLAG AffixMgr::get_keepcase() const 1.3293 +{ 1.3294 + return keepcase; 1.3295 +} 1.3296 + 1.3297 +FLAG AffixMgr::get_forceucase() const 1.3298 +{ 1.3299 + return forceucase; 1.3300 +} 1.3301 + 1.3302 +FLAG AffixMgr::get_warn() const 1.3303 +{ 1.3304 + return warn; 1.3305 +} 1.3306 + 1.3307 +int AffixMgr::get_forbidwarn() const 1.3308 +{ 1.3309 + return forbidwarn; 1.3310 +} 1.3311 + 1.3312 +int AffixMgr::get_checksharps() const 1.3313 +{ 1.3314 + return checksharps; 1.3315 +} 1.3316 + 1.3317 +char * AffixMgr::encode_flag(unsigned short aflag) const 1.3318 +{ 1.3319 + return pHMgr->encode_flag(aflag); 1.3320 +} 1.3321 + 1.3322 + 1.3323 +// return the preferred ignore string for suggestions 1.3324 +char * AffixMgr::get_ignore() const 1.3325 +{ 1.3326 + if (!ignorechars) return NULL; 1.3327 + return ignorechars; 1.3328 +} 1.3329 + 1.3330 +// return the preferred ignore string for suggestions 1.3331 +unsigned short * AffixMgr::get_ignore_utf16(int * len) const 1.3332 +{ 1.3333 + *len = ignorechars_utf16_len; 1.3334 + return ignorechars_utf16; 1.3335 +} 1.3336 + 1.3337 +// return the keyboard string for suggestions 1.3338 +char * AffixMgr::get_key_string() 1.3339 +{ 1.3340 + if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING); 1.3341 + return mystrdup(keystring); 1.3342 +} 1.3343 + 1.3344 +// return the preferred try string for suggestions 1.3345 +char * AffixMgr::get_try_string() const 1.3346 +{ 1.3347 + if (! trystring ) return NULL; 1.3348 + return mystrdup(trystring); 1.3349 +} 1.3350 + 1.3351 +// return the preferred try string for suggestions 1.3352 +const char * AffixMgr::get_wordchars() const 1.3353 +{ 1.3354 + return wordchars; 1.3355 +} 1.3356 + 1.3357 +unsigned short * AffixMgr::get_wordchars_utf16(int * len) const 1.3358 +{ 1.3359 + *len = wordchars_utf16_len; 1.3360 + return wordchars_utf16; 1.3361 +} 1.3362 + 1.3363 +// is there compounding? 1.3364 +int AffixMgr::get_compound() const 1.3365 +{ 1.3366 + return compoundflag || compoundbegin || numdefcpd; 1.3367 +} 1.3368 + 1.3369 +// return the compound words control flag 1.3370 +FLAG AffixMgr::get_compoundflag() const 1.3371 +{ 1.3372 + return compoundflag; 1.3373 +} 1.3374 + 1.3375 +// return the forbidden words control flag 1.3376 +FLAG AffixMgr::get_forbiddenword() const 1.3377 +{ 1.3378 + return forbiddenword; 1.3379 +} 1.3380 + 1.3381 +// return the forbidden words control flag 1.3382 +FLAG AffixMgr::get_nosuggest() const 1.3383 +{ 1.3384 + return nosuggest; 1.3385 +} 1.3386 + 1.3387 +// return the forbidden words control flag 1.3388 +FLAG AffixMgr::get_nongramsuggest() const 1.3389 +{ 1.3390 + return nongramsuggest; 1.3391 +} 1.3392 + 1.3393 +// return the forbidden words flag modify flag 1.3394 +FLAG AffixMgr::get_needaffix() const 1.3395 +{ 1.3396 + return needaffix; 1.3397 +} 1.3398 + 1.3399 +// return the onlyincompound flag 1.3400 +FLAG AffixMgr::get_onlyincompound() const 1.3401 +{ 1.3402 + return onlyincompound; 1.3403 +} 1.3404 + 1.3405 +// return the compound word signal flag 1.3406 +FLAG AffixMgr::get_compoundroot() const 1.3407 +{ 1.3408 + return compoundroot; 1.3409 +} 1.3410 + 1.3411 +// return the compound begin signal flag 1.3412 +FLAG AffixMgr::get_compoundbegin() const 1.3413 +{ 1.3414 + return compoundbegin; 1.3415 +} 1.3416 + 1.3417 +// return the value of checknum 1.3418 +int AffixMgr::get_checknum() const 1.3419 +{ 1.3420 + return checknum; 1.3421 +} 1.3422 + 1.3423 +// return the value of prefix 1.3424 +const char * AffixMgr::get_prefix() const 1.3425 +{ 1.3426 + if (pfx) return pfx->getKey(); 1.3427 + return NULL; 1.3428 +} 1.3429 + 1.3430 +// return the value of suffix 1.3431 +const char * AffixMgr::get_suffix() const 1.3432 +{ 1.3433 + return sfxappnd; 1.3434 +} 1.3435 + 1.3436 +// return the value of suffix 1.3437 +const char * AffixMgr::get_version() const 1.3438 +{ 1.3439 + return version; 1.3440 +} 1.3441 + 1.3442 +// return lemma_present flag 1.3443 +FLAG AffixMgr::get_lemma_present() const 1.3444 +{ 1.3445 + return lemma_present; 1.3446 +} 1.3447 + 1.3448 +// utility method to look up root words in hash table 1.3449 +struct hentry * AffixMgr::lookup(const char * word) 1.3450 +{ 1.3451 + int i; 1.3452 + struct hentry * he = NULL; 1.3453 + for (i = 0; i < *maxdic && !he; i++) { 1.3454 + he = (alldic[i])->lookup(word); 1.3455 + } 1.3456 + return he; 1.3457 +} 1.3458 + 1.3459 +// return the value of suffix 1.3460 +int AffixMgr::have_contclass() const 1.3461 +{ 1.3462 + return havecontclass; 1.3463 +} 1.3464 + 1.3465 +// return utf8 1.3466 +int AffixMgr::get_utf8() const 1.3467 +{ 1.3468 + return utf8; 1.3469 +} 1.3470 + 1.3471 +int AffixMgr::get_maxngramsugs(void) const 1.3472 +{ 1.3473 + return maxngramsugs; 1.3474 +} 1.3475 + 1.3476 +int AffixMgr::get_maxcpdsugs(void) const 1.3477 +{ 1.3478 + return maxcpdsugs; 1.3479 +} 1.3480 + 1.3481 +int AffixMgr::get_maxdiff(void) const 1.3482 +{ 1.3483 + return maxdiff; 1.3484 +} 1.3485 + 1.3486 +int AffixMgr::get_onlymaxdiff(void) const 1.3487 +{ 1.3488 + return onlymaxdiff; 1.3489 +} 1.3490 + 1.3491 +// return nosplitsugs 1.3492 +int AffixMgr::get_nosplitsugs(void) const 1.3493 +{ 1.3494 + return nosplitsugs; 1.3495 +} 1.3496 + 1.3497 +// return sugswithdots 1.3498 +int AffixMgr::get_sugswithdots(void) const 1.3499 +{ 1.3500 + return sugswithdots; 1.3501 +} 1.3502 + 1.3503 +/* parse flag */ 1.3504 +int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) { 1.3505 + char * s = NULL; 1.3506 + if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { 1.3507 + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); 1.3508 + return 1; 1.3509 + } 1.3510 + if (parse_string(line, &s, af->getlinenum())) return 1; 1.3511 + *out = pHMgr->decode_flag(s); 1.3512 + free(s); 1.3513 + return 0; 1.3514 +} 1.3515 + 1.3516 +/* parse num */ 1.3517 +int AffixMgr::parse_num(char * line, int * out, FileMgr * af) { 1.3518 + char * s = NULL; 1.3519 + if (*out != -1) { 1.3520 + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); 1.3521 + return 1; 1.3522 + } 1.3523 + if (parse_string(line, &s, af->getlinenum())) return 1; 1.3524 + *out = atoi(s); 1.3525 + free(s); 1.3526 + return 0; 1.3527 +} 1.3528 + 1.3529 +/* parse in the max syllablecount of compound words and */ 1.3530 +int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) 1.3531 +{ 1.3532 + char * tp = line; 1.3533 + char * piece; 1.3534 + int i = 0; 1.3535 + int np = 0; 1.3536 + w_char w[MAXWORDLEN]; 1.3537 + piece = mystrsep(&tp, 0); 1.3538 + while (piece) { 1.3539 + if (*piece != '\0') { 1.3540 + switch(i) { 1.3541 + case 0: { np++; break; } 1.3542 + case 1: { cpdmaxsyllable = atoi(piece); np++; break; } 1.3543 + case 2: { 1.3544 + if (!utf8) { 1.3545 + cpdvowels = mystrdup(piece); 1.3546 + } else { 1.3547 + int n = u8_u16(w, MAXWORDLEN, piece); 1.3548 + if (n > 0) { 1.3549 + flag_qsort((unsigned short *) w, 0, n); 1.3550 + cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char)); 1.3551 + if (!cpdvowels_utf16) return 1; 1.3552 + memcpy(cpdvowels_utf16, w, n * sizeof(w_char)); 1.3553 + } 1.3554 + cpdvowels_utf16_len = n; 1.3555 + } 1.3556 + np++; 1.3557 + break; 1.3558 + } 1.3559 + default: break; 1.3560 + } 1.3561 + i++; 1.3562 + } 1.3563 + piece = mystrsep(&tp, 0); 1.3564 + } 1.3565 + if (np < 2) { 1.3566 + HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum()); 1.3567 + return 1; 1.3568 + } 1.3569 + if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); 1.3570 + return 0; 1.3571 +} 1.3572 + 1.3573 +/* parse in the typical fault correcting table */ 1.3574 +int AffixMgr::parse_reptable(char * line, FileMgr * af) 1.3575 +{ 1.3576 + if (numrep != 0) { 1.3577 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.3578 + return 1; 1.3579 + } 1.3580 + char * tp = line; 1.3581 + char * piece; 1.3582 + int i = 0; 1.3583 + int np = 0; 1.3584 + piece = mystrsep(&tp, 0); 1.3585 + while (piece) { 1.3586 + if (*piece != '\0') { 1.3587 + switch(i) { 1.3588 + case 0: { np++; break; } 1.3589 + case 1: { 1.3590 + numrep = atoi(piece); 1.3591 + if (numrep < 1) { 1.3592 + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); 1.3593 + return 1; 1.3594 + } 1.3595 + reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); 1.3596 + if (!reptable) return 1; 1.3597 + np++; 1.3598 + break; 1.3599 + } 1.3600 + default: break; 1.3601 + } 1.3602 + i++; 1.3603 + } 1.3604 + piece = mystrsep(&tp, 0); 1.3605 + } 1.3606 + if (np != 2) { 1.3607 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.3608 + return 1; 1.3609 + } 1.3610 + 1.3611 + /* now parse the numrep lines to read in the remainder of the table */ 1.3612 + char * nl; 1.3613 + for (int j=0; j < numrep; j++) { 1.3614 + if (!(nl = af->getline())) return 1; 1.3615 + mychomp(nl); 1.3616 + tp = nl; 1.3617 + i = 0; 1.3618 + reptable[j].pattern = NULL; 1.3619 + reptable[j].pattern2 = NULL; 1.3620 + piece = mystrsep(&tp, 0); 1.3621 + while (piece) { 1.3622 + if (*piece != '\0') { 1.3623 + switch(i) { 1.3624 + case 0: { 1.3625 + if (strncmp(piece,"REP",3) != 0) { 1.3626 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3627 + numrep = 0; 1.3628 + return 1; 1.3629 + } 1.3630 + break; 1.3631 + } 1.3632 + case 1: { 1.3633 + if (*piece == '^') reptable[j].start = true; else reptable[j].start = false; 1.3634 + reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," "); 1.3635 + int lr = strlen(reptable[j].pattern) - 1; 1.3636 + if (reptable[j].pattern[lr] == '$') { 1.3637 + reptable[j].end = true; 1.3638 + reptable[j].pattern[lr] = '\0'; 1.3639 + } else reptable[j].end = false; 1.3640 + break; 1.3641 + } 1.3642 + case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; } 1.3643 + default: break; 1.3644 + } 1.3645 + i++; 1.3646 + } 1.3647 + piece = mystrsep(&tp, 0); 1.3648 + } 1.3649 + if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { 1.3650 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3651 + numrep = 0; 1.3652 + return 1; 1.3653 + } 1.3654 + } 1.3655 + return 0; 1.3656 +} 1.3657 + 1.3658 +/* parse in the typical fault correcting table */ 1.3659 +int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) 1.3660 +{ 1.3661 + if (*rl) { 1.3662 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.3663 + return 1; 1.3664 + } 1.3665 + char * tp = line; 1.3666 + char * piece; 1.3667 + int i = 0; 1.3668 + int np = 0; 1.3669 + int numrl = 0; 1.3670 + piece = mystrsep(&tp, 0); 1.3671 + while (piece) { 1.3672 + if (*piece != '\0') { 1.3673 + switch(i) { 1.3674 + case 0: { np++; break; } 1.3675 + case 1: { 1.3676 + numrl = atoi(piece); 1.3677 + if (numrl < 1) { 1.3678 + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); 1.3679 + return 1; 1.3680 + } 1.3681 + *rl = new RepList(numrl); 1.3682 + if (!*rl) return 1; 1.3683 + np++; 1.3684 + break; 1.3685 + } 1.3686 + default: break; 1.3687 + } 1.3688 + i++; 1.3689 + } 1.3690 + piece = mystrsep(&tp, 0); 1.3691 + } 1.3692 + if (np != 2) { 1.3693 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.3694 + return 1; 1.3695 + } 1.3696 + 1.3697 + /* now parse the num lines to read in the remainder of the table */ 1.3698 + char * nl; 1.3699 + for (int j=0; j < numrl; j++) { 1.3700 + if (!(nl = af->getline())) return 1; 1.3701 + mychomp(nl); 1.3702 + tp = nl; 1.3703 + i = 0; 1.3704 + char * pattern = NULL; 1.3705 + char * pattern2 = NULL; 1.3706 + piece = mystrsep(&tp, 0); 1.3707 + while (piece) { 1.3708 + if (*piece != '\0') { 1.3709 + switch(i) { 1.3710 + case 0: { 1.3711 + if (strncmp(piece, keyword, strlen(keyword)) != 0) { 1.3712 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3713 + delete *rl; 1.3714 + *rl = NULL; 1.3715 + return 1; 1.3716 + } 1.3717 + break; 1.3718 + } 1.3719 + case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; } 1.3720 + case 2: { 1.3721 + pattern2 = mystrrep(mystrdup(piece),"_"," "); 1.3722 + break; 1.3723 + } 1.3724 + default: break; 1.3725 + } 1.3726 + i++; 1.3727 + } 1.3728 + piece = mystrsep(&tp, 0); 1.3729 + } 1.3730 + if (!pattern || !pattern2) { 1.3731 + if (pattern) 1.3732 + free(pattern); 1.3733 + if (pattern2) 1.3734 + free(pattern2); 1.3735 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3736 + return 1; 1.3737 + } 1.3738 + (*rl)->add(pattern, pattern2); 1.3739 + } 1.3740 + return 0; 1.3741 +} 1.3742 + 1.3743 + 1.3744 +/* parse in the typical fault correcting table */ 1.3745 +int AffixMgr::parse_phonetable(char * line, FileMgr * af) 1.3746 +{ 1.3747 + if (phone) { 1.3748 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.3749 + return 1; 1.3750 + } 1.3751 + char * tp = line; 1.3752 + char * piece; 1.3753 + int i = 0; 1.3754 + int np = 0; 1.3755 + piece = mystrsep(&tp, 0); 1.3756 + while (piece) { 1.3757 + if (*piece != '\0') { 1.3758 + switch(i) { 1.3759 + case 0: { np++; break; } 1.3760 + case 1: { 1.3761 + phone = (phonetable *) malloc(sizeof(struct phonetable)); 1.3762 + if (!phone) return 1; 1.3763 + phone->num = atoi(piece); 1.3764 + phone->rules = NULL; 1.3765 + phone->utf8 = (char) utf8; 1.3766 + if (phone->num < 1) { 1.3767 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); 1.3768 + return 1; 1.3769 + } 1.3770 + phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); 1.3771 + if (!phone->rules) { 1.3772 + free(phone); 1.3773 + phone = NULL; 1.3774 + return 1; 1.3775 + } 1.3776 + np++; 1.3777 + break; 1.3778 + } 1.3779 + default: break; 1.3780 + } 1.3781 + i++; 1.3782 + } 1.3783 + piece = mystrsep(&tp, 0); 1.3784 + } 1.3785 + if (np != 2) { 1.3786 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.3787 + return 1; 1.3788 + } 1.3789 + 1.3790 + /* now parse the phone->num lines to read in the remainder of the table */ 1.3791 + char * nl; 1.3792 + for (int j=0; j < phone->num; j++) { 1.3793 + if (!(nl = af->getline())) return 1; 1.3794 + mychomp(nl); 1.3795 + tp = nl; 1.3796 + i = 0; 1.3797 + phone->rules[j * 2] = NULL; 1.3798 + phone->rules[j * 2 + 1] = NULL; 1.3799 + piece = mystrsep(&tp, 0); 1.3800 + while (piece) { 1.3801 + if (*piece != '\0') { 1.3802 + switch(i) { 1.3803 + case 0: { 1.3804 + if (strncmp(piece,"PHONE",5) != 0) { 1.3805 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3806 + phone->num = 0; 1.3807 + return 1; 1.3808 + } 1.3809 + break; 1.3810 + } 1.3811 + case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; } 1.3812 + case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; } 1.3813 + default: break; 1.3814 + } 1.3815 + i++; 1.3816 + } 1.3817 + piece = mystrsep(&tp, 0); 1.3818 + } 1.3819 + if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { 1.3820 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3821 + phone->num = 0; 1.3822 + return 1; 1.3823 + } 1.3824 + } 1.3825 + phone->rules[phone->num * 2] = mystrdup(""); 1.3826 + phone->rules[phone->num * 2 + 1] = mystrdup(""); 1.3827 + init_phonet_hash(*phone); 1.3828 + return 0; 1.3829 +} 1.3830 + 1.3831 +/* parse in the checkcompoundpattern table */ 1.3832 +int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) 1.3833 +{ 1.3834 + if (numcheckcpd != 0) { 1.3835 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.3836 + return 1; 1.3837 + } 1.3838 + char * tp = line; 1.3839 + char * piece; 1.3840 + int i = 0; 1.3841 + int np = 0; 1.3842 + piece = mystrsep(&tp, 0); 1.3843 + while (piece) { 1.3844 + if (*piece != '\0') { 1.3845 + switch(i) { 1.3846 + case 0: { np++; break; } 1.3847 + case 1: { 1.3848 + numcheckcpd = atoi(piece); 1.3849 + if (numcheckcpd < 1) { 1.3850 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); 1.3851 + return 1; 1.3852 + } 1.3853 + checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry)); 1.3854 + if (!checkcpdtable) return 1; 1.3855 + np++; 1.3856 + break; 1.3857 + } 1.3858 + default: break; 1.3859 + } 1.3860 + i++; 1.3861 + } 1.3862 + piece = mystrsep(&tp, 0); 1.3863 + } 1.3864 + if (np != 2) { 1.3865 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.3866 + return 1; 1.3867 + } 1.3868 + 1.3869 + /* now parse the numcheckcpd lines to read in the remainder of the table */ 1.3870 + char * nl; 1.3871 + for (int j=0; j < numcheckcpd; j++) { 1.3872 + if (!(nl = af->getline())) return 1; 1.3873 + mychomp(nl); 1.3874 + tp = nl; 1.3875 + i = 0; 1.3876 + checkcpdtable[j].pattern = NULL; 1.3877 + checkcpdtable[j].pattern2 = NULL; 1.3878 + checkcpdtable[j].pattern3 = NULL; 1.3879 + checkcpdtable[j].cond = FLAG_NULL; 1.3880 + checkcpdtable[j].cond2 = FLAG_NULL; 1.3881 + piece = mystrsep(&tp, 0); 1.3882 + while (piece) { 1.3883 + if (*piece != '\0') { 1.3884 + switch(i) { 1.3885 + case 0: { 1.3886 + if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { 1.3887 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3888 + numcheckcpd = 0; 1.3889 + return 1; 1.3890 + } 1.3891 + break; 1.3892 + } 1.3893 + case 1: { 1.3894 + checkcpdtable[j].pattern = mystrdup(piece); 1.3895 + char * p = strchr(checkcpdtable[j].pattern, '/'); 1.3896 + if (p) { 1.3897 + *p = '\0'; 1.3898 + checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); 1.3899 + } 1.3900 + break; } 1.3901 + case 2: { 1.3902 + checkcpdtable[j].pattern2 = mystrdup(piece); 1.3903 + char * p = strchr(checkcpdtable[j].pattern2, '/'); 1.3904 + if (p) { 1.3905 + *p = '\0'; 1.3906 + checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); 1.3907 + } 1.3908 + break; 1.3909 + } 1.3910 + case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; } 1.3911 + default: break; 1.3912 + } 1.3913 + i++; 1.3914 + } 1.3915 + piece = mystrsep(&tp, 0); 1.3916 + } 1.3917 + if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { 1.3918 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3919 + numcheckcpd = 0; 1.3920 + return 1; 1.3921 + } 1.3922 + } 1.3923 + return 0; 1.3924 +} 1.3925 + 1.3926 +/* parse in the compound rule table */ 1.3927 +int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) 1.3928 +{ 1.3929 + if (numdefcpd != 0) { 1.3930 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.3931 + return 1; 1.3932 + } 1.3933 + char * tp = line; 1.3934 + char * piece; 1.3935 + int i = 0; 1.3936 + int np = 0; 1.3937 + piece = mystrsep(&tp, 0); 1.3938 + while (piece) { 1.3939 + if (*piece != '\0') { 1.3940 + switch(i) { 1.3941 + case 0: { np++; break; } 1.3942 + case 1: { 1.3943 + numdefcpd = atoi(piece); 1.3944 + if (numdefcpd < 1) { 1.3945 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); 1.3946 + return 1; 1.3947 + } 1.3948 + defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); 1.3949 + if (!defcpdtable) return 1; 1.3950 + np++; 1.3951 + break; 1.3952 + } 1.3953 + default: break; 1.3954 + } 1.3955 + i++; 1.3956 + } 1.3957 + piece = mystrsep(&tp, 0); 1.3958 + } 1.3959 + if (np != 2) { 1.3960 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.3961 + return 1; 1.3962 + } 1.3963 + 1.3964 + /* now parse the numdefcpd lines to read in the remainder of the table */ 1.3965 + char * nl; 1.3966 + for (int j=0; j < numdefcpd; j++) { 1.3967 + if (!(nl = af->getline())) return 1; 1.3968 + mychomp(nl); 1.3969 + tp = nl; 1.3970 + i = 0; 1.3971 + defcpdtable[j].def = NULL; 1.3972 + piece = mystrsep(&tp, 0); 1.3973 + while (piece) { 1.3974 + if (*piece != '\0') { 1.3975 + switch(i) { 1.3976 + case 0: { 1.3977 + if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { 1.3978 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.3979 + numdefcpd = 0; 1.3980 + return 1; 1.3981 + } 1.3982 + break; 1.3983 + } 1.3984 + case 1: { // handle parenthesized flags 1.3985 + if (strchr(piece, '(')) { 1.3986 + defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG)); 1.3987 + defcpdtable[j].len = 0; 1.3988 + int end = 0; 1.3989 + FLAG * conv; 1.3990 + while (!end) { 1.3991 + char * par = piece + 1; 1.3992 + while (*par != '(' && *par != ')' && *par != '\0') par++; 1.3993 + if (*par == '\0') end = 1; else *par = '\0'; 1.3994 + if (*piece == '(') piece++; 1.3995 + if (*piece == '*' || *piece == '?') { 1.3996 + defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece; 1.3997 + } else if (*piece != '\0') { 1.3998 + int l = pHMgr->decode_flags(&conv, piece, af); 1.3999 + for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; 1.4000 + free(conv); 1.4001 + } 1.4002 + piece = par + 1; 1.4003 + } 1.4004 + } else { 1.4005 + defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af); 1.4006 + } 1.4007 + break; 1.4008 + } 1.4009 + default: break; 1.4010 + } 1.4011 + i++; 1.4012 + } 1.4013 + piece = mystrsep(&tp, 0); 1.4014 + } 1.4015 + if (!defcpdtable[j].len) { 1.4016 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.4017 + numdefcpd = 0; 1.4018 + return 1; 1.4019 + } 1.4020 + } 1.4021 + return 0; 1.4022 +} 1.4023 + 1.4024 + 1.4025 +/* parse in the character map table */ 1.4026 +int AffixMgr::parse_maptable(char * line, FileMgr * af) 1.4027 +{ 1.4028 + if (nummap != 0) { 1.4029 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.4030 + return 1; 1.4031 + } 1.4032 + char * tp = line; 1.4033 + char * piece; 1.4034 + int i = 0; 1.4035 + int np = 0; 1.4036 + piece = mystrsep(&tp, 0); 1.4037 + while (piece) { 1.4038 + if (*piece != '\0') { 1.4039 + switch(i) { 1.4040 + case 0: { np++; break; } 1.4041 + case 1: { 1.4042 + nummap = atoi(piece); 1.4043 + if (nummap < 1) { 1.4044 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); 1.4045 + return 1; 1.4046 + } 1.4047 + maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); 1.4048 + if (!maptable) return 1; 1.4049 + np++; 1.4050 + break; 1.4051 + } 1.4052 + default: break; 1.4053 + } 1.4054 + i++; 1.4055 + } 1.4056 + piece = mystrsep(&tp, 0); 1.4057 + } 1.4058 + if (np != 2) { 1.4059 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.4060 + return 1; 1.4061 + } 1.4062 + 1.4063 + /* now parse the nummap lines to read in the remainder of the table */ 1.4064 + char * nl; 1.4065 + for (int j=0; j < nummap; j++) { 1.4066 + if (!(nl = af->getline())) return 1; 1.4067 + mychomp(nl); 1.4068 + tp = nl; 1.4069 + i = 0; 1.4070 + maptable[j].set = NULL; 1.4071 + maptable[j].len = 0; 1.4072 + piece = mystrsep(&tp, 0); 1.4073 + while (piece) { 1.4074 + if (*piece != '\0') { 1.4075 + switch(i) { 1.4076 + case 0: { 1.4077 + if (strncmp(piece,"MAP",3) != 0) { 1.4078 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.4079 + nummap = 0; 1.4080 + return 1; 1.4081 + } 1.4082 + break; 1.4083 + } 1.4084 + case 1: { 1.4085 + int setn = 0; 1.4086 + maptable[j].len = strlen(piece); 1.4087 + maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*)); 1.4088 + if (!maptable[j].set) return 1; 1.4089 + for (int k = 0; k < maptable[j].len; k++) { 1.4090 + int chl = 1; 1.4091 + int chb = k; 1.4092 + if (piece[k] == '(') { 1.4093 + char * parpos = strchr(piece + k, ')'); 1.4094 + if (parpos != NULL) { 1.4095 + chb = k + 1; 1.4096 + chl = (int)(parpos - piece) - k - 1; 1.4097 + k = k + chl + 1; 1.4098 + } 1.4099 + } else { 1.4100 + if (utf8 && (piece[k] & 0xc0) == 0xc0) { 1.4101 + for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++); 1.4102 + chl = k - chb; 1.4103 + k--; 1.4104 + } 1.4105 + } 1.4106 + maptable[j].set[setn] = (char *) malloc(chl + 1); 1.4107 + if (!maptable[j].set[setn]) return 1; 1.4108 + strncpy(maptable[j].set[setn], piece + chb, chl); 1.4109 + maptable[j].set[setn][chl] = '\0'; 1.4110 + setn++; 1.4111 + } 1.4112 + maptable[j].len = setn; 1.4113 + break; } 1.4114 + default: break; 1.4115 + } 1.4116 + i++; 1.4117 + } 1.4118 + piece = mystrsep(&tp, 0); 1.4119 + } 1.4120 + if (!maptable[j].set || !maptable[j].len) { 1.4121 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.4122 + nummap = 0; 1.4123 + return 1; 1.4124 + } 1.4125 + } 1.4126 + return 0; 1.4127 +} 1.4128 + 1.4129 +/* parse in the word breakpoint table */ 1.4130 +int AffixMgr::parse_breaktable(char * line, FileMgr * af) 1.4131 +{ 1.4132 + if (numbreak > -1) { 1.4133 + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); 1.4134 + return 1; 1.4135 + } 1.4136 + char * tp = line; 1.4137 + char * piece; 1.4138 + int i = 0; 1.4139 + int np = 0; 1.4140 + piece = mystrsep(&tp, 0); 1.4141 + while (piece) { 1.4142 + if (*piece != '\0') { 1.4143 + switch(i) { 1.4144 + case 0: { np++; break; } 1.4145 + case 1: { 1.4146 + numbreak = atoi(piece); 1.4147 + if (numbreak < 0) { 1.4148 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); 1.4149 + return 1; 1.4150 + } 1.4151 + if (numbreak == 0) return 0; 1.4152 + breaktable = (char **) malloc(numbreak * sizeof(char *)); 1.4153 + if (!breaktable) return 1; 1.4154 + np++; 1.4155 + break; 1.4156 + } 1.4157 + default: break; 1.4158 + } 1.4159 + i++; 1.4160 + } 1.4161 + piece = mystrsep(&tp, 0); 1.4162 + } 1.4163 + if (np != 2) { 1.4164 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.4165 + return 1; 1.4166 + } 1.4167 + 1.4168 + /* now parse the numbreak lines to read in the remainder of the table */ 1.4169 + char * nl; 1.4170 + for (int j=0; j < numbreak; j++) { 1.4171 + if (!(nl = af->getline())) return 1; 1.4172 + mychomp(nl); 1.4173 + tp = nl; 1.4174 + i = 0; 1.4175 + piece = mystrsep(&tp, 0); 1.4176 + while (piece) { 1.4177 + if (*piece != '\0') { 1.4178 + switch(i) { 1.4179 + case 0: { 1.4180 + if (strncmp(piece,"BREAK",5) != 0) { 1.4181 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.4182 + numbreak = 0; 1.4183 + return 1; 1.4184 + } 1.4185 + break; 1.4186 + } 1.4187 + case 1: { 1.4188 + breaktable[j] = mystrdup(piece); 1.4189 + break; 1.4190 + } 1.4191 + default: break; 1.4192 + } 1.4193 + i++; 1.4194 + } 1.4195 + piece = mystrsep(&tp, 0); 1.4196 + } 1.4197 + if (!breaktable) { 1.4198 + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); 1.4199 + numbreak = 0; 1.4200 + return 1; 1.4201 + } 1.4202 + } 1.4203 + return 0; 1.4204 +} 1.4205 + 1.4206 +void AffixMgr::reverse_condition(char * piece) { 1.4207 + int neg = 0; 1.4208 + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { 1.4209 + switch(*k) { 1.4210 + case '[': { 1.4211 + if (neg) *(k+1) = '['; else *k = ']'; 1.4212 + break; 1.4213 + } 1.4214 + case ']': { 1.4215 + *k = '['; 1.4216 + if (neg) *(k+1) = '^'; 1.4217 + neg = 0; 1.4218 + break; 1.4219 + } 1.4220 + case '^': { 1.4221 + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; 1.4222 + break; 1.4223 + } 1.4224 + default: { 1.4225 + if (neg) *(k+1) = *k; 1.4226 + } 1.4227 + } 1.4228 + } 1.4229 +} 1.4230 + 1.4231 +int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) 1.4232 +{ 1.4233 + int numents = 0; // number of affentry structures to parse 1.4234 + 1.4235 + unsigned short aflag = 0; // affix char identifier 1.4236 + 1.4237 + char ff=0; 1.4238 + std::vector<affentry> affentries; 1.4239 + 1.4240 + char * tp = line; 1.4241 + char * nl = line; 1.4242 + char * piece; 1.4243 + int i = 0; 1.4244 + 1.4245 + // checking lines with bad syntax 1.4246 +#ifdef DEBUG 1.4247 + int basefieldnum = 0; 1.4248 +#endif 1.4249 + 1.4250 + // split affix header line into pieces 1.4251 + 1.4252 + int np = 0; 1.4253 + 1.4254 + piece = mystrsep(&tp, 0); 1.4255 + while (piece) { 1.4256 + if (*piece != '\0') { 1.4257 + switch(i) { 1.4258 + // piece 1 - is type of affix 1.4259 + case 0: { np++; break; } 1.4260 + 1.4261 + // piece 2 - is affix char 1.4262 + case 1: { 1.4263 + np++; 1.4264 + aflag = pHMgr->decode_flag(piece); 1.4265 + if (((at == 'S') && (dupflags[aflag] & dupSFX)) || 1.4266 + ((at == 'P') && (dupflags[aflag] & dupPFX))) { 1.4267 + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", 1.4268 + af->getlinenum()); 1.4269 + // return 1; XXX permissive mode for bad dictionaries 1.4270 + } 1.4271 + dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); 1.4272 + break; 1.4273 + } 1.4274 + // piece 3 - is cross product indicator 1.4275 + case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; } 1.4276 + 1.4277 + // piece 4 - is number of affentries 1.4278 + case 3: { 1.4279 + np++; 1.4280 + numents = atoi(piece); 1.4281 + if (numents == 0) { 1.4282 + char * err = pHMgr->encode_flag(aflag); 1.4283 + if (err) { 1.4284 + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", 1.4285 + af->getlinenum()); 1.4286 + free(err); 1.4287 + } 1.4288 + return 1; 1.4289 + } 1.4290 + affentries.resize(numents); 1.4291 + affentries[0].opts = ff; 1.4292 + if (utf8) affentries[0].opts += aeUTF8; 1.4293 + if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF; 1.4294 + if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM; 1.4295 + affentries[0].aflag = aflag; 1.4296 + } 1.4297 + 1.4298 + default: break; 1.4299 + } 1.4300 + i++; 1.4301 + } 1.4302 + piece = mystrsep(&tp, 0); 1.4303 + } 1.4304 + // check to make sure we parsed enough pieces 1.4305 + if (np != 4) { 1.4306 + char * err = pHMgr->encode_flag(aflag); 1.4307 + if (err) { 1.4308 + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); 1.4309 + free(err); 1.4310 + } 1.4311 + return 1; 1.4312 + } 1.4313 + 1.4314 + // now parse numents affentries for this affix 1.4315 + std::vector<affentry>::iterator start = affentries.begin(); 1.4316 + std::vector<affentry>::iterator end = affentries.end(); 1.4317 + for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { 1.4318 + if (!(nl = af->getline())) return 1; 1.4319 + mychomp(nl); 1.4320 + tp = nl; 1.4321 + i = 0; 1.4322 + np = 0; 1.4323 + 1.4324 + // split line into pieces 1.4325 + piece = mystrsep(&tp, 0); 1.4326 + while (piece) { 1.4327 + if (*piece != '\0') { 1.4328 + switch(i) { 1.4329 + // piece 1 - is type 1.4330 + case 0: { 1.4331 + np++; 1.4332 + if (entry != start) entry->opts = start->opts & 1.4333 + (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); 1.4334 + break; 1.4335 + } 1.4336 + 1.4337 + // piece 2 - is affix char 1.4338 + case 1: { 1.4339 + np++; 1.4340 + if (pHMgr->decode_flag(piece) != aflag) { 1.4341 + char * err = pHMgr->encode_flag(aflag); 1.4342 + if (err) { 1.4343 + HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", 1.4344 + af->getlinenum(), err); 1.4345 + free(err); 1.4346 + } 1.4347 + return 1; 1.4348 + } 1.4349 + 1.4350 + if (entry != start) entry->aflag = start->aflag; 1.4351 + break; 1.4352 + } 1.4353 + 1.4354 + // piece 3 - is string to strip or 0 for null 1.4355 + case 2: { 1.4356 + np++; 1.4357 + if (complexprefixes) { 1.4358 + if (utf8) reverseword_utf(piece); else reverseword(piece); 1.4359 + } 1.4360 + entry->strip = mystrdup(piece); 1.4361 + entry->stripl = (unsigned char) strlen(entry->strip); 1.4362 + if (strcmp(entry->strip,"0") == 0) { 1.4363 + free(entry->strip); 1.4364 + entry->strip=mystrdup(""); 1.4365 + entry->stripl = 0; 1.4366 + } 1.4367 + break; 1.4368 + } 1.4369 + 1.4370 + // piece 4 - is affix string or 0 for null 1.4371 + case 3: { 1.4372 + char * dash; 1.4373 + entry->morphcode = NULL; 1.4374 + entry->contclass = NULL; 1.4375 + entry->contclasslen = 0; 1.4376 + np++; 1.4377 + dash = strchr(piece, '/'); 1.4378 + if (dash) { 1.4379 + *dash = '\0'; 1.4380 + 1.4381 + if (ignorechars) { 1.4382 + if (utf8) { 1.4383 + remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len); 1.4384 + } else { 1.4385 + remove_ignored_chars(piece,ignorechars); 1.4386 + } 1.4387 + } 1.4388 + 1.4389 + if (complexprefixes) { 1.4390 + if (utf8) reverseword_utf(piece); else reverseword(piece); 1.4391 + } 1.4392 + entry->appnd = mystrdup(piece); 1.4393 + 1.4394 + if (pHMgr->is_aliasf()) { 1.4395 + int index = atoi(dash + 1); 1.4396 + entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af); 1.4397 + if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); 1.4398 + } else { 1.4399 + entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af); 1.4400 + flag_qsort(entry->contclass, 0, entry->contclasslen); 1.4401 + } 1.4402 + *dash = '/'; 1.4403 + 1.4404 + havecontclass = 1; 1.4405 + for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { 1.4406 + contclasses[(entry->contclass)[_i]] = 1; 1.4407 + } 1.4408 + } else { 1.4409 + if (ignorechars) { 1.4410 + if (utf8) { 1.4411 + remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len); 1.4412 + } else { 1.4413 + remove_ignored_chars(piece,ignorechars); 1.4414 + } 1.4415 + } 1.4416 + 1.4417 + if (complexprefixes) { 1.4418 + if (utf8) reverseword_utf(piece); else reverseword(piece); 1.4419 + } 1.4420 + entry->appnd = mystrdup(piece); 1.4421 + } 1.4422 + 1.4423 + entry->appndl = (unsigned char) strlen(entry->appnd); 1.4424 + if (strcmp(entry->appnd,"0") == 0) { 1.4425 + free(entry->appnd); 1.4426 + entry->appnd=mystrdup(""); 1.4427 + entry->appndl = 0; 1.4428 + } 1.4429 + break; 1.4430 + } 1.4431 + 1.4432 + // piece 5 - is the conditions descriptions 1.4433 + case 4: { 1.4434 + np++; 1.4435 + if (complexprefixes) { 1.4436 + if (utf8) reverseword_utf(piece); else reverseword(piece); 1.4437 + reverse_condition(piece); 1.4438 + } 1.4439 + if (entry->stripl && (strcmp(piece, ".") != 0) && 1.4440 + redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum())) 1.4441 + strcpy(piece, "."); 1.4442 + if (at == 'S') { 1.4443 + reverseword(piece); 1.4444 + reverse_condition(piece); 1.4445 + } 1.4446 + if (encodeit(*entry, piece)) return 1; 1.4447 + break; 1.4448 + } 1.4449 + 1.4450 + case 5: { 1.4451 + np++; 1.4452 + if (pHMgr->is_aliasm()) { 1.4453 + int index = atoi(piece); 1.4454 + entry->morphcode = pHMgr->get_aliasm(index); 1.4455 + } else { 1.4456 + if (complexprefixes) { // XXX - fix me for morph. gen. 1.4457 + if (utf8) reverseword_utf(piece); else reverseword(piece); 1.4458 + } 1.4459 + // add the remaining of the line 1.4460 + if (*tp) { 1.4461 + *(tp - 1) = ' '; 1.4462 + tp = tp + strlen(tp); 1.4463 + } 1.4464 + entry->morphcode = mystrdup(piece); 1.4465 + if (!entry->morphcode) return 1; 1.4466 + } 1.4467 + break; 1.4468 + } 1.4469 + default: break; 1.4470 + } 1.4471 + i++; 1.4472 + } 1.4473 + piece = mystrsep(&tp, 0); 1.4474 + } 1.4475 + // check to make sure we parsed enough pieces 1.4476 + if (np < 4) { 1.4477 + char * err = pHMgr->encode_flag(aflag); 1.4478 + if (err) { 1.4479 + HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", 1.4480 + af->getlinenum(), err); 1.4481 + free(err); 1.4482 + } 1.4483 + return 1; 1.4484 + } 1.4485 + 1.4486 +#ifdef DEBUG 1.4487 + // detect unnecessary fields, excepting comments 1.4488 + if (basefieldnum) { 1.4489 + int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6); 1.4490 + if (fieldnum != basefieldnum) 1.4491 + HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum()); 1.4492 + } else { 1.4493 + basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6); 1.4494 + } 1.4495 +#endif 1.4496 + } 1.4497 + 1.4498 + // now create SfxEntry or PfxEntry objects and use links to 1.4499 + // build an ordered (sorted by affix string) list 1.4500 + for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { 1.4501 + if (at == 'P') { 1.4502 + PfxEntry * pfxptr = new PfxEntry(this,&(*entry)); 1.4503 + build_pfxtree(pfxptr); 1.4504 + } else { 1.4505 + SfxEntry * sfxptr = new SfxEntry(this,&(*entry)); 1.4506 + build_sfxtree(sfxptr); 1.4507 + } 1.4508 + } 1.4509 + return 0; 1.4510 +} 1.4511 + 1.4512 +int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) { 1.4513 + int condl = strlen(cond); 1.4514 + int i; 1.4515 + int j; 1.4516 + int neg; 1.4517 + int in; 1.4518 + if (ft == 'P') { // prefix 1.4519 + if (strncmp(strip, cond, condl) == 0) return 1; 1.4520 + if (utf8) { 1.4521 + } else { 1.4522 + for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { 1.4523 + if (cond[j] != '[') { 1.4524 + if (cond[j] != strip[i]) { 1.4525 + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); 1.4526 + return 0; 1.4527 + } 1.4528 + } else { 1.4529 + neg = (cond[j+1] == '^') ? 1 : 0; 1.4530 + in = 0; 1.4531 + do { 1.4532 + j++; 1.4533 + if (strip[i] == cond[j]) in = 1; 1.4534 + } while ((j < (condl - 1)) && (cond[j] != ']')); 1.4535 + if (j == (condl - 1) && (cond[j] != ']')) { 1.4536 + HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond); 1.4537 + return 0; 1.4538 + } 1.4539 + if ((!neg && !in) || (neg && in)) { 1.4540 + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); 1.4541 + return 0; 1.4542 + } 1.4543 + } 1.4544 + } 1.4545 + if (j >= condl) return 1; 1.4546 + } 1.4547 + } else { // suffix 1.4548 + if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1; 1.4549 + if (utf8) { 1.4550 + } else { 1.4551 + for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { 1.4552 + if (cond[j] != ']') { 1.4553 + if (cond[j] != strip[i]) { 1.4554 + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); 1.4555 + return 0; 1.4556 + } 1.4557 + } else { 1.4558 + in = 0; 1.4559 + do { 1.4560 + j--; 1.4561 + if (strip[i] == cond[j]) in = 1; 1.4562 + } while ((j > 0) && (cond[j] != '[')); 1.4563 + if ((j == 0) && (cond[j] != '[')) { 1.4564 + HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond); 1.4565 + return 0; 1.4566 + } 1.4567 + neg = (cond[j+1] == '^') ? 1 : 0; 1.4568 + if ((!neg && !in) || (neg && in)) { 1.4569 + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); 1.4570 + return 0; 1.4571 + } 1.4572 + } 1.4573 + } 1.4574 + if (j < 0) return 1; 1.4575 + } 1.4576 + } 1.4577 + return 0; 1.4578 +}