Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /******* BEGIN LICENSE BLOCK ******* |
michael@0 | 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
michael@0 | 3 | * |
michael@0 | 4 | * The contents of this file are subject to the Mozilla Public License Version |
michael@0 | 5 | * 1.1 (the "License"); you may not use this file except in compliance with |
michael@0 | 6 | * the License. You may obtain a copy of the License at |
michael@0 | 7 | * http://www.mozilla.org/MPL/ |
michael@0 | 8 | * |
michael@0 | 9 | * Software distributed under the License is distributed on an "AS IS" basis, |
michael@0 | 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
michael@0 | 11 | * for the specific language governing rights and limitations under the |
michael@0 | 12 | * License. |
michael@0 | 13 | * |
michael@0 | 14 | * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) |
michael@0 | 15 | * and László Németh (Hunspell). Portions created by the Initial Developers |
michael@0 | 16 | * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. |
michael@0 | 17 | * |
michael@0 | 18 | * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) |
michael@0 | 19 | * David Einstein (deinst@world.std.com) |
michael@0 | 20 | * László Németh (nemethl@gyorsposta.hu) |
michael@0 | 21 | * Caolan McNamara (caolanm@redhat.com) |
michael@0 | 22 | * Davide Prina |
michael@0 | 23 | * Giuseppe Modugno |
michael@0 | 24 | * Gianluca Turconi |
michael@0 | 25 | * Simon Brouwer |
michael@0 | 26 | * Noll Janos |
michael@0 | 27 | * Biro Arpad |
michael@0 | 28 | * Goldman Eleonora |
michael@0 | 29 | * Sarlos Tamas |
michael@0 | 30 | * Bencsath Boldizsar |
michael@0 | 31 | * Halacsy Peter |
michael@0 | 32 | * Dvornik Laszlo |
michael@0 | 33 | * Gefferth Andras |
michael@0 | 34 | * Nagy Viktor |
michael@0 | 35 | * Varga Daniel |
michael@0 | 36 | * Chris Halls |
michael@0 | 37 | * Rene Engelhard |
michael@0 | 38 | * Bram Moolenaar |
michael@0 | 39 | * Dafydd Jones |
michael@0 | 40 | * Harri Pitkanen |
michael@0 | 41 | * Andras Timar |
michael@0 | 42 | * Tor Lillqvist |
michael@0 | 43 | * |
michael@0 | 44 | * Alternatively, the contents of this file may be used under the terms of |
michael@0 | 45 | * either the GNU General Public License Version 2 or later (the "GPL"), or |
michael@0 | 46 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
michael@0 | 47 | * in which case the provisions of the GPL or the LGPL are applicable instead |
michael@0 | 48 | * of those above. If you wish to allow use of your version of this file only |
michael@0 | 49 | * under the terms of either the GPL or the LGPL, and not to allow others to |
michael@0 | 50 | * use your version of this file under the terms of the MPL, indicate your |
michael@0 | 51 | * decision by deleting the provisions above and replace them with the notice |
michael@0 | 52 | * and other provisions required by the GPL or the LGPL. If you do not delete |
michael@0 | 53 | * the provisions above, a recipient may use your version of this file under |
michael@0 | 54 | * the terms of any one of the MPL, the GPL or the LGPL. |
michael@0 | 55 | * |
michael@0 | 56 | ******* END LICENSE BLOCK *******/ |
michael@0 | 57 | |
michael@0 | 58 | #include <stdlib.h> |
michael@0 | 59 | #include <string.h> |
michael@0 | 60 | #include <stdio.h> |
michael@0 | 61 | #include <ctype.h> |
michael@0 | 62 | |
michael@0 | 63 | #include <vector> |
michael@0 | 64 | |
michael@0 | 65 | #include "affixmgr.hxx" |
michael@0 | 66 | #include "affentry.hxx" |
michael@0 | 67 | #include "langnum.hxx" |
michael@0 | 68 | |
michael@0 | 69 | #include "csutil.hxx" |
michael@0 | 70 | |
michael@0 | 71 | AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key) |
michael@0 | 72 | { |
michael@0 | 73 | // register hash manager and load affix data from aff file |
michael@0 | 74 | pHMgr = ptr[0]; |
michael@0 | 75 | alldic = ptr; |
michael@0 | 76 | maxdic = md; |
michael@0 | 77 | keystring = NULL; |
michael@0 | 78 | trystring = NULL; |
michael@0 | 79 | encoding=NULL; |
michael@0 | 80 | csconv=NULL; |
michael@0 | 81 | utf8 = 0; |
michael@0 | 82 | complexprefixes = 0; |
michael@0 | 83 | maptable = NULL; |
michael@0 | 84 | nummap = 0; |
michael@0 | 85 | breaktable = NULL; |
michael@0 | 86 | numbreak = -1; |
michael@0 | 87 | reptable = NULL; |
michael@0 | 88 | numrep = 0; |
michael@0 | 89 | iconvtable = NULL; |
michael@0 | 90 | oconvtable = NULL; |
michael@0 | 91 | checkcpdtable = NULL; |
michael@0 | 92 | // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) |
michael@0 | 93 | simplifiedcpd = 0; |
michael@0 | 94 | numcheckcpd = 0; |
michael@0 | 95 | defcpdtable = NULL; |
michael@0 | 96 | numdefcpd = 0; |
michael@0 | 97 | phone = NULL; |
michael@0 | 98 | compoundflag = FLAG_NULL; // permits word in compound forms |
michael@0 | 99 | compoundbegin = FLAG_NULL; // may be first word in compound forms |
michael@0 | 100 | compoundmiddle = FLAG_NULL; // may be middle word in compound forms |
michael@0 | 101 | compoundend = FLAG_NULL; // may be last word in compound forms |
michael@0 | 102 | compoundroot = FLAG_NULL; // compound word signing flag |
michael@0 | 103 | compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word |
michael@0 | 104 | compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word |
michael@0 | 105 | checkcompounddup = 0; // forbid double words in compounds |
michael@0 | 106 | checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) |
michael@0 | 107 | checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds |
michael@0 | 108 | checkcompoundtriple = 0; // forbid compounds with triple letters |
michael@0 | 109 | simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt) |
michael@0 | 110 | forbiddenword = FORBIDDENWORD; // forbidden word signing flag |
michael@0 | 111 | nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag |
michael@0 | 112 | nongramsuggest = FLAG_NULL; |
michael@0 | 113 | lang = NULL; // language |
michael@0 | 114 | langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) |
michael@0 | 115 | needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes |
michael@0 | 116 | cpdwordmax = -1; // default: unlimited wordcount in compound words |
michael@0 | 117 | cpdmin = -1; // undefined |
michael@0 | 118 | cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words |
michael@0 | 119 | cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX) |
michael@0 | 120 | cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search) |
michael@0 | 121 | cpdvowels_utf16_len=0; // vowels |
michael@0 | 122 | pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG |
michael@0 | 123 | sfxappnd=NULL; // previous suffix for counting a special syllables BUG |
michael@0 | 124 | cpdsyllablenum=NULL; // syllable count incrementing flag |
michael@0 | 125 | checknum=0; // checking numbers, and word with numbers |
michael@0 | 126 | wordchars=NULL; // letters + spec. word characters |
michael@0 | 127 | wordchars_utf16=NULL; // letters + spec. word characters |
michael@0 | 128 | wordchars_utf16_len=0; // letters + spec. word characters |
michael@0 | 129 | ignorechars=NULL; // letters + spec. word characters |
michael@0 | 130 | ignorechars_utf16=NULL; // letters + spec. word characters |
michael@0 | 131 | ignorechars_utf16_len=0; // letters + spec. word characters |
michael@0 | 132 | version=NULL; // affix and dictionary file version string |
michael@0 | 133 | havecontclass=0; // flags of possible continuing classes (double affix) |
michael@0 | 134 | // LEMMA_PRESENT: not put root into the morphological output. Lemma presents |
michael@0 | 135 | // in morhological description in dictionary file. It's often combined with PSEUDOROOT. |
michael@0 | 136 | lemma_present = FLAG_NULL; |
michael@0 | 137 | circumfix = FLAG_NULL; |
michael@0 | 138 | onlyincompound = FLAG_NULL; |
michael@0 | 139 | maxngramsugs = -1; // undefined |
michael@0 | 140 | maxdiff = -1; // undefined |
michael@0 | 141 | onlymaxdiff = 0; |
michael@0 | 142 | maxcpdsugs = -1; // undefined |
michael@0 | 143 | nosplitsugs = 0; |
michael@0 | 144 | sugswithdots = 0; |
michael@0 | 145 | keepcase = 0; |
michael@0 | 146 | forceucase = 0; |
michael@0 | 147 | warn = 0; |
michael@0 | 148 | forbidwarn = 0; |
michael@0 | 149 | checksharps = 0; |
michael@0 | 150 | substandard = FLAG_NULL; |
michael@0 | 151 | fullstrip = 0; |
michael@0 | 152 | |
michael@0 | 153 | sfx = NULL; |
michael@0 | 154 | pfx = NULL; |
michael@0 | 155 | |
michael@0 | 156 | for (int i=0; i < SETSIZE; i++) { |
michael@0 | 157 | pStart[i] = NULL; |
michael@0 | 158 | sStart[i] = NULL; |
michael@0 | 159 | pFlag[i] = NULL; |
michael@0 | 160 | sFlag[i] = NULL; |
michael@0 | 161 | } |
michael@0 | 162 | |
michael@0 | 163 | for (int j=0; j < CONTSIZE; j++) { |
michael@0 | 164 | contclasses[j] = 0; |
michael@0 | 165 | } |
michael@0 | 166 | |
michael@0 | 167 | if (parse_file(affpath, key)) { |
michael@0 | 168 | HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); |
michael@0 | 169 | } |
michael@0 | 170 | |
michael@0 | 171 | if (cpdmin == -1) cpdmin = MINCPDLEN; |
michael@0 | 172 | |
michael@0 | 173 | } |
michael@0 | 174 | |
michael@0 | 175 | |
michael@0 | 176 | AffixMgr::~AffixMgr() |
michael@0 | 177 | { |
michael@0 | 178 | // pass through linked prefix entries and clean up |
michael@0 | 179 | for (int i=0; i < SETSIZE ;i++) { |
michael@0 | 180 | pFlag[i] = NULL; |
michael@0 | 181 | PfxEntry * ptr = pStart[i]; |
michael@0 | 182 | PfxEntry * nptr = NULL; |
michael@0 | 183 | while (ptr) { |
michael@0 | 184 | nptr = ptr->getNext(); |
michael@0 | 185 | delete(ptr); |
michael@0 | 186 | ptr = nptr; |
michael@0 | 187 | nptr = NULL; |
michael@0 | 188 | } |
michael@0 | 189 | } |
michael@0 | 190 | |
michael@0 | 191 | // pass through linked suffix entries and clean up |
michael@0 | 192 | for (int j=0; j < SETSIZE ; j++) { |
michael@0 | 193 | sFlag[j] = NULL; |
michael@0 | 194 | SfxEntry * ptr = sStart[j]; |
michael@0 | 195 | SfxEntry * nptr = NULL; |
michael@0 | 196 | while (ptr) { |
michael@0 | 197 | nptr = ptr->getNext(); |
michael@0 | 198 | delete(ptr); |
michael@0 | 199 | ptr = nptr; |
michael@0 | 200 | nptr = NULL; |
michael@0 | 201 | } |
michael@0 | 202 | sStart[j] = NULL; |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | if (keystring) free(keystring); |
michael@0 | 206 | keystring=NULL; |
michael@0 | 207 | if (trystring) free(trystring); |
michael@0 | 208 | trystring=NULL; |
michael@0 | 209 | if (encoding) free(encoding); |
michael@0 | 210 | encoding=NULL; |
michael@0 | 211 | if (maptable) { |
michael@0 | 212 | for (int j=0; j < nummap; j++) { |
michael@0 | 213 | for (int k=0; k < maptable[j].len; k++) { |
michael@0 | 214 | if (maptable[j].set[k]) free(maptable[j].set[k]); |
michael@0 | 215 | } |
michael@0 | 216 | free(maptable[j].set); |
michael@0 | 217 | maptable[j].set = NULL; |
michael@0 | 218 | maptable[j].len = 0; |
michael@0 | 219 | } |
michael@0 | 220 | free(maptable); |
michael@0 | 221 | maptable = NULL; |
michael@0 | 222 | } |
michael@0 | 223 | nummap = 0; |
michael@0 | 224 | if (breaktable) { |
michael@0 | 225 | for (int j=0; j < numbreak; j++) { |
michael@0 | 226 | if (breaktable[j]) free(breaktable[j]); |
michael@0 | 227 | breaktable[j] = NULL; |
michael@0 | 228 | } |
michael@0 | 229 | free(breaktable); |
michael@0 | 230 | breaktable = NULL; |
michael@0 | 231 | } |
michael@0 | 232 | numbreak = 0; |
michael@0 | 233 | if (reptable) { |
michael@0 | 234 | for (int j=0; j < numrep; j++) { |
michael@0 | 235 | free(reptable[j].pattern); |
michael@0 | 236 | free(reptable[j].pattern2); |
michael@0 | 237 | } |
michael@0 | 238 | free(reptable); |
michael@0 | 239 | reptable = NULL; |
michael@0 | 240 | } |
michael@0 | 241 | if (iconvtable) delete iconvtable; |
michael@0 | 242 | if (oconvtable) delete oconvtable; |
michael@0 | 243 | if (phone && phone->rules) { |
michael@0 | 244 | for (int j=0; j < phone->num + 1; j++) { |
michael@0 | 245 | free(phone->rules[j * 2]); |
michael@0 | 246 | free(phone->rules[j * 2 + 1]); |
michael@0 | 247 | } |
michael@0 | 248 | free(phone->rules); |
michael@0 | 249 | free(phone); |
michael@0 | 250 | phone = NULL; |
michael@0 | 251 | } |
michael@0 | 252 | |
michael@0 | 253 | if (defcpdtable) { |
michael@0 | 254 | for (int j=0; j < numdefcpd; j++) { |
michael@0 | 255 | free(defcpdtable[j].def); |
michael@0 | 256 | defcpdtable[j].def = NULL; |
michael@0 | 257 | } |
michael@0 | 258 | free(defcpdtable); |
michael@0 | 259 | defcpdtable = NULL; |
michael@0 | 260 | } |
michael@0 | 261 | numrep = 0; |
michael@0 | 262 | if (checkcpdtable) { |
michael@0 | 263 | for (int j=0; j < numcheckcpd; j++) { |
michael@0 | 264 | free(checkcpdtable[j].pattern); |
michael@0 | 265 | free(checkcpdtable[j].pattern2); |
michael@0 | 266 | free(checkcpdtable[j].pattern3); |
michael@0 | 267 | checkcpdtable[j].pattern = NULL; |
michael@0 | 268 | checkcpdtable[j].pattern2 = NULL; |
michael@0 | 269 | checkcpdtable[j].pattern3 = NULL; |
michael@0 | 270 | } |
michael@0 | 271 | free(checkcpdtable); |
michael@0 | 272 | checkcpdtable = NULL; |
michael@0 | 273 | } |
michael@0 | 274 | numcheckcpd = 0; |
michael@0 | 275 | FREE_FLAG(compoundflag); |
michael@0 | 276 | FREE_FLAG(compoundbegin); |
michael@0 | 277 | FREE_FLAG(compoundmiddle); |
michael@0 | 278 | FREE_FLAG(compoundend); |
michael@0 | 279 | FREE_FLAG(compoundpermitflag); |
michael@0 | 280 | FREE_FLAG(compoundforbidflag); |
michael@0 | 281 | FREE_FLAG(compoundroot); |
michael@0 | 282 | FREE_FLAG(forbiddenword); |
michael@0 | 283 | FREE_FLAG(nosuggest); |
michael@0 | 284 | FREE_FLAG(nongramsuggest); |
michael@0 | 285 | FREE_FLAG(needaffix); |
michael@0 | 286 | FREE_FLAG(lemma_present); |
michael@0 | 287 | FREE_FLAG(circumfix); |
michael@0 | 288 | FREE_FLAG(onlyincompound); |
michael@0 | 289 | |
michael@0 | 290 | cpdwordmax = 0; |
michael@0 | 291 | pHMgr = NULL; |
michael@0 | 292 | cpdmin = 0; |
michael@0 | 293 | cpdmaxsyllable = 0; |
michael@0 | 294 | if (cpdvowels) free(cpdvowels); |
michael@0 | 295 | if (cpdvowels_utf16) free(cpdvowels_utf16); |
michael@0 | 296 | if (cpdsyllablenum) free(cpdsyllablenum); |
michael@0 | 297 | free_utf_tbl(); |
michael@0 | 298 | if (lang) free(lang); |
michael@0 | 299 | if (wordchars) free(wordchars); |
michael@0 | 300 | if (wordchars_utf16) free(wordchars_utf16); |
michael@0 | 301 | if (ignorechars) free(ignorechars); |
michael@0 | 302 | if (ignorechars_utf16) free(ignorechars_utf16); |
michael@0 | 303 | if (version) free(version); |
michael@0 | 304 | checknum=0; |
michael@0 | 305 | #ifdef MOZILLA_CLIENT |
michael@0 | 306 | delete [] csconv; |
michael@0 | 307 | #endif |
michael@0 | 308 | } |
michael@0 | 309 | |
michael@0 | 310 | |
michael@0 | 311 | // read in aff file and build up prefix and suffix entry objects |
michael@0 | 312 | int AffixMgr::parse_file(const char * affpath, const char * key) |
michael@0 | 313 | { |
michael@0 | 314 | char * line; // io buffers |
michael@0 | 315 | char ft; // affix type |
michael@0 | 316 | |
michael@0 | 317 | // checking flag duplication |
michael@0 | 318 | char dupflags[CONTSIZE]; |
michael@0 | 319 | char dupflags_ini = 1; |
michael@0 | 320 | |
michael@0 | 321 | // first line indicator for removing byte order mark |
michael@0 | 322 | int firstline = 1; |
michael@0 | 323 | |
michael@0 | 324 | // open the affix file |
michael@0 | 325 | FileMgr * afflst = new FileMgr(affpath, key); |
michael@0 | 326 | if (!afflst) { |
michael@0 | 327 | HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); |
michael@0 | 328 | return 1; |
michael@0 | 329 | } |
michael@0 | 330 | |
michael@0 | 331 | // step one is to parse the affix file building up the internal |
michael@0 | 332 | // affix data structures |
michael@0 | 333 | |
michael@0 | 334 | // read in each line ignoring any that do not |
michael@0 | 335 | // start with a known line type indicator |
michael@0 | 336 | while ((line = afflst->getline())) { |
michael@0 | 337 | mychomp(line); |
michael@0 | 338 | |
michael@0 | 339 | /* remove byte order mark */ |
michael@0 | 340 | if (firstline) { |
michael@0 | 341 | firstline = 0; |
michael@0 | 342 | // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions |
michael@0 | 343 | if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { |
michael@0 | 344 | memmove(line, line+3, strlen(line+3)+1); |
michael@0 | 345 | } |
michael@0 | 346 | } |
michael@0 | 347 | |
michael@0 | 348 | /* parse in the keyboard string */ |
michael@0 | 349 | if (strncmp(line,"KEY",3) == 0) { |
michael@0 | 350 | if (parse_string(line, &keystring, afflst->getlinenum())) { |
michael@0 | 351 | delete afflst; |
michael@0 | 352 | return 1; |
michael@0 | 353 | } |
michael@0 | 354 | } |
michael@0 | 355 | |
michael@0 | 356 | /* parse in the try string */ |
michael@0 | 357 | if (strncmp(line,"TRY",3) == 0) { |
michael@0 | 358 | if (parse_string(line, &trystring, afflst->getlinenum())) { |
michael@0 | 359 | delete afflst; |
michael@0 | 360 | return 1; |
michael@0 | 361 | } |
michael@0 | 362 | } |
michael@0 | 363 | |
michael@0 | 364 | /* parse in the name of the character set used by the .dict and .aff */ |
michael@0 | 365 | if (strncmp(line,"SET",3) == 0) { |
michael@0 | 366 | if (parse_string(line, &encoding, afflst->getlinenum())) { |
michael@0 | 367 | delete afflst; |
michael@0 | 368 | return 1; |
michael@0 | 369 | } |
michael@0 | 370 | if (strcmp(encoding, "UTF-8") == 0) { |
michael@0 | 371 | utf8 = 1; |
michael@0 | 372 | #ifndef OPENOFFICEORG |
michael@0 | 373 | #ifndef MOZILLA_CLIENT |
michael@0 | 374 | if (initialize_utf_tbl()) return 1; |
michael@0 | 375 | #endif |
michael@0 | 376 | #endif |
michael@0 | 377 | } |
michael@0 | 378 | } |
michael@0 | 379 | |
michael@0 | 380 | /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */ |
michael@0 | 381 | if (strncmp(line,"COMPLEXPREFIXES",15) == 0) |
michael@0 | 382 | complexprefixes = 1; |
michael@0 | 383 | |
michael@0 | 384 | /* parse in the flag used by the controlled compound words */ |
michael@0 | 385 | if (strncmp(line,"COMPOUNDFLAG",12) == 0) { |
michael@0 | 386 | if (parse_flag(line, &compoundflag, afflst)) { |
michael@0 | 387 | delete afflst; |
michael@0 | 388 | return 1; |
michael@0 | 389 | } |
michael@0 | 390 | } |
michael@0 | 391 | |
michael@0 | 392 | /* parse in the flag used by compound words */ |
michael@0 | 393 | if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { |
michael@0 | 394 | if (complexprefixes) { |
michael@0 | 395 | if (parse_flag(line, &compoundend, afflst)) { |
michael@0 | 396 | delete afflst; |
michael@0 | 397 | return 1; |
michael@0 | 398 | } |
michael@0 | 399 | } else { |
michael@0 | 400 | if (parse_flag(line, &compoundbegin, afflst)) { |
michael@0 | 401 | delete afflst; |
michael@0 | 402 | return 1; |
michael@0 | 403 | } |
michael@0 | 404 | } |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | /* parse in the flag used by compound words */ |
michael@0 | 408 | if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { |
michael@0 | 409 | if (parse_flag(line, &compoundmiddle, afflst)) { |
michael@0 | 410 | delete afflst; |
michael@0 | 411 | return 1; |
michael@0 | 412 | } |
michael@0 | 413 | } |
michael@0 | 414 | /* parse in the flag used by compound words */ |
michael@0 | 415 | if (strncmp(line,"COMPOUNDEND",11) == 0) { |
michael@0 | 416 | if (complexprefixes) { |
michael@0 | 417 | if (parse_flag(line, &compoundbegin, afflst)) { |
michael@0 | 418 | delete afflst; |
michael@0 | 419 | return 1; |
michael@0 | 420 | } |
michael@0 | 421 | } else { |
michael@0 | 422 | if (parse_flag(line, &compoundend, afflst)) { |
michael@0 | 423 | delete afflst; |
michael@0 | 424 | return 1; |
michael@0 | 425 | } |
michael@0 | 426 | } |
michael@0 | 427 | } |
michael@0 | 428 | |
michael@0 | 429 | /* parse in the data used by compound_check() method */ |
michael@0 | 430 | if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { |
michael@0 | 431 | if (parse_num(line, &cpdwordmax, afflst)) { |
michael@0 | 432 | delete afflst; |
michael@0 | 433 | return 1; |
michael@0 | 434 | } |
michael@0 | 435 | } |
michael@0 | 436 | |
michael@0 | 437 | /* parse in the flag sign compounds in dictionary */ |
michael@0 | 438 | if (strncmp(line,"COMPOUNDROOT",12) == 0) { |
michael@0 | 439 | if (parse_flag(line, &compoundroot, afflst)) { |
michael@0 | 440 | delete afflst; |
michael@0 | 441 | return 1; |
michael@0 | 442 | } |
michael@0 | 443 | } |
michael@0 | 444 | |
michael@0 | 445 | /* parse in the flag used by compound_check() method */ |
michael@0 | 446 | if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { |
michael@0 | 447 | if (parse_flag(line, &compoundpermitflag, afflst)) { |
michael@0 | 448 | delete afflst; |
michael@0 | 449 | return 1; |
michael@0 | 450 | } |
michael@0 | 451 | } |
michael@0 | 452 | |
michael@0 | 453 | /* parse in the flag used by compound_check() method */ |
michael@0 | 454 | if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { |
michael@0 | 455 | if (parse_flag(line, &compoundforbidflag, afflst)) { |
michael@0 | 456 | delete afflst; |
michael@0 | 457 | return 1; |
michael@0 | 458 | } |
michael@0 | 459 | } |
michael@0 | 460 | |
michael@0 | 461 | if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) { |
michael@0 | 462 | checkcompounddup = 1; |
michael@0 | 463 | } |
michael@0 | 464 | |
michael@0 | 465 | if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) { |
michael@0 | 466 | checkcompoundrep = 1; |
michael@0 | 467 | } |
michael@0 | 468 | |
michael@0 | 469 | if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) { |
michael@0 | 470 | checkcompoundtriple = 1; |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { |
michael@0 | 474 | simplifiedtriple = 1; |
michael@0 | 475 | } |
michael@0 | 476 | |
michael@0 | 477 | if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { |
michael@0 | 478 | checkcompoundcase = 1; |
michael@0 | 479 | } |
michael@0 | 480 | |
michael@0 | 481 | if (strncmp(line,"NOSUGGEST",9) == 0) { |
michael@0 | 482 | if (parse_flag(line, &nosuggest, afflst)) { |
michael@0 | 483 | delete afflst; |
michael@0 | 484 | return 1; |
michael@0 | 485 | } |
michael@0 | 486 | } |
michael@0 | 487 | |
michael@0 | 488 | if (strncmp(line,"NONGRAMSUGGEST",14) == 0) { |
michael@0 | 489 | if (parse_flag(line, &nongramsuggest, afflst)) { |
michael@0 | 490 | delete afflst; |
michael@0 | 491 | return 1; |
michael@0 | 492 | } |
michael@0 | 493 | } |
michael@0 | 494 | |
michael@0 | 495 | /* parse in the flag used by forbidden words */ |
michael@0 | 496 | if (strncmp(line,"FORBIDDENWORD",13) == 0) { |
michael@0 | 497 | if (parse_flag(line, &forbiddenword, afflst)) { |
michael@0 | 498 | delete afflst; |
michael@0 | 499 | return 1; |
michael@0 | 500 | } |
michael@0 | 501 | } |
michael@0 | 502 | |
michael@0 | 503 | /* parse in the flag used by forbidden words */ |
michael@0 | 504 | if (strncmp(line,"LEMMA_PRESENT",13) == 0) { |
michael@0 | 505 | if (parse_flag(line, &lemma_present, afflst)) { |
michael@0 | 506 | delete afflst; |
michael@0 | 507 | return 1; |
michael@0 | 508 | } |
michael@0 | 509 | } |
michael@0 | 510 | |
michael@0 | 511 | /* parse in the flag used by circumfixes */ |
michael@0 | 512 | if (strncmp(line,"CIRCUMFIX",9) == 0) { |
michael@0 | 513 | if (parse_flag(line, &circumfix, afflst)) { |
michael@0 | 514 | delete afflst; |
michael@0 | 515 | return 1; |
michael@0 | 516 | } |
michael@0 | 517 | } |
michael@0 | 518 | |
michael@0 | 519 | /* parse in the flag used by fogemorphemes */ |
michael@0 | 520 | if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { |
michael@0 | 521 | if (parse_flag(line, &onlyincompound, afflst)) { |
michael@0 | 522 | delete afflst; |
michael@0 | 523 | return 1; |
michael@0 | 524 | } |
michael@0 | 525 | } |
michael@0 | 526 | |
michael@0 | 527 | /* parse in the flag used by `needaffixs' */ |
michael@0 | 528 | if (strncmp(line,"PSEUDOROOT",10) == 0) { |
michael@0 | 529 | if (parse_flag(line, &needaffix, afflst)) { |
michael@0 | 530 | delete afflst; |
michael@0 | 531 | return 1; |
michael@0 | 532 | } |
michael@0 | 533 | } |
michael@0 | 534 | |
michael@0 | 535 | /* parse in the flag used by `needaffixs' */ |
michael@0 | 536 | if (strncmp(line,"NEEDAFFIX",9) == 0) { |
michael@0 | 537 | if (parse_flag(line, &needaffix, afflst)) { |
michael@0 | 538 | delete afflst; |
michael@0 | 539 | return 1; |
michael@0 | 540 | } |
michael@0 | 541 | } |
michael@0 | 542 | |
michael@0 | 543 | /* parse in the minimal length for words in compounds */ |
michael@0 | 544 | if (strncmp(line,"COMPOUNDMIN",11) == 0) { |
michael@0 | 545 | if (parse_num(line, &cpdmin, afflst)) { |
michael@0 | 546 | delete afflst; |
michael@0 | 547 | return 1; |
michael@0 | 548 | } |
michael@0 | 549 | if (cpdmin < 1) cpdmin = 1; |
michael@0 | 550 | } |
michael@0 | 551 | |
michael@0 | 552 | /* parse in the max. words and syllables in compounds */ |
michael@0 | 553 | if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { |
michael@0 | 554 | if (parse_cpdsyllable(line, afflst)) { |
michael@0 | 555 | delete afflst; |
michael@0 | 556 | return 1; |
michael@0 | 557 | } |
michael@0 | 558 | } |
michael@0 | 559 | |
michael@0 | 560 | /* parse in the flag used by compound_check() method */ |
michael@0 | 561 | if (strncmp(line,"SYLLABLENUM",11) == 0) { |
michael@0 | 562 | if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { |
michael@0 | 563 | delete afflst; |
michael@0 | 564 | return 1; |
michael@0 | 565 | } |
michael@0 | 566 | } |
michael@0 | 567 | |
michael@0 | 568 | /* parse in the flag used by the controlled compound words */ |
michael@0 | 569 | if (strncmp(line,"CHECKNUM",8) == 0) { |
michael@0 | 570 | checknum=1; |
michael@0 | 571 | } |
michael@0 | 572 | |
michael@0 | 573 | /* parse in the extra word characters */ |
michael@0 | 574 | if (strncmp(line,"WORDCHARS",9) == 0) { |
michael@0 | 575 | if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) { |
michael@0 | 576 | delete afflst; |
michael@0 | 577 | return 1; |
michael@0 | 578 | } |
michael@0 | 579 | } |
michael@0 | 580 | |
michael@0 | 581 | /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ |
michael@0 | 582 | if (strncmp(line,"IGNORE",6) == 0) { |
michael@0 | 583 | if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { |
michael@0 | 584 | delete afflst; |
michael@0 | 585 | return 1; |
michael@0 | 586 | } |
michael@0 | 587 | } |
michael@0 | 588 | |
michael@0 | 589 | /* parse in the typical fault correcting table */ |
michael@0 | 590 | if (strncmp(line,"REP",3) == 0) { |
michael@0 | 591 | if (parse_reptable(line, afflst)) { |
michael@0 | 592 | delete afflst; |
michael@0 | 593 | return 1; |
michael@0 | 594 | } |
michael@0 | 595 | } |
michael@0 | 596 | |
michael@0 | 597 | /* parse in the input conversion table */ |
michael@0 | 598 | if (strncmp(line,"ICONV",5) == 0) { |
michael@0 | 599 | if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { |
michael@0 | 600 | delete afflst; |
michael@0 | 601 | return 1; |
michael@0 | 602 | } |
michael@0 | 603 | } |
michael@0 | 604 | |
michael@0 | 605 | /* parse in the input conversion table */ |
michael@0 | 606 | if (strncmp(line,"OCONV",5) == 0) { |
michael@0 | 607 | if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { |
michael@0 | 608 | delete afflst; |
michael@0 | 609 | return 1; |
michael@0 | 610 | } |
michael@0 | 611 | } |
michael@0 | 612 | |
michael@0 | 613 | /* parse in the phonetic translation table */ |
michael@0 | 614 | if (strncmp(line,"PHONE",5) == 0) { |
michael@0 | 615 | if (parse_phonetable(line, afflst)) { |
michael@0 | 616 | delete afflst; |
michael@0 | 617 | return 1; |
michael@0 | 618 | } |
michael@0 | 619 | } |
michael@0 | 620 | |
michael@0 | 621 | /* parse in the checkcompoundpattern table */ |
michael@0 | 622 | if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { |
michael@0 | 623 | if (parse_checkcpdtable(line, afflst)) { |
michael@0 | 624 | delete afflst; |
michael@0 | 625 | return 1; |
michael@0 | 626 | } |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | /* parse in the defcompound table */ |
michael@0 | 630 | if (strncmp(line,"COMPOUNDRULE",12) == 0) { |
michael@0 | 631 | if (parse_defcpdtable(line, afflst)) { |
michael@0 | 632 | delete afflst; |
michael@0 | 633 | return 1; |
michael@0 | 634 | } |
michael@0 | 635 | } |
michael@0 | 636 | |
michael@0 | 637 | /* parse in the related character map table */ |
michael@0 | 638 | if (strncmp(line,"MAP",3) == 0) { |
michael@0 | 639 | if (parse_maptable(line, afflst)) { |
michael@0 | 640 | delete afflst; |
michael@0 | 641 | return 1; |
michael@0 | 642 | } |
michael@0 | 643 | } |
michael@0 | 644 | |
michael@0 | 645 | /* parse in the word breakpoints table */ |
michael@0 | 646 | if (strncmp(line,"BREAK",5) == 0) { |
michael@0 | 647 | if (parse_breaktable(line, afflst)) { |
michael@0 | 648 | delete afflst; |
michael@0 | 649 | return 1; |
michael@0 | 650 | } |
michael@0 | 651 | } |
michael@0 | 652 | |
michael@0 | 653 | /* parse in the language for language specific codes */ |
michael@0 | 654 | if (strncmp(line,"LANG",4) == 0) { |
michael@0 | 655 | if (parse_string(line, &lang, afflst->getlinenum())) { |
michael@0 | 656 | delete afflst; |
michael@0 | 657 | return 1; |
michael@0 | 658 | } |
michael@0 | 659 | langnum = get_lang_num(lang); |
michael@0 | 660 | } |
michael@0 | 661 | |
michael@0 | 662 | if (strncmp(line,"VERSION",7) == 0) { |
michael@0 | 663 | for(line = line + 7; *line == ' ' || *line == '\t'; line++); |
michael@0 | 664 | version = mystrdup(line); |
michael@0 | 665 | } |
michael@0 | 666 | |
michael@0 | 667 | if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { |
michael@0 | 668 | if (parse_num(line, &maxngramsugs, afflst)) { |
michael@0 | 669 | delete afflst; |
michael@0 | 670 | return 1; |
michael@0 | 671 | } |
michael@0 | 672 | } |
michael@0 | 673 | |
michael@0 | 674 | if (strncmp(line,"ONLYMAXDIFF", 11) == 0) |
michael@0 | 675 | onlymaxdiff = 1; |
michael@0 | 676 | |
michael@0 | 677 | if (strncmp(line,"MAXDIFF",7) == 0) { |
michael@0 | 678 | if (parse_num(line, &maxdiff, afflst)) { |
michael@0 | 679 | delete afflst; |
michael@0 | 680 | return 1; |
michael@0 | 681 | } |
michael@0 | 682 | } |
michael@0 | 683 | |
michael@0 | 684 | if (strncmp(line,"MAXCPDSUGS",10) == 0) { |
michael@0 | 685 | if (parse_num(line, &maxcpdsugs, afflst)) { |
michael@0 | 686 | delete afflst; |
michael@0 | 687 | return 1; |
michael@0 | 688 | } |
michael@0 | 689 | } |
michael@0 | 690 | |
michael@0 | 691 | if (strncmp(line,"NOSPLITSUGS",11) == 0) { |
michael@0 | 692 | nosplitsugs=1; |
michael@0 | 693 | } |
michael@0 | 694 | |
michael@0 | 695 | if (strncmp(line,"FULLSTRIP",9) == 0) { |
michael@0 | 696 | fullstrip=1; |
michael@0 | 697 | } |
michael@0 | 698 | |
michael@0 | 699 | if (strncmp(line,"SUGSWITHDOTS",12) == 0) { |
michael@0 | 700 | sugswithdots=1; |
michael@0 | 701 | } |
michael@0 | 702 | |
michael@0 | 703 | /* parse in the flag used by forbidden words */ |
michael@0 | 704 | if (strncmp(line,"KEEPCASE",8) == 0) { |
michael@0 | 705 | if (parse_flag(line, &keepcase, afflst)) { |
michael@0 | 706 | delete afflst; |
michael@0 | 707 | return 1; |
michael@0 | 708 | } |
michael@0 | 709 | } |
michael@0 | 710 | |
michael@0 | 711 | /* parse in the flag used by `forceucase' */ |
michael@0 | 712 | if (strncmp(line,"FORCEUCASE",10) == 0) { |
michael@0 | 713 | if (parse_flag(line, &forceucase, afflst)) { |
michael@0 | 714 | delete afflst; |
michael@0 | 715 | return 1; |
michael@0 | 716 | } |
michael@0 | 717 | } |
michael@0 | 718 | |
michael@0 | 719 | /* parse in the flag used by `warn' */ |
michael@0 | 720 | if (strncmp(line,"WARN",4) == 0) { |
michael@0 | 721 | if (parse_flag(line, &warn, afflst)) { |
michael@0 | 722 | delete afflst; |
michael@0 | 723 | return 1; |
michael@0 | 724 | } |
michael@0 | 725 | } |
michael@0 | 726 | |
michael@0 | 727 | if (strncmp(line,"FORBIDWARN",10) == 0) { |
michael@0 | 728 | forbidwarn=1; |
michael@0 | 729 | } |
michael@0 | 730 | |
michael@0 | 731 | /* parse in the flag used by the affix generator */ |
michael@0 | 732 | if (strncmp(line,"SUBSTANDARD",11) == 0) { |
michael@0 | 733 | if (parse_flag(line, &substandard, afflst)) { |
michael@0 | 734 | delete afflst; |
michael@0 | 735 | return 1; |
michael@0 | 736 | } |
michael@0 | 737 | } |
michael@0 | 738 | |
michael@0 | 739 | if (strncmp(line,"CHECKSHARPS",11) == 0) { |
michael@0 | 740 | checksharps=1; |
michael@0 | 741 | } |
michael@0 | 742 | |
michael@0 | 743 | /* parse this affix: P - prefix, S - suffix */ |
michael@0 | 744 | ft = ' '; |
michael@0 | 745 | if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; |
michael@0 | 746 | if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; |
michael@0 | 747 | if (ft != ' ') { |
michael@0 | 748 | if (dupflags_ini) { |
michael@0 | 749 | memset(dupflags, 0, sizeof(dupflags)); |
michael@0 | 750 | dupflags_ini = 0; |
michael@0 | 751 | } |
michael@0 | 752 | if (parse_affix(line, ft, afflst, dupflags)) { |
michael@0 | 753 | delete afflst; |
michael@0 | 754 | process_pfx_tree_to_list(); |
michael@0 | 755 | process_sfx_tree_to_list(); |
michael@0 | 756 | return 1; |
michael@0 | 757 | } |
michael@0 | 758 | } |
michael@0 | 759 | |
michael@0 | 760 | } |
michael@0 | 761 | delete afflst; |
michael@0 | 762 | |
michael@0 | 763 | // convert affix trees to sorted list |
michael@0 | 764 | process_pfx_tree_to_list(); |
michael@0 | 765 | process_sfx_tree_to_list(); |
michael@0 | 766 | |
michael@0 | 767 | // now we can speed up performance greatly taking advantage of the |
michael@0 | 768 | // relationship between the affixes and the idea of "subsets". |
michael@0 | 769 | |
michael@0 | 770 | // View each prefix as a potential leading subset of another and view |
michael@0 | 771 | // each suffix (reversed) as a potential trailing subset of another. |
michael@0 | 772 | |
michael@0 | 773 | // To illustrate this relationship if we know the prefix "ab" is found in the |
michael@0 | 774 | // word to examine, only prefixes that "ab" is a leading subset of need be examined. |
michael@0 | 775 | // Furthermore is "ab" is not present then none of the prefixes that "ab" is |
michael@0 | 776 | // is a subset need be examined. |
michael@0 | 777 | // The same argument goes for suffix string that are reversed. |
michael@0 | 778 | |
michael@0 | 779 | // Then to top this off why not examine the first char of the word to quickly |
michael@0 | 780 | // limit the set of prefixes to examine (i.e. the prefixes to examine must |
michael@0 | 781 | // be leading supersets of the first character of the word (if they exist) |
michael@0 | 782 | |
michael@0 | 783 | // To take advantage of this "subset" relationship, we need to add two links |
michael@0 | 784 | // from entry. One to take next if the current prefix is found (call it nexteq) |
michael@0 | 785 | // and one to take next if the current prefix is not found (call it nextne). |
michael@0 | 786 | |
michael@0 | 787 | // Since we have built ordered lists, all that remains is to properly initialize |
michael@0 | 788 | // the nextne and nexteq pointers that relate them |
michael@0 | 789 | |
michael@0 | 790 | process_pfx_order(); |
michael@0 | 791 | process_sfx_order(); |
michael@0 | 792 | |
michael@0 | 793 | /* get encoding for CHECKCOMPOUNDCASE */ |
michael@0 | 794 | if (!utf8) { |
michael@0 | 795 | char * enc = get_encoding(); |
michael@0 | 796 | csconv = get_current_cs(enc); |
michael@0 | 797 | free(enc); |
michael@0 | 798 | enc = NULL; |
michael@0 | 799 | |
michael@0 | 800 | char expw[MAXLNLEN]; |
michael@0 | 801 | if (wordchars) { |
michael@0 | 802 | strcpy(expw, wordchars); |
michael@0 | 803 | free(wordchars); |
michael@0 | 804 | } else *expw = '\0'; |
michael@0 | 805 | |
michael@0 | 806 | for (int i = 0; i <= 255; i++) { |
michael@0 | 807 | if ( (csconv[i].cupper != csconv[i].clower) && |
michael@0 | 808 | (! strchr(expw, (char) i))) { |
michael@0 | 809 | *(expw + strlen(expw) + 1) = '\0'; |
michael@0 | 810 | *(expw + strlen(expw)) = (char) i; |
michael@0 | 811 | } |
michael@0 | 812 | } |
michael@0 | 813 | |
michael@0 | 814 | wordchars = mystrdup(expw); |
michael@0 | 815 | } |
michael@0 | 816 | |
michael@0 | 817 | // default BREAK definition |
michael@0 | 818 | if (numbreak == -1) { |
michael@0 | 819 | breaktable = (char **) malloc(sizeof(char *) * 3); |
michael@0 | 820 | if (!breaktable) return 1; |
michael@0 | 821 | breaktable[0] = mystrdup("-"); |
michael@0 | 822 | breaktable[1] = mystrdup("^-"); |
michael@0 | 823 | breaktable[2] = mystrdup("-$"); |
michael@0 | 824 | if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; |
michael@0 | 825 | } |
michael@0 | 826 | return 0; |
michael@0 | 827 | } |
michael@0 | 828 | |
michael@0 | 829 | |
michael@0 | 830 | // we want to be able to quickly access prefix information |
michael@0 | 831 | // both by prefix flag, and sorted by prefix string itself |
michael@0 | 832 | // so we need to set up two indexes |
michael@0 | 833 | |
michael@0 | 834 | int AffixMgr::build_pfxtree(PfxEntry* pfxptr) |
michael@0 | 835 | { |
michael@0 | 836 | PfxEntry * ptr; |
michael@0 | 837 | PfxEntry * pptr; |
michael@0 | 838 | PfxEntry * ep = pfxptr; |
michael@0 | 839 | |
michael@0 | 840 | // get the right starting points |
michael@0 | 841 | const char * key = ep->getKey(); |
michael@0 | 842 | const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); |
michael@0 | 843 | |
michael@0 | 844 | // first index by flag which must exist |
michael@0 | 845 | ptr = pFlag[flg]; |
michael@0 | 846 | ep->setFlgNxt(ptr); |
michael@0 | 847 | pFlag[flg] = ep; |
michael@0 | 848 | |
michael@0 | 849 | |
michael@0 | 850 | // handle the special case of null affix string |
michael@0 | 851 | if (strlen(key) == 0) { |
michael@0 | 852 | // always inset them at head of list at element 0 |
michael@0 | 853 | ptr = pStart[0]; |
michael@0 | 854 | ep->setNext(ptr); |
michael@0 | 855 | pStart[0] = ep; |
michael@0 | 856 | return 0; |
michael@0 | 857 | } |
michael@0 | 858 | |
michael@0 | 859 | // now handle the normal case |
michael@0 | 860 | ep->setNextEQ(NULL); |
michael@0 | 861 | ep->setNextNE(NULL); |
michael@0 | 862 | |
michael@0 | 863 | unsigned char sp = *((const unsigned char *)key); |
michael@0 | 864 | ptr = pStart[sp]; |
michael@0 | 865 | |
michael@0 | 866 | // handle the first insert |
michael@0 | 867 | if (!ptr) { |
michael@0 | 868 | pStart[sp] = ep; |
michael@0 | 869 | return 0; |
michael@0 | 870 | } |
michael@0 | 871 | |
michael@0 | 872 | |
michael@0 | 873 | // otherwise use binary tree insertion so that a sorted |
michael@0 | 874 | // list can easily be generated later |
michael@0 | 875 | pptr = NULL; |
michael@0 | 876 | for (;;) { |
michael@0 | 877 | pptr = ptr; |
michael@0 | 878 | if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { |
michael@0 | 879 | ptr = ptr->getNextEQ(); |
michael@0 | 880 | if (!ptr) { |
michael@0 | 881 | pptr->setNextEQ(ep); |
michael@0 | 882 | break; |
michael@0 | 883 | } |
michael@0 | 884 | } else { |
michael@0 | 885 | ptr = ptr->getNextNE(); |
michael@0 | 886 | if (!ptr) { |
michael@0 | 887 | pptr->setNextNE(ep); |
michael@0 | 888 | break; |
michael@0 | 889 | } |
michael@0 | 890 | } |
michael@0 | 891 | } |
michael@0 | 892 | return 0; |
michael@0 | 893 | } |
michael@0 | 894 | |
michael@0 | 895 | // we want to be able to quickly access suffix information |
michael@0 | 896 | // both by suffix flag, and sorted by the reverse of the |
michael@0 | 897 | // suffix string itself; so we need to set up two indexes |
michael@0 | 898 | int AffixMgr::build_sfxtree(SfxEntry* sfxptr) |
michael@0 | 899 | { |
michael@0 | 900 | SfxEntry * ptr; |
michael@0 | 901 | SfxEntry * pptr; |
michael@0 | 902 | SfxEntry * ep = sfxptr; |
michael@0 | 903 | |
michael@0 | 904 | /* get the right starting point */ |
michael@0 | 905 | const char * key = ep->getKey(); |
michael@0 | 906 | const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF); |
michael@0 | 907 | |
michael@0 | 908 | // first index by flag which must exist |
michael@0 | 909 | ptr = sFlag[flg]; |
michael@0 | 910 | ep->setFlgNxt(ptr); |
michael@0 | 911 | sFlag[flg] = ep; |
michael@0 | 912 | |
michael@0 | 913 | // next index by affix string |
michael@0 | 914 | |
michael@0 | 915 | // handle the special case of null affix string |
michael@0 | 916 | if (strlen(key) == 0) { |
michael@0 | 917 | // always inset them at head of list at element 0 |
michael@0 | 918 | ptr = sStart[0]; |
michael@0 | 919 | ep->setNext(ptr); |
michael@0 | 920 | sStart[0] = ep; |
michael@0 | 921 | return 0; |
michael@0 | 922 | } |
michael@0 | 923 | |
michael@0 | 924 | // now handle the normal case |
michael@0 | 925 | ep->setNextEQ(NULL); |
michael@0 | 926 | ep->setNextNE(NULL); |
michael@0 | 927 | |
michael@0 | 928 | unsigned char sp = *((const unsigned char *)key); |
michael@0 | 929 | ptr = sStart[sp]; |
michael@0 | 930 | |
michael@0 | 931 | // handle the first insert |
michael@0 | 932 | if (!ptr) { |
michael@0 | 933 | sStart[sp] = ep; |
michael@0 | 934 | return 0; |
michael@0 | 935 | } |
michael@0 | 936 | |
michael@0 | 937 | // otherwise use binary tree insertion so that a sorted |
michael@0 | 938 | // list can easily be generated later |
michael@0 | 939 | pptr = NULL; |
michael@0 | 940 | for (;;) { |
michael@0 | 941 | pptr = ptr; |
michael@0 | 942 | if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { |
michael@0 | 943 | ptr = ptr->getNextEQ(); |
michael@0 | 944 | if (!ptr) { |
michael@0 | 945 | pptr->setNextEQ(ep); |
michael@0 | 946 | break; |
michael@0 | 947 | } |
michael@0 | 948 | } else { |
michael@0 | 949 | ptr = ptr->getNextNE(); |
michael@0 | 950 | if (!ptr) { |
michael@0 | 951 | pptr->setNextNE(ep); |
michael@0 | 952 | break; |
michael@0 | 953 | } |
michael@0 | 954 | } |
michael@0 | 955 | } |
michael@0 | 956 | return 0; |
michael@0 | 957 | } |
michael@0 | 958 | |
michael@0 | 959 | // convert from binary tree to sorted list |
michael@0 | 960 | int AffixMgr::process_pfx_tree_to_list() |
michael@0 | 961 | { |
michael@0 | 962 | for (int i=1; i< SETSIZE; i++) { |
michael@0 | 963 | pStart[i] = process_pfx_in_order(pStart[i],NULL); |
michael@0 | 964 | } |
michael@0 | 965 | return 0; |
michael@0 | 966 | } |
michael@0 | 967 | |
michael@0 | 968 | |
michael@0 | 969 | PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) |
michael@0 | 970 | { |
michael@0 | 971 | if (ptr) { |
michael@0 | 972 | nptr = process_pfx_in_order(ptr->getNextNE(), nptr); |
michael@0 | 973 | ptr->setNext(nptr); |
michael@0 | 974 | nptr = process_pfx_in_order(ptr->getNextEQ(), ptr); |
michael@0 | 975 | } |
michael@0 | 976 | return nptr; |
michael@0 | 977 | } |
michael@0 | 978 | |
michael@0 | 979 | |
michael@0 | 980 | // convert from binary tree to sorted list |
michael@0 | 981 | int AffixMgr:: process_sfx_tree_to_list() |
michael@0 | 982 | { |
michael@0 | 983 | for (int i=1; i< SETSIZE; i++) { |
michael@0 | 984 | sStart[i] = process_sfx_in_order(sStart[i],NULL); |
michael@0 | 985 | } |
michael@0 | 986 | return 0; |
michael@0 | 987 | } |
michael@0 | 988 | |
michael@0 | 989 | SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) |
michael@0 | 990 | { |
michael@0 | 991 | if (ptr) { |
michael@0 | 992 | nptr = process_sfx_in_order(ptr->getNextNE(), nptr); |
michael@0 | 993 | ptr->setNext(nptr); |
michael@0 | 994 | nptr = process_sfx_in_order(ptr->getNextEQ(), ptr); |
michael@0 | 995 | } |
michael@0 | 996 | return nptr; |
michael@0 | 997 | } |
michael@0 | 998 | |
michael@0 | 999 | |
michael@0 | 1000 | // reinitialize the PfxEntry links NextEQ and NextNE to speed searching |
michael@0 | 1001 | // using the idea of leading subsets this time |
michael@0 | 1002 | int AffixMgr::process_pfx_order() |
michael@0 | 1003 | { |
michael@0 | 1004 | PfxEntry* ptr; |
michael@0 | 1005 | |
michael@0 | 1006 | // loop through each prefix list starting point |
michael@0 | 1007 | for (int i=1; i < SETSIZE; i++) { |
michael@0 | 1008 | |
michael@0 | 1009 | ptr = pStart[i]; |
michael@0 | 1010 | |
michael@0 | 1011 | // look through the remainder of the list |
michael@0 | 1012 | // and find next entry with affix that |
michael@0 | 1013 | // the current one is not a subset of |
michael@0 | 1014 | // mark that as destination for NextNE |
michael@0 | 1015 | // use next in list that you are a subset |
michael@0 | 1016 | // of as NextEQ |
michael@0 | 1017 | |
michael@0 | 1018 | for (; ptr != NULL; ptr = ptr->getNext()) { |
michael@0 | 1019 | |
michael@0 | 1020 | PfxEntry * nptr = ptr->getNext(); |
michael@0 | 1021 | for (; nptr != NULL; nptr = nptr->getNext()) { |
michael@0 | 1022 | if (! isSubset( ptr->getKey() , nptr->getKey() )) break; |
michael@0 | 1023 | } |
michael@0 | 1024 | ptr->setNextNE(nptr); |
michael@0 | 1025 | ptr->setNextEQ(NULL); |
michael@0 | 1026 | if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey())) |
michael@0 | 1027 | ptr->setNextEQ(ptr->getNext()); |
michael@0 | 1028 | } |
michael@0 | 1029 | |
michael@0 | 1030 | // now clean up by adding smart search termination strings: |
michael@0 | 1031 | // if you are already a superset of the previous prefix |
michael@0 | 1032 | // but not a subset of the next, search can end here |
michael@0 | 1033 | // so set NextNE properly |
michael@0 | 1034 | |
michael@0 | 1035 | ptr = pStart[i]; |
michael@0 | 1036 | for (; ptr != NULL; ptr = ptr->getNext()) { |
michael@0 | 1037 | PfxEntry * nptr = ptr->getNext(); |
michael@0 | 1038 | PfxEntry * mptr = NULL; |
michael@0 | 1039 | for (; nptr != NULL; nptr = nptr->getNext()) { |
michael@0 | 1040 | if (! isSubset(ptr->getKey(),nptr->getKey())) break; |
michael@0 | 1041 | mptr = nptr; |
michael@0 | 1042 | } |
michael@0 | 1043 | if (mptr) mptr->setNextNE(NULL); |
michael@0 | 1044 | } |
michael@0 | 1045 | } |
michael@0 | 1046 | return 0; |
michael@0 | 1047 | } |
michael@0 | 1048 | |
michael@0 | 1049 | // initialize the SfxEntry links NextEQ and NextNE to speed searching |
michael@0 | 1050 | // using the idea of leading subsets this time |
michael@0 | 1051 | int AffixMgr::process_sfx_order() |
michael@0 | 1052 | { |
michael@0 | 1053 | SfxEntry* ptr; |
michael@0 | 1054 | |
michael@0 | 1055 | // loop through each prefix list starting point |
michael@0 | 1056 | for (int i=1; i < SETSIZE; i++) { |
michael@0 | 1057 | |
michael@0 | 1058 | ptr = sStart[i]; |
michael@0 | 1059 | |
michael@0 | 1060 | // look through the remainder of the list |
michael@0 | 1061 | // and find next entry with affix that |
michael@0 | 1062 | // the current one is not a subset of |
michael@0 | 1063 | // mark that as destination for NextNE |
michael@0 | 1064 | // use next in list that you are a subset |
michael@0 | 1065 | // of as NextEQ |
michael@0 | 1066 | |
michael@0 | 1067 | for (; ptr != NULL; ptr = ptr->getNext()) { |
michael@0 | 1068 | SfxEntry * nptr = ptr->getNext(); |
michael@0 | 1069 | for (; nptr != NULL; nptr = nptr->getNext()) { |
michael@0 | 1070 | if (! isSubset(ptr->getKey(),nptr->getKey())) break; |
michael@0 | 1071 | } |
michael@0 | 1072 | ptr->setNextNE(nptr); |
michael@0 | 1073 | ptr->setNextEQ(NULL); |
michael@0 | 1074 | if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey())) |
michael@0 | 1075 | ptr->setNextEQ(ptr->getNext()); |
michael@0 | 1076 | } |
michael@0 | 1077 | |
michael@0 | 1078 | |
michael@0 | 1079 | // now clean up by adding smart search termination strings: |
michael@0 | 1080 | // if you are already a superset of the previous suffix |
michael@0 | 1081 | // but not a subset of the next, search can end here |
michael@0 | 1082 | // so set NextNE properly |
michael@0 | 1083 | |
michael@0 | 1084 | ptr = sStart[i]; |
michael@0 | 1085 | for (; ptr != NULL; ptr = ptr->getNext()) { |
michael@0 | 1086 | SfxEntry * nptr = ptr->getNext(); |
michael@0 | 1087 | SfxEntry * mptr = NULL; |
michael@0 | 1088 | for (; nptr != NULL; nptr = nptr->getNext()) { |
michael@0 | 1089 | if (! isSubset(ptr->getKey(),nptr->getKey())) break; |
michael@0 | 1090 | mptr = nptr; |
michael@0 | 1091 | } |
michael@0 | 1092 | if (mptr) mptr->setNextNE(NULL); |
michael@0 | 1093 | } |
michael@0 | 1094 | } |
michael@0 | 1095 | return 0; |
michael@0 | 1096 | } |
michael@0 | 1097 | |
michael@0 | 1098 | // add flags to the result for dictionary debugging |
michael@0 | 1099 | void AffixMgr::debugflag(char * result, unsigned short flag) { |
michael@0 | 1100 | char * st = encode_flag(flag); |
michael@0 | 1101 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 1102 | mystrcat(result, MORPH_FLAG, MAXLNLEN); |
michael@0 | 1103 | if (st) { |
michael@0 | 1104 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1105 | free(st); |
michael@0 | 1106 | } |
michael@0 | 1107 | } |
michael@0 | 1108 | |
michael@0 | 1109 | // calculate the character length of the condition |
michael@0 | 1110 | int AffixMgr::condlen(char * st) |
michael@0 | 1111 | { |
michael@0 | 1112 | int l = 0; |
michael@0 | 1113 | bool group = false; |
michael@0 | 1114 | for(; *st; st++) { |
michael@0 | 1115 | if (*st == '[') { |
michael@0 | 1116 | group = true; |
michael@0 | 1117 | l++; |
michael@0 | 1118 | } else if (*st == ']') group = false; |
michael@0 | 1119 | else if (!group && (!utf8 || |
michael@0 | 1120 | (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; |
michael@0 | 1121 | } |
michael@0 | 1122 | return l; |
michael@0 | 1123 | } |
michael@0 | 1124 | |
michael@0 | 1125 | int AffixMgr::encodeit(affentry &entry, char * cs) |
michael@0 | 1126 | { |
michael@0 | 1127 | if (strcmp(cs,".") != 0) { |
michael@0 | 1128 | entry.numconds = (char) condlen(cs); |
michael@0 | 1129 | strncpy(entry.c.conds, cs, MAXCONDLEN); |
michael@0 | 1130 | // long condition (end of conds padded by strncpy) |
michael@0 | 1131 | if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { |
michael@0 | 1132 | entry.opts += aeLONGCOND; |
michael@0 | 1133 | entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); |
michael@0 | 1134 | if (!entry.c.l.conds2) return 1; |
michael@0 | 1135 | } |
michael@0 | 1136 | } else { |
michael@0 | 1137 | entry.numconds = 0; |
michael@0 | 1138 | entry.c.conds[0] = '\0'; |
michael@0 | 1139 | } |
michael@0 | 1140 | return 0; |
michael@0 | 1141 | } |
michael@0 | 1142 | |
michael@0 | 1143 | // return 1 if s1 is a leading subset of s2 (dots are for infixes) |
michael@0 | 1144 | inline int AffixMgr::isSubset(const char * s1, const char * s2) |
michael@0 | 1145 | { |
michael@0 | 1146 | while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { |
michael@0 | 1147 | s1++; |
michael@0 | 1148 | s2++; |
michael@0 | 1149 | } |
michael@0 | 1150 | return (*s1 == '\0'); |
michael@0 | 1151 | } |
michael@0 | 1152 | |
michael@0 | 1153 | |
michael@0 | 1154 | // check word for prefixes |
michael@0 | 1155 | struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound, |
michael@0 | 1156 | const FLAG needflag) |
michael@0 | 1157 | { |
michael@0 | 1158 | struct hentry * rv= NULL; |
michael@0 | 1159 | |
michael@0 | 1160 | pfx = NULL; |
michael@0 | 1161 | pfxappnd = NULL; |
michael@0 | 1162 | sfxappnd = NULL; |
michael@0 | 1163 | |
michael@0 | 1164 | // first handle the special case of 0 length prefixes |
michael@0 | 1165 | PfxEntry * pe = pStart[0]; |
michael@0 | 1166 | while (pe) { |
michael@0 | 1167 | if ( |
michael@0 | 1168 | // fogemorpheme |
michael@0 | 1169 | ((in_compound != IN_CPD_NOT) || !(pe->getCont() && |
michael@0 | 1170 | (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && |
michael@0 | 1171 | // permit prefixes in compounds |
michael@0 | 1172 | ((in_compound != IN_CPD_END) || (pe->getCont() && |
michael@0 | 1173 | (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen())))) |
michael@0 | 1174 | ) { |
michael@0 | 1175 | // check prefix |
michael@0 | 1176 | rv = pe->checkword(word, len, in_compound, needflag); |
michael@0 | 1177 | if (rv) { |
michael@0 | 1178 | pfx=pe; // BUG: pfx not stateless |
michael@0 | 1179 | return rv; |
michael@0 | 1180 | } |
michael@0 | 1181 | } |
michael@0 | 1182 | pe = pe->getNext(); |
michael@0 | 1183 | } |
michael@0 | 1184 | |
michael@0 | 1185 | // now handle the general case |
michael@0 | 1186 | unsigned char sp = *((const unsigned char *)word); |
michael@0 | 1187 | PfxEntry * pptr = pStart[sp]; |
michael@0 | 1188 | |
michael@0 | 1189 | while (pptr) { |
michael@0 | 1190 | if (isSubset(pptr->getKey(),word)) { |
michael@0 | 1191 | if ( |
michael@0 | 1192 | // fogemorpheme |
michael@0 | 1193 | ((in_compound != IN_CPD_NOT) || !(pptr->getCont() && |
michael@0 | 1194 | (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && |
michael@0 | 1195 | // permit prefixes in compounds |
michael@0 | 1196 | ((in_compound != IN_CPD_END) || (pptr->getCont() && |
michael@0 | 1197 | (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen())))) |
michael@0 | 1198 | ) { |
michael@0 | 1199 | // check prefix |
michael@0 | 1200 | rv = pptr->checkword(word, len, in_compound, needflag); |
michael@0 | 1201 | if (rv) { |
michael@0 | 1202 | pfx=pptr; // BUG: pfx not stateless |
michael@0 | 1203 | return rv; |
michael@0 | 1204 | } |
michael@0 | 1205 | } |
michael@0 | 1206 | pptr = pptr->getNextEQ(); |
michael@0 | 1207 | } else { |
michael@0 | 1208 | pptr = pptr->getNextNE(); |
michael@0 | 1209 | } |
michael@0 | 1210 | } |
michael@0 | 1211 | |
michael@0 | 1212 | return NULL; |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | // check word for prefixes |
michael@0 | 1216 | struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, |
michael@0 | 1217 | char in_compound, const FLAG needflag) |
michael@0 | 1218 | { |
michael@0 | 1219 | struct hentry * rv= NULL; |
michael@0 | 1220 | |
michael@0 | 1221 | pfx = NULL; |
michael@0 | 1222 | sfxappnd = NULL; |
michael@0 | 1223 | |
michael@0 | 1224 | // first handle the special case of 0 length prefixes |
michael@0 | 1225 | PfxEntry * pe = pStart[0]; |
michael@0 | 1226 | |
michael@0 | 1227 | while (pe) { |
michael@0 | 1228 | rv = pe->check_twosfx(word, len, in_compound, needflag); |
michael@0 | 1229 | if (rv) return rv; |
michael@0 | 1230 | pe = pe->getNext(); |
michael@0 | 1231 | } |
michael@0 | 1232 | |
michael@0 | 1233 | // now handle the general case |
michael@0 | 1234 | unsigned char sp = *((const unsigned char *)word); |
michael@0 | 1235 | PfxEntry * pptr = pStart[sp]; |
michael@0 | 1236 | |
michael@0 | 1237 | while (pptr) { |
michael@0 | 1238 | if (isSubset(pptr->getKey(),word)) { |
michael@0 | 1239 | rv = pptr->check_twosfx(word, len, in_compound, needflag); |
michael@0 | 1240 | if (rv) { |
michael@0 | 1241 | pfx = pptr; |
michael@0 | 1242 | return rv; |
michael@0 | 1243 | } |
michael@0 | 1244 | pptr = pptr->getNextEQ(); |
michael@0 | 1245 | } else { |
michael@0 | 1246 | pptr = pptr->getNextNE(); |
michael@0 | 1247 | } |
michael@0 | 1248 | } |
michael@0 | 1249 | |
michael@0 | 1250 | return NULL; |
michael@0 | 1251 | } |
michael@0 | 1252 | |
michael@0 | 1253 | // check word for prefixes |
michael@0 | 1254 | char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, |
michael@0 | 1255 | const FLAG needflag) |
michael@0 | 1256 | { |
michael@0 | 1257 | char * st; |
michael@0 | 1258 | |
michael@0 | 1259 | char result[MAXLNLEN]; |
michael@0 | 1260 | result[0] = '\0'; |
michael@0 | 1261 | |
michael@0 | 1262 | pfx = NULL; |
michael@0 | 1263 | sfxappnd = NULL; |
michael@0 | 1264 | |
michael@0 | 1265 | // first handle the special case of 0 length prefixes |
michael@0 | 1266 | PfxEntry * pe = pStart[0]; |
michael@0 | 1267 | while (pe) { |
michael@0 | 1268 | st = pe->check_morph(word,len,in_compound, needflag); |
michael@0 | 1269 | if (st) { |
michael@0 | 1270 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1271 | free(st); |
michael@0 | 1272 | } |
michael@0 | 1273 | // if (rv) return rv; |
michael@0 | 1274 | pe = pe->getNext(); |
michael@0 | 1275 | } |
michael@0 | 1276 | |
michael@0 | 1277 | // now handle the general case |
michael@0 | 1278 | unsigned char sp = *((const unsigned char *)word); |
michael@0 | 1279 | PfxEntry * pptr = pStart[sp]; |
michael@0 | 1280 | |
michael@0 | 1281 | while (pptr) { |
michael@0 | 1282 | if (isSubset(pptr->getKey(),word)) { |
michael@0 | 1283 | st = pptr->check_morph(word,len,in_compound, needflag); |
michael@0 | 1284 | if (st) { |
michael@0 | 1285 | // fogemorpheme |
michael@0 | 1286 | if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && |
michael@0 | 1287 | (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { |
michael@0 | 1288 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1289 | pfx = pptr; |
michael@0 | 1290 | } |
michael@0 | 1291 | free(st); |
michael@0 | 1292 | } |
michael@0 | 1293 | pptr = pptr->getNextEQ(); |
michael@0 | 1294 | } else { |
michael@0 | 1295 | pptr = pptr->getNextNE(); |
michael@0 | 1296 | } |
michael@0 | 1297 | } |
michael@0 | 1298 | |
michael@0 | 1299 | if (*result) return mystrdup(result); |
michael@0 | 1300 | return NULL; |
michael@0 | 1301 | } |
michael@0 | 1302 | |
michael@0 | 1303 | |
michael@0 | 1304 | // check word for prefixes |
michael@0 | 1305 | char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, |
michael@0 | 1306 | char in_compound, const FLAG needflag) |
michael@0 | 1307 | { |
michael@0 | 1308 | char * st; |
michael@0 | 1309 | |
michael@0 | 1310 | char result[MAXLNLEN]; |
michael@0 | 1311 | result[0] = '\0'; |
michael@0 | 1312 | |
michael@0 | 1313 | pfx = NULL; |
michael@0 | 1314 | sfxappnd = NULL; |
michael@0 | 1315 | |
michael@0 | 1316 | // first handle the special case of 0 length prefixes |
michael@0 | 1317 | PfxEntry * pe = pStart[0]; |
michael@0 | 1318 | while (pe) { |
michael@0 | 1319 | st = pe->check_twosfx_morph(word,len,in_compound, needflag); |
michael@0 | 1320 | if (st) { |
michael@0 | 1321 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1322 | free(st); |
michael@0 | 1323 | } |
michael@0 | 1324 | pe = pe->getNext(); |
michael@0 | 1325 | } |
michael@0 | 1326 | |
michael@0 | 1327 | // now handle the general case |
michael@0 | 1328 | unsigned char sp = *((const unsigned char *)word); |
michael@0 | 1329 | PfxEntry * pptr = pStart[sp]; |
michael@0 | 1330 | |
michael@0 | 1331 | while (pptr) { |
michael@0 | 1332 | if (isSubset(pptr->getKey(),word)) { |
michael@0 | 1333 | st = pptr->check_twosfx_morph(word, len, in_compound, needflag); |
michael@0 | 1334 | if (st) { |
michael@0 | 1335 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1336 | free(st); |
michael@0 | 1337 | pfx = pptr; |
michael@0 | 1338 | } |
michael@0 | 1339 | pptr = pptr->getNextEQ(); |
michael@0 | 1340 | } else { |
michael@0 | 1341 | pptr = pptr->getNextNE(); |
michael@0 | 1342 | } |
michael@0 | 1343 | } |
michael@0 | 1344 | |
michael@0 | 1345 | if (*result) return mystrdup(result); |
michael@0 | 1346 | return NULL; |
michael@0 | 1347 | } |
michael@0 | 1348 | |
michael@0 | 1349 | // Is word a non compound with a REP substitution (see checkcompoundrep)? |
michael@0 | 1350 | int AffixMgr::cpdrep_check(const char * word, int wl) |
michael@0 | 1351 | { |
michael@0 | 1352 | char candidate[MAXLNLEN]; |
michael@0 | 1353 | const char * r; |
michael@0 | 1354 | int lenr, lenp; |
michael@0 | 1355 | |
michael@0 | 1356 | if ((wl < 2) || !numrep) return 0; |
michael@0 | 1357 | |
michael@0 | 1358 | for (int i=0; i < numrep; i++ ) { |
michael@0 | 1359 | r = word; |
michael@0 | 1360 | lenr = strlen(reptable[i].pattern2); |
michael@0 | 1361 | lenp = strlen(reptable[i].pattern); |
michael@0 | 1362 | // search every occurence of the pattern in the word |
michael@0 | 1363 | while ((r=strstr(r, reptable[i].pattern)) != NULL) { |
michael@0 | 1364 | strcpy(candidate, word); |
michael@0 | 1365 | if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; |
michael@0 | 1366 | strcpy(candidate+(r-word),reptable[i].pattern2); |
michael@0 | 1367 | strcpy(candidate+(r-word)+lenr, r+lenp); |
michael@0 | 1368 | if (candidate_check(candidate,strlen(candidate))) return 1; |
michael@0 | 1369 | r++; // search for the next letter |
michael@0 | 1370 | } |
michael@0 | 1371 | } |
michael@0 | 1372 | return 0; |
michael@0 | 1373 | } |
michael@0 | 1374 | |
michael@0 | 1375 | // forbid compoundings when there are special patterns at word bound |
michael@0 | 1376 | int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed) |
michael@0 | 1377 | { |
michael@0 | 1378 | int len; |
michael@0 | 1379 | for (int i = 0; i < numcheckcpd; i++) { |
michael@0 | 1380 | if (isSubset(checkcpdtable[i].pattern2, word + pos) && |
michael@0 | 1381 | (!r1 || !checkcpdtable[i].cond || |
michael@0 | 1382 | (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && |
michael@0 | 1383 | (!r2 || !checkcpdtable[i].cond2 || |
michael@0 | 1384 | (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && |
michael@0 | 1385 | // zero length pattern => only TESTAFF |
michael@0 | 1386 | // zero pattern (0/flag) => unmodified stem (zero affixes allowed) |
michael@0 | 1387 | (!*(checkcpdtable[i].pattern) || ( |
michael@0 | 1388 | (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) || |
michael@0 | 1389 | (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) && |
michael@0 | 1390 | strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) { |
michael@0 | 1391 | return 1; |
michael@0 | 1392 | } |
michael@0 | 1393 | } |
michael@0 | 1394 | return 0; |
michael@0 | 1395 | } |
michael@0 | 1396 | |
michael@0 | 1397 | // forbid compounding with neighbouring upper and lower case characters at word bounds |
michael@0 | 1398 | int AffixMgr::cpdcase_check(const char * word, int pos) |
michael@0 | 1399 | { |
michael@0 | 1400 | if (utf8) { |
michael@0 | 1401 | w_char u, w; |
michael@0 | 1402 | const char * p; |
michael@0 | 1403 | u8_u16(&u, 1, word + pos); |
michael@0 | 1404 | for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--); |
michael@0 | 1405 | u8_u16(&w, 1, p); |
michael@0 | 1406 | unsigned short a = (u.h << 8) + u.l; |
michael@0 | 1407 | unsigned short b = (w.h << 8) + w.l; |
michael@0 | 1408 | if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && |
michael@0 | 1409 | (a != '-') && (b != '-')) return 1; |
michael@0 | 1410 | } else { |
michael@0 | 1411 | unsigned char a = *(word + pos - 1); |
michael@0 | 1412 | unsigned char b = *(word + pos); |
michael@0 | 1413 | if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1; |
michael@0 | 1414 | } |
michael@0 | 1415 | return 0; |
michael@0 | 1416 | } |
michael@0 | 1417 | |
michael@0 | 1418 | // check compound patterns |
// Tests whether the component sequence (*words)[0..wnum], with rv stored at
// index wnum, matches one of the patterns held in defcpdtable.
//   words - in/out: address of the component array pointer; when *words is
//           NULL the def buffer is installed in its place
//   wnum  - index at which rv is stored in the component array
//   rv    - dictionary entry of the newest component
//   def   - fallback array used when *words is NULL (may itself be NULL, in
//           which case the function returns 0 immediately)
//   all   - when non-zero, the acceptance test after the backtracking loop
//           additionally requires the pattern position to have reached the
//           end of the pattern
// Returns 1 on a match; rv then stays stored in (*words)[wnum] and *words keeps
// pointing at def if def was installed. On the failure paths that follow the
// store, the slot is cleared and *words is reset to NULL if def was installed.
michael@0 | 1419 | int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all) |
michael@0 | 1420 | { |
michael@0 | 1421 | signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking |
michael@0 | 1422 | signed short btwp[MAXWORDLEN]; // word positions for metacharacters |
michael@0 | 1423 | int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions |
michael@0 | 1424 | short bt = 0; |
michael@0 | 1425 | int i, j; |
michael@0 | 1426 | int ok; |
michael@0 | 1427 | int w = 0; |
michael@0 | 1428 | |
// w remembers that def was installed here, so it can be removed again on failure
michael@0 | 1429 | if (!*words) { |
michael@0 | 1430 | w = 1; |
michael@0 | 1431 | *words = def; |
michael@0 | 1432 | } |
michael@0 | 1433 | |
michael@0 | 1434 | if (!*words) { |
michael@0 | 1435 | return 0; |
michael@0 | 1436 | } |
michael@0 | 1437 | |
michael@0 | 1438 | (*words)[wnum] = rv; |
michael@0 | 1439 | |
michael@0 | 1440 | // has the last word COMPOUNDRULE flag? |
michael@0 | 1441 | if (rv->alen == 0) { |
michael@0 | 1442 | (*words)[wnum] = NULL; |
michael@0 | 1443 | if (w) *words = NULL; |
michael@0 | 1444 | return 0; |
michael@0 | 1445 | } |
// quick rejection: rv must carry at least one flag that occurs as a
// non-metacharacter element of some pattern
michael@0 | 1446 | ok = 0; |
michael@0 | 1447 | for (i = 0; i < numdefcpd; i++) { |
michael@0 | 1448 | for (j = 0; j < defcpdtable[i].len; j++) { |
michael@0 | 1449 | if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && |
michael@0 | 1450 | TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; |
michael@0 | 1451 | } |
michael@0 | 1452 | } |
michael@0 | 1453 | if (ok == 0) { |
michael@0 | 1454 | (*words)[wnum] = NULL; |
michael@0 | 1455 | if (w) *words = NULL; |
michael@0 | 1456 | return 0; |
michael@0 | 1457 | } |
michael@0 | 1458 | |
// try each pattern in turn against the components 0..wnum
michael@0 | 1459 | for (i = 0; i < numdefcpd; i++) { |
michael@0 | 1460 | signed short pp = 0; // pattern position |
michael@0 | 1461 | signed short wp = 0; // "words" position |
michael@0 | 1462 | int ok2; |
michael@0 | 1463 | ok = 1; |
michael@0 | 1464 | ok2 = 1; |
michael@0 | 1465 | do { |
michael@0 | 1466 | while ((pp < defcpdtable[i].len) && (wp <= wnum)) { |
// element followed by '*' or '?': consume as many components carrying the
// element's flag as allowed ('?' stops after one, '*' may run up to wnum)
michael@0 | 1467 | if (((pp+1) < defcpdtable[i].len) && |
michael@0 | 1468 | ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) { |
michael@0 | 1469 | int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum; |
michael@0 | 1470 | ok2 = 1; |
michael@0 | 1471 | pp+=2; |
michael@0 | 1472 | btpp[bt] = pp; |
michael@0 | 1473 | btwp[bt] = wp; |
michael@0 | 1474 | while (wp <= wend) { |
michael@0 | 1475 | if (!(*words)[wp]->alen || |
michael@0 | 1476 | !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) { |
michael@0 | 1477 | ok2 = 0; |
michael@0 | 1478 | break; |
michael@0 | 1479 | } |
michael@0 | 1480 | wp++; |
michael@0 | 1481 | } |
// components are left over after this element, so the match is not complete yet
michael@0 | 1482 | if (wp <= wnum) ok2 = 0; |
// record how many components this element consumed; only entries that consumed
// at least one are kept on the backtracking stack
michael@0 | 1483 | btnum[bt] = wp - btwp[bt]; |
michael@0 | 1484 | if (btnum[bt] > 0) bt++; |
michael@0 | 1485 | if (ok2) break; |
michael@0 | 1486 | } else { |
// plain element: the component at wp must exist and carry exactly this flag
michael@0 | 1487 | ok2 = 1; |
michael@0 | 1488 | if (!(*words)[wp] || !(*words)[wp]->alen || |
michael@0 | 1489 | !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) { |
michael@0 | 1490 | ok = 0; |
michael@0 | 1491 | break; |
michael@0 | 1492 | } |
michael@0 | 1493 | pp++; |
michael@0 | 1494 | wp++; |
// pattern exhausted while components remain: this attempt fails
michael@0 | 1495 | if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0; |
michael@0 | 1496 | } |
michael@0 | 1497 | } |
// all components consumed: skip any remaining quantified elements and accept
// if that reaches the end of the pattern
michael@0 | 1498 | if (ok && ok2) { |
michael@0 | 1499 | int r = pp; |
michael@0 | 1500 | while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && |
michael@0 | 1501 | ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; |
michael@0 | 1502 | if (defcpdtable[i].len <= r) return 1; |
michael@0 | 1503 | } |
michael@0 | 1504 | // backtrack |
// give back one component from the most recent quantified element and resume
// right after it; entries whose count drops below zero are popped
michael@0 | 1505 | if (bt) do { |
michael@0 | 1506 | ok = 1; |
michael@0 | 1507 | btnum[bt - 1]--; |
michael@0 | 1508 | pp = btpp[bt - 1]; |
michael@0 | 1509 | wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; |
michael@0 | 1510 | } while ((btnum[bt - 1] < 0) && --bt); |
michael@0 | 1511 | } while (bt); |
michael@0 | 1512 | |
// with all == 0 the test below does not require the pattern to be fully consumed
michael@0 | 1513 | if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; |
michael@0 | 1514 | |
michael@0 | 1515 | // check zero ending |
michael@0 | 1516 | while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && |
michael@0 | 1517 | ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; |
michael@0 | 1518 | if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1; |
michael@0 | 1519 | } |
// no pattern matched: undo the store and, if installed here, the def buffer
michael@0 | 1520 | (*words)[wnum] = NULL; |
michael@0 | 1521 | if (w) *words = NULL; |
michael@0 | 1522 | return 0; |
michael@0 | 1523 | } |
michael@0 | 1524 | |
michael@0 | 1525 | inline int AffixMgr::candidate_check(const char * word, int len) |
michael@0 | 1526 | { |
michael@0 | 1527 | struct hentry * rv=NULL; |
michael@0 | 1528 | |
michael@0 | 1529 | rv = lookup(word); |
michael@0 | 1530 | if (rv) return 1; |
michael@0 | 1531 | |
michael@0 | 1532 | // rv = prefix_check(word,len,1); |
michael@0 | 1533 | // if (rv) return 1; |
michael@0 | 1534 | |
michael@0 | 1535 | rv = affix_check(word,len); |
michael@0 | 1536 | if (rv) return 1; |
michael@0 | 1537 | return 0; |
michael@0 | 1538 | } |
michael@0 | 1539 | |
michael@0 | 1540 | // calculate number of syllable for compound-checking |
michael@0 | 1541 | short AffixMgr::get_syllable(const char * word, int wlen) |
michael@0 | 1542 | { |
michael@0 | 1543 | if (cpdmaxsyllable==0) return 0; |
michael@0 | 1544 | |
michael@0 | 1545 | short num=0; |
michael@0 | 1546 | |
michael@0 | 1547 | if (!utf8) { |
michael@0 | 1548 | for (int i=0; i<wlen; i++) { |
michael@0 | 1549 | if (strchr(cpdvowels, word[i])) num++; |
michael@0 | 1550 | } |
michael@0 | 1551 | } else if (cpdvowels_utf16) { |
michael@0 | 1552 | w_char w[MAXWORDUTF8LEN]; |
michael@0 | 1553 | int i = u8_u16(w, MAXWORDUTF8LEN, word); |
michael@0 | 1554 | for (; i > 0; i--) { |
michael@0 | 1555 | if (flag_bsearch((unsigned short *) cpdvowels_utf16, |
michael@0 | 1556 | ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; |
michael@0 | 1557 | } |
michael@0 | 1558 | } |
michael@0 | 1559 | return num; |
michael@0 | 1560 | } |
michael@0 | 1561 | |
michael@0 | 1562 | void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { |
michael@0 | 1563 | if (utf8) { |
michael@0 | 1564 | int i; |
michael@0 | 1565 | for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { |
michael@0 | 1566 | for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++); |
michael@0 | 1567 | } |
michael@0 | 1568 | for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { |
michael@0 | 1569 | for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); |
michael@0 | 1570 | } |
michael@0 | 1571 | } else { |
michael@0 | 1572 | *cmin = cpdmin; |
michael@0 | 1573 | *cmax = len - cpdmin + 1; |
michael@0 | 1574 | } |
michael@0 | 1575 | } |
michael@0 | 1576 | |
michael@0 | 1577 | |
michael@0 | 1578 | // check if compound word is correctly spelled |
michael@0 | 1579 | // hu_mov_rule = spec. Hungarian rule (XXX) |
// Tries every split offset i in [cmin, cmax) of `word`. For each offset it
// looks for an acceptable first part st[0..i) and then for an acceptable
// remainder: a plain root, an affixed form, or (recursively) another compound.
//   word, len    - NUL-terminated candidate and its byte length
//   wordnum      - number of components counted so far (tested against
//                  cpdwordmax and maxwordnum)
//   numsyllable  - running syllable count (tested against cpdmaxsyllable)
//   maxwordnum   - the recursive call is made only while wordnum < maxwordnum
//   wnum         - index of the current component in `words`
//   words        - component array handed to defcpd_check(); when NULL, the
//                  local rwords buffer is offered to defcpd_check() instead
//   hu_mov_rule  - enables the hardwired Hungarian branches
//   is_sug       - when set, entries carrying the nosuggest flag reject the word
//   info         - optional; its SPELL_ORIGCAP bit is consulted for FORCEUCASE
// Returns the entry found for the first component on success, NULL otherwise.
// NOTE(review): st is a fixed-size buffer filled with strcpy(); this presumes
// callers bound the length of word - confirm.
michael@0 | 1580 | struct hentry * AffixMgr::compound_check(const char * word, int len, |
michael@0 | 1581 | short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, |
michael@0 | 1582 | char hu_mov_rule = 0, char is_sug = 0, int * info = NULL) |
michael@0 | 1583 | { |
michael@0 | 1584 | int i; |
michael@0 | 1585 | short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; |
michael@0 | 1586 | struct hentry * rv = NULL; |
michael@0 | 1587 | struct hentry * rv_first; |
michael@0 | 1588 | struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking |
michael@0 | 1589 | char st [MAXWORDUTF8LEN + 4]; |
michael@0 | 1590 | char ch = '\0'; |
michael@0 | 1591 | int cmin; |
michael@0 | 1592 | int cmax; |
michael@0 | 1593 | int striple = 0; |
michael@0 | 1594 | int scpd = 0; |
michael@0 | 1595 | int soldi = 0; |
michael@0 | 1596 | int oldcmin = 0; |
michael@0 | 1597 | int oldcmax = 0; |
michael@0 | 1598 | int oldlen = 0; |
michael@0 | 1599 | int checkedstriple = 0; |
michael@0 | 1600 | int onlycpdrule; |
michael@0 | 1601 | int affixed = 0; |
michael@0 | 1602 | hentry ** oldwords = words; |
michael@0 | 1603 | |
michael@0 | 1604 | int checked_prefix; |
michael@0 | 1605 | |
michael@0 | 1606 | setcminmax(&cmin, &cmax, word, len); |
michael@0 | 1607 | |
// st is a writable working copy: it gets cut at the split point and, for
// simplified CHECKCOMPOUNDPATTERN entries, rewritten around it
michael@0 | 1608 | strcpy(st, word); |
michael@0 | 1609 | |
// i is the byte offset of the split point under test
michael@0 | 1610 | for (i = cmin; i < cmax; i++) { |
michael@0 | 1611 | // go to end of the UTF-8 character |
michael@0 | 1612 | if (utf8) { |
michael@0 | 1613 | for (; (st[i] & 0xc0) == 0x80; i++); |
michael@0 | 1614 | if (i >= cmax) return NULL; |
michael@0 | 1615 | } |
michael@0 | 1616 | |
// each split starts from the caller's words array; the first pass is
// COMPOUNDRULE-only exactly when such an array was supplied
michael@0 | 1617 | words = oldwords; |
michael@0 | 1618 | onlycpdrule = (words) ? 1 : 0; |
michael@0 | 1619 | |
michael@0 | 1620 | do { // onlycpdrule loop |
michael@0 | 1621 | |
michael@0 | 1622 | oldnumsyllable = numsyllable; |
michael@0 | 1623 | oldwordnum = wordnum; |
michael@0 | 1624 | checked_prefix = 0; |
michael@0 | 1625 | |
michael@0 | 1626 | |
michael@0 | 1627 | do { // simplified checkcompoundpattern loop |
michael@0 | 1628 | |
// scpd == 0 is the plain pass; scpd > 0 selects checkcpdtable[scpd-1].
// The block below advances scpd to the next entry whose pattern3 occurs at
// word + i, then rebuilds st as: head + pattern | pattern2 + rest of word
// after pattern3, with the split point moved to sit between pattern and pattern2.
michael@0 | 1629 | if (scpd > 0) { |
michael@0 | 1630 | for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || |
michael@0 | 1631 | strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++); |
michael@0 | 1632 | |
michael@0 | 1633 | if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop |
michael@0 | 1634 | strcpy(st + i, checkcpdtable[scpd-1].pattern); |
michael@0 | 1635 | soldi = i; |
michael@0 | 1636 | i += strlen(checkcpdtable[scpd-1].pattern); |
michael@0 | 1637 | strcpy(st + i, checkcpdtable[scpd-1].pattern2); |
michael@0 | 1638 | strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3)); |
michael@0 | 1639 | |
// i, len, cmin and cmax are saved here and restored once this entry has been tried
michael@0 | 1640 | oldlen = len; |
michael@0 | 1641 | len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); |
michael@0 | 1642 | oldcmin = cmin; |
michael@0 | 1643 | oldcmax = cmax; |
michael@0 | 1644 | setcminmax(&cmin, &cmax, st, len); |
michael@0 | 1645 | |
// NOTE(review): this overwrites the cmax just computed by setcminmax() with the
// byte-based formula, also in UTF-8 mode - confirm this is intended
michael@0 | 1646 | cmax = len - cpdmin + 1; |
michael@0 | 1647 | } |
michael@0 | 1648 | |
// cut st at the split point so that st names the first part only;
// ch keeps the overwritten byte for later restoration
michael@0 | 1649 | ch = st[i]; |
michael@0 | 1650 | st[i] = '\0'; |
michael@0 | 1651 | |
michael@0 | 1652 | sfx = NULL; |
michael@0 | 1653 | pfx = NULL; |
michael@0 | 1654 | |
michael@0 | 1655 | // FIRST WORD |
michael@0 | 1656 | |
michael@0 | 1657 | affixed = 1; |
michael@0 | 1658 | rv = lookup(st); // perhaps without prefix |
michael@0 | 1659 | |
michael@0 | 1660 | // search homonym with compound flag |
// a homonym is skipped when it carries needaffix, when it has none of the
// flags that qualify it for this position, or when the active
// CHECKCOMPOUNDPATTERN entry names a first-part condition flag it lacks
michael@0 | 1661 | while ((rv) && !hu_mov_rule && |
michael@0 | 1662 | ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
michael@0 | 1663 | !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 1664 | (compoundbegin && !wordnum && !onlycpdrule && |
michael@0 | 1665 | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
michael@0 | 1666 | (compoundmiddle && wordnum && !words && !onlycpdrule && |
michael@0 | 1667 | TESTAFF(rv->astr, compoundmiddle, rv->alen)) || |
michael@0 | 1668 | (numdefcpd && onlycpdrule && |
michael@0 | 1669 | ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || |
michael@0 | 1670 | (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) || |
michael@0 | 1671 | (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && |
michael@0 | 1672 | !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) |
michael@0 | 1673 | ) { |
michael@0 | 1674 | rv = rv->next_homonym; |
michael@0 | 1675 | } |
michael@0 | 1676 | |
// a plain dictionary root was accepted, so the first part is not an affixed form
michael@0 | 1677 | if (rv) affixed = 0; |
michael@0 | 1678 | |
michael@0 | 1679 | if (!rv) { |
// in the COMPOUNDRULE-only pass affixed first parts are not tried; this leaves
// the simplified checkcompoundpattern loop (st[i] is restored after that loop)
michael@0 | 1680 | if (onlycpdrule) break; |
// try the first part as a prefixed, then as a suffixed form carrying compoundflag;
// a suffix whose continuation flags include compoundforbidflag or compoundend
// disqualifies the result
michael@0 | 1681 | if (compoundflag && |
michael@0 | 1682 | !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { |
michael@0 | 1683 | if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, |
michael@0 | 1684 | FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && |
michael@0 | 1685 | sfx->getCont() && |
michael@0 | 1686 | ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 1687 | sfx->getContLen())) || (compoundend && |
michael@0 | 1688 | TESTAFF(sfx->getCont(), compoundend, |
michael@0 | 1689 | sfx->getContLen())))) { |
michael@0 | 1690 | rv = NULL; |
michael@0 | 1691 | } |
michael@0 | 1692 | } |
michael@0 | 1693 | |
// otherwise retry with compoundbegin (first component) or compoundmiddle (later
// components); checked_prefix records that an affixed first part was accepted
michael@0 | 1694 | if (rv || |
michael@0 | 1695 | (((wordnum == 0) && compoundbegin && |
michael@0 | 1696 | ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
michael@0 | 1697 | (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || |
michael@0 | 1698 | ((wordnum > 0) && compoundmiddle && |
michael@0 | 1699 | ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
michael@0 | 1700 | (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) |
michael@0 | 1701 | ) checked_prefix = 1; |
michael@0 | 1702 | // else check forbiddenwords and needaffix |
michael@0 | 1703 | } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 1704 | TESTAFF(rv->astr, needaffix, rv->alen) || |
michael@0 | 1705 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
michael@0 | 1706 | (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) |
michael@0 | 1707 | )) { |
// restore the cut byte and abandon the simplified checkcompoundpattern loop
michael@0 | 1708 | st[i] = ch; |
michael@0 | 1709 | //continue; |
michael@0 | 1710 | break; |
michael@0 | 1711 | } |
michael@0 | 1712 | |
michael@0 | 1713 | // check non_compound flag in suffix and prefix |
michael@0 | 1714 | if ((rv) && !hu_mov_rule && |
michael@0 | 1715 | ((pfx && pfx->getCont() && |
michael@0 | 1716 | TESTAFF(pfx->getCont(), compoundforbidflag, |
michael@0 | 1717 | pfx->getContLen())) || |
michael@0 | 1718 | (sfx && sfx->getCont() && |
michael@0 | 1719 | TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 1720 | sfx->getContLen())))) { |
michael@0 | 1721 | rv = NULL; |
michael@0 | 1722 | } |
michael@0 | 1723 | |
michael@0 | 1724 | // check compoundend flag in suffix and prefix |
michael@0 | 1725 | if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && |
michael@0 | 1726 | ((pfx && pfx->getCont() && |
michael@0 | 1727 | TESTAFF(pfx->getCont(), compoundend, |
michael@0 | 1728 | pfx->getContLen())) || |
michael@0 | 1729 | (sfx && sfx->getCont() && |
michael@0 | 1730 | TESTAFF(sfx->getCont(), compoundend, |
michael@0 | 1731 | sfx->getContLen())))) { |
michael@0 | 1732 | rv = NULL; |
michael@0 | 1733 | } |
michael@0 | 1734 | |
michael@0 | 1735 | // check compoundmiddle flag in suffix and prefix |
michael@0 | 1736 | if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && |
michael@0 | 1737 | ((pfx && pfx->getCont() && |
michael@0 | 1738 | TESTAFF(pfx->getCont(), compoundmiddle, |
michael@0 | 1739 | pfx->getContLen())) || |
michael@0 | 1740 | (sfx && sfx->getCont() && |
michael@0 | 1741 | TESTAFF(sfx->getCont(), compoundmiddle, |
michael@0 | 1742 | sfx->getContLen())))) { |
michael@0 | 1743 | rv = NULL; |
michael@0 | 1744 | } |
michael@0 | 1745 | |
michael@0 | 1746 | // check forbiddenwords |
// a forbidden first part rejects the whole word, not just this split
michael@0 | 1747 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 1748 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
michael@0 | 1749 | (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { |
michael@0 | 1750 | return NULL; |
michael@0 | 1751 | } |
michael@0 | 1752 | |
// (at this point rv is the entry of the first part)
michael@0 | 1753 | // increment word number, if the second root has a compoundroot flag |
michael@0 | 1754 | if ((rv) && compoundroot && |
michael@0 | 1755 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 1756 | wordnum++; |
michael@0 | 1757 | } |
michael@0 | 1758 | |
michael@0 | 1759 | // first word is acceptable in compound words? |
michael@0 | 1760 | if (((rv) && |
michael@0 | 1761 | ( checked_prefix || (words && words[wnum]) || |
michael@0 | 1762 | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 1763 | ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
michael@0 | 1764 | ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// || |
michael@0 | 1765 | // (numdefcpd && ) |
michael@0 | 1766 | |
michael@0 | 1767 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 1768 | || ((langnum == LANG_hu) && hu_mov_rule && ( |
michael@0 | 1769 | TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes |
michael@0 | 1770 | TESTAFF(rv->astr, 'G', rv->alen) || |
michael@0 | 1771 | TESTAFF(rv->astr, 'H', rv->alen) |
michael@0 | 1772 | ) |
michael@0 | 1773 | ) |
michael@0 | 1774 | // END of LANG_hu section |
michael@0 | 1775 | ) && |
michael@0 | 1776 | ( |
michael@0 | 1777 | // test CHECKCOMPOUNDPATTERN conditions |
michael@0 | 1778 | scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || |
michael@0 | 1779 | TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) |
michael@0 | 1780 | ) |
michael@0 | 1781 | && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters |
michael@0 | 1782 | (word[i-1]==word[i]) && ( |
michael@0 | 1783 | ((i>1) && (word[i-1]==word[i-2])) || |
michael@0 | 1784 | ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' |
michael@0 | 1785 | ) |
michael@0 | 1786 | ) || |
michael@0 | 1787 | ( |
michael@0 | 1788 | checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i) |
michael@0 | 1789 | )) |
michael@0 | 1790 | ) |
michael@0 | 1791 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 1792 | || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && |
michael@0 | 1793 | (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes |
michael@0 | 1794 | TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) || |
michael@0 | 1795 | TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen()) |
michael@0 | 1796 | ) |
michael@0 | 1797 | ) |
michael@0 | 1798 | ) |
michael@0 | 1799 | ) { // first word is ok condition |
michael@0 | 1800 | |
michael@0 | 1801 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 1802 | if (langnum == LANG_hu) { |
michael@0 | 1803 | // calculate syllable number of the word |
michael@0 | 1804 | numsyllable += get_syllable(st, i); |
michael@0 | 1805 | // + 1 word, if syllable number of the prefix > 1 (hungarian convention) |
michael@0 | 1806 | if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; |
michael@0 | 1807 | } |
michael@0 | 1808 | // END of LANG_hu section |
michael@0 | 1809 | |
michael@0 | 1810 | // NEXT WORD(S) |
// remember the first part's entry (it is the function's success value) and
// undo the cut so that st + i names the remainder
michael@0 | 1811 | rv_first = rv; |
michael@0 | 1812 | st[i] = ch; |
michael@0 | 1813 | |
michael@0 | 1814 | do { // striple loop |
michael@0 | 1815 | |
michael@0 | 1816 | // check simplifiedtriple |
// first round: note a doubled letter just before the split (striple);
// second round: move the split one byte back so the remainder regains the
// letter dropped by the simplification
michael@0 | 1817 | if (simplifiedtriple) { |
michael@0 | 1818 | if (striple) { |
michael@0 | 1819 | checkedstriple = 1; |
michael@0 | 1820 | i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" |
michael@0 | 1821 | } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; |
michael@0 | 1822 | } |
michael@0 | 1823 | |
michael@0 | 1824 | rv = lookup((st+i)); // perhaps without prefix |
michael@0 | 1825 | |
michael@0 | 1826 | // search homonym with compound flag |
michael@0 | 1827 | while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
michael@0 | 1828 | !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 1829 | (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || |
michael@0 | 1830 | (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) || |
michael@0 | 1831 | (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL && |
michael@0 | 1832 | !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) |
michael@0 | 1833 | )) { |
michael@0 | 1834 | rv = rv->next_homonym; |
michael@0 | 1835 | } |
michael@0 | 1836 | |
michael@0 | 1837 | // check FORCEUCASE |
michael@0 | 1838 | if (rv && forceucase && (rv) && |
michael@0 | 1839 | (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; |
michael@0 | 1840 | |
// a filled words[wnum + 1] slot means defcpd_check() accepted the remainder
// under a COMPOUNDRULE pattern
michael@0 | 1841 | if (rv && words && words[wnum + 1]) return rv_first; |
michael@0 | 1842 | |
// snapshot the counters; they are restored after each failed attempt below
michael@0 | 1843 | oldnumsyllable2 = numsyllable; |
michael@0 | 1844 | oldwordnum2 = wordnum; |
michael@0 | 1845 | |
michael@0 | 1846 | |
michael@0 | 1847 | // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code |
michael@0 | 1848 | if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { |
michael@0 | 1849 | numsyllable--; |
michael@0 | 1850 | } |
michael@0 | 1851 | // END of LANG_hu section |
michael@0 | 1852 | |
michael@0 | 1853 | // increment word number, if the second root has a compoundroot flag |
michael@0 | 1854 | if ((rv) && (compoundroot) && |
michael@0 | 1855 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 1856 | wordnum++; |
michael@0 | 1857 | } |
michael@0 | 1858 | |
michael@0 | 1859 | // check forbiddenwords |
michael@0 | 1860 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 1861 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
michael@0 | 1862 | (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; |
michael@0 | 1863 | |
michael@0 | 1864 | // second word is acceptable, as a root? |
michael@0 | 1865 | // hungarian conventions: compounding is acceptable, |
michael@0 | 1866 | // when compound forms consist of 2 words, or if more, |
michael@0 | 1867 | // then the syllable number of root words must be 6, or lesser. |
michael@0 | 1868 | |
michael@0 | 1869 | if ((rv) && ( |
michael@0 | 1870 | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 1871 | (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) |
michael@0 | 1872 | ) |
michael@0 | 1873 | && ( |
michael@0 | 1874 | ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || |
michael@0 | 1875 | ((cpdmaxsyllable!=0) && |
michael@0 | 1876 | (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) |
michael@0 | 1877 | ) && |
michael@0 | 1878 | ( |
michael@0 | 1879 | // test CHECKCOMPOUNDPATTERN |
michael@0 | 1880 | !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0) |
michael@0 | 1881 | ) && |
michael@0 | 1882 | ( |
michael@0 | 1883 | (!checkcompounddup || (rv != rv_first)) |
michael@0 | 1884 | ) |
michael@0 | 1885 | // test CHECKCOMPOUNDPATTERN conditions |
michael@0 | 1886 | && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || |
michael@0 | 1887 | TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) |
michael@0 | 1888 | ) |
michael@0 | 1889 | { |
michael@0 | 1890 | // forbid compound word, if it is a non compound word with typical fault |
michael@0 | 1891 | if (checkcompoundrep && cpdrep_check(word,len)) return NULL; |
michael@0 | 1892 | return rv_first; |
michael@0 | 1893 | } |
michael@0 | 1894 | |
michael@0 | 1895 | numsyllable = oldnumsyllable2; |
michael@0 | 1896 | wordnum = oldwordnum2; |
michael@0 | 1897 | |
michael@0 | 1898 | // perhaps second word has prefix or/and suffix |
// note: the affix probes below read the remainder from word + i, not st + i
michael@0 | 1899 | sfx = NULL; |
michael@0 | 1900 | sfxflag = FLAG_NULL; |
michael@0 | 1901 | rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL; |
michael@0 | 1902 | if (!rv && compoundend && !onlycpdrule) { |
michael@0 | 1903 | sfx = NULL; |
michael@0 | 1904 | pfx = NULL; |
michael@0 | 1905 | rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); |
michael@0 | 1906 | } |
michael@0 | 1907 | |
// COMPOUNDRULE case: an affixed remainder is accepted only through defcpd_check()
michael@0 | 1908 | if (!rv && numdefcpd && words) { |
michael@0 | 1909 | rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); |
michael@0 | 1910 | if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; |
michael@0 | 1911 | rv = NULL; |
michael@0 | 1912 | } |
michael@0 | 1913 | |
michael@0 | 1914 | // test CHECKCOMPOUNDPATTERN conditions (allowed forms) |
michael@0 | 1915 | if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || |
michael@0 | 1916 | TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL; |
michael@0 | 1917 | |
michael@0 | 1918 | // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) |
michael@0 | 1919 | if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL; |
michael@0 | 1920 | |
michael@0 | 1921 | // check non_compound flag in suffix and prefix |
michael@0 | 1922 | if ((rv) && |
michael@0 | 1923 | ((pfx && pfx->getCont() && |
michael@0 | 1924 | TESTAFF(pfx->getCont(), compoundforbidflag, |
michael@0 | 1925 | pfx->getContLen())) || |
michael@0 | 1926 | (sfx && sfx->getCont() && |
michael@0 | 1927 | TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 1928 | sfx->getContLen())))) { |
michael@0 | 1929 | rv = NULL; |
michael@0 | 1930 | } |
michael@0 | 1931 | |
michael@0 | 1932 | // check FORCEUCASE |
michael@0 | 1933 | if (rv && forceucase && (rv) && |
michael@0 | 1934 | (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL; |
michael@0 | 1935 | |
michael@0 | 1936 | // check forbiddenwords |
michael@0 | 1937 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 1938 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
michael@0 | 1939 | (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; |
michael@0 | 1940 | |
michael@0 | 1941 | // pfxappnd = prefix of word+i, or NULL |
michael@0 | 1942 | // calculate syllable number of prefix. |
michael@0 | 1943 | // hungarian convention: when syllable number of prefix is more, |
michael@0 | 1944 | // than 1, the prefix+word counts as two words. |
michael@0 | 1945 | |
michael@0 | 1946 | if (langnum == LANG_hu) { |
michael@0 | 1947 | // calculate syllable number of the word |
michael@0 | 1948 | numsyllable += get_syllable(word + i, strlen(word + i)); |
michael@0 | 1949 | |
michael@0 | 1950 | // - affix syllable num. |
michael@0 | 1951 | // XXX only second suffix (inflections, not derivations) |
michael@0 | 1952 | if (sfxappnd) { |
michael@0 | 1953 | char * tmp = myrevstrdup(sfxappnd); |
michael@0 | 1954 | numsyllable -= get_syllable(tmp, strlen(tmp)); |
michael@0 | 1955 | free(tmp); |
michael@0 | 1956 | } |
michael@0 | 1957 | |
michael@0 | 1958 | // + 1 word, if syllable number of the prefix > 1 (hungarian convention) |
michael@0 | 1959 | if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; |
michael@0 | 1960 | |
michael@0 | 1961 | // increment syllable num, if last word has a SYLLABLENUM flag |
michael@0 | 1962 | // and the suffix is beginning `s' |
michael@0 | 1963 | |
michael@0 | 1964 | if (cpdsyllablenum) { |
michael@0 | 1965 | switch (sfxflag) { |
michael@0 | 1966 | case 'c': { numsyllable+=2; break; } |
michael@0 | 1967 | case 'J': { numsyllable += 1; break; } |
michael@0 | 1968 | case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } |
michael@0 | 1969 | } |
michael@0 | 1970 | } |
michael@0 | 1971 | } |
michael@0 | 1972 | |
michael@0 | 1973 | // increment word number, if the second word has a compoundroot flag |
michael@0 | 1974 | if ((rv) && (compoundroot) && |
michael@0 | 1975 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 1976 | wordnum++; |
michael@0 | 1977 | } |
michael@0 | 1978 | |
michael@0 | 1979 | // second word is acceptable, as a word with prefix or/and suffix? |
michael@0 | 1980 | // hungarian conventions: compounding is acceptable, |
michael@0 | 1981 | // when compound forms consist 2 word, otherwise |
michael@0 | 1982 | // the syllable number of root words is 6, or lesser. |
michael@0 | 1983 | if ((rv) && |
michael@0 | 1984 | ( |
michael@0 | 1985 | ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || |
michael@0 | 1986 | ((cpdmaxsyllable != 0) && |
michael@0 | 1987 | (numsyllable <= cpdmaxsyllable)) |
michael@0 | 1988 | ) |
michael@0 | 1989 | && ( |
michael@0 | 1990 | (!checkcompounddup || (rv != rv_first)) |
michael@0 | 1991 | )) { |
michael@0 | 1992 | // forbid compound word, if it is a non compound word with typical fault |
michael@0 | 1993 | if (checkcompoundrep && cpdrep_check(word, len)) return NULL; |
michael@0 | 1994 | return rv_first; |
michael@0 | 1995 | } |
michael@0 | 1996 | |
michael@0 | 1997 | numsyllable = oldnumsyllable2; |
michael@0 | 1998 | wordnum = oldwordnum2; |
michael@0 | 1999 | |
michael@0 | 2000 | // perhaps second word is a compound word (recursive call) |
michael@0 | 2001 | if (wordnum < maxwordnum) { |
michael@0 | 2002 | rv = compound_check((st+i),strlen(st+i), wordnum+1, |
michael@0 | 2003 | numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info); |
michael@0 | 2004 | |
// plain pass: a cpdpat_check() hit forbids the join;
// simplified-pattern pass: the join is kept only if cpdpat_check() hits
michael@0 | 2005 | if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) || |
michael@0 | 2006 | (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL; |
michael@0 | 2007 | } else { |
michael@0 | 2008 | rv=NULL; |
michael@0 | 2009 | } |
michael@0 | 2010 | if (rv) { |
michael@0 | 2011 | // forbid compound word, if it is a non compound word with typical fault |
michael@0 | 2012 | if (checkcompoundrep || forbiddenword) { |
michael@0 | 2013 | struct hentry * rv2 = NULL; |
michael@0 | 2014 | |
michael@0 | 2015 | if (checkcompoundrep && cpdrep_check(word, len)) return NULL; |
michael@0 | 2016 | |
michael@0 | 2017 | // check first part |
// cut st right after the first component returned by the recursive call,
// keeping the overwritten byte in r
michael@0 | 2018 | if (strncmp(rv->word, word + i, rv->blen) == 0) { |
michael@0 | 2019 | char r = *(st + i + rv->blen); |
michael@0 | 2020 | *(st + i + rv->blen) = '\0'; |
michael@0 | 2021 | |
michael@0 | 2022 | if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) { |
michael@0 | 2023 | *(st + i + rv->blen) = r; |
// inside a do/while, continue jumps to the striple loop condition
michael@0 | 2024 | continue; |
michael@0 | 2025 | } |
michael@0 | 2026 | |
michael@0 | 2027 | if (forbiddenword) { |
michael@0 | 2028 | rv2 = lookup(word); |
michael@0 | 2029 | if (!rv2) rv2 = affix_check(word, len); |
michael@0 | 2030 | if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) && |
michael@0 | 2031 | (strncmp(rv2->word, st, i + rv->blen) == 0)) { |
michael@0 | 2032 | return NULL; |
michael@0 | 2033 | } |
michael@0 | 2034 | } |
michael@0 | 2035 | *(st + i + rv->blen) = r; |
michael@0 | 2036 | } |
michael@0 | 2037 | } |
michael@0 | 2038 | return rv_first; |
michael@0 | 2039 | } |
michael@0 | 2040 | } while (striple && !checkedstriple); // end of striple loop |
michael@0 | 2041 | |
// undo the i-- applied for the simplified-triple retry
michael@0 | 2042 | if (checkedstriple) { |
michael@0 | 2043 | i++; |
michael@0 | 2044 | checkedstriple = 0; |
michael@0 | 2045 | striple = 0; |
michael@0 | 2046 | } |
michael@0 | 2047 | |
michael@0 | 2048 | } // first word is ok condition |
michael@0 | 2049 | |
// restore the values saved before the simplified-pattern rewrite of st
michael@0 | 2050 | if (soldi != 0) { |
michael@0 | 2051 | i = soldi; |
michael@0 | 2052 | soldi = 0; |
michael@0 | 2053 | len = oldlen; |
michael@0 | 2054 | cmin = oldcmin; |
michael@0 | 2055 | cmax = oldcmax; |
michael@0 | 2056 | } |
michael@0 | 2057 | scpd++; |
michael@0 | 2058 | |
michael@0 | 2059 | |
michael@0 | 2060 | } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop |
michael@0 | 2061 | |
// reset per-split state before the next pass or the next split offset
michael@0 | 2062 | scpd = 0; |
michael@0 | 2063 | wordnum = oldwordnum; |
michael@0 | 2064 | numsyllable = oldnumsyllable; |
michael@0 | 2065 | |
michael@0 | 2066 | if (soldi != 0) { |
michael@0 | 2067 | i = soldi; |
michael@0 | 2068 | strcpy(st, word); // XXX add more optim. |
michael@0 | 2069 | soldi = 0; |
michael@0 | 2070 | } else st[i] = ch; |
michael@0 | 2071 | |
// the assignment inside the condition switches to a second, COMPOUNDRULE-only
// pass; it runs once, and only for the first component when rules exist
michael@0 | 2072 | } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop |
michael@0 | 2073 | |
michael@0 | 2074 | } |
michael@0 | 2075 | |
michael@0 | 2076 | return NULL; |
michael@0 | 2077 | } |
michael@0 | 2078 | |
michael@0 | 2079 | // check if compound word is correctly spelled |
michael@0 | 2080 | // hu_mov_rule = spec. Hungarian rule (XXX) |
michael@0 | 2081 | int AffixMgr::compound_check_morph(const char * word, int len, |
michael@0 | 2082 | short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, |
michael@0 | 2083 | char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL) |
michael@0 | 2084 | { |
michael@0 | 2085 | int i; |
michael@0 | 2086 | short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; |
michael@0 | 2087 | int ok = 0; |
michael@0 | 2088 | |
michael@0 | 2089 | struct hentry * rv = NULL; |
michael@0 | 2090 | struct hentry * rv_first; |
michael@0 | 2091 | struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking |
michael@0 | 2092 | char st [MAXWORDUTF8LEN + 4]; |
michael@0 | 2093 | char ch; |
michael@0 | 2094 | |
michael@0 | 2095 | int checked_prefix; |
michael@0 | 2096 | char presult[MAXLNLEN]; |
michael@0 | 2097 | |
michael@0 | 2098 | int cmin; |
michael@0 | 2099 | int cmax; |
michael@0 | 2100 | |
michael@0 | 2101 | int onlycpdrule; |
michael@0 | 2102 | int affixed = 0; |
michael@0 | 2103 | hentry ** oldwords = words; |
michael@0 | 2104 | |
michael@0 | 2105 | setcminmax(&cmin, &cmax, word, len); |
michael@0 | 2106 | |
michael@0 | 2107 | strcpy(st, word); |
michael@0 | 2108 | |
michael@0 | 2109 | for (i = cmin; i < cmax; i++) { |
michael@0 | 2110 | oldnumsyllable = numsyllable; |
michael@0 | 2111 | oldwordnum = wordnum; |
michael@0 | 2112 | checked_prefix = 0; |
michael@0 | 2113 | |
michael@0 | 2114 | // go to end of the UTF-8 character |
michael@0 | 2115 | if (utf8) { |
michael@0 | 2116 | for (; (st[i] & 0xc0) == 0x80; i++); |
michael@0 | 2117 | if (i >= cmax) return 0; |
michael@0 | 2118 | } |
michael@0 | 2119 | |
michael@0 | 2120 | words = oldwords; |
michael@0 | 2121 | onlycpdrule = (words) ? 1 : 0; |
michael@0 | 2122 | |
michael@0 | 2123 | do { // onlycpdrule loop |
michael@0 | 2124 | |
michael@0 | 2125 | oldnumsyllable = numsyllable; |
michael@0 | 2126 | oldwordnum = wordnum; |
michael@0 | 2127 | checked_prefix = 0; |
michael@0 | 2128 | |
michael@0 | 2129 | ch = st[i]; |
michael@0 | 2130 | st[i] = '\0'; |
michael@0 | 2131 | sfx = NULL; |
michael@0 | 2132 | |
michael@0 | 2133 | // FIRST WORD |
michael@0 | 2134 | |
michael@0 | 2135 | affixed = 1; |
michael@0 | 2136 | |
michael@0 | 2137 | *presult = '\0'; |
michael@0 | 2138 | if (partresult) mystrcat(presult, partresult, MAXLNLEN); |
michael@0 | 2139 | |
michael@0 | 2140 | rv = lookup(st); // perhaps without prefix |
michael@0 | 2141 | |
michael@0 | 2142 | // search homonym with compound flag |
michael@0 | 2143 | while ((rv) && !hu_mov_rule && |
michael@0 | 2144 | ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
michael@0 | 2145 | !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 2146 | (compoundbegin && !wordnum && !onlycpdrule && |
michael@0 | 2147 | TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
michael@0 | 2148 | (compoundmiddle && wordnum && !words && !onlycpdrule && |
michael@0 | 2149 | TESTAFF(rv->astr, compoundmiddle, rv->alen)) || |
michael@0 | 2150 | (numdefcpd && onlycpdrule && |
michael@0 | 2151 | ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || |
michael@0 | 2152 | (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) |
michael@0 | 2153 | ))) { |
michael@0 | 2154 | rv = rv->next_homonym; |
michael@0 | 2155 | } |
michael@0 | 2156 | |
michael@0 | 2157 | if (rv) affixed = 0; |
michael@0 | 2158 | |
michael@0 | 2159 | if (rv) { |
michael@0 | 2160 | sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); |
michael@0 | 2161 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
michael@0 | 2162 | sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); |
michael@0 | 2163 | } |
michael@0 | 2164 | // store the pointer of the hash entry |
michael@0 | 2165 | // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); |
michael@0 | 2166 | if (HENTRY_DATA(rv)) { |
michael@0 | 2167 | sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv)); |
michael@0 | 2168 | } |
michael@0 | 2169 | } |
michael@0 | 2170 | |
michael@0 | 2171 | if (!rv) { |
michael@0 | 2172 | if (onlycpdrule) break; |
michael@0 | 2173 | if (compoundflag && |
michael@0 | 2174 | !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { |
michael@0 | 2175 | if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, |
michael@0 | 2176 | FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && |
michael@0 | 2177 | sfx->getCont() && |
michael@0 | 2178 | ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 2179 | sfx->getContLen())) || (compoundend && |
michael@0 | 2180 | TESTAFF(sfx->getCont(), compoundend, |
michael@0 | 2181 | sfx->getContLen())))) { |
michael@0 | 2182 | rv = NULL; |
michael@0 | 2183 | } |
michael@0 | 2184 | } |
michael@0 | 2185 | |
michael@0 | 2186 | if (rv || |
michael@0 | 2187 | (((wordnum == 0) && compoundbegin && |
michael@0 | 2188 | ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
michael@0 | 2189 | (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || |
michael@0 | 2190 | ((wordnum > 0) && compoundmiddle && |
michael@0 | 2191 | ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || |
michael@0 | 2192 | (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) |
michael@0 | 2193 | ) { |
michael@0 | 2194 | // char * p = prefix_check_morph(st, i, 0, compound); |
michael@0 | 2195 | char * p = NULL; |
michael@0 | 2196 | if (compoundflag) p = affix_check_morph(st, i, compoundflag); |
michael@0 | 2197 | if (!p || (*p == '\0')) { |
michael@0 | 2198 | if (p) free(p); |
michael@0 | 2199 | p = NULL; |
michael@0 | 2200 | if ((wordnum == 0) && compoundbegin) { |
michael@0 | 2201 | p = affix_check_morph(st, i, compoundbegin); |
michael@0 | 2202 | } else if ((wordnum > 0) && compoundmiddle) { |
michael@0 | 2203 | p = affix_check_morph(st, i, compoundmiddle); |
michael@0 | 2204 | } |
michael@0 | 2205 | } |
michael@0 | 2206 | if (p && (*p != '\0')) { |
michael@0 | 2207 | sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, |
michael@0 | 2208 | MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); |
michael@0 | 2209 | } |
michael@0 | 2210 | if (p) free(p); |
michael@0 | 2211 | checked_prefix = 1; |
michael@0 | 2212 | } |
michael@0 | 2213 | // else check forbiddenwords |
michael@0 | 2214 | } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 2215 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || |
michael@0 | 2216 | TESTAFF(rv->astr, needaffix, rv->alen))) { |
michael@0 | 2217 | st[i] = ch; |
michael@0 | 2218 | continue; |
michael@0 | 2219 | } |
michael@0 | 2220 | |
michael@0 | 2221 | // check non_compound flag in suffix and prefix |
michael@0 | 2222 | if ((rv) && !hu_mov_rule && |
michael@0 | 2223 | ((pfx && pfx->getCont() && |
michael@0 | 2224 | TESTAFF(pfx->getCont(), compoundforbidflag, |
michael@0 | 2225 | pfx->getContLen())) || |
michael@0 | 2226 | (sfx && sfx->getCont() && |
michael@0 | 2227 | TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 2228 | sfx->getContLen())))) { |
michael@0 | 2229 | continue; |
michael@0 | 2230 | } |
michael@0 | 2231 | |
michael@0 | 2232 | // check compoundend flag in suffix and prefix |
michael@0 | 2233 | if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && |
michael@0 | 2234 | ((pfx && pfx->getCont() && |
michael@0 | 2235 | TESTAFF(pfx->getCont(), compoundend, |
michael@0 | 2236 | pfx->getContLen())) || |
michael@0 | 2237 | (sfx && sfx->getCont() && |
michael@0 | 2238 | TESTAFF(sfx->getCont(), compoundend, |
michael@0 | 2239 | sfx->getContLen())))) { |
michael@0 | 2240 | continue; |
michael@0 | 2241 | } |
michael@0 | 2242 | |
michael@0 | 2243 | // check compoundmiddle flag in suffix and prefix |
michael@0 | 2244 | if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && |
michael@0 | 2245 | ((pfx && pfx->getCont() && |
michael@0 | 2246 | TESTAFF(pfx->getCont(), compoundmiddle, |
michael@0 | 2247 | pfx->getContLen())) || |
michael@0 | 2248 | (sfx && sfx->getCont() && |
michael@0 | 2249 | TESTAFF(sfx->getCont(), compoundmiddle, |
michael@0 | 2250 | sfx->getContLen())))) { |
michael@0 | 2251 | rv = NULL; |
michael@0 | 2252 | } |
michael@0 | 2253 | |
michael@0 | 2254 | // check forbiddenwords |
michael@0 | 2255 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) |
michael@0 | 2256 | || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue; |
michael@0 | 2257 | |
michael@0 | 2258 | // increment word number, if the second root has a compoundroot flag |
michael@0 | 2259 | if ((rv) && (compoundroot) && |
michael@0 | 2260 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 2261 | wordnum++; |
michael@0 | 2262 | } |
michael@0 | 2263 | |
michael@0 | 2264 | // first word is acceptable in compound words? |
michael@0 | 2265 | if (((rv) && |
michael@0 | 2266 | ( checked_prefix || (words && words[wnum]) || |
michael@0 | 2267 | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 2268 | ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || |
michael@0 | 2269 | ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen)) |
michael@0 | 2270 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 2271 | || ((langnum == LANG_hu) && // hu_mov_rule |
michael@0 | 2272 | hu_mov_rule && ( |
michael@0 | 2273 | TESTAFF(rv->astr, 'F', rv->alen) || |
michael@0 | 2274 | TESTAFF(rv->astr, 'G', rv->alen) || |
michael@0 | 2275 | TESTAFF(rv->astr, 'H', rv->alen) |
michael@0 | 2276 | ) |
michael@0 | 2277 | ) |
michael@0 | 2278 | // END of LANG_hu section |
michael@0 | 2279 | ) |
michael@0 | 2280 | && ! (( checkcompoundtriple && !words && // test triple letters |
michael@0 | 2281 | (word[i-1]==word[i]) && ( |
michael@0 | 2282 | ((i>1) && (word[i-1]==word[i-2])) || |
michael@0 | 2283 | ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' |
michael@0 | 2284 | ) |
michael@0 | 2285 | ) || |
michael@0 | 2286 | ( |
michael@0 | 2287 | // test CHECKCOMPOUNDPATTERN |
michael@0 | 2288 | numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed) |
michael@0 | 2289 | ) || |
michael@0 | 2290 | ( |
michael@0 | 2291 | checkcompoundcase && !words && cpdcase_check(word, i) |
michael@0 | 2292 | )) |
michael@0 | 2293 | ) |
michael@0 | 2294 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 2295 | || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && |
michael@0 | 2296 | (sfx && sfx->getCont() && ( |
michael@0 | 2297 | TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) || |
michael@0 | 2298 | TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen()) |
michael@0 | 2299 | ) |
michael@0 | 2300 | ) |
michael@0 | 2301 | ) |
michael@0 | 2302 | // END of LANG_hu section |
michael@0 | 2303 | ) { |
michael@0 | 2304 | |
michael@0 | 2305 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 2306 | if (langnum == LANG_hu) { |
michael@0 | 2307 | // calculate syllable number of the word |
michael@0 | 2308 | numsyllable += get_syllable(st, i); |
michael@0 | 2309 | |
michael@0 | 2310 | // + 1 word, if syllable number of the prefix > 1 (hungarian convention) |
michael@0 | 2311 | if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; |
michael@0 | 2312 | } |
michael@0 | 2313 | // END of LANG_hu section |
michael@0 | 2314 | |
michael@0 | 2315 | // NEXT WORD(S) |
michael@0 | 2316 | rv_first = rv; |
michael@0 | 2317 | rv = lookup((word+i)); // perhaps without prefix |
michael@0 | 2318 | |
michael@0 | 2319 | // search homonym with compound flag |
michael@0 | 2320 | while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || |
michael@0 | 2321 | !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 2322 | (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || |
michael@0 | 2323 | (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { |
michael@0 | 2324 | rv = rv->next_homonym; |
michael@0 | 2325 | } |
michael@0 | 2326 | |
michael@0 | 2327 | if (rv && words && words[wnum + 1]) { |
michael@0 | 2328 | mystrcat(*result, presult, MAXLNLEN); |
michael@0 | 2329 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2330 | mystrcat(*result, MORPH_PART, MAXLNLEN); |
michael@0 | 2331 | mystrcat(*result, word+i, MAXLNLEN); |
michael@0 | 2332 | if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2333 | if (!HENTRY_FIND(rv, MORPH_STEM)) { |
michael@0 | 2334 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2335 | mystrcat(*result, MORPH_STEM, MAXLNLEN); |
michael@0 | 2336 | mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); |
michael@0 | 2337 | } |
michael@0 | 2338 | // store the pointer of the hash entry |
michael@0 | 2339 | // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); |
michael@0 | 2340 | if (!complexprefixes && HENTRY_DATA(rv)) { |
michael@0 | 2341 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2342 | mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2343 | } |
michael@0 | 2344 | mystrcat(*result, "\n", MAXLNLEN); |
michael@0 | 2345 | ok = 1; |
michael@0 | 2346 | return 0; |
michael@0 | 2347 | } |
michael@0 | 2348 | |
michael@0 | 2349 | oldnumsyllable2 = numsyllable; |
michael@0 | 2350 | oldwordnum2 = wordnum; |
michael@0 | 2351 | |
michael@0 | 2352 | // LANG_hu section: spec. Hungarian rule |
michael@0 | 2353 | if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { |
michael@0 | 2354 | numsyllable--; |
michael@0 | 2355 | } |
michael@0 | 2356 | // END of LANG_hu section |
michael@0 | 2357 | // increment word number, if the second root has a compoundroot flag |
michael@0 | 2358 | if ((rv) && (compoundroot) && |
michael@0 | 2359 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 2360 | wordnum++; |
michael@0 | 2361 | } |
michael@0 | 2362 | |
michael@0 | 2363 | // check forbiddenwords |
michael@0 | 2364 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || |
michael@0 | 2365 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) { |
michael@0 | 2366 | st[i] = ch; |
michael@0 | 2367 | continue; |
michael@0 | 2368 | } |
michael@0 | 2369 | |
michael@0 | 2370 | // second word is acceptable, as a root? |
michael@0 | 2371 | // hungarian conventions: compounding is acceptable, |
michael@0 | 2372 | // when compound forms consist of 2 words, or if more, |
michael@0 | 2373 | // then the syllable number of root words must be 6, or lesser. |
michael@0 | 2374 | if ((rv) && ( |
michael@0 | 2375 | (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || |
michael@0 | 2376 | (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) |
michael@0 | 2377 | ) |
michael@0 | 2378 | && ( |
michael@0 | 2379 | ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || |
michael@0 | 2380 | ((cpdmaxsyllable!=0) && |
michael@0 | 2381 | (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) |
michael@0 | 2382 | ) |
michael@0 | 2383 | && ( |
michael@0 | 2384 | (!checkcompounddup || (rv != rv_first)) |
michael@0 | 2385 | ) |
michael@0 | 2386 | ) |
michael@0 | 2387 | { |
michael@0 | 2388 | // bad compound word |
michael@0 | 2389 | mystrcat(*result, presult, MAXLNLEN); |
michael@0 | 2390 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2391 | mystrcat(*result, MORPH_PART, MAXLNLEN); |
michael@0 | 2392 | mystrcat(*result, word+i, MAXLNLEN); |
michael@0 | 2393 | |
michael@0 | 2394 | if (HENTRY_DATA(rv)) { |
michael@0 | 2395 | if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2396 | if (! HENTRY_FIND(rv, MORPH_STEM)) { |
michael@0 | 2397 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2398 | mystrcat(*result, MORPH_STEM, MAXLNLEN); |
michael@0 | 2399 | mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN); |
michael@0 | 2400 | } |
michael@0 | 2401 | // store the pointer of the hash entry |
michael@0 | 2402 | // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); |
michael@0 | 2403 | if (!complexprefixes) { |
michael@0 | 2404 | mystrcat(*result, " ", MAXLNLEN); |
michael@0 | 2405 | mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2406 | } |
michael@0 | 2407 | } |
michael@0 | 2408 | mystrcat(*result, "\n", MAXLNLEN); |
michael@0 | 2409 | ok = 1; |
michael@0 | 2410 | } |
michael@0 | 2411 | |
michael@0 | 2412 | numsyllable = oldnumsyllable2 ; |
michael@0 | 2413 | wordnum = oldwordnum2; |
michael@0 | 2414 | |
michael@0 | 2415 | // perhaps second word has prefix or/and suffix |
michael@0 | 2416 | sfx = NULL; |
michael@0 | 2417 | sfxflag = FLAG_NULL; |
michael@0 | 2418 | |
michael@0 | 2419 | if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL; |
michael@0 | 2420 | |
michael@0 | 2421 | if (!rv && compoundend && !onlycpdrule) { |
michael@0 | 2422 | sfx = NULL; |
michael@0 | 2423 | pfx = NULL; |
michael@0 | 2424 | rv = affix_check((word+i),strlen(word+i), compoundend); |
michael@0 | 2425 | } |
michael@0 | 2426 | |
michael@0 | 2427 | if (!rv && numdefcpd && words) { |
michael@0 | 2428 | rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); |
michael@0 | 2429 | if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { |
michael@0 | 2430 | char * m = NULL; |
michael@0 | 2431 | if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); |
michael@0 | 2432 | if ((!m || *m == '\0') && compoundend) { |
michael@0 | 2433 | if (m) free(m); |
michael@0 | 2434 | m = affix_check_morph((word+i),strlen(word+i), compoundend); |
michael@0 | 2435 | } |
michael@0 | 2436 | mystrcat(*result, presult, MAXLNLEN); |
michael@0 | 2437 | if (m || (*m != '\0')) { |
michael@0 | 2438 | sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, |
michael@0 | 2439 | MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); |
michael@0 | 2440 | } |
michael@0 | 2441 | if (m) free(m); |
michael@0 | 2442 | mystrcat(*result, "\n", MAXLNLEN); |
michael@0 | 2443 | ok = 1; |
michael@0 | 2444 | } |
michael@0 | 2445 | } |
michael@0 | 2446 | |
michael@0 | 2447 | // check non_compound flag in suffix and prefix |
michael@0 | 2448 | if ((rv) && |
michael@0 | 2449 | ((pfx && pfx->getCont() && |
michael@0 | 2450 | TESTAFF(pfx->getCont(), compoundforbidflag, |
michael@0 | 2451 | pfx->getContLen())) || |
michael@0 | 2452 | (sfx && sfx->getCont() && |
michael@0 | 2453 | TESTAFF(sfx->getCont(), compoundforbidflag, |
michael@0 | 2454 | sfx->getContLen())))) { |
michael@0 | 2455 | rv = NULL; |
michael@0 | 2456 | } |
michael@0 | 2457 | |
michael@0 | 2458 | // check forbiddenwords |
michael@0 | 2459 | if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) || |
michael@0 | 2460 | TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) |
michael@0 | 2461 | && (! TESTAFF(rv->astr, needaffix, rv->alen))) { |
michael@0 | 2462 | st[i] = ch; |
michael@0 | 2463 | continue; |
michael@0 | 2464 | } |
michael@0 | 2465 | |
michael@0 | 2466 | if (langnum == LANG_hu) { |
michael@0 | 2467 | // calculate syllable number of the word |
michael@0 | 2468 | numsyllable += get_syllable(word + i, strlen(word + i)); |
michael@0 | 2469 | |
michael@0 | 2470 | // - affix syllable num. |
michael@0 | 2471 | // XXX only second suffix (inflections, not derivations) |
michael@0 | 2472 | if (sfxappnd) { |
michael@0 | 2473 | char * tmp = myrevstrdup(sfxappnd); |
michael@0 | 2474 | numsyllable -= get_syllable(tmp, strlen(tmp)); |
michael@0 | 2475 | free(tmp); |
michael@0 | 2476 | } |
michael@0 | 2477 | |
michael@0 | 2478 | // + 1 word, if syllable number of the prefix > 1 (hungarian convention) |
michael@0 | 2479 | if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++; |
michael@0 | 2480 | |
michael@0 | 2481 | // increment syllable num, if last word has a SYLLABLENUM flag |
michael@0 | 2482 | // and the suffix is beginning `s' |
michael@0 | 2483 | |
michael@0 | 2484 | if (cpdsyllablenum) { |
michael@0 | 2485 | switch (sfxflag) { |
michael@0 | 2486 | case 'c': { numsyllable+=2; break; } |
michael@0 | 2487 | case 'J': { numsyllable += 1; break; } |
michael@0 | 2488 | case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } |
michael@0 | 2489 | } |
michael@0 | 2490 | } |
michael@0 | 2491 | } |
michael@0 | 2492 | |
michael@0 | 2493 | // increment word number, if the second word has a compoundroot flag |
michael@0 | 2494 | if ((rv) && (compoundroot) && |
michael@0 | 2495 | (TESTAFF(rv->astr, compoundroot, rv->alen))) { |
michael@0 | 2496 | wordnum++; |
michael@0 | 2497 | } |
michael@0 | 2498 | // second word is acceptable, as a word with prefix or/and suffix? |
michael@0 | 2499 | // hungarian conventions: compounding is acceptable, |
michael@0 | 2500 | // when compound forms consist 2 word, otherwise |
michael@0 | 2501 | // the syllable number of root words is 6, or lesser. |
michael@0 | 2502 | if ((rv) && |
michael@0 | 2503 | ( |
michael@0 | 2504 | ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || |
michael@0 | 2505 | ((cpdmaxsyllable!=0) && |
michael@0 | 2506 | (numsyllable <= cpdmaxsyllable)) |
michael@0 | 2507 | ) |
michael@0 | 2508 | && ( |
michael@0 | 2509 | (!checkcompounddup || (rv != rv_first)) |
michael@0 | 2510 | )) { |
michael@0 | 2511 | char * m = NULL; |
michael@0 | 2512 | if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); |
michael@0 | 2513 | if ((!m || *m == '\0') && compoundend) { |
michael@0 | 2514 | if (m) free(m); |
michael@0 | 2515 | m = affix_check_morph((word+i),strlen(word+i), compoundend); |
michael@0 | 2516 | } |
michael@0 | 2517 | mystrcat(*result, presult, MAXLNLEN); |
michael@0 | 2518 | if (m && (*m != '\0')) { |
michael@0 | 2519 | sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, |
michael@0 | 2520 | MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); |
michael@0 | 2521 | } |
michael@0 | 2522 | if (m) free(m); |
michael@0 | 2523 | sprintf(*result + strlen(*result), "%c", MSEP_REC); |
michael@0 | 2524 | ok = 1; |
michael@0 | 2525 | } |
michael@0 | 2526 | |
michael@0 | 2527 | numsyllable = oldnumsyllable2; |
michael@0 | 2528 | wordnum = oldwordnum2; |
michael@0 | 2529 | |
michael@0 | 2530 | // perhaps second word is a compound word (recursive call) |
michael@0 | 2531 | if ((wordnum < maxwordnum) && (ok == 0)) { |
michael@0 | 2532 | compound_check_morph((word+i),strlen(word+i), wordnum+1, |
michael@0 | 2533 | numsyllable, maxwordnum, wnum + 1, words, 0, result, presult); |
michael@0 | 2534 | } else { |
michael@0 | 2535 | rv=NULL; |
michael@0 | 2536 | } |
michael@0 | 2537 | } |
michael@0 | 2538 | st[i] = ch; |
michael@0 | 2539 | wordnum = oldwordnum; |
michael@0 | 2540 | numsyllable = oldnumsyllable; |
michael@0 | 2541 | |
michael@0 | 2542 | } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop |
michael@0 | 2543 | |
michael@0 | 2544 | } |
michael@0 | 2545 | return 0; |
michael@0 | 2546 | } |
michael@0 | 2547 | |
michael@0 | 2548 | // return 1 if s1 (reversed) is a leading subset of end of s2 |
michael@0 | 2549 | /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) |
michael@0 | 2550 | { |
michael@0 | 2551 | while ((len > 0) && *s1 && (*s1 == *end_of_s2)) { |
michael@0 | 2552 | s1++; |
michael@0 | 2553 | end_of_s2--; |
michael@0 | 2554 | len--; |
michael@0 | 2555 | } |
michael@0 | 2556 | return (*s1 == '\0'); |
michael@0 | 2557 | } |
michael@0 | 2558 | */ |
michael@0 | 2559 | |
michael@0 | 2560 | inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) |
michael@0 | 2561 | { |
michael@0 | 2562 | while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) { |
michael@0 | 2563 | s1++; |
michael@0 | 2564 | end_of_s2--; |
michael@0 | 2565 | len--; |
michael@0 | 2566 | } |
michael@0 | 2567 | return (*s1 == '\0'); |
michael@0 | 2568 | } |
michael@0 | 2569 | |
michael@0 | 2570 | // check word for suffixes |
michael@0 | 2571 | |
michael@0 | 2572 | struct hentry * AffixMgr::suffix_check (const char * word, int len, |
michael@0 | 2573 | int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns, |
michael@0 | 2574 | const FLAG cclass, const FLAG needflag, char in_compound) |
michael@0 | 2575 | { |
michael@0 | 2576 | struct hentry * rv = NULL; |
michael@0 | 2577 | PfxEntry* ep = ppfx; |
michael@0 | 2578 | |
michael@0 | 2579 | // first handle the special case of 0 length suffixes |
michael@0 | 2580 | SfxEntry * se = sStart[0]; |
michael@0 | 2581 | |
michael@0 | 2582 | while (se) { |
michael@0 | 2583 | if (!cclass || se->getCont()) { |
michael@0 | 2584 | // suffixes are not allowed in beginning of compounds |
michael@0 | 2585 | if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
michael@0 | 2586 | // except when signed with compoundpermitflag flag |
michael@0 | 2587 | (se->getCont() && compoundpermitflag && |
michael@0 | 2588 | TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || |
michael@0 | 2589 | // no circumfix flag in prefix and suffix |
michael@0 | 2590 | ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), |
michael@0 | 2591 | circumfix, ep->getContLen())) && |
michael@0 | 2592 | (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || |
michael@0 | 2593 | // circumfix flag in prefix AND suffix |
michael@0 | 2594 | ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), |
michael@0 | 2595 | circumfix, ep->getContLen())) && |
michael@0 | 2596 | (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && |
michael@0 | 2597 | // fogemorpheme |
michael@0 | 2598 | (in_compound || |
michael@0 | 2599 | !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) && |
michael@0 | 2600 | // needaffix on prefix or first suffix |
michael@0 | 2601 | (cclass || |
michael@0 | 2602 | !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || |
michael@0 | 2603 | (ppfx && !((ep->getCont()) && |
michael@0 | 2604 | TESTAFF(ep->getCont(), needaffix, |
michael@0 | 2605 | ep->getContLen()))) |
michael@0 | 2606 | )) { |
michael@0 | 2607 | rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass, |
michael@0 | 2608 | needflag, (in_compound ? 0 : onlyincompound)); |
michael@0 | 2609 | if (rv) { |
michael@0 | 2610 | sfx=se; // BUG: sfx not stateless |
michael@0 | 2611 | return rv; |
michael@0 | 2612 | } |
michael@0 | 2613 | } |
michael@0 | 2614 | } |
michael@0 | 2615 | se = se->getNext(); |
michael@0 | 2616 | } |
michael@0 | 2617 | |
michael@0 | 2618 | // now handle the general case |
michael@0 | 2619 | if (len == 0) return NULL; // FULLSTRIP |
michael@0 | 2620 | unsigned char sp= *((const unsigned char *)(word + len - 1)); |
michael@0 | 2621 | SfxEntry * sptr = sStart[sp]; |
michael@0 | 2622 | |
michael@0 | 2623 | while (sptr) { |
michael@0 | 2624 | if (isRevSubset(sptr->getKey(), word + len - 1, len) |
michael@0 | 2625 | ) { |
michael@0 | 2626 | // suffixes are not allowed in beginning of compounds |
michael@0 | 2627 | if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
michael@0 | 2628 | // except when signed with compoundpermitflag flag |
michael@0 | 2629 | (sptr->getCont() && compoundpermitflag && |
michael@0 | 2630 | TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || |
michael@0 | 2631 | // no circumfix flag in prefix and suffix |
michael@0 | 2632 | ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), |
michael@0 | 2633 | circumfix, ep->getContLen())) && |
michael@0 | 2634 | (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || |
michael@0 | 2635 | // circumfix flag in prefix AND suffix |
michael@0 | 2636 | ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), |
michael@0 | 2637 | circumfix, ep->getContLen())) && |
michael@0 | 2638 | (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && |
michael@0 | 2639 | // fogemorpheme |
michael@0 | 2640 | (in_compound || |
michael@0 | 2641 | !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && |
michael@0 | 2642 | // needaffix on prefix or first suffix |
michael@0 | 2643 | (cclass || |
michael@0 | 2644 | !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || |
michael@0 | 2645 | (ppfx && !((ep->getCont()) && |
michael@0 | 2646 | TESTAFF(ep->getCont(), needaffix, |
michael@0 | 2647 | ep->getContLen()))) |
michael@0 | 2648 | ) |
michael@0 | 2649 | ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) { |
michael@0 | 2650 | rv = sptr->checkword(word,len, sfxopts, ppfx, wlst, |
michael@0 | 2651 | maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound)); |
michael@0 | 2652 | if (rv) { |
michael@0 | 2653 | sfx=sptr; // BUG: sfx not stateless |
michael@0 | 2654 | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
michael@0 | 2655 | if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless |
michael@0 | 2656 | return rv; |
michael@0 | 2657 | } |
michael@0 | 2658 | } |
michael@0 | 2659 | sptr = sptr->getNextEQ(); |
michael@0 | 2660 | } else { |
michael@0 | 2661 | sptr = sptr->getNextNE(); |
michael@0 | 2662 | } |
michael@0 | 2663 | } |
michael@0 | 2664 | |
michael@0 | 2665 | return NULL; |
michael@0 | 2666 | } |
michael@0 | 2667 | |
michael@0 | 2668 | // check word for two-level suffixes |
michael@0 | 2669 | |
michael@0 | 2670 | struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, |
michael@0 | 2671 | int sfxopts, PfxEntry * ppfx, const FLAG needflag) |
michael@0 | 2672 | { |
michael@0 | 2673 | struct hentry * rv = NULL; |
michael@0 | 2674 | |
michael@0 | 2675 | // first handle the special case of 0 length suffixes |
michael@0 | 2676 | SfxEntry * se = sStart[0]; |
michael@0 | 2677 | while (se) { |
michael@0 | 2678 | if (contclasses[se->getFlag()]) |
michael@0 | 2679 | { |
michael@0 | 2680 | rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag); |
michael@0 | 2681 | if (rv) return rv; |
michael@0 | 2682 | } |
michael@0 | 2683 | se = se->getNext(); |
michael@0 | 2684 | } |
michael@0 | 2685 | |
michael@0 | 2686 | // now handle the general case |
michael@0 | 2687 | if (len == 0) return NULL; // FULLSTRIP |
michael@0 | 2688 | unsigned char sp = *((const unsigned char *)(word + len - 1)); |
michael@0 | 2689 | SfxEntry * sptr = sStart[sp]; |
michael@0 | 2690 | |
michael@0 | 2691 | while (sptr) { |
michael@0 | 2692 | if (isRevSubset(sptr->getKey(), word + len - 1, len)) { |
michael@0 | 2693 | if (contclasses[sptr->getFlag()]) |
michael@0 | 2694 | { |
michael@0 | 2695 | rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag); |
michael@0 | 2696 | if (rv) { |
michael@0 | 2697 | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
michael@0 | 2698 | if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless |
michael@0 | 2699 | return rv; |
michael@0 | 2700 | } |
michael@0 | 2701 | } |
michael@0 | 2702 | sptr = sptr->getNextEQ(); |
michael@0 | 2703 | } else { |
michael@0 | 2704 | sptr = sptr->getNextNE(); |
michael@0 | 2705 | } |
michael@0 | 2706 | } |
michael@0 | 2707 | |
michael@0 | 2708 | return NULL; |
michael@0 | 2709 | } |
michael@0 | 2710 | |
michael@0 | 2711 | char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, |
michael@0 | 2712 | int sfxopts, PfxEntry * ppfx, const FLAG needflag) |
michael@0 | 2713 | { |
michael@0 | 2714 | char result[MAXLNLEN]; |
michael@0 | 2715 | char result2[MAXLNLEN]; |
michael@0 | 2716 | char result3[MAXLNLEN]; |
michael@0 | 2717 | |
michael@0 | 2718 | char * st; |
michael@0 | 2719 | |
michael@0 | 2720 | result[0] = '\0'; |
michael@0 | 2721 | result2[0] = '\0'; |
michael@0 | 2722 | result3[0] = '\0'; |
michael@0 | 2723 | |
michael@0 | 2724 | // first handle the special case of 0 length suffixes |
michael@0 | 2725 | SfxEntry * se = sStart[0]; |
michael@0 | 2726 | while (se) { |
michael@0 | 2727 | if (contclasses[se->getFlag()]) |
michael@0 | 2728 | { |
michael@0 | 2729 | st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); |
michael@0 | 2730 | if (st) { |
michael@0 | 2731 | if (ppfx) { |
michael@0 | 2732 | if (ppfx->getMorph()) { |
michael@0 | 2733 | mystrcat(result, ppfx->getMorph(), MAXLNLEN); |
michael@0 | 2734 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2735 | } else debugflag(result, ppfx->getFlag()); |
michael@0 | 2736 | } |
michael@0 | 2737 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 2738 | free(st); |
michael@0 | 2739 | if (se->getMorph()) { |
michael@0 | 2740 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2741 | mystrcat(result, se->getMorph(), MAXLNLEN); |
michael@0 | 2742 | } else debugflag(result, se->getFlag()); |
michael@0 | 2743 | mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 2744 | } |
michael@0 | 2745 | } |
michael@0 | 2746 | se = se->getNext(); |
michael@0 | 2747 | } |
michael@0 | 2748 | |
michael@0 | 2749 | // now handle the general case |
michael@0 | 2750 | if (len == 0) return NULL; // FULLSTRIP |
michael@0 | 2751 | unsigned char sp = *((const unsigned char *)(word + len - 1)); |
michael@0 | 2752 | SfxEntry * sptr = sStart[sp]; |
michael@0 | 2753 | |
michael@0 | 2754 | while (sptr) { |
michael@0 | 2755 | if (isRevSubset(sptr->getKey(), word + len - 1, len)) { |
michael@0 | 2756 | if (contclasses[sptr->getFlag()]) |
michael@0 | 2757 | { |
michael@0 | 2758 | st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); |
michael@0 | 2759 | if (st) { |
michael@0 | 2760 | sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless |
michael@0 | 2761 | if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless |
michael@0 | 2762 | strcpy(result2, st); |
michael@0 | 2763 | free(st); |
michael@0 | 2764 | |
michael@0 | 2765 | result3[0] = '\0'; |
michael@0 | 2766 | |
michael@0 | 2767 | if (sptr->getMorph()) { |
michael@0 | 2768 | mystrcat(result3, " ", MAXLNLEN); |
michael@0 | 2769 | mystrcat(result3, sptr->getMorph(), MAXLNLEN); |
michael@0 | 2770 | } else debugflag(result3, sptr->getFlag()); |
michael@0 | 2771 | strlinecat(result2, result3); |
michael@0 | 2772 | mystrcat(result2, "\n", MAXLNLEN); |
michael@0 | 2773 | mystrcat(result, result2, MAXLNLEN); |
michael@0 | 2774 | } |
michael@0 | 2775 | } |
michael@0 | 2776 | sptr = sptr->getNextEQ(); |
michael@0 | 2777 | } else { |
michael@0 | 2778 | sptr = sptr->getNextNE(); |
michael@0 | 2779 | } |
michael@0 | 2780 | } |
michael@0 | 2781 | if (*result) return mystrdup(result); |
michael@0 | 2782 | return NULL; |
michael@0 | 2783 | } |
michael@0 | 2784 | |
michael@0 | 2785 | char * AffixMgr::suffix_check_morph(const char * word, int len, |
michael@0 | 2786 | int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound) |
michael@0 | 2787 | { |
michael@0 | 2788 | char result[MAXLNLEN]; |
michael@0 | 2789 | |
michael@0 | 2790 | struct hentry * rv = NULL; |
michael@0 | 2791 | |
michael@0 | 2792 | result[0] = '\0'; |
michael@0 | 2793 | |
michael@0 | 2794 | PfxEntry* ep = ppfx; |
michael@0 | 2795 | |
michael@0 | 2796 | // first handle the special case of 0 length suffixes |
michael@0 | 2797 | SfxEntry * se = sStart[0]; |
michael@0 | 2798 | while (se) { |
michael@0 | 2799 | if (!cclass || se->getCont()) { |
michael@0 | 2800 | // suffixes are not allowed in beginning of compounds |
michael@0 | 2801 | if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
michael@0 | 2802 | // except when signed with compoundpermitflag flag |
michael@0 | 2803 | (se->getCont() && compoundpermitflag && |
michael@0 | 2804 | TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || |
michael@0 | 2805 | // no circumfix flag in prefix and suffix |
michael@0 | 2806 | ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), |
michael@0 | 2807 | circumfix, ep->getContLen())) && |
michael@0 | 2808 | (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || |
michael@0 | 2809 | // circumfix flag in prefix AND suffix |
michael@0 | 2810 | ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), |
michael@0 | 2811 | circumfix, ep->getContLen())) && |
michael@0 | 2812 | (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && |
michael@0 | 2813 | // fogemorpheme |
michael@0 | 2814 | (in_compound || |
michael@0 | 2815 | !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && |
michael@0 | 2816 | // needaffix on prefix or first suffix |
michael@0 | 2817 | (cclass || |
michael@0 | 2818 | !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || |
michael@0 | 2819 | (ppfx && !((ep->getCont()) && |
michael@0 | 2820 | TESTAFF(ep->getCont(), needaffix, |
michael@0 | 2821 | ep->getContLen()))) |
michael@0 | 2822 | ) |
michael@0 | 2823 | )) |
michael@0 | 2824 | rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); |
michael@0 | 2825 | while (rv) { |
michael@0 | 2826 | if (ppfx) { |
michael@0 | 2827 | if (ppfx->getMorph()) { |
michael@0 | 2828 | mystrcat(result, ppfx->getMorph(), MAXLNLEN); |
michael@0 | 2829 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2830 | } else debugflag(result, ppfx->getFlag()); |
michael@0 | 2831 | } |
michael@0 | 2832 | if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2833 | if (! HENTRY_FIND(rv, MORPH_STEM)) { |
michael@0 | 2834 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2835 | mystrcat(result, MORPH_STEM, MAXLNLEN); |
michael@0 | 2836 | mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); |
michael@0 | 2837 | } |
michael@0 | 2838 | // store the pointer of the hash entry |
michael@0 | 2839 | // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); |
michael@0 | 2840 | |
michael@0 | 2841 | if (!complexprefixes && HENTRY_DATA(rv)) { |
michael@0 | 2842 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2843 | mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2844 | } |
michael@0 | 2845 | if (se->getMorph()) { |
michael@0 | 2846 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2847 | mystrcat(result, se->getMorph(), MAXLNLEN); |
michael@0 | 2848 | } else debugflag(result, se->getFlag()); |
michael@0 | 2849 | mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 2850 | rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); |
michael@0 | 2851 | } |
michael@0 | 2852 | } |
michael@0 | 2853 | se = se->getNext(); |
michael@0 | 2854 | } |
michael@0 | 2855 | |
michael@0 | 2856 | // now handle the general case |
michael@0 | 2857 | if (len == 0) return NULL; // FULLSTRIP |
michael@0 | 2858 | unsigned char sp = *((const unsigned char *)(word + len - 1)); |
michael@0 | 2859 | SfxEntry * sptr = sStart[sp]; |
michael@0 | 2860 | |
michael@0 | 2861 | while (sptr) { |
michael@0 | 2862 | if (isRevSubset(sptr->getKey(), word + len - 1, len) |
michael@0 | 2863 | ) { |
michael@0 | 2864 | // suffixes are not allowed in beginning of compounds |
michael@0 | 2865 | if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass |
michael@0 | 2866 | // except when signed with compoundpermitflag flag |
michael@0 | 2867 | (sptr->getCont() && compoundpermitflag && |
michael@0 | 2868 | TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || |
michael@0 | 2869 | // no circumfix flag in prefix and suffix |
michael@0 | 2870 | ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), |
michael@0 | 2871 | circumfix, ep->getContLen())) && |
michael@0 | 2872 | (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || |
michael@0 | 2873 | // circumfix flag in prefix AND suffix |
michael@0 | 2874 | ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), |
michael@0 | 2875 | circumfix, ep->getContLen())) && |
michael@0 | 2876 | (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && |
michael@0 | 2877 | // fogemorpheme |
michael@0 | 2878 | (in_compound || |
michael@0 | 2879 | !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && |
michael@0 | 2880 | // needaffix on first suffix |
michael@0 | 2881 | (cclass || !(sptr->getCont() && |
michael@0 | 2882 | TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) |
michael@0 | 2883 | )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); |
michael@0 | 2884 | while (rv) { |
michael@0 | 2885 | if (ppfx) { |
michael@0 | 2886 | if (ppfx->getMorph()) { |
michael@0 | 2887 | mystrcat(result, ppfx->getMorph(), MAXLNLEN); |
michael@0 | 2888 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2889 | } else debugflag(result, ppfx->getFlag()); |
michael@0 | 2890 | } |
michael@0 | 2891 | if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2892 | if (! HENTRY_FIND(rv, MORPH_STEM)) { |
michael@0 | 2893 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2894 | mystrcat(result, MORPH_STEM, MAXLNLEN); |
michael@0 | 2895 | mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); |
michael@0 | 2896 | } |
michael@0 | 2897 | // store the pointer of the hash entry |
michael@0 | 2898 | // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); |
michael@0 | 2899 | |
michael@0 | 2900 | if (!complexprefixes && HENTRY_DATA(rv)) { |
michael@0 | 2901 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2902 | mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); |
michael@0 | 2903 | } |
michael@0 | 2904 | |
michael@0 | 2905 | if (sptr->getMorph()) { |
michael@0 | 2906 | mystrcat(result, " ", MAXLNLEN); |
michael@0 | 2907 | mystrcat(result, sptr->getMorph(), MAXLNLEN); |
michael@0 | 2908 | } else debugflag(result, sptr->getFlag()); |
michael@0 | 2909 | mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 2910 | rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); |
michael@0 | 2911 | } |
michael@0 | 2912 | sptr = sptr->getNextEQ(); |
michael@0 | 2913 | } else { |
michael@0 | 2914 | sptr = sptr->getNextNE(); |
michael@0 | 2915 | } |
michael@0 | 2916 | } |
michael@0 | 2917 | |
michael@0 | 2918 | if (*result) return mystrdup(result); |
michael@0 | 2919 | return NULL; |
michael@0 | 2920 | } |
michael@0 | 2921 | |
michael@0 | 2922 | // check if word with affixes is correctly spelled |
michael@0 | 2923 | struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) |
michael@0 | 2924 | { |
michael@0 | 2925 | struct hentry * rv= NULL; |
michael@0 | 2926 | |
michael@0 | 2927 | // check all prefixes (also crossed with suffixes if allowed) |
michael@0 | 2928 | rv = prefix_check(word, len, in_compound, needflag); |
michael@0 | 2929 | if (rv) return rv; |
michael@0 | 2930 | |
michael@0 | 2931 | // if still not found check all suffixes |
michael@0 | 2932 | rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); |
michael@0 | 2933 | |
michael@0 | 2934 | if (havecontclass) { |
michael@0 | 2935 | sfx = NULL; |
michael@0 | 2936 | pfx = NULL; |
michael@0 | 2937 | |
michael@0 | 2938 | if (rv) return rv; |
michael@0 | 2939 | // if still not found check all two-level suffixes |
michael@0 | 2940 | rv = suffix_check_twosfx(word, len, 0, NULL, needflag); |
michael@0 | 2941 | |
michael@0 | 2942 | if (rv) return rv; |
michael@0 | 2943 | // if still not found check all two-level suffixes |
michael@0 | 2944 | rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag); |
michael@0 | 2945 | } |
michael@0 | 2946 | |
michael@0 | 2947 | return rv; |
michael@0 | 2948 | } |
michael@0 | 2949 | |
michael@0 | 2950 | // check if word with affixes is correctly spelled |
michael@0 | 2951 | char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) |
michael@0 | 2952 | { |
michael@0 | 2953 | char result[MAXLNLEN]; |
michael@0 | 2954 | char * st = NULL; |
michael@0 | 2955 | |
michael@0 | 2956 | *result = '\0'; |
michael@0 | 2957 | |
michael@0 | 2958 | // check all prefixes (also crossed with suffixes if allowed) |
michael@0 | 2959 | st = prefix_check_morph(word, len, in_compound); |
michael@0 | 2960 | if (st) { |
michael@0 | 2961 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 2962 | free(st); |
michael@0 | 2963 | } |
michael@0 | 2964 | |
michael@0 | 2965 | // if still not found check all suffixes |
michael@0 | 2966 | st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); |
michael@0 | 2967 | if (st) { |
michael@0 | 2968 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 2969 | free(st); |
michael@0 | 2970 | } |
michael@0 | 2971 | |
michael@0 | 2972 | if (havecontclass) { |
michael@0 | 2973 | sfx = NULL; |
michael@0 | 2974 | pfx = NULL; |
michael@0 | 2975 | // if still not found check all two-level suffixes |
michael@0 | 2976 | st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); |
michael@0 | 2977 | if (st) { |
michael@0 | 2978 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 2979 | free(st); |
michael@0 | 2980 | } |
michael@0 | 2981 | |
michael@0 | 2982 | // if still not found check all two-level suffixes |
michael@0 | 2983 | st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); |
michael@0 | 2984 | if (st) { |
michael@0 | 2985 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 2986 | free(st); |
michael@0 | 2987 | } |
michael@0 | 2988 | } |
michael@0 | 2989 | |
michael@0 | 2990 | return mystrdup(result); |
michael@0 | 2991 | } |
michael@0 | 2992 | |
michael@0 | 2993 | char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, |
michael@0 | 2994 | unsigned short al, char * morph, char * targetmorph, int level) |
michael@0 | 2995 | { |
michael@0 | 2996 | // handle suffixes |
michael@0 | 2997 | char * stemmorph; |
michael@0 | 2998 | char * stemmorphcatpos; |
michael@0 | 2999 | char mymorph[MAXLNLEN]; |
michael@0 | 3000 | |
michael@0 | 3001 | if (!morph) return NULL; |
michael@0 | 3002 | |
michael@0 | 3003 | // check substandard flag |
michael@0 | 3004 | if (TESTAFF(ap, substandard, al)) return NULL; |
michael@0 | 3005 | |
michael@0 | 3006 | if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); |
michael@0 | 3007 | |
michael@0 | 3008 | // int targetcount = get_sfxcount(targetmorph); |
michael@0 | 3009 | |
michael@0 | 3010 | // use input suffix fields, if exist |
michael@0 | 3011 | if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { |
michael@0 | 3012 | stemmorph = mymorph; |
michael@0 | 3013 | strcpy(stemmorph, morph); |
michael@0 | 3014 | mystrcat(stemmorph, " ", MAXLNLEN); |
michael@0 | 3015 | stemmorphcatpos = stemmorph + strlen(stemmorph); |
michael@0 | 3016 | } else { |
michael@0 | 3017 | stemmorph = morph; |
michael@0 | 3018 | stemmorphcatpos = NULL; |
michael@0 | 3019 | } |
michael@0 | 3020 | |
michael@0 | 3021 | for (int i = 0; i < al; i++) { |
michael@0 | 3022 | const unsigned char c = (unsigned char) (ap[i] & 0x00FF); |
michael@0 | 3023 | SfxEntry * sptr = sFlag[c]; |
michael@0 | 3024 | while (sptr) { |
michael@0 | 3025 | if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) || |
michael@0 | 3026 | // don't generate forms with substandard affixes |
michael@0 | 3027 | !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { |
michael@0 | 3028 | |
michael@0 | 3029 | if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); |
michael@0 | 3030 | else stemmorph = (char *) sptr->getMorph(); |
michael@0 | 3031 | |
michael@0 | 3032 | int cmp = morphcmp(stemmorph, targetmorph); |
michael@0 | 3033 | |
michael@0 | 3034 | if (cmp == 0) { |
michael@0 | 3035 | char * newword = sptr->add(ts, wl); |
michael@0 | 3036 | if (newword) { |
michael@0 | 3037 | hentry * check = pHMgr->lookup(newword); // XXX extra dic |
michael@0 | 3038 | if (!check || !check->astr || |
michael@0 | 3039 | !(TESTAFF(check->astr, forbiddenword, check->alen) || |
michael@0 | 3040 | TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) { |
michael@0 | 3041 | return newword; |
michael@0 | 3042 | } |
michael@0 | 3043 | free(newword); |
michael@0 | 3044 | } |
michael@0 | 3045 | } |
michael@0 | 3046 | |
michael@0 | 3047 | // recursive call for secondary suffixes |
michael@0 | 3048 | if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && |
michael@0 | 3049 | // (get_sfxcount(stemmorph) < targetcount) && |
michael@0 | 3050 | !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { |
michael@0 | 3051 | char * newword = sptr->add(ts, wl); |
michael@0 | 3052 | if (newword) { |
michael@0 | 3053 | char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), |
michael@0 | 3054 | sptr->getContLen(), stemmorph, targetmorph, 1); |
michael@0 | 3055 | |
michael@0 | 3056 | if (newword2) { |
michael@0 | 3057 | free(newword); |
michael@0 | 3058 | return newword2; |
michael@0 | 3059 | } |
michael@0 | 3060 | free(newword); |
michael@0 | 3061 | newword = NULL; |
michael@0 | 3062 | } |
michael@0 | 3063 | } |
michael@0 | 3064 | } |
michael@0 | 3065 | sptr = sptr->getFlgNxt(); |
michael@0 | 3066 | } |
michael@0 | 3067 | } |
michael@0 | 3068 | return NULL; |
michael@0 | 3069 | } |
michael@0 | 3070 | |
michael@0 | 3071 | |
michael@0 | 3072 | int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, |
michael@0 | 3073 | int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, |
michael@0 | 3074 | char * phon) |
michael@0 | 3075 | { |
michael@0 | 3076 | int nh=0; |
michael@0 | 3077 | // first add root word to list |
michael@0 | 3078 | if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || |
michael@0 | 3079 | (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { |
michael@0 | 3080 | wlst[nh].word = mystrdup(ts); |
michael@0 | 3081 | if (!wlst[nh].word) return 0; |
michael@0 | 3082 | wlst[nh].allow = (1 == 0); |
michael@0 | 3083 | wlst[nh].orig = NULL; |
michael@0 | 3084 | nh++; |
michael@0 | 3085 | // add special phonetic version |
michael@0 | 3086 | if (phon && (nh < maxn)) { |
michael@0 | 3087 | wlst[nh].word = mystrdup(phon); |
michael@0 | 3088 | if (!wlst[nh].word) return nh - 1; |
michael@0 | 3089 | wlst[nh].allow = (1 == 0); |
michael@0 | 3090 | wlst[nh].orig = mystrdup(ts); |
michael@0 | 3091 | if (!wlst[nh].orig) return nh - 1; |
michael@0 | 3092 | nh++; |
michael@0 | 3093 | } |
michael@0 | 3094 | } |
michael@0 | 3095 | |
michael@0 | 3096 | // handle suffixes |
michael@0 | 3097 | for (int i = 0; i < al; i++) { |
michael@0 | 3098 | const unsigned char c = (unsigned char) (ap[i] & 0x00FF); |
michael@0 | 3099 | SfxEntry * sptr = sFlag[c]; |
michael@0 | 3100 | while (sptr) { |
michael@0 | 3101 | if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && |
michael@0 | 3102 | (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && |
michael@0 | 3103 | // check needaffix flag |
michael@0 | 3104 | !(sptr->getCont() && ((needaffix && |
michael@0 | 3105 | TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || |
michael@0 | 3106 | (circumfix && |
michael@0 | 3107 | TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || |
michael@0 | 3108 | (onlyincompound && |
michael@0 | 3109 | TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen())))) |
michael@0 | 3110 | ) { |
michael@0 | 3111 | char * newword = sptr->add(ts, wl); |
michael@0 | 3112 | if (newword) { |
michael@0 | 3113 | if (nh < maxn) { |
michael@0 | 3114 | wlst[nh].word = newword; |
michael@0 | 3115 | wlst[nh].allow = sptr->allowCross(); |
michael@0 | 3116 | wlst[nh].orig = NULL; |
michael@0 | 3117 | nh++; |
michael@0 | 3118 | // add special phonetic version |
michael@0 | 3119 | if (phon && (nh < maxn)) { |
michael@0 | 3120 | char st[MAXWORDUTF8LEN]; |
michael@0 | 3121 | strcpy(st, phon); |
michael@0 | 3122 | strcat(st, sptr->getKey()); |
michael@0 | 3123 | reverseword(st + strlen(phon)); |
michael@0 | 3124 | wlst[nh].word = mystrdup(st); |
michael@0 | 3125 | if (!wlst[nh].word) return nh - 1; |
michael@0 | 3126 | wlst[nh].allow = (1 == 0); |
michael@0 | 3127 | wlst[nh].orig = mystrdup(newword); |
michael@0 | 3128 | if (!wlst[nh].orig) return nh - 1; |
michael@0 | 3129 | nh++; |
michael@0 | 3130 | } |
michael@0 | 3131 | } else { |
michael@0 | 3132 | free(newword); |
michael@0 | 3133 | } |
michael@0 | 3134 | } |
michael@0 | 3135 | } |
michael@0 | 3136 | sptr = sptr->getFlgNxt(); |
michael@0 | 3137 | } |
michael@0 | 3138 | } |
michael@0 | 3139 | |
michael@0 | 3140 | int n = nh; |
michael@0 | 3141 | |
michael@0 | 3142 | // handle cross products of prefixes and suffixes |
michael@0 | 3143 | for (int j=1;j<n ;j++) |
michael@0 | 3144 | if (wlst[j].allow) { |
michael@0 | 3145 | for (int k = 0; k < al; k++) { |
michael@0 | 3146 | const unsigned char c = (unsigned char) (ap[k] & 0x00FF); |
michael@0 | 3147 | PfxEntry * cptr = pFlag[c]; |
michael@0 | 3148 | while (cptr) { |
michael@0 | 3149 | if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && |
michael@0 | 3150 | (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { |
michael@0 | 3151 | int l1 = strlen(wlst[j].word); |
michael@0 | 3152 | char * newword = cptr->add(wlst[j].word, l1); |
michael@0 | 3153 | if (newword) { |
michael@0 | 3154 | if (nh < maxn) { |
michael@0 | 3155 | wlst[nh].word = newword; |
michael@0 | 3156 | wlst[nh].allow = cptr->allowCross(); |
michael@0 | 3157 | wlst[nh].orig = NULL; |
michael@0 | 3158 | nh++; |
michael@0 | 3159 | } else { |
michael@0 | 3160 | free(newword); |
michael@0 | 3161 | } |
michael@0 | 3162 | } |
michael@0 | 3163 | } |
michael@0 | 3164 | cptr = cptr->getFlgNxt(); |
michael@0 | 3165 | } |
michael@0 | 3166 | } |
michael@0 | 3167 | } |
michael@0 | 3168 | |
michael@0 | 3169 | |
michael@0 | 3170 | // now handle pure prefixes |
michael@0 | 3171 | for (int m = 0; m < al; m ++) { |
michael@0 | 3172 | const unsigned char c = (unsigned char) (ap[m] & 0x00FF); |
michael@0 | 3173 | PfxEntry * ptr = pFlag[c]; |
michael@0 | 3174 | while (ptr) { |
michael@0 | 3175 | if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && |
michael@0 | 3176 | (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && |
michael@0 | 3177 | // check needaffix flag |
michael@0 | 3178 | !(ptr->getCont() && ((needaffix && |
michael@0 | 3179 | TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || |
michael@0 | 3180 | (circumfix && |
michael@0 | 3181 | TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || |
michael@0 | 3182 | (onlyincompound && |
michael@0 | 3183 | TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen())))) |
michael@0 | 3184 | ) { |
michael@0 | 3185 | char * newword = ptr->add(ts, wl); |
michael@0 | 3186 | if (newword) { |
michael@0 | 3187 | if (nh < maxn) { |
michael@0 | 3188 | wlst[nh].word = newword; |
michael@0 | 3189 | wlst[nh].allow = ptr->allowCross(); |
michael@0 | 3190 | wlst[nh].orig = NULL; |
michael@0 | 3191 | nh++; |
michael@0 | 3192 | } else { |
michael@0 | 3193 | free(newword); |
michael@0 | 3194 | } |
michael@0 | 3195 | } |
michael@0 | 3196 | } |
michael@0 | 3197 | ptr = ptr->getFlgNxt(); |
michael@0 | 3198 | } |
michael@0 | 3199 | } |
michael@0 | 3200 | |
michael@0 | 3201 | return nh; |
michael@0 | 3202 | } |
michael@0 | 3203 | |
michael@0 | 3204 | // return length of replacing table |
michael@0 | 3205 | int AffixMgr::get_numrep() const |
michael@0 | 3206 | { |
michael@0 | 3207 | return numrep; |
michael@0 | 3208 | } |
michael@0 | 3209 | |
michael@0 | 3210 | // return replacing table |
michael@0 | 3211 | struct replentry * AffixMgr::get_reptable() const |
michael@0 | 3212 | { |
michael@0 | 3213 | if (! reptable ) return NULL; |
michael@0 | 3214 | return reptable; |
michael@0 | 3215 | } |
michael@0 | 3216 | |
michael@0 | 3217 | // return iconv table |
michael@0 | 3218 | RepList * AffixMgr::get_iconvtable() const |
michael@0 | 3219 | { |
michael@0 | 3220 | if (! iconvtable ) return NULL; |
michael@0 | 3221 | return iconvtable; |
michael@0 | 3222 | } |
michael@0 | 3223 | |
michael@0 | 3224 | // return oconv table |
michael@0 | 3225 | RepList * AffixMgr::get_oconvtable() const |
michael@0 | 3226 | { |
michael@0 | 3227 | if (! oconvtable ) return NULL; |
michael@0 | 3228 | return oconvtable; |
michael@0 | 3229 | } |
michael@0 | 3230 | |
michael@0 | 3231 | // return replacing table |
michael@0 | 3232 | struct phonetable * AffixMgr::get_phonetable() const |
michael@0 | 3233 | { |
michael@0 | 3234 | if (! phone ) return NULL; |
michael@0 | 3235 | return phone; |
michael@0 | 3236 | } |
michael@0 | 3237 | |
michael@0 | 3238 | // return length of character map table |
michael@0 | 3239 | int AffixMgr::get_nummap() const |
michael@0 | 3240 | { |
michael@0 | 3241 | return nummap; |
michael@0 | 3242 | } |
michael@0 | 3243 | |
michael@0 | 3244 | // return character map table |
michael@0 | 3245 | struct mapentry * AffixMgr::get_maptable() const |
michael@0 | 3246 | { |
michael@0 | 3247 | if (! maptable ) return NULL; |
michael@0 | 3248 | return maptable; |
michael@0 | 3249 | } |
michael@0 | 3250 | |
michael@0 | 3251 | // return length of word break table |
michael@0 | 3252 | int AffixMgr::get_numbreak() const |
michael@0 | 3253 | { |
michael@0 | 3254 | return numbreak; |
michael@0 | 3255 | } |
michael@0 | 3256 | |
michael@0 | 3257 | // return character map table |
michael@0 | 3258 | char ** AffixMgr::get_breaktable() const |
michael@0 | 3259 | { |
michael@0 | 3260 | if (! breaktable ) return NULL; |
michael@0 | 3261 | return breaktable; |
michael@0 | 3262 | } |
michael@0 | 3263 | |
michael@0 | 3264 | // return text encoding of dictionary |
michael@0 | 3265 | char * AffixMgr::get_encoding() |
michael@0 | 3266 | { |
michael@0 | 3267 | if (! encoding ) encoding = mystrdup(SPELL_ENCODING); |
michael@0 | 3268 | return mystrdup(encoding); |
michael@0 | 3269 | } |
michael@0 | 3270 | |
michael@0 | 3271 | // return text encoding of dictionary |
michael@0 | 3272 | int AffixMgr::get_langnum() const |
michael@0 | 3273 | { |
michael@0 | 3274 | return langnum; |
michael@0 | 3275 | } |
michael@0 | 3276 | |
michael@0 | 3277 | // return double prefix option |
michael@0 | 3278 | int AffixMgr::get_complexprefixes() const |
michael@0 | 3279 | { |
michael@0 | 3280 | return complexprefixes; |
michael@0 | 3281 | } |
michael@0 | 3282 | |
michael@0 | 3283 | // return FULLSTRIP option |
michael@0 | 3284 | int AffixMgr::get_fullstrip() const |
michael@0 | 3285 | { |
michael@0 | 3286 | return fullstrip; |
michael@0 | 3287 | } |
michael@0 | 3288 | |
michael@0 | 3289 | FLAG AffixMgr::get_keepcase() const |
michael@0 | 3290 | { |
michael@0 | 3291 | return keepcase; |
michael@0 | 3292 | } |
michael@0 | 3293 | |
michael@0 | 3294 | FLAG AffixMgr::get_forceucase() const |
michael@0 | 3295 | { |
michael@0 | 3296 | return forceucase; |
michael@0 | 3297 | } |
michael@0 | 3298 | |
michael@0 | 3299 | FLAG AffixMgr::get_warn() const |
michael@0 | 3300 | { |
michael@0 | 3301 | return warn; |
michael@0 | 3302 | } |
michael@0 | 3303 | |
michael@0 | 3304 | int AffixMgr::get_forbidwarn() const |
michael@0 | 3305 | { |
michael@0 | 3306 | return forbidwarn; |
michael@0 | 3307 | } |
michael@0 | 3308 | |
michael@0 | 3309 | int AffixMgr::get_checksharps() const |
michael@0 | 3310 | { |
michael@0 | 3311 | return checksharps; |
michael@0 | 3312 | } |
michael@0 | 3313 | |
michael@0 | 3314 | char * AffixMgr::encode_flag(unsigned short aflag) const |
michael@0 | 3315 | { |
michael@0 | 3316 | return pHMgr->encode_flag(aflag); |
michael@0 | 3317 | } |
michael@0 | 3318 | |
michael@0 | 3319 | |
michael@0 | 3320 | // return the preferred ignore string for suggestions |
michael@0 | 3321 | char * AffixMgr::get_ignore() const |
michael@0 | 3322 | { |
michael@0 | 3323 | if (!ignorechars) return NULL; |
michael@0 | 3324 | return ignorechars; |
michael@0 | 3325 | } |
michael@0 | 3326 | |
michael@0 | 3327 | // return the preferred ignore string for suggestions |
michael@0 | 3328 | unsigned short * AffixMgr::get_ignore_utf16(int * len) const |
michael@0 | 3329 | { |
michael@0 | 3330 | *len = ignorechars_utf16_len; |
michael@0 | 3331 | return ignorechars_utf16; |
michael@0 | 3332 | } |
michael@0 | 3333 | |
michael@0 | 3334 | // return the keyboard string for suggestions |
michael@0 | 3335 | char * AffixMgr::get_key_string() |
michael@0 | 3336 | { |
michael@0 | 3337 | if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING); |
michael@0 | 3338 | return mystrdup(keystring); |
michael@0 | 3339 | } |
michael@0 | 3340 | |
michael@0 | 3341 | // return the preferred try string for suggestions |
michael@0 | 3342 | char * AffixMgr::get_try_string() const |
michael@0 | 3343 | { |
michael@0 | 3344 | if (! trystring ) return NULL; |
michael@0 | 3345 | return mystrdup(trystring); |
michael@0 | 3346 | } |
michael@0 | 3347 | |
michael@0 | 3348 | // return the preferred try string for suggestions |
michael@0 | 3349 | const char * AffixMgr::get_wordchars() const |
michael@0 | 3350 | { |
michael@0 | 3351 | return wordchars; |
michael@0 | 3352 | } |
michael@0 | 3353 | |
michael@0 | 3354 | unsigned short * AffixMgr::get_wordchars_utf16(int * len) const |
michael@0 | 3355 | { |
michael@0 | 3356 | *len = wordchars_utf16_len; |
michael@0 | 3357 | return wordchars_utf16; |
michael@0 | 3358 | } |
michael@0 | 3359 | |
michael@0 | 3360 | // is there compounding? |
michael@0 | 3361 | int AffixMgr::get_compound() const |
michael@0 | 3362 | { |
michael@0 | 3363 | return compoundflag || compoundbegin || numdefcpd; |
michael@0 | 3364 | } |
michael@0 | 3365 | |
michael@0 | 3366 | // return the compound words control flag |
michael@0 | 3367 | FLAG AffixMgr::get_compoundflag() const |
michael@0 | 3368 | { |
michael@0 | 3369 | return compoundflag; |
michael@0 | 3370 | } |
michael@0 | 3371 | |
michael@0 | 3372 | // return the forbidden words control flag |
michael@0 | 3373 | FLAG AffixMgr::get_forbiddenword() const |
michael@0 | 3374 | { |
michael@0 | 3375 | return forbiddenword; |
michael@0 | 3376 | } |
michael@0 | 3377 | |
michael@0 | 3378 | // return the forbidden words control flag |
michael@0 | 3379 | FLAG AffixMgr::get_nosuggest() const |
michael@0 | 3380 | { |
michael@0 | 3381 | return nosuggest; |
michael@0 | 3382 | } |
michael@0 | 3383 | |
michael@0 | 3384 | // return the forbidden words control flag |
michael@0 | 3385 | FLAG AffixMgr::get_nongramsuggest() const |
michael@0 | 3386 | { |
michael@0 | 3387 | return nongramsuggest; |
michael@0 | 3388 | } |
michael@0 | 3389 | |
michael@0 | 3390 | // return the forbidden words flag modify flag |
michael@0 | 3391 | FLAG AffixMgr::get_needaffix() const |
michael@0 | 3392 | { |
michael@0 | 3393 | return needaffix; |
michael@0 | 3394 | } |
michael@0 | 3395 | |
michael@0 | 3396 | // return the onlyincompound flag |
michael@0 | 3397 | FLAG AffixMgr::get_onlyincompound() const |
michael@0 | 3398 | { |
michael@0 | 3399 | return onlyincompound; |
michael@0 | 3400 | } |
michael@0 | 3401 | |
michael@0 | 3402 | // return the compound word signal flag |
michael@0 | 3403 | FLAG AffixMgr::get_compoundroot() const |
michael@0 | 3404 | { |
michael@0 | 3405 | return compoundroot; |
michael@0 | 3406 | } |
michael@0 | 3407 | |
michael@0 | 3408 | // return the compound begin signal flag |
michael@0 | 3409 | FLAG AffixMgr::get_compoundbegin() const |
michael@0 | 3410 | { |
michael@0 | 3411 | return compoundbegin; |
michael@0 | 3412 | } |
michael@0 | 3413 | |
michael@0 | 3414 | // return the value of checknum |
michael@0 | 3415 | int AffixMgr::get_checknum() const |
michael@0 | 3416 | { |
michael@0 | 3417 | return checknum; |
michael@0 | 3418 | } |
michael@0 | 3419 | |
michael@0 | 3420 | // return the value of prefix |
michael@0 | 3421 | const char * AffixMgr::get_prefix() const |
michael@0 | 3422 | { |
michael@0 | 3423 | if (pfx) return pfx->getKey(); |
michael@0 | 3424 | return NULL; |
michael@0 | 3425 | } |
michael@0 | 3426 | |
michael@0 | 3427 | // return the value of suffix |
michael@0 | 3428 | const char * AffixMgr::get_suffix() const |
michael@0 | 3429 | { |
michael@0 | 3430 | return sfxappnd; |
michael@0 | 3431 | } |
michael@0 | 3432 | |
michael@0 | 3433 | // return the value of suffix |
michael@0 | 3434 | const char * AffixMgr::get_version() const |
michael@0 | 3435 | { |
michael@0 | 3436 | return version; |
michael@0 | 3437 | } |
michael@0 | 3438 | |
michael@0 | 3439 | // return lemma_present flag |
michael@0 | 3440 | FLAG AffixMgr::get_lemma_present() const |
michael@0 | 3441 | { |
michael@0 | 3442 | return lemma_present; |
michael@0 | 3443 | } |
michael@0 | 3444 | |
michael@0 | 3445 | // utility method to look up root words in hash table |
michael@0 | 3446 | struct hentry * AffixMgr::lookup(const char * word) |
michael@0 | 3447 | { |
michael@0 | 3448 | int i; |
michael@0 | 3449 | struct hentry * he = NULL; |
michael@0 | 3450 | for (i = 0; i < *maxdic && !he; i++) { |
michael@0 | 3451 | he = (alldic[i])->lookup(word); |
michael@0 | 3452 | } |
michael@0 | 3453 | return he; |
michael@0 | 3454 | } |
michael@0 | 3455 | |
michael@0 | 3456 | // return the value of suffix |
michael@0 | 3457 | int AffixMgr::have_contclass() const |
michael@0 | 3458 | { |
michael@0 | 3459 | return havecontclass; |
michael@0 | 3460 | } |
michael@0 | 3461 | |
michael@0 | 3462 | // return utf8 |
michael@0 | 3463 | int AffixMgr::get_utf8() const |
michael@0 | 3464 | { |
michael@0 | 3465 | return utf8; |
michael@0 | 3466 | } |
michael@0 | 3467 | |
michael@0 | 3468 | int AffixMgr::get_maxngramsugs(void) const |
michael@0 | 3469 | { |
michael@0 | 3470 | return maxngramsugs; |
michael@0 | 3471 | } |
michael@0 | 3472 | |
michael@0 | 3473 | int AffixMgr::get_maxcpdsugs(void) const |
michael@0 | 3474 | { |
michael@0 | 3475 | return maxcpdsugs; |
michael@0 | 3476 | } |
michael@0 | 3477 | |
michael@0 | 3478 | int AffixMgr::get_maxdiff(void) const |
michael@0 | 3479 | { |
michael@0 | 3480 | return maxdiff; |
michael@0 | 3481 | } |
michael@0 | 3482 | |
michael@0 | 3483 | int AffixMgr::get_onlymaxdiff(void) const |
michael@0 | 3484 | { |
michael@0 | 3485 | return onlymaxdiff; |
michael@0 | 3486 | } |
michael@0 | 3487 | |
michael@0 | 3488 | // return nosplitsugs |
michael@0 | 3489 | int AffixMgr::get_nosplitsugs(void) const |
michael@0 | 3490 | { |
michael@0 | 3491 | return nosplitsugs; |
michael@0 | 3492 | } |
michael@0 | 3493 | |
michael@0 | 3494 | // return sugswithdots |
michael@0 | 3495 | int AffixMgr::get_sugswithdots(void) const |
michael@0 | 3496 | { |
michael@0 | 3497 | return sugswithdots; |
michael@0 | 3498 | } |
michael@0 | 3499 | |
michael@0 | 3500 | /* parse flag */ |
michael@0 | 3501 | int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) { |
michael@0 | 3502 | char * s = NULL; |
michael@0 | 3503 | if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { |
michael@0 | 3504 | HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); |
michael@0 | 3505 | return 1; |
michael@0 | 3506 | } |
michael@0 | 3507 | if (parse_string(line, &s, af->getlinenum())) return 1; |
michael@0 | 3508 | *out = pHMgr->decode_flag(s); |
michael@0 | 3509 | free(s); |
michael@0 | 3510 | return 0; |
michael@0 | 3511 | } |
michael@0 | 3512 | |
michael@0 | 3513 | /* parse num */ |
michael@0 | 3514 | int AffixMgr::parse_num(char * line, int * out, FileMgr * af) { |
michael@0 | 3515 | char * s = NULL; |
michael@0 | 3516 | if (*out != -1) { |
michael@0 | 3517 | HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum()); |
michael@0 | 3518 | return 1; |
michael@0 | 3519 | } |
michael@0 | 3520 | if (parse_string(line, &s, af->getlinenum())) return 1; |
michael@0 | 3521 | *out = atoi(s); |
michael@0 | 3522 | free(s); |
michael@0 | 3523 | return 0; |
michael@0 | 3524 | } |
michael@0 | 3525 | |
michael@0 | 3526 | /* parse in the max syllablecount of compound words and */ |
michael@0 | 3527 | int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) |
michael@0 | 3528 | { |
michael@0 | 3529 | char * tp = line; |
michael@0 | 3530 | char * piece; |
michael@0 | 3531 | int i = 0; |
michael@0 | 3532 | int np = 0; |
michael@0 | 3533 | w_char w[MAXWORDLEN]; |
michael@0 | 3534 | piece = mystrsep(&tp, 0); |
michael@0 | 3535 | while (piece) { |
michael@0 | 3536 | if (*piece != '\0') { |
michael@0 | 3537 | switch(i) { |
michael@0 | 3538 | case 0: { np++; break; } |
michael@0 | 3539 | case 1: { cpdmaxsyllable = atoi(piece); np++; break; } |
michael@0 | 3540 | case 2: { |
michael@0 | 3541 | if (!utf8) { |
michael@0 | 3542 | cpdvowels = mystrdup(piece); |
michael@0 | 3543 | } else { |
michael@0 | 3544 | int n = u8_u16(w, MAXWORDLEN, piece); |
michael@0 | 3545 | if (n > 0) { |
michael@0 | 3546 | flag_qsort((unsigned short *) w, 0, n); |
michael@0 | 3547 | cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char)); |
michael@0 | 3548 | if (!cpdvowels_utf16) return 1; |
michael@0 | 3549 | memcpy(cpdvowels_utf16, w, n * sizeof(w_char)); |
michael@0 | 3550 | } |
michael@0 | 3551 | cpdvowels_utf16_len = n; |
michael@0 | 3552 | } |
michael@0 | 3553 | np++; |
michael@0 | 3554 | break; |
michael@0 | 3555 | } |
michael@0 | 3556 | default: break; |
michael@0 | 3557 | } |
michael@0 | 3558 | i++; |
michael@0 | 3559 | } |
michael@0 | 3560 | piece = mystrsep(&tp, 0); |
michael@0 | 3561 | } |
michael@0 | 3562 | if (np < 2) { |
michael@0 | 3563 | HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum()); |
michael@0 | 3564 | return 1; |
michael@0 | 3565 | } |
michael@0 | 3566 | if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); |
michael@0 | 3567 | return 0; |
michael@0 | 3568 | } |
michael@0 | 3569 | |
michael@0 | 3570 | /* parse in the typical fault correcting table */ |
michael@0 | 3571 | int AffixMgr::parse_reptable(char * line, FileMgr * af) |
michael@0 | 3572 | { |
michael@0 | 3573 | if (numrep != 0) { |
michael@0 | 3574 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 3575 | return 1; |
michael@0 | 3576 | } |
michael@0 | 3577 | char * tp = line; |
michael@0 | 3578 | char * piece; |
michael@0 | 3579 | int i = 0; |
michael@0 | 3580 | int np = 0; |
michael@0 | 3581 | piece = mystrsep(&tp, 0); |
michael@0 | 3582 | while (piece) { |
michael@0 | 3583 | if (*piece != '\0') { |
michael@0 | 3584 | switch(i) { |
michael@0 | 3585 | case 0: { np++; break; } |
michael@0 | 3586 | case 1: { |
michael@0 | 3587 | numrep = atoi(piece); |
michael@0 | 3588 | if (numrep < 1) { |
michael@0 | 3589 | HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); |
michael@0 | 3590 | return 1; |
michael@0 | 3591 | } |
michael@0 | 3592 | reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); |
michael@0 | 3593 | if (!reptable) return 1; |
michael@0 | 3594 | np++; |
michael@0 | 3595 | break; |
michael@0 | 3596 | } |
michael@0 | 3597 | default: break; |
michael@0 | 3598 | } |
michael@0 | 3599 | i++; |
michael@0 | 3600 | } |
michael@0 | 3601 | piece = mystrsep(&tp, 0); |
michael@0 | 3602 | } |
michael@0 | 3603 | if (np != 2) { |
michael@0 | 3604 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 3605 | return 1; |
michael@0 | 3606 | } |
michael@0 | 3607 | |
michael@0 | 3608 | /* now parse the numrep lines to read in the remainder of the table */ |
michael@0 | 3609 | char * nl; |
michael@0 | 3610 | for (int j=0; j < numrep; j++) { |
michael@0 | 3611 | if (!(nl = af->getline())) return 1; |
michael@0 | 3612 | mychomp(nl); |
michael@0 | 3613 | tp = nl; |
michael@0 | 3614 | i = 0; |
michael@0 | 3615 | reptable[j].pattern = NULL; |
michael@0 | 3616 | reptable[j].pattern2 = NULL; |
michael@0 | 3617 | piece = mystrsep(&tp, 0); |
michael@0 | 3618 | while (piece) { |
michael@0 | 3619 | if (*piece != '\0') { |
michael@0 | 3620 | switch(i) { |
michael@0 | 3621 | case 0: { |
michael@0 | 3622 | if (strncmp(piece,"REP",3) != 0) { |
michael@0 | 3623 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3624 | numrep = 0; |
michael@0 | 3625 | return 1; |
michael@0 | 3626 | } |
michael@0 | 3627 | break; |
michael@0 | 3628 | } |
michael@0 | 3629 | case 1: { |
michael@0 | 3630 | if (*piece == '^') reptable[j].start = true; else reptable[j].start = false; |
michael@0 | 3631 | reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," "); |
michael@0 | 3632 | int lr = strlen(reptable[j].pattern) - 1; |
michael@0 | 3633 | if (reptable[j].pattern[lr] == '$') { |
michael@0 | 3634 | reptable[j].end = true; |
michael@0 | 3635 | reptable[j].pattern[lr] = '\0'; |
michael@0 | 3636 | } else reptable[j].end = false; |
michael@0 | 3637 | break; |
michael@0 | 3638 | } |
michael@0 | 3639 | case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; } |
michael@0 | 3640 | default: break; |
michael@0 | 3641 | } |
michael@0 | 3642 | i++; |
michael@0 | 3643 | } |
michael@0 | 3644 | piece = mystrsep(&tp, 0); |
michael@0 | 3645 | } |
michael@0 | 3646 | if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { |
michael@0 | 3647 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3648 | numrep = 0; |
michael@0 | 3649 | return 1; |
michael@0 | 3650 | } |
michael@0 | 3651 | } |
michael@0 | 3652 | return 0; |
michael@0 | 3653 | } |
michael@0 | 3654 | |
michael@0 | 3655 | /* parse in the typical fault correcting table */ |
michael@0 | 3656 | int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) |
michael@0 | 3657 | { |
michael@0 | 3658 | if (*rl) { |
michael@0 | 3659 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 3660 | return 1; |
michael@0 | 3661 | } |
michael@0 | 3662 | char * tp = line; |
michael@0 | 3663 | char * piece; |
michael@0 | 3664 | int i = 0; |
michael@0 | 3665 | int np = 0; |
michael@0 | 3666 | int numrl = 0; |
michael@0 | 3667 | piece = mystrsep(&tp, 0); |
michael@0 | 3668 | while (piece) { |
michael@0 | 3669 | if (*piece != '\0') { |
michael@0 | 3670 | switch(i) { |
michael@0 | 3671 | case 0: { np++; break; } |
michael@0 | 3672 | case 1: { |
michael@0 | 3673 | numrl = atoi(piece); |
michael@0 | 3674 | if (numrl < 1) { |
michael@0 | 3675 | HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); |
michael@0 | 3676 | return 1; |
michael@0 | 3677 | } |
michael@0 | 3678 | *rl = new RepList(numrl); |
michael@0 | 3679 | if (!*rl) return 1; |
michael@0 | 3680 | np++; |
michael@0 | 3681 | break; |
michael@0 | 3682 | } |
michael@0 | 3683 | default: break; |
michael@0 | 3684 | } |
michael@0 | 3685 | i++; |
michael@0 | 3686 | } |
michael@0 | 3687 | piece = mystrsep(&tp, 0); |
michael@0 | 3688 | } |
michael@0 | 3689 | if (np != 2) { |
michael@0 | 3690 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 3691 | return 1; |
michael@0 | 3692 | } |
michael@0 | 3693 | |
michael@0 | 3694 | /* now parse the num lines to read in the remainder of the table */ |
michael@0 | 3695 | char * nl; |
michael@0 | 3696 | for (int j=0; j < numrl; j++) { |
michael@0 | 3697 | if (!(nl = af->getline())) return 1; |
michael@0 | 3698 | mychomp(nl); |
michael@0 | 3699 | tp = nl; |
michael@0 | 3700 | i = 0; |
michael@0 | 3701 | char * pattern = NULL; |
michael@0 | 3702 | char * pattern2 = NULL; |
michael@0 | 3703 | piece = mystrsep(&tp, 0); |
michael@0 | 3704 | while (piece) { |
michael@0 | 3705 | if (*piece != '\0') { |
michael@0 | 3706 | switch(i) { |
michael@0 | 3707 | case 0: { |
michael@0 | 3708 | if (strncmp(piece, keyword, strlen(keyword)) != 0) { |
michael@0 | 3709 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3710 | delete *rl; |
michael@0 | 3711 | *rl = NULL; |
michael@0 | 3712 | return 1; |
michael@0 | 3713 | } |
michael@0 | 3714 | break; |
michael@0 | 3715 | } |
michael@0 | 3716 | case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; } |
michael@0 | 3717 | case 2: { |
michael@0 | 3718 | pattern2 = mystrrep(mystrdup(piece),"_"," "); |
michael@0 | 3719 | break; |
michael@0 | 3720 | } |
michael@0 | 3721 | default: break; |
michael@0 | 3722 | } |
michael@0 | 3723 | i++; |
michael@0 | 3724 | } |
michael@0 | 3725 | piece = mystrsep(&tp, 0); |
michael@0 | 3726 | } |
michael@0 | 3727 | if (!pattern || !pattern2) { |
michael@0 | 3728 | if (pattern) |
michael@0 | 3729 | free(pattern); |
michael@0 | 3730 | if (pattern2) |
michael@0 | 3731 | free(pattern2); |
michael@0 | 3732 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3733 | return 1; |
michael@0 | 3734 | } |
michael@0 | 3735 | (*rl)->add(pattern, pattern2); |
michael@0 | 3736 | } |
michael@0 | 3737 | return 0; |
michael@0 | 3738 | } |
michael@0 | 3739 | |
michael@0 | 3740 | |
michael@0 | 3741 | /* parse in the typical fault correcting table */ |
michael@0 | 3742 | int AffixMgr::parse_phonetable(char * line, FileMgr * af) |
michael@0 | 3743 | { |
michael@0 | 3744 | if (phone) { |
michael@0 | 3745 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 3746 | return 1; |
michael@0 | 3747 | } |
michael@0 | 3748 | char * tp = line; |
michael@0 | 3749 | char * piece; |
michael@0 | 3750 | int i = 0; |
michael@0 | 3751 | int np = 0; |
michael@0 | 3752 | piece = mystrsep(&tp, 0); |
michael@0 | 3753 | while (piece) { |
michael@0 | 3754 | if (*piece != '\0') { |
michael@0 | 3755 | switch(i) { |
michael@0 | 3756 | case 0: { np++; break; } |
michael@0 | 3757 | case 1: { |
michael@0 | 3758 | phone = (phonetable *) malloc(sizeof(struct phonetable)); |
michael@0 | 3759 | if (!phone) return 1; |
michael@0 | 3760 | phone->num = atoi(piece); |
michael@0 | 3761 | phone->rules = NULL; |
michael@0 | 3762 | phone->utf8 = (char) utf8; |
michael@0 | 3763 | if (phone->num < 1) { |
michael@0 | 3764 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
michael@0 | 3765 | return 1; |
michael@0 | 3766 | } |
michael@0 | 3767 | phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); |
michael@0 | 3768 | if (!phone->rules) { |
michael@0 | 3769 | free(phone); |
michael@0 | 3770 | phone = NULL; |
michael@0 | 3771 | return 1; |
michael@0 | 3772 | } |
michael@0 | 3773 | np++; |
michael@0 | 3774 | break; |
michael@0 | 3775 | } |
michael@0 | 3776 | default: break; |
michael@0 | 3777 | } |
michael@0 | 3778 | i++; |
michael@0 | 3779 | } |
michael@0 | 3780 | piece = mystrsep(&tp, 0); |
michael@0 | 3781 | } |
michael@0 | 3782 | if (np != 2) { |
michael@0 | 3783 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 3784 | return 1; |
michael@0 | 3785 | } |
michael@0 | 3786 | |
michael@0 | 3787 | /* now parse the phone->num lines to read in the remainder of the table */ |
michael@0 | 3788 | char * nl; |
michael@0 | 3789 | for (int j=0; j < phone->num; j++) { |
michael@0 | 3790 | if (!(nl = af->getline())) return 1; |
michael@0 | 3791 | mychomp(nl); |
michael@0 | 3792 | tp = nl; |
michael@0 | 3793 | i = 0; |
michael@0 | 3794 | phone->rules[j * 2] = NULL; |
michael@0 | 3795 | phone->rules[j * 2 + 1] = NULL; |
michael@0 | 3796 | piece = mystrsep(&tp, 0); |
michael@0 | 3797 | while (piece) { |
michael@0 | 3798 | if (*piece != '\0') { |
michael@0 | 3799 | switch(i) { |
michael@0 | 3800 | case 0: { |
michael@0 | 3801 | if (strncmp(piece,"PHONE",5) != 0) { |
michael@0 | 3802 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3803 | phone->num = 0; |
michael@0 | 3804 | return 1; |
michael@0 | 3805 | } |
michael@0 | 3806 | break; |
michael@0 | 3807 | } |
michael@0 | 3808 | case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; } |
michael@0 | 3809 | case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; } |
michael@0 | 3810 | default: break; |
michael@0 | 3811 | } |
michael@0 | 3812 | i++; |
michael@0 | 3813 | } |
michael@0 | 3814 | piece = mystrsep(&tp, 0); |
michael@0 | 3815 | } |
michael@0 | 3816 | if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { |
michael@0 | 3817 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3818 | phone->num = 0; |
michael@0 | 3819 | return 1; |
michael@0 | 3820 | } |
michael@0 | 3821 | } |
michael@0 | 3822 | phone->rules[phone->num * 2] = mystrdup(""); |
michael@0 | 3823 | phone->rules[phone->num * 2 + 1] = mystrdup(""); |
michael@0 | 3824 | init_phonet_hash(*phone); |
michael@0 | 3825 | return 0; |
michael@0 | 3826 | } |
michael@0 | 3827 | |
michael@0 | 3828 | /* parse in the checkcompoundpattern table */ |
michael@0 | 3829 | int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) |
michael@0 | 3830 | { |
michael@0 | 3831 | if (numcheckcpd != 0) { |
michael@0 | 3832 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 3833 | return 1; |
michael@0 | 3834 | } |
michael@0 | 3835 | char * tp = line; |
michael@0 | 3836 | char * piece; |
michael@0 | 3837 | int i = 0; |
michael@0 | 3838 | int np = 0; |
michael@0 | 3839 | piece = mystrsep(&tp, 0); |
michael@0 | 3840 | while (piece) { |
michael@0 | 3841 | if (*piece != '\0') { |
michael@0 | 3842 | switch(i) { |
michael@0 | 3843 | case 0: { np++; break; } |
michael@0 | 3844 | case 1: { |
michael@0 | 3845 | numcheckcpd = atoi(piece); |
michael@0 | 3846 | if (numcheckcpd < 1) { |
michael@0 | 3847 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
michael@0 | 3848 | return 1; |
michael@0 | 3849 | } |
michael@0 | 3850 | checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry)); |
michael@0 | 3851 | if (!checkcpdtable) return 1; |
michael@0 | 3852 | np++; |
michael@0 | 3853 | break; |
michael@0 | 3854 | } |
michael@0 | 3855 | default: break; |
michael@0 | 3856 | } |
michael@0 | 3857 | i++; |
michael@0 | 3858 | } |
michael@0 | 3859 | piece = mystrsep(&tp, 0); |
michael@0 | 3860 | } |
michael@0 | 3861 | if (np != 2) { |
michael@0 | 3862 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 3863 | return 1; |
michael@0 | 3864 | } |
michael@0 | 3865 | |
michael@0 | 3866 | /* now parse the numcheckcpd lines to read in the remainder of the table */ |
michael@0 | 3867 | char * nl; |
michael@0 | 3868 | for (int j=0; j < numcheckcpd; j++) { |
michael@0 | 3869 | if (!(nl = af->getline())) return 1; |
michael@0 | 3870 | mychomp(nl); |
michael@0 | 3871 | tp = nl; |
michael@0 | 3872 | i = 0; |
michael@0 | 3873 | checkcpdtable[j].pattern = NULL; |
michael@0 | 3874 | checkcpdtable[j].pattern2 = NULL; |
michael@0 | 3875 | checkcpdtable[j].pattern3 = NULL; |
michael@0 | 3876 | checkcpdtable[j].cond = FLAG_NULL; |
michael@0 | 3877 | checkcpdtable[j].cond2 = FLAG_NULL; |
michael@0 | 3878 | piece = mystrsep(&tp, 0); |
michael@0 | 3879 | while (piece) { |
michael@0 | 3880 | if (*piece != '\0') { |
michael@0 | 3881 | switch(i) { |
michael@0 | 3882 | case 0: { |
michael@0 | 3883 | if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { |
michael@0 | 3884 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3885 | numcheckcpd = 0; |
michael@0 | 3886 | return 1; |
michael@0 | 3887 | } |
michael@0 | 3888 | break; |
michael@0 | 3889 | } |
michael@0 | 3890 | case 1: { |
michael@0 | 3891 | checkcpdtable[j].pattern = mystrdup(piece); |
michael@0 | 3892 | char * p = strchr(checkcpdtable[j].pattern, '/'); |
michael@0 | 3893 | if (p) { |
michael@0 | 3894 | *p = '\0'; |
michael@0 | 3895 | checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); |
michael@0 | 3896 | } |
michael@0 | 3897 | break; } |
michael@0 | 3898 | case 2: { |
michael@0 | 3899 | checkcpdtable[j].pattern2 = mystrdup(piece); |
michael@0 | 3900 | char * p = strchr(checkcpdtable[j].pattern2, '/'); |
michael@0 | 3901 | if (p) { |
michael@0 | 3902 | *p = '\0'; |
michael@0 | 3903 | checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); |
michael@0 | 3904 | } |
michael@0 | 3905 | break; |
michael@0 | 3906 | } |
michael@0 | 3907 | case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; } |
michael@0 | 3908 | default: break; |
michael@0 | 3909 | } |
michael@0 | 3910 | i++; |
michael@0 | 3911 | } |
michael@0 | 3912 | piece = mystrsep(&tp, 0); |
michael@0 | 3913 | } |
michael@0 | 3914 | if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { |
michael@0 | 3915 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3916 | numcheckcpd = 0; |
michael@0 | 3917 | return 1; |
michael@0 | 3918 | } |
michael@0 | 3919 | } |
michael@0 | 3920 | return 0; |
michael@0 | 3921 | } |
michael@0 | 3922 | |
michael@0 | 3923 | /* parse in the compound rule table */ |
michael@0 | 3924 | int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) |
michael@0 | 3925 | { |
michael@0 | 3926 | if (numdefcpd != 0) { |
michael@0 | 3927 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 3928 | return 1; |
michael@0 | 3929 | } |
michael@0 | 3930 | char * tp = line; |
michael@0 | 3931 | char * piece; |
michael@0 | 3932 | int i = 0; |
michael@0 | 3933 | int np = 0; |
michael@0 | 3934 | piece = mystrsep(&tp, 0); |
michael@0 | 3935 | while (piece) { |
michael@0 | 3936 | if (*piece != '\0') { |
michael@0 | 3937 | switch(i) { |
michael@0 | 3938 | case 0: { np++; break; } |
michael@0 | 3939 | case 1: { |
michael@0 | 3940 | numdefcpd = atoi(piece); |
michael@0 | 3941 | if (numdefcpd < 1) { |
michael@0 | 3942 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
michael@0 | 3943 | return 1; |
michael@0 | 3944 | } |
michael@0 | 3945 | defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); |
michael@0 | 3946 | if (!defcpdtable) return 1; |
michael@0 | 3947 | np++; |
michael@0 | 3948 | break; |
michael@0 | 3949 | } |
michael@0 | 3950 | default: break; |
michael@0 | 3951 | } |
michael@0 | 3952 | i++; |
michael@0 | 3953 | } |
michael@0 | 3954 | piece = mystrsep(&tp, 0); |
michael@0 | 3955 | } |
michael@0 | 3956 | if (np != 2) { |
michael@0 | 3957 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 3958 | return 1; |
michael@0 | 3959 | } |
michael@0 | 3960 | |
michael@0 | 3961 | /* now parse the numdefcpd lines to read in the remainder of the table */ |
michael@0 | 3962 | char * nl; |
michael@0 | 3963 | for (int j=0; j < numdefcpd; j++) { |
michael@0 | 3964 | if (!(nl = af->getline())) return 1; |
michael@0 | 3965 | mychomp(nl); |
michael@0 | 3966 | tp = nl; |
michael@0 | 3967 | i = 0; |
michael@0 | 3968 | defcpdtable[j].def = NULL; |
michael@0 | 3969 | piece = mystrsep(&tp, 0); |
michael@0 | 3970 | while (piece) { |
michael@0 | 3971 | if (*piece != '\0') { |
michael@0 | 3972 | switch(i) { |
michael@0 | 3973 | case 0: { |
michael@0 | 3974 | if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { |
michael@0 | 3975 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 3976 | numdefcpd = 0; |
michael@0 | 3977 | return 1; |
michael@0 | 3978 | } |
michael@0 | 3979 | break; |
michael@0 | 3980 | } |
michael@0 | 3981 | case 1: { // handle parenthesized flags |
michael@0 | 3982 | if (strchr(piece, '(')) { |
michael@0 | 3983 | defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG)); |
michael@0 | 3984 | defcpdtable[j].len = 0; |
michael@0 | 3985 | int end = 0; |
michael@0 | 3986 | FLAG * conv; |
michael@0 | 3987 | while (!end) { |
michael@0 | 3988 | char * par = piece + 1; |
michael@0 | 3989 | while (*par != '(' && *par != ')' && *par != '\0') par++; |
michael@0 | 3990 | if (*par == '\0') end = 1; else *par = '\0'; |
michael@0 | 3991 | if (*piece == '(') piece++; |
michael@0 | 3992 | if (*piece == '*' || *piece == '?') { |
michael@0 | 3993 | defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece; |
michael@0 | 3994 | } else if (*piece != '\0') { |
michael@0 | 3995 | int l = pHMgr->decode_flags(&conv, piece, af); |
michael@0 | 3996 | for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; |
michael@0 | 3997 | free(conv); |
michael@0 | 3998 | } |
michael@0 | 3999 | piece = par + 1; |
michael@0 | 4000 | } |
michael@0 | 4001 | } else { |
michael@0 | 4002 | defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af); |
michael@0 | 4003 | } |
michael@0 | 4004 | break; |
michael@0 | 4005 | } |
michael@0 | 4006 | default: break; |
michael@0 | 4007 | } |
michael@0 | 4008 | i++; |
michael@0 | 4009 | } |
michael@0 | 4010 | piece = mystrsep(&tp, 0); |
michael@0 | 4011 | } |
michael@0 | 4012 | if (!defcpdtable[j].len) { |
michael@0 | 4013 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 4014 | numdefcpd = 0; |
michael@0 | 4015 | return 1; |
michael@0 | 4016 | } |
michael@0 | 4017 | } |
michael@0 | 4018 | return 0; |
michael@0 | 4019 | } |
michael@0 | 4020 | |
michael@0 | 4021 | |
michael@0 | 4022 | /* parse in the character map table */ |
michael@0 | 4023 | int AffixMgr::parse_maptable(char * line, FileMgr * af) |
michael@0 | 4024 | { |
michael@0 | 4025 | if (nummap != 0) { |
michael@0 | 4026 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 4027 | return 1; |
michael@0 | 4028 | } |
michael@0 | 4029 | char * tp = line; |
michael@0 | 4030 | char * piece; |
michael@0 | 4031 | int i = 0; |
michael@0 | 4032 | int np = 0; |
michael@0 | 4033 | piece = mystrsep(&tp, 0); |
michael@0 | 4034 | while (piece) { |
michael@0 | 4035 | if (*piece != '\0') { |
michael@0 | 4036 | switch(i) { |
michael@0 | 4037 | case 0: { np++; break; } |
michael@0 | 4038 | case 1: { |
michael@0 | 4039 | nummap = atoi(piece); |
michael@0 | 4040 | if (nummap < 1) { |
michael@0 | 4041 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
michael@0 | 4042 | return 1; |
michael@0 | 4043 | } |
michael@0 | 4044 | maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); |
michael@0 | 4045 | if (!maptable) return 1; |
michael@0 | 4046 | np++; |
michael@0 | 4047 | break; |
michael@0 | 4048 | } |
michael@0 | 4049 | default: break; |
michael@0 | 4050 | } |
michael@0 | 4051 | i++; |
michael@0 | 4052 | } |
michael@0 | 4053 | piece = mystrsep(&tp, 0); |
michael@0 | 4054 | } |
michael@0 | 4055 | if (np != 2) { |
michael@0 | 4056 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 4057 | return 1; |
michael@0 | 4058 | } |
michael@0 | 4059 | |
michael@0 | 4060 | /* now parse the nummap lines to read in the remainder of the table */ |
michael@0 | 4061 | char * nl; |
michael@0 | 4062 | for (int j=0; j < nummap; j++) { |
michael@0 | 4063 | if (!(nl = af->getline())) return 1; |
michael@0 | 4064 | mychomp(nl); |
michael@0 | 4065 | tp = nl; |
michael@0 | 4066 | i = 0; |
michael@0 | 4067 | maptable[j].set = NULL; |
michael@0 | 4068 | maptable[j].len = 0; |
michael@0 | 4069 | piece = mystrsep(&tp, 0); |
michael@0 | 4070 | while (piece) { |
michael@0 | 4071 | if (*piece != '\0') { |
michael@0 | 4072 | switch(i) { |
michael@0 | 4073 | case 0: { |
michael@0 | 4074 | if (strncmp(piece,"MAP",3) != 0) { |
michael@0 | 4075 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 4076 | nummap = 0; |
michael@0 | 4077 | return 1; |
michael@0 | 4078 | } |
michael@0 | 4079 | break; |
michael@0 | 4080 | } |
michael@0 | 4081 | case 1: { |
michael@0 | 4082 | int setn = 0; |
michael@0 | 4083 | maptable[j].len = strlen(piece); |
michael@0 | 4084 | maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*)); |
michael@0 | 4085 | if (!maptable[j].set) return 1; |
michael@0 | 4086 | for (int k = 0; k < maptable[j].len; k++) { |
michael@0 | 4087 | int chl = 1; |
michael@0 | 4088 | int chb = k; |
michael@0 | 4089 | if (piece[k] == '(') { |
michael@0 | 4090 | char * parpos = strchr(piece + k, ')'); |
michael@0 | 4091 | if (parpos != NULL) { |
michael@0 | 4092 | chb = k + 1; |
michael@0 | 4093 | chl = (int)(parpos - piece) - k - 1; |
michael@0 | 4094 | k = k + chl + 1; |
michael@0 | 4095 | } |
michael@0 | 4096 | } else { |
michael@0 | 4097 | if (utf8 && (piece[k] & 0xc0) == 0xc0) { |
michael@0 | 4098 | for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++); |
michael@0 | 4099 | chl = k - chb; |
michael@0 | 4100 | k--; |
michael@0 | 4101 | } |
michael@0 | 4102 | } |
michael@0 | 4103 | maptable[j].set[setn] = (char *) malloc(chl + 1); |
michael@0 | 4104 | if (!maptable[j].set[setn]) return 1; |
michael@0 | 4105 | strncpy(maptable[j].set[setn], piece + chb, chl); |
michael@0 | 4106 | maptable[j].set[setn][chl] = '\0'; |
michael@0 | 4107 | setn++; |
michael@0 | 4108 | } |
michael@0 | 4109 | maptable[j].len = setn; |
michael@0 | 4110 | break; } |
michael@0 | 4111 | default: break; |
michael@0 | 4112 | } |
michael@0 | 4113 | i++; |
michael@0 | 4114 | } |
michael@0 | 4115 | piece = mystrsep(&tp, 0); |
michael@0 | 4116 | } |
michael@0 | 4117 | if (!maptable[j].set || !maptable[j].len) { |
michael@0 | 4118 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 4119 | nummap = 0; |
michael@0 | 4120 | return 1; |
michael@0 | 4121 | } |
michael@0 | 4122 | } |
michael@0 | 4123 | return 0; |
michael@0 | 4124 | } |
michael@0 | 4125 | |
michael@0 | 4126 | /* parse in the word breakpoint table */ |
michael@0 | 4127 | int AffixMgr::parse_breaktable(char * line, FileMgr * af) |
michael@0 | 4128 | { |
michael@0 | 4129 | if (numbreak > -1) { |
michael@0 | 4130 | HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
michael@0 | 4131 | return 1; |
michael@0 | 4132 | } |
michael@0 | 4133 | char * tp = line; |
michael@0 | 4134 | char * piece; |
michael@0 | 4135 | int i = 0; |
michael@0 | 4136 | int np = 0; |
michael@0 | 4137 | piece = mystrsep(&tp, 0); |
michael@0 | 4138 | while (piece) { |
michael@0 | 4139 | if (*piece != '\0') { |
michael@0 | 4140 | switch(i) { |
michael@0 | 4141 | case 0: { np++; break; } |
michael@0 | 4142 | case 1: { |
michael@0 | 4143 | numbreak = atoi(piece); |
michael@0 | 4144 | if (numbreak < 0) { |
michael@0 | 4145 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
michael@0 | 4146 | return 1; |
michael@0 | 4147 | } |
michael@0 | 4148 | if (numbreak == 0) return 0; |
michael@0 | 4149 | breaktable = (char **) malloc(numbreak * sizeof(char *)); |
michael@0 | 4150 | if (!breaktable) return 1; |
michael@0 | 4151 | np++; |
michael@0 | 4152 | break; |
michael@0 | 4153 | } |
michael@0 | 4154 | default: break; |
michael@0 | 4155 | } |
michael@0 | 4156 | i++; |
michael@0 | 4157 | } |
michael@0 | 4158 | piece = mystrsep(&tp, 0); |
michael@0 | 4159 | } |
michael@0 | 4160 | if (np != 2) { |
michael@0 | 4161 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 4162 | return 1; |
michael@0 | 4163 | } |
michael@0 | 4164 | |
michael@0 | 4165 | /* now parse the numbreak lines to read in the remainder of the table */ |
michael@0 | 4166 | char * nl; |
michael@0 | 4167 | for (int j=0; j < numbreak; j++) { |
michael@0 | 4168 | if (!(nl = af->getline())) return 1; |
michael@0 | 4169 | mychomp(nl); |
michael@0 | 4170 | tp = nl; |
michael@0 | 4171 | i = 0; |
michael@0 | 4172 | piece = mystrsep(&tp, 0); |
michael@0 | 4173 | while (piece) { |
michael@0 | 4174 | if (*piece != '\0') { |
michael@0 | 4175 | switch(i) { |
michael@0 | 4176 | case 0: { |
michael@0 | 4177 | if (strncmp(piece,"BREAK",5) != 0) { |
michael@0 | 4178 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 4179 | numbreak = 0; |
michael@0 | 4180 | return 1; |
michael@0 | 4181 | } |
michael@0 | 4182 | break; |
michael@0 | 4183 | } |
michael@0 | 4184 | case 1: { |
michael@0 | 4185 | breaktable[j] = mystrdup(piece); |
michael@0 | 4186 | break; |
michael@0 | 4187 | } |
michael@0 | 4188 | default: break; |
michael@0 | 4189 | } |
michael@0 | 4190 | i++; |
michael@0 | 4191 | } |
michael@0 | 4192 | piece = mystrsep(&tp, 0); |
michael@0 | 4193 | } |
michael@0 | 4194 | if (!breaktable) { |
michael@0 | 4195 | HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
michael@0 | 4196 | numbreak = 0; |
michael@0 | 4197 | return 1; |
michael@0 | 4198 | } |
michael@0 | 4199 | } |
michael@0 | 4200 | return 0; |
michael@0 | 4201 | } |
michael@0 | 4202 | |
michael@0 | 4203 | void AffixMgr::reverse_condition(char * piece) { |
michael@0 | 4204 | int neg = 0; |
michael@0 | 4205 | for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { |
michael@0 | 4206 | switch(*k) { |
michael@0 | 4207 | case '[': { |
michael@0 | 4208 | if (neg) *(k+1) = '['; else *k = ']'; |
michael@0 | 4209 | break; |
michael@0 | 4210 | } |
michael@0 | 4211 | case ']': { |
michael@0 | 4212 | *k = '['; |
michael@0 | 4213 | if (neg) *(k+1) = '^'; |
michael@0 | 4214 | neg = 0; |
michael@0 | 4215 | break; |
michael@0 | 4216 | } |
michael@0 | 4217 | case '^': { |
michael@0 | 4218 | if (*(k+1) == ']') neg = 1; else *(k+1) = *k; |
michael@0 | 4219 | break; |
michael@0 | 4220 | } |
michael@0 | 4221 | default: { |
michael@0 | 4222 | if (neg) *(k+1) = *k; |
michael@0 | 4223 | } |
michael@0 | 4224 | } |
michael@0 | 4225 | } |
michael@0 | 4226 | } |
michael@0 | 4227 | |
michael@0 | 4228 | int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) |
michael@0 | 4229 | { |
michael@0 | 4230 | int numents = 0; // number of affentry structures to parse |
michael@0 | 4231 | |
michael@0 | 4232 | unsigned short aflag = 0; // affix char identifier |
michael@0 | 4233 | |
michael@0 | 4234 | char ff=0; |
michael@0 | 4235 | std::vector<affentry> affentries; |
michael@0 | 4236 | |
michael@0 | 4237 | char * tp = line; |
michael@0 | 4238 | char * nl = line; |
michael@0 | 4239 | char * piece; |
michael@0 | 4240 | int i = 0; |
michael@0 | 4241 | |
michael@0 | 4242 | // checking lines with bad syntax |
michael@0 | 4243 | #ifdef DEBUG |
michael@0 | 4244 | int basefieldnum = 0; |
michael@0 | 4245 | #endif |
michael@0 | 4246 | |
michael@0 | 4247 | // split affix header line into pieces |
michael@0 | 4248 | |
michael@0 | 4249 | int np = 0; |
michael@0 | 4250 | |
michael@0 | 4251 | piece = mystrsep(&tp, 0); |
michael@0 | 4252 | while (piece) { |
michael@0 | 4253 | if (*piece != '\0') { |
michael@0 | 4254 | switch(i) { |
michael@0 | 4255 | // piece 1 - is type of affix |
michael@0 | 4256 | case 0: { np++; break; } |
michael@0 | 4257 | |
michael@0 | 4258 | // piece 2 - is affix char |
michael@0 | 4259 | case 1: { |
michael@0 | 4260 | np++; |
michael@0 | 4261 | aflag = pHMgr->decode_flag(piece); |
michael@0 | 4262 | if (((at == 'S') && (dupflags[aflag] & dupSFX)) || |
michael@0 | 4263 | ((at == 'P') && (dupflags[aflag] & dupPFX))) { |
michael@0 | 4264 | HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", |
michael@0 | 4265 | af->getlinenum()); |
michael@0 | 4266 | // return 1; XXX permissive mode for bad dictionaries |
michael@0 | 4267 | } |
michael@0 | 4268 | dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); |
michael@0 | 4269 | break; |
michael@0 | 4270 | } |
michael@0 | 4271 | // piece 3 - is cross product indicator |
michael@0 | 4272 | case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; } |
michael@0 | 4273 | |
michael@0 | 4274 | // piece 4 - is number of affentries |
michael@0 | 4275 | case 3: { |
michael@0 | 4276 | np++; |
michael@0 | 4277 | numents = atoi(piece); |
michael@0 | 4278 | if (numents == 0) { |
michael@0 | 4279 | char * err = pHMgr->encode_flag(aflag); |
michael@0 | 4280 | if (err) { |
michael@0 | 4281 | HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", |
michael@0 | 4282 | af->getlinenum()); |
michael@0 | 4283 | free(err); |
michael@0 | 4284 | } |
michael@0 | 4285 | return 1; |
michael@0 | 4286 | } |
michael@0 | 4287 | affentries.resize(numents); |
michael@0 | 4288 | affentries[0].opts = ff; |
michael@0 | 4289 | if (utf8) affentries[0].opts += aeUTF8; |
michael@0 | 4290 | if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF; |
michael@0 | 4291 | if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM; |
michael@0 | 4292 | affentries[0].aflag = aflag; |
michael@0 | 4293 | } |
michael@0 | 4294 | |
michael@0 | 4295 | default: break; |
michael@0 | 4296 | } |
michael@0 | 4297 | i++; |
michael@0 | 4298 | } |
michael@0 | 4299 | piece = mystrsep(&tp, 0); |
michael@0 | 4300 | } |
michael@0 | 4301 | // check to make sure we parsed enough pieces |
michael@0 | 4302 | if (np != 4) { |
michael@0 | 4303 | char * err = pHMgr->encode_flag(aflag); |
michael@0 | 4304 | if (err) { |
michael@0 | 4305 | HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
michael@0 | 4306 | free(err); |
michael@0 | 4307 | } |
michael@0 | 4308 | return 1; |
michael@0 | 4309 | } |
michael@0 | 4310 | |
michael@0 | 4311 | // now parse numents affentries for this affix |
michael@0 | 4312 | std::vector<affentry>::iterator start = affentries.begin(); |
michael@0 | 4313 | std::vector<affentry>::iterator end = affentries.end(); |
michael@0 | 4314 | for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { |
michael@0 | 4315 | if (!(nl = af->getline())) return 1; |
michael@0 | 4316 | mychomp(nl); |
michael@0 | 4317 | tp = nl; |
michael@0 | 4318 | i = 0; |
michael@0 | 4319 | np = 0; |
michael@0 | 4320 | |
michael@0 | 4321 | // split line into pieces |
michael@0 | 4322 | piece = mystrsep(&tp, 0); |
michael@0 | 4323 | while (piece) { |
michael@0 | 4324 | if (*piece != '\0') { |
michael@0 | 4325 | switch(i) { |
michael@0 | 4326 | // piece 1 - is type |
michael@0 | 4327 | case 0: { |
michael@0 | 4328 | np++; |
michael@0 | 4329 | if (entry != start) entry->opts = start->opts & |
michael@0 | 4330 | (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); |
michael@0 | 4331 | break; |
michael@0 | 4332 | } |
michael@0 | 4333 | |
michael@0 | 4334 | // piece 2 - is affix char |
michael@0 | 4335 | case 1: { |
michael@0 | 4336 | np++; |
michael@0 | 4337 | if (pHMgr->decode_flag(piece) != aflag) { |
michael@0 | 4338 | char * err = pHMgr->encode_flag(aflag); |
michael@0 | 4339 | if (err) { |
michael@0 | 4340 | HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", |
michael@0 | 4341 | af->getlinenum(), err); |
michael@0 | 4342 | free(err); |
michael@0 | 4343 | } |
michael@0 | 4344 | return 1; |
michael@0 | 4345 | } |
michael@0 | 4346 | |
michael@0 | 4347 | if (entry != start) entry->aflag = start->aflag; |
michael@0 | 4348 | break; |
michael@0 | 4349 | } |
michael@0 | 4350 | |
michael@0 | 4351 | // piece 3 - is string to strip or 0 for null |
michael@0 | 4352 | case 2: { |
michael@0 | 4353 | np++; |
michael@0 | 4354 | if (complexprefixes) { |
michael@0 | 4355 | if (utf8) reverseword_utf(piece); else reverseword(piece); |
michael@0 | 4356 | } |
michael@0 | 4357 | entry->strip = mystrdup(piece); |
michael@0 | 4358 | entry->stripl = (unsigned char) strlen(entry->strip); |
michael@0 | 4359 | if (strcmp(entry->strip,"0") == 0) { |
michael@0 | 4360 | free(entry->strip); |
michael@0 | 4361 | entry->strip=mystrdup(""); |
michael@0 | 4362 | entry->stripl = 0; |
michael@0 | 4363 | } |
michael@0 | 4364 | break; |
michael@0 | 4365 | } |
michael@0 | 4366 | |
michael@0 | 4367 | // piece 4 - is affix string or 0 for null |
michael@0 | 4368 | case 3: { |
michael@0 | 4369 | char * dash; |
michael@0 | 4370 | entry->morphcode = NULL; |
michael@0 | 4371 | entry->contclass = NULL; |
michael@0 | 4372 | entry->contclasslen = 0; |
michael@0 | 4373 | np++; |
michael@0 | 4374 | dash = strchr(piece, '/'); |
michael@0 | 4375 | if (dash) { |
michael@0 | 4376 | *dash = '\0'; |
michael@0 | 4377 | |
michael@0 | 4378 | if (ignorechars) { |
michael@0 | 4379 | if (utf8) { |
michael@0 | 4380 | remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len); |
michael@0 | 4381 | } else { |
michael@0 | 4382 | remove_ignored_chars(piece,ignorechars); |
michael@0 | 4383 | } |
michael@0 | 4384 | } |
michael@0 | 4385 | |
michael@0 | 4386 | if (complexprefixes) { |
michael@0 | 4387 | if (utf8) reverseword_utf(piece); else reverseword(piece); |
michael@0 | 4388 | } |
michael@0 | 4389 | entry->appnd = mystrdup(piece); |
michael@0 | 4390 | |
michael@0 | 4391 | if (pHMgr->is_aliasf()) { |
michael@0 | 4392 | int index = atoi(dash + 1); |
michael@0 | 4393 | entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af); |
michael@0 | 4394 | if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); |
michael@0 | 4395 | } else { |
michael@0 | 4396 | entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af); |
michael@0 | 4397 | flag_qsort(entry->contclass, 0, entry->contclasslen); |
michael@0 | 4398 | } |
michael@0 | 4399 | *dash = '/'; |
michael@0 | 4400 | |
michael@0 | 4401 | havecontclass = 1; |
michael@0 | 4402 | for (unsigned short _i = 0; _i < entry->contclasslen; _i++) { |
michael@0 | 4403 | contclasses[(entry->contclass)[_i]] = 1; |
michael@0 | 4404 | } |
michael@0 | 4405 | } else { |
michael@0 | 4406 | if (ignorechars) { |
michael@0 | 4407 | if (utf8) { |
michael@0 | 4408 | remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len); |
michael@0 | 4409 | } else { |
michael@0 | 4410 | remove_ignored_chars(piece,ignorechars); |
michael@0 | 4411 | } |
michael@0 | 4412 | } |
michael@0 | 4413 | |
michael@0 | 4414 | if (complexprefixes) { |
michael@0 | 4415 | if (utf8) reverseword_utf(piece); else reverseword(piece); |
michael@0 | 4416 | } |
michael@0 | 4417 | entry->appnd = mystrdup(piece); |
michael@0 | 4418 | } |
michael@0 | 4419 | |
michael@0 | 4420 | entry->appndl = (unsigned char) strlen(entry->appnd); |
michael@0 | 4421 | if (strcmp(entry->appnd,"0") == 0) { |
michael@0 | 4422 | free(entry->appnd); |
michael@0 | 4423 | entry->appnd=mystrdup(""); |
michael@0 | 4424 | entry->appndl = 0; |
michael@0 | 4425 | } |
michael@0 | 4426 | break; |
michael@0 | 4427 | } |
michael@0 | 4428 | |
michael@0 | 4429 | // piece 5 - is the conditions descriptions |
michael@0 | 4430 | case 4: { |
michael@0 | 4431 | np++; |
michael@0 | 4432 | if (complexprefixes) { |
michael@0 | 4433 | if (utf8) reverseword_utf(piece); else reverseword(piece); |
michael@0 | 4434 | reverse_condition(piece); |
michael@0 | 4435 | } |
michael@0 | 4436 | if (entry->stripl && (strcmp(piece, ".") != 0) && |
michael@0 | 4437 | redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum())) |
michael@0 | 4438 | strcpy(piece, "."); |
michael@0 | 4439 | if (at == 'S') { |
michael@0 | 4440 | reverseword(piece); |
michael@0 | 4441 | reverse_condition(piece); |
michael@0 | 4442 | } |
michael@0 | 4443 | if (encodeit(*entry, piece)) return 1; |
michael@0 | 4444 | break; |
michael@0 | 4445 | } |
michael@0 | 4446 | |
michael@0 | 4447 | case 5: { |
michael@0 | 4448 | np++; |
michael@0 | 4449 | if (pHMgr->is_aliasm()) { |
michael@0 | 4450 | int index = atoi(piece); |
michael@0 | 4451 | entry->morphcode = pHMgr->get_aliasm(index); |
michael@0 | 4452 | } else { |
michael@0 | 4453 | if (complexprefixes) { // XXX - fix me for morph. gen. |
michael@0 | 4454 | if (utf8) reverseword_utf(piece); else reverseword(piece); |
michael@0 | 4455 | } |
michael@0 | 4456 | // add the remaining of the line |
michael@0 | 4457 | if (*tp) { |
michael@0 | 4458 | *(tp - 1) = ' '; |
michael@0 | 4459 | tp = tp + strlen(tp); |
michael@0 | 4460 | } |
michael@0 | 4461 | entry->morphcode = mystrdup(piece); |
michael@0 | 4462 | if (!entry->morphcode) return 1; |
michael@0 | 4463 | } |
michael@0 | 4464 | break; |
michael@0 | 4465 | } |
michael@0 | 4466 | default: break; |
michael@0 | 4467 | } |
michael@0 | 4468 | i++; |
michael@0 | 4469 | } |
michael@0 | 4470 | piece = mystrsep(&tp, 0); |
michael@0 | 4471 | } |
michael@0 | 4472 | // check to make sure we parsed enough pieces |
michael@0 | 4473 | if (np < 4) { |
michael@0 | 4474 | char * err = pHMgr->encode_flag(aflag); |
michael@0 | 4475 | if (err) { |
michael@0 | 4476 | HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n", |
michael@0 | 4477 | af->getlinenum(), err); |
michael@0 | 4478 | free(err); |
michael@0 | 4479 | } |
michael@0 | 4480 | return 1; |
michael@0 | 4481 | } |
michael@0 | 4482 | |
michael@0 | 4483 | #ifdef DEBUG |
michael@0 | 4484 | // detect unnecessary fields, excepting comments |
michael@0 | 4485 | if (basefieldnum) { |
michael@0 | 4486 | int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6); |
michael@0 | 4487 | if (fieldnum != basefieldnum) |
michael@0 | 4488 | HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum()); |
michael@0 | 4489 | } else { |
michael@0 | 4490 | basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6); |
michael@0 | 4491 | } |
michael@0 | 4492 | #endif |
michael@0 | 4493 | } |
michael@0 | 4494 | |
michael@0 | 4495 | // now create SfxEntry or PfxEntry objects and use links to |
michael@0 | 4496 | // build an ordered (sorted by affix string) list |
michael@0 | 4497 | for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) { |
michael@0 | 4498 | if (at == 'P') { |
michael@0 | 4499 | PfxEntry * pfxptr = new PfxEntry(this,&(*entry)); |
michael@0 | 4500 | build_pfxtree(pfxptr); |
michael@0 | 4501 | } else { |
michael@0 | 4502 | SfxEntry * sfxptr = new SfxEntry(this,&(*entry)); |
michael@0 | 4503 | build_sfxtree(sfxptr); |
michael@0 | 4504 | } |
michael@0 | 4505 | } |
michael@0 | 4506 | return 0; |
michael@0 | 4507 | } |
michael@0 | 4508 | |
michael@0 | 4509 | int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) { |
michael@0 | 4510 | int condl = strlen(cond); |
michael@0 | 4511 | int i; |
michael@0 | 4512 | int j; |
michael@0 | 4513 | int neg; |
michael@0 | 4514 | int in; |
michael@0 | 4515 | if (ft == 'P') { // prefix |
michael@0 | 4516 | if (strncmp(strip, cond, condl) == 0) return 1; |
michael@0 | 4517 | if (utf8) { |
michael@0 | 4518 | } else { |
michael@0 | 4519 | for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { |
michael@0 | 4520 | if (cond[j] != '[') { |
michael@0 | 4521 | if (cond[j] != strip[i]) { |
michael@0 | 4522 | HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); |
michael@0 | 4523 | return 0; |
michael@0 | 4524 | } |
michael@0 | 4525 | } else { |
michael@0 | 4526 | neg = (cond[j+1] == '^') ? 1 : 0; |
michael@0 | 4527 | in = 0; |
michael@0 | 4528 | do { |
michael@0 | 4529 | j++; |
michael@0 | 4530 | if (strip[i] == cond[j]) in = 1; |
michael@0 | 4531 | } while ((j < (condl - 1)) && (cond[j] != ']')); |
michael@0 | 4532 | if (j == (condl - 1) && (cond[j] != ']')) { |
michael@0 | 4533 | HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond); |
michael@0 | 4534 | return 0; |
michael@0 | 4535 | } |
michael@0 | 4536 | if ((!neg && !in) || (neg && in)) { |
michael@0 | 4537 | HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); |
michael@0 | 4538 | return 0; |
michael@0 | 4539 | } |
michael@0 | 4540 | } |
michael@0 | 4541 | } |
michael@0 | 4542 | if (j >= condl) return 1; |
michael@0 | 4543 | } |
michael@0 | 4544 | } else { // suffix |
michael@0 | 4545 | if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1; |
michael@0 | 4546 | if (utf8) { |
michael@0 | 4547 | } else { |
michael@0 | 4548 | for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { |
michael@0 | 4549 | if (cond[j] != ']') { |
michael@0 | 4550 | if (cond[j] != strip[i]) { |
michael@0 | 4551 | HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); |
michael@0 | 4552 | return 0; |
michael@0 | 4553 | } |
michael@0 | 4554 | } else { |
michael@0 | 4555 | in = 0; |
michael@0 | 4556 | do { |
michael@0 | 4557 | j--; |
michael@0 | 4558 | if (strip[i] == cond[j]) in = 1; |
michael@0 | 4559 | } while ((j > 0) && (cond[j] != '[')); |
michael@0 | 4560 | if ((j == 0) && (cond[j] != '[')) { |
michael@0 | 4561 | HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond); |
michael@0 | 4562 | return 0; |
michael@0 | 4563 | } |
michael@0 | 4564 | neg = (cond[j+1] == '^') ? 1 : 0; |
michael@0 | 4565 | if ((!neg && !in) || (neg && in)) { |
michael@0 | 4566 | HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); |
michael@0 | 4567 | return 0; |
michael@0 | 4568 | } |
michael@0 | 4569 | } |
michael@0 | 4570 | } |
michael@0 | 4571 | if (j < 0) return 1; |
michael@0 | 4572 | } |
michael@0 | 4573 | } |
michael@0 | 4574 | return 0; |
michael@0 | 4575 | } |