extensions/spellcheck/hunspell/src/affentry.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /******* BEGIN LICENSE BLOCK *******
michael@0 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
michael@0 3 *
michael@0 4 * The contents of this file are subject to the Mozilla Public License Version
michael@0 5 * 1.1 (the "License"); you may not use this file except in compliance with
michael@0 6 * the License. You may obtain a copy of the License at
michael@0 7 * http://www.mozilla.org/MPL/
michael@0 8 *
michael@0 9 * Software distributed under the License is distributed on an "AS IS" basis,
michael@0 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
michael@0 11 * for the specific language governing rights and limitations under the
michael@0 12 * License.
michael@0 13 *
michael@0 14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
michael@0 15 * and László Németh (Hunspell). Portions created by the Initial Developers
michael@0 16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
michael@0 17 *
michael@0 18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
michael@0 19 * David Einstein (deinst@world.std.com)
michael@0 20 * László Németh (nemethl@gyorsposta.hu)
michael@0 21 * Caolan McNamara (caolanm@redhat.com)
michael@0 22 * Davide Prina
michael@0 23 * Giuseppe Modugno
michael@0 24 * Gianluca Turconi
michael@0 25 * Simon Brouwer
michael@0 26 * Noll Janos
michael@0 27 * Biro Arpad
michael@0 28 * Goldman Eleonora
michael@0 29 * Sarlos Tamas
michael@0 30 * Bencsath Boldizsar
michael@0 31 * Halacsy Peter
michael@0 32 * Dvornik Laszlo
michael@0 33 * Gefferth Andras
michael@0 34 * Nagy Viktor
michael@0 35 * Varga Daniel
michael@0 36 * Chris Halls
michael@0 37 * Rene Engelhard
michael@0 38 * Bram Moolenaar
michael@0 39 * Dafydd Jones
michael@0 40 * Harri Pitkanen
michael@0 41 * Andras Timar
michael@0 42 * Tor Lillqvist
michael@0 43 *
michael@0 44 * Alternatively, the contents of this file may be used under the terms of
michael@0 45 * either the GNU General Public License Version 2 or later (the "GPL"), or
michael@0 46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
michael@0 47 * in which case the provisions of the GPL or the LGPL are applicable instead
michael@0 48 * of those above. If you wish to allow use of your version of this file only
michael@0 49 * under the terms of either the GPL or the LGPL, and not to allow others to
michael@0 50 * use your version of this file under the terms of the MPL, indicate your
michael@0 51 * decision by deleting the provisions above and replace them with the notice
michael@0 52 * and other provisions required by the GPL or the LGPL. If you do not delete
michael@0 53 * the provisions above, a recipient may use your version of this file under
michael@0 54 * the terms of any one of the MPL, the GPL or the LGPL.
michael@0 55 *
michael@0 56 ******* END LICENSE BLOCK *******/
michael@0 57
michael@0 58 #include <stdlib.h>
michael@0 59 #include <string.h>
michael@0 60 #include <stdio.h>
michael@0 61 #include <ctype.h>
michael@0 62
michael@0 63 #include "affentry.hxx"
michael@0 64 #include "csutil.hxx"
michael@0 65
michael@0 66 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
michael@0 67 {
michael@0 68 // register affix manager
michael@0 69 pmyMgr = pmgr;
michael@0 70
michael@0 71 // set up its initial values
michael@0 72
michael@0 73 aflag = dp->aflag; // flag
michael@0 74 strip = dp->strip; // string to strip
michael@0 75 appnd = dp->appnd; // string to append
michael@0 76 stripl = dp->stripl; // length of strip string
michael@0 77 appndl = dp->appndl; // length of append string
michael@0 78 numconds = dp->numconds; // length of the condition
michael@0 79 opts = dp->opts; // cross product flag
michael@0 80 // then copy over all of the conditions
michael@0 81 if (opts & aeLONGCOND) {
michael@0 82 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
michael@0 83 c.l.conds2 = dp->c.l.conds2;
michael@0 84 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
michael@0 85 next = NULL;
michael@0 86 nextne = NULL;
michael@0 87 nexteq = NULL;
michael@0 88 morphcode = dp->morphcode;
michael@0 89 contclass = dp->contclass;
michael@0 90 contclasslen = dp->contclasslen;
michael@0 91 }
michael@0 92
michael@0 93
michael@0 94 PfxEntry::~PfxEntry()
michael@0 95 {
michael@0 96 aflag = 0;
michael@0 97 if (appnd) free(appnd);
michael@0 98 if (strip) free(strip);
michael@0 99 pmyMgr = NULL;
michael@0 100 appnd = NULL;
michael@0 101 strip = NULL;
michael@0 102 if (opts & aeLONGCOND) free(c.l.conds2);
michael@0 103 if (morphcode && !(opts & aeALIASM)) free(morphcode);
michael@0 104 if (contclass && !(opts & aeALIASF)) free(contclass);
michael@0 105 }
michael@0 106
michael@0 107 // add prefix to this word assuming conditions hold
michael@0 108 char * PfxEntry::add(const char * word, int len)
michael@0 109 {
michael@0 110 char tword[MAXWORDUTF8LEN + 4];
michael@0 111
michael@0 112 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
michael@0 113 (len >= numconds) && test_condition(word) &&
michael@0 114 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
michael@0 115 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
michael@0 116 /* we have a match so add prefix */
michael@0 117 char * pp = tword;
michael@0 118 if (appndl) {
michael@0 119 strcpy(tword,appnd);
michael@0 120 pp += appndl;
michael@0 121 }
michael@0 122 strcpy(pp, (word + stripl));
michael@0 123 return mystrdup(tword);
michael@0 124 }
michael@0 125 return NULL;
michael@0 126 }
michael@0 127
michael@0 128 inline char * PfxEntry::nextchar(char * p) {
michael@0 129 if (p) {
michael@0 130 p++;
michael@0 131 if (opts & aeLONGCOND) {
michael@0 132 // jump to the 2nd part of the condition
michael@0 133 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
michael@0 134 // end of the MAXCONDLEN length condition
michael@0 135 } else if (p == c.conds + MAXCONDLEN) return NULL;
michael@0 136 return *p ? p : NULL;
michael@0 137 }
michael@0 138 return NULL;
michael@0 139 }
michael@0 140
// Test whether the beginning of `st` satisfies this prefix's condition
// pattern. The pattern starts at c.conds and is walked with nextchar().
// Pattern elements handled below: literal characters, '.' (any one
// character), bracket groups "[...]" and negated groups "[^...]".
// When aeUTF8 is set in opts, UTF-8 continuation bytes (10xxxxxx) are
// skipped or compared together with their lead byte, so a multibyte
// sequence is treated as one character.
// Returns 1 if the condition holds (always when numconds == 0), else 0.
michael@0 141 inline int PfxEntry::test_condition(const char * st)
michael@0 142 {
michael@0 143 const char * pos = NULL; // group with pos input position
michael@0 144 bool neg = false; // complementer
michael@0 145 bool ingroup = false; // character in the group
michael@0 146 if (numconds == 0) return 1;
michael@0 147 char * p = c.conds;
michael@0 148 while (1) {
michael@0 149 switch (*p) {
// end of the pattern: everything matched
michael@0 150 case '\0': return 1;
// group start: reset the group state and remember the input position the
// group members are compared against
michael@0 151 case '[': {
michael@0 152 neg = false;
michael@0 153 ingroup = false;
michael@0 154 p = nextchar(p);
michael@0 155 pos = st; break;
michael@0 156 }
// '^' marks the group as negated
michael@0 157 case '^': { p = nextchar(p); neg = true; break; }
// group end: fail if a negated group contained the input character or a
// plain group did not
michael@0 158 case ']': {
michael@0 159 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
michael@0 160 pos = NULL;
michael@0 161 p = nextchar(p);
michael@0 162 // skip the next character
// (only when no group member consumed it; with aeUTF8 the continuation
// bytes are skipped as well)
michael@0 163 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
michael@0 164 if (*st == '\0' && p) return 0; // word <= condition
michael@0 165 break;
michael@0 166 }
// outside a group '.' consumes one input character; inside a group
// (pos != NULL) control falls through to the default case and the dot is
// compared literally
// NOTE(review): st is incremented here without first checking *st != '\0';
// confirm that callers guarantee the input is long enough
michael@0 167 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
michael@0 168 p = nextchar(p);
michael@0 169 // skip the next character
michael@0 170 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
michael@0 171 if (*st == '\0' && p) return 0; // word <= condition
michael@0 172 break;
michael@0 173 }
// literal character (or group member)
michael@0 174 default: {
michael@0 175 if (*st == *p) {
michael@0 176 st++;
michael@0 177 p = nextchar(p);
// first byte matched and has its high bit set: compare the continuation
// bytes of the pattern character too
michael@0 178 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
michael@0 179 while (p && (*p & 0xc0) == 0x80) { // character
michael@0 180 if (*p != *st) {
// mismatch: fatal outside a group, inside a group rewind the input to the
// group's start position
michael@0 181 if (!pos) return 0;
michael@0 182 st = pos;
michael@0 183 break;
michael@0 184 }
michael@0 185 p = nextchar(p);
michael@0 186 st++;
michael@0 187 }
// a group member matched completely: skip the rest of the group up to ']'
michael@0 188 if (pos && st != pos) {
michael@0 189 ingroup = true;
michael@0 190 while (p && *p != ']' && (p = nextchar(p)));
michael@0 191 }
michael@0 192 } else if (pos) {
// single-byte group member matched: skip the rest of the group up to ']'
michael@0 193 ingroup = true;
michael@0 194 while (p && *p != ']' && (p = nextchar(p)));
michael@0 195 }
michael@0 196 } else if (pos) { // group
// this group member did not match, try the next one
michael@0 197 p = nextchar(p);
michael@0 198 } else return 0;
michael@0 199 }
michael@0 200 }
// nextchar() returned NULL: the pattern is exhausted, so it matched
michael@0 201 if (!p) return 1;
michael@0 202 }
michael@0 203 }
michael@0 204
michael@0 205 // check if this prefix entry matches
michael@0 206 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
michael@0 207 {
michael@0 208 int tmpl; // length of tmpword
michael@0 209 struct hentry * he; // hash entry of root word or NULL
michael@0 210 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 211
michael@0 212 // on entry prefix is 0 length or already matches the beginning of the word.
michael@0 213 // So if the remaining root word has positive length
michael@0 214 // and if there are enough chars in root word and added back strip chars
michael@0 215 // to meet the number of characters conditions, then test it
michael@0 216
michael@0 217 tmpl = len - appndl;
michael@0 218
michael@0 219 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
michael@0 220
michael@0 221 // generate new root word by removing prefix and adding
michael@0 222 // back any characters that would have been stripped
michael@0 223
michael@0 224 if (stripl) strcpy (tmpword, strip);
michael@0 225 strcpy ((tmpword + stripl), (word + appndl));
michael@0 226
michael@0 227 // now make sure all of the conditions on characters
michael@0 228 // are met. Please see the appendix at the end of
michael@0 229 // this file for more info on exactly what is being
michael@0 230 // tested
michael@0 231
michael@0 232 // if all conditions are met then check if resulting
michael@0 233 // root word in the dictionary
michael@0 234
michael@0 235 if (test_condition(tmpword)) {
michael@0 236 tmpl += stripl;
michael@0 237 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0 238 do {
michael@0 239 if (TESTAFF(he->astr, aflag, he->alen) &&
michael@0 240 // forbid single prefixes with needaffix flag
michael@0 241 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
michael@0 242 // needflag
michael@0 243 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
michael@0 244 (contclass && TESTAFF(contclass, needflag, contclasslen))))
michael@0 245 return he;
michael@0 246 he = he->next_homonym; // check homonyms
michael@0 247 } while (he);
michael@0 248 }
michael@0 249
michael@0 250 // prefix matched but no root word was found
michael@0 251 // if aeXPRODUCT is allowed, try again but now
michael@0 252 // ross checked combined with a suffix
michael@0 253
michael@0 254 //if ((opts & aeXPRODUCT) && in_compound) {
michael@0 255 if ((opts & aeXPRODUCT)) {
michael@0 256 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
michael@0 257 0, NULL, FLAG_NULL, needflag, in_compound);
michael@0 258 if (he) return he;
michael@0 259 }
michael@0 260 }
michael@0 261 }
michael@0 262 return NULL;
michael@0 263 }
michael@0 264
michael@0 265 // check if this prefix entry matches
michael@0 266 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
michael@0 267 char in_compound, const FLAG needflag)
michael@0 268 {
michael@0 269 int tmpl; // length of tmpword
michael@0 270 struct hentry * he; // hash entry of root word or NULL
michael@0 271 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 272
michael@0 273 // on entry prefix is 0 length or already matches the beginning of the word.
michael@0 274 // So if the remaining root word has positive length
michael@0 275 // and if there are enough chars in root word and added back strip chars
michael@0 276 // to meet the number of characters conditions, then test it
michael@0 277
michael@0 278 tmpl = len - appndl;
michael@0 279
michael@0 280 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 281 (tmpl + stripl >= numconds)) {
michael@0 282
michael@0 283 // generate new root word by removing prefix and adding
michael@0 284 // back any characters that would have been stripped
michael@0 285
michael@0 286 if (stripl) strcpy (tmpword, strip);
michael@0 287 strcpy ((tmpword + stripl), (word + appndl));
michael@0 288
michael@0 289 // now make sure all of the conditions on characters
michael@0 290 // are met. Please see the appendix at the end of
michael@0 291 // this file for more info on exactly what is being
michael@0 292 // tested
michael@0 293
michael@0 294 // if all conditions are met then check if resulting
michael@0 295 // root word in the dictionary
michael@0 296
michael@0 297 if (test_condition(tmpword)) {
michael@0 298 tmpl += stripl;
michael@0 299
michael@0 300 // prefix matched but no root word was found
michael@0 301 // if aeXPRODUCT is allowed, try again but now
michael@0 302 // cross checked combined with a suffix
michael@0 303
michael@0 304 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0 305 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
michael@0 306 if (he) return he;
michael@0 307 }
michael@0 308 }
michael@0 309 }
michael@0 310 return NULL;
michael@0 311 }
michael@0 312
michael@0 313 // check if this prefix entry matches
michael@0 314 char * PfxEntry::check_twosfx_morph(const char * word, int len,
michael@0 315 char in_compound, const FLAG needflag)
michael@0 316 {
michael@0 317 int tmpl; // length of tmpword
michael@0 318 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 319
michael@0 320 // on entry prefix is 0 length or already matches the beginning of the word.
michael@0 321 // So if the remaining root word has positive length
michael@0 322 // and if there are enough chars in root word and added back strip chars
michael@0 323 // to meet the number of characters conditions, then test it
michael@0 324
michael@0 325 tmpl = len - appndl;
michael@0 326
michael@0 327 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 328 (tmpl + stripl >= numconds)) {
michael@0 329
michael@0 330 // generate new root word by removing prefix and adding
michael@0 331 // back any characters that would have been stripped
michael@0 332
michael@0 333 if (stripl) strcpy (tmpword, strip);
michael@0 334 strcpy ((tmpword + stripl), (word + appndl));
michael@0 335
michael@0 336 // now make sure all of the conditions on characters
michael@0 337 // are met. Please see the appendix at the end of
michael@0 338 // this file for more info on exactly what is being
michael@0 339 // tested
michael@0 340
michael@0 341 // if all conditions are met then check if resulting
michael@0 342 // root word in the dictionary
michael@0 343
michael@0 344 if (test_condition(tmpword)) {
michael@0 345 tmpl += stripl;
michael@0 346
michael@0 347 // prefix matched but no root word was found
michael@0 348 // if aeXPRODUCT is allowed, try again but now
michael@0 349 // ross checked combined with a suffix
michael@0 350
michael@0 351 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0 352 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
michael@0 353 aeXPRODUCT, this, needflag);
michael@0 354 }
michael@0 355 }
michael@0 356 }
michael@0 357 return NULL;
michael@0 358 }
michael@0 359
michael@0 360 // check if this prefix entry matches
michael@0 361 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
michael@0 362 {
michael@0 363 int tmpl; // length of tmpword
michael@0 364 struct hentry * he; // hash entry of root word or NULL
michael@0 365 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 366 char result[MAXLNLEN];
michael@0 367 char * st;
michael@0 368
michael@0 369 *result = '\0';
michael@0 370
michael@0 371 // on entry prefix is 0 length or already matches the beginning of the word.
michael@0 372 // So if the remaining root word has positive length
michael@0 373 // and if there are enough chars in root word and added back strip chars
michael@0 374 // to meet the number of characters conditions, then test it
michael@0 375
michael@0 376 tmpl = len - appndl;
michael@0 377
michael@0 378 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 379 (tmpl + stripl >= numconds)) {
michael@0 380
michael@0 381 // generate new root word by removing prefix and adding
michael@0 382 // back any characters that would have been stripped
michael@0 383
michael@0 384 if (stripl) strcpy (tmpword, strip);
michael@0 385 strcpy ((tmpword + stripl), (word + appndl));
michael@0 386
michael@0 387 // now make sure all of the conditions on characters
michael@0 388 // are met. Please see the appendix at the end of
michael@0 389 // this file for more info on exactly what is being
michael@0 390 // tested
michael@0 391
michael@0 392 // if all conditions are met then check if resulting
michael@0 393 // root word in the dictionary
michael@0 394
michael@0 395 if (test_condition(tmpword)) {
michael@0 396 tmpl += stripl;
michael@0 397 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0 398 do {
michael@0 399 if (TESTAFF(he->astr, aflag, he->alen) &&
michael@0 400 // forbid single prefixes with needaffix flag
michael@0 401 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
michael@0 402 // needflag
michael@0 403 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
michael@0 404 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
michael@0 405 if (morphcode) {
michael@0 406 mystrcat(result, " ", MAXLNLEN);
michael@0 407 mystrcat(result, morphcode, MAXLNLEN);
michael@0 408 } else mystrcat(result,getKey(), MAXLNLEN);
michael@0 409 if (!HENTRY_FIND(he, MORPH_STEM)) {
michael@0 410 mystrcat(result, " ", MAXLNLEN);
michael@0 411 mystrcat(result, MORPH_STEM, MAXLNLEN);
michael@0 412 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
michael@0 413 }
michael@0 414 // store the pointer of the hash entry
michael@0 415 if (HENTRY_DATA(he)) {
michael@0 416 mystrcat(result, " ", MAXLNLEN);
michael@0 417 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
michael@0 418 } else {
michael@0 419 // return with debug information
michael@0 420 char * flag = pmyMgr->encode_flag(getFlag());
michael@0 421 mystrcat(result, " ", MAXLNLEN);
michael@0 422 mystrcat(result, MORPH_FLAG, MAXLNLEN);
michael@0 423 mystrcat(result, flag, MAXLNLEN);
michael@0 424 free(flag);
michael@0 425 }
michael@0 426 mystrcat(result, "\n", MAXLNLEN);
michael@0 427 }
michael@0 428 he = he->next_homonym;
michael@0 429 } while (he);
michael@0 430 }
michael@0 431
michael@0 432 // prefix matched but no root word was found
michael@0 433 // if aeXPRODUCT is allowed, try again but now
michael@0 434 // ross checked combined with a suffix
michael@0 435
michael@0 436 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
michael@0 437 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
michael@0 438 FLAG_NULL, needflag);
michael@0 439 if (st) {
michael@0 440 mystrcat(result, st, MAXLNLEN);
michael@0 441 free(st);
michael@0 442 }
michael@0 443 }
michael@0 444 }
michael@0 445 }
michael@0 446
michael@0 447 if (*result) return mystrdup(result);
michael@0 448 return NULL;
michael@0 449 }
michael@0 450
michael@0 451 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
michael@0 452 {
michael@0 453 // register affix manager
michael@0 454 pmyMgr = pmgr;
michael@0 455
michael@0 456 // set up its initial values
michael@0 457 aflag = dp->aflag; // char flag
michael@0 458 strip = dp->strip; // string to strip
michael@0 459 appnd = dp->appnd; // string to append
michael@0 460 stripl = dp->stripl; // length of strip string
michael@0 461 appndl = dp->appndl; // length of append string
michael@0 462 numconds = dp->numconds; // length of the condition
michael@0 463 opts = dp->opts; // cross product flag
michael@0 464
michael@0 465 // then copy over all of the conditions
michael@0 466 if (opts & aeLONGCOND) {
michael@0 467 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
michael@0 468 c.l.conds2 = dp->c.l.conds2;
michael@0 469 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
michael@0 470 next = NULL;
michael@0 471 nextne = NULL;
michael@0 472 nexteq = NULL;
michael@0 473 rappnd = myrevstrdup(appnd);
michael@0 474 morphcode = dp->morphcode;
michael@0 475 contclass = dp->contclass;
michael@0 476 contclasslen = dp->contclasslen;
michael@0 477 }
michael@0 478
michael@0 479
michael@0 480 SfxEntry::~SfxEntry()
michael@0 481 {
michael@0 482 aflag = 0;
michael@0 483 if (appnd) free(appnd);
michael@0 484 if (rappnd) free(rappnd);
michael@0 485 if (strip) free(strip);
michael@0 486 pmyMgr = NULL;
michael@0 487 appnd = NULL;
michael@0 488 strip = NULL;
michael@0 489 if (opts & aeLONGCOND) free(c.l.conds2);
michael@0 490 if (morphcode && !(opts & aeALIASM)) free(morphcode);
michael@0 491 if (contclass && !(opts & aeALIASF)) free(contclass);
michael@0 492 }
michael@0 493
michael@0 494 // add suffix to this word assuming conditions hold
michael@0 495 char * SfxEntry::add(const char * word, int len)
michael@0 496 {
michael@0 497 char tword[MAXWORDUTF8LEN + 4];
michael@0 498
michael@0 499 /* make sure all conditions match */
michael@0 500 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
michael@0 501 (len >= numconds) && test_condition(word + len, word) &&
michael@0 502 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
michael@0 503 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
michael@0 504 /* we have a match so add suffix */
michael@0 505 strcpy(tword,word);
michael@0 506 if (appndl) {
michael@0 507 strcpy(tword + len - stripl, appnd);
michael@0 508 } else {
michael@0 509 *(tword + len - stripl) = '\0';
michael@0 510 }
michael@0 511 return mystrdup(tword);
michael@0 512 }
michael@0 513 return NULL;
michael@0 514 }
michael@0 515
michael@0 516 inline char * SfxEntry::nextchar(char * p) {
michael@0 517 if (p) {
michael@0 518 p++;
michael@0 519 if (opts & aeLONGCOND) {
michael@0 520 // jump to the 2nd part of the condition
michael@0 521 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
michael@0 522 // end of the MAXCONDLEN length condition
michael@0 523 } else if (p == c.conds + MAXCONDLEN) return NULL;
michael@0 524 return *p ? p : NULL;
michael@0 525 }
michael@0 526 return NULL;
michael@0 527 }
michael@0 528
// Test whether the end of a word satisfies this suffix's condition pattern.
// st  - one past the last character to examine (it is decremented before
//       the first comparison)
// beg - first character of the word; st is never read below it
// The pattern is walked forward from c.conds with nextchar() while the
// input is walked backward, so the condition is presumably stored in
// reversed order - TODO confirm against the code that fills c.conds.
// Pattern elements handled below: literal characters, '.' (any one
// character), bracket groups "[...]" and negated groups "[^...]".
// Returns 1 if the condition holds (always when numconds == 0), else 0.
michael@0 529 inline int SfxEntry::test_condition(const char * st, const char * beg)
michael@0 530 {
michael@0 531 const char * pos = NULL; // group with pos input position
michael@0 532 bool neg = false; // complementer
michael@0 533 bool ingroup = false; // character in the group
michael@0 534 if (numconds == 0) return 1;
michael@0 535 char * p = c.conds;
michael@0 536 st--;
// i counts the condition elements seen so far (compared with numconds)
michael@0 537 int i = 1;
michael@0 538 while (1) {
michael@0 539 switch (*p) {
// end of the pattern: everything matched
michael@0 540 case '\0': return 1;
// group start: remember the input position the members are compared with
michael@0 541 case '[': { p = nextchar(p); pos = st; break; }
// '^' marks the group as negated
michael@0 542 case '^': { p = nextchar(p); neg = true; break; }
// group end: a plain group without a matching member fails
michael@0 543 case ']': { if (!neg && !ingroup) return 0;
michael@0 544 i++;
michael@0 545 // skip the next character
// (only when no group member consumed it; with aeUTF8 the continuation
// bytes are stepped over first)
michael@0 546 if (!ingroup) {
michael@0 547 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
michael@0 548 st--;
michael@0 549 }
// reset the group state for the next element
michael@0 550 pos = NULL;
michael@0 551 neg = false;
michael@0 552 ingroup = false;
michael@0 553 p = nextchar(p);
michael@0 554 if (st < beg && p) return 0; // word <= condition
michael@0 555 break;
michael@0 556 }
// outside a group '.' consumes one input character; inside a group
// (pos != NULL) control falls through to the default case and the dot is
// compared literally
michael@0 557 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
michael@0 558 p = nextchar(p);
michael@0 559 // skip the next character
michael@0 560 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
// input exhausted: success only if the pattern is exhausted as well
michael@0 561 if (st < beg) { // word <= condition
michael@0 562 if (p) return 0; else return 1;
michael@0 563 }
michael@0 564 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
michael@0 565 st--;
michael@0 566 if (st < beg) { // word <= condition
michael@0 567 if (p) return 0; else return 1;
michael@0 568 }
michael@0 569 }
michael@0 570 break;
michael@0 571 }
// literal character (or group member)
michael@0 572 default: {
michael@0 573 if (*st == *p) {
michael@0 574 p = nextchar(p);
// byte with the high bit set matched: compare the remaining bytes of the
// multibyte character, moving backward through the input
michael@0 575 if ((opts & aeUTF8) && (*st & 0x80)) {
michael@0 576 st--;
michael@0 577 while (p && (st >= beg)) {
michael@0 578 if (*p != *st) {
// mismatch: fatal outside a group, inside a group rewind the input to the
// group's start position
michael@0 579 if (!pos) return 0;
michael@0 580 st = pos;
michael@0 581 break;
michael@0 582 }
michael@0 583 // first byte of the UTF-8 multibyte character
michael@0 584 if ((*p & 0xc0) != 0x80) break;
michael@0 585 p = nextchar(p);
michael@0 586 st--;
michael@0 587 }
// a group member matched completely: a negated group fails, the last
// condition element succeeds at once, otherwise skip to the closing ']'
michael@0 588 if (pos && st != pos) {
michael@0 589 if (neg) return 0;
michael@0 590 else if (i == numconds) return 1;
michael@0 591 ingroup = true;
michael@0 592 while (p && *p != ']' && (p = nextchar(p)));
michael@0 593 st--;
michael@0 594 }
michael@0 595 if (p && *p != ']') p = nextchar(p);
michael@0 596 } else if (pos) {
// single-byte group member matched: same handling as above
michael@0 597 if (neg) return 0;
michael@0 598 else if (i == numconds) return 1;
michael@0 599 ingroup = true;
michael@0 600 while (p && *p != ']' && (p = nextchar(p)))
michael@0 601 ;
michael@0 602 // if (p && *p != ']') p = nextchar(p);
michael@0 603 st--;
michael@0 604 }
// plain literal outside a group: one element done, step back in the input
michael@0 605 if (!pos) {
michael@0 606 i++;
michael@0 607 st--;
michael@0 608 }
michael@0 609 if (st < beg && p && *p != ']') return 0; // word <= condition
michael@0 610 } else if (pos) { // group
// this group member did not match, try the next one
michael@0 611 p = nextchar(p);
michael@0 612 } else return 0;
michael@0 613 }
michael@0 614 }
// nextchar() returned NULL: the pattern is exhausted, so it matched
michael@0 615 if (!p) return 1;
michael@0 616 }
michael@0 617 }
michael@0 618
michael@0 619 // see if this suffix is present in the word
michael@0 620 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
michael@0 621 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
michael@0 622 const FLAG badflag)
michael@0 623 {
michael@0 624 int tmpl; // length of tmpword
michael@0 625 struct hentry * he; // hash entry pointer
michael@0 626 unsigned char * cp;
michael@0 627 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 628 PfxEntry* ep = ppfx;
michael@0 629
michael@0 630 // if this suffix is being cross checked with a prefix
michael@0 631 // but it does not support cross products skip it
michael@0 632
michael@0 633 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
michael@0 634 return NULL;
michael@0 635
michael@0 636 // upon entry suffix is 0 length or already matches the end of the word.
michael@0 637 // So if the remaining root word has positive length
michael@0 638 // and if there are enough chars in root word and added back strip chars
michael@0 639 // to meet the number of characters conditions, then test it
michael@0 640
michael@0 641 tmpl = len - appndl;
michael@0 642 // the second condition is not enough for UTF-8 strings
michael@0 643 // it checked in test_condition()
michael@0 644
michael@0 645 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 646 (tmpl + stripl >= numconds)) {
michael@0 647
michael@0 648 // generate new root word by removing suffix and adding
michael@0 649 // back any characters that would have been stripped or
michael@0 650 // or null terminating the shorter string
michael@0 651
michael@0 652 strcpy (tmpword, word);
michael@0 653 cp = (unsigned char *)(tmpword + tmpl);
michael@0 654 if (stripl) {
michael@0 655 strcpy ((char *)cp, strip);
michael@0 656 tmpl += stripl;
michael@0 657 cp = (unsigned char *)(tmpword + tmpl);
michael@0 658 } else *cp = '\0';
michael@0 659
michael@0 660 // now make sure all of the conditions on characters
michael@0 661 // are met. Please see the appendix at the end of
michael@0 662 // this file for more info on exactly what is being
michael@0 663 // tested
michael@0 664
michael@0 665 // if all conditions are met then check if resulting
michael@0 666 // root word in the dictionary
michael@0 667
michael@0 668 if (test_condition((char *) cp, (char *) tmpword)) {
michael@0 669
michael@0 670 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
michael@0 671 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
michael@0 672 #endif
michael@0 673 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
michael@0 674 do {
michael@0 675 // check conditional suffix (enabled by prefix)
michael@0 676 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
michael@0 677 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
michael@0 678 (((optflags & aeXPRODUCT) == 0) ||
michael@0 679 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
michael@0 680 // enabled by prefix
michael@0 681 ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
michael@0 682 ) &&
michael@0 683 // handle cont. class
michael@0 684 ((!cclass) ||
michael@0 685 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
michael@0 686 ) &&
michael@0 687 // check only in compound homonyms (bad flags)
michael@0 688 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
michael@0 689 ) &&
michael@0 690 // handle required flag
michael@0 691 ((!needflag) ||
michael@0 692 (TESTAFF(he->astr, needflag, he->alen) ||
michael@0 693 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
michael@0 694 )
michael@0 695 ) return he;
michael@0 696 he = he->next_homonym; // check homonyms
michael@0 697 } while (he);
michael@0 698
michael@0 699 // obsolote stemming code (used only by the
michael@0 700 // experimental SuffixMgr:suggest_pos_stems)
michael@0 701 // store resulting root in wlst
michael@0 702 } else if (wlst && (*ns < maxSug)) {
michael@0 703 int cwrd = 1;
michael@0 704 for (int k=0; k < *ns; k++)
michael@0 705 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
michael@0 706 if (cwrd) {
michael@0 707 wlst[*ns] = mystrdup(tmpword);
michael@0 708 if (wlst[*ns] == NULL) {
michael@0 709 for (int j=0; j<*ns; j++) free(wlst[j]);
michael@0 710 *ns = -1;
michael@0 711 return NULL;
michael@0 712 }
michael@0 713 (*ns)++;
michael@0 714 }
michael@0 715 }
michael@0 716 }
michael@0 717 }
michael@0 718 return NULL;
michael@0 719 }
michael@0 720
michael@0 721 // see if two-level suffix is present in the word
michael@0 722 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
michael@0 723 PfxEntry* ppfx, const FLAG needflag)
michael@0 724 {
michael@0 725 int tmpl; // length of tmpword
michael@0 726 struct hentry * he; // hash entry pointer
michael@0 727 unsigned char * cp;
michael@0 728 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 729 PfxEntry* ep = ppfx;
michael@0 730
michael@0 731
michael@0 732 // if this suffix is being cross checked with a prefix
michael@0 733 // but it does not support cross products skip it
michael@0 734
michael@0 735 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
michael@0 736 return NULL;
michael@0 737
michael@0 738 // upon entry suffix is 0 length or already matches the end of the word.
michael@0 739 // So if the remaining root word has positive length
michael@0 740 // and if there are enough chars in root word and added back strip chars
michael@0 741 // to meet the number of characters conditions, then test it
michael@0 742
michael@0 743 tmpl = len - appndl;
michael@0 744
michael@0 745 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 746 (tmpl + stripl >= numconds)) {
michael@0 747
michael@0 748 // generate new root word by removing suffix and adding
michael@0 749 // back any characters that would have been stripped or
michael@0 750 // or null terminating the shorter string
michael@0 751
michael@0 752 strcpy (tmpword, word);
michael@0 753 cp = (unsigned char *)(tmpword + tmpl);
michael@0 754 if (stripl) {
michael@0 755 strcpy ((char *)cp, strip);
michael@0 756 tmpl += stripl;
michael@0 757 cp = (unsigned char *)(tmpword + tmpl);
michael@0 758 } else *cp = '\0';
michael@0 759
michael@0 760 // now make sure all of the conditions on characters
michael@0 761 // are met. Please see the appendix at the end of
michael@0 762 // this file for more info on exactly what is being
michael@0 763 // tested
michael@0 764
michael@0 765 // if all conditions are met then recall suffix_check
michael@0 766
michael@0 767 if (test_condition((char *) cp, (char *) tmpword)) {
michael@0 768 if (ppfx) {
michael@0 769 // handle conditional suffix
michael@0 770 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
michael@0 771 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0 772 else
michael@0 773 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0 774 } else {
michael@0 775 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
michael@0 776 }
michael@0 777 if (he) return he;
michael@0 778 }
michael@0 779 }
michael@0 780 return NULL;
michael@0 781 }
michael@0 782
michael@0 783 // see if two-level suffix is present in the word
michael@0 784 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
michael@0 785 PfxEntry* ppfx, const FLAG needflag)
michael@0 786 {
michael@0 787 int tmpl; // length of tmpword
michael@0 788 unsigned char * cp;
michael@0 789 char tmpword[MAXWORDUTF8LEN + 4];
michael@0 790 PfxEntry* ep = ppfx;
michael@0 791 char * st;
michael@0 792
michael@0 793 char result[MAXLNLEN];
michael@0 794
michael@0 795 *result = '\0';
michael@0 796
michael@0 797 // if this suffix is being cross checked with a prefix
michael@0 798 // but it does not support cross products skip it
michael@0 799
michael@0 800 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
michael@0 801 return NULL;
michael@0 802
michael@0 803 // upon entry suffix is 0 length or already matches the end of the word.
michael@0 804 // So if the remaining root word has positive length
michael@0 805 // and if there are enough chars in root word and added back strip chars
michael@0 806 // to meet the number of characters conditions, then test it
michael@0 807
michael@0 808 tmpl = len - appndl;
michael@0 809
michael@0 810 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
michael@0 811 (tmpl + stripl >= numconds)) {
michael@0 812
michael@0 813 // generate new root word by removing suffix and adding
michael@0 814 // back any characters that would have been stripped or
michael@0 815 // or null terminating the shorter string
michael@0 816
michael@0 817 strcpy (tmpword, word);
michael@0 818 cp = (unsigned char *)(tmpword + tmpl);
michael@0 819 if (stripl) {
michael@0 820 strcpy ((char *)cp, strip);
michael@0 821 tmpl += stripl;
michael@0 822 cp = (unsigned char *)(tmpword + tmpl);
michael@0 823 } else *cp = '\0';
michael@0 824
michael@0 825 // now make sure all of the conditions on characters
michael@0 826 // are met. Please see the appendix at the end of
michael@0 827 // this file for more info on exactly what is being
michael@0 828 // tested
michael@0 829
michael@0 830 // if all conditions are met then recall suffix_check
michael@0 831
michael@0 832 if (test_condition((char *) cp, (char *) tmpword)) {
michael@0 833 if (ppfx) {
michael@0 834 // handle conditional suffix
michael@0 835 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
michael@0 836 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
michael@0 837 if (st) {
michael@0 838 if (ppfx->getMorph()) {
michael@0 839 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
michael@0 840 mystrcat(result, " ", MAXLNLEN);
michael@0 841 }
michael@0 842 mystrcat(result,st, MAXLNLEN);
michael@0 843 free(st);
michael@0 844 mychomp(result);
michael@0 845 }
michael@0 846 } else {
michael@0 847 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
michael@0 848 if (st) {
michael@0 849 mystrcat(result, st, MAXLNLEN);
michael@0 850 free(st);
michael@0 851 mychomp(result);
michael@0 852 }
michael@0 853 }
michael@0 854 } else {
michael@0 855 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
michael@0 856 if (st) {
michael@0 857 mystrcat(result, st, MAXLNLEN);
michael@0 858 free(st);
michael@0 859 mychomp(result);
michael@0 860 }
michael@0 861 }
michael@0 862 if (*result) return mystrdup(result);
michael@0 863 }
michael@0 864 }
michael@0 865 return NULL;
michael@0 866 }
michael@0 867
michael@0 868 // get next homonym with same affix
michael@0 869 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
michael@0 870 const FLAG cclass, const FLAG needflag)
michael@0 871 {
michael@0 872 PfxEntry* ep = ppfx;
michael@0 873 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
michael@0 874
michael@0 875 while (he->next_homonym) {
michael@0 876 he = he->next_homonym;
michael@0 877 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
michael@0 878 ((optflags & aeXPRODUCT) == 0 ||
michael@0 879 TESTAFF(he->astr, eFlag, he->alen) ||
michael@0 880 // handle conditional suffix
michael@0 881 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
michael@0 882 ) &&
michael@0 883 // handle cont. class
michael@0 884 ((!cclass) ||
michael@0 885 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
michael@0 886 ) &&
michael@0 887 // handle required flag
michael@0 888 ((!needflag) ||
michael@0 889 (TESTAFF(he->astr, needflag, he->alen) ||
michael@0 890 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
michael@0 891 )
michael@0 892 ) return he;
michael@0 893 }
michael@0 894 return NULL;
michael@0 895 }
michael@0 896
michael@0 897
michael@0 898 #if 0
michael@0 899
michael@0 900 Appendix: Understanding Affix Code
michael@0 901
michael@0 902
michael@0 903 An affix is either a prefix or a suffix attached to root words to make
michael@0 904 other words.
michael@0 905
michael@0 906 Basically a Prefix or a Suffix is set of AffEntry objects
michael@0 907 which store information about the prefix or suffix along
michael@0 908 with supporting routines to check if a word has a particular
michael@0 909 prefix or suffix or a combination.
michael@0 910
michael@0 911 The structure affentry is defined as follows:
michael@0 912
michael@0 913 struct affentry
michael@0 914 {
michael@0 915 unsigned short aflag; // ID used to represent the affix
michael@0 916 char * strip; // string to strip before adding affix
michael@0 917 char * appnd; // the affix string to add
michael@0 918 unsigned char stripl; // length of the strip string
michael@0 919 unsigned char appndl; // length of the affix string
michael@0 920 char numconds; // the number of conditions that must be met
michael@0 921 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
michael@0 922 char conds[SETSIZE]; // array which encodes the conditions to be met
michael@0 923 };
michael@0 924
michael@0 925
michael@0 926 Here is a suffix borrowed from the en_US.aff file. This file
michael@0 927 is whitespace delimited.
michael@0 928
michael@0 929 SFX D Y 4
michael@0 930 SFX D 0 d e
michael@0 931 SFX D y ied [^aeiou]y
michael@0 932 SFX D 0 ed [^ey]
michael@0 933 SFX D 0 ed [aeiou]y
michael@0 934
michael@0 935 This information can be interpreted as follows:
michael@0 936
michael@0 937 The first line has 4 fields
michael@0 938
michael@0 939 Field
michael@0 940 -----
michael@0 941 1 SFX - indicates this is a suffix
michael@0 942 2 D - is the name of the character flag which represents this suffix
michael@0 943 3 Y - indicates it can be combined with prefixes (cross product)
michael@0 944 4 4 - indicates that sequence of 4 affentry structures are needed to
michael@0 945 properly store the affix information
michael@0 946
michael@0 947 The remaining lines describe the unique information for the 4 SfxEntry
michael@0 948 objects that make up this affix. Each line can be interpreted
michael@0 949 as follows: (note fields 1 and 2 are as a check against line 1 info)
michael@0 950
michael@0 951 Field
michael@0 952 -----
michael@0 953 1 SFX - indicates this is a suffix
michael@0 954 2 D - is the name of the character flag for this affix
michael@0 955 3 y - the string of chars to strip off before adding affix
michael@0 956 (a 0 here indicates the NULL string)
michael@0 957 4 ied - the string of affix characters to add
michael@0 958 5 [^aeiou]y - the conditions which must be met before the affix
michael@0 959 can be applied
michael@0 960
michael@0 961 Field 5 is interesting. Since this is a suffix, field 5 tells us that
michael@0 962 there are 2 conditions that must be met. The first condition is that
michael@0 963 the next to the last character in the word must *NOT* be any of the
michael@0 964 following "a", "e", "i", "o" or "u". The second condition is that
michael@0 965 the last character of the word must end in "y".
michael@0 966
michael@0 967 So how can we encode this information concisely and be able to
michael@0 968 test for both conditions in a fast manner? The answer is found
michael@0 969 by studying the wonderful ispell code of Geoff Kuenning, et al.
michael@0 970 (now available under a normal BSD license).
michael@0 971
michael@0 972 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
michael@0 973 using a character (cast to an unsigned char) of a string, we have 8 bits
michael@0 974 of information we can store about that character. Specifically we
michael@0 975 could use each bit to say if that character is allowed in any of the
michael@0 976 last (or first for prefixes) 8 characters of the word.
michael@0 977
michael@0 978 Basically, each character at one end of the word (up to the number
michael@0 979 of conditions) is used to index into the conds array and the resulting
michael@0 980 value found there says whether that character is valid for a
michael@0 981 specific character position in the word.
michael@0 982
michael@0 983 For prefixes, it does this by setting bit 0 if that char is valid
michael@0 984 in the first position, bit 1 if valid in the second position, and so on.
michael@0 985
michael@0 986 If a bit is not set, then that char is not valid for that position in the
michael@0 987 word.
michael@0 988
michael@0 989 If working with suffixes bit 0 is used for the character closest
michael@0 990 to the front, bit 1 for the next character towards the end, ...,
michael@0 991 with bit numconds-1 representing the last char at the end of the string.
michael@0 992
michael@0 993 Note: since entries in the conds[] are 8 bits, only 8 conditions
michael@0 994 (read that only 8 character positions) can be examined at one
michael@0 995 end of a word (the beginning for prefixes and the end for suffixes).
michael@0 996
michael@0 997 So to make this clearer, let's encode the conds array values for the
michael@0 998 first two affentries for the suffix D described earlier.
michael@0 999
michael@0 1000
michael@0 1001 For the first affentry:
michael@0 1002 numconds = 1 (only examine the last character)
michael@0 1003
michael@0 1004 conds['e'] = (1 << 0) (the word must end in an E)
michael@0 1005 all others are all 0
michael@0 1006
michael@0 1007 For the second affentry:
michael@0 1008 numconds = 2 (only examine the last two characters)
michael@0 1009
michael@0 1010 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
michael@0 1011 where X is all characters *but* a, e, i, o, or u
michael@0 1012
michael@0 1013
michael@0 1014 conds['y'] = (1 << 1) (the last char must be a y)
michael@0 1015 all other bits for all other entries in the conds array are zero
michael@0 1016
michael@0 1017
michael@0 1018 #endif
michael@0 1019

mercurial