1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/hunspell/src/affentry.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1019 @@ 1.4 +/******* BEGIN LICENSE BLOCK ******* 1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 1.6 + * 1.7 + * The contents of this file are subject to the Mozilla Public License Version 1.8 + * 1.1 (the "License"); you may not use this file except in compliance with 1.9 + * the License. You may obtain a copy of the License at 1.10 + * http://www.mozilla.org/MPL/ 1.11 + * 1.12 + * Software distributed under the License is distributed on an "AS IS" basis, 1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 1.14 + * for the specific language governing rights and limitations under the 1.15 + * License. 1.16 + * 1.17 + * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) 1.18 + * and László Németh (Hunspell). Portions created by the Initial Developers 1.19 + * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. 
1.20 + * 1.21 + * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) 1.22 + * David Einstein (deinst@world.std.com) 1.23 + * László Németh (nemethl@gyorsposta.hu) 1.24 + * Caolan McNamara (caolanm@redhat.com) 1.25 + * Davide Prina 1.26 + * Giuseppe Modugno 1.27 + * Gianluca Turconi 1.28 + * Simon Brouwer 1.29 + * Noll Janos 1.30 + * Biro Arpad 1.31 + * Goldman Eleonora 1.32 + * Sarlos Tamas 1.33 + * Bencsath Boldizsar 1.34 + * Halacsy Peter 1.35 + * Dvornik Laszlo 1.36 + * Gefferth Andras 1.37 + * Nagy Viktor 1.38 + * Varga Daniel 1.39 + * Chris Halls 1.40 + * Rene Engelhard 1.41 + * Bram Moolenaar 1.42 + * Dafydd Jones 1.43 + * Harri Pitkanen 1.44 + * Andras Timar 1.45 + * Tor Lillqvist 1.46 + * 1.47 + * Alternatively, the contents of this file may be used under the terms of 1.48 + * either the GNU General Public License Version 2 or later (the "GPL"), or 1.49 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 1.50 + * in which case the provisions of the GPL or the LGPL are applicable instead 1.51 + * of those above. If you wish to allow use of your version of this file only 1.52 + * under the terms of either the GPL or the LGPL, and not to allow others to 1.53 + * use your version of this file under the terms of the MPL, indicate your 1.54 + * decision by deleting the provisions above and replace them with the notice 1.55 + * and other provisions required by the GPL or the LGPL. If you do not delete 1.56 + * the provisions above, a recipient may use your version of this file under 1.57 + * the terms of any one of the MPL, the GPL or the LGPL. 
1.58 + * 1.59 + ******* END LICENSE BLOCK *******/ 1.60 + 1.61 +#include <stdlib.h> 1.62 +#include <string.h> 1.63 +#include <stdio.h> 1.64 +#include <ctype.h> 1.65 + 1.66 +#include "affentry.hxx" 1.67 +#include "csutil.hxx" 1.68 + 1.69 +PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) 1.70 +{ 1.71 + // register affix manager 1.72 + pmyMgr = pmgr; 1.73 + 1.74 + // set up its initial values 1.75 + 1.76 + aflag = dp->aflag; // flag 1.77 + strip = dp->strip; // string to strip 1.78 + appnd = dp->appnd; // string to append 1.79 + stripl = dp->stripl; // length of strip string 1.80 + appndl = dp->appndl; // length of append string 1.81 + numconds = dp->numconds; // length of the condition 1.82 + opts = dp->opts; // cross product flag 1.83 + // then copy over all of the conditions 1.84 + if (opts & aeLONGCOND) { 1.85 + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); 1.86 + c.l.conds2 = dp->c.l.conds2; 1.87 + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); 1.88 + next = NULL; 1.89 + nextne = NULL; 1.90 + nexteq = NULL; 1.91 + morphcode = dp->morphcode; 1.92 + contclass = dp->contclass; 1.93 + contclasslen = dp->contclasslen; 1.94 +} 1.95 + 1.96 + 1.97 +PfxEntry::~PfxEntry() 1.98 +{ 1.99 + aflag = 0; 1.100 + if (appnd) free(appnd); 1.101 + if (strip) free(strip); 1.102 + pmyMgr = NULL; 1.103 + appnd = NULL; 1.104 + strip = NULL; 1.105 + if (opts & aeLONGCOND) free(c.l.conds2); 1.106 + if (morphcode && !(opts & aeALIASM)) free(morphcode); 1.107 + if (contclass && !(opts & aeALIASF)) free(contclass); 1.108 +} 1.109 + 1.110 +// add prefix to this word assuming conditions hold 1.111 +char * PfxEntry::add(const char * word, int len) 1.112 +{ 1.113 + char tword[MAXWORDUTF8LEN + 4]; 1.114 + 1.115 + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && 1.116 + (len >= numconds) && test_condition(word) && 1.117 + (!stripl || (strncmp(word, strip, stripl) == 0)) && 1.118 + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { 1.119 + /* we have a match so add prefix */ 1.120 
+ char * pp = tword; 1.121 + if (appndl) { 1.122 + strcpy(tword,appnd); 1.123 + pp += appndl; 1.124 + } 1.125 + strcpy(pp, (word + stripl)); 1.126 + return mystrdup(tword); 1.127 + } 1.128 + return NULL; 1.129 +} 1.130 + 1.131 +inline char * PfxEntry::nextchar(char * p) { 1.132 + if (p) { 1.133 + p++; 1.134 + if (opts & aeLONGCOND) { 1.135 + // jump to the 2nd part of the condition 1.136 + if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; 1.137 + // end of the MAXCONDLEN length condition 1.138 + } else if (p == c.conds + MAXCONDLEN) return NULL; 1.139 + return *p ? p : NULL; 1.140 + } 1.141 + return NULL; 1.142 +} 1.143 + 1.144 +inline int PfxEntry::test_condition(const char * st) 1.145 +{ 1.146 + const char * pos = NULL; // group with pos input position 1.147 + bool neg = false; // complementer 1.148 + bool ingroup = false; // character in the group 1.149 + if (numconds == 0) return 1; 1.150 + char * p = c.conds; 1.151 + while (1) { 1.152 + switch (*p) { 1.153 + case '\0': return 1; 1.154 + case '[': { 1.155 + neg = false; 1.156 + ingroup = false; 1.157 + p = nextchar(p); 1.158 + pos = st; break; 1.159 + } 1.160 + case '^': { p = nextchar(p); neg = true; break; } 1.161 + case ']': { 1.162 + if ((neg && ingroup) || (!neg && !ingroup)) return 0; 1.163 + pos = NULL; 1.164 + p = nextchar(p); 1.165 + // skip the next character 1.166 + if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); 1.167 + if (*st == '\0' && p) return 0; // word <= condition 1.168 + break; 1.169 + } 1.170 + case '.': if (!pos) { // dots are not metacharacters in groups: [.] 
1.171 + p = nextchar(p); 1.172 + // skip the next character 1.173 + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); 1.174 + if (*st == '\0' && p) return 0; // word <= condition 1.175 + break; 1.176 + } 1.177 + default: { 1.178 + if (*st == *p) { 1.179 + st++; 1.180 + p = nextchar(p); 1.181 + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte 1.182 + while (p && (*p & 0xc0) == 0x80) { // character 1.183 + if (*p != *st) { 1.184 + if (!pos) return 0; 1.185 + st = pos; 1.186 + break; 1.187 + } 1.188 + p = nextchar(p); 1.189 + st++; 1.190 + } 1.191 + if (pos && st != pos) { 1.192 + ingroup = true; 1.193 + while (p && *p != ']' && (p = nextchar(p))); 1.194 + } 1.195 + } else if (pos) { 1.196 + ingroup = true; 1.197 + while (p && *p != ']' && (p = nextchar(p))); 1.198 + } 1.199 + } else if (pos) { // group 1.200 + p = nextchar(p); 1.201 + } else return 0; 1.202 + } 1.203 + } 1.204 + if (!p) return 1; 1.205 + } 1.206 +} 1.207 + 1.208 +// check if this prefix entry matches 1.209 +struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) 1.210 +{ 1.211 + int tmpl; // length of tmpword 1.212 + struct hentry * he; // hash entry of root word or NULL 1.213 + char tmpword[MAXWORDUTF8LEN + 4]; 1.214 + 1.215 + // on entry prefix is 0 length or already matches the beginning of the word. 
1.216 + // So if the remaining root word has positive length 1.217 + // and if there are enough chars in root word and added back strip chars 1.218 + // to meet the number of characters conditions, then test it 1.219 + 1.220 + tmpl = len - appndl; 1.221 + 1.222 + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { 1.223 + 1.224 + // generate new root word by removing prefix and adding 1.225 + // back any characters that would have been stripped 1.226 + 1.227 + if (stripl) strcpy (tmpword, strip); 1.228 + strcpy ((tmpword + stripl), (word + appndl)); 1.229 + 1.230 + // now make sure all of the conditions on characters 1.231 + // are met. Please see the appendix at the end of 1.232 + // this file for more info on exactly what is being 1.233 + // tested 1.234 + 1.235 + // if all conditions are met then check if resulting 1.236 + // root word in the dictionary 1.237 + 1.238 + if (test_condition(tmpword)) { 1.239 + tmpl += stripl; 1.240 + if ((he = pmyMgr->lookup(tmpword)) != NULL) { 1.241 + do { 1.242 + if (TESTAFF(he->astr, aflag, he->alen) && 1.243 + // forbid single prefixes with needaffix flag 1.244 + ! 
TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && 1.245 + // needflag 1.246 + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || 1.247 + (contclass && TESTAFF(contclass, needflag, contclasslen)))) 1.248 + return he; 1.249 + he = he->next_homonym; // check homonyms 1.250 + } while (he); 1.251 + } 1.252 + 1.253 + // prefix matched but no root word was found 1.254 + // if aeXPRODUCT is allowed, try again but now 1.255 + // ross checked combined with a suffix 1.256 + 1.257 + //if ((opts & aeXPRODUCT) && in_compound) { 1.258 + if ((opts & aeXPRODUCT)) { 1.259 + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL, 1.260 + 0, NULL, FLAG_NULL, needflag, in_compound); 1.261 + if (he) return he; 1.262 + } 1.263 + } 1.264 + } 1.265 + return NULL; 1.266 +} 1.267 + 1.268 +// check if this prefix entry matches 1.269 +struct hentry * PfxEntry::check_twosfx(const char * word, int len, 1.270 + char in_compound, const FLAG needflag) 1.271 +{ 1.272 + int tmpl; // length of tmpword 1.273 + struct hentry * he; // hash entry of root word or NULL 1.274 + char tmpword[MAXWORDUTF8LEN + 4]; 1.275 + 1.276 + // on entry prefix is 0 length or already matches the beginning of the word. 1.277 + // So if the remaining root word has positive length 1.278 + // and if there are enough chars in root word and added back strip chars 1.279 + // to meet the number of characters conditions, then test it 1.280 + 1.281 + tmpl = len - appndl; 1.282 + 1.283 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.284 + (tmpl + stripl >= numconds)) { 1.285 + 1.286 + // generate new root word by removing prefix and adding 1.287 + // back any characters that would have been stripped 1.288 + 1.289 + if (stripl) strcpy (tmpword, strip); 1.290 + strcpy ((tmpword + stripl), (word + appndl)); 1.291 + 1.292 + // now make sure all of the conditions on characters 1.293 + // are met. 
Please see the appendix at the end of 1.294 + // this file for more info on exactly what is being 1.295 + // tested 1.296 + 1.297 + // if all conditions are met then check if resulting 1.298 + // root word in the dictionary 1.299 + 1.300 + if (test_condition(tmpword)) { 1.301 + tmpl += stripl; 1.302 + 1.303 + // prefix matched but no root word was found 1.304 + // if aeXPRODUCT is allowed, try again but now 1.305 + // cross checked combined with a suffix 1.306 + 1.307 + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 1.308 + he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag); 1.309 + if (he) return he; 1.310 + } 1.311 + } 1.312 + } 1.313 + return NULL; 1.314 +} 1.315 + 1.316 +// check if this prefix entry matches 1.317 +char * PfxEntry::check_twosfx_morph(const char * word, int len, 1.318 + char in_compound, const FLAG needflag) 1.319 +{ 1.320 + int tmpl; // length of tmpword 1.321 + char tmpword[MAXWORDUTF8LEN + 4]; 1.322 + 1.323 + // on entry prefix is 0 length or already matches the beginning of the word. 1.324 + // So if the remaining root word has positive length 1.325 + // and if there are enough chars in root word and added back strip chars 1.326 + // to meet the number of characters conditions, then test it 1.327 + 1.328 + tmpl = len - appndl; 1.329 + 1.330 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.331 + (tmpl + stripl >= numconds)) { 1.332 + 1.333 + // generate new root word by removing prefix and adding 1.334 + // back any characters that would have been stripped 1.335 + 1.336 + if (stripl) strcpy (tmpword, strip); 1.337 + strcpy ((tmpword + stripl), (word + appndl)); 1.338 + 1.339 + // now make sure all of the conditions on characters 1.340 + // are met. 
Please see the appendix at the end of 1.341 + // this file for more info on exactly what is being 1.342 + // tested 1.343 + 1.344 + // if all conditions are met then check if resulting 1.345 + // root word in the dictionary 1.346 + 1.347 + if (test_condition(tmpword)) { 1.348 + tmpl += stripl; 1.349 + 1.350 + // prefix matched but no root word was found 1.351 + // if aeXPRODUCT is allowed, try again but now 1.352 + // ross checked combined with a suffix 1.353 + 1.354 + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 1.355 + return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, 1.356 + aeXPRODUCT, this, needflag); 1.357 + } 1.358 + } 1.359 + } 1.360 + return NULL; 1.361 +} 1.362 + 1.363 +// check if this prefix entry matches 1.364 +char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) 1.365 +{ 1.366 + int tmpl; // length of tmpword 1.367 + struct hentry * he; // hash entry of root word or NULL 1.368 + char tmpword[MAXWORDUTF8LEN + 4]; 1.369 + char result[MAXLNLEN]; 1.370 + char * st; 1.371 + 1.372 + *result = '\0'; 1.373 + 1.374 + // on entry prefix is 0 length or already matches the beginning of the word. 1.375 + // So if the remaining root word has positive length 1.376 + // and if there are enough chars in root word and added back strip chars 1.377 + // to meet the number of characters conditions, then test it 1.378 + 1.379 + tmpl = len - appndl; 1.380 + 1.381 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.382 + (tmpl + stripl >= numconds)) { 1.383 + 1.384 + // generate new root word by removing prefix and adding 1.385 + // back any characters that would have been stripped 1.386 + 1.387 + if (stripl) strcpy (tmpword, strip); 1.388 + strcpy ((tmpword + stripl), (word + appndl)); 1.389 + 1.390 + // now make sure all of the conditions on characters 1.391 + // are met. 
Please see the appendix at the end of 1.392 + // this file for more info on exactly what is being 1.393 + // tested 1.394 + 1.395 + // if all conditions are met then check if resulting 1.396 + // root word in the dictionary 1.397 + 1.398 + if (test_condition(tmpword)) { 1.399 + tmpl += stripl; 1.400 + if ((he = pmyMgr->lookup(tmpword)) != NULL) { 1.401 + do { 1.402 + if (TESTAFF(he->astr, aflag, he->alen) && 1.403 + // forbid single prefixes with needaffix flag 1.404 + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && 1.405 + // needflag 1.406 + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || 1.407 + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { 1.408 + if (morphcode) { 1.409 + mystrcat(result, " ", MAXLNLEN); 1.410 + mystrcat(result, morphcode, MAXLNLEN); 1.411 + } else mystrcat(result,getKey(), MAXLNLEN); 1.412 + if (!HENTRY_FIND(he, MORPH_STEM)) { 1.413 + mystrcat(result, " ", MAXLNLEN); 1.414 + mystrcat(result, MORPH_STEM, MAXLNLEN); 1.415 + mystrcat(result, HENTRY_WORD(he), MAXLNLEN); 1.416 + } 1.417 + // store the pointer of the hash entry 1.418 + if (HENTRY_DATA(he)) { 1.419 + mystrcat(result, " ", MAXLNLEN); 1.420 + mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); 1.421 + } else { 1.422 + // return with debug information 1.423 + char * flag = pmyMgr->encode_flag(getFlag()); 1.424 + mystrcat(result, " ", MAXLNLEN); 1.425 + mystrcat(result, MORPH_FLAG, MAXLNLEN); 1.426 + mystrcat(result, flag, MAXLNLEN); 1.427 + free(flag); 1.428 + } 1.429 + mystrcat(result, "\n", MAXLNLEN); 1.430 + } 1.431 + he = he->next_homonym; 1.432 + } while (he); 1.433 + } 1.434 + 1.435 + // prefix matched but no root word was found 1.436 + // if aeXPRODUCT is allowed, try again but now 1.437 + // ross checked combined with a suffix 1.438 + 1.439 + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { 1.440 + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this, 1.441 + FLAG_NULL, needflag); 1.442 + if (st) { 1.443 + mystrcat(result, 
st, MAXLNLEN); 1.444 + free(st); 1.445 + } 1.446 + } 1.447 + } 1.448 + } 1.449 + 1.450 + if (*result) return mystrdup(result); 1.451 + return NULL; 1.452 +} 1.453 + 1.454 +SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) 1.455 +{ 1.456 + // register affix manager 1.457 + pmyMgr = pmgr; 1.458 + 1.459 + // set up its initial values 1.460 + aflag = dp->aflag; // char flag 1.461 + strip = dp->strip; // string to strip 1.462 + appnd = dp->appnd; // string to append 1.463 + stripl = dp->stripl; // length of strip string 1.464 + appndl = dp->appndl; // length of append string 1.465 + numconds = dp->numconds; // length of the condition 1.466 + opts = dp->opts; // cross product flag 1.467 + 1.468 + // then copy over all of the conditions 1.469 + if (opts & aeLONGCOND) { 1.470 + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); 1.471 + c.l.conds2 = dp->c.l.conds2; 1.472 + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); 1.473 + next = NULL; 1.474 + nextne = NULL; 1.475 + nexteq = NULL; 1.476 + rappnd = myrevstrdup(appnd); 1.477 + morphcode = dp->morphcode; 1.478 + contclass = dp->contclass; 1.479 + contclasslen = dp->contclasslen; 1.480 +} 1.481 + 1.482 + 1.483 +SfxEntry::~SfxEntry() 1.484 +{ 1.485 + aflag = 0; 1.486 + if (appnd) free(appnd); 1.487 + if (rappnd) free(rappnd); 1.488 + if (strip) free(strip); 1.489 + pmyMgr = NULL; 1.490 + appnd = NULL; 1.491 + strip = NULL; 1.492 + if (opts & aeLONGCOND) free(c.l.conds2); 1.493 + if (morphcode && !(opts & aeALIASM)) free(morphcode); 1.494 + if (contclass && !(opts & aeALIASF)) free(contclass); 1.495 +} 1.496 + 1.497 +// add suffix to this word assuming conditions hold 1.498 +char * SfxEntry::add(const char * word, int len) 1.499 +{ 1.500 + char tword[MAXWORDUTF8LEN + 4]; 1.501 + 1.502 + /* make sure all conditions match */ 1.503 + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && 1.504 + (len >= numconds) && test_condition(word + len, word) && 1.505 + (!stripl || (strcmp(word + len - stripl, strip) == 0)) && 
1.506 + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { 1.507 + /* we have a match so add suffix */ 1.508 + strcpy(tword,word); 1.509 + if (appndl) { 1.510 + strcpy(tword + len - stripl, appnd); 1.511 + } else { 1.512 + *(tword + len - stripl) = '\0'; 1.513 + } 1.514 + return mystrdup(tword); 1.515 + } 1.516 + return NULL; 1.517 +} 1.518 + 1.519 +inline char * SfxEntry::nextchar(char * p) { 1.520 + if (p) { 1.521 + p++; 1.522 + if (opts & aeLONGCOND) { 1.523 + // jump to the 2nd part of the condition 1.524 + if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; 1.525 + // end of the MAXCONDLEN length condition 1.526 + } else if (p == c.conds + MAXCONDLEN) return NULL; 1.527 + return *p ? p : NULL; 1.528 + } 1.529 + return NULL; 1.530 +} 1.531 + 1.532 +inline int SfxEntry::test_condition(const char * st, const char * beg) 1.533 +{ 1.534 + const char * pos = NULL; // group with pos input position 1.535 + bool neg = false; // complementer 1.536 + bool ingroup = false; // character in the group 1.537 + if (numconds == 0) return 1; 1.538 + char * p = c.conds; 1.539 + st--; 1.540 + int i = 1; 1.541 + while (1) { 1.542 + switch (*p) { 1.543 + case '\0': return 1; 1.544 + case '[': { p = nextchar(p); pos = st; break; } 1.545 + case '^': { p = nextchar(p); neg = true; break; } 1.546 + case ']': { if (!neg && !ingroup) return 0; 1.547 + i++; 1.548 + // skip the next character 1.549 + if (!ingroup) { 1.550 + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); 1.551 + st--; 1.552 + } 1.553 + pos = NULL; 1.554 + neg = false; 1.555 + ingroup = false; 1.556 + p = nextchar(p); 1.557 + if (st < beg && p) return 0; // word <= condition 1.558 + break; 1.559 + } 1.560 + case '.': if (!pos) { // dots are not metacharacters in groups: [.] 
1.561 + p = nextchar(p); 1.562 + // skip the next character 1.563 + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); 1.564 + if (st < beg) { // word <= condition 1.565 + if (p) return 0; else return 1; 1.566 + } 1.567 + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character 1.568 + st--; 1.569 + if (st < beg) { // word <= condition 1.570 + if (p) return 0; else return 1; 1.571 + } 1.572 + } 1.573 + break; 1.574 + } 1.575 + default: { 1.576 + if (*st == *p) { 1.577 + p = nextchar(p); 1.578 + if ((opts & aeUTF8) && (*st & 0x80)) { 1.579 + st--; 1.580 + while (p && (st >= beg)) { 1.581 + if (*p != *st) { 1.582 + if (!pos) return 0; 1.583 + st = pos; 1.584 + break; 1.585 + } 1.586 + // first byte of the UTF-8 multibyte character 1.587 + if ((*p & 0xc0) != 0x80) break; 1.588 + p = nextchar(p); 1.589 + st--; 1.590 + } 1.591 + if (pos && st != pos) { 1.592 + if (neg) return 0; 1.593 + else if (i == numconds) return 1; 1.594 + ingroup = true; 1.595 + while (p && *p != ']' && (p = nextchar(p))); 1.596 + st--; 1.597 + } 1.598 + if (p && *p != ']') p = nextchar(p); 1.599 + } else if (pos) { 1.600 + if (neg) return 0; 1.601 + else if (i == numconds) return 1; 1.602 + ingroup = true; 1.603 + while (p && *p != ']' && (p = nextchar(p))) 1.604 + ; 1.605 +// if (p && *p != ']') p = nextchar(p); 1.606 + st--; 1.607 + } 1.608 + if (!pos) { 1.609 + i++; 1.610 + st--; 1.611 + } 1.612 + if (st < beg && p && *p != ']') return 0; // word <= condition 1.613 + } else if (pos) { // group 1.614 + p = nextchar(p); 1.615 + } else return 0; 1.616 + } 1.617 + } 1.618 + if (!p) return 1; 1.619 + } 1.620 +} 1.621 + 1.622 +// see if this suffix is present in the word 1.623 +struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, 1.624 + PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, 1.625 + const FLAG badflag) 1.626 +{ 1.627 + int tmpl; // length of tmpword 1.628 + struct hentry * he; // hash 
entry pointer 1.629 + unsigned char * cp; 1.630 + char tmpword[MAXWORDUTF8LEN + 4]; 1.631 + PfxEntry* ep = ppfx; 1.632 + 1.633 + // if this suffix is being cross checked with a prefix 1.634 + // but it does not support cross products skip it 1.635 + 1.636 + if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) 1.637 + return NULL; 1.638 + 1.639 + // upon entry suffix is 0 length or already matches the end of the word. 1.640 + // So if the remaining root word has positive length 1.641 + // and if there are enough chars in root word and added back strip chars 1.642 + // to meet the number of characters conditions, then test it 1.643 + 1.644 + tmpl = len - appndl; 1.645 + // the second condition is not enough for UTF-8 strings 1.646 + // it checked in test_condition() 1.647 + 1.648 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.649 + (tmpl + stripl >= numconds)) { 1.650 + 1.651 + // generate new root word by removing suffix and adding 1.652 + // back any characters that would have been stripped or 1.653 + // or null terminating the shorter string 1.654 + 1.655 + strcpy (tmpword, word); 1.656 + cp = (unsigned char *)(tmpword + tmpl); 1.657 + if (stripl) { 1.658 + strcpy ((char *)cp, strip); 1.659 + tmpl += stripl; 1.660 + cp = (unsigned char *)(tmpword + tmpl); 1.661 + } else *cp = '\0'; 1.662 + 1.663 + // now make sure all of the conditions on characters 1.664 + // are met. 
Please see the appendix at the end of 1.665 + // this file for more info on exactly what is being 1.666 + // tested 1.667 + 1.668 + // if all conditions are met then check if resulting 1.669 + // root word in the dictionary 1.670 + 1.671 + if (test_condition((char *) cp, (char *) tmpword)) { 1.672 + 1.673 +#ifdef SZOSZABLYA_POSSIBLE_ROOTS 1.674 + fprintf(stdout,"%s %s %c\n", word, tmpword, aflag); 1.675 +#endif 1.676 + if ((he = pmyMgr->lookup(tmpword)) != NULL) { 1.677 + do { 1.678 + // check conditional suffix (enabled by prefix) 1.679 + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && 1.680 + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && 1.681 + (((optflags & aeXPRODUCT) == 0) || 1.682 + (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) || 1.683 + // enabled by prefix 1.684 + ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen))) 1.685 + ) && 1.686 + // handle cont. class 1.687 + ((!cclass) || 1.688 + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) 1.689 + ) && 1.690 + // check only in compound homonyms (bad flags) 1.691 + (!badflag || !TESTAFF(he->astr, badflag, he->alen) 1.692 + ) && 1.693 + // handle required flag 1.694 + ((!needflag) || 1.695 + (TESTAFF(he->astr, needflag, he->alen) || 1.696 + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) 1.697 + ) 1.698 + ) return he; 1.699 + he = he->next_homonym; // check homonyms 1.700 + } while (he); 1.701 + 1.702 + // obsolote stemming code (used only by the 1.703 + // experimental SuffixMgr:suggest_pos_stems) 1.704 + // store resulting root in wlst 1.705 + } else if (wlst && (*ns < maxSug)) { 1.706 + int cwrd = 1; 1.707 + for (int k=0; k < *ns; k++) 1.708 + if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; 1.709 + if (cwrd) { 1.710 + wlst[*ns] = mystrdup(tmpword); 1.711 + if (wlst[*ns] == NULL) { 1.712 + for (int j=0; j<*ns; j++) free(wlst[j]); 1.713 + *ns = -1; 1.714 + return NULL; 1.715 + } 1.716 + (*ns)++; 1.717 + } 1.718 + } 1.719 + } 1.720 + } 1.721 
+ return NULL; 1.722 +} 1.723 + 1.724 +// see if two-level suffix is present in the word 1.725 +struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, 1.726 + PfxEntry* ppfx, const FLAG needflag) 1.727 +{ 1.728 + int tmpl; // length of tmpword 1.729 + struct hentry * he; // hash entry pointer 1.730 + unsigned char * cp; 1.731 + char tmpword[MAXWORDUTF8LEN + 4]; 1.732 + PfxEntry* ep = ppfx; 1.733 + 1.734 + 1.735 + // if this suffix is being cross checked with a prefix 1.736 + // but it does not support cross products skip it 1.737 + 1.738 + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) 1.739 + return NULL; 1.740 + 1.741 + // upon entry suffix is 0 length or already matches the end of the word. 1.742 + // So if the remaining root word has positive length 1.743 + // and if there are enough chars in root word and added back strip chars 1.744 + // to meet the number of characters conditions, then test it 1.745 + 1.746 + tmpl = len - appndl; 1.747 + 1.748 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.749 + (tmpl + stripl >= numconds)) { 1.750 + 1.751 + // generate new root word by removing suffix and adding 1.752 + // back any characters that would have been stripped or 1.753 + // or null terminating the shorter string 1.754 + 1.755 + strcpy (tmpword, word); 1.756 + cp = (unsigned char *)(tmpword + tmpl); 1.757 + if (stripl) { 1.758 + strcpy ((char *)cp, strip); 1.759 + tmpl += stripl; 1.760 + cp = (unsigned char *)(tmpword + tmpl); 1.761 + } else *cp = '\0'; 1.762 + 1.763 + // now make sure all of the conditions on characters 1.764 + // are met. 
Please see the appendix at the end of 1.765 + // this file for more info on exactly what is being 1.766 + // tested 1.767 + 1.768 + // if all conditions are met then recall suffix_check 1.769 + 1.770 + if (test_condition((char *) cp, (char *) tmpword)) { 1.771 + if (ppfx) { 1.772 + // handle conditional suffix 1.773 + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) 1.774 + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); 1.775 + else 1.776 + he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); 1.777 + } else { 1.778 + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); 1.779 + } 1.780 + if (he) return he; 1.781 + } 1.782 + } 1.783 + return NULL; 1.784 +} 1.785 + 1.786 +// see if two-level suffix is present in the word 1.787 +char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, 1.788 + PfxEntry* ppfx, const FLAG needflag) 1.789 +{ 1.790 + int tmpl; // length of tmpword 1.791 + unsigned char * cp; 1.792 + char tmpword[MAXWORDUTF8LEN + 4]; 1.793 + PfxEntry* ep = ppfx; 1.794 + char * st; 1.795 + 1.796 + char result[MAXLNLEN]; 1.797 + 1.798 + *result = '\0'; 1.799 + 1.800 + // if this suffix is being cross checked with a prefix 1.801 + // but it does not support cross products skip it 1.802 + 1.803 + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) 1.804 + return NULL; 1.805 + 1.806 + // upon entry suffix is 0 length or already matches the end of the word. 
1.807 + // So if the remaining root word has positive length 1.808 + // and if there are enough chars in root word and added back strip chars 1.809 + // to meet the number of characters conditions, then test it 1.810 + 1.811 + tmpl = len - appndl; 1.812 + 1.813 + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && 1.814 + (tmpl + stripl >= numconds)) { 1.815 + 1.816 + // generate new root word by removing suffix and adding 1.817 + // back any characters that would have been stripped or 1.818 + // or null terminating the shorter string 1.819 + 1.820 + strcpy (tmpword, word); 1.821 + cp = (unsigned char *)(tmpword + tmpl); 1.822 + if (stripl) { 1.823 + strcpy ((char *)cp, strip); 1.824 + tmpl += stripl; 1.825 + cp = (unsigned char *)(tmpword + tmpl); 1.826 + } else *cp = '\0'; 1.827 + 1.828 + // now make sure all of the conditions on characters 1.829 + // are met. Please see the appendix at the end of 1.830 + // this file for more info on exactly what is being 1.831 + // tested 1.832 + 1.833 + // if all conditions are met then recall suffix_check 1.834 + 1.835 + if (test_condition((char *) cp, (char *) tmpword)) { 1.836 + if (ppfx) { 1.837 + // handle conditional suffix 1.838 + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { 1.839 + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); 1.840 + if (st) { 1.841 + if (ppfx->getMorph()) { 1.842 + mystrcat(result, ppfx->getMorph(), MAXLNLEN); 1.843 + mystrcat(result, " ", MAXLNLEN); 1.844 + } 1.845 + mystrcat(result,st, MAXLNLEN); 1.846 + free(st); 1.847 + mychomp(result); 1.848 + } 1.849 + } else { 1.850 + st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); 1.851 + if (st) { 1.852 + mystrcat(result, st, MAXLNLEN); 1.853 + free(st); 1.854 + mychomp(result); 1.855 + } 1.856 + } 1.857 + } else { 1.858 + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); 1.859 + if (st) { 1.860 + mystrcat(result, st, MAXLNLEN); 1.861 + 
free(st); 1.862 + mychomp(result); 1.863 + } 1.864 + } 1.865 + if (*result) return mystrdup(result); 1.866 + } 1.867 + } 1.868 + return NULL; 1.869 +} 1.870 + 1.871 +// get next homonym with same affix 1.872 +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx, 1.873 + const FLAG cclass, const FLAG needflag) 1.874 +{ 1.875 + PfxEntry* ep = ppfx; 1.876 + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; 1.877 + 1.878 + while (he->next_homonym) { 1.879 + he = he->next_homonym; 1.880 + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && 1.881 + ((optflags & aeXPRODUCT) == 0 || 1.882 + TESTAFF(he->astr, eFlag, he->alen) || 1.883 + // handle conditional suffix 1.884 + ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) 1.885 + ) && 1.886 + // handle cont. class 1.887 + ((!cclass) || 1.888 + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) 1.889 + ) && 1.890 + // handle required flag 1.891 + ((!needflag) || 1.892 + (TESTAFF(he->astr, needflag, he->alen) || 1.893 + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) 1.894 + ) 1.895 + ) return he; 1.896 + } 1.897 + return NULL; 1.898 +} 1.899 + 1.900 + 1.901 +#if 0 1.902 + 1.903 +Appendix: Understanding Affix Code 1.904 + 1.905 + 1.906 +An affix is either a prefix or a suffix attached to root words to make 1.907 +other words. 1.908 + 1.909 +Basically a Prefix or a Suffix is set of AffEntry objects 1.910 +which store information about the prefix or suffix along 1.911 +with supporting routines to check if a word has a particular 1.912 +prefix or suffix or a combination. 

The structure affentry is defined as follows:

struct affentry
{
   unsigned short aflag;    // ID used to represent the affix
   char * strip;            // string to strip before adding affix
   char * appnd;            // the affix string to add
   unsigned char stripl;    // length of the strip string
   unsigned char appndl;    // length of the affix string
   char numconds;           // the number of conditions that must be met
   char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
   char   conds[SETSIZE];   // array which encodes the conditions to be met
};


Here is a suffix borrowed from the en_US.aff file.  This file
is whitespace delimited.

SFX D Y 4
SFX D   0     d          e
SFX D   y     ied        [^aeiou]y
SFX D   0     ed         [^ey]
SFX D   0     ed         [aeiou]y

This information can be interpreted as follows:

The first line has 4 fields

Field
-----
1     SFX - indicates this is a suffix
2     D   - is the name of the character flag which represents this suffix
3     Y   - indicates it can be combined with prefixes (cross product)
4     4   - indicates that a sequence of 4 affentry structures is needed to
               properly store the affix information

The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix.  Each line can be interpreted
as follows: (note: fields 1 and 2 serve as a check against the line 1 info)

Field
-----
1     SFX         - indicates this is a suffix
2     D           - is the name of the character flag for this affix
3     y           - the string of chars to strip off before adding affix
                         (a 0 here indicates the NULL string)
4     ied         - the string of affix characters to add
5     [^aeiou]y   - the conditions which must be met before the affix
                    can be applied

Field 5 is interesting.
Since this is a suffix, field 5 tells us that
there are 2 conditions that must be met.  The first condition is that
the next to the last character in the word must *NOT* be any of the
following "a", "e", "i", "o" or "u".  The second condition is that
the last character of the word must be "y".

So how can we encode this information concisely and be able to
test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).

If we set up a conds array of 256 bytes indexed (0 to 255) and access it
using a character (cast to an unsigned char) of a string, we have 8 bits
of information we can store about that character.  Specifically we
could use each bit to say if that character is allowed in any of the
last (or first for prefixes) 8 characters of the word.

Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.

For prefixes, it does this by setting bit 0 if that char is valid
in the first position, bit 1 if valid in the second position, and so on.

If a bit is not set, then that char is not valid for that position in the
word.

If working with suffixes bit 0 is used for the character closest
to the front, bit 1 for the next character towards the end, ...,
with bit numconds-1 representing the last char at the end of the string.

Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).

So to make this clearer, let us encode the conds array values for the
first two affentries for the suffix D described earlier.


  For the first affentry:
     numconds = 1             (only examine the last character)

     conds['e'] =  (1 << 0)   (the word must end in an E)
     all other entries are 0

  For the second affentry:
     numconds = 2             (only examine the last two characters)

     conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
         where X is all characters *but* a, e, i, o, or u


     conds['y'] = (1 << 1)     (the last char must be a y)
     all other bits for all other entries in the conds array are zero


#endif
