Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
1 /******* BEGIN LICENSE BLOCK *******
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 * and László Németh (Hunspell). Portions created by the Initial Developers
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
17 *
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 * David Einstein (deinst@world.std.com)
20 * László Németh (nemethl@gyorsposta.hu)
21 * Caolan McNamara (caolanm@redhat.com)
22 * Davide Prina
23 * Giuseppe Modugno
24 * Gianluca Turconi
25 * Simon Brouwer
26 * Noll Janos
27 * Biro Arpad
28 * Goldman Eleonora
29 * Sarlos Tamas
30 * Bencsath Boldizsar
31 * Halacsy Peter
32 * Dvornik Laszlo
33 * Gefferth Andras
34 * Nagy Viktor
35 * Varga Daniel
36 * Chris Halls
37 * Rene Engelhard
38 * Bram Moolenaar
39 * Dafydd Jones
40 * Harri Pitkanen
41 * Andras Timar
42 * Tor Lillqvist
43 *
44 * Alternatively, the contents of this file may be used under the terms of
45 * either the GNU General Public License Version 2 or later (the "GPL"), or
46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
47 * in which case the provisions of the GPL or the LGPL are applicable instead
48 * of those above. If you wish to allow use of your version of this file only
49 * under the terms of either the GPL or the LGPL, and not to allow others to
50 * use your version of this file under the terms of the MPL, indicate your
51 * decision by deleting the provisions above and replace them with the notice
52 * and other provisions required by the GPL or the LGPL. If you do not delete
53 * the provisions above, a recipient may use your version of this file under
54 * the terms of any one of the MPL, the GPL or the LGPL.
55 *
56 ******* END LICENSE BLOCK *******/
58 #include <stdlib.h>
59 #include <string.h>
60 #include <stdio.h>
61 #include <ctype.h>
63 #include "affentry.hxx"
64 #include "csutil.hxx"
66 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
67 {
68 // register affix manager
69 pmyMgr = pmgr;
71 // set up its initial values
73 aflag = dp->aflag; // flag
74 strip = dp->strip; // string to strip
75 appnd = dp->appnd; // string to append
76 stripl = dp->stripl; // length of strip string
77 appndl = dp->appndl; // length of append string
78 numconds = dp->numconds; // length of the condition
79 opts = dp->opts; // cross product flag
80 // then copy over all of the conditions
81 if (opts & aeLONGCOND) {
82 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
83 c.l.conds2 = dp->c.l.conds2;
84 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
85 next = NULL;
86 nextne = NULL;
87 nexteq = NULL;
88 morphcode = dp->morphcode;
89 contclass = dp->contclass;
90 contclasslen = dp->contclasslen;
91 }
94 PfxEntry::~PfxEntry()
95 {
96 aflag = 0;
97 if (appnd) free(appnd);
98 if (strip) free(strip);
99 pmyMgr = NULL;
100 appnd = NULL;
101 strip = NULL;
102 if (opts & aeLONGCOND) free(c.l.conds2);
103 if (morphcode && !(opts & aeALIASM)) free(morphcode);
104 if (contclass && !(opts & aeALIASF)) free(contclass);
105 }
107 // add prefix to this word assuming conditions hold
108 char * PfxEntry::add(const char * word, int len)
109 {
110 char tword[MAXWORDUTF8LEN + 4];
112 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
113 (len >= numconds) && test_condition(word) &&
114 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
115 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
116 /* we have a match so add prefix */
117 char * pp = tword;
118 if (appndl) {
119 strcpy(tword,appnd);
120 pp += appndl;
121 }
122 strcpy(pp, (word + stripl));
123 return mystrdup(tword);
124 }
125 return NULL;
126 }
128 inline char * PfxEntry::nextchar(char * p) {
129 if (p) {
130 p++;
131 if (opts & aeLONGCOND) {
132 // jump to the 2nd part of the condition
133 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
134 // end of the MAXCONDLEN length condition
135 } else if (p == c.conds + MAXCONDLEN) return NULL;
136 return *p ? p : NULL;
137 }
138 return NULL;
139 }
141 inline int PfxEntry::test_condition(const char * st)
142 {
143 const char * pos = NULL; // group with pos input position
144 bool neg = false; // complementer
145 bool ingroup = false; // character in the group
146 if (numconds == 0) return 1;
147 char * p = c.conds;
148 while (1) {
149 switch (*p) {
150 case '\0': return 1;
151 case '[': {
152 neg = false;
153 ingroup = false;
154 p = nextchar(p);
155 pos = st; break;
156 }
157 case '^': { p = nextchar(p); neg = true; break; }
158 case ']': {
159 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
160 pos = NULL;
161 p = nextchar(p);
162 // skip the next character
163 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
164 if (*st == '\0' && p) return 0; // word <= condition
165 break;
166 }
167 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
168 p = nextchar(p);
169 // skip the next character
170 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
171 if (*st == '\0' && p) return 0; // word <= condition
172 break;
173 }
174 default: {
175 if (*st == *p) {
176 st++;
177 p = nextchar(p);
178 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
179 while (p && (*p & 0xc0) == 0x80) { // character
180 if (*p != *st) {
181 if (!pos) return 0;
182 st = pos;
183 break;
184 }
185 p = nextchar(p);
186 st++;
187 }
188 if (pos && st != pos) {
189 ingroup = true;
190 while (p && *p != ']' && (p = nextchar(p)));
191 }
192 } else if (pos) {
193 ingroup = true;
194 while (p && *p != ']' && (p = nextchar(p)));
195 }
196 } else if (pos) { // group
197 p = nextchar(p);
198 } else return 0;
199 }
200 }
201 if (!p) return 1;
202 }
203 }
205 // check if this prefix entry matches
206 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
207 {
208 int tmpl; // length of tmpword
209 struct hentry * he; // hash entry of root word or NULL
210 char tmpword[MAXWORDUTF8LEN + 4];
212 // on entry prefix is 0 length or already matches the beginning of the word.
213 // So if the remaining root word has positive length
214 // and if there are enough chars in root word and added back strip chars
215 // to meet the number of characters conditions, then test it
217 tmpl = len - appndl;
219 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
221 // generate new root word by removing prefix and adding
222 // back any characters that would have been stripped
224 if (stripl) strcpy (tmpword, strip);
225 strcpy ((tmpword + stripl), (word + appndl));
227 // now make sure all of the conditions on characters
228 // are met. Please see the appendix at the end of
229 // this file for more info on exactly what is being
230 // tested
232 // if all conditions are met then check if resulting
233 // root word in the dictionary
235 if (test_condition(tmpword)) {
236 tmpl += stripl;
237 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
238 do {
239 if (TESTAFF(he->astr, aflag, he->alen) &&
240 // forbid single prefixes with needaffix flag
241 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
242 // needflag
243 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
244 (contclass && TESTAFF(contclass, needflag, contclasslen))))
245 return he;
246 he = he->next_homonym; // check homonyms
247 } while (he);
248 }
250 // prefix matched but no root word was found
251 // if aeXPRODUCT is allowed, try again but now
252 // ross checked combined with a suffix
254 //if ((opts & aeXPRODUCT) && in_compound) {
255 if ((opts & aeXPRODUCT)) {
256 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
257 0, NULL, FLAG_NULL, needflag, in_compound);
258 if (he) return he;
259 }
260 }
261 }
262 return NULL;
263 }
265 // check if this prefix entry matches
266 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
267 char in_compound, const FLAG needflag)
268 {
269 int tmpl; // length of tmpword
270 struct hentry * he; // hash entry of root word or NULL
271 char tmpword[MAXWORDUTF8LEN + 4];
273 // on entry prefix is 0 length or already matches the beginning of the word.
274 // So if the remaining root word has positive length
275 // and if there are enough chars in root word and added back strip chars
276 // to meet the number of characters conditions, then test it
278 tmpl = len - appndl;
280 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
281 (tmpl + stripl >= numconds)) {
283 // generate new root word by removing prefix and adding
284 // back any characters that would have been stripped
286 if (stripl) strcpy (tmpword, strip);
287 strcpy ((tmpword + stripl), (word + appndl));
289 // now make sure all of the conditions on characters
290 // are met. Please see the appendix at the end of
291 // this file for more info on exactly what is being
292 // tested
294 // if all conditions are met then check if resulting
295 // root word in the dictionary
297 if (test_condition(tmpword)) {
298 tmpl += stripl;
300 // prefix matched but no root word was found
301 // if aeXPRODUCT is allowed, try again but now
302 // cross checked combined with a suffix
304 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
305 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
306 if (he) return he;
307 }
308 }
309 }
310 return NULL;
311 }
313 // check if this prefix entry matches
314 char * PfxEntry::check_twosfx_morph(const char * word, int len,
315 char in_compound, const FLAG needflag)
316 {
317 int tmpl; // length of tmpword
318 char tmpword[MAXWORDUTF8LEN + 4];
320 // on entry prefix is 0 length or already matches the beginning of the word.
321 // So if the remaining root word has positive length
322 // and if there are enough chars in root word and added back strip chars
323 // to meet the number of characters conditions, then test it
325 tmpl = len - appndl;
327 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
328 (tmpl + stripl >= numconds)) {
330 // generate new root word by removing prefix and adding
331 // back any characters that would have been stripped
333 if (stripl) strcpy (tmpword, strip);
334 strcpy ((tmpword + stripl), (word + appndl));
336 // now make sure all of the conditions on characters
337 // are met. Please see the appendix at the end of
338 // this file for more info on exactly what is being
339 // tested
341 // if all conditions are met then check if resulting
342 // root word in the dictionary
344 if (test_condition(tmpword)) {
345 tmpl += stripl;
347 // prefix matched but no root word was found
348 // if aeXPRODUCT is allowed, try again but now
349 // ross checked combined with a suffix
351 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
352 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
353 aeXPRODUCT, this, needflag);
354 }
355 }
356 }
357 return NULL;
358 }
360 // check if this prefix entry matches
361 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
362 {
363 int tmpl; // length of tmpword
364 struct hentry * he; // hash entry of root word or NULL
365 char tmpword[MAXWORDUTF8LEN + 4];
366 char result[MAXLNLEN];
367 char * st;
369 *result = '\0';
371 // on entry prefix is 0 length or already matches the beginning of the word.
372 // So if the remaining root word has positive length
373 // and if there are enough chars in root word and added back strip chars
374 // to meet the number of characters conditions, then test it
376 tmpl = len - appndl;
378 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
379 (tmpl + stripl >= numconds)) {
381 // generate new root word by removing prefix and adding
382 // back any characters that would have been stripped
384 if (stripl) strcpy (tmpword, strip);
385 strcpy ((tmpword + stripl), (word + appndl));
387 // now make sure all of the conditions on characters
388 // are met. Please see the appendix at the end of
389 // this file for more info on exactly what is being
390 // tested
392 // if all conditions are met then check if resulting
393 // root word in the dictionary
395 if (test_condition(tmpword)) {
396 tmpl += stripl;
397 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
398 do {
399 if (TESTAFF(he->astr, aflag, he->alen) &&
400 // forbid single prefixes with needaffix flag
401 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
402 // needflag
403 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
404 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
405 if (morphcode) {
406 mystrcat(result, " ", MAXLNLEN);
407 mystrcat(result, morphcode, MAXLNLEN);
408 } else mystrcat(result,getKey(), MAXLNLEN);
409 if (!HENTRY_FIND(he, MORPH_STEM)) {
410 mystrcat(result, " ", MAXLNLEN);
411 mystrcat(result, MORPH_STEM, MAXLNLEN);
412 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
413 }
414 // store the pointer of the hash entry
415 if (HENTRY_DATA(he)) {
416 mystrcat(result, " ", MAXLNLEN);
417 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
418 } else {
419 // return with debug information
420 char * flag = pmyMgr->encode_flag(getFlag());
421 mystrcat(result, " ", MAXLNLEN);
422 mystrcat(result, MORPH_FLAG, MAXLNLEN);
423 mystrcat(result, flag, MAXLNLEN);
424 free(flag);
425 }
426 mystrcat(result, "\n", MAXLNLEN);
427 }
428 he = he->next_homonym;
429 } while (he);
430 }
432 // prefix matched but no root word was found
433 // if aeXPRODUCT is allowed, try again but now
434 // ross checked combined with a suffix
436 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
437 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
438 FLAG_NULL, needflag);
439 if (st) {
440 mystrcat(result, st, MAXLNLEN);
441 free(st);
442 }
443 }
444 }
445 }
447 if (*result) return mystrdup(result);
448 return NULL;
449 }
451 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
452 {
453 // register affix manager
454 pmyMgr = pmgr;
456 // set up its initial values
457 aflag = dp->aflag; // char flag
458 strip = dp->strip; // string to strip
459 appnd = dp->appnd; // string to append
460 stripl = dp->stripl; // length of strip string
461 appndl = dp->appndl; // length of append string
462 numconds = dp->numconds; // length of the condition
463 opts = dp->opts; // cross product flag
465 // then copy over all of the conditions
466 if (opts & aeLONGCOND) {
467 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
468 c.l.conds2 = dp->c.l.conds2;
469 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
470 next = NULL;
471 nextne = NULL;
472 nexteq = NULL;
473 rappnd = myrevstrdup(appnd);
474 morphcode = dp->morphcode;
475 contclass = dp->contclass;
476 contclasslen = dp->contclasslen;
477 }
480 SfxEntry::~SfxEntry()
481 {
482 aflag = 0;
483 if (appnd) free(appnd);
484 if (rappnd) free(rappnd);
485 if (strip) free(strip);
486 pmyMgr = NULL;
487 appnd = NULL;
488 strip = NULL;
489 if (opts & aeLONGCOND) free(c.l.conds2);
490 if (morphcode && !(opts & aeALIASM)) free(morphcode);
491 if (contclass && !(opts & aeALIASF)) free(contclass);
492 }
494 // add suffix to this word assuming conditions hold
495 char * SfxEntry::add(const char * word, int len)
496 {
497 char tword[MAXWORDUTF8LEN + 4];
499 /* make sure all conditions match */
500 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
501 (len >= numconds) && test_condition(word + len, word) &&
502 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
503 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
504 /* we have a match so add suffix */
505 strcpy(tword,word);
506 if (appndl) {
507 strcpy(tword + len - stripl, appnd);
508 } else {
509 *(tword + len - stripl) = '\0';
510 }
511 return mystrdup(tword);
512 }
513 return NULL;
514 }
516 inline char * SfxEntry::nextchar(char * p) {
517 if (p) {
518 p++;
519 if (opts & aeLONGCOND) {
520 // jump to the 2nd part of the condition
521 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
522 // end of the MAXCONDLEN length condition
523 } else if (p == c.conds + MAXCONDLEN) return NULL;
524 return *p ? p : NULL;
525 }
526 return NULL;
527 }
529 inline int SfxEntry::test_condition(const char * st, const char * beg)
530 {
531 const char * pos = NULL; // group with pos input position
532 bool neg = false; // complementer
533 bool ingroup = false; // character in the group
534 if (numconds == 0) return 1;
535 char * p = c.conds;
536 st--;
537 int i = 1;
538 while (1) {
539 switch (*p) {
540 case '\0': return 1;
541 case '[': { p = nextchar(p); pos = st; break; }
542 case '^': { p = nextchar(p); neg = true; break; }
543 case ']': { if (!neg && !ingroup) return 0;
544 i++;
545 // skip the next character
546 if (!ingroup) {
547 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
548 st--;
549 }
550 pos = NULL;
551 neg = false;
552 ingroup = false;
553 p = nextchar(p);
554 if (st < beg && p) return 0; // word <= condition
555 break;
556 }
557 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
558 p = nextchar(p);
559 // skip the next character
560 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
561 if (st < beg) { // word <= condition
562 if (p) return 0; else return 1;
563 }
564 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
565 st--;
566 if (st < beg) { // word <= condition
567 if (p) return 0; else return 1;
568 }
569 }
570 break;
571 }
572 default: {
573 if (*st == *p) {
574 p = nextchar(p);
575 if ((opts & aeUTF8) && (*st & 0x80)) {
576 st--;
577 while (p && (st >= beg)) {
578 if (*p != *st) {
579 if (!pos) return 0;
580 st = pos;
581 break;
582 }
583 // first byte of the UTF-8 multibyte character
584 if ((*p & 0xc0) != 0x80) break;
585 p = nextchar(p);
586 st--;
587 }
588 if (pos && st != pos) {
589 if (neg) return 0;
590 else if (i == numconds) return 1;
591 ingroup = true;
592 while (p && *p != ']' && (p = nextchar(p)));
593 st--;
594 }
595 if (p && *p != ']') p = nextchar(p);
596 } else if (pos) {
597 if (neg) return 0;
598 else if (i == numconds) return 1;
599 ingroup = true;
600 while (p && *p != ']' && (p = nextchar(p)))
601 ;
602 // if (p && *p != ']') p = nextchar(p);
603 st--;
604 }
605 if (!pos) {
606 i++;
607 st--;
608 }
609 if (st < beg && p && *p != ']') return 0; // word <= condition
610 } else if (pos) { // group
611 p = nextchar(p);
612 } else return 0;
613 }
614 }
615 if (!p) return 1;
616 }
617 }
619 // see if this suffix is present in the word
620 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
621 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
622 const FLAG badflag)
623 {
624 int tmpl; // length of tmpword
625 struct hentry * he; // hash entry pointer
626 unsigned char * cp;
627 char tmpword[MAXWORDUTF8LEN + 4];
628 PfxEntry* ep = ppfx;
630 // if this suffix is being cross checked with a prefix
631 // but it does not support cross products skip it
633 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
634 return NULL;
636 // upon entry suffix is 0 length or already matches the end of the word.
637 // So if the remaining root word has positive length
638 // and if there are enough chars in root word and added back strip chars
639 // to meet the number of characters conditions, then test it
641 tmpl = len - appndl;
642 // the second condition is not enough for UTF-8 strings
643 // it checked in test_condition()
645 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
646 (tmpl + stripl >= numconds)) {
648 // generate new root word by removing suffix and adding
649 // back any characters that would have been stripped or
650 // or null terminating the shorter string
652 strcpy (tmpword, word);
653 cp = (unsigned char *)(tmpword + tmpl);
654 if (stripl) {
655 strcpy ((char *)cp, strip);
656 tmpl += stripl;
657 cp = (unsigned char *)(tmpword + tmpl);
658 } else *cp = '\0';
660 // now make sure all of the conditions on characters
661 // are met. Please see the appendix at the end of
662 // this file for more info on exactly what is being
663 // tested
665 // if all conditions are met then check if resulting
666 // root word in the dictionary
668 if (test_condition((char *) cp, (char *) tmpword)) {
670 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
671 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
672 #endif
673 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
674 do {
675 // check conditional suffix (enabled by prefix)
676 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
677 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
678 (((optflags & aeXPRODUCT) == 0) ||
679 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
680 // enabled by prefix
681 ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
682 ) &&
683 // handle cont. class
684 ((!cclass) ||
685 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
686 ) &&
687 // check only in compound homonyms (bad flags)
688 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
689 ) &&
690 // handle required flag
691 ((!needflag) ||
692 (TESTAFF(he->astr, needflag, he->alen) ||
693 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
694 )
695 ) return he;
696 he = he->next_homonym; // check homonyms
697 } while (he);
699 // obsolote stemming code (used only by the
700 // experimental SuffixMgr:suggest_pos_stems)
701 // store resulting root in wlst
702 } else if (wlst && (*ns < maxSug)) {
703 int cwrd = 1;
704 for (int k=0; k < *ns; k++)
705 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
706 if (cwrd) {
707 wlst[*ns] = mystrdup(tmpword);
708 if (wlst[*ns] == NULL) {
709 for (int j=0; j<*ns; j++) free(wlst[j]);
710 *ns = -1;
711 return NULL;
712 }
713 (*ns)++;
714 }
715 }
716 }
717 }
718 return NULL;
719 }
721 // see if two-level suffix is present in the word
722 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
723 PfxEntry* ppfx, const FLAG needflag)
724 {
725 int tmpl; // length of tmpword
726 struct hentry * he; // hash entry pointer
727 unsigned char * cp;
728 char tmpword[MAXWORDUTF8LEN + 4];
729 PfxEntry* ep = ppfx;
732 // if this suffix is being cross checked with a prefix
733 // but it does not support cross products skip it
735 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
736 return NULL;
738 // upon entry suffix is 0 length or already matches the end of the word.
739 // So if the remaining root word has positive length
740 // and if there are enough chars in root word and added back strip chars
741 // to meet the number of characters conditions, then test it
743 tmpl = len - appndl;
745 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
746 (tmpl + stripl >= numconds)) {
748 // generate new root word by removing suffix and adding
749 // back any characters that would have been stripped or
750 // or null terminating the shorter string
752 strcpy (tmpword, word);
753 cp = (unsigned char *)(tmpword + tmpl);
754 if (stripl) {
755 strcpy ((char *)cp, strip);
756 tmpl += stripl;
757 cp = (unsigned char *)(tmpword + tmpl);
758 } else *cp = '\0';
760 // now make sure all of the conditions on characters
761 // are met. Please see the appendix at the end of
762 // this file for more info on exactly what is being
763 // tested
765 // if all conditions are met then recall suffix_check
767 if (test_condition((char *) cp, (char *) tmpword)) {
768 if (ppfx) {
769 // handle conditional suffix
770 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
771 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
772 else
773 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
774 } else {
775 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
776 }
777 if (he) return he;
778 }
779 }
780 return NULL;
781 }
783 // see if two-level suffix is present in the word
784 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
785 PfxEntry* ppfx, const FLAG needflag)
786 {
787 int tmpl; // length of tmpword
788 unsigned char * cp;
789 char tmpword[MAXWORDUTF8LEN + 4];
790 PfxEntry* ep = ppfx;
791 char * st;
793 char result[MAXLNLEN];
795 *result = '\0';
797 // if this suffix is being cross checked with a prefix
798 // but it does not support cross products skip it
800 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
801 return NULL;
803 // upon entry suffix is 0 length or already matches the end of the word.
804 // So if the remaining root word has positive length
805 // and if there are enough chars in root word and added back strip chars
806 // to meet the number of characters conditions, then test it
808 tmpl = len - appndl;
810 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
811 (tmpl + stripl >= numconds)) {
813 // generate new root word by removing suffix and adding
814 // back any characters that would have been stripped or
815 // or null terminating the shorter string
817 strcpy (tmpword, word);
818 cp = (unsigned char *)(tmpword + tmpl);
819 if (stripl) {
820 strcpy ((char *)cp, strip);
821 tmpl += stripl;
822 cp = (unsigned char *)(tmpword + tmpl);
823 } else *cp = '\0';
825 // now make sure all of the conditions on characters
826 // are met. Please see the appendix at the end of
827 // this file for more info on exactly what is being
828 // tested
830 // if all conditions are met then recall suffix_check
832 if (test_condition((char *) cp, (char *) tmpword)) {
833 if (ppfx) {
834 // handle conditional suffix
835 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
836 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
837 if (st) {
838 if (ppfx->getMorph()) {
839 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
840 mystrcat(result, " ", MAXLNLEN);
841 }
842 mystrcat(result,st, MAXLNLEN);
843 free(st);
844 mychomp(result);
845 }
846 } else {
847 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
848 if (st) {
849 mystrcat(result, st, MAXLNLEN);
850 free(st);
851 mychomp(result);
852 }
853 }
854 } else {
855 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
856 if (st) {
857 mystrcat(result, st, MAXLNLEN);
858 free(st);
859 mychomp(result);
860 }
861 }
862 if (*result) return mystrdup(result);
863 }
864 }
865 return NULL;
866 }
868 // get next homonym with same affix
869 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
870 const FLAG cclass, const FLAG needflag)
871 {
872 PfxEntry* ep = ppfx;
873 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
875 while (he->next_homonym) {
876 he = he->next_homonym;
877 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
878 ((optflags & aeXPRODUCT) == 0 ||
879 TESTAFF(he->astr, eFlag, he->alen) ||
880 // handle conditional suffix
881 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
882 ) &&
883 // handle cont. class
884 ((!cclass) ||
885 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
886 ) &&
887 // handle required flag
888 ((!needflag) ||
889 (TESTAFF(he->astr, needflag, he->alen) ||
890 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
891 )
892 ) return he;
893 }
894 return NULL;
895 }
898 #if 0
900 Appendix: Understanding Affix Code
903 An affix is either a prefix or a suffix attached to root words to make
904 other words.
906 Basically a Prefix or a Suffix is set of AffEntry objects
907 which store information about the prefix or suffix along
908 with supporting routines to check if a word has a particular
909 prefix or suffix or a combination.
911 The structure affentry is defined as follows:
913 struct affentry
914 {
915 unsigned short aflag; // ID used to represent the affix
916 char * strip; // string to strip before adding affix
917 char * appnd; // the affix string to add
918 unsigned char stripl; // length of the strip string
919 unsigned char appndl; // length of the affix string
920 char numconds; // the number of conditions that must be met
921 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
922 char conds[SETSIZE]; // array which encodes the conditions to be met
923 };
926 Here is a suffix borrowed from the en_US.aff file. This file
927 is whitespace delimited.
929 SFX D Y 4
930 SFX D 0 e d
931 SFX D y ied [^aeiou]y
932 SFX D 0 ed [^ey]
933 SFX D 0 ed [aeiou]y
935 This information can be interpreted as follows:
937 The first line has 4 fields
939 Field
940 -----
941 1 SFX - indicates this is a suffix
942 2 D - is the name of the character flag which represents this suffix
943 3 Y - indicates it can be combined with prefixes (cross product)
944 4 4 - indicates that sequence of 4 affentry structures are needed to
945 properly store the affix information
947 The remaining lines describe the unique information for the 4 SfxEntry
948 objects that make up this affix. Each line can be interpreted
949 as follows: (note fields 1 and 2 are as a check against line 1 info)
951 Field
952 -----
953 1 SFX - indicates this is a suffix
954 2 D - is the name of the character flag for this affix
955 3 y - the string of chars to strip off before adding affix
956 (a 0 here indicates the NULL string)
957 4 ied - the string of affix characters to add
958 5 [^aeiou]y - the conditions which must be met before the affix
959 can be applied
961 Field 5 is interesting. Since this is a suffix, field 5 tells us that
962 there are 2 conditions that must be met. The first condition is that
963 the next to the last character in the word must *NOT* be any of the
964 following "a", "e", "i", "o" or "u". The second condition is that
965 the last character of the word must end in "y".
967 So how can we encode this information concisely and be able to
968 test for both conditions in a fast manner? The answer is found
969 by studying the wonderful ispell code of Geoff Kuenning, et al.
970 (now available under a normal BSD license).
972 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
973 using a character (cast to an unsigned char) of a string, we have 8 bits
974 of information we can store about that character. Specifically we
975 could use each bit to say if that character is allowed in any of the
976 last (or first for prefixes) 8 characters of the word.
978 Basically, each character at one end of the word (up to the number
979 of conditions) is used to index into the conds array and the resulting
980 value found there says whether that character is valid for a
981 specific character position in the word.
983 For prefixes, it does this by setting bit 0 if that char is valid
984 in the first position, bit 1 if valid in the second position, and so on.
986 If a bit is not set, then that char is not valid for that position in the
987 word.
989 If working with suffixes bit 0 is used for the character closest
990 to the front, bit 1 for the next character towards the end, ...,
991 with bit numconds-1 representing the last char at the end of the string.
993 Note: since entries in the conds[] are 8 bits, only 8 conditions
994 (read that only 8 character positions) can be examined at one
995 end of a word (the beginning for prefixes and the end for suffixes).
997 So to make this clearer, lets encode the conds array values for the
998 first two affentries for the suffix D described earlier.
1001 For the first affentry:
1002 numconds = 1 (only examine the last character)
1004 conds['e'] = (1 << 0) (the word must end in an E)
1005 all others are all 0
1007 For the second affentry:
1008 numconds = 2 (only examine the last two characters)
1010 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
1011 where X is all characters *but* a, e, i, o, or u
1014 conds['y'] = (1 << 1) (the last char must be a y)
1015 all other bits for all other entries in the conds array are zero
1018 #endif