|
1 /******* BEGIN LICENSE BLOCK ******* |
|
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
|
3 * |
|
4 * The contents of this file are subject to the Mozilla Public License Version |
|
5 * 1.1 (the "License"); you may not use this file except in compliance with |
|
6 * the License. You may obtain a copy of the License at |
|
7 * http://www.mozilla.org/MPL/ |
|
8 * |
|
9 * Software distributed under the License is distributed on an "AS IS" basis, |
|
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
|
11 * for the specific language governing rights and limitations under the |
|
12 * License. |
|
13 * |
|
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) |
|
15 * and László Németh (Hunspell). Portions created by the Initial Developers |
|
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. |
|
17 * |
|
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) |
|
19 * David Einstein (deinst@world.std.com) |
|
20 * László Németh (nemethl@gyorsposta.hu) |
|
21 * Caolan McNamara (caolanm@redhat.com) |
|
22 * Davide Prina |
|
23 * Giuseppe Modugno |
|
24 * Gianluca Turconi |
|
25 * Simon Brouwer |
|
26 * Noll Janos |
|
27 * Biro Arpad |
|
28 * Goldman Eleonora |
|
29 * Sarlos Tamas |
|
30 * Bencsath Boldizsar |
|
31 * Halacsy Peter |
|
32 * Dvornik Laszlo |
|
33 * Gefferth Andras |
|
34 * Nagy Viktor |
|
35 * Varga Daniel |
|
36 * Chris Halls |
|
37 * Rene Engelhard |
|
38 * Bram Moolenaar |
|
39 * Dafydd Jones |
|
40 * Harri Pitkanen |
|
41 * Andras Timar |
|
42 * Tor Lillqvist |
|
43 * |
|
44 * Alternatively, the contents of this file may be used under the terms of |
|
45 * either the GNU General Public License Version 2 or later (the "GPL"), or |
|
46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
|
47 * in which case the provisions of the GPL or the LGPL are applicable instead |
|
48 * of those above. If you wish to allow use of your version of this file only |
|
49 * under the terms of either the GPL or the LGPL, and not to allow others to |
|
50 * use your version of this file under the terms of the MPL, indicate your |
|
51 * decision by deleting the provisions above and replace them with the notice |
|
52 * and other provisions required by the GPL or the LGPL. If you do not delete |
|
53 * the provisions above, a recipient may use your version of this file under |
|
54 * the terms of any one of the MPL, the GPL or the LGPL. |
|
55 * |
|
56 ******* END LICENSE BLOCK *******/ |
|
57 |
|
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>

#include "hashmgr.hxx"
#include "csutil.hxx"
#include "atypes.hxx"
|
66 |
|
67 // build a hash table from a munched word list |
|
68 |
|
69 HashMgr::HashMgr(const char * tpath, const char * apath, const char * key) |
|
70 { |
|
71 tablesize = 0; |
|
72 tableptr = NULL; |
|
73 flag_mode = FLAG_CHAR; |
|
74 complexprefixes = 0; |
|
75 utf8 = 0; |
|
76 langnum = 0; |
|
77 lang = NULL; |
|
78 enc = NULL; |
|
79 csconv = 0; |
|
80 ignorechars = NULL; |
|
81 ignorechars_utf16 = NULL; |
|
82 ignorechars_utf16_len = 0; |
|
83 numaliasf = 0; |
|
84 aliasf = NULL; |
|
85 numaliasm = 0; |
|
86 aliasm = NULL; |
|
87 forbiddenword = FORBIDDENWORD; // forbidden word signing flag |
|
88 load_config(apath, key); |
|
89 int ec = load_tables(tpath, key); |
|
90 if (ec) { |
|
91 /* error condition - what should we do here */ |
|
92 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); |
|
93 if (tableptr) { |
|
94 free(tableptr); |
|
95 tableptr = NULL; |
|
96 } |
|
97 tablesize = 0; |
|
98 } |
|
99 } |
|
100 |
|
101 |
|
102 HashMgr::~HashMgr() |
|
103 { |
|
104 if (tableptr) { |
|
105 // now pass through hash table freeing up everything |
|
106 // go through column by column of the table |
|
107 for (int i=0; i < tablesize; i++) { |
|
108 struct hentry * pt = tableptr[i]; |
|
109 struct hentry * nt = NULL; |
|
110 while(pt) { |
|
111 nt = pt->next; |
|
112 if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); |
|
113 free(pt); |
|
114 pt = nt; |
|
115 } |
|
116 } |
|
117 free(tableptr); |
|
118 } |
|
119 tablesize = 0; |
|
120 |
|
121 if (aliasf) { |
|
122 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]); |
|
123 free(aliasf); |
|
124 aliasf = NULL; |
|
125 if (aliasflen) { |
|
126 free(aliasflen); |
|
127 aliasflen = NULL; |
|
128 } |
|
129 } |
|
130 if (aliasm) { |
|
131 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]); |
|
132 free(aliasm); |
|
133 aliasm = NULL; |
|
134 } |
|
135 |
|
136 #ifndef OPENOFFICEORG |
|
137 #ifndef MOZILLA_CLIENT |
|
138 if (utf8) free_utf_tbl(); |
|
139 #endif |
|
140 #endif |
|
141 |
|
142 if (enc) free(enc); |
|
143 if (lang) free(lang); |
|
144 |
|
145 if (ignorechars) free(ignorechars); |
|
146 if (ignorechars_utf16) free(ignorechars_utf16); |
|
147 |
|
148 #ifdef MOZILLA_CLIENT |
|
149 delete [] csconv; |
|
150 #endif |
|
151 } |
|
152 |
|
153 // lookup a root word in the hashtable |
|
154 |
|
155 struct hentry * HashMgr::lookup(const char *word) const |
|
156 { |
|
157 struct hentry * dp; |
|
158 if (tableptr) { |
|
159 dp = tableptr[hash(word)]; |
|
160 if (!dp) return NULL; |
|
161 for ( ; dp != NULL; dp = dp->next) { |
|
162 if (strcmp(word, dp->word) == 0) return dp; |
|
163 } |
|
164 } |
|
165 return NULL; |
|
166 } |
|
167 |
|
168 // add a word to the hash table (private) |
|
169 int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, |
|
170 int al, const char * desc, bool onlyupcase) |
|
171 { |
|
172 bool upcasehomonym = false; |
|
173 int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; |
|
174 // variable-length hash record with word and optional fields |
|
175 struct hentry* hp = |
|
176 (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); |
|
177 if (!hp) return 1; |
|
178 char * hpw = hp->word; |
|
179 strcpy(hpw, word); |
|
180 if (ignorechars != NULL) { |
|
181 if (utf8) { |
|
182 remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); |
|
183 } else { |
|
184 remove_ignored_chars(hpw, ignorechars); |
|
185 } |
|
186 } |
|
187 if (complexprefixes) { |
|
188 if (utf8) reverseword_utf(hpw); else reverseword(hpw); |
|
189 } |
|
190 |
|
191 int i = hash(hpw); |
|
192 |
|
193 hp->blen = (unsigned char) wbl; |
|
194 hp->clen = (unsigned char) wcl; |
|
195 hp->alen = (short) al; |
|
196 hp->astr = aff; |
|
197 hp->next = NULL; |
|
198 hp->next_homonym = NULL; |
|
199 |
|
200 // store the description string or its pointer |
|
201 if (desc) { |
|
202 hp->var = H_OPT; |
|
203 if (aliasm) { |
|
204 hp->var += H_OPT_ALIASM; |
|
205 store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); |
|
206 } else { |
|
207 strcpy(hpw + wbl + 1, desc); |
|
208 if (complexprefixes) { |
|
209 if (utf8) reverseword_utf(HENTRY_DATA(hp)); |
|
210 else reverseword(HENTRY_DATA(hp)); |
|
211 } |
|
212 } |
|
213 if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; |
|
214 } else hp->var = 0; |
|
215 |
|
216 struct hentry * dp = tableptr[i]; |
|
217 if (!dp) { |
|
218 tableptr[i] = hp; |
|
219 return 0; |
|
220 } |
|
221 while (dp->next != NULL) { |
|
222 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) { |
|
223 // remove hidden onlyupcase homonym |
|
224 if (!onlyupcase) { |
|
225 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { |
|
226 free(dp->astr); |
|
227 dp->astr = hp->astr; |
|
228 dp->alen = hp->alen; |
|
229 free(hp); |
|
230 return 0; |
|
231 } else { |
|
232 dp->next_homonym = hp; |
|
233 } |
|
234 } else { |
|
235 upcasehomonym = true; |
|
236 } |
|
237 } |
|
238 dp=dp->next; |
|
239 } |
|
240 if (strcmp(hp->word, dp->word) == 0) { |
|
241 // remove hidden onlyupcase homonym |
|
242 if (!onlyupcase) { |
|
243 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { |
|
244 free(dp->astr); |
|
245 dp->astr = hp->astr; |
|
246 dp->alen = hp->alen; |
|
247 free(hp); |
|
248 return 0; |
|
249 } else { |
|
250 dp->next_homonym = hp; |
|
251 } |
|
252 } else { |
|
253 upcasehomonym = true; |
|
254 } |
|
255 } |
|
256 if (!upcasehomonym) { |
|
257 dp->next = hp; |
|
258 } else { |
|
259 // remove hidden onlyupcase homonym |
|
260 if (hp->astr) free(hp->astr); |
|
261 free(hp); |
|
262 } |
|
263 return 0; |
|
264 } |
|
265 |
|
266 int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, |
|
267 unsigned short * flags, int al, char * dp, int captype) |
|
268 { |
|
269 // add inner capitalized forms to handle the following allcap forms: |
|
270 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG |
|
271 // Allcaps with suffixes: CIA's -> CIA'S |
|
272 if (((captype == HUHCAP) || (captype == HUHINITCAP) || |
|
273 ((captype == ALLCAP) && (flags != NULL))) && |
|
274 !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) { |
|
275 unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1)); |
|
276 if (!flags2) return 1; |
|
277 if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); |
|
278 flags2[al] = ONLYUPCASEFLAG; |
|
279 if (utf8) { |
|
280 char st[BUFSIZE]; |
|
281 w_char w[BUFSIZE]; |
|
282 int wlen = u8_u16(w, BUFSIZE, word); |
|
283 mkallsmall_utf(w, wlen, langnum); |
|
284 mkallcap_utf(w, 1, langnum); |
|
285 u16_u8(st, BUFSIZE, w, wlen); |
|
286 return add_word(st,wbl,wcl,flags2,al+1,dp, true); |
|
287 } else { |
|
288 mkallsmall(word, csconv); |
|
289 mkinitcap(word, csconv); |
|
290 return add_word(word,wbl,wcl,flags2,al+1,dp, true); |
|
291 } |
|
292 } |
|
293 return 0; |
|
294 } |
|
295 |
|
296 // detect captype and modify word length for UTF-8 encoding |
|
297 int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { |
|
298 int len; |
|
299 if (utf8) { |
|
300 w_char dest_utf[BUFSIZE]; |
|
301 len = u8_u16(dest_utf, BUFSIZE, word); |
|
302 *captype = get_captype_utf8(dest_utf, len, langnum); |
|
303 } else { |
|
304 len = wbl; |
|
305 *captype = get_captype((char *) word, len, csconv); |
|
306 } |
|
307 return len; |
|
308 } |
|
309 |
|
310 // remove word (personal dictionary function for standalone applications) |
|
311 int HashMgr::remove(const char * word) |
|
312 { |
|
313 struct hentry * dp = lookup(word); |
|
314 while (dp) { |
|
315 if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { |
|
316 unsigned short * flags = |
|
317 (unsigned short *) malloc(sizeof(short) * (dp->alen + 1)); |
|
318 if (!flags) return 1; |
|
319 for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; |
|
320 flags[dp->alen] = forbiddenword; |
|
321 dp->astr = flags; |
|
322 dp->alen++; |
|
323 flag_qsort(flags, 0, dp->alen); |
|
324 } |
|
325 dp = dp->next_homonym; |
|
326 } |
|
327 return 0; |
|
328 } |
|
329 |
|
330 /* remove forbidden flag to add a personal word to the hash */ |
|
331 int HashMgr::remove_forbidden_flag(const char * word) { |
|
332 struct hentry * dp = lookup(word); |
|
333 if (!dp) return 1; |
|
334 while (dp) { |
|
335 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { |
|
336 if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic. |
|
337 else { |
|
338 unsigned short * flags2 = |
|
339 (unsigned short *) malloc(sizeof(short) * (dp->alen - 1)); |
|
340 if (!flags2) return 1; |
|
341 int i, j = 0; |
|
342 for (i = 0; i < dp->alen; i++) { |
|
343 if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; |
|
344 } |
|
345 dp->alen--; |
|
346 dp->astr = flags2; // XXX allowed forbidden words |
|
347 } |
|
348 } |
|
349 dp = dp->next_homonym; |
|
350 } |
|
351 return 0; |
|
352 } |
|
353 |
|
354 // add a custom dic. word to the hash table (public) |
|
355 int HashMgr::add(const char * word) |
|
356 { |
|
357 unsigned short * flags = NULL; |
|
358 int al = 0; |
|
359 if (remove_forbidden_flag(word)) { |
|
360 int captype; |
|
361 int wbl = strlen(word); |
|
362 int wcl = get_clen_and_captype(word, wbl, &captype); |
|
363 add_word(word, wbl, wcl, flags, al, NULL, false); |
|
364 return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); |
|
365 } |
|
366 return 0; |
|
367 } |
|
368 |
|
369 int HashMgr::add_with_affix(const char * word, const char * example) |
|
370 { |
|
371 // detect captype and modify word length for UTF-8 encoding |
|
372 struct hentry * dp = lookup(example); |
|
373 remove_forbidden_flag(word); |
|
374 if (dp && dp->astr) { |
|
375 int captype; |
|
376 int wbl = strlen(word); |
|
377 int wcl = get_clen_and_captype(word, wbl, &captype); |
|
378 if (aliasf) { |
|
379 add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); |
|
380 } else { |
|
381 unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); |
|
382 if (flags) { |
|
383 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); |
|
384 add_word(word, wbl, wcl, flags, dp->alen, NULL, false); |
|
385 } else return 1; |
|
386 } |
|
387 return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); |
|
388 } |
|
389 return 1; |
|
390 } |
|
391 |
|
392 // walk the hash table entry by entry - null at end |
|
393 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); |
|
394 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const |
|
395 { |
|
396 if (hp && hp->next != NULL) return hp->next; |
|
397 for (col++; col < tablesize; col++) { |
|
398 if (tableptr[col]) return tableptr[col]; |
|
399 } |
|
400 // null at end and reset to start |
|
401 col = -1; |
|
402 return NULL; |
|
403 } |
|
404 |
|
405 // load a munched word list and build a hash table on the fly |
|
406 int HashMgr::load_tables(const char * tpath, const char * key) |
|
407 { |
|
408 int al; |
|
409 char * ap; |
|
410 char * dp; |
|
411 char * dp2; |
|
412 unsigned short * flags; |
|
413 char * ts; |
|
414 |
|
415 // open dictionary file |
|
416 FileMgr * dict = new FileMgr(tpath, key); |
|
417 if (dict == NULL) return 1; |
|
418 |
|
419 // first read the first line of file to get hash table size */ |
|
420 if (!(ts = dict->getline())) { |
|
421 HUNSPELL_WARNING(stderr, "error: empty dic file\n"); |
|
422 delete dict; |
|
423 return 2; |
|
424 } |
|
425 mychomp(ts); |
|
426 |
|
427 /* remove byte order mark */ |
|
428 if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) { |
|
429 memmove(ts, ts+3, strlen(ts+3)+1); |
|
430 // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions |
|
431 } |
|
432 |
|
433 tablesize = atoi(ts); |
|
434 if (tablesize == 0) { |
|
435 HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n"); |
|
436 delete dict; |
|
437 return 4; |
|
438 } |
|
439 tablesize = tablesize + 5 + USERWORD; |
|
440 if ((tablesize %2) == 0) tablesize++; |
|
441 |
|
442 // allocate the hash table |
|
443 tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); |
|
444 if (! tableptr) { |
|
445 delete dict; |
|
446 return 3; |
|
447 } |
|
448 for (int i=0; i<tablesize; i++) tableptr[i] = NULL; |
|
449 |
|
450 // loop through all words on much list and add to hash |
|
451 // table and create word and affix strings |
|
452 |
|
453 while ((ts = dict->getline())) { |
|
454 mychomp(ts); |
|
455 // split each line into word and morphological description |
|
456 dp = ts; |
|
457 while ((dp = strchr(dp, ':'))) { |
|
458 if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { |
|
459 for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--); |
|
460 if (dp < ts) { // missing word |
|
461 dp = NULL; |
|
462 } else { |
|
463 *(dp + 1) = '\0'; |
|
464 dp = dp + 2; |
|
465 } |
|
466 break; |
|
467 } |
|
468 dp++; |
|
469 } |
|
470 |
|
471 // tabulator is the old morphological field separator |
|
472 dp2 = strchr(ts, '\t'); |
|
473 if (dp2 && (!dp || dp2 < dp)) { |
|
474 *dp2 = '\0'; |
|
475 dp = dp2 + 1; |
|
476 } |
|
477 |
|
478 // split each line into word and affix char strings |
|
479 // "\/" signs slash in words (not affix separator) |
|
480 // "/" at beginning of the line is word character (not affix separator) |
|
481 ap = strchr(ts,'/'); |
|
482 while (ap) { |
|
483 if (ap == ts) { |
|
484 ap++; |
|
485 continue; |
|
486 } else if (*(ap - 1) != '\\') break; |
|
487 // replace "\/" with "/" |
|
488 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++); |
|
489 ap = strchr(ap,'/'); |
|
490 } |
|
491 |
|
492 if (ap) { |
|
493 *ap = '\0'; |
|
494 if (aliasf) { |
|
495 int index = atoi(ap + 1); |
|
496 al = get_aliasf(index, &flags, dict); |
|
497 if (!al) { |
|
498 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); |
|
499 *ap = '\0'; |
|
500 } |
|
501 } else { |
|
502 al = decode_flags(&flags, ap + 1, dict); |
|
503 if (al == -1) { |
|
504 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); |
|
505 delete dict; |
|
506 return 6; |
|
507 } |
|
508 flag_qsort(flags, 0, al); |
|
509 } |
|
510 } else { |
|
511 al = 0; |
|
512 ap = NULL; |
|
513 flags = NULL; |
|
514 } |
|
515 |
|
516 int captype; |
|
517 int wbl = strlen(ts); |
|
518 int wcl = get_clen_and_captype(ts, wbl, &captype); |
|
519 // add the word and its index plus its capitalized form optionally |
|
520 if (add_word(ts,wbl,wcl,flags,al,dp, false) || |
|
521 add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { |
|
522 delete dict; |
|
523 return 5; |
|
524 } |
|
525 } |
|
526 |
|
527 delete dict; |
|
528 return 0; |
|
529 } |
|
530 |
|
531 // the hash function is a simple load and rotate |
|
532 // algorithm borrowed |
|
533 |
|
534 int HashMgr::hash(const char * word) const |
|
535 { |
|
536 long hv = 0; |
|
537 for (int i=0; i < 4 && *word != 0; i++) |
|
538 hv = (hv << 8) | (*word++); |
|
539 while (*word != 0) { |
|
540 ROTATE(hv,ROTATE_LEN); |
|
541 hv ^= (*word++); |
|
542 } |
|
543 return (unsigned long) hv % tablesize; |
|
544 } |
|
545 |
|
546 int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) { |
|
547 int len; |
|
548 if (*flags == '\0') { |
|
549 *result = NULL; |
|
550 return 0; |
|
551 } |
|
552 switch (flag_mode) { |
|
553 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) |
|
554 len = strlen(flags); |
|
555 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum()); |
|
556 len /= 2; |
|
557 *result = (unsigned short *) malloc(len * sizeof(short)); |
|
558 if (!*result) return -1; |
|
559 for (int i = 0; i < len; i++) { |
|
560 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; |
|
561 } |
|
562 break; |
|
563 } |
|
564 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233) |
|
565 int i; |
|
566 len = 1; |
|
567 char * src = flags; |
|
568 unsigned short * dest; |
|
569 char * p; |
|
570 for (p = flags; *p; p++) { |
|
571 if (*p == ',') len++; |
|
572 } |
|
573 *result = (unsigned short *) malloc(len * sizeof(short)); |
|
574 if (!*result) return -1; |
|
575 dest = *result; |
|
576 for (p = flags; *p; p++) { |
|
577 if (*p == ',') { |
|
578 i = atoi(src); |
|
579 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n", |
|
580 af->getlinenum(), i, DEFAULTFLAGS - 1); |
|
581 *dest = (unsigned short) i; |
|
582 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum()); |
|
583 src = p + 1; |
|
584 dest++; |
|
585 } |
|
586 } |
|
587 i = atoi(src); |
|
588 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n", |
|
589 af->getlinenum(), i, DEFAULTFLAGS - 1); |
|
590 *dest = (unsigned short) i; |
|
591 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum()); |
|
592 break; |
|
593 } |
|
594 case FLAG_UNI: { // UTF-8 characters |
|
595 w_char w[BUFSIZE/2]; |
|
596 len = u8_u16(w, BUFSIZE/2, flags); |
|
597 *result = (unsigned short *) malloc(len * sizeof(short)); |
|
598 if (!*result) return -1; |
|
599 memcpy(*result, w, len * sizeof(short)); |
|
600 break; |
|
601 } |
|
602 default: { // Ispell's one-character flags (erfg -> e r f g) |
|
603 unsigned short * dest; |
|
604 len = strlen(flags); |
|
605 *result = (unsigned short *) malloc(len * sizeof(short)); |
|
606 if (!*result) return -1; |
|
607 dest = *result; |
|
608 for (unsigned char * p = (unsigned char *) flags; *p; p++) { |
|
609 *dest = (unsigned short) *p; |
|
610 dest++; |
|
611 } |
|
612 } |
|
613 } |
|
614 return len; |
|
615 } |
|
616 |
|
617 unsigned short HashMgr::decode_flag(const char * f) { |
|
618 unsigned short s = 0; |
|
619 int i; |
|
620 switch (flag_mode) { |
|
621 case FLAG_LONG: |
|
622 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; |
|
623 break; |
|
624 case FLAG_NUM: |
|
625 i = atoi(f); |
|
626 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); |
|
627 s = (unsigned short) i; |
|
628 break; |
|
629 case FLAG_UNI: |
|
630 u8_u16((w_char *) &s, 1, f); |
|
631 break; |
|
632 default: |
|
633 s = (unsigned short) *((unsigned char *)f); |
|
634 } |
|
635 if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); |
|
636 return s; |
|
637 } |
|
638 |
|
639 char * HashMgr::encode_flag(unsigned short f) { |
|
640 unsigned char ch[10]; |
|
641 if (f==0) return mystrdup("(NULL)"); |
|
642 if (flag_mode == FLAG_LONG) { |
|
643 ch[0] = (unsigned char) (f >> 8); |
|
644 ch[1] = (unsigned char) (f - ((f >> 8) << 8)); |
|
645 ch[2] = '\0'; |
|
646 } else if (flag_mode == FLAG_NUM) { |
|
647 sprintf((char *) ch, "%d", f); |
|
648 } else if (flag_mode == FLAG_UNI) { |
|
649 u16_u8((char *) &ch, 10, (w_char *) &f, 1); |
|
650 } else { |
|
651 ch[0] = (unsigned char) (f); |
|
652 ch[1] = '\0'; |
|
653 } |
|
654 return mystrdup((char *) ch); |
|
655 } |
|
656 |
|
657 // read in aff file and set flag mode |
|
658 int HashMgr::load_config(const char * affpath, const char * key) |
|
659 { |
|
660 char * line; // io buffers |
|
661 int firstline = 1; |
|
662 |
|
663 // open the affix file |
|
664 FileMgr * afflst = new FileMgr(affpath, key); |
|
665 if (!afflst) { |
|
666 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath); |
|
667 return 1; |
|
668 } |
|
669 |
|
670 // read in each line ignoring any that do not |
|
671 // start with a known line type indicator |
|
672 |
|
673 while ((line = afflst->getline())) { |
|
674 mychomp(line); |
|
675 |
|
676 /* remove byte order mark */ |
|
677 if (firstline) { |
|
678 firstline = 0; |
|
679 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1); |
|
680 } |
|
681 |
|
682 /* parse in the try string */ |
|
683 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { |
|
684 if (flag_mode != FLAG_CHAR) { |
|
685 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum()); |
|
686 } |
|
687 if (strstr(line, "long")) flag_mode = FLAG_LONG; |
|
688 if (strstr(line, "num")) flag_mode = FLAG_NUM; |
|
689 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; |
|
690 if (flag_mode == FLAG_CHAR) { |
|
691 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum()); |
|
692 } |
|
693 } |
|
694 if (strncmp(line,"FORBIDDENWORD",13) == 0) { |
|
695 char * st = NULL; |
|
696 if (parse_string(line, &st, afflst->getlinenum())) { |
|
697 delete afflst; |
|
698 return 1; |
|
699 } |
|
700 forbiddenword = decode_flag(st); |
|
701 free(st); |
|
702 } |
|
703 if (strncmp(line, "SET", 3) == 0) { |
|
704 if (parse_string(line, &enc, afflst->getlinenum())) { |
|
705 delete afflst; |
|
706 return 1; |
|
707 } |
|
708 if (strcmp(enc, "UTF-8") == 0) { |
|
709 utf8 = 1; |
|
710 #ifndef OPENOFFICEORG |
|
711 #ifndef MOZILLA_CLIENT |
|
712 initialize_utf_tbl(); |
|
713 #endif |
|
714 #endif |
|
715 } else csconv = get_current_cs(enc); |
|
716 } |
|
717 if (strncmp(line, "LANG", 4) == 0) { |
|
718 if (parse_string(line, &lang, afflst->getlinenum())) { |
|
719 delete afflst; |
|
720 return 1; |
|
721 } |
|
722 langnum = get_lang_num(lang); |
|
723 } |
|
724 |
|
725 /* parse in the ignored characters (for example, Arabic optional diacritics characters */ |
|
726 if (strncmp(line,"IGNORE",6) == 0) { |
|
727 if (parse_array(line, &ignorechars, &ignorechars_utf16, |
|
728 &ignorechars_utf16_len, utf8, afflst->getlinenum())) { |
|
729 delete afflst; |
|
730 return 1; |
|
731 } |
|
732 } |
|
733 |
|
734 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { |
|
735 if (parse_aliasf(line, afflst)) { |
|
736 delete afflst; |
|
737 return 1; |
|
738 } |
|
739 } |
|
740 |
|
741 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { |
|
742 if (parse_aliasm(line, afflst)) { |
|
743 delete afflst; |
|
744 return 1; |
|
745 } |
|
746 } |
|
747 |
|
748 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; |
|
749 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; |
|
750 } |
|
751 if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); |
|
752 delete afflst; |
|
753 return 0; |
|
754 } |
|
755 |
|
756 /* parse in the ALIAS table */ |
|
757 int HashMgr::parse_aliasf(char * line, FileMgr * af) |
|
758 { |
|
759 if (numaliasf != 0) { |
|
760 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
|
761 return 1; |
|
762 } |
|
763 char * tp = line; |
|
764 char * piece; |
|
765 int i = 0; |
|
766 int np = 0; |
|
767 piece = mystrsep(&tp, 0); |
|
768 while (piece) { |
|
769 if (*piece != '\0') { |
|
770 switch(i) { |
|
771 case 0: { np++; break; } |
|
772 case 1: { |
|
773 numaliasf = atoi(piece); |
|
774 if (numaliasf < 1) { |
|
775 numaliasf = 0; |
|
776 aliasf = NULL; |
|
777 aliasflen = NULL; |
|
778 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
|
779 return 1; |
|
780 } |
|
781 aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); |
|
782 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short)); |
|
783 if (!aliasf || !aliasflen) { |
|
784 numaliasf = 0; |
|
785 if (aliasf) free(aliasf); |
|
786 if (aliasflen) free(aliasflen); |
|
787 aliasf = NULL; |
|
788 aliasflen = NULL; |
|
789 return 1; |
|
790 } |
|
791 np++; |
|
792 break; |
|
793 } |
|
794 default: break; |
|
795 } |
|
796 i++; |
|
797 } |
|
798 piece = mystrsep(&tp, 0); |
|
799 } |
|
800 if (np != 2) { |
|
801 numaliasf = 0; |
|
802 free(aliasf); |
|
803 free(aliasflen); |
|
804 aliasf = NULL; |
|
805 aliasflen = NULL; |
|
806 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
|
807 return 1; |
|
808 } |
|
809 |
|
810 /* now parse the numaliasf lines to read in the remainder of the table */ |
|
811 char * nl; |
|
812 for (int j=0; j < numaliasf; j++) { |
|
813 if (!(nl = af->getline())) return 1; |
|
814 mychomp(nl); |
|
815 tp = nl; |
|
816 i = 0; |
|
817 aliasf[j] = NULL; |
|
818 aliasflen[j] = 0; |
|
819 piece = mystrsep(&tp, 0); |
|
820 while (piece) { |
|
821 if (*piece != '\0') { |
|
822 switch(i) { |
|
823 case 0: { |
|
824 if (strncmp(piece,"AF",2) != 0) { |
|
825 numaliasf = 0; |
|
826 free(aliasf); |
|
827 free(aliasflen); |
|
828 aliasf = NULL; |
|
829 aliasflen = NULL; |
|
830 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
|
831 return 1; |
|
832 } |
|
833 break; |
|
834 } |
|
835 case 1: { |
|
836 aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af); |
|
837 flag_qsort(aliasf[j], 0, aliasflen[j]); |
|
838 break; |
|
839 } |
|
840 default: break; |
|
841 } |
|
842 i++; |
|
843 } |
|
844 piece = mystrsep(&tp, 0); |
|
845 } |
|
846 if (!aliasf[j]) { |
|
847 free(aliasf); |
|
848 free(aliasflen); |
|
849 aliasf = NULL; |
|
850 aliasflen = NULL; |
|
851 numaliasf = 0; |
|
852 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
|
853 return 1; |
|
854 } |
|
855 } |
|
856 return 0; |
|
857 } |
|
858 |
|
859 int HashMgr::is_aliasf() { |
|
860 return (aliasf != NULL); |
|
861 } |
|
862 |
|
863 int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) { |
|
864 if ((index > 0) && (index <= numaliasf)) { |
|
865 *fvec = aliasf[index - 1]; |
|
866 return aliasflen[index - 1]; |
|
867 } |
|
868 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index); |
|
869 *fvec = NULL; |
|
870 return 0; |
|
871 } |
|
872 |
|
873 /* parse morph alias definitions */ |
|
874 int HashMgr::parse_aliasm(char * line, FileMgr * af) |
|
875 { |
|
876 if (numaliasm != 0) { |
|
877 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); |
|
878 return 1; |
|
879 } |
|
880 char * tp = line; |
|
881 char * piece; |
|
882 int i = 0; |
|
883 int np = 0; |
|
884 piece = mystrsep(&tp, 0); |
|
885 while (piece) { |
|
886 if (*piece != '\0') { |
|
887 switch(i) { |
|
888 case 0: { np++; break; } |
|
889 case 1: { |
|
890 numaliasm = atoi(piece); |
|
891 if (numaliasm < 1) { |
|
892 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); |
|
893 return 1; |
|
894 } |
|
895 aliasm = (char **) malloc(numaliasm * sizeof(char *)); |
|
896 if (!aliasm) { |
|
897 numaliasm = 0; |
|
898 return 1; |
|
899 } |
|
900 np++; |
|
901 break; |
|
902 } |
|
903 default: break; |
|
904 } |
|
905 i++; |
|
906 } |
|
907 piece = mystrsep(&tp, 0); |
|
908 } |
|
909 if (np != 2) { |
|
910 numaliasm = 0; |
|
911 free(aliasm); |
|
912 aliasm = NULL; |
|
913 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); |
|
914 return 1; |
|
915 } |
|
916 |
|
917 /* now parse the numaliasm lines to read in the remainder of the table */ |
|
918 char * nl = line; |
|
919 for (int j=0; j < numaliasm; j++) { |
|
920 if (!(nl = af->getline())) return 1; |
|
921 mychomp(nl); |
|
922 tp = nl; |
|
923 i = 0; |
|
924 aliasm[j] = NULL; |
|
925 piece = mystrsep(&tp, ' '); |
|
926 while (piece) { |
|
927 if (*piece != '\0') { |
|
928 switch(i) { |
|
929 case 0: { |
|
930 if (strncmp(piece,"AM",2) != 0) { |
|
931 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
|
932 numaliasm = 0; |
|
933 free(aliasm); |
|
934 aliasm = NULL; |
|
935 return 1; |
|
936 } |
|
937 break; |
|
938 } |
|
939 case 1: { |
|
940 // add the remaining of the line |
|
941 if (*tp) { |
|
942 *(tp - 1) = ' '; |
|
943 tp = tp + strlen(tp); |
|
944 } |
|
945 if (complexprefixes) { |
|
946 if (utf8) reverseword_utf(piece); |
|
947 else reverseword(piece); |
|
948 } |
|
949 aliasm[j] = mystrdup(piece); |
|
950 if (!aliasm[j]) { |
|
951 numaliasm = 0; |
|
952 free(aliasm); |
|
953 aliasm = NULL; |
|
954 return 1; |
|
955 } |
|
956 break; } |
|
957 default: break; |
|
958 } |
|
959 i++; |
|
960 } |
|
961 piece = mystrsep(&tp, ' '); |
|
962 } |
|
963 if (!aliasm[j]) { |
|
964 numaliasm = 0; |
|
965 free(aliasm); |
|
966 aliasm = NULL; |
|
967 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); |
|
968 return 1; |
|
969 } |
|
970 } |
|
971 return 0; |
|
972 } |
|
973 |
|
974 int HashMgr::is_aliasm() { |
|
975 return (aliasm != NULL); |
|
976 } |
|
977 |
|
978 char * HashMgr::get_aliasm(int index) { |
|
979 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1]; |
|
980 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); |
|
981 return NULL; |
|
982 } |