extensions/spellcheck/hunspell/src/hashmgr.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 /******* BEGIN LICENSE BLOCK *******
michael@0 2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
michael@0 3 *
michael@0 4 * The contents of this file are subject to the Mozilla Public License Version
michael@0 5 * 1.1 (the "License"); you may not use this file except in compliance with
michael@0 6 * the License. You may obtain a copy of the License at
michael@0 7 * http://www.mozilla.org/MPL/
michael@0 8 *
michael@0 9 * Software distributed under the License is distributed on an "AS IS" basis,
michael@0 10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
michael@0 11 * for the specific language governing rights and limitations under the
michael@0 12 * License.
michael@0 13 *
michael@0 14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
michael@0 15 * and László Németh (Hunspell). Portions created by the Initial Developers
michael@0 16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
michael@0 17 *
michael@0 18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
michael@0 19 * David Einstein (deinst@world.std.com)
michael@0 20 * László Németh (nemethl@gyorsposta.hu)
michael@0 21 * Caolan McNamara (caolanm@redhat.com)
michael@0 22 * Davide Prina
michael@0 23 * Giuseppe Modugno
michael@0 24 * Gianluca Turconi
michael@0 25 * Simon Brouwer
michael@0 26 * Noll Janos
michael@0 27 * Biro Arpad
michael@0 28 * Goldman Eleonora
michael@0 29 * Sarlos Tamas
michael@0 30 * Bencsath Boldizsar
michael@0 31 * Halacsy Peter
michael@0 32 * Dvornik Laszlo
michael@0 33 * Gefferth Andras
michael@0 34 * Nagy Viktor
michael@0 35 * Varga Daniel
michael@0 36 * Chris Halls
michael@0 37 * Rene Engelhard
michael@0 38 * Bram Moolenaar
michael@0 39 * Dafydd Jones
michael@0 40 * Harri Pitkanen
michael@0 41 * Andras Timar
michael@0 42 * Tor Lillqvist
michael@0 43 *
michael@0 44 * Alternatively, the contents of this file may be used under the terms of
michael@0 45 * either the GNU General Public License Version 2 or later (the "GPL"), or
michael@0 46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
michael@0 47 * in which case the provisions of the GPL or the LGPL are applicable instead
michael@0 48 * of those above. If you wish to allow use of your version of this file only
michael@0 49 * under the terms of either the GPL or the LGPL, and not to allow others to
michael@0 50 * use your version of this file under the terms of the MPL, indicate your
michael@0 51 * decision by deleting the provisions above and replace them with the notice
michael@0 52 * and other provisions required by the GPL or the LGPL. If you do not delete
michael@0 53 * the provisions above, a recipient may use your version of this file under
michael@0 54 * the terms of any one of the MPL, the GPL or the LGPL.
michael@0 55 *
michael@0 56 ******* END LICENSE BLOCK *******/
michael@0 57
michael@0 58 #include <stdlib.h>
michael@0 59 #include <string.h>
michael@0 60 #include <stdio.h>
michael@0 61 #include <ctype.h>
michael@0 62
michael@0 63 #include "hashmgr.hxx"
michael@0 64 #include "csutil.hxx"
michael@0 65 #include "atypes.hxx"
michael@0 66
michael@0 67 // build a hash table from a munched word list
michael@0 68
michael@0 69 HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
michael@0 70 {
michael@0 71 tablesize = 0;
michael@0 72 tableptr = NULL;
michael@0 73 flag_mode = FLAG_CHAR;
michael@0 74 complexprefixes = 0;
michael@0 75 utf8 = 0;
michael@0 76 langnum = 0;
michael@0 77 lang = NULL;
michael@0 78 enc = NULL;
michael@0 79 csconv = 0;
michael@0 80 ignorechars = NULL;
michael@0 81 ignorechars_utf16 = NULL;
michael@0 82 ignorechars_utf16_len = 0;
michael@0 83 numaliasf = 0;
michael@0 84 aliasf = NULL;
michael@0 85 numaliasm = 0;
michael@0 86 aliasm = NULL;
michael@0 87 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
michael@0 88 load_config(apath, key);
michael@0 89 int ec = load_tables(tpath, key);
michael@0 90 if (ec) {
michael@0 91 /* error condition - what should we do here */
michael@0 92 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
michael@0 93 if (tableptr) {
michael@0 94 free(tableptr);
michael@0 95 tableptr = NULL;
michael@0 96 }
michael@0 97 tablesize = 0;
michael@0 98 }
michael@0 99 }
michael@0 100
michael@0 101
michael@0 102 HashMgr::~HashMgr()
michael@0 103 {
michael@0 104 if (tableptr) {
michael@0 105 // now pass through hash table freeing up everything
michael@0 106 // go through column by column of the table
michael@0 107 for (int i=0; i < tablesize; i++) {
michael@0 108 struct hentry * pt = tableptr[i];
michael@0 109 struct hentry * nt = NULL;
michael@0 110 while(pt) {
michael@0 111 nt = pt->next;
michael@0 112 if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr);
michael@0 113 free(pt);
michael@0 114 pt = nt;
michael@0 115 }
michael@0 116 }
michael@0 117 free(tableptr);
michael@0 118 }
michael@0 119 tablesize = 0;
michael@0 120
michael@0 121 if (aliasf) {
michael@0 122 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
michael@0 123 free(aliasf);
michael@0 124 aliasf = NULL;
michael@0 125 if (aliasflen) {
michael@0 126 free(aliasflen);
michael@0 127 aliasflen = NULL;
michael@0 128 }
michael@0 129 }
michael@0 130 if (aliasm) {
michael@0 131 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
michael@0 132 free(aliasm);
michael@0 133 aliasm = NULL;
michael@0 134 }
michael@0 135
michael@0 136 #ifndef OPENOFFICEORG
michael@0 137 #ifndef MOZILLA_CLIENT
michael@0 138 if (utf8) free_utf_tbl();
michael@0 139 #endif
michael@0 140 #endif
michael@0 141
michael@0 142 if (enc) free(enc);
michael@0 143 if (lang) free(lang);
michael@0 144
michael@0 145 if (ignorechars) free(ignorechars);
michael@0 146 if (ignorechars_utf16) free(ignorechars_utf16);
michael@0 147
michael@0 148 #ifdef MOZILLA_CLIENT
michael@0 149 delete [] csconv;
michael@0 150 #endif
michael@0 151 }
michael@0 152
michael@0 153 // lookup a root word in the hashtable
michael@0 154
michael@0 155 struct hentry * HashMgr::lookup(const char *word) const
michael@0 156 {
michael@0 157 struct hentry * dp;
michael@0 158 if (tableptr) {
michael@0 159 dp = tableptr[hash(word)];
michael@0 160 if (!dp) return NULL;
michael@0 161 for ( ; dp != NULL; dp = dp->next) {
michael@0 162 if (strcmp(word, dp->word) == 0) return dp;
michael@0 163 }
michael@0 164 }
michael@0 165 return NULL;
michael@0 166 }
michael@0 167
michael@0 168 // add a word to the hash table (private)
michael@0 169 int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
michael@0 170 int al, const char * desc, bool onlyupcase)
michael@0 171 {
michael@0 172 bool upcasehomonym = false;
michael@0 173 int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
michael@0 174 // variable-length hash record with word and optional fields
michael@0 175 struct hentry* hp =
michael@0 176 (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
michael@0 177 if (!hp) return 1;
michael@0 178 char * hpw = hp->word;
michael@0 179 strcpy(hpw, word);
michael@0 180 if (ignorechars != NULL) {
michael@0 181 if (utf8) {
michael@0 182 remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
michael@0 183 } else {
michael@0 184 remove_ignored_chars(hpw, ignorechars);
michael@0 185 }
michael@0 186 }
michael@0 187 if (complexprefixes) {
michael@0 188 if (utf8) reverseword_utf(hpw); else reverseword(hpw);
michael@0 189 }
michael@0 190
michael@0 191 int i = hash(hpw);
michael@0 192
michael@0 193 hp->blen = (unsigned char) wbl;
michael@0 194 hp->clen = (unsigned char) wcl;
michael@0 195 hp->alen = (short) al;
michael@0 196 hp->astr = aff;
michael@0 197 hp->next = NULL;
michael@0 198 hp->next_homonym = NULL;
michael@0 199
michael@0 200 // store the description string or its pointer
michael@0 201 if (desc) {
michael@0 202 hp->var = H_OPT;
michael@0 203 if (aliasm) {
michael@0 204 hp->var += H_OPT_ALIASM;
michael@0 205 store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
michael@0 206 } else {
michael@0 207 strcpy(hpw + wbl + 1, desc);
michael@0 208 if (complexprefixes) {
michael@0 209 if (utf8) reverseword_utf(HENTRY_DATA(hp));
michael@0 210 else reverseword(HENTRY_DATA(hp));
michael@0 211 }
michael@0 212 }
michael@0 213 if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
michael@0 214 } else hp->var = 0;
michael@0 215
michael@0 216 struct hentry * dp = tableptr[i];
michael@0 217 if (!dp) {
michael@0 218 tableptr[i] = hp;
michael@0 219 return 0;
michael@0 220 }
michael@0 221 while (dp->next != NULL) {
michael@0 222 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
michael@0 223 // remove hidden onlyupcase homonym
michael@0 224 if (!onlyupcase) {
michael@0 225 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
michael@0 226 free(dp->astr);
michael@0 227 dp->astr = hp->astr;
michael@0 228 dp->alen = hp->alen;
michael@0 229 free(hp);
michael@0 230 return 0;
michael@0 231 } else {
michael@0 232 dp->next_homonym = hp;
michael@0 233 }
michael@0 234 } else {
michael@0 235 upcasehomonym = true;
michael@0 236 }
michael@0 237 }
michael@0 238 dp=dp->next;
michael@0 239 }
michael@0 240 if (strcmp(hp->word, dp->word) == 0) {
michael@0 241 // remove hidden onlyupcase homonym
michael@0 242 if (!onlyupcase) {
michael@0 243 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
michael@0 244 free(dp->astr);
michael@0 245 dp->astr = hp->astr;
michael@0 246 dp->alen = hp->alen;
michael@0 247 free(hp);
michael@0 248 return 0;
michael@0 249 } else {
michael@0 250 dp->next_homonym = hp;
michael@0 251 }
michael@0 252 } else {
michael@0 253 upcasehomonym = true;
michael@0 254 }
michael@0 255 }
michael@0 256 if (!upcasehomonym) {
michael@0 257 dp->next = hp;
michael@0 258 } else {
michael@0 259 // remove hidden onlyupcase homonym
michael@0 260 if (hp->astr) free(hp->astr);
michael@0 261 free(hp);
michael@0 262 }
michael@0 263 return 0;
michael@0 264 }
michael@0 265
michael@0 266 int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
michael@0 267 unsigned short * flags, int al, char * dp, int captype)
michael@0 268 {
michael@0 269 // add inner capitalized forms to handle the following allcap forms:
michael@0 270 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
michael@0 271 // Allcaps with suffixes: CIA's -> CIA'S
michael@0 272 if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
michael@0 273 ((captype == ALLCAP) && (flags != NULL))) &&
michael@0 274 !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
michael@0 275 unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1));
michael@0 276 if (!flags2) return 1;
michael@0 277 if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
michael@0 278 flags2[al] = ONLYUPCASEFLAG;
michael@0 279 if (utf8) {
michael@0 280 char st[BUFSIZE];
michael@0 281 w_char w[BUFSIZE];
michael@0 282 int wlen = u8_u16(w, BUFSIZE, word);
michael@0 283 mkallsmall_utf(w, wlen, langnum);
michael@0 284 mkallcap_utf(w, 1, langnum);
michael@0 285 u16_u8(st, BUFSIZE, w, wlen);
michael@0 286 return add_word(st,wbl,wcl,flags2,al+1,dp, true);
michael@0 287 } else {
michael@0 288 mkallsmall(word, csconv);
michael@0 289 mkinitcap(word, csconv);
michael@0 290 return add_word(word,wbl,wcl,flags2,al+1,dp, true);
michael@0 291 }
michael@0 292 }
michael@0 293 return 0;
michael@0 294 }
michael@0 295
michael@0 296 // detect captype and modify word length for UTF-8 encoding
michael@0 297 int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
michael@0 298 int len;
michael@0 299 if (utf8) {
michael@0 300 w_char dest_utf[BUFSIZE];
michael@0 301 len = u8_u16(dest_utf, BUFSIZE, word);
michael@0 302 *captype = get_captype_utf8(dest_utf, len, langnum);
michael@0 303 } else {
michael@0 304 len = wbl;
michael@0 305 *captype = get_captype((char *) word, len, csconv);
michael@0 306 }
michael@0 307 return len;
michael@0 308 }
michael@0 309
michael@0 310 // remove word (personal dictionary function for standalone applications)
michael@0 311 int HashMgr::remove(const char * word)
michael@0 312 {
michael@0 313 struct hentry * dp = lookup(word);
michael@0 314 while (dp) {
michael@0 315 if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
michael@0 316 unsigned short * flags =
michael@0 317 (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
michael@0 318 if (!flags) return 1;
michael@0 319 for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
michael@0 320 flags[dp->alen] = forbiddenword;
michael@0 321 dp->astr = flags;
michael@0 322 dp->alen++;
michael@0 323 flag_qsort(flags, 0, dp->alen);
michael@0 324 }
michael@0 325 dp = dp->next_homonym;
michael@0 326 }
michael@0 327 return 0;
michael@0 328 }
michael@0 329
michael@0 330 /* remove forbidden flag to add a personal word to the hash */
michael@0 331 int HashMgr::remove_forbidden_flag(const char * word) {
michael@0 332 struct hentry * dp = lookup(word);
michael@0 333 if (!dp) return 1;
michael@0 334 while (dp) {
michael@0 335 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
michael@0 336 if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
michael@0 337 else {
michael@0 338 unsigned short * flags2 =
michael@0 339 (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
michael@0 340 if (!flags2) return 1;
michael@0 341 int i, j = 0;
michael@0 342 for (i = 0; i < dp->alen; i++) {
michael@0 343 if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
michael@0 344 }
michael@0 345 dp->alen--;
michael@0 346 dp->astr = flags2; // XXX allowed forbidden words
michael@0 347 }
michael@0 348 }
michael@0 349 dp = dp->next_homonym;
michael@0 350 }
michael@0 351 return 0;
michael@0 352 }
michael@0 353
michael@0 354 // add a custom dic. word to the hash table (public)
michael@0 355 int HashMgr::add(const char * word)
michael@0 356 {
michael@0 357 unsigned short * flags = NULL;
michael@0 358 int al = 0;
michael@0 359 if (remove_forbidden_flag(word)) {
michael@0 360 int captype;
michael@0 361 int wbl = strlen(word);
michael@0 362 int wcl = get_clen_and_captype(word, wbl, &captype);
michael@0 363 add_word(word, wbl, wcl, flags, al, NULL, false);
michael@0 364 return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype);
michael@0 365 }
michael@0 366 return 0;
michael@0 367 }
michael@0 368
michael@0 369 int HashMgr::add_with_affix(const char * word, const char * example)
michael@0 370 {
michael@0 371 // detect captype and modify word length for UTF-8 encoding
michael@0 372 struct hentry * dp = lookup(example);
michael@0 373 remove_forbidden_flag(word);
michael@0 374 if (dp && dp->astr) {
michael@0 375 int captype;
michael@0 376 int wbl = strlen(word);
michael@0 377 int wcl = get_clen_and_captype(word, wbl, &captype);
michael@0 378 if (aliasf) {
michael@0 379 add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);
michael@0 380 } else {
michael@0 381 unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short));
michael@0 382 if (flags) {
michael@0 383 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
michael@0 384 add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
michael@0 385 } else return 1;
michael@0 386 }
michael@0 387 return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype);
michael@0 388 }
michael@0 389 return 1;
michael@0 390 }
michael@0 391
michael@0 392 // walk the hash table entry by entry - null at end
michael@0 393 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
michael@0 394 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
michael@0 395 {
michael@0 396 if (hp && hp->next != NULL) return hp->next;
michael@0 397 for (col++; col < tablesize; col++) {
michael@0 398 if (tableptr[col]) return tableptr[col];
michael@0 399 }
michael@0 400 // null at end and reset to start
michael@0 401 col = -1;
michael@0 402 return NULL;
michael@0 403 }
michael@0 404
michael@0 405 // load a munched word list and build a hash table on the fly
michael@0 406 int HashMgr::load_tables(const char * tpath, const char * key)
michael@0 407 {
michael@0 408 int al;
michael@0 409 char * ap;
michael@0 410 char * dp;
michael@0 411 char * dp2;
michael@0 412 unsigned short * flags;
michael@0 413 char * ts;
michael@0 414
michael@0 415 // open dictionary file
michael@0 416 FileMgr * dict = new FileMgr(tpath, key);
michael@0 417 if (dict == NULL) return 1;
michael@0 418
michael@0 419 // first read the first line of file to get hash table size */
michael@0 420 if (!(ts = dict->getline())) {
michael@0 421 HUNSPELL_WARNING(stderr, "error: empty dic file\n");
michael@0 422 delete dict;
michael@0 423 return 2;
michael@0 424 }
michael@0 425 mychomp(ts);
michael@0 426
michael@0 427 /* remove byte order mark */
michael@0 428 if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
michael@0 429 memmove(ts, ts+3, strlen(ts+3)+1);
michael@0 430 // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
michael@0 431 }
michael@0 432
michael@0 433 tablesize = atoi(ts);
michael@0 434 if (tablesize == 0) {
michael@0 435 HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n");
michael@0 436 delete dict;
michael@0 437 return 4;
michael@0 438 }
michael@0 439 tablesize = tablesize + 5 + USERWORD;
michael@0 440 if ((tablesize %2) == 0) tablesize++;
michael@0 441
michael@0 442 // allocate the hash table
michael@0 443 tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
michael@0 444 if (! tableptr) {
michael@0 445 delete dict;
michael@0 446 return 3;
michael@0 447 }
michael@0 448 for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
michael@0 449
michael@0 450 // loop through all words on much list and add to hash
michael@0 451 // table and create word and affix strings
michael@0 452
michael@0 453 while ((ts = dict->getline())) {
michael@0 454 mychomp(ts);
michael@0 455 // split each line into word and morphological description
michael@0 456 dp = ts;
michael@0 457 while ((dp = strchr(dp, ':'))) {
michael@0 458 if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
michael@0 459 for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
michael@0 460 if (dp < ts) { // missing word
michael@0 461 dp = NULL;
michael@0 462 } else {
michael@0 463 *(dp + 1) = '\0';
michael@0 464 dp = dp + 2;
michael@0 465 }
michael@0 466 break;
michael@0 467 }
michael@0 468 dp++;
michael@0 469 }
michael@0 470
michael@0 471 // tabulator is the old morphological field separator
michael@0 472 dp2 = strchr(ts, '\t');
michael@0 473 if (dp2 && (!dp || dp2 < dp)) {
michael@0 474 *dp2 = '\0';
michael@0 475 dp = dp2 + 1;
michael@0 476 }
michael@0 477
michael@0 478 // split each line into word and affix char strings
michael@0 479 // "\/" signs slash in words (not affix separator)
michael@0 480 // "/" at beginning of the line is word character (not affix separator)
michael@0 481 ap = strchr(ts,'/');
michael@0 482 while (ap) {
michael@0 483 if (ap == ts) {
michael@0 484 ap++;
michael@0 485 continue;
michael@0 486 } else if (*(ap - 1) != '\\') break;
michael@0 487 // replace "\/" with "/"
michael@0 488 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
michael@0 489 ap = strchr(ap,'/');
michael@0 490 }
michael@0 491
michael@0 492 if (ap) {
michael@0 493 *ap = '\0';
michael@0 494 if (aliasf) {
michael@0 495 int index = atoi(ap + 1);
michael@0 496 al = get_aliasf(index, &flags, dict);
michael@0 497 if (!al) {
michael@0 498 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
michael@0 499 *ap = '\0';
michael@0 500 }
michael@0 501 } else {
michael@0 502 al = decode_flags(&flags, ap + 1, dict);
michael@0 503 if (al == -1) {
michael@0 504 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
michael@0 505 delete dict;
michael@0 506 return 6;
michael@0 507 }
michael@0 508 flag_qsort(flags, 0, al);
michael@0 509 }
michael@0 510 } else {
michael@0 511 al = 0;
michael@0 512 ap = NULL;
michael@0 513 flags = NULL;
michael@0 514 }
michael@0 515
michael@0 516 int captype;
michael@0 517 int wbl = strlen(ts);
michael@0 518 int wcl = get_clen_and_captype(ts, wbl, &captype);
michael@0 519 // add the word and its index plus its capitalized form optionally
michael@0 520 if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
michael@0 521 add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
michael@0 522 delete dict;
michael@0 523 return 5;
michael@0 524 }
michael@0 525 }
michael@0 526
michael@0 527 delete dict;
michael@0 528 return 0;
michael@0 529 }
michael@0 530
michael@0 531 // the hash function is a simple load and rotate
michael@0 532 // algorithm borrowed
michael@0 533
michael@0 534 int HashMgr::hash(const char * word) const
michael@0 535 {
michael@0 536 long hv = 0;
michael@0 537 for (int i=0; i < 4 && *word != 0; i++)
michael@0 538 hv = (hv << 8) | (*word++);
michael@0 539 while (*word != 0) {
michael@0 540 ROTATE(hv,ROTATE_LEN);
michael@0 541 hv ^= (*word++);
michael@0 542 }
michael@0 543 return (unsigned long) hv % tablesize;
michael@0 544 }
michael@0 545
michael@0 546 int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
michael@0 547 int len;
michael@0 548 if (*flags == '\0') {
michael@0 549 *result = NULL;
michael@0 550 return 0;
michael@0 551 }
michael@0 552 switch (flag_mode) {
michael@0 553 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
michael@0 554 len = strlen(flags);
michael@0 555 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum());
michael@0 556 len /= 2;
michael@0 557 *result = (unsigned short *) malloc(len * sizeof(short));
michael@0 558 if (!*result) return -1;
michael@0 559 for (int i = 0; i < len; i++) {
michael@0 560 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
michael@0 561 }
michael@0 562 break;
michael@0 563 }
michael@0 564 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
michael@0 565 int i;
michael@0 566 len = 1;
michael@0 567 char * src = flags;
michael@0 568 unsigned short * dest;
michael@0 569 char * p;
michael@0 570 for (p = flags; *p; p++) {
michael@0 571 if (*p == ',') len++;
michael@0 572 }
michael@0 573 *result = (unsigned short *) malloc(len * sizeof(short));
michael@0 574 if (!*result) return -1;
michael@0 575 dest = *result;
michael@0 576 for (p = flags; *p; p++) {
michael@0 577 if (*p == ',') {
michael@0 578 i = atoi(src);
michael@0 579 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
michael@0 580 af->getlinenum(), i, DEFAULTFLAGS - 1);
michael@0 581 *dest = (unsigned short) i;
michael@0 582 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
michael@0 583 src = p + 1;
michael@0 584 dest++;
michael@0 585 }
michael@0 586 }
michael@0 587 i = atoi(src);
michael@0 588 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
michael@0 589 af->getlinenum(), i, DEFAULTFLAGS - 1);
michael@0 590 *dest = (unsigned short) i;
michael@0 591 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
michael@0 592 break;
michael@0 593 }
michael@0 594 case FLAG_UNI: { // UTF-8 characters
michael@0 595 w_char w[BUFSIZE/2];
michael@0 596 len = u8_u16(w, BUFSIZE/2, flags);
michael@0 597 *result = (unsigned short *) malloc(len * sizeof(short));
michael@0 598 if (!*result) return -1;
michael@0 599 memcpy(*result, w, len * sizeof(short));
michael@0 600 break;
michael@0 601 }
michael@0 602 default: { // Ispell's one-character flags (erfg -> e r f g)
michael@0 603 unsigned short * dest;
michael@0 604 len = strlen(flags);
michael@0 605 *result = (unsigned short *) malloc(len * sizeof(short));
michael@0 606 if (!*result) return -1;
michael@0 607 dest = *result;
michael@0 608 for (unsigned char * p = (unsigned char *) flags; *p; p++) {
michael@0 609 *dest = (unsigned short) *p;
michael@0 610 dest++;
michael@0 611 }
michael@0 612 }
michael@0 613 }
michael@0 614 return len;
michael@0 615 }
michael@0 616
michael@0 617 unsigned short HashMgr::decode_flag(const char * f) {
michael@0 618 unsigned short s = 0;
michael@0 619 int i;
michael@0 620 switch (flag_mode) {
michael@0 621 case FLAG_LONG:
michael@0 622 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
michael@0 623 break;
michael@0 624 case FLAG_NUM:
michael@0 625 i = atoi(f);
michael@0 626 if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1);
michael@0 627 s = (unsigned short) i;
michael@0 628 break;
michael@0 629 case FLAG_UNI:
michael@0 630 u8_u16((w_char *) &s, 1, f);
michael@0 631 break;
michael@0 632 default:
michael@0 633 s = (unsigned short) *((unsigned char *)f);
michael@0 634 }
michael@0 635 if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
michael@0 636 return s;
michael@0 637 }
michael@0 638
michael@0 639 char * HashMgr::encode_flag(unsigned short f) {
michael@0 640 unsigned char ch[10];
michael@0 641 if (f==0) return mystrdup("(NULL)");
michael@0 642 if (flag_mode == FLAG_LONG) {
michael@0 643 ch[0] = (unsigned char) (f >> 8);
michael@0 644 ch[1] = (unsigned char) (f - ((f >> 8) << 8));
michael@0 645 ch[2] = '\0';
michael@0 646 } else if (flag_mode == FLAG_NUM) {
michael@0 647 sprintf((char *) ch, "%d", f);
michael@0 648 } else if (flag_mode == FLAG_UNI) {
michael@0 649 u16_u8((char *) &ch, 10, (w_char *) &f, 1);
michael@0 650 } else {
michael@0 651 ch[0] = (unsigned char) (f);
michael@0 652 ch[1] = '\0';
michael@0 653 }
michael@0 654 return mystrdup((char *) ch);
michael@0 655 }
michael@0 656
michael@0 657 // read in aff file and set flag mode
michael@0 658 int HashMgr::load_config(const char * affpath, const char * key)
michael@0 659 {
michael@0 660 char * line; // io buffers
michael@0 661 int firstline = 1;
michael@0 662
michael@0 663 // open the affix file
michael@0 664 FileMgr * afflst = new FileMgr(affpath, key);
michael@0 665 if (!afflst) {
michael@0 666 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
michael@0 667 return 1;
michael@0 668 }
michael@0 669
michael@0 670 // read in each line ignoring any that do not
michael@0 671 // start with a known line type indicator
michael@0 672
michael@0 673 while ((line = afflst->getline())) {
michael@0 674 mychomp(line);
michael@0 675
michael@0 676 /* remove byte order mark */
michael@0 677 if (firstline) {
michael@0 678 firstline = 0;
michael@0 679 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1);
michael@0 680 }
michael@0 681
michael@0 682 /* parse in the try string */
michael@0 683 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
michael@0 684 if (flag_mode != FLAG_CHAR) {
michael@0 685 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum());
michael@0 686 }
michael@0 687 if (strstr(line, "long")) flag_mode = FLAG_LONG;
michael@0 688 if (strstr(line, "num")) flag_mode = FLAG_NUM;
michael@0 689 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
michael@0 690 if (flag_mode == FLAG_CHAR) {
michael@0 691 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum());
michael@0 692 }
michael@0 693 }
michael@0 694 if (strncmp(line,"FORBIDDENWORD",13) == 0) {
michael@0 695 char * st = NULL;
michael@0 696 if (parse_string(line, &st, afflst->getlinenum())) {
michael@0 697 delete afflst;
michael@0 698 return 1;
michael@0 699 }
michael@0 700 forbiddenword = decode_flag(st);
michael@0 701 free(st);
michael@0 702 }
michael@0 703 if (strncmp(line, "SET", 3) == 0) {
michael@0 704 if (parse_string(line, &enc, afflst->getlinenum())) {
michael@0 705 delete afflst;
michael@0 706 return 1;
michael@0 707 }
michael@0 708 if (strcmp(enc, "UTF-8") == 0) {
michael@0 709 utf8 = 1;
michael@0 710 #ifndef OPENOFFICEORG
michael@0 711 #ifndef MOZILLA_CLIENT
michael@0 712 initialize_utf_tbl();
michael@0 713 #endif
michael@0 714 #endif
michael@0 715 } else csconv = get_current_cs(enc);
michael@0 716 }
michael@0 717 if (strncmp(line, "LANG", 4) == 0) {
michael@0 718 if (parse_string(line, &lang, afflst->getlinenum())) {
michael@0 719 delete afflst;
michael@0 720 return 1;
michael@0 721 }
michael@0 722 langnum = get_lang_num(lang);
michael@0 723 }
michael@0 724
michael@0 725 /* parse in the ignored characters (for example, Arabic optional diacritics characters */
michael@0 726 if (strncmp(line,"IGNORE",6) == 0) {
michael@0 727 if (parse_array(line, &ignorechars, &ignorechars_utf16,
michael@0 728 &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
michael@0 729 delete afflst;
michael@0 730 return 1;
michael@0 731 }
michael@0 732 }
michael@0 733
michael@0 734 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
michael@0 735 if (parse_aliasf(line, afflst)) {
michael@0 736 delete afflst;
michael@0 737 return 1;
michael@0 738 }
michael@0 739 }
michael@0 740
michael@0 741 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
michael@0 742 if (parse_aliasm(line, afflst)) {
michael@0 743 delete afflst;
michael@0 744 return 1;
michael@0 745 }
michael@0 746 }
michael@0 747
michael@0 748 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
michael@0 749 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
michael@0 750 }
michael@0 751 if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
michael@0 752 delete afflst;
michael@0 753 return 0;
michael@0 754 }
michael@0 755
michael@0 756 /* parse in the ALIAS table */
michael@0 757 int HashMgr::parse_aliasf(char * line, FileMgr * af)
michael@0 758 {
michael@0 759 if (numaliasf != 0) {
michael@0 760 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
michael@0 761 return 1;
michael@0 762 }
michael@0 763 char * tp = line;
michael@0 764 char * piece;
michael@0 765 int i = 0;
michael@0 766 int np = 0;
michael@0 767 piece = mystrsep(&tp, 0);
michael@0 768 while (piece) {
michael@0 769 if (*piece != '\0') {
michael@0 770 switch(i) {
michael@0 771 case 0: { np++; break; }
michael@0 772 case 1: {
michael@0 773 numaliasf = atoi(piece);
michael@0 774 if (numaliasf < 1) {
michael@0 775 numaliasf = 0;
michael@0 776 aliasf = NULL;
michael@0 777 aliasflen = NULL;
michael@0 778 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
michael@0 779 return 1;
michael@0 780 }
michael@0 781 aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
michael@0 782 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
michael@0 783 if (!aliasf || !aliasflen) {
michael@0 784 numaliasf = 0;
michael@0 785 if (aliasf) free(aliasf);
michael@0 786 if (aliasflen) free(aliasflen);
michael@0 787 aliasf = NULL;
michael@0 788 aliasflen = NULL;
michael@0 789 return 1;
michael@0 790 }
michael@0 791 np++;
michael@0 792 break;
michael@0 793 }
michael@0 794 default: break;
michael@0 795 }
michael@0 796 i++;
michael@0 797 }
michael@0 798 piece = mystrsep(&tp, 0);
michael@0 799 }
michael@0 800 if (np != 2) {
michael@0 801 numaliasf = 0;
michael@0 802 free(aliasf);
michael@0 803 free(aliasflen);
michael@0 804 aliasf = NULL;
michael@0 805 aliasflen = NULL;
michael@0 806 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
michael@0 807 return 1;
michael@0 808 }
michael@0 809
michael@0 810 /* now parse the numaliasf lines to read in the remainder of the table */
michael@0 811 char * nl;
michael@0 812 for (int j=0; j < numaliasf; j++) {
michael@0 813 if (!(nl = af->getline())) return 1;
michael@0 814 mychomp(nl);
michael@0 815 tp = nl;
michael@0 816 i = 0;
michael@0 817 aliasf[j] = NULL;
michael@0 818 aliasflen[j] = 0;
michael@0 819 piece = mystrsep(&tp, 0);
michael@0 820 while (piece) {
michael@0 821 if (*piece != '\0') {
michael@0 822 switch(i) {
michael@0 823 case 0: {
michael@0 824 if (strncmp(piece,"AF",2) != 0) {
michael@0 825 numaliasf = 0;
michael@0 826 free(aliasf);
michael@0 827 free(aliasflen);
michael@0 828 aliasf = NULL;
michael@0 829 aliasflen = NULL;
michael@0 830 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0 831 return 1;
michael@0 832 }
michael@0 833 break;
michael@0 834 }
michael@0 835 case 1: {
michael@0 836 aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af);
michael@0 837 flag_qsort(aliasf[j], 0, aliasflen[j]);
michael@0 838 break;
michael@0 839 }
michael@0 840 default: break;
michael@0 841 }
michael@0 842 i++;
michael@0 843 }
michael@0 844 piece = mystrsep(&tp, 0);
michael@0 845 }
michael@0 846 if (!aliasf[j]) {
michael@0 847 free(aliasf);
michael@0 848 free(aliasflen);
michael@0 849 aliasf = NULL;
michael@0 850 aliasflen = NULL;
michael@0 851 numaliasf = 0;
michael@0 852 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0 853 return 1;
michael@0 854 }
michael@0 855 }
michael@0 856 return 0;
michael@0 857 }
michael@0 858
michael@0 859 int HashMgr::is_aliasf() {
michael@0 860 return (aliasf != NULL);
michael@0 861 }
michael@0 862
michael@0 863 int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
michael@0 864 if ((index > 0) && (index <= numaliasf)) {
michael@0 865 *fvec = aliasf[index - 1];
michael@0 866 return aliasflen[index - 1];
michael@0 867 }
michael@0 868 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index);
michael@0 869 *fvec = NULL;
michael@0 870 return 0;
michael@0 871 }
michael@0 872
michael@0 873 /* parse morph alias definitions */
michael@0 874 int HashMgr::parse_aliasm(char * line, FileMgr * af)
michael@0 875 {
michael@0 876 if (numaliasm != 0) {
michael@0 877 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
michael@0 878 return 1;
michael@0 879 }
michael@0 880 char * tp = line;
michael@0 881 char * piece;
michael@0 882 int i = 0;
michael@0 883 int np = 0;
michael@0 884 piece = mystrsep(&tp, 0);
michael@0 885 while (piece) {
michael@0 886 if (*piece != '\0') {
michael@0 887 switch(i) {
michael@0 888 case 0: { np++; break; }
michael@0 889 case 1: {
michael@0 890 numaliasm = atoi(piece);
michael@0 891 if (numaliasm < 1) {
michael@0 892 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
michael@0 893 return 1;
michael@0 894 }
michael@0 895 aliasm = (char **) malloc(numaliasm * sizeof(char *));
michael@0 896 if (!aliasm) {
michael@0 897 numaliasm = 0;
michael@0 898 return 1;
michael@0 899 }
michael@0 900 np++;
michael@0 901 break;
michael@0 902 }
michael@0 903 default: break;
michael@0 904 }
michael@0 905 i++;
michael@0 906 }
michael@0 907 piece = mystrsep(&tp, 0);
michael@0 908 }
michael@0 909 if (np != 2) {
michael@0 910 numaliasm = 0;
michael@0 911 free(aliasm);
michael@0 912 aliasm = NULL;
michael@0 913 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
michael@0 914 return 1;
michael@0 915 }
michael@0 916
michael@0 917 /* now parse the numaliasm lines to read in the remainder of the table */
michael@0 918 char * nl = line;
michael@0 919 for (int j=0; j < numaliasm; j++) {
michael@0 920 if (!(nl = af->getline())) return 1;
michael@0 921 mychomp(nl);
michael@0 922 tp = nl;
michael@0 923 i = 0;
michael@0 924 aliasm[j] = NULL;
michael@0 925 piece = mystrsep(&tp, ' ');
michael@0 926 while (piece) {
michael@0 927 if (*piece != '\0') {
michael@0 928 switch(i) {
michael@0 929 case 0: {
michael@0 930 if (strncmp(piece,"AM",2) != 0) {
michael@0 931 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0 932 numaliasm = 0;
michael@0 933 free(aliasm);
michael@0 934 aliasm = NULL;
michael@0 935 return 1;
michael@0 936 }
michael@0 937 break;
michael@0 938 }
michael@0 939 case 1: {
michael@0 940 // add the remaining of the line
michael@0 941 if (*tp) {
michael@0 942 *(tp - 1) = ' ';
michael@0 943 tp = tp + strlen(tp);
michael@0 944 }
michael@0 945 if (complexprefixes) {
michael@0 946 if (utf8) reverseword_utf(piece);
michael@0 947 else reverseword(piece);
michael@0 948 }
michael@0 949 aliasm[j] = mystrdup(piece);
michael@0 950 if (!aliasm[j]) {
michael@0 951 numaliasm = 0;
michael@0 952 free(aliasm);
michael@0 953 aliasm = NULL;
michael@0 954 return 1;
michael@0 955 }
michael@0 956 break; }
michael@0 957 default: break;
michael@0 958 }
michael@0 959 i++;
michael@0 960 }
michael@0 961 piece = mystrsep(&tp, ' ');
michael@0 962 }
michael@0 963 if (!aliasm[j]) {
michael@0 964 numaliasm = 0;
michael@0 965 free(aliasm);
michael@0 966 aliasm = NULL;
michael@0 967 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0 968 return 1;
michael@0 969 }
michael@0 970 }
michael@0 971 return 0;
michael@0 972 }
michael@0 973
michael@0 974 int HashMgr::is_aliasm() {
michael@0 975 return (aliasm != NULL);
michael@0 976 }
michael@0 977
michael@0 978 char * HashMgr::get_aliasm(int index) {
michael@0 979 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
michael@0 980 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
michael@0 981 return NULL;
michael@0 982 }

mercurial