extensions/spellcheck/hunspell/src/hashmgr.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/spellcheck/hunspell/src/hashmgr.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,982 @@
     1.4 +/******* BEGIN LICENSE BLOCK *******
     1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1
     1.6 + * 
     1.7 + * The contents of this file are subject to the Mozilla Public License Version
     1.8 + * 1.1 (the "License"); you may not use this file except in compliance with
     1.9 + * the License. You may obtain a copy of the License at
    1.10 + * http://www.mozilla.org/MPL/
    1.11 + * 
    1.12 + * Software distributed under the License is distributed on an "AS IS" basis,
    1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
    1.14 + * for the specific language governing rights and limitations under the
    1.15 + * License.
    1.16 + * 
    1.17 + * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
    1.18 + * and László Németh (Hunspell). Portions created by the Initial Developers
    1.19 + * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
    1.20 + * 
    1.21 + * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
    1.22 + *                 David Einstein (deinst@world.std.com)
    1.23 + *                 László Németh (nemethl@gyorsposta.hu)
    1.24 + *                 Caolan McNamara (caolanm@redhat.com)
    1.25 + *                 Davide Prina
    1.26 + *                 Giuseppe Modugno
    1.27 + *                 Gianluca Turconi
    1.28 + *                 Simon Brouwer
    1.29 + *                 Noll Janos
    1.30 + *                 Biro Arpad
    1.31 + *                 Goldman Eleonora
    1.32 + *                 Sarlos Tamas
    1.33 + *                 Bencsath Boldizsar
    1.34 + *                 Halacsy Peter
    1.35 + *                 Dvornik Laszlo
    1.36 + *                 Gefferth Andras
    1.37 + *                 Nagy Viktor
    1.38 + *                 Varga Daniel
    1.39 + *                 Chris Halls
    1.40 + *                 Rene Engelhard
    1.41 + *                 Bram Moolenaar
    1.42 + *                 Dafydd Jones
    1.43 + *                 Harri Pitkanen
    1.44 + *                 Andras Timar
    1.45 + *                 Tor Lillqvist
    1.46 + * 
    1.47 + * Alternatively, the contents of this file may be used under the terms of
    1.48 + * either the GNU General Public License Version 2 or later (the "GPL"), or
    1.49 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
    1.50 + * in which case the provisions of the GPL or the LGPL are applicable instead
    1.51 + * of those above. If you wish to allow use of your version of this file only
    1.52 + * under the terms of either the GPL or the LGPL, and not to allow others to
    1.53 + * use your version of this file under the terms of the MPL, indicate your
    1.54 + * decision by deleting the provisions above and replace them with the notice
    1.55 + * and other provisions required by the GPL or the LGPL. If you do not delete
    1.56 + * the provisions above, a recipient may use your version of this file under
    1.57 + * the terms of any one of the MPL, the GPL or the LGPL.
    1.58 + *
    1.59 + ******* END LICENSE BLOCK *******/
    1.60 +
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hashmgr.hxx"
#include "csutil.hxx"
#include "atypes.hxx"
    1.69 +
    1.70 +// build a hash table from a munched word list
    1.71 +
    1.72 +HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
    1.73 +{
    1.74 +  tablesize = 0;
    1.75 +  tableptr = NULL;
    1.76 +  flag_mode = FLAG_CHAR;
    1.77 +  complexprefixes = 0;
    1.78 +  utf8 = 0;
    1.79 +  langnum = 0;
    1.80 +  lang = NULL;
    1.81 +  enc = NULL;
    1.82 +  csconv = 0;
    1.83 +  ignorechars = NULL;
    1.84 +  ignorechars_utf16 = NULL;
    1.85 +  ignorechars_utf16_len = 0;
    1.86 +  numaliasf = 0;
    1.87 +  aliasf = NULL;
    1.88 +  numaliasm = 0;
    1.89 +  aliasm = NULL;
    1.90 +  forbiddenword = FORBIDDENWORD; // forbidden word signing flag
    1.91 +  load_config(apath, key);
    1.92 +  int ec = load_tables(tpath, key);
    1.93 +  if (ec) {
    1.94 +    /* error condition - what should we do here */
    1.95 +    HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
    1.96 +    if (tableptr) {
    1.97 +      free(tableptr);
    1.98 +      tableptr = NULL;
    1.99 +    }
   1.100 +    tablesize = 0;
   1.101 +  }
   1.102 +}
   1.103 +
   1.104 +
   1.105 +HashMgr::~HashMgr()
   1.106 +{
   1.107 +  if (tableptr) {
   1.108 +    // now pass through hash table freeing up everything
   1.109 +    // go through column by column of the table
   1.110 +    for (int i=0; i < tablesize; i++) {
   1.111 +      struct hentry * pt = tableptr[i];
   1.112 +      struct hentry * nt = NULL;
   1.113 +      while(pt) {
   1.114 +        nt = pt->next;
   1.115 +        if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr);
   1.116 +        free(pt);
   1.117 +        pt = nt;
   1.118 +      }
   1.119 +    }
   1.120 +    free(tableptr);
   1.121 +  }
   1.122 +  tablesize = 0;
   1.123 +
   1.124 +  if (aliasf) {
   1.125 +    for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
   1.126 +    free(aliasf);
   1.127 +    aliasf = NULL;
   1.128 +    if (aliasflen) {
   1.129 +      free(aliasflen);
   1.130 +      aliasflen = NULL;
   1.131 +    }
   1.132 +  }
   1.133 +  if (aliasm) {
   1.134 +    for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
   1.135 +    free(aliasm);
   1.136 +    aliasm = NULL;
   1.137 +  }  
   1.138 +
   1.139 +#ifndef OPENOFFICEORG
   1.140 +#ifndef MOZILLA_CLIENT
   1.141 +  if (utf8) free_utf_tbl();
   1.142 +#endif
   1.143 +#endif
   1.144 +
   1.145 +  if (enc) free(enc);
   1.146 +  if (lang) free(lang);
   1.147 +  
   1.148 +  if (ignorechars) free(ignorechars);
   1.149 +  if (ignorechars_utf16) free(ignorechars_utf16);
   1.150 +
   1.151 +#ifdef MOZILLA_CLIENT
   1.152 +    delete [] csconv;
   1.153 +#endif
   1.154 +}
   1.155 +
   1.156 +// lookup a root word in the hashtable
   1.157 +
   1.158 +struct hentry * HashMgr::lookup(const char *word) const
   1.159 +{
   1.160 +    struct hentry * dp;
   1.161 +    if (tableptr) {
   1.162 +       dp = tableptr[hash(word)];
   1.163 +       if (!dp) return NULL;
   1.164 +       for (  ;  dp != NULL;  dp = dp->next) {
   1.165 +          if (strcmp(word, dp->word) == 0) return dp;
   1.166 +       }
   1.167 +    }
   1.168 +    return NULL;
   1.169 +}
   1.170 +
   1.171 +// add a word to the hash table (private)
   1.172 +int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
   1.173 +    int al, const char * desc, bool onlyupcase)
   1.174 +{
   1.175 +    bool upcasehomonym = false;
   1.176 +    int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
   1.177 +    // variable-length hash record with word and optional fields
   1.178 +    struct hentry* hp = 
   1.179 +	(struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
   1.180 +    if (!hp) return 1;
   1.181 +    char * hpw = hp->word;
   1.182 +    strcpy(hpw, word);
   1.183 +    if (ignorechars != NULL) {
   1.184 +      if (utf8) {
   1.185 +        remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
   1.186 +      } else {
   1.187 +        remove_ignored_chars(hpw, ignorechars);
   1.188 +      }
   1.189 +    }
   1.190 +    if (complexprefixes) {
   1.191 +        if (utf8) reverseword_utf(hpw); else reverseword(hpw);
   1.192 +    }
   1.193 +
   1.194 +    int i = hash(hpw);
   1.195 +
   1.196 +    hp->blen = (unsigned char) wbl;
   1.197 +    hp->clen = (unsigned char) wcl;
   1.198 +    hp->alen = (short) al;
   1.199 +    hp->astr = aff;
   1.200 +    hp->next = NULL;      
   1.201 +    hp->next_homonym = NULL;
   1.202 +
   1.203 +    // store the description string or its pointer
   1.204 +    if (desc) {
   1.205 +        hp->var = H_OPT;
   1.206 +        if (aliasm) {
   1.207 +            hp->var += H_OPT_ALIASM;
   1.208 +            store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
   1.209 +        } else {
   1.210 +	    strcpy(hpw + wbl + 1, desc);
   1.211 +            if (complexprefixes) {
   1.212 +                if (utf8) reverseword_utf(HENTRY_DATA(hp));
   1.213 +                else reverseword(HENTRY_DATA(hp));
   1.214 +            }
   1.215 +        }
   1.216 +	if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
   1.217 +    } else hp->var = 0;
   1.218 +
   1.219 +       struct hentry * dp = tableptr[i];
   1.220 +       if (!dp) {
   1.221 +         tableptr[i] = hp;
   1.222 +         return 0;
   1.223 +       }
   1.224 +       while (dp->next != NULL) {
   1.225 +         if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
   1.226 +    	    // remove hidden onlyupcase homonym
   1.227 +            if (!onlyupcase) {
   1.228 +		if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
   1.229 +		    free(dp->astr);
   1.230 +		    dp->astr = hp->astr;
   1.231 +		    dp->alen = hp->alen;
   1.232 +		    free(hp);
   1.233 +		    return 0;
   1.234 +		} else {
   1.235 +    		    dp->next_homonym = hp;
   1.236 +    		}
   1.237 +            } else {
   1.238 +        	upcasehomonym = true;
   1.239 +            }
   1.240 +         }
   1.241 +         dp=dp->next;
   1.242 +       }
   1.243 +       if (strcmp(hp->word, dp->word) == 0) {
   1.244 +    	    // remove hidden onlyupcase homonym
   1.245 +            if (!onlyupcase) {
   1.246 +		if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
   1.247 +		    free(dp->astr);
   1.248 +		    dp->astr = hp->astr;
   1.249 +		    dp->alen = hp->alen;
   1.250 +		    free(hp);
   1.251 +		    return 0;
   1.252 +		} else {
   1.253 +    		    dp->next_homonym = hp;
   1.254 +    		}
   1.255 +            } else {
   1.256 +        	upcasehomonym = true;
   1.257 +            }
   1.258 +       }
   1.259 +       if (!upcasehomonym) {
   1.260 +    	    dp->next = hp;
   1.261 +       } else {
   1.262 +    	    // remove hidden onlyupcase homonym
   1.263 +    	    if (hp->astr) free(hp->astr);
   1.264 +    	    free(hp);
   1.265 +       }
   1.266 +    return 0;
   1.267 +}     
   1.268 +
   1.269 +int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
   1.270 +    unsigned short * flags, int al, char * dp, int captype)
   1.271 +{
   1.272 +    // add inner capitalized forms to handle the following allcap forms:
   1.273 +    // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
   1.274 +    // Allcaps with suffixes: CIA's -> CIA'S    
   1.275 +    if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
   1.276 +      ((captype == ALLCAP) && (flags != NULL))) &&
   1.277 +      !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
   1.278 +          unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1));
   1.279 +	  if (!flags2) return 1;
   1.280 +          if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
   1.281 +          flags2[al] = ONLYUPCASEFLAG;
   1.282 +          if (utf8) {
   1.283 +              char st[BUFSIZE];
   1.284 +              w_char w[BUFSIZE];
   1.285 +              int wlen = u8_u16(w, BUFSIZE, word);
   1.286 +              mkallsmall_utf(w, wlen, langnum);
   1.287 +              mkallcap_utf(w, 1, langnum);
   1.288 +              u16_u8(st, BUFSIZE, w, wlen);
   1.289 +              return add_word(st,wbl,wcl,flags2,al+1,dp, true);
   1.290 +           } else {
   1.291 +               mkallsmall(word, csconv);
   1.292 +               mkinitcap(word, csconv);
   1.293 +               return add_word(word,wbl,wcl,flags2,al+1,dp, true);
   1.294 +           }
   1.295 +    }
   1.296 +    return 0;
   1.297 +}
   1.298 +
   1.299 +// detect captype and modify word length for UTF-8 encoding
   1.300 +int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
   1.301 +    int len;
   1.302 +    if (utf8) {
   1.303 +      w_char dest_utf[BUFSIZE];
   1.304 +      len = u8_u16(dest_utf, BUFSIZE, word);
   1.305 +      *captype = get_captype_utf8(dest_utf, len, langnum);
   1.306 +    } else {
   1.307 +      len = wbl;
   1.308 +      *captype = get_captype((char *) word, len, csconv);
   1.309 +    }
   1.310 +    return len;
   1.311 +}
   1.312 +
   1.313 +// remove word (personal dictionary function for standalone applications)
   1.314 +int HashMgr::remove(const char * word)
   1.315 +{
   1.316 +    struct hentry * dp = lookup(word);
   1.317 +    while (dp) {
   1.318 +        if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
   1.319 +            unsigned short * flags =
   1.320 +                (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
   1.321 +            if (!flags) return 1;
   1.322 +            for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
   1.323 +            flags[dp->alen] = forbiddenword;
   1.324 +            dp->astr = flags;
   1.325 +            dp->alen++;
   1.326 +            flag_qsort(flags, 0, dp->alen);
   1.327 +        }
   1.328 +        dp = dp->next_homonym;
   1.329 +    }
   1.330 +    return 0;
   1.331 +}
   1.332 +
   1.333 +/* remove forbidden flag to add a personal word to the hash */
   1.334 +int HashMgr::remove_forbidden_flag(const char * word) {
   1.335 +    struct hentry * dp = lookup(word);
   1.336 +    if (!dp) return 1;
   1.337 +    while (dp) {
   1.338 +         if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
   1.339 +            if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
   1.340 +            else {
   1.341 +                unsigned short * flags2 =
   1.342 +                    (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
   1.343 +                if (!flags2) return 1;
   1.344 +                int i, j = 0;
   1.345 +                for (i = 0; i < dp->alen; i++) {
   1.346 +                    if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
   1.347 +                }
   1.348 +                dp->alen--;
   1.349 +                dp->astr = flags2; // XXX allowed forbidden words
   1.350 +            }
   1.351 +         }
   1.352 +         dp = dp->next_homonym;
   1.353 +       }
   1.354 +   return 0;
   1.355 +}
   1.356 +
   1.357 +// add a custom dic. word to the hash table (public)
   1.358 +int HashMgr::add(const char * word)
   1.359 +{
   1.360 +    unsigned short * flags = NULL;
   1.361 +    int al = 0;
   1.362 +    if (remove_forbidden_flag(word)) {
   1.363 +        int captype;
   1.364 +        int wbl = strlen(word);
   1.365 +        int wcl = get_clen_and_captype(word, wbl, &captype);
   1.366 +        add_word(word, wbl, wcl, flags, al, NULL, false);
   1.367 +        return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype);
   1.368 +    }
   1.369 +    return 0;
   1.370 +}
   1.371 +
   1.372 +int HashMgr::add_with_affix(const char * word, const char * example)
   1.373 +{
   1.374 +    // detect captype and modify word length for UTF-8 encoding
   1.375 +    struct hentry * dp = lookup(example);
   1.376 +    remove_forbidden_flag(word);
   1.377 +    if (dp && dp->astr) {
   1.378 +        int captype;
   1.379 +        int wbl = strlen(word);
   1.380 +        int wcl = get_clen_and_captype(word, wbl, &captype);
   1.381 +	if (aliasf) {
   1.382 +	    add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);	
   1.383 +	} else {
   1.384 +    	    unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short));
   1.385 +	    if (flags) {
   1.386 +		memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
   1.387 +		add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
   1.388 +	    } else return 1;
   1.389 +	}
   1.390 +    	return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype);
   1.391 +    }
   1.392 +    return 1;
   1.393 +}
   1.394 +
   1.395 +// walk the hash table entry by entry - null at end
   1.396 +// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
   1.397 +struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
   1.398 +{  
   1.399 +  if (hp && hp->next != NULL) return hp->next;
   1.400 +  for (col++; col < tablesize; col++) {
   1.401 +    if (tableptr[col]) return tableptr[col];
   1.402 +  }
   1.403 +  // null at end and reset to start
   1.404 +  col = -1;
   1.405 +  return NULL;
   1.406 +}
   1.407 +
   1.408 +// load a munched word list and build a hash table on the fly
   1.409 +int HashMgr::load_tables(const char * tpath, const char * key)
   1.410 +{
   1.411 +  int al;
   1.412 +  char * ap;
   1.413 +  char * dp;
   1.414 +  char * dp2;
   1.415 +  unsigned short * flags;
   1.416 +  char * ts;
   1.417 +
   1.418 +  // open dictionary file
   1.419 +  FileMgr * dict = new FileMgr(tpath, key);
   1.420 +  if (dict == NULL) return 1;
   1.421 +
   1.422 +  // first read the first line of file to get hash table size */
   1.423 +  if (!(ts = dict->getline())) {
   1.424 +    HUNSPELL_WARNING(stderr, "error: empty dic file\n");
   1.425 +    delete dict;
   1.426 +    return 2;
   1.427 +  }
   1.428 +  mychomp(ts);
   1.429 +
   1.430 +  /* remove byte order mark */
   1.431 +  if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
   1.432 +    memmove(ts, ts+3, strlen(ts+3)+1);
   1.433 +    // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
   1.434 +  }
   1.435 +
   1.436 +  tablesize = atoi(ts);
   1.437 +  if (tablesize == 0) {
   1.438 +    HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n");
   1.439 +    delete dict;
   1.440 +    return 4;
   1.441 +  }
   1.442 +  tablesize = tablesize + 5 + USERWORD;
   1.443 +  if ((tablesize %2) == 0) tablesize++;
   1.444 +
   1.445 +  // allocate the hash table
   1.446 +  tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
   1.447 +  if (! tableptr) {
   1.448 +    delete dict;
   1.449 +    return 3;
   1.450 +  }
   1.451 +  for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
   1.452 +
   1.453 +  // loop through all words on much list and add to hash
   1.454 +  // table and create word and affix strings
   1.455 +
   1.456 +  while ((ts = dict->getline())) {
   1.457 +    mychomp(ts);
   1.458 +    // split each line into word and morphological description
   1.459 +    dp = ts;
   1.460 +    while ((dp = strchr(dp, ':'))) {
   1.461 +	if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
   1.462 +	    for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
   1.463 +	    if (dp < ts) { // missing word
   1.464 +		dp = NULL;
   1.465 +	    } else {
   1.466 +		*(dp + 1) = '\0';
   1.467 +		dp = dp + 2;
   1.468 +	    }
   1.469 +	    break;
   1.470 +	}
   1.471 +	dp++;
   1.472 +    }
   1.473 +
   1.474 +    // tabulator is the old morphological field separator
   1.475 +    dp2 = strchr(ts, '\t');
   1.476 +    if (dp2 && (!dp || dp2 < dp)) {
   1.477 +	*dp2 = '\0';
   1.478 +	dp = dp2 + 1;
   1.479 +    }
   1.480 +
   1.481 +    // split each line into word and affix char strings
   1.482 +    // "\/" signs slash in words (not affix separator)
   1.483 +    // "/" at beginning of the line is word character (not affix separator)
   1.484 +    ap = strchr(ts,'/');
   1.485 +    while (ap) {
   1.486 +        if (ap == ts) {
   1.487 +            ap++;
   1.488 +            continue;
   1.489 +        } else if (*(ap - 1) != '\\') break;
   1.490 +        // replace "\/" with "/"
   1.491 +        for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
   1.492 +        ap = strchr(ap,'/');
   1.493 +    }
   1.494 +
   1.495 +    if (ap) {
   1.496 +      *ap = '\0';
   1.497 +      if (aliasf) {
   1.498 +        int index = atoi(ap + 1);
   1.499 +        al = get_aliasf(index, &flags, dict);
   1.500 +        if (!al) {
   1.501 +            HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
   1.502 +            *ap = '\0';
   1.503 +        }
   1.504 +      } else {
   1.505 +        al = decode_flags(&flags, ap + 1, dict);
   1.506 +        if (al == -1) {
   1.507 +            HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
   1.508 +            delete dict;
   1.509 +            return 6;
   1.510 +        }
   1.511 +        flag_qsort(flags, 0, al);
   1.512 +      }
   1.513 +    } else {
   1.514 +      al = 0;
   1.515 +      ap = NULL;
   1.516 +      flags = NULL;
   1.517 +    }
   1.518 +
   1.519 +    int captype;
   1.520 +    int wbl = strlen(ts);
   1.521 +    int wcl = get_clen_and_captype(ts, wbl, &captype);
   1.522 +    // add the word and its index plus its capitalized form optionally
   1.523 +    if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
   1.524 +	add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
   1.525 +	delete dict;
   1.526 +	return 5;
   1.527 +    }
   1.528 +  }
   1.529 +
   1.530 +  delete dict;
   1.531 +  return 0;
   1.532 +}
   1.533 +
   1.534 +// the hash function is a simple load and rotate
   1.535 +// algorithm borrowed
   1.536 +
   1.537 +int HashMgr::hash(const char * word) const
   1.538 +{
   1.539 +    long  hv = 0;
   1.540 +    for (int i=0; i < 4  &&  *word != 0; i++)
   1.541 +        hv = (hv << 8) | (*word++);
   1.542 +    while (*word != 0) {
   1.543 +      ROTATE(hv,ROTATE_LEN);
   1.544 +      hv ^= (*word++);
   1.545 +    }
   1.546 +    return (unsigned long) hv % tablesize;
   1.547 +}
   1.548 +
   1.549 +int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
   1.550 +    int len;
   1.551 +    if (*flags == '\0') {
   1.552 +        *result = NULL;
   1.553 +        return 0;
   1.554 +    }
   1.555 +    switch (flag_mode) {
   1.556 +      case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
   1.557 +        len = strlen(flags);
   1.558 +        if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum());
   1.559 +        len /= 2;
   1.560 +        *result = (unsigned short *) malloc(len * sizeof(short));
   1.561 +        if (!*result) return -1;
   1.562 +        for (int i = 0; i < len; i++) {
   1.563 +            (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; 
   1.564 +        }
   1.565 +        break;
   1.566 +      }
   1.567 +      case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
   1.568 +        int i;
   1.569 +        len = 1;
   1.570 +        char * src = flags; 
   1.571 +        unsigned short * dest;
   1.572 +        char * p;
   1.573 +        for (p = flags; *p; p++) {
   1.574 +          if (*p == ',') len++;
   1.575 +        }
   1.576 +        *result = (unsigned short *) malloc(len * sizeof(short));
   1.577 +        if (!*result) return -1;
   1.578 +        dest = *result;
   1.579 +        for (p = flags; *p; p++) {
   1.580 +          if (*p == ',') {
   1.581 +            i = atoi(src);
   1.582 +            if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
   1.583 +              af->getlinenum(), i, DEFAULTFLAGS - 1);
   1.584 +            *dest = (unsigned short) i;
   1.585 +            if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
   1.586 +            src = p + 1;
   1.587 +            dest++;
   1.588 +          }
   1.589 +        }
   1.590 +        i = atoi(src);
   1.591 +        if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
   1.592 +          af->getlinenum(), i, DEFAULTFLAGS - 1);
   1.593 +        *dest = (unsigned short) i;
   1.594 +        if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
   1.595 +        break;
   1.596 +      }    
   1.597 +      case FLAG_UNI: { // UTF-8 characters
   1.598 +        w_char w[BUFSIZE/2];
   1.599 +        len = u8_u16(w, BUFSIZE/2, flags);
   1.600 +        *result = (unsigned short *) malloc(len * sizeof(short));
   1.601 +        if (!*result) return -1;
   1.602 +        memcpy(*result, w, len * sizeof(short));
   1.603 +        break;
   1.604 +      }
   1.605 +      default: { // Ispell's one-character flags (erfg -> e r f g)
   1.606 +        unsigned short * dest;
   1.607 +        len = strlen(flags);
   1.608 +        *result = (unsigned short *) malloc(len * sizeof(short));
   1.609 +        if (!*result) return -1;
   1.610 +        dest = *result;
   1.611 +        for (unsigned char * p = (unsigned char *) flags; *p; p++) {
   1.612 +          *dest = (unsigned short) *p;
   1.613 +          dest++;
   1.614 +        }
   1.615 +      }
   1.616 +    }
   1.617 +    return len;
   1.618 +}
   1.619 +
   1.620 +unsigned short HashMgr::decode_flag(const char * f) {
   1.621 +    unsigned short s = 0;
   1.622 +    int i;
   1.623 +    switch (flag_mode) {
   1.624 +      case FLAG_LONG:
   1.625 +        s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
   1.626 +        break;
   1.627 +      case FLAG_NUM:
   1.628 +        i = atoi(f);
   1.629 +        if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1);
   1.630 +        s = (unsigned short) i;
   1.631 +        break;
   1.632 +      case FLAG_UNI:
   1.633 +        u8_u16((w_char *) &s, 1, f);
   1.634 +        break;
   1.635 +      default:
   1.636 +        s = (unsigned short) *((unsigned char *)f);
   1.637 +    }
   1.638 +    if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
   1.639 +    return s;
   1.640 +}
   1.641 +
   1.642 +char * HashMgr::encode_flag(unsigned short f) {
   1.643 +    unsigned char ch[10];
   1.644 +    if (f==0) return mystrdup("(NULL)");
   1.645 +    if (flag_mode == FLAG_LONG) {
   1.646 +        ch[0] = (unsigned char) (f >> 8);
   1.647 +        ch[1] = (unsigned char) (f - ((f >> 8) << 8));
   1.648 +        ch[2] = '\0';
   1.649 +    } else if (flag_mode == FLAG_NUM) {
   1.650 +        sprintf((char *) ch, "%d", f);
   1.651 +    } else if (flag_mode == FLAG_UNI) {
   1.652 +        u16_u8((char *) &ch, 10, (w_char *) &f, 1);
   1.653 +    } else {
   1.654 +        ch[0] = (unsigned char) (f);
   1.655 +        ch[1] = '\0';
   1.656 +    }
   1.657 +    return mystrdup((char *) ch);
   1.658 +}
   1.659 +
   1.660 +// read in aff file and set flag mode
   1.661 +int  HashMgr::load_config(const char * affpath, const char * key)
   1.662 +{
   1.663 +  char * line; // io buffers
   1.664 +  int firstline = 1;
   1.665 + 
   1.666 +  // open the affix file
   1.667 +  FileMgr * afflst = new FileMgr(affpath, key);
   1.668 +  if (!afflst) {
   1.669 +    HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
   1.670 +    return 1;
   1.671 +  }
   1.672 +
   1.673 +    // read in each line ignoring any that do not
   1.674 +    // start with a known line type indicator
   1.675 +
   1.676 +    while ((line = afflst->getline())) {
   1.677 +        mychomp(line);
   1.678 +
   1.679 +       /* remove byte order mark */
   1.680 +       if (firstline) {
   1.681 +         firstline = 0;
   1.682 +         if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1);
   1.683 +       }
   1.684 +
   1.685 +        /* parse in the try string */
   1.686 +        if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
   1.687 +            if (flag_mode != FLAG_CHAR) {
   1.688 +                HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum());
   1.689 +            }
   1.690 +            if (strstr(line, "long")) flag_mode = FLAG_LONG;
   1.691 +            if (strstr(line, "num")) flag_mode = FLAG_NUM;
   1.692 +            if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
   1.693 +            if (flag_mode == FLAG_CHAR) {
   1.694 +                HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum());
   1.695 +            }
   1.696 +        }
   1.697 +        if (strncmp(line,"FORBIDDENWORD",13) == 0) {
   1.698 +          char * st = NULL;
   1.699 +          if (parse_string(line, &st, afflst->getlinenum())) {
   1.700 +             delete afflst;
   1.701 +             return 1;
   1.702 +          }
   1.703 +          forbiddenword = decode_flag(st);
   1.704 +          free(st);
   1.705 +        }
   1.706 +        if (strncmp(line, "SET", 3) == 0) {
   1.707 +    	  if (parse_string(line, &enc, afflst->getlinenum())) {
   1.708 +             delete afflst;
   1.709 +             return 1;
   1.710 +          }    	    
   1.711 +    	  if (strcmp(enc, "UTF-8") == 0) {
   1.712 +    	    utf8 = 1;
   1.713 +#ifndef OPENOFFICEORG
   1.714 +#ifndef MOZILLA_CLIENT
   1.715 +    	    initialize_utf_tbl();
   1.716 +#endif
   1.717 +#endif
   1.718 +    	  } else csconv = get_current_cs(enc);
   1.719 +    	}
   1.720 +        if (strncmp(line, "LANG", 4) == 0) {
   1.721 +    	  if (parse_string(line, &lang, afflst->getlinenum())) {
   1.722 +             delete afflst;
   1.723 +             return 1;
   1.724 +          }    	    
   1.725 +    	  langnum = get_lang_num(lang);
   1.726 +    	}
   1.727 +
   1.728 +       /* parse in the ignored characters (for example, Arabic optional diacritics characters */
   1.729 +       if (strncmp(line,"IGNORE",6) == 0) {
   1.730 +          if (parse_array(line, &ignorechars, &ignorechars_utf16,
   1.731 +                 &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
   1.732 +             delete afflst;
   1.733 +             return 1;
   1.734 +          }
   1.735 +       }
   1.736 +
   1.737 +       if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
   1.738 +          if (parse_aliasf(line, afflst)) {
   1.739 +             delete afflst;
   1.740 +             return 1;
   1.741 +          }
   1.742 +       }
   1.743 +
   1.744 +       if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
   1.745 +          if (parse_aliasm(line, afflst)) {
   1.746 +             delete afflst;
   1.747 +             return 1;
   1.748 +          }
   1.749 +       }
   1.750 +
   1.751 +       if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
   1.752 +       if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
   1.753 +    }
   1.754 +    if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
   1.755 +    delete afflst;
   1.756 +    return 0;
   1.757 +}
   1.758 +
   1.759 +/* parse in the ALIAS table */
   1.760 +int  HashMgr::parse_aliasf(char * line, FileMgr * af)
   1.761 +{
   1.762 +   if (numaliasf != 0) {
   1.763 +      HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
   1.764 +      return 1;
   1.765 +   }
   1.766 +   char * tp = line;
   1.767 +   char * piece;
   1.768 +   int i = 0;
   1.769 +   int np = 0;
   1.770 +   piece = mystrsep(&tp, 0);
   1.771 +   while (piece) {
   1.772 +       if (*piece != '\0') {
   1.773 +          switch(i) {
   1.774 +             case 0: { np++; break; }
   1.775 +             case 1: { 
   1.776 +                       numaliasf = atoi(piece);
   1.777 +                       if (numaliasf < 1) {
   1.778 +                          numaliasf = 0;
   1.779 +                          aliasf = NULL;
   1.780 +                          aliasflen = NULL;
   1.781 +                          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
   1.782 +                          return 1;
   1.783 +                       }
   1.784 +                       aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
   1.785 +                       aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
   1.786 +                       if (!aliasf || !aliasflen) {
   1.787 +                          numaliasf = 0;
   1.788 +                          if (aliasf) free(aliasf);
   1.789 +                          if (aliasflen) free(aliasflen);
   1.790 +                          aliasf = NULL;
   1.791 +                          aliasflen = NULL;
   1.792 +                          return 1;
   1.793 +                       }
   1.794 +                       np++;
   1.795 +                       break;
   1.796 +                     }
   1.797 +             default: break;
   1.798 +          }
   1.799 +          i++;
   1.800 +       }
   1.801 +       piece = mystrsep(&tp, 0);
   1.802 +   }
   1.803 +   if (np != 2) {
   1.804 +      numaliasf = 0;
   1.805 +      free(aliasf);
   1.806 +      free(aliasflen);
   1.807 +      aliasf = NULL;
   1.808 +      aliasflen = NULL;
   1.809 +      HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
   1.810 +      return 1;
   1.811 +   } 
   1.812 + 
   1.813 +   /* now parse the numaliasf lines to read in the remainder of the table */
   1.814 +   char * nl;
   1.815 +   for (int j=0; j < numaliasf; j++) {
   1.816 +        if (!(nl = af->getline())) return 1;
   1.817 +        mychomp(nl);
   1.818 +        tp = nl;
   1.819 +        i = 0;
   1.820 +        aliasf[j] = NULL;
   1.821 +        aliasflen[j] = 0;
   1.822 +        piece = mystrsep(&tp, 0);
   1.823 +        while (piece) {
   1.824 +           if (*piece != '\0') {
   1.825 +               switch(i) {
   1.826 +                  case 0: {
   1.827 +                             if (strncmp(piece,"AF",2) != 0) {
   1.828 +                                 numaliasf = 0;
   1.829 +                                 free(aliasf);
   1.830 +                                 free(aliasflen);
   1.831 +                                 aliasf = NULL;
   1.832 +                                 aliasflen = NULL;
   1.833 +                                 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
   1.834 +                                 return 1;
   1.835 +                             }
   1.836 +                             break;
   1.837 +                          }
   1.838 +                  case 1: {
   1.839 +                            aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af);
   1.840 +                            flag_qsort(aliasf[j], 0, aliasflen[j]);
   1.841 +                            break; 
   1.842 +                          }
   1.843 +                  default: break;
   1.844 +               }
   1.845 +               i++;
   1.846 +           }
   1.847 +           piece = mystrsep(&tp, 0);
   1.848 +        }
   1.849 +        if (!aliasf[j]) {
   1.850 +             free(aliasf);
   1.851 +             free(aliasflen);
   1.852 +             aliasf = NULL;
   1.853 +             aliasflen = NULL;
   1.854 +             numaliasf = 0;
   1.855 +             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
   1.856 +             return 1;
   1.857 +        }
   1.858 +   }
   1.859 +   return 0;
   1.860 +}
   1.861 +
   1.862 +int HashMgr::is_aliasf() {
   1.863 +    return (aliasf != NULL);
   1.864 +}
   1.865 +
   1.866 +int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
   1.867 +    if ((index > 0) && (index <= numaliasf)) {
   1.868 +        *fvec = aliasf[index - 1];
   1.869 +        return aliasflen[index - 1];
   1.870 +    }
   1.871 +    HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index);
   1.872 +    *fvec = NULL;
   1.873 +    return 0;
   1.874 +}
   1.875 +
   1.876 +/* parse morph alias definitions */
   1.877 +int  HashMgr::parse_aliasm(char * line, FileMgr * af)
   1.878 +{
   1.879 +   if (numaliasm != 0) {
   1.880 +      HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
   1.881 +      return 1;
   1.882 +   }
   1.883 +   char * tp = line;
   1.884 +   char * piece;
   1.885 +   int i = 0;
   1.886 +   int np = 0;
   1.887 +   piece = mystrsep(&tp, 0);
   1.888 +   while (piece) {
   1.889 +       if (*piece != '\0') {
   1.890 +          switch(i) {
   1.891 +             case 0: { np++; break; }
   1.892 +             case 1: { 
   1.893 +                       numaliasm = atoi(piece);
   1.894 +                       if (numaliasm < 1) {
   1.895 +                          HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
   1.896 +                          return 1;
   1.897 +                       }
   1.898 +                       aliasm = (char **) malloc(numaliasm * sizeof(char *));
   1.899 +                       if (!aliasm) {
   1.900 +                          numaliasm = 0;
   1.901 +                          return 1;
   1.902 +                       }
   1.903 +                       np++;
   1.904 +                       break;
   1.905 +                     }
   1.906 +             default: break;
   1.907 +          }
   1.908 +          i++;
   1.909 +       }
   1.910 +       piece = mystrsep(&tp, 0);
   1.911 +   }
   1.912 +   if (np != 2) {
   1.913 +      numaliasm = 0;
   1.914 +      free(aliasm);
   1.915 +      aliasm = NULL;
   1.916 +      HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
   1.917 +      return 1;
   1.918 +   } 
   1.919 +
   1.920 +   /* now parse the numaliasm lines to read in the remainder of the table */
   1.921 +   char * nl = line;
   1.922 +   for (int j=0; j < numaliasm; j++) {
   1.923 +        if (!(nl = af->getline())) return 1;
   1.924 +        mychomp(nl);
   1.925 +        tp = nl;
   1.926 +        i = 0;
   1.927 +        aliasm[j] = NULL;
   1.928 +        piece = mystrsep(&tp, ' ');
   1.929 +        while (piece) {
   1.930 +           if (*piece != '\0') {
   1.931 +               switch(i) {
   1.932 +                  case 0: {
   1.933 +                             if (strncmp(piece,"AM",2) != 0) {
   1.934 +                                 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
   1.935 +                                 numaliasm = 0;
   1.936 +                                 free(aliasm);
   1.937 +                                 aliasm = NULL;
   1.938 +                                 return 1;
   1.939 +                             }
   1.940 +                             break;
   1.941 +                          }
   1.942 +                  case 1: {
   1.943 +                            // add the remaining of the line
   1.944 +                            if (*tp) {
   1.945 +                                *(tp - 1) = ' ';
   1.946 +                                tp = tp + strlen(tp);
   1.947 +                            }
   1.948 +                            if (complexprefixes) {
   1.949 +                                if (utf8) reverseword_utf(piece);
   1.950 +                                    else reverseword(piece);
   1.951 +                            }
   1.952 +                            aliasm[j] = mystrdup(piece);
   1.953 +                            if (!aliasm[j]) {
   1.954 +                                 numaliasm = 0;
   1.955 +                                 free(aliasm);
   1.956 +                                 aliasm = NULL;
   1.957 +                                 return 1;
   1.958 +                            }
   1.959 +                            break; }
   1.960 +                  default: break;
   1.961 +               }
   1.962 +               i++;
   1.963 +           }
   1.964 +           piece = mystrsep(&tp, ' ');
   1.965 +        }
   1.966 +        if (!aliasm[j]) {
   1.967 +             numaliasm = 0;
   1.968 +             free(aliasm);
   1.969 +             aliasm = NULL;
   1.970 +             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
   1.971 +             return 1;
   1.972 +        }
   1.973 +   }
   1.974 +   return 0;
   1.975 +}
   1.976 +
   1.977 +int HashMgr::is_aliasm() {
   1.978 +    return (aliasm != NULL);
   1.979 +}
   1.980 +
   1.981 +char * HashMgr::get_aliasm(int index) {
   1.982 +    if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
   1.983 +    HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
   1.984 +    return NULL;
   1.985 +}

mercurial