michael@0: /******* BEGIN LICENSE BLOCK *******
michael@0:  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
michael@0:  * 
michael@0:  * The contents of this file are subject to the Mozilla Public License Version
michael@0:  * 1.1 (the "License"); you may not use this file except in compliance with
michael@0:  * the License. You may obtain a copy of the License at
michael@0:  * http://www.mozilla.org/MPL/
michael@0:  * 
michael@0:  * Software distributed under the License is distributed on an "AS IS" basis,
michael@0:  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
michael@0:  * for the specific language governing rights and limitations under the
michael@0:  * License.
michael@0:  * 
michael@0:  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
michael@0:  * and László Németh (Hunspell). Portions created by the Initial Developers
michael@0:  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
michael@0:  * 
michael@0:  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
michael@0:  *                 David Einstein (deinst@world.std.com)
michael@0:  *                 László Németh (nemethl@gyorsposta.hu)
michael@0:  *                 Caolan McNamara (caolanm@redhat.com)
michael@0:  *                 Davide Prina
michael@0:  *                 Giuseppe Modugno
michael@0:  *                 Gianluca Turconi
michael@0:  *                 Simon Brouwer
michael@0:  *                 Noll Janos
michael@0:  *                 Biro Arpad
michael@0:  *                 Goldman Eleonora
michael@0:  *                 Sarlos Tamas
michael@0:  *                 Bencsath Boldizsar
michael@0:  *                 Halacsy Peter
michael@0:  *                 Dvornik Laszlo
michael@0:  *                 Gefferth Andras
michael@0:  *                 Nagy Viktor
michael@0:  *                 Varga Daniel
michael@0:  *                 Chris Halls
michael@0:  *                 Rene Engelhard
michael@0:  *                 Bram Moolenaar
michael@0:  *                 Dafydd Jones
michael@0:  *                 Harri Pitkanen
michael@0:  *                 Andras Timar
michael@0:  *                 Tor Lillqvist
michael@0:  * 
michael@0:  * Alternatively, the contents of this file may be used under the terms of
michael@0:  * either the GNU General Public License Version 2 or later (the "GPL"), or
michael@0:  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
michael@0:  * in which case the provisions of the GPL or the LGPL are applicable instead
michael@0:  * of those above. If you wish to allow use of your version of this file only
michael@0:  * under the terms of either the GPL or the LGPL, and not to allow others to
michael@0:  * use your version of this file under the terms of the MPL, indicate your
michael@0:  * decision by deleting the provisions above and replace them with the notice
michael@0:  * and other provisions required by the GPL or the LGPL. If you do not delete
michael@0:  * the provisions above, a recipient may use your version of this file under
michael@0:  * the terms of any one of the MPL, the GPL or the LGPL.
michael@0:  *
michael@0:  ******* END LICENSE BLOCK *******/
michael@0: 
michael@0: #include <stdlib.h> 
michael@0: #include <string.h>
michael@0: #include <stdio.h> 
michael@0: #include <ctype.h>
michael@0: 
michael@0: #include "hashmgr.hxx"
michael@0: #include "csutil.hxx"
michael@0: #include "atypes.hxx"
michael@0: 
michael@0: // build a hash table from a munched word list
michael@0: 
michael@0: HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
michael@0: {
michael@0:   tablesize = 0;
michael@0:   tableptr = NULL;
michael@0:   flag_mode = FLAG_CHAR;
michael@0:   complexprefixes = 0;
michael@0:   utf8 = 0;
michael@0:   langnum = 0;
michael@0:   lang = NULL;
michael@0:   enc = NULL;
michael@0:   csconv = 0;
michael@0:   ignorechars = NULL;
michael@0:   ignorechars_utf16 = NULL;
michael@0:   ignorechars_utf16_len = 0;
michael@0:   numaliasf = 0;
michael@0:   aliasf = NULL;
michael@0:   numaliasm = 0;
michael@0:   aliasm = NULL;
michael@0:   forbiddenword = FORBIDDENWORD; // forbidden word signing flag
michael@0:   load_config(apath, key);
michael@0:   int ec = load_tables(tpath, key);
michael@0:   if (ec) {
michael@0:     /* error condition - what should we do here */
michael@0:     HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
michael@0:     if (tableptr) {
michael@0:       free(tableptr);
michael@0:       tableptr = NULL;
michael@0:     }
michael@0:     tablesize = 0;
michael@0:   }
michael@0: }
michael@0: 
michael@0: 
michael@0: HashMgr::~HashMgr()
michael@0: {
michael@0:   if (tableptr) {
michael@0:     // now pass through hash table freeing up everything
michael@0:     // go through column by column of the table
michael@0:     for (int i=0; i < tablesize; i++) {
michael@0:       struct hentry * pt = tableptr[i];
michael@0:       struct hentry * nt = NULL;
michael@0:       while(pt) {
michael@0:         nt = pt->next;
michael@0:         if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr);
michael@0:         free(pt);
michael@0:         pt = nt;
michael@0:       }
michael@0:     }
michael@0:     free(tableptr);
michael@0:   }
michael@0:   tablesize = 0;
michael@0: 
michael@0:   if (aliasf) {
michael@0:     for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
michael@0:     free(aliasf);
michael@0:     aliasf = NULL;
michael@0:     if (aliasflen) {
michael@0:       free(aliasflen);
michael@0:       aliasflen = NULL;
michael@0:     }
michael@0:   }
michael@0:   if (aliasm) {
michael@0:     for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
michael@0:     free(aliasm);
michael@0:     aliasm = NULL;
michael@0:   }  
michael@0: 
michael@0: #ifndef OPENOFFICEORG
michael@0: #ifndef MOZILLA_CLIENT
michael@0:   if (utf8) free_utf_tbl();
michael@0: #endif
michael@0: #endif
michael@0: 
michael@0:   if (enc) free(enc);
michael@0:   if (lang) free(lang);
michael@0:   
michael@0:   if (ignorechars) free(ignorechars);
michael@0:   if (ignorechars_utf16) free(ignorechars_utf16);
michael@0: 
michael@0: #ifdef MOZILLA_CLIENT
michael@0:     delete [] csconv;
michael@0: #endif
michael@0: }
michael@0: 
michael@0: // lookup a root word in the hashtable
michael@0: 
michael@0: struct hentry * HashMgr::lookup(const char *word) const
michael@0: {
michael@0:     struct hentry * dp;
michael@0:     if (tableptr) {
michael@0:        dp = tableptr[hash(word)];
michael@0:        if (!dp) return NULL;
michael@0:        for (  ;  dp != NULL;  dp = dp->next) {
michael@0:           if (strcmp(word, dp->word) == 0) return dp;
michael@0:        }
michael@0:     }
michael@0:     return NULL;
michael@0: }
michael@0: 
michael@0: // add a word to the hash table (private)
michael@0: int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
michael@0:     int al, const char * desc, bool onlyupcase)
michael@0: {
michael@0:     bool upcasehomonym = false;
michael@0:     int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
michael@0:     // variable-length hash record with word and optional fields
michael@0:     struct hentry* hp = 
michael@0: 	(struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
michael@0:     if (!hp) return 1;
michael@0:     char * hpw = hp->word;
michael@0:     strcpy(hpw, word);
michael@0:     if (ignorechars != NULL) {
michael@0:       if (utf8) {
michael@0:         remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
michael@0:       } else {
michael@0:         remove_ignored_chars(hpw, ignorechars);
michael@0:       }
michael@0:     }
michael@0:     if (complexprefixes) {
michael@0:         if (utf8) reverseword_utf(hpw); else reverseword(hpw);
michael@0:     }
michael@0: 
michael@0:     int i = hash(hpw);
michael@0: 
michael@0:     hp->blen = (unsigned char) wbl;
michael@0:     hp->clen = (unsigned char) wcl;
michael@0:     hp->alen = (short) al;
michael@0:     hp->astr = aff;
michael@0:     hp->next = NULL;      
michael@0:     hp->next_homonym = NULL;
michael@0: 
michael@0:     // store the description string or its pointer
michael@0:     if (desc) {
michael@0:         hp->var = H_OPT;
michael@0:         if (aliasm) {
michael@0:             hp->var += H_OPT_ALIASM;
michael@0:             store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
michael@0:         } else {
michael@0: 	    strcpy(hpw + wbl + 1, desc);
michael@0:             if (complexprefixes) {
michael@0:                 if (utf8) reverseword_utf(HENTRY_DATA(hp));
michael@0:                 else reverseword(HENTRY_DATA(hp));
michael@0:             }
michael@0:         }
michael@0: 	if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
michael@0:     } else hp->var = 0;
michael@0: 
michael@0:        struct hentry * dp = tableptr[i];
michael@0:        if (!dp) {
michael@0:          tableptr[i] = hp;
michael@0:          return 0;
michael@0:        }
michael@0:        while (dp->next != NULL) {
michael@0:          if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
michael@0:     	    // remove hidden onlyupcase homonym
michael@0:             if (!onlyupcase) {
michael@0: 		if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
michael@0: 		    free(dp->astr);
michael@0: 		    dp->astr = hp->astr;
michael@0: 		    dp->alen = hp->alen;
michael@0: 		    free(hp);
michael@0: 		    return 0;
michael@0: 		} else {
michael@0:     		    dp->next_homonym = hp;
michael@0:     		}
michael@0:             } else {
michael@0:         	upcasehomonym = true;
michael@0:             }
michael@0:          }
michael@0:          dp=dp->next;
michael@0:        }
michael@0:        if (strcmp(hp->word, dp->word) == 0) {
michael@0:     	    // remove hidden onlyupcase homonym
michael@0:             if (!onlyupcase) {
michael@0: 		if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
michael@0: 		    free(dp->astr);
michael@0: 		    dp->astr = hp->astr;
michael@0: 		    dp->alen = hp->alen;
michael@0: 		    free(hp);
michael@0: 		    return 0;
michael@0: 		} else {
michael@0:     		    dp->next_homonym = hp;
michael@0:     		}
michael@0:             } else {
michael@0:         	upcasehomonym = true;
michael@0:             }
michael@0:        }
michael@0:        if (!upcasehomonym) {
michael@0:     	    dp->next = hp;
michael@0:        } else {
michael@0:     	    // remove hidden onlyupcase homonym
michael@0:     	    if (hp->astr) free(hp->astr);
michael@0:     	    free(hp);
michael@0:        }
michael@0:     return 0;
michael@0: }     
michael@0: 
michael@0: int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
michael@0:     unsigned short * flags, int al, char * dp, int captype)
michael@0: {
michael@0:     // add inner capitalized forms to handle the following allcap forms:
michael@0:     // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
michael@0:     // Allcaps with suffixes: CIA's -> CIA'S    
michael@0:     if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
michael@0:       ((captype == ALLCAP) && (flags != NULL))) &&
michael@0:       !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
michael@0:           unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1));
michael@0: 	  if (!flags2) return 1;
michael@0:           if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
michael@0:           flags2[al] = ONLYUPCASEFLAG;
michael@0:           if (utf8) {
michael@0:               char st[BUFSIZE];
michael@0:               w_char w[BUFSIZE];
michael@0:               int wlen = u8_u16(w, BUFSIZE, word);
michael@0:               mkallsmall_utf(w, wlen, langnum);
michael@0:               mkallcap_utf(w, 1, langnum);
michael@0:               u16_u8(st, BUFSIZE, w, wlen);
michael@0:               return add_word(st,wbl,wcl,flags2,al+1,dp, true);
michael@0:            } else {
michael@0:                mkallsmall(word, csconv);
michael@0:                mkinitcap(word, csconv);
michael@0:                return add_word(word,wbl,wcl,flags2,al+1,dp, true);
michael@0:            }
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: // detect captype and modify word length for UTF-8 encoding
michael@0: int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
michael@0:     int len;
michael@0:     if (utf8) {
michael@0:       w_char dest_utf[BUFSIZE];
michael@0:       len = u8_u16(dest_utf, BUFSIZE, word);
michael@0:       *captype = get_captype_utf8(dest_utf, len, langnum);
michael@0:     } else {
michael@0:       len = wbl;
michael@0:       *captype = get_captype((char *) word, len, csconv);
michael@0:     }
michael@0:     return len;
michael@0: }
michael@0: 
michael@0: // remove word (personal dictionary function for standalone applications)
michael@0: int HashMgr::remove(const char * word)
michael@0: {
michael@0:     struct hentry * dp = lookup(word);
michael@0:     while (dp) {
michael@0:         if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
michael@0:             unsigned short * flags =
michael@0:                 (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
michael@0:             if (!flags) return 1;
michael@0:             for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
michael@0:             flags[dp->alen] = forbiddenword;
michael@0:             dp->astr = flags;
michael@0:             dp->alen++;
michael@0:             flag_qsort(flags, 0, dp->alen);
michael@0:         }
michael@0:         dp = dp->next_homonym;
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: /* remove forbidden flag to add a personal word to the hash */
michael@0: int HashMgr::remove_forbidden_flag(const char * word) {
michael@0:     struct hentry * dp = lookup(word);
michael@0:     if (!dp) return 1;
michael@0:     while (dp) {
michael@0:          if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
michael@0:             if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
michael@0:             else {
michael@0:                 unsigned short * flags2 =
michael@0:                     (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
michael@0:                 if (!flags2) return 1;
michael@0:                 int i, j = 0;
michael@0:                 for (i = 0; i < dp->alen; i++) {
michael@0:                     if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
michael@0:                 }
michael@0:                 dp->alen--;
michael@0:                 dp->astr = flags2; // XXX allowed forbidden words
michael@0:             }
michael@0:          }
michael@0:          dp = dp->next_homonym;
michael@0:        }
michael@0:    return 0;
michael@0: }
michael@0: 
michael@0: // add a custom dic. word to the hash table (public)
michael@0: int HashMgr::add(const char * word)
michael@0: {
michael@0:     unsigned short * flags = NULL;
michael@0:     int al = 0;
michael@0:     if (remove_forbidden_flag(word)) {
michael@0:         int captype;
michael@0:         int wbl = strlen(word);
michael@0:         int wcl = get_clen_and_captype(word, wbl, &captype);
michael@0:         add_word(word, wbl, wcl, flags, al, NULL, false);
michael@0:         return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype);
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: int HashMgr::add_with_affix(const char * word, const char * example)
michael@0: {
michael@0:     // detect captype and modify word length for UTF-8 encoding
michael@0:     struct hentry * dp = lookup(example);
michael@0:     remove_forbidden_flag(word);
michael@0:     if (dp && dp->astr) {
michael@0:         int captype;
michael@0:         int wbl = strlen(word);
michael@0:         int wcl = get_clen_and_captype(word, wbl, &captype);
michael@0: 	if (aliasf) {
michael@0: 	    add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);	
michael@0: 	} else {
michael@0:     	    unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short));
michael@0: 	    if (flags) {
michael@0: 		memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
michael@0: 		add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
michael@0: 	    } else return 1;
michael@0: 	}
michael@0:     	return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype);
michael@0:     }
michael@0:     return 1;
michael@0: }
michael@0: 
michael@0: // walk the hash table entry by entry - null at end
michael@0: // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
michael@0: struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
michael@0: {  
michael@0:   if (hp && hp->next != NULL) return hp->next;
michael@0:   for (col++; col < tablesize; col++) {
michael@0:     if (tableptr[col]) return tableptr[col];
michael@0:   }
michael@0:   // null at end and reset to start
michael@0:   col = -1;
michael@0:   return NULL;
michael@0: }
michael@0: 
michael@0: // load a munched word list and build a hash table on the fly
michael@0: int HashMgr::load_tables(const char * tpath, const char * key)
michael@0: {
michael@0:   int al;
michael@0:   char * ap;
michael@0:   char * dp;
michael@0:   char * dp2;
michael@0:   unsigned short * flags;
michael@0:   char * ts;
michael@0: 
michael@0:   // open dictionary file
michael@0:   FileMgr * dict = new FileMgr(tpath, key);
michael@0:   if (dict == NULL) return 1;
michael@0: 
michael@0:   // first read the first line of file to get hash table size */
michael@0:   if (!(ts = dict->getline())) {
michael@0:     HUNSPELL_WARNING(stderr, "error: empty dic file\n");
michael@0:     delete dict;
michael@0:     return 2;
michael@0:   }
michael@0:   mychomp(ts);
michael@0: 
michael@0:   /* remove byte order mark */
michael@0:   if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
michael@0:     memmove(ts, ts+3, strlen(ts+3)+1);
michael@0:     // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
michael@0:   }
michael@0: 
michael@0:   tablesize = atoi(ts);
michael@0:   if (tablesize == 0) {
michael@0:     HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n");
michael@0:     delete dict;
michael@0:     return 4;
michael@0:   }
michael@0:   tablesize = tablesize + 5 + USERWORD;
michael@0:   if ((tablesize %2) == 0) tablesize++;
michael@0: 
michael@0:   // allocate the hash table
michael@0:   tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
michael@0:   if (! tableptr) {
michael@0:     delete dict;
michael@0:     return 3;
michael@0:   }
michael@0:   for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
michael@0: 
michael@0:   // loop through all words on much list and add to hash
michael@0:   // table and create word and affix strings
michael@0: 
michael@0:   while ((ts = dict->getline())) {
michael@0:     mychomp(ts);
michael@0:     // split each line into word and morphological description
michael@0:     dp = ts;
michael@0:     while ((dp = strchr(dp, ':'))) {
michael@0: 	if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
michael@0: 	    for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
michael@0: 	    if (dp < ts) { // missing word
michael@0: 		dp = NULL;
michael@0: 	    } else {
michael@0: 		*(dp + 1) = '\0';
michael@0: 		dp = dp + 2;
michael@0: 	    }
michael@0: 	    break;
michael@0: 	}
michael@0: 	dp++;
michael@0:     }
michael@0: 
michael@0:     // tabulator is the old morphological field separator
michael@0:     dp2 = strchr(ts, '\t');
michael@0:     if (dp2 && (!dp || dp2 < dp)) {
michael@0: 	*dp2 = '\0';
michael@0: 	dp = dp2 + 1;
michael@0:     }
michael@0: 
michael@0:     // split each line into word and affix char strings
michael@0:     // "\/" signs slash in words (not affix separator)
michael@0:     // "/" at beginning of the line is word character (not affix separator)
michael@0:     ap = strchr(ts,'/');
michael@0:     while (ap) {
michael@0:         if (ap == ts) {
michael@0:             ap++;
michael@0:             continue;
michael@0:         } else if (*(ap - 1) != '\\') break;
michael@0:         // replace "\/" with "/"
michael@0:         for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
michael@0:         ap = strchr(ap,'/');
michael@0:     }
michael@0: 
michael@0:     if (ap) {
michael@0:       *ap = '\0';
michael@0:       if (aliasf) {
michael@0:         int index = atoi(ap + 1);
michael@0:         al = get_aliasf(index, &flags, dict);
michael@0:         if (!al) {
michael@0:             HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
michael@0:             *ap = '\0';
michael@0:         }
michael@0:       } else {
michael@0:         al = decode_flags(&flags, ap + 1, dict);
michael@0:         if (al == -1) {
michael@0:             HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
michael@0:             delete dict;
michael@0:             return 6;
michael@0:         }
michael@0:         flag_qsort(flags, 0, al);
michael@0:       }
michael@0:     } else {
michael@0:       al = 0;
michael@0:       ap = NULL;
michael@0:       flags = NULL;
michael@0:     }
michael@0: 
michael@0:     int captype;
michael@0:     int wbl = strlen(ts);
michael@0:     int wcl = get_clen_and_captype(ts, wbl, &captype);
michael@0:     // add the word and its index plus its capitalized form optionally
michael@0:     if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
michael@0: 	add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
michael@0: 	delete dict;
michael@0: 	return 5;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   delete dict;
michael@0:   return 0;
michael@0: }
michael@0: 
michael@0: // the hash function is a simple load and rotate
michael@0: // algorithm borrowed
michael@0: 
michael@0: int HashMgr::hash(const char * word) const
michael@0: {
michael@0:     long  hv = 0;
michael@0:     for (int i=0; i < 4  &&  *word != 0; i++)
michael@0:         hv = (hv << 8) | (*word++);
michael@0:     while (*word != 0) {
michael@0:       ROTATE(hv,ROTATE_LEN);
michael@0:       hv ^= (*word++);
michael@0:     }
michael@0:     return (unsigned long) hv % tablesize;
michael@0: }
michael@0: 
michael@0: int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
michael@0:     int len;
michael@0:     if (*flags == '\0') {
michael@0:         *result = NULL;
michael@0:         return 0;
michael@0:     }
michael@0:     switch (flag_mode) {
michael@0:       case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
michael@0:         len = strlen(flags);
michael@0:         if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum());
michael@0:         len /= 2;
michael@0:         *result = (unsigned short *) malloc(len * sizeof(short));
michael@0:         if (!*result) return -1;
michael@0:         for (int i = 0; i < len; i++) {
michael@0:             (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; 
michael@0:         }
michael@0:         break;
michael@0:       }
michael@0:       case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
michael@0:         int i;
michael@0:         len = 1;
michael@0:         char * src = flags; 
michael@0:         unsigned short * dest;
michael@0:         char * p;
michael@0:         for (p = flags; *p; p++) {
michael@0:           if (*p == ',') len++;
michael@0:         }
michael@0:         *result = (unsigned short *) malloc(len * sizeof(short));
michael@0:         if (!*result) return -1;
michael@0:         dest = *result;
michael@0:         for (p = flags; *p; p++) {
michael@0:           if (*p == ',') {
michael@0:             i = atoi(src);
michael@0:             if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
michael@0:               af->getlinenum(), i, DEFAULTFLAGS - 1);
michael@0:             *dest = (unsigned short) i;
michael@0:             if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
michael@0:             src = p + 1;
michael@0:             dest++;
michael@0:           }
michael@0:         }
michael@0:         i = atoi(src);
michael@0:         if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
michael@0:           af->getlinenum(), i, DEFAULTFLAGS - 1);
michael@0:         *dest = (unsigned short) i;
michael@0:         if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
michael@0:         break;
michael@0:       }    
michael@0:       case FLAG_UNI: { // UTF-8 characters
michael@0:         w_char w[BUFSIZE/2];
michael@0:         len = u8_u16(w, BUFSIZE/2, flags);
michael@0:         *result = (unsigned short *) malloc(len * sizeof(short));
michael@0:         if (!*result) return -1;
michael@0:         memcpy(*result, w, len * sizeof(short));
michael@0:         break;
michael@0:       }
michael@0:       default: { // Ispell's one-character flags (erfg -> e r f g)
michael@0:         unsigned short * dest;
michael@0:         len = strlen(flags);
michael@0:         *result = (unsigned short *) malloc(len * sizeof(short));
michael@0:         if (!*result) return -1;
michael@0:         dest = *result;
michael@0:         for (unsigned char * p = (unsigned char *) flags; *p; p++) {
michael@0:           *dest = (unsigned short) *p;
michael@0:           dest++;
michael@0:         }
michael@0:       }
michael@0:     }
michael@0:     return len;
michael@0: }
michael@0: 
michael@0: unsigned short HashMgr::decode_flag(const char * f) {
michael@0:     unsigned short s = 0;
michael@0:     int i;
michael@0:     switch (flag_mode) {
michael@0:       case FLAG_LONG:
michael@0:         s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
michael@0:         break;
michael@0:       case FLAG_NUM:
michael@0:         i = atoi(f);
michael@0:         if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1);
michael@0:         s = (unsigned short) i;
michael@0:         break;
michael@0:       case FLAG_UNI:
michael@0:         u8_u16((w_char *) &s, 1, f);
michael@0:         break;
michael@0:       default:
michael@0:         s = (unsigned short) *((unsigned char *)f);
michael@0:     }
michael@0:     if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
michael@0:     return s;
michael@0: }
michael@0: 
michael@0: char * HashMgr::encode_flag(unsigned short f) {
michael@0:     unsigned char ch[10];
michael@0:     if (f==0) return mystrdup("(NULL)");
michael@0:     if (flag_mode == FLAG_LONG) {
michael@0:         ch[0] = (unsigned char) (f >> 8);
michael@0:         ch[1] = (unsigned char) (f - ((f >> 8) << 8));
michael@0:         ch[2] = '\0';
michael@0:     } else if (flag_mode == FLAG_NUM) {
michael@0:         sprintf((char *) ch, "%d", f);
michael@0:     } else if (flag_mode == FLAG_UNI) {
michael@0:         u16_u8((char *) &ch, 10, (w_char *) &f, 1);
michael@0:     } else {
michael@0:         ch[0] = (unsigned char) (f);
michael@0:         ch[1] = '\0';
michael@0:     }
michael@0:     return mystrdup((char *) ch);
michael@0: }
michael@0: 
michael@0: // read in aff file and set flag mode
michael@0: int  HashMgr::load_config(const char * affpath, const char * key)
michael@0: {
michael@0:   char * line; // io buffers
michael@0:   int firstline = 1;
michael@0:  
michael@0:   // open the affix file
michael@0:   FileMgr * afflst = new FileMgr(affpath, key);
michael@0:   if (!afflst) {
michael@0:     HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
michael@0:     return 1;
michael@0:   }
michael@0: 
michael@0:     // read in each line ignoring any that do not
michael@0:     // start with a known line type indicator
michael@0: 
michael@0:     while ((line = afflst->getline())) {
michael@0:         mychomp(line);
michael@0: 
michael@0:        /* remove byte order mark */
michael@0:        if (firstline) {
michael@0:          firstline = 0;
michael@0:          if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1);
michael@0:        }
michael@0: 
michael@0:         /* parse in the try string */
michael@0:         if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
michael@0:             if (flag_mode != FLAG_CHAR) {
michael@0:                 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum());
michael@0:             }
michael@0:             if (strstr(line, "long")) flag_mode = FLAG_LONG;
michael@0:             if (strstr(line, "num")) flag_mode = FLAG_NUM;
michael@0:             if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
michael@0:             if (flag_mode == FLAG_CHAR) {
michael@0:                 HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum());
michael@0:             }
michael@0:         }
michael@0:         if (strncmp(line,"FORBIDDENWORD",13) == 0) {
michael@0:           char * st = NULL;
michael@0:           if (parse_string(line, &st, afflst->getlinenum())) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }
michael@0:           forbiddenword = decode_flag(st);
michael@0:           free(st);
michael@0:         }
michael@0:         if (strncmp(line, "SET", 3) == 0) {
michael@0:     	  if (parse_string(line, &enc, afflst->getlinenum())) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }    	    
michael@0:     	  if (strcmp(enc, "UTF-8") == 0) {
michael@0:     	    utf8 = 1;
michael@0: #ifndef OPENOFFICEORG
michael@0: #ifndef MOZILLA_CLIENT
michael@0:     	    initialize_utf_tbl();
michael@0: #endif
michael@0: #endif
michael@0:     	  } else csconv = get_current_cs(enc);
michael@0:     	}
michael@0:         if (strncmp(line, "LANG", 4) == 0) {
michael@0:     	  if (parse_string(line, &lang, afflst->getlinenum())) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }    	    
michael@0:     	  langnum = get_lang_num(lang);
michael@0:     	}
michael@0: 
michael@0:        /* parse in the ignored characters (for example, Arabic optional diacritics characters */
michael@0:        if (strncmp(line,"IGNORE",6) == 0) {
michael@0:           if (parse_array(line, &ignorechars, &ignorechars_utf16,
michael@0:                  &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }
michael@0:        }
michael@0: 
michael@0:        if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
michael@0:           if (parse_aliasf(line, afflst)) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }
michael@0:        }
michael@0: 
michael@0:        if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
michael@0:           if (parse_aliasm(line, afflst)) {
michael@0:              delete afflst;
michael@0:              return 1;
michael@0:           }
michael@0:        }
michael@0: 
michael@0:        if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
michael@0:        if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
michael@0:     }
michael@0:     if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
michael@0:     delete afflst;
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: /* release a flag alias table that is being abandoned part-way through parsing.
michael@0:  * Only the first `used` slots of `tab` have been initialized, so only those
michael@0:  * are released before the two arrays themselves. The caller resets its own
michael@0:  * pointers and counter afterwards.
michael@0:  * NOTE(review): assumes the vectors produced by decode_flags() are malloc()ed
michael@0:  * like the arrays allocated below - confirm against decode_flags(). */
michael@0: static void discard_aliasf_table(unsigned short ** tab, unsigned short * len, int used)
michael@0: {
michael@0:    if (tab) {
michael@0:       for (int k = 0; k < used; k++) free(tab[k]);
michael@0:       free(tab);
michael@0:    }
michael@0:    free(len);
michael@0: }
michael@0: 
michael@0: /* parse in the flag alias (AF) table
michael@0:  *
michael@0:  * line: the table header line ("AF" followed by the entry count); it is split
michael@0:  *       in place by mystrsep()
michael@0:  * af:   the file the header came from; the next <count> lines are read from
michael@0:  *       it, each expected to start with "AF" followed by a flag list
michael@0:  *
michael@0:  * returns 0 on success and 1 on failure. If a table was already defined the
michael@0:  * call fails without touching it. On any other failure numaliasf is 0,
michael@0:  * aliasf and aliasflen are NULL, and everything allocated here is released.
michael@0:  */
michael@0: int  HashMgr::parse_aliasf(char * line, FileMgr * af)
michael@0: {
michael@0:    if (numaliasf != 0) {
michael@0:       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
michael@0:       return 1;
michael@0:    }
michael@0:    char * tp = line;
michael@0:    char * piece;
michael@0:    int i = 0;   /* index of the current non-empty piece on the line */
michael@0:    int np = 0;  /* number of mandatory header pieces seen (keyword + count) */
michael@0:    piece = mystrsep(&tp, 0);
michael@0:    while (piece) {
michael@0:        if (*piece != '\0') {
michael@0:           switch(i) {
michael@0:              case 0: { np++; break; }
michael@0:              case 1: { 
michael@0:                        numaliasf = atoi(piece);
michael@0:                        if (numaliasf < 1) {
michael@0:                           numaliasf = 0;
michael@0:                           aliasf = NULL;
michael@0:                           aliasflen = NULL;
michael@0:                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
michael@0:                           return 1;
michael@0:                        }
michael@0:                        aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
michael@0:                        aliasflen = (unsigned short *) malloc(numaliasf * sizeof(unsigned short));
michael@0:                        if (!aliasf || !aliasflen) {
michael@0:                           /* no slot has been filled yet, so only the arrays go */
michael@0:                           discard_aliasf_table(aliasf, aliasflen, 0);
michael@0:                           numaliasf = 0;
michael@0:                           aliasf = NULL;
michael@0:                           aliasflen = NULL;
michael@0:                           return 1;
michael@0:                        }
michael@0:                        np++;
michael@0:                        break;
michael@0:                      }
michael@0:              default: break;
michael@0:           }
michael@0:           i++;
michael@0:        }
michael@0:        piece = mystrsep(&tp, 0);
michael@0:    }
michael@0:    if (np != 2) {
michael@0:       numaliasf = 0;
michael@0:       free(aliasf);
michael@0:       free(aliasflen);
michael@0:       aliasf = NULL;
michael@0:       aliasflen = NULL;
michael@0:       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
michael@0:       return 1;
michael@0:    } 
michael@0:  
michael@0:    /* now parse the numaliasf lines to read in the remainder of the table */
michael@0:    char * nl;
michael@0:    for (int j=0; j < numaliasf; j++) {
michael@0:         if (!(nl = af->getline())) {
michael@0:              /* the file ended early: slots j.. were never initialized, so the
michael@0:                 table must not stay published with numaliasf entries */
michael@0:              discard_aliasf_table(aliasf, aliasflen, j);
michael@0:              aliasf = NULL;
michael@0:              aliasflen = NULL;
michael@0:              numaliasf = 0;
michael@0:              return 1;
michael@0:         }
michael@0:         mychomp(nl);
michael@0:         tp = nl;
michael@0:         i = 0;
michael@0:         aliasf[j] = NULL;
michael@0:         aliasflen[j] = 0;
michael@0:         piece = mystrsep(&tp, 0);
michael@0:         while (piece) {
michael@0:            if (*piece != '\0') {
michael@0:                switch(i) {
michael@0:                   case 0: {
michael@0:                              if (strncmp(piece,"AF",2) != 0) {
michael@0:                                  /* rows 0..j-1 already own decoded vectors */
michael@0:                                  discard_aliasf_table(aliasf, aliasflen, j + 1);
michael@0:                                  numaliasf = 0;
michael@0:                                  aliasf = NULL;
michael@0:                                  aliasflen = NULL;
michael@0:                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0:                                  return 1;
michael@0:                              }
michael@0:                              break;
michael@0:                           }
michael@0:                   case 1: {
michael@0:                             aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af);
michael@0:                             flag_qsort(aliasf[j], 0, aliasflen[j]);
michael@0:                             break; 
michael@0:                           }
michael@0:                   default: break;
michael@0:                }
michael@0:                i++;
michael@0:            }
michael@0:            piece = mystrsep(&tp, 0);
michael@0:         }
michael@0:         if (!aliasf[j]) {
michael@0:              /* the row carried no flag list */
michael@0:              discard_aliasf_table(aliasf, aliasflen, j + 1);
michael@0:              aliasf = NULL;
michael@0:              aliasflen = NULL;
michael@0:              numaliasf = 0;
michael@0:              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0:              return 1;
michael@0:         }
michael@0:    }
michael@0:    return 0;
michael@0: }
michael@0: 
michael@0: /* tells whether a flag alias table is present: 1 if aliasf is set, 0 otherwise */
michael@0: int HashMgr::is_aliasf() {
michael@0:     if (aliasf == NULL) return 0;
michael@0:     return 1;
michael@0: }
michael@0: 
michael@0: /* look up a flag alias by its 1-based index
michael@0:  *
michael@0:  * index: alias number, valid in the range 1..numaliasf
michael@0:  * fvec:  receives the stored flag vector of the alias (NULL if index is bad)
michael@0:  * af:    only used to report the current line number in the warning
michael@0:  *
michael@0:  * returns the stored length of the flag vector, or 0 if index is out of range
michael@0:  */
michael@0: int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
michael@0:     if ((index < 1) || (index > numaliasf)) {
michael@0:         HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index);
michael@0:         *fvec = NULL;
michael@0:         return 0;
michael@0:     }
michael@0:     const int slot = index - 1;
michael@0:     *fvec = aliasf[slot];
michael@0:     return aliasflen[slot];
michael@0: }
michael@0: 
michael@0: /* release a morph alias table that is being abandoned part-way through
michael@0:  * parsing. Only the first `used` slots of `tab` have been initialized, so only
michael@0:  * those strings are released before the array itself. The caller resets its
michael@0:  * own pointer and counter afterwards.
michael@0:  * NOTE(review): assumes mystrdup() hands out malloc()ed memory - confirm
michael@0:  * against its definition. */
michael@0: static void discard_aliasm_table(char ** tab, int used)
michael@0: {
michael@0:    if (!tab) return;
michael@0:    for (int k = 0; k < used; k++) free(tab[k]);
michael@0:    free(tab);
michael@0: }
michael@0: 
michael@0: /* parse morph alias definitions (AM table)
michael@0:  *
michael@0:  * line: the table header line ("AM" followed by the entry count); it is split
michael@0:  *       in place by mystrsep()
michael@0:  * af:   the file the header came from; the next <count> lines are read from
michael@0:  *       it, each expected to start with "AM" followed by the alias text
michael@0:  *
michael@0:  * returns 0 on success and 1 on failure. If a table was already defined the
michael@0:  * call fails without touching it. On any other failure numaliasm is 0 and
michael@0:  * everything allocated here is released.
michael@0:  */
michael@0: int  HashMgr::parse_aliasm(char * line, FileMgr * af)
michael@0: {
michael@0:    if (numaliasm != 0) {
michael@0:       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
michael@0:       return 1;
michael@0:    }
michael@0:    char * tp = line;
michael@0:    char * piece;
michael@0:    int i = 0;   /* index of the current non-empty piece on the line */
michael@0:    int np = 0;  /* number of mandatory header pieces seen (keyword + count) */
michael@0:    piece = mystrsep(&tp, 0);
michael@0:    while (piece) {
michael@0:        if (*piece != '\0') {
michael@0:           switch(i) {
michael@0:              case 0: { np++; break; }
michael@0:              case 1: { 
michael@0:                        numaliasm = atoi(piece);
michael@0:                        if (numaliasm < 1) {
michael@0:                           /* do not leave a zero-or-negative count behind; mirrors
michael@0:                              what parse_aliasf does for its counter */
michael@0:                           numaliasm = 0;
michael@0:                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
michael@0:                           return 1;
michael@0:                        }
michael@0:                        aliasm = (char **) malloc(numaliasm * sizeof(char *));
michael@0:                        if (!aliasm) {
michael@0:                           numaliasm = 0;
michael@0:                           return 1;
michael@0:                        }
michael@0:                        np++;
michael@0:                        break;
michael@0:                      }
michael@0:              default: break;
michael@0:           }
michael@0:           i++;
michael@0:        }
michael@0:        piece = mystrsep(&tp, 0);
michael@0:    }
michael@0:    if (np != 2) {
michael@0:       numaliasm = 0;
michael@0:       free(aliasm);
michael@0:       aliasm = NULL;
michael@0:       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
michael@0:       return 1;
michael@0:    } 
michael@0: 
michael@0:    /* now parse the numaliasm lines to read in the remainder of the table */
michael@0:    char * nl;
michael@0:    for (int j=0; j < numaliasm; j++) {
michael@0:         if (!(nl = af->getline())) {
michael@0:              /* the file ended early: slots j.. were never initialized, so the
michael@0:                 table must not stay published with numaliasm entries */
michael@0:              discard_aliasm_table(aliasm, j);
michael@0:              aliasm = NULL;
michael@0:              numaliasm = 0;
michael@0:              return 1;
michael@0:         }
michael@0:         mychomp(nl);
michael@0:         tp = nl;
michael@0:         i = 0;
michael@0:         aliasm[j] = NULL;
michael@0:         piece = mystrsep(&tp, ' ');
michael@0:         while (piece) {
michael@0:            if (*piece != '\0') {
michael@0:                switch(i) {
michael@0:                   case 0: {
michael@0:                              if (strncmp(piece,"AM",2) != 0) {
michael@0:                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0:                                  /* rows 0..j-1 already own duplicated strings */
michael@0:                                  discard_aliasm_table(aliasm, j);
michael@0:                                  numaliasm = 0;
michael@0:                                  aliasm = NULL;
michael@0:                                  return 1;
michael@0:                              }
michael@0:                              break;
michael@0:                           }
michael@0:                   case 1: {
michael@0:                             // add the remaining of the line: the byte before tp is
michael@0:                             // turned back into a space (presumably the separator that
michael@0:                             // mystrsep() cut at), then tp is moved to the end of the
michael@0:                             // line so the piece loop stops after this piece
michael@0:                             if (*tp) {
michael@0:                                 *(tp - 1) = ' ';
michael@0:                                 tp = tp + strlen(tp);
michael@0:                             }
michael@0:                             if (complexprefixes) {
michael@0:                                 if (utf8) reverseword_utf(piece);
michael@0:                                     else reverseword(piece);
michael@0:                             }
michael@0:                             aliasm[j] = mystrdup(piece);
michael@0:                             if (!aliasm[j]) {
michael@0:                                  discard_aliasm_table(aliasm, j);
michael@0:                                  numaliasm = 0;
michael@0:                                  aliasm = NULL;
michael@0:                                  return 1;
michael@0:                             }
michael@0:                             break; }
michael@0:                   default: break;
michael@0:                }
michael@0:                i++;
michael@0:            }
michael@0:            piece = mystrsep(&tp, ' ');
michael@0:         }
michael@0:         if (!aliasm[j]) {
michael@0:              /* the row carried no alias text */
michael@0:              discard_aliasm_table(aliasm, j);
michael@0:              numaliasm = 0;
michael@0:              aliasm = NULL;
michael@0:              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
michael@0:              return 1;
michael@0:         }
michael@0:    }
michael@0:    return 0;
michael@0: }
michael@0: 
michael@0: /* tells whether a morph alias table is present: 1 if aliasm is set, 0 otherwise */
michael@0: int HashMgr::is_aliasm() {
michael@0:     if (aliasm == NULL) return 0;
michael@0:     return 1;
michael@0: }
michael@0: 
michael@0: /* look up a morph alias by its 1-based index
michael@0:  *
michael@0:  * index: alias number, valid in the range 1..numaliasm
michael@0:  *
michael@0:  * returns the stored alias string (still owned by the table), or NULL after
michael@0:  * emitting a warning if index is out of range
michael@0:  */
michael@0: char * HashMgr::get_aliasm(int index) {
michael@0:     if ((index < 1) || (index > numaliasm)) {
michael@0:         HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
michael@0:         return NULL;
michael@0:     }
michael@0:     return aliasm[index - 1];
michael@0: }