michael@0: /******* BEGIN LICENSE BLOCK ******* michael@0: * Version: MPL 1.1/GPL 2.0/LGPL 2.1 michael@0: * michael@0: * The contents of this file are subject to the Mozilla Public License Version michael@0: * 1.1 (the "License"); you may not use this file except in compliance with michael@0: * the License. You may obtain a copy of the License at michael@0: * http://www.mozilla.org/MPL/ michael@0: * michael@0: * Software distributed under the License is distributed on an "AS IS" basis, michael@0: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License michael@0: * for the specific language governing rights and limitations under the michael@0: * License. michael@0: * michael@0: * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) michael@0: * and László Németh (Hunspell). Portions created by the Initial Developers michael@0: * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. michael@0: * michael@0: * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) michael@0: * David Einstein (deinst@world.std.com) michael@0: * László Németh (nemethl@gyorsposta.hu) michael@0: * Caolan McNamara (caolanm@redhat.com) michael@0: * Davide Prina michael@0: * Giuseppe Modugno michael@0: * Gianluca Turconi michael@0: * Simon Brouwer michael@0: * Noll Janos michael@0: * Biro Arpad michael@0: * Goldman Eleonora michael@0: * Sarlos Tamas michael@0: * Bencsath Boldizsar michael@0: * Halacsy Peter michael@0: * Dvornik Laszlo michael@0: * Gefferth Andras michael@0: * Nagy Viktor michael@0: * Varga Daniel michael@0: * Chris Halls michael@0: * Rene Engelhard michael@0: * Bram Moolenaar michael@0: * Dafydd Jones michael@0: * Harri Pitkanen michael@0: * Andras Timar michael@0: * Tor Lillqvist michael@0: * michael@0: * Alternatively, the contents of this file may be used under the terms of michael@0: * either the GNU General Public License Version 2 or later (the "GPL"), or michael@0: * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), michael@0: * in which case the provisions of the GPL or the LGPL are applicable instead michael@0: * of those above. If you wish to allow use of your version of this file only michael@0: * under the terms of either the GPL or the LGPL, and not to allow others to michael@0: * use your version of this file under the terms of the MPL, indicate your michael@0: * decision by deleting the provisions above and replace them with the notice michael@0: * and other provisions required by the GPL or the LGPL. If you do not delete michael@0: * the provisions above, a recipient may use your version of this file under michael@0: * the terms of any one of the MPL, the GPL or the LGPL. michael@0: * michael@0: ******* END LICENSE BLOCK *******/ michael@0: michael@0: #ifndef __CSUTILHXX__ michael@0: #define __CSUTILHXX__ michael@0: michael@0: #include "hunvisapi.h" michael@0: michael@0: // First some base level utility routines michael@0: michael@0: #include michael@0: #include "w_char.hxx" michael@0: #include "htypes.hxx" michael@0: michael@0: #ifdef MOZILLA_CLIENT michael@0: #include "nscore.h" // for mozalloc headers michael@0: #endif michael@0: michael@0: // casing michael@0: #define NOCAP 0 michael@0: #define INITCAP 1 michael@0: #define ALLCAP 2 michael@0: #define HUHCAP 3 michael@0: #define HUHINITCAP 4 michael@0: michael@0: // default encoding and keystring michael@0: #define SPELL_ENCODING "ISO8859-1" michael@0: #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" michael@0: michael@0: // default morphological fields michael@0: #define MORPH_STEM "st:" michael@0: #define MORPH_ALLOMORPH "al:" michael@0: #define MORPH_POS "po:" michael@0: #define MORPH_DERI_PFX "dp:" michael@0: #define MORPH_INFL_PFX "ip:" michael@0: #define MORPH_TERM_PFX "tp:" michael@0: #define MORPH_DERI_SFX "ds:" michael@0: #define MORPH_INFL_SFX "is:" michael@0: #define MORPH_TERM_SFX "ts:" michael@0: #define MORPH_SURF_PFX "sp:" michael@0: #define MORPH_FREQ "fr:" michael@0: #define MORPH_PHON "ph:" michael@0: #define MORPH_HYPH "hy:" michael@0: #define MORPH_PART "pa:" michael@0: #define MORPH_FLAG "fl:" michael@0: #define MORPH_HENTRY "_H:" michael@0: #define MORPH_TAG_LEN strlen(MORPH_STEM) michael@0: michael@0: #define MSEP_FLD ' ' michael@0: #define MSEP_REC '\n' michael@0: #define MSEP_ALT '\v' michael@0: michael@0: // default flags michael@0: #define DEFAULTFLAGS 65510 michael@0: #define FORBIDDENWORD 65510 michael@0: #define ONLYUPCASEFLAG 65511 michael@0: michael@0: // convert UTF-16 characters to UTF-8 michael@0: LIBHUNSPELL_DLL_EXPORTED char * u16_u8(char * dest, int size, const w_char * src, int srclen); michael@0: michael@0: // convert UTF-8 characters to UTF-16 michael@0: LIBHUNSPELL_DLL_EXPORTED int u8_u16(w_char * dest, int size, const char * src); michael@0: michael@0: // sort 2-byte vector michael@0: LIBHUNSPELL_DLL_EXPORTED void flag_qsort(unsigned short flags[], int begin, int end); michael@0: michael@0: // binary search in 2-byte vector michael@0: LIBHUNSPELL_DLL_EXPORTED int flag_bsearch(unsigned short flags[], unsigned short flag, int right); michael@0: michael@0: // remove end of line char(s) michael@0: LIBHUNSPELL_DLL_EXPORTED void mychomp(char * s); michael@0: michael@0: // duplicate string michael@0: LIBHUNSPELL_DLL_EXPORTED char * mystrdup(const char * s); michael@0: michael@0: // strcat for limited length destination string michael@0: LIBHUNSPELL_DLL_EXPORTED char * mystrcat(char * dest, const char * st, int max); michael@0: michael@0: // duplicate reverse of string michael@0: LIBHUNSPELL_DLL_EXPORTED char * myrevstrdup(const char * s); michael@0: michael@0: // parse into tokens with char delimiter michael@0: LIBHUNSPELL_DLL_EXPORTED char * mystrsep(char ** sptr, const char delim); michael@0: // parse into tokens with char delimiter michael@0: LIBHUNSPELL_DLL_EXPORTED char * mystrsep2(char ** sptr, const char delim); michael@0: michael@0: // parse into tokens with char delimiter michael@0: LIBHUNSPELL_DLL_EXPORTED char * mystrrep(char *, const char *, const char *); michael@0: michael@0: // append s to ends of every lines in text michael@0: LIBHUNSPELL_DLL_EXPORTED void strlinecat(char * lines, const char * s); michael@0: michael@0: // tokenize into lines with new line michael@0: LIBHUNSPELL_DLL_EXPORTED int line_tok(const char * text, char *** lines, char breakchar); michael@0: michael@0: // tokenize into lines with new line and uniq in place michael@0: LIBHUNSPELL_DLL_EXPORTED char * line_uniq(char * text, char breakchar); michael@0: LIBHUNSPELL_DLL_EXPORTED char * line_uniq_app(char ** text, char breakchar); michael@0: michael@0: // change oldchar to newchar in place michael@0: LIBHUNSPELL_DLL_EXPORTED char * tr(char * text, char oldc, char newc); michael@0: michael@0: // reverse word michael@0: LIBHUNSPELL_DLL_EXPORTED int reverseword(char *); michael@0: michael@0: // reverse word michael@0: LIBHUNSPELL_DLL_EXPORTED int reverseword_utf(char *); michael@0: michael@0: // remove duplicates michael@0: LIBHUNSPELL_DLL_EXPORTED int uniqlist(char ** list, int n); michael@0: michael@0: // free character array list michael@0: LIBHUNSPELL_DLL_EXPORTED void freelist(char *** list, int n); michael@0: michael@0: // character encoding information michael@0: struct cs_info { michael@0: unsigned char ccase; michael@0: unsigned char clower; michael@0: unsigned char cupper; michael@0: }; michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int initialize_utf_tbl(); michael@0: LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl(); michael@0: LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c, int langnum); michael@0: LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c, int langnum); michael@0: LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED struct cs_info * get_current_cs(const char * es); michael@0: michael@0: // get language identifiers of language codes michael@0: LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const char * lang); michael@0: michael@0: // get characters of the given 8bit encoding with lower- and uppercase forms michael@0: LIBHUNSPELL_DLL_EXPORTED char * get_casechars(const char * enc); michael@0: michael@0: // convert null terminated string to all caps using encoding michael@0: LIBHUNSPELL_DLL_EXPORTED void enmkallcap(char * d, const char * p, const char * encoding); michael@0: michael@0: // convert null terminated string to all little using encoding michael@0: LIBHUNSPELL_DLL_EXPORTED void enmkallsmall(char * d, const char * p, const char * encoding); michael@0: michael@0: // convert null terminated string to have initial capital using encoding michael@0: LIBHUNSPELL_DLL_EXPORTED void enmkinitcap(char * d, const char * p, const char * encoding); michael@0: michael@0: // convert null terminated string to all caps michael@0: LIBHUNSPELL_DLL_EXPORTED void mkallcap(char * p, const struct cs_info * csconv); michael@0: michael@0: // convert null terminated string to all little michael@0: LIBHUNSPELL_DLL_EXPORTED void mkallsmall(char * p, const struct cs_info * csconv); michael@0: michael@0: // convert null terminated string to have initial capital michael@0: LIBHUNSPELL_DLL_EXPORTED void mkinitcap(char * p, const struct cs_info * csconv); michael@0: michael@0: // convert first nc characters of UTF-8 string to little michael@0: LIBHUNSPELL_DLL_EXPORTED void mkallsmall_utf(w_char * u, int nc, int langnum); michael@0: michael@0: // convert first nc characters of UTF-8 string to capital michael@0: LIBHUNSPELL_DLL_EXPORTED void mkallcap_utf(w_char * u, int nc, int langnum); michael@0: michael@0: // get type of capitalization michael@0: LIBHUNSPELL_DLL_EXPORTED int get_captype(char * q, int nl, cs_info *); michael@0: michael@0: // get type of capitalization (UTF-8) michael@0: LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(w_char * q, int nl, int langnum); michael@0: michael@0: // strip all ignored characters in the string michael@0: LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len); michael@0: michael@0: // strip all ignored characters in the string michael@0: LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars(char * word, char * ignored_chars); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int parse_string(char * line, char ** out, int ln); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int parse_array(char * line, char ** out, unsigned short ** out_utf16, michael@0: int * out_utf16_len, int utf8, int ln); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char * r); michael@0: LIBHUNSPELL_DLL_EXPORTED char * copy_field(char * dest, const char * morph, const char * var); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int morphcmp(const char * s, const char * t); michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED int get_sfxcount(const char * morph); michael@0: michael@0: // conversion function for protected memory michael@0: LIBHUNSPELL_DLL_EXPORTED void store_pointer(char * dest, char * source); michael@0: michael@0: // conversion function for protected memory michael@0: LIBHUNSPELL_DLL_EXPORTED char * get_stored_pointer(const char * s); michael@0: michael@0: // hash entry macros michael@0: LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry *h) michael@0: { michael@0: char *ret; michael@0: if (!h->var) michael@0: ret = NULL; michael@0: else if (h->var & H_OPT_ALIASM) michael@0: ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); michael@0: else michael@0: ret = HENTRY_WORD(h) + h->blen + 1; michael@0: return ret; michael@0: } michael@0: michael@0: // NULL-free version for warning-free OOo build michael@0: LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(const struct hentry *h) michael@0: { michael@0: const char *ret; michael@0: if (!h->var) michael@0: ret = ""; michael@0: else if (h->var & H_OPT_ALIASM) michael@0: ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); michael@0: else michael@0: ret = HENTRY_WORD(h) + h->blen + 1; michael@0: return ret; michael@0: } michael@0: michael@0: LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry *h, const char *p) michael@0: { michael@0: return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL); michael@0: } michael@0: michael@0: #define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h)) michael@0: michael@0: #endif