|
1 /******* BEGIN LICENSE BLOCK ******* |
|
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
|
3 * |
|
4 * The contents of this file are subject to the Mozilla Public License Version |
|
5 * 1.1 (the "License"); you may not use this file except in compliance with |
|
6 * the License. You may obtain a copy of the License at |
|
7 * http://www.mozilla.org/MPL/ |
|
8 * |
|
9 * Software distributed under the License is distributed on an "AS IS" basis, |
|
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
|
11 * for the specific language governing rights and limitations under the |
|
12 * License. |
|
13 * |
|
14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) |
|
15 * and László Németh (Hunspell). Portions created by the Initial Developers |
|
16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. |
|
17 * |
|
18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) |
|
19 * David Einstein (deinst@world.std.com) |
|
20 * László Németh (nemethl@gyorsposta.hu) |
|
21 * Caolan McNamara (caolanm@redhat.com) |
|
22 * Davide Prina |
|
23 * Giuseppe Modugno |
|
24 * Gianluca Turconi |
|
25 * Simon Brouwer |
|
26 * Noll Janos |
|
27 * Biro Arpad |
|
28 * Goldman Eleonora |
|
29 * Sarlos Tamas |
|
30 * Bencsath Boldizsar |
|
31 * Halacsy Peter |
|
32 * Dvornik Laszlo |
|
33 * Gefferth Andras |
|
34 * Nagy Viktor |
|
35 * Varga Daniel |
|
36 * Chris Halls |
|
37 * Rene Engelhard |
|
38 * Bram Moolenaar |
|
39 * Dafydd Jones |
|
40 * Harri Pitkanen |
|
41 * Andras Timar |
|
42 * Tor Lillqvist |
|
43 * |
|
44 * Alternatively, the contents of this file may be used under the terms of |
|
45 * either the GNU General Public License Version 2 or later (the "GPL"), or |
|
46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
|
47 * in which case the provisions of the GPL or the LGPL are applicable instead |
|
48 * of those above. If you wish to allow use of your version of this file only |
|
49 * under the terms of either the GPL or the LGPL, and not to allow others to |
|
50 * use your version of this file under the terms of the MPL, indicate your |
|
51 * decision by deleting the provisions above and replace them with the notice |
|
52 * and other provisions required by the GPL or the LGPL. If you do not delete |
|
53 * the provisions above, a recipient may use your version of this file under |
|
54 * the terms of any one of the MPL, the GPL or the LGPL. |
|
55 * |
|
56 ******* END LICENSE BLOCK *******/ |
|
57 |
|
58 #ifndef __CSUTILHXX__ |
|
59 #define __CSUTILHXX__ |
|
60 |
|
61 #include "hunvisapi.h" |
|
62 |
|
63 // First some base level utility routines |
|
64 |
|
65 #include <string.h> |
|
66 #include "w_char.hxx" |
|
67 #include "htypes.hxx" |
|
68 |
|
69 #ifdef MOZILLA_CLIENT |
|
70 #include "nscore.h" // for mozalloc headers |
|
71 #endif |
|
72 |
|
// casing: capitalization type codes
#define NOCAP 0
#define INITCAP 1
#define ALLCAP 2
#define HUHCAP 3
#define HUHINITCAP 4
|
79 |
|
// default encoding and keystring
// (the keystring lists the three letter rows of a QWERTY keyboard, separated by '|')
#define SPELL_ENCODING "ISO8859-1"
#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
|
83 |
|
// default morphological fields
// Every tag below is exactly three characters long (two letters plus ':').
#define MORPH_STEM "st:"
#define MORPH_ALLOMORPH "al:"
#define MORPH_POS "po:"
#define MORPH_DERI_PFX "dp:"
#define MORPH_INFL_PFX "ip:"
#define MORPH_TERM_PFX "tp:"
#define MORPH_DERI_SFX "ds:"
#define MORPH_INFL_SFX "is:"
#define MORPH_TERM_SFX "ts:"
#define MORPH_SURF_PFX "sp:"
#define MORPH_FREQ "fr:"
#define MORPH_PHON "ph:"
#define MORPH_HYPH "hy:"
#define MORPH_PART "pa:"
#define MORPH_FLAG "fl:"
#define MORPH_HENTRY "_H:"
// Length of a tag, measured on MORPH_STEM; valid for all tags only while
// they keep the same length.
#define MORPH_TAG_LEN strlen(MORPH_STEM)
|
102 |
|
// separator characters: space, newline and vertical tab
// NOTE(review): the names suggest field / record / alternative separators --
// confirm against the code that uses them.
#define MSEP_FLD ' '
#define MSEP_REC '\n'
#define MSEP_ALT '\v'
|
106 |
|
// default flags
// NOTE(review): DEFAULTFLAGS and FORBIDDENWORD share the value 65510 --
// confirm that this is intentional.
#define DEFAULTFLAGS 65510
#define FORBIDDENWORD 65510
#define ONLYUPCASEFLAG 65511
|
111 |
|
112 // convert UTF-16 characters to UTF-8 |
|
113 LIBHUNSPELL_DLL_EXPORTED char * u16_u8(char * dest, int size, const w_char * src, int srclen); |
|
114 |
|
115 // convert UTF-8 characters to UTF-16 |
|
116 LIBHUNSPELL_DLL_EXPORTED int u8_u16(w_char * dest, int size, const char * src); |
|
117 |
|
118 // sort 2-byte vector |
|
119 LIBHUNSPELL_DLL_EXPORTED void flag_qsort(unsigned short flags[], int begin, int end); |
|
120 |
|
121 // binary search in 2-byte vector |
|
122 LIBHUNSPELL_DLL_EXPORTED int flag_bsearch(unsigned short flags[], unsigned short flag, int right); |
|
123 |
|
124 // remove end of line char(s) |
|
125 LIBHUNSPELL_DLL_EXPORTED void mychomp(char * s); |
|
126 |
|
127 // duplicate string |
|
128 LIBHUNSPELL_DLL_EXPORTED char * mystrdup(const char * s); |
|
129 |
|
130 // strcat for limited length destination string |
|
131 LIBHUNSPELL_DLL_EXPORTED char * mystrcat(char * dest, const char * st, int max); |
|
132 |
|
133 // duplicate reverse of string |
|
134 LIBHUNSPELL_DLL_EXPORTED char * myrevstrdup(const char * s); |
|
135 |
|
136 // parse into tokens with char delimiter |
|
137 LIBHUNSPELL_DLL_EXPORTED char * mystrsep(char ** sptr, const char delim); |
|
138 // parse into tokens with char delimiter |
|
139 LIBHUNSPELL_DLL_EXPORTED char * mystrsep2(char ** sptr, const char delim); |
|
140 |
|
141 // parse into tokens with char delimiter |
|
142 LIBHUNSPELL_DLL_EXPORTED char * mystrrep(char *, const char *, const char *); |
|
143 |
|
144 // append s to ends of every lines in text |
|
145 LIBHUNSPELL_DLL_EXPORTED void strlinecat(char * lines, const char * s); |
|
146 |
|
147 // tokenize into lines with new line |
|
148 LIBHUNSPELL_DLL_EXPORTED int line_tok(const char * text, char *** lines, char breakchar); |
|
149 |
|
150 // tokenize into lines with new line and uniq in place |
|
151 LIBHUNSPELL_DLL_EXPORTED char * line_uniq(char * text, char breakchar); |
|
152 LIBHUNSPELL_DLL_EXPORTED char * line_uniq_app(char ** text, char breakchar); |
|
153 |
|
154 // change oldchar to newchar in place |
|
155 LIBHUNSPELL_DLL_EXPORTED char * tr(char * text, char oldc, char newc); |
|
156 |
|
157 // reverse word |
|
158 LIBHUNSPELL_DLL_EXPORTED int reverseword(char *); |
|
159 |
|
160 // reverse word |
|
161 LIBHUNSPELL_DLL_EXPORTED int reverseword_utf(char *); |
|
162 |
|
163 // remove duplicates |
|
164 LIBHUNSPELL_DLL_EXPORTED int uniqlist(char ** list, int n); |
|
165 |
|
166 // free character array list |
|
167 LIBHUNSPELL_DLL_EXPORTED void freelist(char *** list, int n); |
|
168 |
|
// character encoding information
// NOTE(review): field meanings below are read off the names only -- confirm
// against the encoding tables that fill this struct.
struct cs_info {
    unsigned char ccase;   // presumably a case indicator for the character
    unsigned char clower;  // presumably the lowercase form
    unsigned char cupper;  // presumably the uppercase form
};
|
175 |
|
176 LIBHUNSPELL_DLL_EXPORTED int initialize_utf_tbl(); |
|
177 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl(); |
|
178 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c, int langnum); |
|
179 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c, int langnum); |
|
180 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c); |
|
181 |
|
182 LIBHUNSPELL_DLL_EXPORTED struct cs_info * get_current_cs(const char * es); |
|
183 |
|
184 // get language identifiers of language codes |
|
185 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const char * lang); |
|
186 |
|
187 // get characters of the given 8bit encoding with lower- and uppercase forms |
|
188 LIBHUNSPELL_DLL_EXPORTED char * get_casechars(const char * enc); |
|
189 |
|
190 // convert null terminated string to all caps using encoding |
|
191 LIBHUNSPELL_DLL_EXPORTED void enmkallcap(char * d, const char * p, const char * encoding); |
|
192 |
|
193 // convert null terminated string to all little using encoding |
|
194 LIBHUNSPELL_DLL_EXPORTED void enmkallsmall(char * d, const char * p, const char * encoding); |
|
195 |
|
196 // convert null terminated string to have initial capital using encoding |
|
197 LIBHUNSPELL_DLL_EXPORTED void enmkinitcap(char * d, const char * p, const char * encoding); |
|
198 |
|
199 // convert null terminated string to all caps |
|
200 LIBHUNSPELL_DLL_EXPORTED void mkallcap(char * p, const struct cs_info * csconv); |
|
201 |
|
202 // convert null terminated string to all little |
|
203 LIBHUNSPELL_DLL_EXPORTED void mkallsmall(char * p, const struct cs_info * csconv); |
|
204 |
|
205 // convert null terminated string to have initial capital |
|
206 LIBHUNSPELL_DLL_EXPORTED void mkinitcap(char * p, const struct cs_info * csconv); |
|
207 |
|
208 // convert first nc characters of UTF-8 string to little |
|
209 LIBHUNSPELL_DLL_EXPORTED void mkallsmall_utf(w_char * u, int nc, int langnum); |
|
210 |
|
211 // convert first nc characters of UTF-8 string to capital |
|
212 LIBHUNSPELL_DLL_EXPORTED void mkallcap_utf(w_char * u, int nc, int langnum); |
|
213 |
|
214 // get type of capitalization |
|
215 LIBHUNSPELL_DLL_EXPORTED int get_captype(char * q, int nl, cs_info *); |
|
216 |
|
217 // get type of capitalization (UTF-8) |
|
218 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(w_char * q, int nl, int langnum); |
|
219 |
|
220 // strip all ignored characters in the string |
|
221 LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len); |
|
222 |
|
223 // strip all ignored characters in the string |
|
224 LIBHUNSPELL_DLL_EXPORTED void remove_ignored_chars(char * word, char * ignored_chars); |
|
225 |
|
226 LIBHUNSPELL_DLL_EXPORTED int parse_string(char * line, char ** out, int ln); |
|
227 |
|
228 LIBHUNSPELL_DLL_EXPORTED int parse_array(char * line, char ** out, unsigned short ** out_utf16, |
|
229 int * out_utf16_len, int utf8, int ln); |
|
230 |
|
231 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char * r); |
|
232 LIBHUNSPELL_DLL_EXPORTED char * copy_field(char * dest, const char * morph, const char * var); |
|
233 |
|
234 LIBHUNSPELL_DLL_EXPORTED int morphcmp(const char * s, const char * t); |
|
235 |
|
236 LIBHUNSPELL_DLL_EXPORTED int get_sfxcount(const char * morph); |
|
237 |
|
238 // conversion function for protected memory |
|
239 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char * dest, char * source); |
|
240 |
|
241 // conversion function for protected memory |
|
242 LIBHUNSPELL_DLL_EXPORTED char * get_stored_pointer(const char * s); |
|
243 |
|
244 // hash entry macros |
|
245 LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry *h) |
|
246 { |
|
247 char *ret; |
|
248 if (!h->var) |
|
249 ret = NULL; |
|
250 else if (h->var & H_OPT_ALIASM) |
|
251 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); |
|
252 else |
|
253 ret = HENTRY_WORD(h) + h->blen + 1; |
|
254 return ret; |
|
255 } |
|
256 |
|
257 // NULL-free version for warning-free OOo build |
|
258 LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(const struct hentry *h) |
|
259 { |
|
260 const char *ret; |
|
261 if (!h->var) |
|
262 ret = ""; |
|
263 else if (h->var & H_OPT_ALIASM) |
|
264 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1); |
|
265 else |
|
266 ret = HENTRY_WORD(h) + h->blen + 1; |
|
267 return ret; |
|
268 } |
|
269 |
|
270 LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry *h, const char *p) |
|
271 { |
|
272 return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL); |
|
273 } |
|
274 |
|
// true when a and b agree in both their l and h members;
// each argument is evaluated twice, so pass expressions without side effects
#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h))
|
276 |
|
277 #endif |