Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /******* BEGIN LICENSE BLOCK ******* |
michael@0 | 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
michael@0 | 3 | * |
michael@0 | 4 | * The contents of this file are subject to the Mozilla Public License Version |
michael@0 | 5 | * 1.1 (the "License"); you may not use this file except in compliance with |
michael@0 | 6 | * the License. You may obtain a copy of the License at |
michael@0 | 7 | * http://www.mozilla.org/MPL/ |
michael@0 | 8 | * |
michael@0 | 9 | * Software distributed under the License is distributed on an "AS IS" basis, |
michael@0 | 10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
michael@0 | 11 | * for the specific language governing rights and limitations under the |
michael@0 | 12 | * License. |
michael@0 | 13 | * |
michael@0 | 14 | * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) |
michael@0 | 15 | * and László Németh (Hunspell). Portions created by the Initial Developers |
michael@0 | 16 | * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. |
michael@0 | 17 | * |
michael@0 | 18 | * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) |
michael@0 | 19 | * David Einstein (deinst@world.std.com) |
michael@0 | 20 | * László Németh (nemethl@gyorsposta.hu) |
michael@0 | 21 | * Caolan McNamara (caolanm@redhat.com) |
michael@0 | 22 | * Davide Prina |
michael@0 | 23 | * Giuseppe Modugno |
michael@0 | 24 | * Gianluca Turconi |
michael@0 | 25 | * Simon Brouwer |
michael@0 | 26 | * Noll Janos |
michael@0 | 27 | * Biro Arpad |
michael@0 | 28 | * Goldman Eleonora |
michael@0 | 29 | * Sarlos Tamas |
michael@0 | 30 | * Bencsath Boldizsar |
michael@0 | 31 | * Halacsy Peter |
michael@0 | 32 | * Dvornik Laszlo |
michael@0 | 33 | * Gefferth Andras |
michael@0 | 34 | * Nagy Viktor |
michael@0 | 35 | * Varga Daniel |
michael@0 | 36 | * Chris Halls |
michael@0 | 37 | * Rene Engelhard |
michael@0 | 38 | * Bram Moolenaar |
michael@0 | 39 | * Dafydd Jones |
michael@0 | 40 | * Harri Pitkanen |
michael@0 | 41 | * Andras Timar |
michael@0 | 42 | * Tor Lillqvist |
michael@0 | 43 | * |
michael@0 | 44 | * Alternatively, the contents of this file may be used under the terms of |
michael@0 | 45 | * either the GNU General Public License Version 2 or later (the "GPL"), or |
michael@0 | 46 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
michael@0 | 47 | * in which case the provisions of the GPL or the LGPL are applicable instead |
michael@0 | 48 | * of those above. If you wish to allow use of your version of this file only |
michael@0 | 49 | * under the terms of either the GPL or the LGPL, and not to allow others to |
michael@0 | 50 | * use your version of this file under the terms of the MPL, indicate your |
michael@0 | 51 | * decision by deleting the provisions above and replace them with the notice |
michael@0 | 52 | * and other provisions required by the GPL or the LGPL. If you do not delete |
michael@0 | 53 | * the provisions above, a recipient may use your version of this file under |
michael@0 | 54 | * the terms of any one of the MPL, the GPL or the LGPL. |
michael@0 | 55 | * |
michael@0 | 56 | ******* END LICENSE BLOCK *******/ |
michael@0 | 57 | |
michael@0 | 58 | #include <stdlib.h> |
michael@0 | 59 | #include <string.h> |
michael@0 | 60 | #include <stdio.h> |
michael@0 | 61 | |
michael@0 | 62 | #include "hunspell.hxx" |
michael@0 | 63 | #include "hunspell.h" |
michael@0 | 64 | #ifndef MOZILLA_CLIENT |
michael@0 | 65 | # include "config.h" |
michael@0 | 66 | #endif |
michael@0 | 67 | #include "csutil.hxx" |
michael@0 | 68 | |
michael@0 | 69 | Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key) |
michael@0 | 70 | { |
michael@0 | 71 | encoding = NULL; |
michael@0 | 72 | csconv = NULL; |
michael@0 | 73 | utf8 = 0; |
michael@0 | 74 | complexprefixes = 0; |
michael@0 | 75 | affixpath = mystrdup(affpath); |
michael@0 | 76 | maxdic = 0; |
michael@0 | 77 | |
michael@0 | 78 | /* first set up the hash manager */ |
michael@0 | 79 | pHMgr[0] = new HashMgr(dpath, affpath, key); |
michael@0 | 80 | if (pHMgr[0]) maxdic = 1; |
michael@0 | 81 | |
michael@0 | 82 | /* next set up the affix manager */ |
michael@0 | 83 | /* it needs access to the hash manager lookup methods */ |
michael@0 | 84 | pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); |
michael@0 | 85 | |
michael@0 | 86 | /* get the preferred try string and the dictionary */ |
michael@0 | 87 | /* encoding from the Affix Manager for that dictionary */ |
michael@0 | 88 | char * try_string = pAMgr->get_try_string(); |
michael@0 | 89 | encoding = pAMgr->get_encoding(); |
michael@0 | 90 | langnum = pAMgr->get_langnum(); |
michael@0 | 91 | utf8 = pAMgr->get_utf8(); |
michael@0 | 92 | if (!utf8) |
michael@0 | 93 | csconv = get_current_cs(encoding); |
michael@0 | 94 | complexprefixes = pAMgr->get_complexprefixes(); |
michael@0 | 95 | wordbreak = pAMgr->get_breaktable(); |
michael@0 | 96 | |
michael@0 | 97 | /* and finally set up the suggestion manager */ |
michael@0 | 98 | pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); |
michael@0 | 99 | if (try_string) free(try_string); |
michael@0 | 100 | } |
michael@0 | 101 | |
michael@0 | 102 | Hunspell::~Hunspell() |
michael@0 | 103 | { |
michael@0 | 104 | if (pSMgr) delete pSMgr; |
michael@0 | 105 | if (pAMgr) delete pAMgr; |
michael@0 | 106 | for (int i = 0; i < maxdic; i++) delete pHMgr[i]; |
michael@0 | 107 | maxdic = 0; |
michael@0 | 108 | pSMgr = NULL; |
michael@0 | 109 | pAMgr = NULL; |
michael@0 | 110 | #ifdef MOZILLA_CLIENT |
michael@0 | 111 | delete [] csconv; |
michael@0 | 112 | #endif |
michael@0 | 113 | csconv= NULL; |
michael@0 | 114 | if (encoding) free(encoding); |
michael@0 | 115 | encoding = NULL; |
michael@0 | 116 | if (affixpath) free(affixpath); |
michael@0 | 117 | affixpath = NULL; |
michael@0 | 118 | } |
michael@0 | 119 | |
michael@0 | 120 | // load extra dictionaries |
michael@0 | 121 | int Hunspell::add_dic(const char * dpath, const char * key) { |
michael@0 | 122 | if (maxdic == MAXDIC || !affixpath) return 1; |
michael@0 | 123 | pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); |
michael@0 | 124 | if (pHMgr[maxdic]) maxdic++; else return 1; |
michael@0 | 125 | return 0; |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | // make a copy of src at destination while removing all leading |
michael@0 | 129 | // blanks and removing any trailing periods after recording |
michael@0 | 130 | // their presence with the abbreviation flag |
michael@0 | 131 | // also since already going through character by character, |
michael@0 | 132 | // set the capitalization type |
michael@0 | 133 | // return the length of the "cleaned" (and UTF-8 encoded) word |
michael@0 | 134 | |
michael@0 | 135 | int Hunspell::cleanword2(char * dest, const char * src, |
michael@0 | 136 | w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) |
michael@0 | 137 | { |
michael@0 | 138 | unsigned char * p = (unsigned char *) dest; |
michael@0 | 139 | const unsigned char * q = (const unsigned char * ) src; |
michael@0 | 140 | |
michael@0 | 141 | // first skip over any leading blanks |
michael@0 | 142 | while ((*q != '\0') && (*q == ' ')) q++; |
michael@0 | 143 | |
michael@0 | 144 | // now strip off any trailing periods (recording their presence) |
michael@0 | 145 | *pabbrev = 0; |
michael@0 | 146 | int nl = strlen((const char *)q); |
michael@0 | 147 | while ((nl > 0) && (*(q+nl-1)=='.')) { |
michael@0 | 148 | nl--; |
michael@0 | 149 | (*pabbrev)++; |
michael@0 | 150 | } |
michael@0 | 151 | |
michael@0 | 152 | // if no characters are left it can't be capitalized |
michael@0 | 153 | if (nl <= 0) { |
michael@0 | 154 | *pcaptype = NOCAP; |
michael@0 | 155 | *p = '\0'; |
michael@0 | 156 | return 0; |
michael@0 | 157 | } |
michael@0 | 158 | |
michael@0 | 159 | strncpy(dest, (char *) q, nl); |
michael@0 | 160 | *(dest + nl) = '\0'; |
michael@0 | 161 | nl = strlen(dest); |
michael@0 | 162 | if (utf8) { |
michael@0 | 163 | *nc = u8_u16(dest_utf, MAXWORDLEN, dest); |
michael@0 | 164 | // don't check too long words |
michael@0 | 165 | if (*nc >= MAXWORDLEN) return 0; |
michael@0 | 166 | if (*nc == -1) { // big Unicode character (non BMP area) |
michael@0 | 167 | *pcaptype = NOCAP; |
michael@0 | 168 | return nl; |
michael@0 | 169 | } |
michael@0 | 170 | *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); |
michael@0 | 171 | } else { |
michael@0 | 172 | *pcaptype = get_captype(dest, nl, csconv); |
michael@0 | 173 | *nc = nl; |
michael@0 | 174 | } |
michael@0 | 175 | return nl; |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | int Hunspell::cleanword(char * dest, const char * src, |
michael@0 | 179 | int * pcaptype, int * pabbrev) |
michael@0 | 180 | { |
michael@0 | 181 | unsigned char * p = (unsigned char *) dest; |
michael@0 | 182 | const unsigned char * q = (const unsigned char * ) src; |
michael@0 | 183 | int firstcap = 0; |
michael@0 | 184 | |
michael@0 | 185 | // first skip over any leading blanks |
michael@0 | 186 | while ((*q != '\0') && (*q == ' ')) q++; |
michael@0 | 187 | |
michael@0 | 188 | // now strip off any trailing periods (recording their presence) |
michael@0 | 189 | *pabbrev = 0; |
michael@0 | 190 | int nl = strlen((const char *)q); |
michael@0 | 191 | while ((nl > 0) && (*(q+nl-1)=='.')) { |
michael@0 | 192 | nl--; |
michael@0 | 193 | (*pabbrev)++; |
michael@0 | 194 | } |
michael@0 | 195 | |
michael@0 | 196 | // if no characters are left it can't be capitalized |
michael@0 | 197 | if (nl <= 0) { |
michael@0 | 198 | *pcaptype = NOCAP; |
michael@0 | 199 | *p = '\0'; |
michael@0 | 200 | return 0; |
michael@0 | 201 | } |
michael@0 | 202 | |
michael@0 | 203 | // now determine the capitalization type of the first nl letters |
michael@0 | 204 | int ncap = 0; |
michael@0 | 205 | int nneutral = 0; |
michael@0 | 206 | int nc = 0; |
michael@0 | 207 | |
michael@0 | 208 | if (!utf8) { |
michael@0 | 209 | while (nl > 0) { |
michael@0 | 210 | nc++; |
michael@0 | 211 | if (csconv[(*q)].ccase) ncap++; |
michael@0 | 212 | if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; |
michael@0 | 213 | *p++ = *q++; |
michael@0 | 214 | nl--; |
michael@0 | 215 | } |
michael@0 | 216 | // remember to terminate the destination string |
michael@0 | 217 | *p = '\0'; |
michael@0 | 218 | firstcap = csconv[(unsigned char)(*dest)].ccase; |
michael@0 | 219 | } else { |
michael@0 | 220 | unsigned short idx; |
michael@0 | 221 | w_char t[MAXWORDLEN]; |
michael@0 | 222 | nc = u8_u16(t, MAXWORDLEN, src); |
michael@0 | 223 | for (int i = 0; i < nc; i++) { |
michael@0 | 224 | idx = (t[i].h << 8) + t[i].l; |
michael@0 | 225 | unsigned short low = unicodetolower(idx, langnum); |
michael@0 | 226 | if (idx != low) ncap++; |
michael@0 | 227 | if (unicodetoupper(idx, langnum) == low) nneutral++; |
michael@0 | 228 | } |
michael@0 | 229 | u16_u8(dest, MAXWORDUTF8LEN, t, nc); |
michael@0 | 230 | if (ncap) { |
michael@0 | 231 | idx = (t[0].h << 8) + t[0].l; |
michael@0 | 232 | firstcap = (idx != unicodetolower(idx, langnum)); |
michael@0 | 233 | } |
michael@0 | 234 | } |
michael@0 | 235 | |
michael@0 | 236 | // now finally set the captype |
michael@0 | 237 | if (ncap == 0) { |
michael@0 | 238 | *pcaptype = NOCAP; |
michael@0 | 239 | } else if ((ncap == 1) && firstcap) { |
michael@0 | 240 | *pcaptype = INITCAP; |
michael@0 | 241 | } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ |
michael@0 | 242 | *pcaptype = ALLCAP; |
michael@0 | 243 | } else if ((ncap > 1) && firstcap) { |
michael@0 | 244 | *pcaptype = HUHINITCAP; |
michael@0 | 245 | } else { |
michael@0 | 246 | *pcaptype = HUHCAP; |
michael@0 | 247 | } |
michael@0 | 248 | return strlen(dest); |
michael@0 | 249 | } |
michael@0 | 250 | |
michael@0 | 251 | void Hunspell::mkallcap(char * p) |
michael@0 | 252 | { |
michael@0 | 253 | if (utf8) { |
michael@0 | 254 | w_char u[MAXWORDLEN]; |
michael@0 | 255 | int nc = u8_u16(u, MAXWORDLEN, p); |
michael@0 | 256 | unsigned short idx; |
michael@0 | 257 | for (int i = 0; i < nc; i++) { |
michael@0 | 258 | idx = (u[i].h << 8) + u[i].l; |
michael@0 | 259 | if (idx != unicodetoupper(idx, langnum)) { |
michael@0 | 260 | u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); |
michael@0 | 261 | u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); |
michael@0 | 262 | } |
michael@0 | 263 | } |
michael@0 | 264 | u16_u8(p, MAXWORDUTF8LEN, u, nc); |
michael@0 | 265 | } else { |
michael@0 | 266 | while (*p != '\0') { |
michael@0 | 267 | *p = csconv[((unsigned char) *p)].cupper; |
michael@0 | 268 | p++; |
michael@0 | 269 | } |
michael@0 | 270 | } |
michael@0 | 271 | } |
michael@0 | 272 | |
michael@0 | 273 | int Hunspell::mkallcap2(char * p, w_char * u, int nc) |
michael@0 | 274 | { |
michael@0 | 275 | if (utf8) { |
michael@0 | 276 | unsigned short idx; |
michael@0 | 277 | for (int i = 0; i < nc; i++) { |
michael@0 | 278 | idx = (u[i].h << 8) + u[i].l; |
michael@0 | 279 | unsigned short up = unicodetoupper(idx, langnum); |
michael@0 | 280 | if (idx != up) { |
michael@0 | 281 | u[i].h = (unsigned char) (up >> 8); |
michael@0 | 282 | u[i].l = (unsigned char) (up & 0x00FF); |
michael@0 | 283 | } |
michael@0 | 284 | } |
michael@0 | 285 | u16_u8(p, MAXWORDUTF8LEN, u, nc); |
michael@0 | 286 | return strlen(p); |
michael@0 | 287 | } else { |
michael@0 | 288 | while (*p != '\0') { |
michael@0 | 289 | *p = csconv[((unsigned char) *p)].cupper; |
michael@0 | 290 | p++; |
michael@0 | 291 | } |
michael@0 | 292 | } |
michael@0 | 293 | return nc; |
michael@0 | 294 | } |
michael@0 | 295 | |
michael@0 | 296 | |
michael@0 | 297 | void Hunspell::mkallsmall(char * p) |
michael@0 | 298 | { |
michael@0 | 299 | while (*p != '\0') { |
michael@0 | 300 | *p = csconv[((unsigned char) *p)].clower; |
michael@0 | 301 | p++; |
michael@0 | 302 | } |
michael@0 | 303 | } |
michael@0 | 304 | |
michael@0 | 305 | int Hunspell::mkallsmall2(char * p, w_char * u, int nc) |
michael@0 | 306 | { |
michael@0 | 307 | if (utf8) { |
michael@0 | 308 | unsigned short idx; |
michael@0 | 309 | for (int i = 0; i < nc; i++) { |
michael@0 | 310 | idx = (u[i].h << 8) + u[i].l; |
michael@0 | 311 | unsigned short low = unicodetolower(idx, langnum); |
michael@0 | 312 | if (idx != low) { |
michael@0 | 313 | u[i].h = (unsigned char) (low >> 8); |
michael@0 | 314 | u[i].l = (unsigned char) (low & 0x00FF); |
michael@0 | 315 | } |
michael@0 | 316 | } |
michael@0 | 317 | u16_u8(p, MAXWORDUTF8LEN, u, nc); |
michael@0 | 318 | return strlen(p); |
michael@0 | 319 | } else { |
michael@0 | 320 | while (*p != '\0') { |
michael@0 | 321 | *p = csconv[((unsigned char) *p)].clower; |
michael@0 | 322 | p++; |
michael@0 | 323 | } |
michael@0 | 324 | } |
michael@0 | 325 | return nc; |
michael@0 | 326 | } |
michael@0 | 327 | |
michael@0 | 328 | // convert UTF-8 sharp S codes to latin 1 |
michael@0 | 329 | char * Hunspell::sharps_u8_l1(char * dest, char * source) { |
michael@0 | 330 | char * p = dest; |
michael@0 | 331 | *p = *source; |
michael@0 | 332 | for (p++, source++; *(source - 1); p++, source++) { |
michael@0 | 333 | *p = *source; |
michael@0 | 334 | if (*source == '\x9F') *--p = '\xDF'; |
michael@0 | 335 | } |
michael@0 | 336 | return dest; |
michael@0 | 337 | } |
michael@0 | 338 | |
michael@0 | 339 | // recursive search for right ss - sharp s permutations |
michael@0 | 340 | hentry * Hunspell::spellsharps(char * base, char * pos, int n, |
michael@0 | 341 | int repnum, char * tmp, int * info, char **root) { |
michael@0 | 342 | pos = strstr(pos, "ss"); |
michael@0 | 343 | if (pos && (n < MAXSHARPS)) { |
michael@0 | 344 | *pos = '\xC3'; |
michael@0 | 345 | *(pos + 1) = '\x9F'; |
michael@0 | 346 | hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); |
michael@0 | 347 | if (h) return h; |
michael@0 | 348 | *pos = 's'; |
michael@0 | 349 | *(pos + 1) = 's'; |
michael@0 | 350 | h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root); |
michael@0 | 351 | if (h) return h; |
michael@0 | 352 | } else if (repnum > 0) { |
michael@0 | 353 | if (utf8) return checkword(base, info, root); |
michael@0 | 354 | return checkword(sharps_u8_l1(tmp, base), info, root); |
michael@0 | 355 | } |
michael@0 | 356 | return NULL; |
michael@0 | 357 | } |
michael@0 | 358 | |
michael@0 | 359 | int Hunspell::is_keepcase(const hentry * rv) { |
michael@0 | 360 | return pAMgr && rv->astr && pAMgr->get_keepcase() && |
michael@0 | 361 | TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); |
michael@0 | 362 | } |
michael@0 | 363 | |
michael@0 | 364 | /* insert a word to the beginning of the suggestion array and return ns */ |
michael@0 | 365 | int Hunspell::insert_sug(char ***slst, char * word, int ns) { |
michael@0 | 366 | char * dup = mystrdup(word); |
michael@0 | 367 | if (!dup) return ns; |
michael@0 | 368 | if (ns == MAXSUGGESTION) { |
michael@0 | 369 | ns--; |
michael@0 | 370 | free((*slst)[ns]); |
michael@0 | 371 | } |
michael@0 | 372 | for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; |
michael@0 | 373 | (*slst)[0] = dup; |
michael@0 | 374 | return ns + 1; |
michael@0 | 375 | } |
michael@0 | 376 | |
michael@0 | 377 | int Hunspell::spell(const char * word, int * info, char ** root) |
michael@0 | 378 | { |
michael@0 | 379 | struct hentry * rv=NULL; |
michael@0 | 380 | // need larger vector. For example, Turkish capital letter I converted a |
michael@0 | 381 | // 2-byte UTF-8 character (dotless i) by mkallsmall. |
michael@0 | 382 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 383 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 384 | w_char unicw[MAXWORDLEN]; |
michael@0 | 385 | // Hunspell supports XML input of the simplified API (see manual) |
michael@0 | 386 | if (strcmp(word, SPELL_XML) == 0) return 1; |
michael@0 | 387 | int nc = strlen(word); |
michael@0 | 388 | int wl2 = 0; |
michael@0 | 389 | if (utf8) { |
michael@0 | 390 | if (nc >= MAXWORDUTF8LEN) return 0; |
michael@0 | 391 | } else { |
michael@0 | 392 | if (nc >= MAXWORDLEN) return 0; |
michael@0 | 393 | } |
michael@0 | 394 | int captype = 0; |
michael@0 | 395 | int abbv = 0; |
michael@0 | 396 | int wl = 0; |
michael@0 | 397 | |
michael@0 | 398 | // input conversion |
michael@0 | 399 | RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
michael@0 | 400 | if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
michael@0 | 401 | else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
michael@0 | 402 | |
michael@0 | 403 | int info2 = 0; |
michael@0 | 404 | if (wl == 0 || maxdic == 0) return 1; |
michael@0 | 405 | if (root) *root = NULL; |
michael@0 | 406 | |
michael@0 | 407 | // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) |
michael@0 | 408 | enum { NBEGIN, NNUM, NSEP }; |
michael@0 | 409 | int nstate = NBEGIN; |
michael@0 | 410 | int i; |
michael@0 | 411 | |
michael@0 | 412 | for (i = 0; (i < wl); i++) { |
michael@0 | 413 | if ((cw[i] <= '9') && (cw[i] >= '0')) { |
michael@0 | 414 | nstate = NNUM; |
michael@0 | 415 | } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) { |
michael@0 | 416 | if ((nstate == NSEP) || (i == 0)) break; |
michael@0 | 417 | nstate = NSEP; |
michael@0 | 418 | } else break; |
michael@0 | 419 | } |
michael@0 | 420 | if ((i == wl) && (nstate == NNUM)) return 1; |
michael@0 | 421 | if (!info) info = &info2; else *info = 0; |
michael@0 | 422 | |
michael@0 | 423 | switch(captype) { |
michael@0 | 424 | case HUHCAP: |
michael@0 | 425 | case HUHINITCAP: |
michael@0 | 426 | *info += SPELL_ORIGCAP; |
michael@0 | 427 | case NOCAP: { |
michael@0 | 428 | rv = checkword(cw, info, root); |
michael@0 | 429 | if ((abbv) && !(rv)) { |
michael@0 | 430 | memcpy(wspace,cw,wl); |
michael@0 | 431 | *(wspace+wl) = '.'; |
michael@0 | 432 | *(wspace+wl+1) = '\0'; |
michael@0 | 433 | rv = checkword(wspace, info, root); |
michael@0 | 434 | } |
michael@0 | 435 | break; |
michael@0 | 436 | } |
michael@0 | 437 | case ALLCAP: { |
michael@0 | 438 | *info += SPELL_ORIGCAP; |
michael@0 | 439 | rv = checkword(cw, info, root); |
michael@0 | 440 | if (rv) break; |
michael@0 | 441 | if (abbv) { |
michael@0 | 442 | memcpy(wspace,cw,wl); |
michael@0 | 443 | *(wspace+wl) = '.'; |
michael@0 | 444 | *(wspace+wl+1) = '\0'; |
michael@0 | 445 | rv = checkword(wspace, info, root); |
michael@0 | 446 | if (rv) break; |
michael@0 | 447 | } |
michael@0 | 448 | // Spec. prefix handling for Catalan, French, Italian: |
michael@0 | 449 | // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). |
michael@0 | 450 | if (pAMgr && strchr(cw, '\'')) { |
michael@0 | 451 | wl = mkallsmall2(cw, unicw, nc); |
michael@0 | 452 | //There are no really sane circumstances where this could fail, |
michael@0 | 453 | //but anyway... |
michael@0 | 454 | if (char * apostrophe = strchr(cw, '\'')) { |
michael@0 | 455 | if (utf8) { |
michael@0 | 456 | w_char tmpword[MAXWORDLEN]; |
michael@0 | 457 | *apostrophe = '\0'; |
michael@0 | 458 | wl2 = u8_u16(tmpword, MAXWORDLEN, cw); |
michael@0 | 459 | *apostrophe = '\''; |
michael@0 | 460 | if (wl2 < nc) { |
michael@0 | 461 | mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); |
michael@0 | 462 | rv = checkword(cw, info, root); |
michael@0 | 463 | if (rv) break; |
michael@0 | 464 | } |
michael@0 | 465 | } else { |
michael@0 | 466 | mkinitcap2(apostrophe + 1, unicw, nc); |
michael@0 | 467 | rv = checkword(cw, info, root); |
michael@0 | 468 | if (rv) break; |
michael@0 | 469 | } |
michael@0 | 470 | } |
michael@0 | 471 | mkinitcap2(cw, unicw, nc); |
michael@0 | 472 | rv = checkword(cw, info, root); |
michael@0 | 473 | if (rv) break; |
michael@0 | 474 | } |
michael@0 | 475 | if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { |
michael@0 | 476 | char tmpword[MAXWORDUTF8LEN]; |
michael@0 | 477 | wl = mkallsmall2(cw, unicw, nc); |
michael@0 | 478 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 479 | rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
michael@0 | 480 | if (!rv) { |
michael@0 | 481 | wl2 = mkinitcap2(cw, unicw, nc); |
michael@0 | 482 | rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); |
michael@0 | 483 | } |
michael@0 | 484 | if ((abbv) && !(rv)) { |
michael@0 | 485 | *(wspace+wl) = '.'; |
michael@0 | 486 | *(wspace+wl+1) = '\0'; |
michael@0 | 487 | rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
michael@0 | 488 | if (!rv) { |
michael@0 | 489 | memcpy(wspace, cw, wl2); |
michael@0 | 490 | *(wspace+wl2) = '.'; |
michael@0 | 491 | *(wspace+wl2+1) = '\0'; |
michael@0 | 492 | rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); |
michael@0 | 493 | } |
michael@0 | 494 | } |
michael@0 | 495 | if (rv) break; |
michael@0 | 496 | } |
michael@0 | 497 | } |
michael@0 | 498 | case INITCAP: { |
michael@0 | 499 | *info += SPELL_ORIGCAP; |
michael@0 | 500 | wl = mkallsmall2(cw, unicw, nc); |
michael@0 | 501 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 502 | wl2 = mkinitcap2(cw, unicw, nc); |
michael@0 | 503 | if (captype == INITCAP) *info += SPELL_INITCAP; |
michael@0 | 504 | rv = checkword(cw, info, root); |
michael@0 | 505 | if (captype == INITCAP) *info -= SPELL_INITCAP; |
michael@0 | 506 | // forbid bad capitalization |
michael@0 | 507 | // (for example, ijs -> Ijs instead of IJs in Dutch) |
michael@0 | 508 | // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) |
michael@0 | 509 | if (*info & SPELL_FORBIDDEN) { |
michael@0 | 510 | rv = NULL; |
michael@0 | 511 | break; |
michael@0 | 512 | } |
michael@0 | 513 | if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; |
michael@0 | 514 | if (rv) break; |
michael@0 | 515 | |
michael@0 | 516 | rv = checkword(wspace, info, root); |
michael@0 | 517 | if (abbv && !rv) { |
michael@0 | 518 | |
michael@0 | 519 | *(wspace+wl) = '.'; |
michael@0 | 520 | *(wspace+wl+1) = '\0'; |
michael@0 | 521 | rv = checkword(wspace, info, root); |
michael@0 | 522 | if (!rv) { |
michael@0 | 523 | memcpy(wspace, cw, wl2); |
michael@0 | 524 | *(wspace+wl2) = '.'; |
michael@0 | 525 | *(wspace+wl2+1) = '\0'; |
michael@0 | 526 | if (captype == INITCAP) *info += SPELL_INITCAP; |
michael@0 | 527 | rv = checkword(wspace, info, root); |
michael@0 | 528 | if (captype == INITCAP) *info -= SPELL_INITCAP; |
michael@0 | 529 | if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; |
michael@0 | 530 | break; |
michael@0 | 531 | } |
michael@0 | 532 | } |
michael@0 | 533 | if (rv && is_keepcase(rv) && |
michael@0 | 534 | ((captype == ALLCAP) || |
michael@0 | 535 | // if CHECKSHARPS: KEEPCASE words with \xDF are allowed |
michael@0 | 536 | // in INITCAP form, too. |
michael@0 | 537 | !(pAMgr->get_checksharps() && |
michael@0 | 538 | ((utf8 && strstr(wspace, "\xC3\x9F")) || |
michael@0 | 539 | (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; |
michael@0 | 540 | break; |
michael@0 | 541 | } |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | if (rv) { |
michael@0 | 545 | if (pAMgr && pAMgr->get_warn() && rv->astr && |
michael@0 | 546 | TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { |
michael@0 | 547 | *info += SPELL_WARN; |
michael@0 | 548 | if (pAMgr->get_forbidwarn()) return 0; |
michael@0 | 549 | return HUNSPELL_OK_WARN; |
michael@0 | 550 | } |
michael@0 | 551 | return HUNSPELL_OK; |
michael@0 | 552 | } |
michael@0 | 553 | |
michael@0 | 554 | // recursive breaking at break points |
michael@0 | 555 | if (wordbreak) { |
michael@0 | 556 | char * s; |
michael@0 | 557 | char r; |
michael@0 | 558 | int nbr = 0; |
michael@0 | 559 | wl = strlen(cw); |
michael@0 | 560 | int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; |
michael@0 | 561 | |
michael@0 | 562 | // calculate break points for recursion limit |
michael@0 | 563 | for (int j = 0; j < numbreak; j++) { |
michael@0 | 564 | s = cw; |
michael@0 | 565 | do { |
michael@0 | 566 | s = (char *) strstr(s, wordbreak[j]); |
michael@0 | 567 | if (s) { |
michael@0 | 568 | nbr++; |
michael@0 | 569 | s++; |
michael@0 | 570 | } |
michael@0 | 571 | } while (s); |
michael@0 | 572 | } |
michael@0 | 573 | if (nbr >= 10) return 0; |
michael@0 | 574 | |
michael@0 | 575 | // check boundary patterns (^begin and end$) |
michael@0 | 576 | for (int j = 0; j < numbreak; j++) { |
michael@0 | 577 | int plen = strlen(wordbreak[j]); |
michael@0 | 578 | if (plen == 1 || plen > wl) continue; |
michael@0 | 579 | if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 |
michael@0 | 580 | && spell(cw + plen - 1)) return 1; |
michael@0 | 581 | if (wordbreak[j][plen - 1] == '$' && |
michael@0 | 582 | strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { |
michael@0 | 583 | r = cw[wl - plen + 1]; |
michael@0 | 584 | cw[wl - plen + 1] = '\0'; |
michael@0 | 585 | if (spell(cw)) return 1; |
michael@0 | 586 | cw[wl - plen + 1] = r; |
michael@0 | 587 | } |
michael@0 | 588 | } |
michael@0 | 589 | |
michael@0 | 590 | // other patterns |
michael@0 | 591 | for (int j = 0; j < numbreak; j++) { |
michael@0 | 592 | int plen = strlen(wordbreak[j]); |
michael@0 | 593 | s=(char *) strstr(cw, wordbreak[j]); |
michael@0 | 594 | if (s && (s > cw) && (s < cw + wl - plen)) { |
michael@0 | 595 | if (!spell(s + plen)) continue; |
michael@0 | 596 | r = *s; |
michael@0 | 597 | *s = '\0'; |
michael@0 | 598 | // examine 2 sides of the break point |
michael@0 | 599 | if (spell(cw)) return 1; |
michael@0 | 600 | *s = r; |
michael@0 | 601 | |
michael@0 | 602 | // LANG_hu: spec. dash rule |
michael@0 | 603 | if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { |
michael@0 | 604 | r = s[1]; |
michael@0 | 605 | s[1] = '\0'; |
michael@0 | 606 | if (spell(cw)) return 1; // check the first part with dash |
michael@0 | 607 | s[1] = r; |
michael@0 | 608 | } |
michael@0 | 609 | // end of LANG speficic region |
michael@0 | 610 | |
michael@0 | 611 | } |
michael@0 | 612 | } |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | return 0; |
michael@0 | 616 | } |
michael@0 | 617 | |
michael@0 | 618 | struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) |
michael@0 | 619 | { |
michael@0 | 620 | struct hentry * he = NULL; |
michael@0 | 621 | int len, i; |
michael@0 | 622 | char w2[MAXWORDUTF8LEN]; |
michael@0 | 623 | const char * word; |
michael@0 | 624 | |
michael@0 | 625 | char * ignoredchars = pAMgr->get_ignore(); |
michael@0 | 626 | if (ignoredchars != NULL) { |
michael@0 | 627 | strcpy(w2, w); |
michael@0 | 628 | if (utf8) { |
michael@0 | 629 | int ignoredchars_utf16_len; |
michael@0 | 630 | unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len); |
michael@0 | 631 | remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len); |
michael@0 | 632 | } else { |
michael@0 | 633 | remove_ignored_chars(w2,ignoredchars); |
michael@0 | 634 | } |
michael@0 | 635 | word = w2; |
michael@0 | 636 | } else word = w; |
michael@0 | 637 | |
michael@0 | 638 | len = strlen(word); |
michael@0 | 639 | |
michael@0 | 640 | if (!len) |
michael@0 | 641 | return NULL; |
michael@0 | 642 | |
michael@0 | 643 | // word reversing wrapper for complex prefixes |
michael@0 | 644 | if (complexprefixes) { |
michael@0 | 645 | if (word != w2) { |
michael@0 | 646 | strcpy(w2, word); |
michael@0 | 647 | word = w2; |
michael@0 | 648 | } |
michael@0 | 649 | if (utf8) reverseword_utf(w2); else reverseword(w2); |
michael@0 | 650 | } |
michael@0 | 651 | |
michael@0 | 652 | // look word in hash table |
michael@0 | 653 | for (i = 0; (i < maxdic) && !he; i ++) { |
michael@0 | 654 | he = (pHMgr[i])->lookup(word); |
michael@0 | 655 | |
michael@0 | 656 | // check forbidden and onlyincompound words |
michael@0 | 657 | if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { |
michael@0 | 658 | if (info) *info += SPELL_FORBIDDEN; |
michael@0 | 659 | // LANG_hu section: set dash information for suggestions |
michael@0 | 660 | if (langnum == LANG_hu) { |
michael@0 | 661 | if (pAMgr->get_compoundflag() && |
michael@0 | 662 | TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { |
michael@0 | 663 | if (info) *info += SPELL_COMPOUND; |
michael@0 | 664 | } |
michael@0 | 665 | } |
michael@0 | 666 | return NULL; |
michael@0 | 667 | } |
michael@0 | 668 | |
michael@0 | 669 | // he = next not needaffix, onlyincompound homonym or onlyupcase word |
michael@0 | 670 | while (he && (he->astr) && |
michael@0 | 671 | ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || |
michael@0 | 672 | (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || |
michael@0 | 673 | (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) |
michael@0 | 674 | )) he = he->next_homonym; |
michael@0 | 675 | } |
michael@0 | 676 | |
michael@0 | 677 | // check with affixes |
michael@0 | 678 | if (!he && pAMgr) { |
michael@0 | 679 | // try stripping off affixes */ |
michael@0 | 680 | he = pAMgr->affix_check(word, len, 0); |
michael@0 | 681 | |
michael@0 | 682 | // check compound restriction and onlyupcase |
michael@0 | 683 | if (he && he->astr && ( |
michael@0 | 684 | (pAMgr->get_onlyincompound() && |
michael@0 | 685 | TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || |
michael@0 | 686 | (info && (*info & SPELL_INITCAP) && |
michael@0 | 687 | TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { |
michael@0 | 688 | he = NULL; |
michael@0 | 689 | } |
michael@0 | 690 | |
michael@0 | 691 | if (he) { |
michael@0 | 692 | if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { |
michael@0 | 693 | if (info) *info += SPELL_FORBIDDEN; |
michael@0 | 694 | return NULL; |
michael@0 | 695 | } |
michael@0 | 696 | if (root) { |
michael@0 | 697 | *root = mystrdup(he->word); |
michael@0 | 698 | if (*root && complexprefixes) { |
michael@0 | 699 | if (utf8) reverseword_utf(*root); else reverseword(*root); |
michael@0 | 700 | } |
michael@0 | 701 | } |
michael@0 | 702 | // try check compound word |
michael@0 | 703 | } else if (pAMgr->get_compound()) { |
michael@0 | 704 | he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info); |
michael@0 | 705 | // LANG_hu section: `moving rule' with last dash |
michael@0 | 706 | if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { |
michael@0 | 707 | char * dup = mystrdup(word); |
michael@0 | 708 | if (!dup) return NULL; |
michael@0 | 709 | dup[len-1] = '\0'; |
michael@0 | 710 | he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info); |
michael@0 | 711 | free(dup); |
michael@0 | 712 | } |
michael@0 | 713 | // end of LANG speficic region |
michael@0 | 714 | if (he) { |
michael@0 | 715 | if (root) { |
michael@0 | 716 | *root = mystrdup(he->word); |
michael@0 | 717 | if (*root && complexprefixes) { |
michael@0 | 718 | if (utf8) reverseword_utf(*root); else reverseword(*root); |
michael@0 | 719 | } |
michael@0 | 720 | } |
michael@0 | 721 | if (info) *info += SPELL_COMPOUND; |
michael@0 | 722 | } |
michael@0 | 723 | } |
michael@0 | 724 | |
michael@0 | 725 | } |
michael@0 | 726 | |
michael@0 | 727 | return he; |
michael@0 | 728 | } |
michael@0 | 729 | |
michael@0 | 730 | int Hunspell::suggest(char*** slst, const char * word) |
michael@0 | 731 | { |
michael@0 | 732 | int onlycmpdsug = 0; |
michael@0 | 733 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 734 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 735 | if (!pSMgr || maxdic == 0) return 0; |
michael@0 | 736 | w_char unicw[MAXWORDLEN]; |
michael@0 | 737 | *slst = NULL; |
michael@0 | 738 | // process XML input of the simplified API (see manual) |
michael@0 | 739 | if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { |
michael@0 | 740 | return spellml(slst, word); |
michael@0 | 741 | } |
michael@0 | 742 | int nc = strlen(word); |
michael@0 | 743 | if (utf8) { |
michael@0 | 744 | if (nc >= MAXWORDUTF8LEN) return 0; |
michael@0 | 745 | } else { |
michael@0 | 746 | if (nc >= MAXWORDLEN) return 0; |
michael@0 | 747 | } |
michael@0 | 748 | int captype = 0; |
michael@0 | 749 | int abbv = 0; |
michael@0 | 750 | int wl = 0; |
michael@0 | 751 | |
michael@0 | 752 | // input conversion |
michael@0 | 753 | RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
michael@0 | 754 | if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
michael@0 | 755 | else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
michael@0 | 756 | |
michael@0 | 757 | if (wl == 0) return 0; |
michael@0 | 758 | int ns = 0; |
michael@0 | 759 | int capwords = 0; |
michael@0 | 760 | |
michael@0 | 761 | // check capitalized form for FORCEUCASE |
michael@0 | 762 | if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { |
michael@0 | 763 | int info = SPELL_ORIGCAP; |
michael@0 | 764 | char ** wlst; |
michael@0 | 765 | if (checkword(cw, &info, NULL)) { |
michael@0 | 766 | if (*slst) { |
michael@0 | 767 | wlst = *slst; |
michael@0 | 768 | } else { |
michael@0 | 769 | wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *)); |
michael@0 | 770 | if (wlst == NULL) return -1; |
michael@0 | 771 | *slst = wlst; |
michael@0 | 772 | for (int i = 0; i < MAXSUGGESTION; i++) { |
michael@0 | 773 | wlst[i] = NULL; |
michael@0 | 774 | } |
michael@0 | 775 | } |
michael@0 | 776 | wlst[0] = mystrdup(cw); |
michael@0 | 777 | mkinitcap(wlst[0]); |
michael@0 | 778 | return 1; |
michael@0 | 779 | } |
michael@0 | 780 | } |
michael@0 | 781 | |
michael@0 | 782 | switch(captype) { |
michael@0 | 783 | case NOCAP: { |
michael@0 | 784 | ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
michael@0 | 785 | break; |
michael@0 | 786 | } |
michael@0 | 787 | |
michael@0 | 788 | case INITCAP: { |
michael@0 | 789 | capwords = 1; |
michael@0 | 790 | ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
michael@0 | 791 | if (ns == -1) break; |
michael@0 | 792 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 793 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 794 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 795 | break; |
michael@0 | 796 | } |
michael@0 | 797 | case HUHINITCAP: |
michael@0 | 798 | capwords = 1; |
michael@0 | 799 | case HUHCAP: { |
michael@0 | 800 | ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); |
michael@0 | 801 | if (ns != -1) { |
michael@0 | 802 | int prevns; |
michael@0 | 803 | // something.The -> something. The |
michael@0 | 804 | char * dot = strchr(cw, '.'); |
michael@0 | 805 | if (dot && (dot > cw)) { |
michael@0 | 806 | int captype_; |
michael@0 | 807 | if (utf8) { |
michael@0 | 808 | w_char w_[MAXWORDLEN]; |
michael@0 | 809 | int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); |
michael@0 | 810 | captype_ = get_captype_utf8(w_, wl_, langnum); |
michael@0 | 811 | } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); |
michael@0 | 812 | if (captype_ == INITCAP) { |
michael@0 | 813 | char * st = mystrdup(cw); |
michael@0 | 814 | if (st) st = (char *) realloc(st, wl + 2); |
michael@0 | 815 | if (st) { |
michael@0 | 816 | st[(dot - cw) + 1] = ' '; |
michael@0 | 817 | strcpy(st + (dot - cw) + 2, dot + 1); |
michael@0 | 818 | ns = insert_sug(slst, st, ns); |
michael@0 | 819 | free(st); |
michael@0 | 820 | } |
michael@0 | 821 | } |
michael@0 | 822 | } |
michael@0 | 823 | if (captype == HUHINITCAP) { |
michael@0 | 824 | // TheOpenOffice.org -> The OpenOffice.org |
michael@0 | 825 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 826 | mkinitsmall2(wspace, unicw, nc); |
michael@0 | 827 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 828 | } |
michael@0 | 829 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 830 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 831 | if (spell(wspace)) ns = insert_sug(slst, wspace, ns); |
michael@0 | 832 | prevns = ns; |
michael@0 | 833 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 834 | if (captype == HUHINITCAP) { |
michael@0 | 835 | mkinitcap2(wspace, unicw, nc); |
michael@0 | 836 | if (spell(wspace)) ns = insert_sug(slst, wspace, ns); |
michael@0 | 837 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 838 | } |
michael@0 | 839 | // aNew -> "a New" (instead of "a new") |
michael@0 | 840 | for (int j = prevns; j < ns; j++) { |
michael@0 | 841 | char * space = strchr((*slst)[j],' '); |
michael@0 | 842 | if (space) { |
michael@0 | 843 | int slen = strlen(space + 1); |
michael@0 | 844 | // different case after space (need capitalisation) |
michael@0 | 845 | if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { |
michael@0 | 846 | w_char w[MAXWORDLEN]; |
michael@0 | 847 | int wc = 0; |
michael@0 | 848 | char * r = (*slst)[j]; |
michael@0 | 849 | if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); |
michael@0 | 850 | mkinitcap2(space + 1, w, wc); |
michael@0 | 851 | // set as first suggestion |
michael@0 | 852 | for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; |
michael@0 | 853 | (*slst)[0] = r; |
michael@0 | 854 | } |
michael@0 | 855 | } |
michael@0 | 856 | } |
michael@0 | 857 | } |
michael@0 | 858 | break; |
michael@0 | 859 | } |
michael@0 | 860 | |
michael@0 | 861 | case ALLCAP: { |
michael@0 | 862 | memcpy(wspace, cw, (wl+1)); |
michael@0 | 863 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 864 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 865 | if (ns == -1) break; |
michael@0 | 866 | if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) |
michael@0 | 867 | ns = insert_sug(slst, wspace, ns); |
michael@0 | 868 | mkinitcap2(wspace, unicw, nc); |
michael@0 | 869 | ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); |
michael@0 | 870 | for (int j=0; j < ns; j++) { |
michael@0 | 871 | mkallcap((*slst)[j]); |
michael@0 | 872 | if (pAMgr && pAMgr->get_checksharps()) { |
michael@0 | 873 | char * pos; |
michael@0 | 874 | if (utf8) { |
michael@0 | 875 | pos = strstr((*slst)[j], "\xC3\x9F"); |
michael@0 | 876 | while (pos) { |
michael@0 | 877 | *pos = 'S'; |
michael@0 | 878 | *(pos+1) = 'S'; |
michael@0 | 879 | pos = strstr(pos+2, "\xC3\x9F"); |
michael@0 | 880 | } |
michael@0 | 881 | } else { |
michael@0 | 882 | pos = strchr((*slst)[j], '\xDF'); |
michael@0 | 883 | while (pos) { |
michael@0 | 884 | (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); |
michael@0 | 885 | mystrrep((*slst)[j], "\xDF", "SS"); |
michael@0 | 886 | pos = strchr((*slst)[j], '\xDF'); |
michael@0 | 887 | } |
michael@0 | 888 | } |
michael@0 | 889 | } |
michael@0 | 890 | } |
michael@0 | 891 | break; |
michael@0 | 892 | } |
michael@0 | 893 | } |
michael@0 | 894 | |
michael@0 | 895 | // LANG_hu section: replace '-' with ' ' in Hungarian |
michael@0 | 896 | if (langnum == LANG_hu) { |
michael@0 | 897 | for (int j=0; j < ns; j++) { |
michael@0 | 898 | char * pos = strchr((*slst)[j],'-'); |
michael@0 | 899 | if (pos) { |
michael@0 | 900 | int info; |
michael@0 | 901 | char w[MAXWORDUTF8LEN]; |
michael@0 | 902 | *pos = '\0'; |
michael@0 | 903 | strcpy(w, (*slst)[j]); |
michael@0 | 904 | strcat(w, pos + 1); |
michael@0 | 905 | spell(w, &info, NULL); |
michael@0 | 906 | if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { |
michael@0 | 907 | *pos = ' '; |
michael@0 | 908 | } else *pos = '-'; |
michael@0 | 909 | } |
michael@0 | 910 | } |
michael@0 | 911 | } |
michael@0 | 912 | // END OF LANG_hu section |
michael@0 | 913 | |
michael@0 | 914 | // try ngram approach since found nothing or only compound words |
michael@0 | 915 | if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) { |
michael@0 | 916 | switch(captype) { |
michael@0 | 917 | case NOCAP: { |
michael@0 | 918 | ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); |
michael@0 | 919 | break; |
michael@0 | 920 | } |
michael@0 | 921 | case HUHINITCAP: |
michael@0 | 922 | capwords = 1; |
michael@0 | 923 | case HUHCAP: { |
michael@0 | 924 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 925 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 926 | ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
michael@0 | 927 | break; |
michael@0 | 928 | } |
michael@0 | 929 | case INITCAP: { |
michael@0 | 930 | capwords = 1; |
michael@0 | 931 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 932 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 933 | ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
michael@0 | 934 | break; |
michael@0 | 935 | } |
michael@0 | 936 | case ALLCAP: { |
michael@0 | 937 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 938 | mkallsmall2(wspace, unicw, nc); |
michael@0 | 939 | int oldns = ns; |
michael@0 | 940 | ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); |
michael@0 | 941 | for (int j = oldns; j < ns; j++) |
michael@0 | 942 | mkallcap((*slst)[j]); |
michael@0 | 943 | break; |
michael@0 | 944 | } |
michael@0 | 945 | } |
michael@0 | 946 | } |
michael@0 | 947 | |
michael@0 | 948 | // try dash suggestion (Afo-American -> Afro-American) |
michael@0 | 949 | if (char * pos = strchr(cw, '-')) { |
michael@0 | 950 | char * ppos = cw; |
michael@0 | 951 | int nodashsug = 1; |
michael@0 | 952 | char ** nlst = NULL; |
michael@0 | 953 | int nn = 0; |
michael@0 | 954 | int last = 0; |
michael@0 | 955 | if (*slst) { |
michael@0 | 956 | for (int j = 0; j < ns && nodashsug == 1; j++) { |
michael@0 | 957 | if (strchr((*slst)[j], '-')) nodashsug = 0; |
michael@0 | 958 | } |
michael@0 | 959 | } |
michael@0 | 960 | while (nodashsug && !last) { |
michael@0 | 961 | if (*pos == '\0') last = 1; else *pos = '\0'; |
michael@0 | 962 | if (!spell(ppos)) { |
michael@0 | 963 | nn = suggest(&nlst, ppos); |
michael@0 | 964 | for (int j = nn - 1; j >= 0; j--) { |
michael@0 | 965 | strncpy(wspace, cw, ppos - cw); |
michael@0 | 966 | strcpy(wspace + (ppos - cw), nlst[j]); |
michael@0 | 967 | if (!last) { |
michael@0 | 968 | strcat(wspace, "-"); |
michael@0 | 969 | strcat(wspace, pos + 1); |
michael@0 | 970 | } |
michael@0 | 971 | ns = insert_sug(slst, wspace, ns); |
michael@0 | 972 | free(nlst[j]); |
michael@0 | 973 | } |
michael@0 | 974 | if (nlst != NULL) free(nlst); |
michael@0 | 975 | nodashsug = 0; |
michael@0 | 976 | } |
michael@0 | 977 | if (!last) { |
michael@0 | 978 | *pos = '-'; |
michael@0 | 979 | ppos = pos + 1; |
michael@0 | 980 | pos = strchr(ppos, '-'); |
michael@0 | 981 | } |
michael@0 | 982 | if (!pos) pos = cw + strlen(cw); |
michael@0 | 983 | } |
michael@0 | 984 | } |
michael@0 | 985 | |
michael@0 | 986 | // word reversing wrapper for complex prefixes |
michael@0 | 987 | if (complexprefixes) { |
michael@0 | 988 | for (int j = 0; j < ns; j++) { |
michael@0 | 989 | if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); |
michael@0 | 990 | } |
michael@0 | 991 | } |
michael@0 | 992 | |
michael@0 | 993 | // capitalize |
michael@0 | 994 | if (capwords) for (int j=0; j < ns; j++) { |
michael@0 | 995 | mkinitcap((*slst)[j]); |
michael@0 | 996 | } |
michael@0 | 997 | |
michael@0 | 998 | // expand suggestions with dot(s) |
michael@0 | 999 | if (abbv && pAMgr && pAMgr->get_sugswithdots()) { |
michael@0 | 1000 | for (int j = 0; j < ns; j++) { |
michael@0 | 1001 | (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); |
michael@0 | 1002 | strcat((*slst)[j], word + strlen(word) - abbv); |
michael@0 | 1003 | } |
michael@0 | 1004 | } |
michael@0 | 1005 | |
michael@0 | 1006 | // remove bad capitalized and forbidden forms |
michael@0 | 1007 | if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { |
michael@0 | 1008 | switch (captype) { |
michael@0 | 1009 | case INITCAP: |
michael@0 | 1010 | case ALLCAP: { |
michael@0 | 1011 | int l = 0; |
michael@0 | 1012 | for (int j=0; j < ns; j++) { |
michael@0 | 1013 | if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { |
michael@0 | 1014 | char s[MAXSWUTF8L]; |
michael@0 | 1015 | w_char w[MAXSWL]; |
michael@0 | 1016 | int len; |
michael@0 | 1017 | if (utf8) { |
michael@0 | 1018 | len = u8_u16(w, MAXSWL, (*slst)[j]); |
michael@0 | 1019 | } else { |
michael@0 | 1020 | strcpy(s, (*slst)[j]); |
michael@0 | 1021 | len = strlen(s); |
michael@0 | 1022 | } |
michael@0 | 1023 | mkallsmall2(s, w, len); |
michael@0 | 1024 | free((*slst)[j]); |
michael@0 | 1025 | if (spell(s)) { |
michael@0 | 1026 | (*slst)[l] = mystrdup(s); |
michael@0 | 1027 | if ((*slst)[l]) l++; |
michael@0 | 1028 | } else { |
michael@0 | 1029 | mkinitcap2(s, w, len); |
michael@0 | 1030 | if (spell(s)) { |
michael@0 | 1031 | (*slst)[l] = mystrdup(s); |
michael@0 | 1032 | if ((*slst)[l]) l++; |
michael@0 | 1033 | } |
michael@0 | 1034 | } |
michael@0 | 1035 | } else { |
michael@0 | 1036 | (*slst)[l] = (*slst)[j]; |
michael@0 | 1037 | l++; |
michael@0 | 1038 | } |
michael@0 | 1039 | } |
michael@0 | 1040 | ns = l; |
michael@0 | 1041 | } |
michael@0 | 1042 | } |
michael@0 | 1043 | } |
michael@0 | 1044 | |
michael@0 | 1045 | // remove duplications |
michael@0 | 1046 | int l = 0; |
michael@0 | 1047 | for (int j = 0; j < ns; j++) { |
michael@0 | 1048 | (*slst)[l] = (*slst)[j]; |
michael@0 | 1049 | for (int k = 0; k < l; k++) { |
michael@0 | 1050 | if (strcmp((*slst)[k], (*slst)[j]) == 0) { |
michael@0 | 1051 | free((*slst)[j]); |
michael@0 | 1052 | l--; |
michael@0 | 1053 | break; |
michael@0 | 1054 | } |
michael@0 | 1055 | } |
michael@0 | 1056 | l++; |
michael@0 | 1057 | } |
michael@0 | 1058 | ns = l; |
michael@0 | 1059 | |
michael@0 | 1060 | // output conversion |
michael@0 | 1061 | rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; |
michael@0 | 1062 | for (int j = 0; rl && j < ns; j++) { |
michael@0 | 1063 | if (rl->conv((*slst)[j], wspace)) { |
michael@0 | 1064 | free((*slst)[j]); |
michael@0 | 1065 | (*slst)[j] = mystrdup(wspace); |
michael@0 | 1066 | } |
michael@0 | 1067 | } |
michael@0 | 1068 | |
michael@0 | 1069 | // if suggestions removed by nosuggest, onlyincompound parameters |
michael@0 | 1070 | if (l == 0 && *slst) { |
michael@0 | 1071 | free(*slst); |
michael@0 | 1072 | *slst = NULL; |
michael@0 | 1073 | } |
michael@0 | 1074 | return l; |
michael@0 | 1075 | } |
michael@0 | 1076 | |
michael@0 | 1077 | void Hunspell::free_list(char *** slst, int n) { |
michael@0 | 1078 | freelist(slst, n); |
michael@0 | 1079 | } |
michael@0 | 1080 | |
michael@0 | 1081 | char * Hunspell::get_dic_encoding() |
michael@0 | 1082 | { |
michael@0 | 1083 | return encoding; |
michael@0 | 1084 | } |
michael@0 | 1085 | |
michael@0 | 1086 | #ifdef HUNSPELL_EXPERIMENTAL |
michael@0 | 1087 | // XXX need UTF-8 support |
michael@0 | 1088 | int Hunspell::suggest_auto(char*** slst, const char * word) |
michael@0 | 1089 | { |
michael@0 | 1090 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1091 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 1092 | if (!pSMgr || maxdic == 0) return 0; |
michael@0 | 1093 | int wl = strlen(word); |
michael@0 | 1094 | if (utf8) { |
michael@0 | 1095 | if (wl >= MAXWORDUTF8LEN) return 0; |
michael@0 | 1096 | } else { |
michael@0 | 1097 | if (wl >= MAXWORDLEN) return 0; |
michael@0 | 1098 | } |
michael@0 | 1099 | int captype = 0; |
michael@0 | 1100 | int abbv = 0; |
michael@0 | 1101 | wl = cleanword(cw, word, &captype, &abbv); |
michael@0 | 1102 | if (wl == 0) return 0; |
michael@0 | 1103 | int ns = 0; |
michael@0 | 1104 | *slst = NULL; // HU, nsug in pSMgr->suggest |
michael@0 | 1105 | |
michael@0 | 1106 | switch(captype) { |
michael@0 | 1107 | case NOCAP: { |
michael@0 | 1108 | ns = pSMgr->suggest_auto(slst, cw, ns); |
michael@0 | 1109 | if (ns>0) break; |
michael@0 | 1110 | break; |
michael@0 | 1111 | } |
michael@0 | 1112 | |
michael@0 | 1113 | case INITCAP: { |
michael@0 | 1114 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1115 | mkallsmall(wspace); |
michael@0 | 1116 | ns = pSMgr->suggest_auto(slst, wspace, ns); |
michael@0 | 1117 | for (int j=0; j < ns; j++) |
michael@0 | 1118 | mkinitcap((*slst)[j]); |
michael@0 | 1119 | ns = pSMgr->suggest_auto(slst, cw, ns); |
michael@0 | 1120 | break; |
michael@0 | 1121 | |
michael@0 | 1122 | } |
michael@0 | 1123 | |
michael@0 | 1124 | case HUHINITCAP: |
michael@0 | 1125 | case HUHCAP: { |
michael@0 | 1126 | ns = pSMgr->suggest_auto(slst, cw, ns); |
michael@0 | 1127 | if (ns == 0) { |
michael@0 | 1128 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1129 | mkallsmall(wspace); |
michael@0 | 1130 | ns = pSMgr->suggest_auto(slst, wspace, ns); |
michael@0 | 1131 | } |
michael@0 | 1132 | break; |
michael@0 | 1133 | } |
michael@0 | 1134 | |
michael@0 | 1135 | case ALLCAP: { |
michael@0 | 1136 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1137 | mkallsmall(wspace); |
michael@0 | 1138 | ns = pSMgr->suggest_auto(slst, wspace, ns); |
michael@0 | 1139 | |
michael@0 | 1140 | mkinitcap(wspace); |
michael@0 | 1141 | ns = pSMgr->suggest_auto(slst, wspace, ns); |
michael@0 | 1142 | |
michael@0 | 1143 | for (int j=0; j < ns; j++) |
michael@0 | 1144 | mkallcap((*slst)[j]); |
michael@0 | 1145 | break; |
michael@0 | 1146 | } |
michael@0 | 1147 | } |
michael@0 | 1148 | |
michael@0 | 1149 | // word reversing wrapper for complex prefixes |
michael@0 | 1150 | if (complexprefixes) { |
michael@0 | 1151 | for (int j = 0; j < ns; j++) { |
michael@0 | 1152 | if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); |
michael@0 | 1153 | } |
michael@0 | 1154 | } |
michael@0 | 1155 | |
michael@0 | 1156 | // expand suggestions with dot(s) |
michael@0 | 1157 | if (abbv && pAMgr && pAMgr->get_sugswithdots()) { |
michael@0 | 1158 | for (int j = 0; j < ns; j++) { |
michael@0 | 1159 | (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); |
michael@0 | 1160 | strcat((*slst)[j], word + strlen(word) - abbv); |
michael@0 | 1161 | } |
michael@0 | 1162 | } |
michael@0 | 1163 | |
michael@0 | 1164 | // LANG_hu section: replace '-' with ' ' in Hungarian |
michael@0 | 1165 | if (langnum == LANG_hu) { |
michael@0 | 1166 | for (int j=0; j < ns; j++) { |
michael@0 | 1167 | char * pos = strchr((*slst)[j],'-'); |
michael@0 | 1168 | if (pos) { |
michael@0 | 1169 | int info; |
michael@0 | 1170 | char w[MAXWORDUTF8LEN]; |
michael@0 | 1171 | *pos = '\0'; |
michael@0 | 1172 | strcpy(w, (*slst)[j]); |
michael@0 | 1173 | strcat(w, pos + 1); |
michael@0 | 1174 | spell(w, &info, NULL); |
michael@0 | 1175 | if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { |
michael@0 | 1176 | *pos = ' '; |
michael@0 | 1177 | } else *pos = '-'; |
michael@0 | 1178 | } |
michael@0 | 1179 | } |
michael@0 | 1180 | } |
michael@0 | 1181 | // END OF LANG_hu section |
michael@0 | 1182 | return ns; |
michael@0 | 1183 | } |
michael@0 | 1184 | #endif |
michael@0 | 1185 | |
michael@0 | 1186 | int Hunspell::stem(char*** slst, char ** desc, int n) |
michael@0 | 1187 | { |
michael@0 | 1188 | char result[MAXLNLEN]; |
michael@0 | 1189 | char result2[MAXLNLEN]; |
michael@0 | 1190 | *slst = NULL; |
michael@0 | 1191 | if (n == 0) return 0; |
michael@0 | 1192 | *result2 = '\0'; |
michael@0 | 1193 | for (int i = 0; i < n; i++) { |
michael@0 | 1194 | *result = '\0'; |
michael@0 | 1195 | // add compound word parts (except the last one) |
michael@0 | 1196 | char * s = (char *) desc[i]; |
michael@0 | 1197 | char * part = strstr(s, MORPH_PART); |
michael@0 | 1198 | if (part) { |
michael@0 | 1199 | char * nextpart = strstr(part + 1, MORPH_PART); |
michael@0 | 1200 | while (nextpart) { |
michael@0 | 1201 | copy_field(result + strlen(result), part, MORPH_PART); |
michael@0 | 1202 | part = nextpart; |
michael@0 | 1203 | nextpart = strstr(part + 1, MORPH_PART); |
michael@0 | 1204 | } |
michael@0 | 1205 | s = part; |
michael@0 | 1206 | } |
michael@0 | 1207 | |
michael@0 | 1208 | char **pl; |
michael@0 | 1209 | char tok[MAXLNLEN]; |
michael@0 | 1210 | strcpy(tok, s); |
michael@0 | 1211 | char * alt = strstr(tok, " | "); |
michael@0 | 1212 | while (alt) { |
michael@0 | 1213 | alt[1] = MSEP_ALT; |
michael@0 | 1214 | alt = strstr(alt, " | "); |
michael@0 | 1215 | } |
michael@0 | 1216 | int pln = line_tok(tok, &pl, MSEP_ALT); |
michael@0 | 1217 | for (int k = 0; k < pln; k++) { |
michael@0 | 1218 | // add derivational suffixes |
michael@0 | 1219 | if (strstr(pl[k], MORPH_DERI_SFX)) { |
michael@0 | 1220 | // remove inflectional suffixes |
michael@0 | 1221 | char * is = strstr(pl[k], MORPH_INFL_SFX); |
michael@0 | 1222 | if (is) *is = '\0'; |
michael@0 | 1223 | char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); |
michael@0 | 1224 | if (sg) { |
michael@0 | 1225 | char ** gen; |
michael@0 | 1226 | int genl = line_tok(sg, &gen, MSEP_REC); |
michael@0 | 1227 | free(sg); |
michael@0 | 1228 | for (int j = 0; j < genl; j++) { |
michael@0 | 1229 | sprintf(result2 + strlen(result2), "%c%s%s", |
michael@0 | 1230 | MSEP_REC, result, gen[j]); |
michael@0 | 1231 | } |
michael@0 | 1232 | freelist(&gen, genl); |
michael@0 | 1233 | } |
michael@0 | 1234 | } else { |
michael@0 | 1235 | sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); |
michael@0 | 1236 | if (strstr(pl[k], MORPH_SURF_PFX)) { |
michael@0 | 1237 | copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); |
michael@0 | 1238 | } |
michael@0 | 1239 | copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); |
michael@0 | 1240 | } |
michael@0 | 1241 | } |
michael@0 | 1242 | freelist(&pl, pln); |
michael@0 | 1243 | } |
michael@0 | 1244 | int sln = line_tok(result2, slst, MSEP_REC); |
michael@0 | 1245 | return uniqlist(*slst, sln); |
michael@0 | 1246 | |
michael@0 | 1247 | } |
michael@0 | 1248 | |
michael@0 | 1249 | int Hunspell::stem(char*** slst, const char * word) |
michael@0 | 1250 | { |
michael@0 | 1251 | char ** pl; |
michael@0 | 1252 | int pln = analyze(&pl, word); |
michael@0 | 1253 | int pln2 = stem(slst, pl, pln); |
michael@0 | 1254 | freelist(&pl, pln); |
michael@0 | 1255 | return pln2; |
michael@0 | 1256 | } |
michael@0 | 1257 | |
michael@0 | 1258 | #ifdef HUNSPELL_EXPERIMENTAL |
michael@0 | 1259 | int Hunspell::suggest_pos_stems(char*** slst, const char * word) |
michael@0 | 1260 | { |
michael@0 | 1261 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1262 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 1263 | if (! pSMgr || maxdic == 0) return 0; |
michael@0 | 1264 | int wl = strlen(word); |
michael@0 | 1265 | if (utf8) { |
michael@0 | 1266 | if (wl >= MAXWORDUTF8LEN) return 0; |
michael@0 | 1267 | } else { |
michael@0 | 1268 | if (wl >= MAXWORDLEN) return 0; |
michael@0 | 1269 | } |
michael@0 | 1270 | int captype = 0; |
michael@0 | 1271 | int abbv = 0; |
michael@0 | 1272 | wl = cleanword(cw, word, &captype, &abbv); |
michael@0 | 1273 | if (wl == 0) return 0; |
michael@0 | 1274 | |
michael@0 | 1275 | int ns = 0; // ns=0 = normalized input |
michael@0 | 1276 | |
michael@0 | 1277 | *slst = NULL; // HU, nsug in pSMgr->suggest |
michael@0 | 1278 | |
michael@0 | 1279 | switch(captype) { |
michael@0 | 1280 | case HUHCAP: |
michael@0 | 1281 | case NOCAP: { |
michael@0 | 1282 | ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
michael@0 | 1283 | |
michael@0 | 1284 | if ((abbv) && (ns == 0)) { |
michael@0 | 1285 | memcpy(wspace,cw,wl); |
michael@0 | 1286 | *(wspace+wl) = '.'; |
michael@0 | 1287 | *(wspace+wl+1) = '\0'; |
michael@0 | 1288 | ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
michael@0 | 1289 | } |
michael@0 | 1290 | |
michael@0 | 1291 | break; |
michael@0 | 1292 | } |
michael@0 | 1293 | |
michael@0 | 1294 | case INITCAP: { |
michael@0 | 1295 | |
michael@0 | 1296 | ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
michael@0 | 1297 | |
michael@0 | 1298 | if (ns == 0 || ((*slst)[0][0] == '#')) { |
michael@0 | 1299 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1300 | mkallsmall(wspace); |
michael@0 | 1301 | ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
michael@0 | 1302 | } |
michael@0 | 1303 | |
michael@0 | 1304 | break; |
michael@0 | 1305 | |
michael@0 | 1306 | } |
michael@0 | 1307 | |
michael@0 | 1308 | case ALLCAP: { |
michael@0 | 1309 | ns = pSMgr->suggest_pos_stems(slst, cw, ns); |
michael@0 | 1310 | if (ns != 0) break; |
michael@0 | 1311 | |
michael@0 | 1312 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1313 | mkallsmall(wspace); |
michael@0 | 1314 | ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
michael@0 | 1315 | |
michael@0 | 1316 | if (ns == 0) { |
michael@0 | 1317 | mkinitcap(wspace); |
michael@0 | 1318 | ns = pSMgr->suggest_pos_stems(slst, wspace, ns); |
michael@0 | 1319 | } |
michael@0 | 1320 | break; |
michael@0 | 1321 | } |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | return ns; |
michael@0 | 1325 | } |
michael@0 | 1326 | #endif // END OF HUNSPELL_EXPERIMENTAL CODE |
michael@0 | 1327 | |
michael@0 | 1328 | const char * Hunspell::get_wordchars() |
michael@0 | 1329 | { |
michael@0 | 1330 | return pAMgr->get_wordchars(); |
michael@0 | 1331 | } |
michael@0 | 1332 | |
michael@0 | 1333 | unsigned short * Hunspell::get_wordchars_utf16(int * len) |
michael@0 | 1334 | { |
michael@0 | 1335 | return pAMgr->get_wordchars_utf16(len); |
michael@0 | 1336 | } |
michael@0 | 1337 | |
michael@0 | 1338 | void Hunspell::mkinitcap(char * p) |
michael@0 | 1339 | { |
michael@0 | 1340 | if (!utf8) { |
michael@0 | 1341 | if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; |
michael@0 | 1342 | } else { |
michael@0 | 1343 | int len; |
michael@0 | 1344 | w_char u[MAXWORDLEN]; |
michael@0 | 1345 | len = u8_u16(u, MAXWORDLEN, p); |
michael@0 | 1346 | unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); |
michael@0 | 1347 | u[0].h = (unsigned char) (i >> 8); |
michael@0 | 1348 | u[0].l = (unsigned char) (i & 0x00FF); |
michael@0 | 1349 | u16_u8(p, MAXWORDUTF8LEN, u, len); |
michael@0 | 1350 | } |
michael@0 | 1351 | } |
michael@0 | 1352 | |
michael@0 | 1353 | int Hunspell::mkinitcap2(char * p, w_char * u, int nc) |
michael@0 | 1354 | { |
michael@0 | 1355 | if (!utf8) { |
michael@0 | 1356 | if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; |
michael@0 | 1357 | } else if (nc > 0) { |
michael@0 | 1358 | unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); |
michael@0 | 1359 | u[0].h = (unsigned char) (i >> 8); |
michael@0 | 1360 | u[0].l = (unsigned char) (i & 0x00FF); |
michael@0 | 1361 | u16_u8(p, MAXWORDUTF8LEN, u, nc); |
michael@0 | 1362 | return strlen(p); |
michael@0 | 1363 | } |
michael@0 | 1364 | return nc; |
michael@0 | 1365 | } |
michael@0 | 1366 | |
michael@0 | 1367 | int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) |
michael@0 | 1368 | { |
michael@0 | 1369 | if (!utf8) { |
michael@0 | 1370 | if (*p != '\0') *p = csconv[((unsigned char)*p)].clower; |
michael@0 | 1371 | } else if (nc > 0) { |
michael@0 | 1372 | unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum); |
michael@0 | 1373 | u[0].h = (unsigned char) (i >> 8); |
michael@0 | 1374 | u[0].l = (unsigned char) (i & 0x00FF); |
michael@0 | 1375 | u16_u8(p, MAXWORDUTF8LEN, u, nc); |
michael@0 | 1376 | return strlen(p); |
michael@0 | 1377 | } |
michael@0 | 1378 | return nc; |
michael@0 | 1379 | } |
michael@0 | 1380 | |
michael@0 | 1381 | int Hunspell::add(const char * word) |
michael@0 | 1382 | { |
michael@0 | 1383 | if (pHMgr[0]) return (pHMgr[0])->add(word); |
michael@0 | 1384 | return 0; |
michael@0 | 1385 | } |
michael@0 | 1386 | |
michael@0 | 1387 | int Hunspell::add_with_affix(const char * word, const char * example) |
michael@0 | 1388 | { |
michael@0 | 1389 | if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); |
michael@0 | 1390 | return 0; |
michael@0 | 1391 | } |
michael@0 | 1392 | |
michael@0 | 1393 | int Hunspell::remove(const char * word) |
michael@0 | 1394 | { |
michael@0 | 1395 | if (pHMgr[0]) return (pHMgr[0])->remove(word); |
michael@0 | 1396 | return 0; |
michael@0 | 1397 | } |
michael@0 | 1398 | |
michael@0 | 1399 | const char * Hunspell::get_version() |
michael@0 | 1400 | { |
michael@0 | 1401 | return pAMgr->get_version(); |
michael@0 | 1402 | } |
michael@0 | 1403 | |
michael@0 | 1404 | struct cs_info * Hunspell::get_csconv() |
michael@0 | 1405 | { |
michael@0 | 1406 | return csconv; |
michael@0 | 1407 | } |
michael@0 | 1408 | |
michael@0 | 1409 | void Hunspell::cat_result(char * result, char * st) |
michael@0 | 1410 | { |
michael@0 | 1411 | if (st) { |
michael@0 | 1412 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1413 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1414 | free(st); |
michael@0 | 1415 | } |
michael@0 | 1416 | } |
michael@0 | 1417 | |
michael@0 | 1418 | int Hunspell::analyze(char*** slst, const char * word) |
michael@0 | 1419 | { |
michael@0 | 1420 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1421 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 1422 | w_char unicw[MAXWORDLEN]; |
michael@0 | 1423 | int wl2 = 0; |
michael@0 | 1424 | *slst = NULL; |
michael@0 | 1425 | if (! pSMgr || maxdic == 0) return 0; |
michael@0 | 1426 | int nc = strlen(word); |
michael@0 | 1427 | if (utf8) { |
michael@0 | 1428 | if (nc >= MAXWORDUTF8LEN) return 0; |
michael@0 | 1429 | } else { |
michael@0 | 1430 | if (nc >= MAXWORDLEN) return 0; |
michael@0 | 1431 | } |
michael@0 | 1432 | int captype = 0; |
michael@0 | 1433 | int abbv = 0; |
michael@0 | 1434 | int wl = 0; |
michael@0 | 1435 | |
michael@0 | 1436 | // input conversion |
michael@0 | 1437 | RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; |
michael@0 | 1438 | if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); |
michael@0 | 1439 | else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); |
michael@0 | 1440 | |
michael@0 | 1441 | if (wl == 0) { |
michael@0 | 1442 | if (abbv) { |
michael@0 | 1443 | for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; |
michael@0 | 1444 | cw[wl] = '\0'; |
michael@0 | 1445 | abbv = 0; |
michael@0 | 1446 | } else return 0; |
michael@0 | 1447 | } |
michael@0 | 1448 | |
michael@0 | 1449 | char result[MAXLNLEN]; |
michael@0 | 1450 | char * st = NULL; |
michael@0 | 1451 | |
michael@0 | 1452 | *result = '\0'; |
michael@0 | 1453 | |
michael@0 | 1454 | int n = 0; |
michael@0 | 1455 | int n2 = 0; |
michael@0 | 1456 | int n3 = 0; |
michael@0 | 1457 | |
michael@0 | 1458 | // test numbers |
michael@0 | 1459 | // LANG_hu section: set dash information for suggestions |
michael@0 | 1460 | if (langnum == LANG_hu) { |
michael@0 | 1461 | while ((n < wl) && |
michael@0 | 1462 | (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { |
michael@0 | 1463 | n++; |
michael@0 | 1464 | if ((cw[n] == '.') || (cw[n] == ',')) { |
michael@0 | 1465 | if (((n2 == 0) && (n > 3)) || |
michael@0 | 1466 | ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; |
michael@0 | 1467 | n2++; |
michael@0 | 1468 | n3 = n; |
michael@0 | 1469 | } |
michael@0 | 1470 | } |
michael@0 | 1471 | |
michael@0 | 1472 | if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; |
michael@0 | 1473 | if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { |
michael@0 | 1474 | mystrcat(result, cw, MAXLNLEN); |
michael@0 | 1475 | result[n - 1] = '\0'; |
michael@0 | 1476 | if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); |
michael@0 | 1477 | else { |
michael@0 | 1478 | char sign = cw[n]; |
michael@0 | 1479 | cw[n] = '\0'; |
michael@0 | 1480 | cat_result(result, pSMgr->suggest_morph(cw + n - 1)); |
michael@0 | 1481 | mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE |
michael@0 | 1482 | cw[n] = sign; |
michael@0 | 1483 | cat_result(result, pSMgr->suggest_morph(cw + n)); |
michael@0 | 1484 | } |
michael@0 | 1485 | return line_tok(result, slst, MSEP_REC); |
michael@0 | 1486 | } |
michael@0 | 1487 | } |
michael@0 | 1488 | // END OF LANG_hu section |
michael@0 | 1489 | |
michael@0 | 1490 | switch(captype) { |
michael@0 | 1491 | case HUHCAP: |
michael@0 | 1492 | case HUHINITCAP: |
michael@0 | 1493 | case NOCAP: { |
michael@0 | 1494 | cat_result(result, pSMgr->suggest_morph(cw)); |
michael@0 | 1495 | if (abbv) { |
michael@0 | 1496 | memcpy(wspace,cw,wl); |
michael@0 | 1497 | *(wspace+wl) = '.'; |
michael@0 | 1498 | *(wspace+wl+1) = '\0'; |
michael@0 | 1499 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1500 | } |
michael@0 | 1501 | break; |
michael@0 | 1502 | } |
michael@0 | 1503 | case INITCAP: { |
michael@0 | 1504 | wl = mkallsmall2(cw, unicw, nc); |
michael@0 | 1505 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1506 | wl2 = mkinitcap2(cw, unicw, nc); |
michael@0 | 1507 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1508 | cat_result(result, pSMgr->suggest_morph(cw)); |
michael@0 | 1509 | if (abbv) { |
michael@0 | 1510 | *(wspace+wl) = '.'; |
michael@0 | 1511 | *(wspace+wl+1) = '\0'; |
michael@0 | 1512 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1513 | |
michael@0 | 1514 | memcpy(wspace, cw, wl2); |
michael@0 | 1515 | *(wspace+wl2) = '.'; |
michael@0 | 1516 | *(wspace+wl2+1) = '\0'; |
michael@0 | 1517 | |
michael@0 | 1518 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1519 | } |
michael@0 | 1520 | break; |
michael@0 | 1521 | } |
michael@0 | 1522 | case ALLCAP: { |
michael@0 | 1523 | cat_result(result, pSMgr->suggest_morph(cw)); |
michael@0 | 1524 | if (abbv) { |
michael@0 | 1525 | memcpy(wspace,cw,wl); |
michael@0 | 1526 | *(wspace+wl) = '.'; |
michael@0 | 1527 | *(wspace+wl+1) = '\0'; |
michael@0 | 1528 | cat_result(result, pSMgr->suggest_morph(cw)); |
michael@0 | 1529 | } |
michael@0 | 1530 | wl = mkallsmall2(cw, unicw, nc); |
michael@0 | 1531 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1532 | wl2 = mkinitcap2(cw, unicw, nc); |
michael@0 | 1533 | |
michael@0 | 1534 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1535 | cat_result(result, pSMgr->suggest_morph(cw)); |
michael@0 | 1536 | if (abbv) { |
michael@0 | 1537 | *(wspace+wl) = '.'; |
michael@0 | 1538 | *(wspace+wl+1) = '\0'; |
michael@0 | 1539 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1540 | |
michael@0 | 1541 | memcpy(wspace, cw, wl2); |
michael@0 | 1542 | *(wspace+wl2) = '.'; |
michael@0 | 1543 | *(wspace+wl2+1) = '\0'; |
michael@0 | 1544 | |
michael@0 | 1545 | cat_result(result, pSMgr->suggest_morph(wspace)); |
michael@0 | 1546 | } |
michael@0 | 1547 | break; |
michael@0 | 1548 | } |
michael@0 | 1549 | } |
michael@0 | 1550 | |
michael@0 | 1551 | if (*result) { |
michael@0 | 1552 | // word reversing wrapper for complex prefixes |
michael@0 | 1553 | if (complexprefixes) { |
michael@0 | 1554 | if (utf8) reverseword_utf(result); else reverseword(result); |
michael@0 | 1555 | } |
michael@0 | 1556 | return line_tok(result, slst, MSEP_REC); |
michael@0 | 1557 | } |
michael@0 | 1558 | |
michael@0 | 1559 | // compound word with dash (HU) I18n |
michael@0 | 1560 | char * dash = NULL; |
michael@0 | 1561 | int nresult = 0; |
michael@0 | 1562 | // LANG_hu section: set dash information for suggestions |
michael@0 | 1563 | if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); |
michael@0 | 1564 | if ((langnum == LANG_hu) && dash) { |
michael@0 | 1565 | *dash='\0'; |
michael@0 | 1566 | // examine 2 sides of the dash |
michael@0 | 1567 | if (dash[1] == '\0') { // base word ending with dash |
michael@0 | 1568 | if (spell(cw)) { |
michael@0 | 1569 | char * p = pSMgr->suggest_morph(cw); |
michael@0 | 1570 | if (p) { |
michael@0 | 1571 | int ret = line_tok(p, slst, MSEP_REC); |
michael@0 | 1572 | free(p); |
michael@0 | 1573 | return ret; |
michael@0 | 1574 | } |
michael@0 | 1575 | |
michael@0 | 1576 | } |
michael@0 | 1577 | } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. |
michael@0 | 1578 | if (spell(cw) && (spell("-e"))) { |
michael@0 | 1579 | st = pSMgr->suggest_morph(cw); |
michael@0 | 1580 | if (st) { |
michael@0 | 1581 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1582 | free(st); |
michael@0 | 1583 | } |
michael@0 | 1584 | mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE |
michael@0 | 1585 | st = pSMgr->suggest_morph("-e"); |
michael@0 | 1586 | if (st) { |
michael@0 | 1587 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1588 | free(st); |
michael@0 | 1589 | } |
michael@0 | 1590 | return line_tok(result, slst, MSEP_REC); |
michael@0 | 1591 | } |
michael@0 | 1592 | } else { |
michael@0 | 1593 | // first word ending with dash: word- XXX ??? |
michael@0 | 1594 | char r2 = *(dash + 1); |
michael@0 | 1595 | dash[0]='-'; |
michael@0 | 1596 | dash[1]='\0'; |
michael@0 | 1597 | nresult = spell(cw); |
michael@0 | 1598 | dash[1] = r2; |
michael@0 | 1599 | dash[0]='\0'; |
michael@0 | 1600 | if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || |
michael@0 | 1601 | ((dash[1] > '0') && (dash[1] < '9')))) { |
michael@0 | 1602 | st = pSMgr->suggest_morph(cw); |
michael@0 | 1603 | if (st) { |
michael@0 | 1604 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1605 | free(st); |
michael@0 | 1606 | mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE |
michael@0 | 1607 | } |
michael@0 | 1608 | st = pSMgr->suggest_morph(dash+1); |
michael@0 | 1609 | if (st) { |
michael@0 | 1610 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1611 | free(st); |
michael@0 | 1612 | } |
michael@0 | 1613 | return line_tok(result, slst, MSEP_REC); |
michael@0 | 1614 | } |
michael@0 | 1615 | } |
michael@0 | 1616 | // affixed number in correct word |
michael@0 | 1617 | if (nresult && (dash > cw) && (((*(dash-1)<='9') && |
michael@0 | 1618 | (*(dash-1)>='0')) || (*(dash-1)=='.'))) { |
michael@0 | 1619 | *dash='-'; |
michael@0 | 1620 | n = 1; |
michael@0 | 1621 | if (*(dash - n) == '.') n++; |
michael@0 | 1622 | // search first not a number character to left from dash |
michael@0 | 1623 | while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { |
michael@0 | 1624 | n++; |
michael@0 | 1625 | } |
michael@0 | 1626 | if ((dash - n) < cw) n--; |
michael@0 | 1627 | // numbers: valami1000000-hoz |
michael@0 | 1628 | // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, |
michael@0 | 1629 | // 56-hoz, 6-hoz |
michael@0 | 1630 | for(; n >= 1; n--) { |
michael@0 | 1631 | if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { |
michael@0 | 1632 | mystrcat(result, cw, MAXLNLEN); |
michael@0 | 1633 | result[dash - cw - n] = '\0'; |
michael@0 | 1634 | st = pSMgr->suggest_morph(dash - n); |
michael@0 | 1635 | if (st) { |
michael@0 | 1636 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1637 | free(st); |
michael@0 | 1638 | } |
michael@0 | 1639 | return line_tok(result, slst, MSEP_REC); |
michael@0 | 1640 | } |
michael@0 | 1641 | } |
michael@0 | 1642 | } |
michael@0 | 1643 | } |
michael@0 | 1644 | return 0; |
michael@0 | 1645 | } |
michael@0 | 1646 | |
michael@0 | 1647 | int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) |
michael@0 | 1648 | { |
michael@0 | 1649 | *slst = NULL; |
michael@0 | 1650 | if (!pSMgr || !pln) return 0; |
michael@0 | 1651 | char **pl2; |
michael@0 | 1652 | int pl2n = analyze(&pl2, word); |
michael@0 | 1653 | int captype = 0; |
michael@0 | 1654 | int abbv = 0; |
michael@0 | 1655 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1656 | cleanword(cw, word, &captype, &abbv); |
michael@0 | 1657 | char result[MAXLNLEN]; |
michael@0 | 1658 | *result = '\0'; |
michael@0 | 1659 | |
michael@0 | 1660 | for (int i = 0; i < pln; i++) { |
michael@0 | 1661 | cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); |
michael@0 | 1662 | } |
michael@0 | 1663 | freelist(&pl2, pl2n); |
michael@0 | 1664 | |
michael@0 | 1665 | if (*result) { |
michael@0 | 1666 | // allcap |
michael@0 | 1667 | if (captype == ALLCAP) mkallcap(result); |
michael@0 | 1668 | |
michael@0 | 1669 | // line split |
michael@0 | 1670 | int linenum = line_tok(result, slst, MSEP_REC); |
michael@0 | 1671 | |
michael@0 | 1672 | // capitalize |
michael@0 | 1673 | if (captype == INITCAP || captype == HUHINITCAP) { |
michael@0 | 1674 | for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); |
michael@0 | 1675 | } |
michael@0 | 1676 | |
michael@0 | 1677 | // temporary filtering of prefix related errors (eg. |
michael@0 | 1678 | // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") |
michael@0 | 1679 | |
michael@0 | 1680 | int r = 0; |
michael@0 | 1681 | for (int j=0; j < linenum; j++) { |
michael@0 | 1682 | if (!spell((*slst)[j])) { |
michael@0 | 1683 | free((*slst)[j]); |
michael@0 | 1684 | (*slst)[j] = NULL; |
michael@0 | 1685 | } else { |
michael@0 | 1686 | if (r < j) (*slst)[r] = (*slst)[j]; |
michael@0 | 1687 | r++; |
michael@0 | 1688 | } |
michael@0 | 1689 | } |
michael@0 | 1690 | if (r > 0) return r; |
michael@0 | 1691 | free(*slst); |
michael@0 | 1692 | *slst = NULL; |
michael@0 | 1693 | } |
michael@0 | 1694 | return 0; |
michael@0 | 1695 | } |
michael@0 | 1696 | |
michael@0 | 1697 | int Hunspell::generate(char*** slst, const char * word, const char * pattern) |
michael@0 | 1698 | { |
michael@0 | 1699 | char **pl; |
michael@0 | 1700 | int pln = analyze(&pl, pattern); |
michael@0 | 1701 | int n = generate(slst, word, pl, pln); |
michael@0 | 1702 | freelist(&pl, pln); |
michael@0 | 1703 | return uniqlist(*slst, n); |
michael@0 | 1704 | } |
michael@0 | 1705 | |
michael@0 | 1706 | // minimal XML parser functions |
michael@0 | 1707 | int Hunspell::get_xml_par(char * dest, const char * par, int max) |
michael@0 | 1708 | { |
michael@0 | 1709 | char * d = dest; |
michael@0 | 1710 | if (!par) return 0; |
michael@0 | 1711 | char end = *par; |
michael@0 | 1712 | char * dmax = dest + max; |
michael@0 | 1713 | if (end == '>') end = '<'; |
michael@0 | 1714 | else if (end != '\'' && end != '"') return 0; // bad XML |
michael@0 | 1715 | for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; |
michael@0 | 1716 | *d = '\0'; |
michael@0 | 1717 | mystrrep(dest, "<", "<"); |
michael@0 | 1718 | mystrrep(dest, "&", "&"); |
michael@0 | 1719 | return (int)(d - dest); |
michael@0 | 1720 | } |
michael@0 | 1721 | |
michael@0 | 1722 | int Hunspell::get_langnum() const |
michael@0 | 1723 | { |
michael@0 | 1724 | return langnum; |
michael@0 | 1725 | } |
michael@0 | 1726 | |
michael@0 | 1727 | // return the beginning of the element (attr == NULL) or the attribute |
michael@0 | 1728 | const char * Hunspell::get_xml_pos(const char * s, const char * attr) |
michael@0 | 1729 | { |
michael@0 | 1730 | const char * end = strchr(s, '>'); |
michael@0 | 1731 | const char * p = s; |
michael@0 | 1732 | if (attr == NULL) return end; |
michael@0 | 1733 | do { |
michael@0 | 1734 | p = strstr(p, attr); |
michael@0 | 1735 | if (!p || p >= end) return 0; |
michael@0 | 1736 | } while (*(p-1) != ' ' && *(p-1) != '\n'); |
michael@0 | 1737 | return p + strlen(attr); |
michael@0 | 1738 | } |
michael@0 | 1739 | |
michael@0 | 1740 | int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { |
michael@0 | 1741 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1742 | if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && |
michael@0 | 1743 | strcmp(cw, value) == 0) return 1; |
michael@0 | 1744 | return 0; |
michael@0 | 1745 | } |
michael@0 | 1746 | |
michael@0 | 1747 | int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { |
michael@0 | 1748 | int n = 0; |
michael@0 | 1749 | char * p; |
michael@0 | 1750 | if (!list) return 0; |
michael@0 | 1751 | for (p = list; (p = strstr(p, tag)); p++) n++; |
michael@0 | 1752 | if (n == 0) return 0; |
michael@0 | 1753 | *slst = (char **) malloc(sizeof(char *) * n); |
michael@0 | 1754 | if (!*slst) return 0; |
michael@0 | 1755 | for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { |
michael@0 | 1756 | int l = strlen(p); |
michael@0 | 1757 | (*slst)[n] = (char *) malloc(l + 1); |
michael@0 | 1758 | if (!(*slst)[n]) return n; |
michael@0 | 1759 | if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) { |
michael@0 | 1760 | free((*slst)[n]); |
michael@0 | 1761 | break; |
michael@0 | 1762 | } |
michael@0 | 1763 | } |
michael@0 | 1764 | return n; |
michael@0 | 1765 | } |
michael@0 | 1766 | |
michael@0 | 1767 | int Hunspell::spellml(char*** slst, const char * word) |
michael@0 | 1768 | { |
michael@0 | 1769 | char *q, *q2; |
michael@0 | 1770 | char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; |
michael@0 | 1771 | q = (char *) strstr(word, "<query"); |
michael@0 | 1772 | if (!q) return 0; // bad XML input |
michael@0 | 1773 | q2 = strchr(q, '>'); |
michael@0 | 1774 | if (!q2) return 0; // bad XML input |
michael@0 | 1775 | q2 = strstr(q2, "<word"); |
michael@0 | 1776 | if (!q2) return 0; // bad XML input |
michael@0 | 1777 | if (check_xml_par(q, "type=", "analyze")) { |
michael@0 | 1778 | int n = 0, s = 0; |
michael@0 | 1779 | if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw); |
michael@0 | 1780 | if (n == 0) return 0; |
michael@0 | 1781 | // convert the result to <code><a>ana1</a><a>ana2</a></code> format |
michael@0 | 1782 | for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); |
michael@0 | 1783 | char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->& |
michael@0 | 1784 | if (!r) return 0; |
michael@0 | 1785 | strcpy(r, "<code>"); |
michael@0 | 1786 | for (int i = 0; i < n; i++) { |
michael@0 | 1787 | int l = strlen(r); |
michael@0 | 1788 | strcpy(r + l, "<a>"); |
michael@0 | 1789 | strcpy(r + l + 3, (*slst)[i]); |
michael@0 | 1790 | mystrrep(r + l + 3, "\t", " "); |
michael@0 | 1791 | mystrrep(r + l + 3, "<", "<"); |
michael@0 | 1792 | mystrrep(r + l + 3, "&", "&"); |
michael@0 | 1793 | strcat(r, "</a>"); |
michael@0 | 1794 | free((*slst)[i]); |
michael@0 | 1795 | } |
michael@0 | 1796 | strcat(r, "</code>"); |
michael@0 | 1797 | (*slst)[0] = r; |
michael@0 | 1798 | return 1; |
michael@0 | 1799 | } else if (check_xml_par(q, "type=", "stem")) { |
michael@0 | 1800 | if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw); |
michael@0 | 1801 | } else if (check_xml_par(q, "type=", "generate")) { |
michael@0 | 1802 | int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1); |
michael@0 | 1803 | if (n == 0) return 0; |
michael@0 | 1804 | char * q3 = strstr(q2 + 1, "<word"); |
michael@0 | 1805 | if (q3) { |
michael@0 | 1806 | if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) { |
michael@0 | 1807 | return generate(slst, cw, cw2); |
michael@0 | 1808 | } |
michael@0 | 1809 | } else { |
michael@0 | 1810 | if ((q2 = strstr(q2 + 1, "<code"))) { |
michael@0 | 1811 | char ** slst2; |
michael@0 | 1812 | if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { |
michael@0 | 1813 | int n2 = generate(slst, cw, slst2, n); |
michael@0 | 1814 | freelist(&slst2, n); |
michael@0 | 1815 | return uniqlist(*slst, n2); |
michael@0 | 1816 | } |
michael@0 | 1817 | freelist(&slst2, n); |
michael@0 | 1818 | } |
michael@0 | 1819 | } |
michael@0 | 1820 | } |
michael@0 | 1821 | return 0; |
michael@0 | 1822 | } |
michael@0 | 1823 | |
michael@0 | 1824 | |
michael@0 | 1825 | #ifdef HUNSPELL_EXPERIMENTAL |
michael@0 | 1826 | // XXX need UTF-8 support |
michael@0 | 1827 | char * Hunspell::morph_with_correction(const char * word) |
michael@0 | 1828 | { |
michael@0 | 1829 | char cw[MAXWORDUTF8LEN]; |
michael@0 | 1830 | char wspace[MAXWORDUTF8LEN]; |
michael@0 | 1831 | if (! pSMgr || maxdic == 0) return NULL; |
michael@0 | 1832 | int wl = strlen(word); |
michael@0 | 1833 | if (utf8) { |
michael@0 | 1834 | if (wl >= MAXWORDUTF8LEN) return NULL; |
michael@0 | 1835 | } else { |
michael@0 | 1836 | if (wl >= MAXWORDLEN) return NULL; |
michael@0 | 1837 | } |
michael@0 | 1838 | int captype = 0; |
michael@0 | 1839 | int abbv = 0; |
michael@0 | 1840 | wl = cleanword(cw, word, &captype, &abbv); |
michael@0 | 1841 | if (wl == 0) return NULL; |
michael@0 | 1842 | |
michael@0 | 1843 | char result[MAXLNLEN]; |
michael@0 | 1844 | char * st = NULL; |
michael@0 | 1845 | |
michael@0 | 1846 | *result = '\0'; |
michael@0 | 1847 | |
michael@0 | 1848 | |
michael@0 | 1849 | switch(captype) { |
michael@0 | 1850 | case NOCAP: { |
michael@0 | 1851 | st = pSMgr->suggest_morph_for_spelling_error(cw); |
michael@0 | 1852 | if (st) { |
michael@0 | 1853 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1854 | free(st); |
michael@0 | 1855 | } |
michael@0 | 1856 | if (abbv) { |
michael@0 | 1857 | memcpy(wspace,cw,wl); |
michael@0 | 1858 | *(wspace+wl) = '.'; |
michael@0 | 1859 | *(wspace+wl+1) = '\0'; |
michael@0 | 1860 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1861 | if (st) { |
michael@0 | 1862 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1863 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1864 | free(st); |
michael@0 | 1865 | } |
michael@0 | 1866 | } |
michael@0 | 1867 | break; |
michael@0 | 1868 | } |
michael@0 | 1869 | case INITCAP: { |
michael@0 | 1870 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1871 | mkallsmall(wspace); |
michael@0 | 1872 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1873 | if (st) { |
michael@0 | 1874 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1875 | free(st); |
michael@0 | 1876 | } |
michael@0 | 1877 | st = pSMgr->suggest_morph_for_spelling_error(cw); |
michael@0 | 1878 | if (st) { |
michael@0 | 1879 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1880 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1881 | free(st); |
michael@0 | 1882 | } |
michael@0 | 1883 | if (abbv) { |
michael@0 | 1884 | memcpy(wspace,cw,wl); |
michael@0 | 1885 | *(wspace+wl) = '.'; |
michael@0 | 1886 | *(wspace+wl+1) = '\0'; |
michael@0 | 1887 | mkallsmall(wspace); |
michael@0 | 1888 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1889 | if (st) { |
michael@0 | 1890 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1891 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1892 | free(st); |
michael@0 | 1893 | } |
michael@0 | 1894 | mkinitcap(wspace); |
michael@0 | 1895 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1896 | if (st) { |
michael@0 | 1897 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1898 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1899 | free(st); |
michael@0 | 1900 | } |
michael@0 | 1901 | } |
michael@0 | 1902 | break; |
michael@0 | 1903 | } |
michael@0 | 1904 | case HUHCAP: { |
michael@0 | 1905 | st = pSMgr->suggest_morph_for_spelling_error(cw); |
michael@0 | 1906 | if (st) { |
michael@0 | 1907 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1908 | free(st); |
michael@0 | 1909 | } |
michael@0 | 1910 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1911 | mkallsmall(wspace); |
michael@0 | 1912 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1913 | if (st) { |
michael@0 | 1914 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1915 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1916 | free(st); |
michael@0 | 1917 | } |
michael@0 | 1918 | break; |
michael@0 | 1919 | } |
michael@0 | 1920 | case ALLCAP: { |
michael@0 | 1921 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1922 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1923 | if (st) { |
michael@0 | 1924 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1925 | free(st); |
michael@0 | 1926 | } |
michael@0 | 1927 | mkallsmall(wspace); |
michael@0 | 1928 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1929 | if (st) { |
michael@0 | 1930 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1931 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1932 | free(st); |
michael@0 | 1933 | } |
michael@0 | 1934 | mkinitcap(wspace); |
michael@0 | 1935 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1936 | if (st) { |
michael@0 | 1937 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1938 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1939 | free(st); |
michael@0 | 1940 | } |
michael@0 | 1941 | if (abbv) { |
michael@0 | 1942 | memcpy(wspace,cw,(wl+1)); |
michael@0 | 1943 | *(wspace+wl) = '.'; |
michael@0 | 1944 | *(wspace+wl+1) = '\0'; |
michael@0 | 1945 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1946 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1947 | if (st) { |
michael@0 | 1948 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1949 | free(st); |
michael@0 | 1950 | } |
michael@0 | 1951 | mkallsmall(wspace); |
michael@0 | 1952 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1953 | if (st) { |
michael@0 | 1954 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1955 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1956 | free(st); |
michael@0 | 1957 | } |
michael@0 | 1958 | mkinitcap(wspace); |
michael@0 | 1959 | st = pSMgr->suggest_morph_for_spelling_error(wspace); |
michael@0 | 1960 | if (st) { |
michael@0 | 1961 | if (*result) mystrcat(result, "\n", MAXLNLEN); |
michael@0 | 1962 | mystrcat(result, st, MAXLNLEN); |
michael@0 | 1963 | free(st); |
michael@0 | 1964 | } |
michael@0 | 1965 | } |
michael@0 | 1966 | break; |
michael@0 | 1967 | } |
michael@0 | 1968 | } |
michael@0 | 1969 | |
michael@0 | 1970 | if (*result) return mystrdup(result); |
michael@0 | 1971 | return NULL; |
michael@0 | 1972 | } |
michael@0 | 1973 | |
michael@0 | 1974 | #endif // END OF HUNSPELL_EXPERIMENTAL CODE |
michael@0 | 1975 | |
michael@0 | 1976 | Hunhandle *Hunspell_create(const char * affpath, const char * dpath) |
michael@0 | 1977 | { |
michael@0 | 1978 | return (Hunhandle*)(new Hunspell(affpath, dpath)); |
michael@0 | 1979 | } |
michael@0 | 1980 | |
michael@0 | 1981 | Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, |
michael@0 | 1982 | const char * key) |
michael@0 | 1983 | { |
michael@0 | 1984 | return (Hunhandle*)(new Hunspell(affpath, dpath, key)); |
michael@0 | 1985 | } |
michael@0 | 1986 | |
michael@0 | 1987 | void Hunspell_destroy(Hunhandle *pHunspell) |
michael@0 | 1988 | { |
michael@0 | 1989 | delete (Hunspell*)(pHunspell); |
michael@0 | 1990 | } |
michael@0 | 1991 | |
michael@0 | 1992 | int Hunspell_spell(Hunhandle *pHunspell, const char *word) |
michael@0 | 1993 | { |
michael@0 | 1994 | return ((Hunspell*)pHunspell)->spell(word); |
michael@0 | 1995 | } |
michael@0 | 1996 | |
michael@0 | 1997 | char *Hunspell_get_dic_encoding(Hunhandle *pHunspell) |
michael@0 | 1998 | { |
michael@0 | 1999 | return ((Hunspell*)pHunspell)->get_dic_encoding(); |
michael@0 | 2000 | } |
michael@0 | 2001 | |
michael@0 | 2002 | int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) |
michael@0 | 2003 | { |
michael@0 | 2004 | return ((Hunspell*)pHunspell)->suggest(slst, word); |
michael@0 | 2005 | } |
michael@0 | 2006 | |
michael@0 | 2007 | int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) |
michael@0 | 2008 | { |
michael@0 | 2009 | return ((Hunspell*)pHunspell)->analyze(slst, word); |
michael@0 | 2010 | } |
michael@0 | 2011 | |
michael@0 | 2012 | int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) |
michael@0 | 2013 | { |
michael@0 | 2014 | return ((Hunspell*)pHunspell)->stem(slst, word); |
michael@0 | 2015 | } |
michael@0 | 2016 | |
michael@0 | 2017 | int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n) |
michael@0 | 2018 | { |
michael@0 | 2019 | return ((Hunspell*)pHunspell)->stem(slst, desc, n); |
michael@0 | 2020 | } |
michael@0 | 2021 | |
michael@0 | 2022 | int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, |
michael@0 | 2023 | const char * word2) |
michael@0 | 2024 | { |
michael@0 | 2025 | return ((Hunspell*)pHunspell)->generate(slst, word, word2); |
michael@0 | 2026 | } |
michael@0 | 2027 | |
michael@0 | 2028 | int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, |
michael@0 | 2029 | char** desc, int n) |
michael@0 | 2030 | { |
michael@0 | 2031 | return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); |
michael@0 | 2032 | } |
michael@0 | 2033 | |
michael@0 | 2034 | /* functions for run-time modification of the dictionary */ |
michael@0 | 2035 | |
michael@0 | 2036 | /* add word to the run-time dictionary */ |
michael@0 | 2037 | |
michael@0 | 2038 | int Hunspell_add(Hunhandle *pHunspell, const char * word) { |
michael@0 | 2039 | return ((Hunspell*)pHunspell)->add(word); |
michael@0 | 2040 | } |
michael@0 | 2041 | |
michael@0 | 2042 | /* add word to the run-time dictionary with affix flags of |
michael@0 | 2043 | * the example (a dictionary word): Hunspell will recognize |
michael@0 | 2044 | * affixed forms of the new word, too. |
michael@0 | 2045 | */ |
michael@0 | 2046 | |
michael@0 | 2047 | int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, |
michael@0 | 2048 | const char * example) { |
michael@0 | 2049 | return ((Hunspell*)pHunspell)->add_with_affix(word, example); |
michael@0 | 2050 | } |
michael@0 | 2051 | |
michael@0 | 2052 | /* remove word from the run-time dictionary */ |
michael@0 | 2053 | |
michael@0 | 2054 | int Hunspell_remove(Hunhandle *pHunspell, const char * word) { |
michael@0 | 2055 | return ((Hunspell*)pHunspell)->remove(word); |
michael@0 | 2056 | } |
michael@0 | 2057 | |
michael@0 | 2058 | void Hunspell_free_list(Hunhandle *, char *** slst, int n) { |
michael@0 | 2059 | freelist(slst, n); |
michael@0 | 2060 | } |