1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/hunspell/src/hunspell.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2060 @@ 1.4 +/******* BEGIN LICENSE BLOCK ******* 1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 1.6 + * 1.7 + * The contents of this file are subject to the Mozilla Public License Version 1.8 + * 1.1 (the "License"); you may not use this file except in compliance with 1.9 + * the License. You may obtain a copy of the License at 1.10 + * http://www.mozilla.org/MPL/ 1.11 + * 1.12 + * Software distributed under the License is distributed on an "AS IS" basis, 1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 1.14 + * for the specific language governing rights and limitations under the 1.15 + * License. 1.16 + * 1.17 + * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) 1.18 + * and László Németh (Hunspell). Portions created by the Initial Developers 1.19 + * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. 1.20 + * 1.21 + * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) 1.22 + * David Einstein (deinst@world.std.com) 1.23 + * László Németh (nemethl@gyorsposta.hu) 1.24 + * Caolan McNamara (caolanm@redhat.com) 1.25 + * Davide Prina 1.26 + * Giuseppe Modugno 1.27 + * Gianluca Turconi 1.28 + * Simon Brouwer 1.29 + * Noll Janos 1.30 + * Biro Arpad 1.31 + * Goldman Eleonora 1.32 + * Sarlos Tamas 1.33 + * Bencsath Boldizsar 1.34 + * Halacsy Peter 1.35 + * Dvornik Laszlo 1.36 + * Gefferth Andras 1.37 + * Nagy Viktor 1.38 + * Varga Daniel 1.39 + * Chris Halls 1.40 + * Rene Engelhard 1.41 + * Bram Moolenaar 1.42 + * Dafydd Jones 1.43 + * Harri Pitkanen 1.44 + * Andras Timar 1.45 + * Tor Lillqvist 1.46 + * 1.47 + * Alternatively, the contents of this file may be used under the terms of 1.48 + * either the GNU General Public License Version 2 or later (the "GPL"), or 1.49 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 1.50 + * in which case the provisions of the GPL or the LGPL are applicable instead 1.51 + * of those above. If you wish to allow use of your version of this file only 1.52 + * under the terms of either the GPL or the LGPL, and not to allow others to 1.53 + * use your version of this file under the terms of the MPL, indicate your 1.54 + * decision by deleting the provisions above and replace them with the notice 1.55 + * and other provisions required by the GPL or the LGPL. If you do not delete 1.56 + * the provisions above, a recipient may use your version of this file under 1.57 + * the terms of any one of the MPL, the GPL or the LGPL. 1.58 + * 1.59 + ******* END LICENSE BLOCK *******/ 1.60 + 1.61 +#include <stdlib.h> 1.62 +#include <string.h> 1.63 +#include <stdio.h> 1.64 + 1.65 +#include "hunspell.hxx" 1.66 +#include "hunspell.h" 1.67 +#ifndef MOZILLA_CLIENT 1.68 +# include "config.h" 1.69 +#endif 1.70 +#include "csutil.hxx" 1.71 + 1.72 +Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key) 1.73 +{ 1.74 + encoding = NULL; 1.75 + csconv = NULL; 1.76 + utf8 = 0; 1.77 + complexprefixes = 0; 1.78 + affixpath = mystrdup(affpath); 1.79 + maxdic = 0; 1.80 + 1.81 + /* first set up the hash manager */ 1.82 + pHMgr[0] = new HashMgr(dpath, affpath, key); 1.83 + if (pHMgr[0]) maxdic = 1; 1.84 + 1.85 + /* next set up the affix manager */ 1.86 + /* it needs access to the hash manager lookup methods */ 1.87 + pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); 1.88 + 1.89 + /* get the preferred try string and the dictionary */ 1.90 + /* encoding from the Affix Manager for that dictionary */ 1.91 + char * try_string = pAMgr->get_try_string(); 1.92 + encoding = pAMgr->get_encoding(); 1.93 + langnum = pAMgr->get_langnum(); 1.94 + utf8 = pAMgr->get_utf8(); 1.95 + if (!utf8) 1.96 + csconv = get_current_cs(encoding); 1.97 + complexprefixes = pAMgr->get_complexprefixes(); 1.98 + wordbreak = pAMgr->get_breaktable(); 1.99 + 1.100 + /* and finally set up the suggestion manager */ 1.101 + pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); 1.102 + if (try_string) free(try_string); 1.103 +} 1.104 + 1.105 +Hunspell::~Hunspell() 1.106 +{ 1.107 + if (pSMgr) delete pSMgr; 1.108 + if (pAMgr) delete pAMgr; 1.109 + for (int i = 0; i < maxdic; i++) delete pHMgr[i]; 1.110 + maxdic = 0; 1.111 + pSMgr = NULL; 1.112 + pAMgr = NULL; 1.113 +#ifdef MOZILLA_CLIENT 1.114 + delete [] csconv; 1.115 +#endif 1.116 + csconv= NULL; 1.117 + if (encoding) free(encoding); 1.118 + encoding = NULL; 1.119 + if (affixpath) free(affixpath); 1.120 + affixpath = NULL; 1.121 +} 1.122 + 1.123 +// load extra dictionaries 1.124 +int Hunspell::add_dic(const char * dpath, const char * key) { 1.125 + if (maxdic == MAXDIC || !affixpath) return 1; 1.126 + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); 1.127 + if (pHMgr[maxdic]) maxdic++; else return 1; 1.128 + return 0; 1.129 +} 1.130 + 1.131 +// make a copy of src at destination while removing all leading 1.132 +// blanks and removing any trailing periods after recording 1.133 +// their presence with the abbreviation flag 1.134 +// also since already going through character by character, 1.135 +// set the capitalization type 1.136 +// return the length of the "cleaned" (and UTF-8 encoded) word 1.137 + 1.138 +int Hunspell::cleanword2(char * dest, const char * src, 1.139 + w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) 1.140 +{ 1.141 + unsigned char * p = (unsigned char *) dest; 1.142 + const unsigned char * q = (const unsigned char * ) src; 1.143 + 1.144 + // first skip over any leading blanks 1.145 + while ((*q != '\0') && (*q == ' ')) q++; 1.146 + 1.147 + // now strip off any trailing periods (recording their presence) 1.148 + *pabbrev = 0; 1.149 + int nl = strlen((const char *)q); 1.150 + while ((nl > 0) && (*(q+nl-1)=='.')) { 1.151 + nl--; 1.152 + (*pabbrev)++; 1.153 + } 1.154 + 1.155 + // if no characters are left it can't be capitalized 1.156 + if (nl <= 0) { 1.157 + *pcaptype = NOCAP; 1.158 + *p = '\0'; 1.159 + return 0; 1.160 + } 1.161 + 1.162 + strncpy(dest, (char *) q, nl); 1.163 + *(dest + nl) = '\0'; 1.164 + nl = strlen(dest); 1.165 + if (utf8) { 1.166 + *nc = u8_u16(dest_utf, MAXWORDLEN, dest); 1.167 + // don't check too long words 1.168 + if (*nc >= MAXWORDLEN) return 0; 1.169 + if (*nc == -1) { // big Unicode character (non BMP area) 1.170 + *pcaptype = NOCAP; 1.171 + return nl; 1.172 + } 1.173 + *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); 1.174 + } else { 1.175 + *pcaptype = get_captype(dest, nl, csconv); 1.176 + *nc = nl; 1.177 + } 1.178 + return nl; 1.179 +} 1.180 + 1.181 +int Hunspell::cleanword(char * dest, const char * src, 1.182 + int * pcaptype, int * pabbrev) 1.183 +{ 1.184 + unsigned char * p = (unsigned char *) dest; 1.185 + const unsigned char * q = (const unsigned char * ) src; 1.186 + int firstcap = 0; 1.187 + 1.188 + // first skip over any leading blanks 1.189 + while ((*q != '\0') && (*q == ' ')) q++; 1.190 + 1.191 + // now strip off any trailing periods (recording their presence) 1.192 + *pabbrev = 0; 1.193 + int nl = strlen((const char *)q); 1.194 + while ((nl > 0) && (*(q+nl-1)=='.')) { 1.195 + nl--; 1.196 + (*pabbrev)++; 1.197 + } 1.198 + 1.199 + // if no characters are left it can't be capitalized 1.200 + if (nl <= 0) { 1.201 + *pcaptype = NOCAP; 1.202 + *p = '\0'; 1.203 + return 0; 1.204 + } 1.205 + 1.206 + // now determine the capitalization type of the first nl letters 1.207 + int ncap = 0; 1.208 + int nneutral = 0; 1.209 + int nc = 0; 1.210 + 1.211 + if (!utf8) { 1.212 + while (nl > 0) { 1.213 + nc++; 1.214 + if (csconv[(*q)].ccase) ncap++; 1.215 + if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; 1.216 + *p++ = *q++; 1.217 + nl--; 1.218 + } 1.219 + // remember to terminate the destination string 1.220 + *p = '\0'; 1.221 + firstcap = csconv[(unsigned char)(*dest)].ccase; 1.222 + } else { 1.223 + unsigned short idx; 1.224 + w_char t[MAXWORDLEN]; 1.225 + nc = u8_u16(t, MAXWORDLEN, src); 1.226 + for (int i = 0; i < nc; i++) { 1.227 + idx = (t[i].h << 8) + t[i].l; 1.228 + unsigned short low = unicodetolower(idx, langnum); 1.229 + if (idx != low) ncap++; 1.230 + if (unicodetoupper(idx, langnum) == low) nneutral++; 1.231 + } 1.232 + u16_u8(dest, MAXWORDUTF8LEN, t, nc); 1.233 + if (ncap) { 1.234 + idx = (t[0].h << 8) + t[0].l; 1.235 + firstcap = (idx != unicodetolower(idx, langnum)); 1.236 + } 1.237 + } 1.238 + 1.239 + // now finally set the captype 1.240 + if (ncap == 0) { 1.241 + *pcaptype = NOCAP; 1.242 + } else if ((ncap == 1) && firstcap) { 1.243 + *pcaptype = INITCAP; 1.244 + } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ 1.245 + *pcaptype = ALLCAP; 1.246 + } else if ((ncap > 1) && firstcap) { 1.247 + *pcaptype = HUHINITCAP; 1.248 + } else { 1.249 + *pcaptype = HUHCAP; 1.250 + } 1.251 + return strlen(dest); 1.252 +} 1.253 + 1.254 +void Hunspell::mkallcap(char * p) 1.255 +{ 1.256 + if (utf8) { 1.257 + w_char u[MAXWORDLEN]; 1.258 + int nc = u8_u16(u, MAXWORDLEN, p); 1.259 + unsigned short idx; 1.260 + for (int i = 0; i < nc; i++) { 1.261 + idx = (u[i].h << 8) + u[i].l; 1.262 + if (idx != unicodetoupper(idx, langnum)) { 1.263 + u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); 1.264 + u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); 1.265 + } 1.266 + } 1.267 + u16_u8(p, MAXWORDUTF8LEN, u, nc); 1.268 + } else { 1.269 + while (*p != '\0') { 1.270 + *p = csconv[((unsigned char) *p)].cupper; 1.271 + p++; 1.272 + } 1.273 + } 1.274 +} 1.275 + 1.276 +int Hunspell::mkallcap2(char * p, w_char * u, int nc) 1.277 +{ 1.278 + if (utf8) { 1.279 + unsigned short idx; 1.280 + for (int i = 0; i < nc; i++) { 1.281 + idx = (u[i].h << 8) + u[i].l; 1.282 + unsigned short up = unicodetoupper(idx, langnum); 1.283 + if (idx != up) { 1.284 + u[i].h = (unsigned char) (up >> 8); 1.285 + u[i].l = (unsigned char) (up & 0x00FF); 1.286 + } 1.287 + } 1.288 + u16_u8(p, MAXWORDUTF8LEN, u, nc); 1.289 + return strlen(p); 1.290 + } else { 1.291 + while (*p != '\0') { 1.292 + *p = csconv[((unsigned char) *p)].cupper; 1.293 + p++; 1.294 + } 1.295 + } 1.296 + return nc; 1.297 +} 1.298 + 1.299 + 1.300 +void Hunspell::mkallsmall(char * p) 1.301 +{ 1.302 + while (*p != '\0') { 1.303 + *p = csconv[((unsigned char) *p)].clower; 1.304 + p++; 1.305 + } 1.306 +} 1.307 + 1.308 +int Hunspell::mkallsmall2(char * p, w_char * u, int nc) 1.309 +{ 1.310 + if (utf8) { 1.311 + unsigned short idx; 1.312 + for (int i = 0; i < nc; i++) { 1.313 + idx = (u[i].h << 8) + u[i].l; 1.314 + unsigned short low = unicodetolower(idx, langnum); 1.315 + if (idx != low) { 1.316 + u[i].h = (unsigned char) (low >> 8); 1.317 + u[i].l = (unsigned char) (low & 0x00FF); 1.318 + } 1.319 + } 1.320 + u16_u8(p, MAXWORDUTF8LEN, u, nc); 1.321 + return strlen(p); 1.322 + } else { 1.323 + while (*p != '\0') { 1.324 + *p = csconv[((unsigned char) *p)].clower; 1.325 + p++; 1.326 + } 1.327 + } 1.328 + return nc; 1.329 +} 1.330 + 1.331 +// convert UTF-8 sharp S codes to latin 1 1.332 +char * Hunspell::sharps_u8_l1(char * dest, char * source) { 1.333 + char * p = dest; 1.334 + *p = *source; 1.335 + for (p++, source++; *(source - 1); p++, source++) { 1.336 + *p = *source; 1.337 + if (*source == '\x9F') *--p = '\xDF'; 1.338 + } 1.339 + return dest; 1.340 +} 1.341 + 1.342 +// recursive search for right ss - sharp s permutations 1.343 +hentry * Hunspell::spellsharps(char * base, char * pos, int n, 1.344 + int repnum, char * tmp, int * info, char **root) { 1.345 + pos = strstr(pos, "ss"); 1.346 + if (pos && (n < MAXSHARPS)) { 1.347 + *pos = '\xC3'; 1.348 + *(pos + 1) = '\x9F'; 1.349 + hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); 1.350 + if (h) return h; 1.351 + *pos = 's'; 1.352 + *(pos + 1) = 's'; 1.353 + h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root); 1.354 + if (h) return h; 1.355 + } else if (repnum > 0) { 1.356 + if (utf8) return checkword(base, info, root); 1.357 + return checkword(sharps_u8_l1(tmp, base), info, root); 1.358 + } 1.359 + return NULL; 1.360 +} 1.361 + 1.362 +int Hunspell::is_keepcase(const hentry * rv) { 1.363 + return pAMgr && rv->astr && pAMgr->get_keepcase() && 1.364 + TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); 1.365 +} 1.366 + 1.367 +/* insert a word to the beginning of the suggestion array and return ns */ 1.368 +int Hunspell::insert_sug(char ***slst, char * word, int ns) { 1.369 + char * dup = mystrdup(word); 1.370 + if (!dup) return ns; 1.371 + if (ns == MAXSUGGESTION) { 1.372 + ns--; 1.373 + free((*slst)[ns]); 1.374 + } 1.375 + for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; 1.376 + (*slst)[0] = dup; 1.377 + return ns + 1; 1.378 +} 1.379 + 1.380 +int Hunspell::spell(const char * word, int * info, char ** root) 1.381 +{ 1.382 + struct hentry * rv=NULL; 1.383 + // need larger vector. For example, Turkish capital letter I converted a 1.384 + // 2-byte UTF-8 character (dotless i) by mkallsmall. 1.385 + char cw[MAXWORDUTF8LEN]; 1.386 + char wspace[MAXWORDUTF8LEN]; 1.387 + w_char unicw[MAXWORDLEN]; 1.388 + // Hunspell supports XML input of the simplified API (see manual) 1.389 + if (strcmp(word, SPELL_XML) == 0) return 1; 1.390 + int nc = strlen(word); 1.391 + int wl2 = 0; 1.392 + if (utf8) { 1.393 + if (nc >= MAXWORDUTF8LEN) return 0; 1.394 + } else { 1.395 + if (nc >= MAXWORDLEN) return 0; 1.396 + } 1.397 + int captype = 0; 1.398 + int abbv = 0; 1.399 + int wl = 0; 1.400 + 1.401 + // input conversion 1.402 + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; 1.403 + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); 1.404 + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); 1.405 + 1.406 + int info2 = 0; 1.407 + if (wl == 0 || maxdic == 0) return 1; 1.408 + if (root) *root = NULL; 1.409 + 1.410 + // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) 1.411 + enum { NBEGIN, NNUM, NSEP }; 1.412 + int nstate = NBEGIN; 1.413 + int i; 1.414 + 1.415 + for (i = 0; (i < wl); i++) { 1.416 + if ((cw[i] <= '9') && (cw[i] >= '0')) { 1.417 + nstate = NNUM; 1.418 + } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) { 1.419 + if ((nstate == NSEP) || (i == 0)) break; 1.420 + nstate = NSEP; 1.421 + } else break; 1.422 + } 1.423 + if ((i == wl) && (nstate == NNUM)) return 1; 1.424 + if (!info) info = &info2; else *info = 0; 1.425 + 1.426 + switch(captype) { 1.427 + case HUHCAP: 1.428 + case HUHINITCAP: 1.429 + *info += SPELL_ORIGCAP; 1.430 + case NOCAP: { 1.431 + rv = checkword(cw, info, root); 1.432 + if ((abbv) && !(rv)) { 1.433 + memcpy(wspace,cw,wl); 1.434 + *(wspace+wl) = '.'; 1.435 + *(wspace+wl+1) = '\0'; 1.436 + rv = checkword(wspace, info, root); 1.437 + } 1.438 + break; 1.439 + } 1.440 + case ALLCAP: { 1.441 + *info += SPELL_ORIGCAP; 1.442 + rv = checkword(cw, info, root); 1.443 + if (rv) break; 1.444 + if (abbv) { 1.445 + memcpy(wspace,cw,wl); 1.446 + *(wspace+wl) = '.'; 1.447 + *(wspace+wl+1) = '\0'; 1.448 + rv = checkword(wspace, info, root); 1.449 + if (rv) break; 1.450 + } 1.451 + // Spec. prefix handling for Catalan, French, Italian: 1.452 + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). 1.453 + if (pAMgr && strchr(cw, '\'')) { 1.454 + wl = mkallsmall2(cw, unicw, nc); 1.455 + //There are no really sane circumstances where this could fail, 1.456 + //but anyway... 1.457 + if (char * apostrophe = strchr(cw, '\'')) { 1.458 + if (utf8) { 1.459 + w_char tmpword[MAXWORDLEN]; 1.460 + *apostrophe = '\0'; 1.461 + wl2 = u8_u16(tmpword, MAXWORDLEN, cw); 1.462 + *apostrophe = '\''; 1.463 + if (wl2 < nc) { 1.464 + mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); 1.465 + rv = checkword(cw, info, root); 1.466 + if (rv) break; 1.467 + } 1.468 + } else { 1.469 + mkinitcap2(apostrophe + 1, unicw, nc); 1.470 + rv = checkword(cw, info, root); 1.471 + if (rv) break; 1.472 + } 1.473 + } 1.474 + mkinitcap2(cw, unicw, nc); 1.475 + rv = checkword(cw, info, root); 1.476 + if (rv) break; 1.477 + } 1.478 + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { 1.479 + char tmpword[MAXWORDUTF8LEN]; 1.480 + wl = mkallsmall2(cw, unicw, nc); 1.481 + memcpy(wspace,cw,(wl+1)); 1.482 + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); 1.483 + if (!rv) { 1.484 + wl2 = mkinitcap2(cw, unicw, nc); 1.485 + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); 1.486 + } 1.487 + if ((abbv) && !(rv)) { 1.488 + *(wspace+wl) = '.'; 1.489 + *(wspace+wl+1) = '\0'; 1.490 + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); 1.491 + if (!rv) { 1.492 + memcpy(wspace, cw, wl2); 1.493 + *(wspace+wl2) = '.'; 1.494 + *(wspace+wl2+1) = '\0'; 1.495 + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); 1.496 + } 1.497 + } 1.498 + if (rv) break; 1.499 + } 1.500 + } 1.501 + case INITCAP: { 1.502 + *info += SPELL_ORIGCAP; 1.503 + wl = mkallsmall2(cw, unicw, nc); 1.504 + memcpy(wspace,cw,(wl+1)); 1.505 + wl2 = mkinitcap2(cw, unicw, nc); 1.506 + if (captype == INITCAP) *info += SPELL_INITCAP; 1.507 + rv = checkword(cw, info, root); 1.508 + if (captype == INITCAP) *info -= SPELL_INITCAP; 1.509 + // forbid bad capitalization 1.510 + // (for example, ijs -> Ijs instead of IJs in Dutch) 1.511 + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) 1.512 + if (*info & SPELL_FORBIDDEN) { 1.513 + rv = NULL; 1.514 + break; 1.515 + } 1.516 + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; 1.517 + if (rv) break; 1.518 + 1.519 + rv = checkword(wspace, info, root); 1.520 + if (abbv && !rv) { 1.521 + 1.522 + *(wspace+wl) = '.'; 1.523 + *(wspace+wl+1) = '\0'; 1.524 + rv = checkword(wspace, info, root); 1.525 + if (!rv) { 1.526 + memcpy(wspace, cw, wl2); 1.527 + *(wspace+wl2) = '.'; 1.528 + *(wspace+wl2+1) = '\0'; 1.529 + if (captype == INITCAP) *info += SPELL_INITCAP; 1.530 + rv = checkword(wspace, info, root); 1.531 + if (captype == INITCAP) *info -= SPELL_INITCAP; 1.532 + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; 1.533 + break; 1.534 + } 1.535 + } 1.536 + if (rv && is_keepcase(rv) && 1.537 + ((captype == ALLCAP) || 1.538 + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed 1.539 + // in INITCAP form, too. 1.540 + !(pAMgr->get_checksharps() && 1.541 + ((utf8 && strstr(wspace, "\xC3\x9F")) || 1.542 + (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; 1.543 + break; 1.544 + } 1.545 + } 1.546 + 1.547 + if (rv) { 1.548 + if (pAMgr && pAMgr->get_warn() && rv->astr && 1.549 + TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) { 1.550 + *info += SPELL_WARN; 1.551 + if (pAMgr->get_forbidwarn()) return 0; 1.552 + return HUNSPELL_OK_WARN; 1.553 + } 1.554 + return HUNSPELL_OK; 1.555 + } 1.556 + 1.557 + // recursive breaking at break points 1.558 + if (wordbreak) { 1.559 + char * s; 1.560 + char r; 1.561 + int nbr = 0; 1.562 + wl = strlen(cw); 1.563 + int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; 1.564 + 1.565 + // calculate break points for recursion limit 1.566 + for (int j = 0; j < numbreak; j++) { 1.567 + s = cw; 1.568 + do { 1.569 + s = (char *) strstr(s, wordbreak[j]); 1.570 + if (s) { 1.571 + nbr++; 1.572 + s++; 1.573 + } 1.574 + } while (s); 1.575 + } 1.576 + if (nbr >= 10) return 0; 1.577 + 1.578 + // check boundary patterns (^begin and end$) 1.579 + for (int j = 0; j < numbreak; j++) { 1.580 + int plen = strlen(wordbreak[j]); 1.581 + if (plen == 1 || plen > wl) continue; 1.582 + if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 1.583 + && spell(cw + plen - 1)) return 1; 1.584 + if (wordbreak[j][plen - 1] == '$' && 1.585 + strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { 1.586 + r = cw[wl - plen + 1]; 1.587 + cw[wl - plen + 1] = '\0'; 1.588 + if (spell(cw)) return 1; 1.589 + cw[wl - plen + 1] = r; 1.590 + } 1.591 + } 1.592 + 1.593 + // other patterns 1.594 + for (int j = 0; j < numbreak; j++) { 1.595 + int plen = strlen(wordbreak[j]); 1.596 + s=(char *) strstr(cw, wordbreak[j]); 1.597 + if (s && (s > cw) && (s < cw + wl - plen)) { 1.598 + if (!spell(s + plen)) continue; 1.599 + r = *s; 1.600 + *s = '\0'; 1.601 + // examine 2 sides of the break point 1.602 + if (spell(cw)) return 1; 1.603 + *s = r; 1.604 + 1.605 + // LANG_hu: spec. dash rule 1.606 + if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { 1.607 + r = s[1]; 1.608 + s[1] = '\0'; 1.609 + if (spell(cw)) return 1; // check the first part with dash 1.610 + s[1] = r; 1.611 + } 1.612 + // end of LANG speficic region 1.613 + 1.614 + } 1.615 + } 1.616 + } 1.617 + 1.618 + return 0; 1.619 +} 1.620 + 1.621 +struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) 1.622 +{ 1.623 + struct hentry * he = NULL; 1.624 + int len, i; 1.625 + char w2[MAXWORDUTF8LEN]; 1.626 + const char * word; 1.627 + 1.628 + char * ignoredchars = pAMgr->get_ignore(); 1.629 + if (ignoredchars != NULL) { 1.630 + strcpy(w2, w); 1.631 + if (utf8) { 1.632 + int ignoredchars_utf16_len; 1.633 + unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len); 1.634 + remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len); 1.635 + } else { 1.636 + remove_ignored_chars(w2,ignoredchars); 1.637 + } 1.638 + word = w2; 1.639 + } else word = w; 1.640 + 1.641 + len = strlen(word); 1.642 + 1.643 + if (!len) 1.644 + return NULL; 1.645 + 1.646 + // word reversing wrapper for complex prefixes 1.647 + if (complexprefixes) { 1.648 + if (word != w2) { 1.649 + strcpy(w2, word); 1.650 + word = w2; 1.651 + } 1.652 + if (utf8) reverseword_utf(w2); else reverseword(w2); 1.653 + } 1.654 + 1.655 + // look word in hash table 1.656 + for (i = 0; (i < maxdic) && !he; i ++) { 1.657 + he = (pHMgr[i])->lookup(word); 1.658 + 1.659 + // check forbidden and onlyincompound words 1.660 + if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { 1.661 + if (info) *info += SPELL_FORBIDDEN; 1.662 + // LANG_hu section: set dash information for suggestions 1.663 + if (langnum == LANG_hu) { 1.664 + if (pAMgr->get_compoundflag() && 1.665 + TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { 1.666 + if (info) *info += SPELL_COMPOUND; 1.667 + } 1.668 + } 1.669 + return NULL; 1.670 + } 1.671 + 1.672 + // he = next not needaffix, onlyincompound homonym or onlyupcase word 1.673 + while (he && (he->astr) && 1.674 + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || 1.675 + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || 1.676 + (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) 1.677 + )) he = he->next_homonym; 1.678 + } 1.679 + 1.680 + // check with affixes 1.681 + if (!he && pAMgr) { 1.682 + // try stripping off affixes */ 1.683 + he = pAMgr->affix_check(word, len, 0); 1.684 + 1.685 + // check compound restriction and onlyupcase 1.686 + if (he && he->astr && ( 1.687 + (pAMgr->get_onlyincompound() && 1.688 + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || 1.689 + (info && (*info & SPELL_INITCAP) && 1.690 + TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { 1.691 + he = NULL; 1.692 + } 1.693 + 1.694 + if (he) { 1.695 + if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { 1.696 + if (info) *info += SPELL_FORBIDDEN; 1.697 + return NULL; 1.698 + } 1.699 + if (root) { 1.700 + *root = mystrdup(he->word); 1.701 + if (*root && complexprefixes) { 1.702 + if (utf8) reverseword_utf(*root); else reverseword(*root); 1.703 + } 1.704 + } 1.705 + // try check compound word 1.706 + } else if (pAMgr->get_compound()) { 1.707 + he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info); 1.708 + // LANG_hu section: `moving rule' with last dash 1.709 + if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { 1.710 + char * dup = mystrdup(word); 1.711 + if (!dup) return NULL; 1.712 + dup[len-1] = '\0'; 1.713 + he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info); 1.714 + free(dup); 1.715 + } 1.716 + // end of LANG speficic region 1.717 + if (he) { 1.718 + if (root) { 1.719 + *root = mystrdup(he->word); 1.720 + if (*root && complexprefixes) { 1.721 + if (utf8) reverseword_utf(*root); else reverseword(*root); 1.722 + } 1.723 + } 1.724 + if (info) *info += SPELL_COMPOUND; 1.725 + } 1.726 + } 1.727 + 1.728 + } 1.729 + 1.730 + return he; 1.731 +} 1.732 + 1.733 +int Hunspell::suggest(char*** slst, const char * word) 1.734 +{ 1.735 + int onlycmpdsug = 0; 1.736 + char cw[MAXWORDUTF8LEN]; 1.737 + char wspace[MAXWORDUTF8LEN]; 1.738 + if (!pSMgr || maxdic == 0) return 0; 1.739 + w_char unicw[MAXWORDLEN]; 1.740 + *slst = NULL; 1.741 + // process XML input of the simplified API (see manual) 1.742 + if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { 1.743 + return spellml(slst, word); 1.744 + } 1.745 + int nc = strlen(word); 1.746 + if (utf8) { 1.747 + if (nc >= MAXWORDUTF8LEN) return 0; 1.748 + } else { 1.749 + if (nc >= MAXWORDLEN) return 0; 1.750 + } 1.751 + int captype = 0; 1.752 + int abbv = 0; 1.753 + int wl = 0; 1.754 + 1.755 + // input conversion 1.756 + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; 1.757 + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); 1.758 + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); 1.759 + 1.760 + if (wl == 0) return 0; 1.761 + int ns = 0; 1.762 + int capwords = 0; 1.763 + 1.764 + // check capitalized form for FORCEUCASE 1.765 + if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { 1.766 + int info = SPELL_ORIGCAP; 1.767 + char ** wlst; 1.768 + if (checkword(cw, &info, NULL)) { 1.769 + if (*slst) { 1.770 + wlst = *slst; 1.771 + } else { 1.772 + wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *)); 1.773 + if (wlst == NULL) return -1; 1.774 + *slst = wlst; 1.775 + for (int i = 0; i < MAXSUGGESTION; i++) { 1.776 + wlst[i] = NULL; 1.777 + } 1.778 + } 1.779 + wlst[0] = mystrdup(cw); 1.780 + mkinitcap(wlst[0]); 1.781 + return 1; 1.782 + } 1.783 + } 1.784 + 1.785 + switch(captype) { 1.786 + case NOCAP: { 1.787 + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); 1.788 + break; 1.789 + } 1.790 + 1.791 + case INITCAP: { 1.792 + capwords = 1; 1.793 + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); 1.794 + if (ns == -1) break; 1.795 + memcpy(wspace,cw,(wl+1)); 1.796 + mkallsmall2(wspace, unicw, nc); 1.797 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.798 + break; 1.799 + } 1.800 + case HUHINITCAP: 1.801 + capwords = 1; 1.802 + case HUHCAP: { 1.803 + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); 1.804 + if (ns != -1) { 1.805 + int prevns; 1.806 + // something.The -> something. The 1.807 + char * dot = strchr(cw, '.'); 1.808 + if (dot && (dot > cw)) { 1.809 + int captype_; 1.810 + if (utf8) { 1.811 + w_char w_[MAXWORDLEN]; 1.812 + int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); 1.813 + captype_ = get_captype_utf8(w_, wl_, langnum); 1.814 + } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); 1.815 + if (captype_ == INITCAP) { 1.816 + char * st = mystrdup(cw); 1.817 + if (st) st = (char *) realloc(st, wl + 2); 1.818 + if (st) { 1.819 + st[(dot - cw) + 1] = ' '; 1.820 + strcpy(st + (dot - cw) + 2, dot + 1); 1.821 + ns = insert_sug(slst, st, ns); 1.822 + free(st); 1.823 + } 1.824 + } 1.825 + } 1.826 + if (captype == HUHINITCAP) { 1.827 + // TheOpenOffice.org -> The OpenOffice.org 1.828 + memcpy(wspace,cw,(wl+1)); 1.829 + mkinitsmall2(wspace, unicw, nc); 1.830 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.831 + } 1.832 + memcpy(wspace,cw,(wl+1)); 1.833 + mkallsmall2(wspace, unicw, nc); 1.834 + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); 1.835 + prevns = ns; 1.836 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.837 + if (captype == HUHINITCAP) { 1.838 + mkinitcap2(wspace, unicw, nc); 1.839 + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); 1.840 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.841 + } 1.842 + // aNew -> "a New" (instead of "a new") 1.843 + for (int j = prevns; j < ns; j++) { 1.844 + char * space = strchr((*slst)[j],' '); 1.845 + if (space) { 1.846 + int slen = strlen(space + 1); 1.847 + // different case after space (need capitalisation) 1.848 + if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { 1.849 + w_char w[MAXWORDLEN]; 1.850 + int wc = 0; 1.851 + char * r = (*slst)[j]; 1.852 + if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); 1.853 + mkinitcap2(space + 1, w, wc); 1.854 + // set as first suggestion 1.855 + for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; 1.856 + (*slst)[0] = r; 1.857 + } 1.858 + } 1.859 + } 1.860 + } 1.861 + break; 1.862 + } 1.863 + 1.864 + case ALLCAP: { 1.865 + memcpy(wspace, cw, (wl+1)); 1.866 + mkallsmall2(wspace, unicw, nc); 1.867 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.868 + if (ns == -1) break; 1.869 + if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) 1.870 + ns = insert_sug(slst, wspace, ns); 1.871 + mkinitcap2(wspace, unicw, nc); 1.872 + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); 1.873 + for (int j=0; j < ns; j++) { 1.874 + mkallcap((*slst)[j]); 1.875 + if (pAMgr && pAMgr->get_checksharps()) { 1.876 + char * pos; 1.877 + if (utf8) { 1.878 + pos = strstr((*slst)[j], "\xC3\x9F"); 1.879 + while (pos) { 1.880 + *pos = 'S'; 1.881 + *(pos+1) = 'S'; 1.882 + pos = strstr(pos+2, "\xC3\x9F"); 1.883 + } 1.884 + } else { 1.885 + pos = strchr((*slst)[j], '\xDF'); 1.886 + while (pos) { 1.887 + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); 1.888 + mystrrep((*slst)[j], "\xDF", "SS"); 1.889 + pos = strchr((*slst)[j], '\xDF'); 1.890 + } 1.891 + } 1.892 + } 1.893 + } 1.894 + break; 1.895 + } 1.896 + } 1.897 + 1.898 + // LANG_hu section: replace '-' with ' ' in Hungarian 1.899 + if (langnum == LANG_hu) { 1.900 + for (int j=0; j < ns; j++) { 1.901 + char * pos = strchr((*slst)[j],'-'); 1.902 + if (pos) { 1.903 + int info; 1.904 + char w[MAXWORDUTF8LEN]; 1.905 + *pos = '\0'; 1.906 + strcpy(w, (*slst)[j]); 1.907 + strcat(w, pos + 1); 1.908 + spell(w, &info, NULL); 1.909 + if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { 1.910 + *pos = ' '; 1.911 + } else *pos = '-'; 1.912 + } 1.913 + } 1.914 + } 1.915 + // END OF LANG_hu section 1.916 + 1.917 + // try ngram approach since found nothing or only compound words 1.918 + if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) { 1.919 + switch(captype) { 1.920 + case NOCAP: { 1.921 + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); 1.922 + break; 1.923 + } 1.924 + case HUHINITCAP: 1.925 + capwords = 1; 1.926 + case HUHCAP: { 1.927 + memcpy(wspace,cw,(wl+1)); 1.928 + mkallsmall2(wspace, unicw, nc); 1.929 + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); 1.930 + break; 1.931 + } 1.932 + case INITCAP: { 1.933 + capwords = 1; 1.934 + memcpy(wspace,cw,(wl+1)); 1.935 + mkallsmall2(wspace, unicw, nc); 1.936 + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); 1.937 + break; 1.938 + } 1.939 + case ALLCAP: { 1.940 + memcpy(wspace,cw,(wl+1)); 1.941 + mkallsmall2(wspace, unicw, nc); 1.942 + int oldns = ns; 1.943 + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); 1.944 + for (int j = oldns; j < ns; j++) 1.945 + mkallcap((*slst)[j]); 1.946 + break; 1.947 + } 1.948 + } 1.949 + } 1.950 + 1.951 + // try dash suggestion (Afo-American -> Afro-American) 1.952 + if (char * pos = strchr(cw, '-')) { 1.953 + char * ppos = cw; 1.954 + int nodashsug = 1; 1.955 + char ** nlst = NULL; 1.956 + int nn = 0; 1.957 + int last = 0; 1.958 + if (*slst) { 1.959 + for (int j = 0; j < ns && nodashsug == 1; j++) { 1.960 + if (strchr((*slst)[j], '-')) nodashsug = 0; 1.961 + } 1.962 + } 1.963 + while (nodashsug && !last) { 1.964 + if (*pos == '\0') last = 1; else *pos = '\0'; 1.965 + if (!spell(ppos)) { 1.966 + nn = suggest(&nlst, ppos); 1.967 + for (int j = nn - 1; j >= 0; j--) { 1.968 + strncpy(wspace, cw, ppos - cw); 1.969 + strcpy(wspace + (ppos - cw), nlst[j]); 1.970 + if (!last) { 1.971 + strcat(wspace, "-"); 1.972 + strcat(wspace, pos + 1); 1.973 + } 1.974 + ns = insert_sug(slst, wspace, ns); 1.975 + free(nlst[j]); 1.976 + } 1.977 + if (nlst != NULL) free(nlst); 1.978 + nodashsug = 0; 1.979 + } 1.980 + if (!last) { 1.981 + *pos = '-'; 1.982 + ppos = pos + 1; 1.983 + pos = strchr(ppos, '-'); 1.984 + } 1.985 + if (!pos) pos = cw + strlen(cw); 1.986 + } 1.987 + } 1.988 + 1.989 + // word reversing wrapper for complex prefixes 1.990 + if (complexprefixes) { 1.991 + for (int j = 0; j < ns; j++) { 1.992 + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); 1.993 + } 1.994 + } 1.995 + 1.996 + // capitalize 1.997 + if (capwords) for (int j=0; j < ns; j++) { 1.998 + mkinitcap((*slst)[j]); 1.999 + } 1.1000 + 1.1001 + // expand suggestions with dot(s) 1.1002 + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { 1.1003 + for (int j = 0; j < ns; j++) { 1.1004 + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); 1.1005 + strcat((*slst)[j], word + strlen(word) - abbv); 1.1006 + } 1.1007 + } 1.1008 + 1.1009 + // remove bad capitalized and forbidden forms 1.1010 + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { 1.1011 + switch (captype) { 1.1012 + case INITCAP: 1.1013 + case ALLCAP: { 1.1014 + int l = 0; 1.1015 + for (int j=0; j < ns; j++) { 1.1016 + if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { 1.1017 + char s[MAXSWUTF8L]; 1.1018 + w_char w[MAXSWL]; 1.1019 + int len; 1.1020 + if (utf8) { 1.1021 + len = u8_u16(w, MAXSWL, (*slst)[j]); 1.1022 + } else { 1.1023 + strcpy(s, (*slst)[j]); 1.1024 + len = strlen(s); 1.1025 + } 1.1026 + mkallsmall2(s, w, len); 1.1027 + free((*slst)[j]); 1.1028 + if (spell(s)) { 1.1029 + (*slst)[l] = mystrdup(s); 1.1030 + if ((*slst)[l]) l++; 1.1031 + } else { 1.1032 + mkinitcap2(s, w, len); 1.1033 + if (spell(s)) { 1.1034 + (*slst)[l] = mystrdup(s); 1.1035 + if ((*slst)[l]) l++; 1.1036 + } 1.1037 + } 1.1038 + } else { 1.1039 + (*slst)[l] = (*slst)[j]; 1.1040 + l++; 1.1041 + } 1.1042 + } 1.1043 + ns = l; 1.1044 + } 1.1045 + } 1.1046 + } 1.1047 + 1.1048 + // remove duplications 1.1049 + int l = 0; 1.1050 + for (int j = 0; j < ns; j++) { 1.1051 + (*slst)[l] = (*slst)[j]; 1.1052 + for (int k = 0; k < l; k++) { 1.1053 + if (strcmp((*slst)[k], (*slst)[j]) == 0) { 1.1054 + free((*slst)[j]); 1.1055 + l--; 1.1056 + break; 1.1057 + } 1.1058 + } 1.1059 + l++; 1.1060 + } 1.1061 + ns = l; 1.1062 + 1.1063 + // output conversion 1.1064 + rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; 1.1065 + for (int j = 0; rl && j < ns; j++) { 1.1066 + if (rl->conv((*slst)[j], wspace)) { 1.1067 + free((*slst)[j]); 1.1068 + (*slst)[j] = mystrdup(wspace); 1.1069 + } 1.1070 + } 1.1071 + 1.1072 + // if suggestions removed by nosuggest, onlyincompound parameters 1.1073 + if (l == 0 && *slst) { 1.1074 + free(*slst); 1.1075 + *slst = NULL; 1.1076 + } 1.1077 + return l; 1.1078 +} 1.1079 + 1.1080 +void Hunspell::free_list(char *** slst, int n) { 1.1081 + freelist(slst, n); 1.1082 +} 1.1083 + 1.1084 +char * Hunspell::get_dic_encoding() 1.1085 +{ 1.1086 + return encoding; 1.1087 +} 1.1088 + 1.1089 +#ifdef HUNSPELL_EXPERIMENTAL 1.1090 +// XXX need UTF-8 support 1.1091 +int Hunspell::suggest_auto(char*** slst, const char * word) 1.1092 +{ 1.1093 + char cw[MAXWORDUTF8LEN]; 1.1094 + char wspace[MAXWORDUTF8LEN]; 1.1095 + if (!pSMgr || maxdic == 0) return 0; 1.1096 + int wl = strlen(word); 1.1097 + if (utf8) { 1.1098 + if (wl >= MAXWORDUTF8LEN) return 0; 1.1099 + } else { 1.1100 + if (wl >= MAXWORDLEN) return 0; 1.1101 + } 1.1102 + int captype = 0; 1.1103 + int abbv = 0; 1.1104 + wl = cleanword(cw, word, &captype, &abbv); 1.1105 + if (wl == 0) return 0; 1.1106 + int ns = 0; 1.1107 + *slst = NULL; // HU, nsug in pSMgr->suggest 1.1108 + 1.1109 + switch(captype) { 1.1110 + case NOCAP: { 1.1111 + ns = pSMgr->suggest_auto(slst, cw, ns); 1.1112 + if (ns>0) break; 1.1113 + break; 1.1114 + } 1.1115 + 1.1116 + case INITCAP: { 1.1117 + memcpy(wspace,cw,(wl+1)); 1.1118 + mkallsmall(wspace); 1.1119 + ns = pSMgr->suggest_auto(slst, wspace, ns); 1.1120 + for (int j=0; j < ns; j++) 1.1121 + mkinitcap((*slst)[j]); 1.1122 + ns = pSMgr->suggest_auto(slst, cw, ns); 1.1123 + break; 1.1124 + 1.1125 + } 1.1126 + 1.1127 + case HUHINITCAP: 1.1128 + case HUHCAP: { 1.1129 + ns = pSMgr->suggest_auto(slst, cw, ns); 1.1130 + if (ns == 0) { 1.1131 + memcpy(wspace,cw,(wl+1)); 1.1132 + mkallsmall(wspace); 1.1133 + ns = pSMgr->suggest_auto(slst, wspace, ns); 1.1134 + } 1.1135 + break; 1.1136 + } 1.1137 + 1.1138 + case ALLCAP: { 1.1139 + memcpy(wspace,cw,(wl+1)); 1.1140 + mkallsmall(wspace); 1.1141 + ns = pSMgr->suggest_auto(slst, wspace, ns); 1.1142 + 1.1143 + mkinitcap(wspace); 1.1144 + ns = pSMgr->suggest_auto(slst, wspace, ns); 1.1145 + 1.1146 + for (int j=0; j < ns; j++) 1.1147 + mkallcap((*slst)[j]); 1.1148 + break; 1.1149 + } 1.1150 + } 1.1151 + 1.1152 + // word reversing wrapper for complex prefixes 1.1153 + if (complexprefixes) { 1.1154 + for (int j = 0; j < ns; j++) { 1.1155 + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); 1.1156 + } 1.1157 + } 1.1158 + 1.1159 + // expand suggestions with dot(s) 1.1160 + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { 1.1161 + for (int j = 0; j < ns; j++) { 1.1162 + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); 1.1163 + strcat((*slst)[j], word + strlen(word) - abbv); 1.1164 + } 1.1165 + } 1.1166 + 1.1167 + // LANG_hu section: replace '-' with ' ' in Hungarian 1.1168 + if (langnum == LANG_hu) { 1.1169 + for (int j=0; j < ns; j++) { 1.1170 + char * pos = strchr((*slst)[j],'-'); 1.1171 + if (pos) { 1.1172 + int info; 1.1173 + char w[MAXWORDUTF8LEN]; 1.1174 + *pos = '\0'; 1.1175 + strcpy(w, (*slst)[j]); 1.1176 + strcat(w, pos + 1); 1.1177 + spell(w, &info, NULL); 1.1178 + if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) { 1.1179 + *pos = ' '; 1.1180 + } else *pos = '-'; 1.1181 + } 1.1182 + } 1.1183 + } 1.1184 + // END OF LANG_hu section 1.1185 + return ns; 1.1186 +} 1.1187 +#endif 1.1188 + 1.1189 +int Hunspell::stem(char*** slst, char ** desc, int n) 1.1190 +{ 1.1191 + char result[MAXLNLEN]; 1.1192 + char result2[MAXLNLEN]; 1.1193 + *slst = NULL; 1.1194 + if (n == 0) return 0; 1.1195 + *result2 = '\0'; 1.1196 + for (int i = 0; i < n; i++) { 1.1197 + *result = '\0'; 1.1198 + // add compound word parts (except the last one) 1.1199 + char * s = (char *) desc[i]; 1.1200 + char * part = strstr(s, MORPH_PART); 1.1201 + if (part) { 1.1202 + char * nextpart = strstr(part + 1, MORPH_PART); 1.1203 + while (nextpart) { 1.1204 + copy_field(result + strlen(result), part, MORPH_PART); 1.1205 + part = nextpart; 1.1206 + nextpart = strstr(part + 1, MORPH_PART); 1.1207 + } 1.1208 + s = part; 1.1209 + } 1.1210 + 1.1211 + char **pl; 1.1212 + char tok[MAXLNLEN]; 1.1213 + strcpy(tok, s); 1.1214 + char * alt = strstr(tok, " | "); 1.1215 + while (alt) { 1.1216 + alt[1] = MSEP_ALT; 1.1217 + alt = strstr(alt, " | "); 1.1218 + } 1.1219 + int pln = line_tok(tok, &pl, MSEP_ALT); 1.1220 + for (int k = 0; k < pln; k++) { 1.1221 + // add derivational suffixes 1.1222 + if (strstr(pl[k], MORPH_DERI_SFX)) { 1.1223 + // remove inflectional suffixes 1.1224 + char * is = strstr(pl[k], MORPH_INFL_SFX); 1.1225 + if (is) *is = '\0'; 1.1226 + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); 1.1227 + if (sg) { 1.1228 + char ** gen; 1.1229 + int genl = line_tok(sg, &gen, MSEP_REC); 1.1230 + free(sg); 1.1231 + for (int j = 0; j < genl; j++) { 1.1232 + sprintf(result2 + strlen(result2), "%c%s%s", 1.1233 + MSEP_REC, result, gen[j]); 1.1234 + } 1.1235 + freelist(&gen, genl); 1.1236 + } 1.1237 + } else { 1.1238 + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); 1.1239 + if (strstr(pl[k], MORPH_SURF_PFX)) { 1.1240 + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); 1.1241 + } 1.1242 + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); 1.1243 + } 1.1244 + } 1.1245 + freelist(&pl, pln); 1.1246 + } 1.1247 + int sln = line_tok(result2, slst, MSEP_REC); 1.1248 + return uniqlist(*slst, sln); 1.1249 + 1.1250 +} 1.1251 + 1.1252 +int Hunspell::stem(char*** slst, const char * word) 1.1253 +{ 1.1254 + char ** pl; 1.1255 + int pln = analyze(&pl, word); 1.1256 + int pln2 = stem(slst, pl, pln); 1.1257 + freelist(&pl, pln); 1.1258 + return pln2; 1.1259 +} 1.1260 + 1.1261 +#ifdef HUNSPELL_EXPERIMENTAL 1.1262 +int Hunspell::suggest_pos_stems(char*** slst, const char * word) 1.1263 +{ 1.1264 + char cw[MAXWORDUTF8LEN]; 1.1265 + char wspace[MAXWORDUTF8LEN]; 1.1266 + if (! pSMgr || maxdic == 0) return 0; 1.1267 + int wl = strlen(word); 1.1268 + if (utf8) { 1.1269 + if (wl >= MAXWORDUTF8LEN) return 0; 1.1270 + } else { 1.1271 + if (wl >= MAXWORDLEN) return 0; 1.1272 + } 1.1273 + int captype = 0; 1.1274 + int abbv = 0; 1.1275 + wl = cleanword(cw, word, &captype, &abbv); 1.1276 + if (wl == 0) return 0; 1.1277 + 1.1278 + int ns = 0; // ns=0 = normalized input 1.1279 + 1.1280 + *slst = NULL; // HU, nsug in pSMgr->suggest 1.1281 + 1.1282 + switch(captype) { 1.1283 + case HUHCAP: 1.1284 + case NOCAP: { 1.1285 + ns = pSMgr->suggest_pos_stems(slst, cw, ns); 1.1286 + 1.1287 + if ((abbv) && (ns == 0)) { 1.1288 + memcpy(wspace,cw,wl); 1.1289 + *(wspace+wl) = '.'; 1.1290 + *(wspace+wl+1) = '\0'; 1.1291 + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); 1.1292 + } 1.1293 + 1.1294 + break; 1.1295 + } 1.1296 + 1.1297 + case INITCAP: { 1.1298 + 1.1299 + ns = pSMgr->suggest_pos_stems(slst, cw, ns); 1.1300 + 1.1301 + if (ns == 0 || ((*slst)[0][0] == '#')) { 1.1302 + memcpy(wspace,cw,(wl+1)); 1.1303 + mkallsmall(wspace); 1.1304 + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); 1.1305 + } 1.1306 + 1.1307 + break; 1.1308 + 1.1309 + } 1.1310 + 1.1311 + case ALLCAP: { 1.1312 + ns = pSMgr->suggest_pos_stems(slst, cw, ns); 1.1313 + if (ns != 0) break; 1.1314 + 1.1315 + memcpy(wspace,cw,(wl+1)); 1.1316 + mkallsmall(wspace); 1.1317 + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); 1.1318 + 1.1319 + if (ns == 0) { 1.1320 + mkinitcap(wspace); 1.1321 + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); 1.1322 + } 1.1323 + break; 1.1324 + } 1.1325 + } 1.1326 + 1.1327 + return ns; 1.1328 +} 1.1329 +#endif // END OF HUNSPELL_EXPERIMENTAL CODE 1.1330 + 1.1331 +const char * Hunspell::get_wordchars() 1.1332 +{ 1.1333 + return pAMgr->get_wordchars(); 1.1334 +} 1.1335 + 1.1336 +unsigned short * Hunspell::get_wordchars_utf16(int * len) 1.1337 +{ 1.1338 + return pAMgr->get_wordchars_utf16(len); 1.1339 +} 1.1340 + 1.1341 +void Hunspell::mkinitcap(char * p) 1.1342 +{ 1.1343 + if (!utf8) { 1.1344 + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; 1.1345 + } else { 1.1346 + int len; 1.1347 + w_char u[MAXWORDLEN]; 1.1348 + len = u8_u16(u, MAXWORDLEN, p); 1.1349 + unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); 1.1350 + u[0].h = (unsigned char) (i >> 8); 1.1351 + u[0].l = (unsigned char) (i & 0x00FF); 1.1352 + u16_u8(p, MAXWORDUTF8LEN, u, len); 1.1353 + } 1.1354 +} 1.1355 + 1.1356 +int Hunspell::mkinitcap2(char * p, w_char * u, int nc) 1.1357 +{ 1.1358 + if (!utf8) { 1.1359 + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; 1.1360 + } else if (nc > 0) { 1.1361 + unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum); 1.1362 + u[0].h = (unsigned char) (i >> 8); 1.1363 + u[0].l = (unsigned char) (i & 0x00FF); 1.1364 + u16_u8(p, MAXWORDUTF8LEN, u, nc); 1.1365 + return strlen(p); 1.1366 + } 1.1367 + return nc; 1.1368 +} 1.1369 + 1.1370 +int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) 1.1371 +{ 1.1372 + if (!utf8) { 1.1373 + if (*p != '\0') *p = csconv[((unsigned char)*p)].clower; 1.1374 + } else if (nc > 0) { 1.1375 + unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum); 1.1376 + u[0].h = (unsigned char) (i >> 8); 1.1377 + u[0].l = (unsigned char) (i & 0x00FF); 1.1378 + u16_u8(p, MAXWORDUTF8LEN, u, nc); 1.1379 + return strlen(p); 1.1380 + } 1.1381 + return nc; 1.1382 +} 1.1383 + 1.1384 +int Hunspell::add(const char * word) 1.1385 +{ 1.1386 + if (pHMgr[0]) return (pHMgr[0])->add(word); 1.1387 + return 0; 1.1388 +} 1.1389 + 1.1390 +int Hunspell::add_with_affix(const char * word, const char * example) 1.1391 +{ 1.1392 + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); 1.1393 + return 0; 1.1394 +} 1.1395 + 1.1396 +int Hunspell::remove(const char * word) 1.1397 +{ 1.1398 + if (pHMgr[0]) return (pHMgr[0])->remove(word); 1.1399 + return 0; 1.1400 +} 1.1401 + 1.1402 +const char * Hunspell::get_version() 1.1403 +{ 1.1404 + return pAMgr->get_version(); 1.1405 +} 1.1406 + 1.1407 +struct cs_info * Hunspell::get_csconv() 1.1408 +{ 1.1409 + return csconv; 1.1410 +} 1.1411 + 1.1412 +void Hunspell::cat_result(char * result, char * st) 1.1413 +{ 1.1414 + if (st) { 1.1415 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1416 + mystrcat(result, st, MAXLNLEN); 1.1417 + free(st); 1.1418 + } 1.1419 +} 1.1420 + 1.1421 +int Hunspell::analyze(char*** slst, const char * word) 1.1422 +{ 1.1423 + char cw[MAXWORDUTF8LEN]; 1.1424 + char wspace[MAXWORDUTF8LEN]; 1.1425 + w_char unicw[MAXWORDLEN]; 1.1426 + int wl2 = 0; 1.1427 + *slst = NULL; 1.1428 + if (! pSMgr || maxdic == 0) return 0; 1.1429 + int nc = strlen(word); 1.1430 + if (utf8) { 1.1431 + if (nc >= MAXWORDUTF8LEN) return 0; 1.1432 + } else { 1.1433 + if (nc >= MAXWORDLEN) return 0; 1.1434 + } 1.1435 + int captype = 0; 1.1436 + int abbv = 0; 1.1437 + int wl = 0; 1.1438 + 1.1439 + // input conversion 1.1440 + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; 1.1441 + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); 1.1442 + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); 1.1443 + 1.1444 + if (wl == 0) { 1.1445 + if (abbv) { 1.1446 + for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; 1.1447 + cw[wl] = '\0'; 1.1448 + abbv = 0; 1.1449 + } else return 0; 1.1450 + } 1.1451 + 1.1452 + char result[MAXLNLEN]; 1.1453 + char * st = NULL; 1.1454 + 1.1455 + *result = '\0'; 1.1456 + 1.1457 + int n = 0; 1.1458 + int n2 = 0; 1.1459 + int n3 = 0; 1.1460 + 1.1461 + // test numbers 1.1462 + // LANG_hu section: set dash information for suggestions 1.1463 + if (langnum == LANG_hu) { 1.1464 + while ((n < wl) && 1.1465 + (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { 1.1466 + n++; 1.1467 + if ((cw[n] == '.') || (cw[n] == ',')) { 1.1468 + if (((n2 == 0) && (n > 3)) || 1.1469 + ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; 1.1470 + n2++; 1.1471 + n3 = n; 1.1472 + } 1.1473 + } 1.1474 + 1.1475 + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; 1.1476 + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { 1.1477 + mystrcat(result, cw, MAXLNLEN); 1.1478 + result[n - 1] = '\0'; 1.1479 + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); 1.1480 + else { 1.1481 + char sign = cw[n]; 1.1482 + cw[n] = '\0'; 1.1483 + cat_result(result, pSMgr->suggest_morph(cw + n - 1)); 1.1484 + mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE 1.1485 + cw[n] = sign; 1.1486 + cat_result(result, pSMgr->suggest_morph(cw + n)); 1.1487 + } 1.1488 + return line_tok(result, slst, MSEP_REC); 1.1489 + } 1.1490 + } 1.1491 + // END OF LANG_hu section 1.1492 + 1.1493 + switch(captype) { 1.1494 + case HUHCAP: 1.1495 + case HUHINITCAP: 1.1496 + case NOCAP: { 1.1497 + cat_result(result, pSMgr->suggest_morph(cw)); 1.1498 + if (abbv) { 1.1499 + memcpy(wspace,cw,wl); 1.1500 + *(wspace+wl) = '.'; 1.1501 + *(wspace+wl+1) = '\0'; 1.1502 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1503 + } 1.1504 + break; 1.1505 + } 1.1506 + case INITCAP: { 1.1507 + wl = mkallsmall2(cw, unicw, nc); 1.1508 + memcpy(wspace,cw,(wl+1)); 1.1509 + wl2 = mkinitcap2(cw, unicw, nc); 1.1510 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1511 + cat_result(result, pSMgr->suggest_morph(cw)); 1.1512 + if (abbv) { 1.1513 + *(wspace+wl) = '.'; 1.1514 + *(wspace+wl+1) = '\0'; 1.1515 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1516 + 1.1517 + memcpy(wspace, cw, wl2); 1.1518 + *(wspace+wl2) = '.'; 1.1519 + *(wspace+wl2+1) = '\0'; 1.1520 + 1.1521 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1522 + } 1.1523 + break; 1.1524 + } 1.1525 + case ALLCAP: { 1.1526 + cat_result(result, pSMgr->suggest_morph(cw)); 1.1527 + if (abbv) { 1.1528 + memcpy(wspace,cw,wl); 1.1529 + *(wspace+wl) = '.'; 1.1530 + *(wspace+wl+1) = '\0'; 1.1531 + cat_result(result, pSMgr->suggest_morph(cw)); 1.1532 + } 1.1533 + wl = mkallsmall2(cw, unicw, nc); 1.1534 + memcpy(wspace,cw,(wl+1)); 1.1535 + wl2 = mkinitcap2(cw, unicw, nc); 1.1536 + 1.1537 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1538 + cat_result(result, pSMgr->suggest_morph(cw)); 1.1539 + if (abbv) { 1.1540 + *(wspace+wl) = '.'; 1.1541 + *(wspace+wl+1) = '\0'; 1.1542 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1543 + 1.1544 + memcpy(wspace, cw, wl2); 1.1545 + *(wspace+wl2) = '.'; 1.1546 + *(wspace+wl2+1) = '\0'; 1.1547 + 1.1548 + cat_result(result, pSMgr->suggest_morph(wspace)); 1.1549 + } 1.1550 + break; 1.1551 + } 1.1552 + } 1.1553 + 1.1554 + if (*result) { 1.1555 + // word reversing wrapper for complex prefixes 1.1556 + if (complexprefixes) { 1.1557 + if (utf8) reverseword_utf(result); else reverseword(result); 1.1558 + } 1.1559 + return line_tok(result, slst, MSEP_REC); 1.1560 + } 1.1561 + 1.1562 + // compound word with dash (HU) I18n 1.1563 + char * dash = NULL; 1.1564 + int nresult = 0; 1.1565 + // LANG_hu section: set dash information for suggestions 1.1566 + if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); 1.1567 + if ((langnum == LANG_hu) && dash) { 1.1568 + *dash='\0'; 1.1569 + // examine 2 sides of the dash 1.1570 + if (dash[1] == '\0') { // base word ending with dash 1.1571 + if (spell(cw)) { 1.1572 + char * p = pSMgr->suggest_morph(cw); 1.1573 + if (p) { 1.1574 + int ret = line_tok(p, slst, MSEP_REC); 1.1575 + free(p); 1.1576 + return ret; 1.1577 + } 1.1578 + 1.1579 + } 1.1580 + } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. 1.1581 + if (spell(cw) && (spell("-e"))) { 1.1582 + st = pSMgr->suggest_morph(cw); 1.1583 + if (st) { 1.1584 + mystrcat(result, st, MAXLNLEN); 1.1585 + free(st); 1.1586 + } 1.1587 + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE 1.1588 + st = pSMgr->suggest_morph("-e"); 1.1589 + if (st) { 1.1590 + mystrcat(result, st, MAXLNLEN); 1.1591 + free(st); 1.1592 + } 1.1593 + return line_tok(result, slst, MSEP_REC); 1.1594 + } 1.1595 + } else { 1.1596 + // first word ending with dash: word- XXX ??? 1.1597 + char r2 = *(dash + 1); 1.1598 + dash[0]='-'; 1.1599 + dash[1]='\0'; 1.1600 + nresult = spell(cw); 1.1601 + dash[1] = r2; 1.1602 + dash[0]='\0'; 1.1603 + if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || 1.1604 + ((dash[1] > '0') && (dash[1] < '9')))) { 1.1605 + st = pSMgr->suggest_morph(cw); 1.1606 + if (st) { 1.1607 + mystrcat(result, st, MAXLNLEN); 1.1608 + free(st); 1.1609 + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE 1.1610 + } 1.1611 + st = pSMgr->suggest_morph(dash+1); 1.1612 + if (st) { 1.1613 + mystrcat(result, st, MAXLNLEN); 1.1614 + free(st); 1.1615 + } 1.1616 + return line_tok(result, slst, MSEP_REC); 1.1617 + } 1.1618 + } 1.1619 + // affixed number in correct word 1.1620 + if (nresult && (dash > cw) && (((*(dash-1)<='9') && 1.1621 + (*(dash-1)>='0')) || (*(dash-1)=='.'))) { 1.1622 + *dash='-'; 1.1623 + n = 1; 1.1624 + if (*(dash - n) == '.') n++; 1.1625 + // search first not a number character to left from dash 1.1626 + while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { 1.1627 + n++; 1.1628 + } 1.1629 + if ((dash - n) < cw) n--; 1.1630 + // numbers: valami1000000-hoz 1.1631 + // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, 1.1632 + // 56-hoz, 6-hoz 1.1633 + for(; n >= 1; n--) { 1.1634 + if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { 1.1635 + mystrcat(result, cw, MAXLNLEN); 1.1636 + result[dash - cw - n] = '\0'; 1.1637 + st = pSMgr->suggest_morph(dash - n); 1.1638 + if (st) { 1.1639 + mystrcat(result, st, MAXLNLEN); 1.1640 + free(st); 1.1641 + } 1.1642 + return line_tok(result, slst, MSEP_REC); 1.1643 + } 1.1644 + } 1.1645 + } 1.1646 + } 1.1647 + return 0; 1.1648 +} 1.1649 + 1.1650 +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) 1.1651 +{ 1.1652 + *slst = NULL; 1.1653 + if (!pSMgr || !pln) return 0; 1.1654 + char **pl2; 1.1655 + int pl2n = analyze(&pl2, word); 1.1656 + int captype = 0; 1.1657 + int abbv = 0; 1.1658 + char cw[MAXWORDUTF8LEN]; 1.1659 + cleanword(cw, word, &captype, &abbv); 1.1660 + char result[MAXLNLEN]; 1.1661 + *result = '\0'; 1.1662 + 1.1663 + for (int i = 0; i < pln; i++) { 1.1664 + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); 1.1665 + } 1.1666 + freelist(&pl2, pl2n); 1.1667 + 1.1668 + if (*result) { 1.1669 + // allcap 1.1670 + if (captype == ALLCAP) mkallcap(result); 1.1671 + 1.1672 + // line split 1.1673 + int linenum = line_tok(result, slst, MSEP_REC); 1.1674 + 1.1675 + // capitalize 1.1676 + if (captype == INITCAP || captype == HUHINITCAP) { 1.1677 + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); 1.1678 + } 1.1679 + 1.1680 + // temporary filtering of prefix related errors (eg. 1.1681 + // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") 1.1682 + 1.1683 + int r = 0; 1.1684 + for (int j=0; j < linenum; j++) { 1.1685 + if (!spell((*slst)[j])) { 1.1686 + free((*slst)[j]); 1.1687 + (*slst)[j] = NULL; 1.1688 + } else { 1.1689 + if (r < j) (*slst)[r] = (*slst)[j]; 1.1690 + r++; 1.1691 + } 1.1692 + } 1.1693 + if (r > 0) return r; 1.1694 + free(*slst); 1.1695 + *slst = NULL; 1.1696 + } 1.1697 + return 0; 1.1698 +} 1.1699 + 1.1700 +int Hunspell::generate(char*** slst, const char * word, const char * pattern) 1.1701 +{ 1.1702 + char **pl; 1.1703 + int pln = analyze(&pl, pattern); 1.1704 + int n = generate(slst, word, pl, pln); 1.1705 + freelist(&pl, pln); 1.1706 + return uniqlist(*slst, n); 1.1707 +} 1.1708 + 1.1709 +// minimal XML parser functions 1.1710 +int Hunspell::get_xml_par(char * dest, const char * par, int max) 1.1711 +{ 1.1712 + char * d = dest; 1.1713 + if (!par) return 0; 1.1714 + char end = *par; 1.1715 + char * dmax = dest + max; 1.1716 + if (end == '>') end = '<'; 1.1717 + else if (end != '\'' && end != '"') return 0; // bad XML 1.1718 + for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; 1.1719 + *d = '\0'; 1.1720 + mystrrep(dest, "<", "<"); 1.1721 + mystrrep(dest, "&", "&"); 1.1722 + return (int)(d - dest); 1.1723 +} 1.1724 + 1.1725 +int Hunspell::get_langnum() const 1.1726 +{ 1.1727 + return langnum; 1.1728 +} 1.1729 + 1.1730 +// return the beginning of the element (attr == NULL) or the attribute 1.1731 +const char * Hunspell::get_xml_pos(const char * s, const char * attr) 1.1732 +{ 1.1733 + const char * end = strchr(s, '>'); 1.1734 + const char * p = s; 1.1735 + if (attr == NULL) return end; 1.1736 + do { 1.1737 + p = strstr(p, attr); 1.1738 + if (!p || p >= end) return 0; 1.1739 + } while (*(p-1) != ' ' && *(p-1) != '\n'); 1.1740 + return p + strlen(attr); 1.1741 +} 1.1742 + 1.1743 +int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { 1.1744 + char cw[MAXWORDUTF8LEN]; 1.1745 + if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && 1.1746 + strcmp(cw, value) == 0) return 1; 1.1747 + return 0; 1.1748 +} 1.1749 + 1.1750 +int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { 1.1751 + int n = 0; 1.1752 + char * p; 1.1753 + if (!list) return 0; 1.1754 + for (p = list; (p = strstr(p, tag)); p++) n++; 1.1755 + if (n == 0) return 0; 1.1756 + *slst = (char **) malloc(sizeof(char *) * n); 1.1757 + if (!*slst) return 0; 1.1758 + for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { 1.1759 + int l = strlen(p); 1.1760 + (*slst)[n] = (char *) malloc(l + 1); 1.1761 + if (!(*slst)[n]) return n; 1.1762 + if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) { 1.1763 + free((*slst)[n]); 1.1764 + break; 1.1765 + } 1.1766 + } 1.1767 + return n; 1.1768 +} 1.1769 + 1.1770 +int Hunspell::spellml(char*** slst, const char * word) 1.1771 +{ 1.1772 + char *q, *q2; 1.1773 + char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; 1.1774 + q = (char *) strstr(word, "<query"); 1.1775 + if (!q) return 0; // bad XML input 1.1776 + q2 = strchr(q, '>'); 1.1777 + if (!q2) return 0; // bad XML input 1.1778 + q2 = strstr(q2, "<word"); 1.1779 + if (!q2) return 0; // bad XML input 1.1780 + if (check_xml_par(q, "type=", "analyze")) { 1.1781 + int n = 0, s = 0; 1.1782 + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw); 1.1783 + if (n == 0) return 0; 1.1784 + // convert the result to <code><a>ana1</a><a>ana2</a></code> format 1.1785 + for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); 1.1786 + char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->& 1.1787 + if (!r) return 0; 1.1788 + strcpy(r, "<code>"); 1.1789 + for (int i = 0; i < n; i++) { 1.1790 + int l = strlen(r); 1.1791 + strcpy(r + l, "<a>"); 1.1792 + strcpy(r + l + 3, (*slst)[i]); 1.1793 + mystrrep(r + l + 3, "\t", " "); 1.1794 + mystrrep(r + l + 3, "<", "<"); 1.1795 + mystrrep(r + l + 3, "&", "&"); 1.1796 + strcat(r, "</a>"); 1.1797 + free((*slst)[i]); 1.1798 + } 1.1799 + strcat(r, "</code>"); 1.1800 + (*slst)[0] = r; 1.1801 + return 1; 1.1802 + } else if (check_xml_par(q, "type=", "stem")) { 1.1803 + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw); 1.1804 + } else if (check_xml_par(q, "type=", "generate")) { 1.1805 + int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1); 1.1806 + if (n == 0) return 0; 1.1807 + char * q3 = strstr(q2 + 1, "<word"); 1.1808 + if (q3) { 1.1809 + if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) { 1.1810 + return generate(slst, cw, cw2); 1.1811 + } 1.1812 + } else { 1.1813 + if ((q2 = strstr(q2 + 1, "<code"))) { 1.1814 + char ** slst2; 1.1815 + if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { 1.1816 + int n2 = generate(slst, cw, slst2, n); 1.1817 + freelist(&slst2, n); 1.1818 + return uniqlist(*slst, n2); 1.1819 + } 1.1820 + freelist(&slst2, n); 1.1821 + } 1.1822 + } 1.1823 + } 1.1824 + return 0; 1.1825 +} 1.1826 + 1.1827 + 1.1828 +#ifdef HUNSPELL_EXPERIMENTAL 1.1829 +// XXX need UTF-8 support 1.1830 +char * Hunspell::morph_with_correction(const char * word) 1.1831 +{ 1.1832 + char cw[MAXWORDUTF8LEN]; 1.1833 + char wspace[MAXWORDUTF8LEN]; 1.1834 + if (! pSMgr || maxdic == 0) return NULL; 1.1835 + int wl = strlen(word); 1.1836 + if (utf8) { 1.1837 + if (wl >= MAXWORDUTF8LEN) return NULL; 1.1838 + } else { 1.1839 + if (wl >= MAXWORDLEN) return NULL; 1.1840 + } 1.1841 + int captype = 0; 1.1842 + int abbv = 0; 1.1843 + wl = cleanword(cw, word, &captype, &abbv); 1.1844 + if (wl == 0) return NULL; 1.1845 + 1.1846 + char result[MAXLNLEN]; 1.1847 + char * st = NULL; 1.1848 + 1.1849 + *result = '\0'; 1.1850 + 1.1851 + 1.1852 + switch(captype) { 1.1853 + case NOCAP: { 1.1854 + st = pSMgr->suggest_morph_for_spelling_error(cw); 1.1855 + if (st) { 1.1856 + mystrcat(result, st, MAXLNLEN); 1.1857 + free(st); 1.1858 + } 1.1859 + if (abbv) { 1.1860 + memcpy(wspace,cw,wl); 1.1861 + *(wspace+wl) = '.'; 1.1862 + *(wspace+wl+1) = '\0'; 1.1863 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1864 + if (st) { 1.1865 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1866 + mystrcat(result, st, MAXLNLEN); 1.1867 + free(st); 1.1868 + } 1.1869 + } 1.1870 + break; 1.1871 + } 1.1872 + case INITCAP: { 1.1873 + memcpy(wspace,cw,(wl+1)); 1.1874 + mkallsmall(wspace); 1.1875 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1876 + if (st) { 1.1877 + mystrcat(result, st, MAXLNLEN); 1.1878 + free(st); 1.1879 + } 1.1880 + st = pSMgr->suggest_morph_for_spelling_error(cw); 1.1881 + if (st) { 1.1882 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1883 + mystrcat(result, st, MAXLNLEN); 1.1884 + free(st); 1.1885 + } 1.1886 + if (abbv) { 1.1887 + memcpy(wspace,cw,wl); 1.1888 + *(wspace+wl) = '.'; 1.1889 + *(wspace+wl+1) = '\0'; 1.1890 + mkallsmall(wspace); 1.1891 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1892 + if (st) { 1.1893 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1894 + mystrcat(result, st, MAXLNLEN); 1.1895 + free(st); 1.1896 + } 1.1897 + mkinitcap(wspace); 1.1898 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1899 + if (st) { 1.1900 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1901 + mystrcat(result, st, MAXLNLEN); 1.1902 + free(st); 1.1903 + } 1.1904 + } 1.1905 + break; 1.1906 + } 1.1907 + case HUHCAP: { 1.1908 + st = pSMgr->suggest_morph_for_spelling_error(cw); 1.1909 + if (st) { 1.1910 + mystrcat(result, st, MAXLNLEN); 1.1911 + free(st); 1.1912 + } 1.1913 + memcpy(wspace,cw,(wl+1)); 1.1914 + mkallsmall(wspace); 1.1915 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1916 + if (st) { 1.1917 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1918 + mystrcat(result, st, MAXLNLEN); 1.1919 + free(st); 1.1920 + } 1.1921 + break; 1.1922 + } 1.1923 + case ALLCAP: { 1.1924 + memcpy(wspace,cw,(wl+1)); 1.1925 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1926 + if (st) { 1.1927 + mystrcat(result, st, MAXLNLEN); 1.1928 + free(st); 1.1929 + } 1.1930 + mkallsmall(wspace); 1.1931 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1932 + if (st) { 1.1933 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1934 + mystrcat(result, st, MAXLNLEN); 1.1935 + free(st); 1.1936 + } 1.1937 + mkinitcap(wspace); 1.1938 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1939 + if (st) { 1.1940 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1941 + mystrcat(result, st, MAXLNLEN); 1.1942 + free(st); 1.1943 + } 1.1944 + if (abbv) { 1.1945 + memcpy(wspace,cw,(wl+1)); 1.1946 + *(wspace+wl) = '.'; 1.1947 + *(wspace+wl+1) = '\0'; 1.1948 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1949 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1950 + if (st) { 1.1951 + mystrcat(result, st, MAXLNLEN); 1.1952 + free(st); 1.1953 + } 1.1954 + mkallsmall(wspace); 1.1955 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1956 + if (st) { 1.1957 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1958 + mystrcat(result, st, MAXLNLEN); 1.1959 + free(st); 1.1960 + } 1.1961 + mkinitcap(wspace); 1.1962 + st = pSMgr->suggest_morph_for_spelling_error(wspace); 1.1963 + if (st) { 1.1964 + if (*result) mystrcat(result, "\n", MAXLNLEN); 1.1965 + mystrcat(result, st, MAXLNLEN); 1.1966 + free(st); 1.1967 + } 1.1968 + } 1.1969 + break; 1.1970 + } 1.1971 + } 1.1972 + 1.1973 + if (*result) return mystrdup(result); 1.1974 + return NULL; 1.1975 +} 1.1976 + 1.1977 +#endif // END OF HUNSPELL_EXPERIMENTAL CODE 1.1978 + 1.1979 +Hunhandle *Hunspell_create(const char * affpath, const char * dpath) 1.1980 +{ 1.1981 + return (Hunhandle*)(new Hunspell(affpath, dpath)); 1.1982 +} 1.1983 + 1.1984 +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, 1.1985 + const char * key) 1.1986 +{ 1.1987 + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); 1.1988 +} 1.1989 + 1.1990 +void Hunspell_destroy(Hunhandle *pHunspell) 1.1991 +{ 1.1992 + delete (Hunspell*)(pHunspell); 1.1993 +} 1.1994 + 1.1995 +int Hunspell_spell(Hunhandle *pHunspell, const char *word) 1.1996 +{ 1.1997 + return ((Hunspell*)pHunspell)->spell(word); 1.1998 +} 1.1999 + 1.2000 +char *Hunspell_get_dic_encoding(Hunhandle *pHunspell) 1.2001 +{ 1.2002 + return ((Hunspell*)pHunspell)->get_dic_encoding(); 1.2003 +} 1.2004 + 1.2005 +int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) 1.2006 +{ 1.2007 + return ((Hunspell*)pHunspell)->suggest(slst, word); 1.2008 +} 1.2009 + 1.2010 +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) 1.2011 +{ 1.2012 + return ((Hunspell*)pHunspell)->analyze(slst, word); 1.2013 +} 1.2014 + 1.2015 +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) 1.2016 +{ 1.2017 + return ((Hunspell*)pHunspell)->stem(slst, word); 1.2018 +} 1.2019 + 1.2020 +int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n) 1.2021 +{ 1.2022 + return ((Hunspell*)pHunspell)->stem(slst, desc, n); 1.2023 +} 1.2024 + 1.2025 +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, 1.2026 + const char * word2) 1.2027 +{ 1.2028 + return ((Hunspell*)pHunspell)->generate(slst, word, word2); 1.2029 +} 1.2030 + 1.2031 +int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, 1.2032 + char** desc, int n) 1.2033 +{ 1.2034 + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); 1.2035 +} 1.2036 + 1.2037 + /* functions for run-time modification of the dictionary */ 1.2038 + 1.2039 + /* add word to the run-time dictionary */ 1.2040 + 1.2041 +int Hunspell_add(Hunhandle *pHunspell, const char * word) { 1.2042 + return ((Hunspell*)pHunspell)->add(word); 1.2043 +} 1.2044 + 1.2045 + /* add word to the run-time dictionary with affix flags of 1.2046 + * the example (a dictionary word): Hunspell will recognize 1.2047 + * affixed forms of the new word, too. 1.2048 + */ 1.2049 + 1.2050 +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, 1.2051 + const char * example) { 1.2052 + return ((Hunspell*)pHunspell)->add_with_affix(word, example); 1.2053 +} 1.2054 + 1.2055 + /* remove word from the run-time dictionary */ 1.2056 + 1.2057 +int Hunspell_remove(Hunhandle *pHunspell, const char * word) { 1.2058 + return ((Hunspell*)pHunspell)->remove(word); 1.2059 +} 1.2060 + 1.2061 +void Hunspell_free_list(Hunhandle *, char *** slst, int n) { 1.2062 + freelist(slst, n); 1.2063 +}