michael@0: /******* BEGIN LICENSE BLOCK ******* michael@0: * Version: MPL 1.1/GPL 2.0/LGPL 2.1 michael@0: * michael@0: * The contents of this file are subject to the Mozilla Public License Version michael@0: * 1.1 (the "License"); you may not use this file except in compliance with michael@0: * the License. You may obtain a copy of the License at michael@0: * http://www.mozilla.org/MPL/ michael@0: * michael@0: * Software distributed under the License is distributed on an "AS IS" basis, michael@0: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License michael@0: * for the specific language governing rights and limitations under the michael@0: * License. michael@0: * michael@0: * The Initial Developers of the Original Code are Kevin Hendricks (MySpell) michael@0: * and László Németh (Hunspell). Portions created by the Initial Developers michael@0: * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved. michael@0: * michael@0: * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca) michael@0: * David Einstein (deinst@world.std.com) michael@0: * László Németh (nemethl@gyorsposta.hu) michael@0: * Caolan McNamara (caolanm@redhat.com) michael@0: * Davide Prina michael@0: * Giuseppe Modugno michael@0: * Gianluca Turconi michael@0: * Simon Brouwer michael@0: * Noll Janos michael@0: * Biro Arpad michael@0: * Goldman Eleonora michael@0: * Sarlos Tamas michael@0: * Bencsath Boldizsar michael@0: * Halacsy Peter michael@0: * Dvornik Laszlo michael@0: * Gefferth Andras michael@0: * Nagy Viktor michael@0: * Varga Daniel michael@0: * Chris Halls michael@0: * Rene Engelhard michael@0: * Bram Moolenaar michael@0: * Dafydd Jones michael@0: * Harri Pitkanen michael@0: * Andras Timar michael@0: * Tor Lillqvist michael@0: * michael@0: * Alternatively, the contents of this file may be used under the terms of michael@0: * either the GNU General Public License Version 2 or later (the "GPL"), or michael@0: * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), michael@0: * in which case the provisions of the GPL or the LGPL are applicable instead michael@0: * of those above. If you wish to allow use of your version of this file only michael@0: * under the terms of either the GPL or the LGPL, and not to allow others to michael@0: * use your version of this file under the terms of the MPL, indicate your michael@0: * decision by deleting the provisions above and replace them with the notice michael@0: * and other provisions required by the GPL or the LGPL. If you do not delete michael@0: * the provisions above, a recipient may use your version of this file under michael@0: * the terms of any one of the MPL, the GPL or the LGPL. michael@0: * michael@0: ******* END LICENSE BLOCK *******/ michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: #include "suggestmgr.hxx" michael@0: #include "htypes.hxx" michael@0: #include "csutil.hxx" michael@0: michael@0: const w_char W_VLINE = { '\0', '|' }; michael@0: michael@0: SuggestMgr::SuggestMgr(const char * tryme, int maxn, michael@0: AffixMgr * aptr) michael@0: { michael@0: michael@0: // register affix manager and check in string of chars to michael@0: // try when building candidate suggestions michael@0: pAMgr = aptr; michael@0: michael@0: csconv = NULL; michael@0: michael@0: ckeyl = 0; michael@0: ckey = NULL; michael@0: ckey_utf = NULL; michael@0: michael@0: ctryl = 0; michael@0: ctry = NULL; michael@0: ctry_utf = NULL; michael@0: michael@0: utf8 = 0; michael@0: langnum = 0; michael@0: complexprefixes = 0; michael@0: michael@0: maxSug = maxn; michael@0: nosplitsugs = 0; michael@0: maxngramsugs = MAXNGRAMSUGS; michael@0: maxcpdsugs = MAXCOMPOUNDSUGS; michael@0: michael@0: if (pAMgr) { michael@0: langnum = pAMgr->get_langnum(); michael@0: ckey = pAMgr->get_key_string(); michael@0: nosplitsugs = pAMgr->get_nosplitsugs(); michael@0: if (pAMgr->get_maxngramsugs() >= 0) michael@0: maxngramsugs = pAMgr->get_maxngramsugs(); michael@0: utf8 = pAMgr->get_utf8(); michael@0: if (pAMgr->get_maxcpdsugs() >= 0) michael@0: maxcpdsugs = pAMgr->get_maxcpdsugs(); michael@0: if (!utf8) michael@0: { michael@0: char * enc = pAMgr->get_encoding(); michael@0: csconv = get_current_cs(enc); michael@0: free(enc); michael@0: } michael@0: complexprefixes = pAMgr->get_complexprefixes(); michael@0: } michael@0: michael@0: if (ckey) { michael@0: if (utf8) { michael@0: w_char t[MAXSWL]; michael@0: ckeyl = u8_u16(t, MAXSWL, ckey); michael@0: ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char)); michael@0: if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char)); michael@0: else ckeyl = 0; michael@0: } else { michael@0: ckeyl = strlen(ckey); michael@0: } michael@0: } michael@0: michael@0: if (tryme) { michael@0: ctry = mystrdup(tryme); michael@0: if (ctry) ctryl = strlen(ctry); michael@0: if (ctry && utf8) { michael@0: w_char t[MAXSWL]; michael@0: ctryl = u8_u16(t, MAXSWL, tryme); michael@0: ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); michael@0: if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char)); michael@0: else ctryl = 0; michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: SuggestMgr::~SuggestMgr() michael@0: { michael@0: pAMgr = NULL; michael@0: if (ckey) free(ckey); michael@0: ckey = NULL; michael@0: if (ckey_utf) free(ckey_utf); michael@0: ckey_utf = NULL; michael@0: ckeyl = 0; michael@0: if (ctry) free(ctry); michael@0: ctry = NULL; michael@0: if (ctry_utf) free(ctry_utf); michael@0: ctry_utf = NULL; michael@0: ctryl = 0; michael@0: maxSug = 0; michael@0: #ifdef MOZILLA_CLIENT michael@0: delete [] csconv; michael@0: #endif michael@0: } michael@0: michael@0: int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, michael@0: int * timer, clock_t * timelimit) { michael@0: int cwrd = 1; michael@0: if (ns == maxSug) return maxSug; michael@0: for (int k=0; k < ns; k++) { michael@0: if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; michael@0: } michael@0: if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) { michael@0: wlst[ns] = mystrdup(candidate); michael@0: if (wlst[ns] == NULL) { michael@0: for (int j=0; j 0) oldSug = nsug; michael@0: michael@0: // suggestions for an uppercase word (html -> HTML) michael@0: if ((nsug < maxSug) && (nsug > -1)) { michael@0: nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: capchars(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // perhaps we made a typical fault of spelling michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = replchars(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // perhaps we made chose the wrong char from a related set michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = mapchars(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // only suggest compound words when no other suggestion michael@0: if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1; michael@0: michael@0: // did we swap the order of chars by mistake michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: swapchar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we swap the order of non adjacent chars by mistake michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: longswapchar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we just hit the wrong key in place of a good char (case and keyboard) michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: badcharkey(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we add a char that should not be there michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: extrachar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: michael@0: // did we forgot a char michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: forgotchar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we move a char michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: movechar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we just hit the wrong key in place of a good char michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: badchar(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // did we double two characters michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) : michael@0: doubletwochars(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: // perhaps we forgot to hit space and two words ran together michael@0: if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) { michael@0: nsug = twowords(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: } // repeating ``for'' statement compounding support michael@0: michael@0: if (nsug < 0) { michael@0: // we ran out of memory - we should free up as much as possible michael@0: for (int i = 0; i < maxSug; i++) michael@0: if (wlst[i] != NULL) free(wlst[i]); michael@0: free(wlst); michael@0: wlst = NULL; michael@0: } michael@0: michael@0: if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1; michael@0: michael@0: *slst = wlst; michael@0: return nsug; michael@0: } michael@0: michael@0: // generate suggestions for a word with typical mistake michael@0: // pass in address of array of char * pointers michael@0: #ifdef HUNSPELL_EXPERIMENTAL michael@0: int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) michael@0: { michael@0: int nocompoundtwowords = 0; michael@0: char ** wlst; michael@0: int oldSug; michael@0: michael@0: char w2[MAXWORDUTF8LEN]; michael@0: const char * word = w; michael@0: michael@0: // word reversing wrapper for complex prefixes michael@0: if (complexprefixes) { michael@0: strcpy(w2, w); michael@0: if (utf8) reverseword_utf(w2); else reverseword(w2); michael@0: word = w2; michael@0: } michael@0: michael@0: if (*slst) { michael@0: wlst = *slst; michael@0: } else { michael@0: wlst = (char **) malloc(maxSug * sizeof(char *)); michael@0: if (wlst == NULL) return -1; michael@0: } michael@0: michael@0: for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) { michael@0: michael@0: // limit compound suggestion michael@0: if (cpdsuggest > 0) oldSug = nsug; michael@0: michael@0: // perhaps we made a typical fault of spelling michael@0: if ((nsug < maxSug) && (nsug > -1)) michael@0: nsug = replchars(wlst, word, nsug, cpdsuggest); michael@0: michael@0: // perhaps we made chose the wrong char from a related set michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) michael@0: nsug = mapchars(wlst, word, nsug, cpdsuggest); michael@0: michael@0: if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; michael@0: michael@0: // perhaps we forgot to hit space and two words ran together michael@0: michael@0: if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) && check_forbidden(word, strlen(word))) { michael@0: nsug = twowords(wlst, word, nsug, cpdsuggest); michael@0: } michael@0: michael@0: } // repeating ``for'' statement compounding support michael@0: michael@0: if (nsug < 0) { michael@0: for (int i=0;i HTML) michael@0: int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char candidate_utf[MAXSWL]; michael@0: memcpy(candidate_utf, word, wl * sizeof(w_char)); michael@0: mkallcap_utf(candidate_utf, wl, langnum); michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: } michael@0: michael@0: // suggestions for an uppercase word (html -> HTML) michael@0: int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: strcpy(candidate, word); michael@0: mkallcap(candidate, csconv); michael@0: return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: } michael@0: michael@0: // suggestions for when chose the wrong char out of a related set michael@0: int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: clock_t timelimit; michael@0: int timer; michael@0: candidate[0] = '\0'; michael@0: michael@0: int wl = strlen(word); michael@0: if (wl < 2 || ! pAMgr) return ns; michael@0: michael@0: int nummap = pAMgr->get_nummap(); michael@0: struct mapentry* maptable = pAMgr->get_maptable(); michael@0: if (maptable==NULL) return ns; michael@0: michael@0: timelimit = clock(); michael@0: timer = MINTIMER; michael@0: return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit); michael@0: } michael@0: michael@0: int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn, michael@0: char** wlst, int cpdsuggest, int ns, michael@0: const mapentry* maptable, int nummap, int * timer, clock_t * timelimit) michael@0: { michael@0: if (*(word + wn) == '\0') { michael@0: int cwrd = 1; michael@0: *(candidate + cn) = '\0'; michael@0: int wl = strlen(candidate); michael@0: for (int m=0; m < ns; m++) michael@0: if (strcmp(candidate, wlst[m]) == 0) cwrd = 0; michael@0: if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) { michael@0: if (ns < maxSug) { michael@0: wlst[ns] = mystrdup(candidate); michael@0: if (wlst[ns] == NULL) return -1; michael@0: ns++; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: int in_map = 0; michael@0: for (int j = 0; j < nummap; j++) { michael@0: for (int k = 0; k < maptable[j].len; k++) { michael@0: int len = strlen(maptable[j].set[k]); michael@0: if (strncmp(maptable[j].set[k], word + wn, len) == 0) { michael@0: in_map = 1; michael@0: for (int l = 0; l < maptable[j].len; l++) { michael@0: strcpy(candidate + cn, maptable[j].set[l]); michael@0: ns = map_related(word, candidate, wn + len, strlen(candidate), wlst, michael@0: cpdsuggest, ns, maptable, nummap, timer, timelimit); michael@0: if (!(*timer)) return ns; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: if (!in_map) { michael@0: *(candidate + cn) = *(word + wn); michael@0: ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest, michael@0: ns, maptable, nummap, timer, timelimit); michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // suggestions for a typical fault of spelling, that michael@0: // differs with more, than 1 letter from the right form. michael@0: int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: const char * r; michael@0: int lenr, lenp; michael@0: int wl = strlen(word); michael@0: if (wl < 2 || ! pAMgr) return ns; michael@0: int numrep = pAMgr->get_numrep(); michael@0: struct replentry* reptable = pAMgr->get_reptable(); michael@0: if (reptable==NULL) return ns; michael@0: for (int i=0; i < numrep; i++ ) { michael@0: r = word; michael@0: lenr = strlen(reptable[i].pattern2); michael@0: lenp = strlen(reptable[i].pattern); michael@0: // search every occurence of the pattern in the word michael@0: while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) && michael@0: (!reptable[i].start || r == word)) { michael@0: strcpy(candidate, word); michael@0: if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break; michael@0: strcpy(candidate+(r-word),reptable[i].pattern2); michael@0: strcpy(candidate+(r-word)+lenr, r+lenp); michael@0: ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: // check REP suggestions with space michael@0: char * sp = strchr(candidate, ' '); michael@0: if (sp) { michael@0: char * prev = candidate; michael@0: while (sp) { michael@0: *sp = '\0'; michael@0: if (checkword(prev, strlen(prev), 0, NULL, NULL)) { michael@0: int oldns = ns; michael@0: *sp = ' '; michael@0: ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: if (oldns < ns) { michael@0: free(wlst[ns - 1]); michael@0: wlst[ns - 1] = mystrdup(candidate); michael@0: if (!wlst[ns - 1]) return -1; michael@0: } michael@0: } michael@0: *sp = ' '; michael@0: prev = sp + 1; michael@0: sp = strchr(prev, ' '); michael@0: } michael@0: } michael@0: r++; // search for the next letter michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation) michael@0: int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: int state=0; michael@0: int wl = strlen(word); michael@0: if (wl < 5 || ! pAMgr) return ns; michael@0: for (int i=2; i < wl; i++ ) { michael@0: if (word[i]==word[i-2]) { michael@0: state++; michael@0: if (state==3) { michael@0: strcpy(candidate,word); michael@0: strcpy(candidate+i-1,word+i+1); michael@0: ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: state=0; michael@0: } michael@0: } else { michael@0: state=0; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation) michael@0: int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: int state=0; michael@0: if (wl < 5 || ! pAMgr) return ns; michael@0: for (int i=2; i < wl; i++) { michael@0: if (w_char_eq(word[i], word[i-2])) { michael@0: state++; michael@0: if (state==3) { michael@0: memcpy(candidate_utf, word, (i - 1) * sizeof(w_char)); michael@0: memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char)); michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: state=0; michael@0: } michael@0: } else { michael@0: state=0; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is wrong char in place of correct one (case and keyboard related version) michael@0: int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char tmpc; michael@0: char candidate[MAXSWUTF8L]; michael@0: int wl = strlen(word); michael@0: strcpy(candidate, word); michael@0: // swap out each char one by one and try uppercase and neighbor michael@0: // keyboard chars in its place to see if that makes a good word michael@0: michael@0: for (int i=0; i < wl; i++) { michael@0: tmpc = candidate[i]; michael@0: // check with uppercase letters michael@0: candidate[i] = csconv[((unsigned char)tmpc)].cupper; michael@0: if (tmpc != candidate[i]) { michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: candidate[i] = tmpc; michael@0: } michael@0: // check neighbor characters in keyboard string michael@0: if (!ckey) continue; michael@0: char * loc = strchr(ckey, tmpc); michael@0: while (loc) { michael@0: if ((loc > ckey) && (*(loc - 1) != '|')) { michael@0: candidate[i] = *(loc - 1); michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { michael@0: candidate[i] = *(loc + 1); michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: loc = strchr(loc + 1, tmpc); michael@0: } michael@0: candidate[i] = tmpc; michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is wrong char in place of correct one (case and keyboard related version) michael@0: int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char tmpc; michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: memcpy(candidate_utf, word, wl * sizeof(w_char)); michael@0: // swap out each char one by one and try all the tryme michael@0: // chars in its place to see if that makes a good word michael@0: for (int i=0; i < wl; i++) { michael@0: tmpc = candidate_utf[i]; michael@0: // check with uppercase letters michael@0: mkallcap_utf(candidate_utf + i, 1, langnum); michael@0: if (!w_char_eq(tmpc, candidate_utf[i])) { michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: candidate_utf[i] = tmpc; michael@0: } michael@0: // check neighbor characters in keyboard string michael@0: if (!ckey) continue; michael@0: w_char * loc = ckey_utf; michael@0: while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++; michael@0: while (loc < (ckey_utf + ckeyl)) { michael@0: if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) { michael@0: candidate_utf[i] = *(loc - 1); michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) { michael@0: candidate_utf[i] = *(loc + 1); michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)); michael@0: } michael@0: candidate_utf[i] = tmpc; michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is wrong char in place of correct one michael@0: int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char tmpc; michael@0: char candidate[MAXSWUTF8L]; michael@0: clock_t timelimit = clock(); michael@0: int timer = MINTIMER; michael@0: int wl = strlen(word); michael@0: strcpy(candidate, word); michael@0: // swap out each char one by one and try all the tryme michael@0: // chars in its place to see if that makes a good word michael@0: for (int j=0; j < ctryl; j++) { michael@0: for (int i=wl-1; i >= 0; i--) { michael@0: tmpc = candidate[i]; michael@0: if (ctry[j] == tmpc) continue; michael@0: candidate[i] = ctry[j]; michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit); michael@0: if (ns == -1) return -1; michael@0: if (!timer) return ns; michael@0: candidate[i] = tmpc; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is wrong char in place of correct one michael@0: int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char tmpc; michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: clock_t timelimit = clock(); michael@0: int timer = MINTIMER; michael@0: memcpy(candidate_utf, word, wl * sizeof(w_char)); michael@0: // swap out each char one by one and try all the tryme michael@0: // chars in its place to see if that makes a good word michael@0: for (int j=0; j < ctryl; j++) { michael@0: for (int i=wl-1; i >= 0; i--) { michael@0: tmpc = candidate_utf[i]; michael@0: if (w_char_eq(tmpc, ctry_utf[j])) continue; michael@0: candidate_utf[i] = ctry_utf[j]; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); michael@0: if (ns == -1) return -1; michael@0: if (!timer) return ns; michael@0: candidate_utf[i] = tmpc; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is word has an extra letter it does not need michael@0: int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char candidate_utf[MAXSWL]; michael@0: w_char * p; michael@0: w_char tmpc = W_VLINE; // not used value, only for VCC warning message michael@0: if (wl < 2) return ns; michael@0: // try omitting one char of word at a time michael@0: memcpy(candidate_utf, word, wl * sizeof(w_char)); michael@0: for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) { michael@0: w_char tmpc2 = *p; michael@0: if (p < candidate_utf + wl - 1) *p = tmpc; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: tmpc = tmpc2; michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is word has an extra letter it does not need michael@0: int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char tmpc = '\0'; michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: int wl = strlen(word); michael@0: if (wl < 2) return ns; michael@0: // try omitting one char of word at a time michael@0: strcpy (candidate, word); michael@0: for (p = candidate + wl - 1; p >=candidate; p--) { michael@0: char tmpc2 = *p; michael@0: *p = tmpc; michael@0: ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: tmpc = tmpc2; michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is missing a letter it needs michael@0: int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: clock_t timelimit = clock(); michael@0: int timer = MINTIMER; michael@0: int wl = strlen(word); michael@0: // try inserting a tryme character before every letter (and the null terminator) michael@0: for (int i = 0; i < ctryl; i++) { michael@0: strcpy(candidate, word); michael@0: for (p = candidate + wl; p >= candidate; p--) { michael@0: *(p+1) = *p; michael@0: *p = ctry[i]; michael@0: ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit); michael@0: if (ns == -1) return -1; michael@0: if (!timer) return ns; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is missing a letter it needs michael@0: int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char * p; michael@0: clock_t timelimit = clock(); michael@0: int timer = MINTIMER; michael@0: // try inserting a tryme character at the end of the word and before every letter michael@0: for (int i = 0; i < ctryl; i++) { michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: for (p = candidate_utf + wl; p >= candidate_utf; p--) { michael@0: *(p + 1) = *p; michael@0: *p = ctry_utf[i]; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); michael@0: if (ns == -1) return -1; michael@0: if (!timer) return ns; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: michael@0: /* error is should have been two words */ michael@0: int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: int c1, c2; michael@0: int forbidden = 0; michael@0: int cwrd; michael@0: michael@0: int wl=strlen(word); michael@0: if (wl < 3) return ns; michael@0: michael@0: if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); michael@0: michael@0: strcpy(candidate + 1, word); michael@0: // split the string into two pieces after every char michael@0: // if both pieces are good words make them a suggestion michael@0: for (p = candidate + 1; p[1] != '\0'; p++) { michael@0: p[-1] = *p; michael@0: // go to end of the UTF-8 character michael@0: while (utf8 && ((p[1] & 0xc0) == 0x80)) { michael@0: *p = p[1]; michael@0: p++; michael@0: } michael@0: if (utf8 && p[1] == '\0') break; // last UTF-8 character michael@0: *p = '\0'; michael@0: c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL); michael@0: if (c1) { michael@0: c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL); michael@0: if (c2) { michael@0: *p = ' '; michael@0: michael@0: // spec. Hungarian code (need a better compound word support) michael@0: if ((langnum == LANG_hu) && !forbidden && michael@0: // if 3 repeating letter, use - instead of space michael@0: (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || michael@0: // or multiple compounding, with more, than 6 syllables michael@0: ((c1 == 3) && (c2 >= 2)))) *p = '-'; michael@0: michael@0: cwrd = 1; michael@0: for (int k=0; k < ns; k++) michael@0: if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; michael@0: if (ns < maxSug) { michael@0: if (cwrd) { michael@0: wlst[ns] = mystrdup(candidate); michael@0: if (wlst[ns] == NULL) return -1; michael@0: ns++; michael@0: } michael@0: } else return ns; michael@0: // add two word suggestion with dash, if TRY string contains michael@0: // "a" or "-" michael@0: // NOTE: cwrd doesn't modified for REP twoword sugg. michael@0: if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && michael@0: mystrlen(p + 1) > 1 && michael@0: mystrlen(candidate) - mystrlen(p) > 1) { michael@0: *p = '-'; michael@0: for (int k=0; k < ns; k++) michael@0: if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; michael@0: if (ns < maxSug) { michael@0: if (cwrd) { michael@0: wlst[ns] = mystrdup(candidate); michael@0: if (wlst[ns] == NULL) return -1; michael@0: ns++; michael@0: } michael@0: } else return ns; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: michael@0: // error is adjacent letter were swapped michael@0: int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: char tmpc; michael@0: int wl=strlen(word); michael@0: // try swapping adjacent chars one by one michael@0: strcpy(candidate, word); michael@0: for (p = candidate; p[1] != 0; p++) { michael@0: tmpc = *p; michael@0: *p = p[1]; michael@0: p[1] = tmpc; michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: p[1] = *p; michael@0: *p = tmpc; michael@0: } michael@0: // try double swaps for short words michael@0: // ahev -> have, owudl -> would michael@0: if (wl == 4 || wl == 5) { michael@0: candidate[0] = word[1]; michael@0: candidate[1] = word[0]; michael@0: candidate[2] = word[2]; michael@0: candidate[wl - 2] = word[wl - 1]; michael@0: candidate[wl - 1] = word[wl - 2]; michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: if (wl == 5) { michael@0: candidate[0] = word[0]; michael@0: candidate[1] = word[2]; michael@0: candidate[2] = word[1]; michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is adjacent letter were swapped michael@0: int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char * p; michael@0: w_char tmpc; michael@0: int len = 0; michael@0: // try swapping adjacent chars one by one michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) { michael@0: tmpc = *p; michael@0: *p = p[1]; michael@0: p[1] = tmpc; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: if (len == 0) len = strlen(candidate); michael@0: ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: p[1] = *p; michael@0: *p = tmpc; michael@0: } michael@0: // try double swaps for short words michael@0: // ahev -> have, owudl -> would, suodn -> sound michael@0: if (wl == 4 || wl == 5) { michael@0: candidate_utf[0] = word[1]; michael@0: candidate_utf[1] = word[0]; michael@0: candidate_utf[2] = word[2]; michael@0: candidate_utf[wl - 2] = word[wl - 1]; michael@0: candidate_utf[wl - 1] = word[wl - 2]; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: if (wl == 5) { michael@0: candidate_utf[0] = word[0]; michael@0: candidate_utf[1] = word[2]; michael@0: candidate_utf[2] = word[1]; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is not adjacent letter were swapped michael@0: int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: char * q; michael@0: char tmpc; michael@0: int wl=strlen(word); michael@0: // try swapping not adjacent chars one by one michael@0: strcpy(candidate, word); michael@0: for (p = candidate; *p != 0; p++) { michael@0: for (q = candidate; *q != 0; q++) { michael@0: if (abs((int)(p-q)) > 1) { michael@0: tmpc = *p; michael@0: *p = *q; michael@0: *q = tmpc; michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: *q = *p; michael@0: *p = tmpc; michael@0: } michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: michael@0: // error is adjacent letter were swapped michael@0: int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char * p; michael@0: w_char * q; michael@0: w_char tmpc; michael@0: // try swapping not adjacent chars michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: for (p = candidate_utf; p < (candidate_utf + wl); p++) { michael@0: for (q = candidate_utf; q < (candidate_utf + wl); q++) { michael@0: if (abs((int)(p-q)) > 1) { michael@0: tmpc = *p; michael@0: *p = *q; michael@0: *q = tmpc; michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: *q = *p; michael@0: *p = tmpc; michael@0: } michael@0: } michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is a letter was moved michael@0: int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest) michael@0: { michael@0: char candidate[MAXSWUTF8L]; michael@0: char * p; michael@0: char * q; michael@0: char tmpc; michael@0: michael@0: int wl=strlen(word); michael@0: // try moving a char michael@0: strcpy(candidate, word); michael@0: for (p = candidate; *p != 0; p++) { michael@0: for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) { michael@0: tmpc = *(q-1); michael@0: *(q-1) = *q; michael@0: *q = tmpc; michael@0: if ((q-p) < 2) continue; // omit swap char michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: strcpy(candidate, word); michael@0: } michael@0: for (p = candidate + wl - 1; p > candidate; p--) { michael@0: for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) { michael@0: tmpc = *(q+1); michael@0: *(q+1) = *q; michael@0: *q = tmpc; michael@0: if ((p-q) < 2) continue; // omit swap char michael@0: ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: strcpy(candidate, word); michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // error is a letter was moved michael@0: int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) michael@0: { michael@0: w_char candidate_utf[MAXSWL]; michael@0: char candidate[MAXSWUTF8L]; michael@0: w_char * p; michael@0: w_char * q; michael@0: w_char tmpc; michael@0: // try moving a char michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: for (p = candidate_utf; p < (candidate_utf + wl); p++) { michael@0: for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) { michael@0: tmpc = *(q-1); michael@0: *(q-1) = *q; michael@0: *q = tmpc; michael@0: if ((q-p) < 2) continue; // omit swap char michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: } michael@0: for (p = candidate_utf + wl - 1; p > candidate_utf; p--) { michael@0: for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) { michael@0: tmpc = *(q+1); michael@0: *(q+1) = *q; michael@0: *q = tmpc; michael@0: if ((p-q) < 2) continue; // omit swap char michael@0: u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); michael@0: ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); michael@0: if (ns == -1) return -1; michael@0: } michael@0: memcpy (candidate_utf, word, wl * sizeof(w_char)); michael@0: } michael@0: return ns; michael@0: } michael@0: michael@0: // generate a set of suggestions for very poorly spelled words michael@0: int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md) michael@0: { michael@0: michael@0: int i, j; michael@0: int lval; michael@0: int sc, scphon; michael@0: int lp, lpphon; michael@0: int nonbmp = 0; michael@0: michael@0: // exhaustively search through all root words michael@0: // keeping track of the MAX_ROOTS most similar root words michael@0: struct hentry * roots[MAX_ROOTS]; michael@0: char * rootsphon[MAX_ROOTS]; michael@0: int scores[MAX_ROOTS]; michael@0: int scoresphon[MAX_ROOTS]; michael@0: for (i = 0; i < MAX_ROOTS; i++) { michael@0: roots[i] = NULL; michael@0: scores[i] = -100 * i; michael@0: rootsphon[i] = NULL; michael@0: scoresphon[i] = -100 * i; michael@0: } michael@0: lp = MAX_ROOTS - 1; michael@0: lpphon = MAX_ROOTS - 1; michael@0: scphon = -20000; michael@0: int low = NGRAM_LOWERING; michael@0: michael@0: char w2[MAXWORDUTF8LEN]; michael@0: char f[MAXSWUTF8L]; michael@0: char * word = w; michael@0: michael@0: // word reversing wrapper for complex prefixes michael@0: if (complexprefixes) { michael@0: strcpy(w2, w); michael@0: if (utf8) reverseword_utf(w2); else reverseword(w2); michael@0: word = w2; michael@0: } michael@0: michael@0: char mw[MAXSWUTF8L]; michael@0: w_char u8[MAXSWL]; michael@0: int nc = strlen(word); michael@0: int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc; michael@0: michael@0: // set character based ngram suggestion for words with non-BMP Unicode characters michael@0: if (n == -1) { michael@0: utf8 = 0; // XXX not state-free michael@0: n = nc; michael@0: nonbmp = 1; michael@0: low = 0; michael@0: } michael@0: michael@0: struct hentry* hp = NULL; michael@0: int col = -1; michael@0: phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; michael@0: char target[MAXSWUTF8L]; michael@0: char candidate[MAXSWUTF8L]; michael@0: if (ph) { michael@0: if (utf8) { michael@0: w_char _w[MAXSWL]; michael@0: int _wl = u8_u16(_w, MAXSWL, word); michael@0: mkallcap_utf(_w, _wl, langnum); michael@0: u16_u8(candidate, MAXSWUTF8L, _w, _wl); michael@0: } else { michael@0: strcpy(candidate, word); michael@0: if (!nonbmp) mkallcap(candidate, csconv); michael@0: } michael@0: phonet(candidate, target, nc, *ph); // XXX phonet() is 8-bit (nc, not n) michael@0: } michael@0: michael@0: FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL; michael@0: FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL; michael@0: FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; michael@0: FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; michael@0: michael@0: for (i = 0; i < md; i++) { michael@0: while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { michael@0: if ((hp->astr) && (pAMgr) && michael@0: (TESTAFF(hp->astr, forbiddenword, hp->alen) || michael@0: TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || michael@0: TESTAFF(hp->astr, nosuggest, hp->alen) || michael@0: TESTAFF(hp->astr, nongramsuggest, hp->alen) || michael@0: TESTAFF(hp->astr, onlyincompound, hp->alen))) continue; michael@0: michael@0: sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + michael@0: leftcommonsubstring(word, HENTRY_WORD(hp)); michael@0: michael@0: // check special pronounciation michael@0: if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { michael@0: int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + michael@0: + leftcommonsubstring(word, f); michael@0: if (sc2 > sc) sc = sc2; michael@0: } michael@0: michael@0: scphon = -20000; michael@0: if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { michael@0: char target2[MAXSWUTF8L]; michael@0: if (utf8) { michael@0: w_char _w[MAXSWL]; michael@0: int _wl = u8_u16(_w, MAXSWL, HENTRY_WORD(hp)); michael@0: mkallcap_utf(_w, _wl, langnum); michael@0: u16_u8(candidate, MAXSWUTF8L, _w, _wl); michael@0: } else { michael@0: strcpy(candidate, HENTRY_WORD(hp)); michael@0: mkallcap(candidate, csconv); michael@0: } michael@0: phonet(candidate, target2, -1, *ph); michael@0: scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); michael@0: } michael@0: michael@0: if (sc > scores[lp]) { michael@0: scores[lp] = sc; michael@0: roots[lp] = hp; michael@0: lval = sc; michael@0: for (j=0; j < MAX_ROOTS; j++) michael@0: if (scores[j] < lval) { michael@0: lp = j; michael@0: lval = scores[j]; michael@0: } michael@0: } michael@0: michael@0: michael@0: if (scphon > scoresphon[lpphon]) { michael@0: scoresphon[lpphon] = scphon; michael@0: rootsphon[lpphon] = HENTRY_WORD(hp); michael@0: lval = scphon; michael@0: for (j=0; j < MAX_ROOTS; j++) michael@0: if (scoresphon[j] < lval) { michael@0: lpphon = j; michael@0: lval = scoresphon[j]; michael@0: } michael@0: } michael@0: }} michael@0: michael@0: // find minimum threshold for a passable suggestion michael@0: // mangle original word three differnt ways michael@0: // and score them to generate a minimum acceptable score michael@0: int thresh = 0; michael@0: for (int sp = 1; sp < 4; sp++) { michael@0: if (utf8) { michael@0: for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; michael@0: u16_u8(mw, MAXSWUTF8L, u8, n); michael@0: thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); michael@0: } else { michael@0: strcpy(mw, word); michael@0: for (int k=sp; k < n; k+=4) *(mw + k) = '*'; michael@0: thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); michael@0: } michael@0: } michael@0: thresh = thresh / 3; michael@0: thresh--; michael@0: michael@0: // now expand affixes on each of these root words and michael@0: // and use length adjusted ngram scores to select michael@0: // possible suggestions michael@0: char * guess[MAX_GUESS]; michael@0: char * guessorig[MAX_GUESS]; michael@0: int gscore[MAX_GUESS]; michael@0: for(i=0;iexpand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, michael@0: rp->astr, rp->alen, word, nc, michael@0: ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL)); michael@0: michael@0: for (int k = 0; k < nw ; k++) { michael@0: sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + michael@0: leftcommonsubstring(word, glst[k].word); michael@0: michael@0: if (sc > thresh) { michael@0: if (sc > gscore[lp]) { michael@0: if (guess[lp]) { michael@0: free (guess[lp]); michael@0: if (guessorig[lp]) { michael@0: free(guessorig[lp]); michael@0: guessorig[lp] = NULL; michael@0: } michael@0: } michael@0: gscore[lp] = sc; michael@0: guess[lp] = glst[k].word; michael@0: guessorig[lp] = glst[k].orig; michael@0: lval = sc; michael@0: for (j=0; j < MAX_GUESS; j++) michael@0: if (gscore[j] < lval) { michael@0: lp = j; michael@0: lval = gscore[j]; michael@0: } michael@0: } else { michael@0: free(glst[k].word); michael@0: if (glst[k].orig) free(glst[k].orig); michael@0: } michael@0: } else { michael@0: free(glst[k].word); michael@0: if (glst[k].orig) free(glst[k].orig); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: free(glst); michael@0: michael@0: // now we are done generating guesses michael@0: // sort in order of decreasing score michael@0: michael@0: michael@0: bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); michael@0: if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); michael@0: michael@0: // weight suggestions with a similarity index, based on michael@0: // the longest common subsequent algorithm and resort michael@0: michael@0: int is_swap = 0; michael@0: int re = 0; michael@0: double fact = 1.0; michael@0: if (pAMgr) { michael@0: int maxd = pAMgr->get_maxdiff(); michael@0: if (maxd >= 0) fact = (10.0 - maxd)/5.0; michael@0: } michael@0: michael@0: for (i=0; i < MAX_GUESS; i++) { michael@0: if (guess[i]) { michael@0: // lowering guess[i] michael@0: char gl[MAXSWUTF8L]; michael@0: int len; michael@0: if (utf8) { michael@0: w_char _w[MAXSWL]; michael@0: len = u8_u16(_w, MAXSWL, guess[i]); michael@0: mkallsmall_utf(_w, len, langnum); michael@0: u16_u8(gl, MAXSWUTF8L, _w, len); michael@0: } else { michael@0: strcpy(gl, guess[i]); michael@0: if (!nonbmp) mkallsmall(gl, csconv); michael@0: len = strlen(guess[i]); michael@0: } michael@0: michael@0: int _lcs = lcslen(word, gl); michael@0: michael@0: // same characters with different casing michael@0: if ((n == len) && (n == _lcs)) { michael@0: gscore[i] += 2000; michael@0: break; michael@0: } michael@0: // using 2-gram instead of 3, and other weightening michael@0: michael@0: re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + michael@0: ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); michael@0: michael@0: gscore[i] = michael@0: // length of longest common subsequent minus length difference michael@0: 2 * _lcs - abs((int) (n - len)) + michael@0: // weight length of the left common substring michael@0: leftcommonsubstring(word, gl) + michael@0: // weight equal character positions michael@0: (!nonbmp && commoncharacterpositions(word, gl, &is_swap) ? 1: 0) + michael@0: // swap character (not neighboring) michael@0: ((is_swap) ? 10 : 0) + michael@0: // ngram michael@0: ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + michael@0: // weighted ngrams michael@0: re + michael@0: // different limit for dictionaries with PHONE rules michael@0: (ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0)); michael@0: } michael@0: } michael@0: michael@0: bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); michael@0: michael@0: // phonetic version michael@0: if (ph) for (i=0; i < MAX_ROOTS; i++) { michael@0: if (rootsphon[i]) { michael@0: // lowering rootphon[i] michael@0: char gl[MAXSWUTF8L]; michael@0: int len; michael@0: if (utf8) { michael@0: w_char _w[MAXSWL]; michael@0: len = u8_u16(_w, MAXSWL, rootsphon[i]); michael@0: mkallsmall_utf(_w, len, langnum); michael@0: u16_u8(gl, MAXSWUTF8L, _w, len); michael@0: } else { michael@0: strcpy(gl, rootsphon[i]); michael@0: if (!nonbmp) mkallsmall(gl, csconv); michael@0: len = strlen(rootsphon[i]); michael@0: } michael@0: michael@0: // heuristic weigthing of ngram scores michael@0: scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) + michael@0: // weight length of the left common substring michael@0: leftcommonsubstring(word, gl); michael@0: } michael@0: } michael@0: michael@0: if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); michael@0: michael@0: // copy over michael@0: int oldns = ns; michael@0: michael@0: int same = 0; michael@0: for (i=0; i < MAX_GUESS; i++) { michael@0: if (guess[i]) { michael@0: if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { michael@0: int unique = 1; michael@0: // leave only excellent suggestions, if exists michael@0: if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) { michael@0: same = 1; michael@0: // keep the best ngram suggestions, unless in ONLYMAXDIFF mode michael@0: if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) { michael@0: free(guess[i]); michael@0: if (guessorig[i]) free(guessorig[i]); michael@0: continue; michael@0: } michael@0: } michael@0: for (j = 0; j < ns; j++) { michael@0: // don't suggest previous suggestions or a previous suggestion with prefixes or affixes michael@0: if ((!guessorig[i] && strstr(guess[i], wlst[j])) || michael@0: (guessorig[i] && strstr(guessorig[i], wlst[j])) || michael@0: // check forbidden words michael@0: !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; michael@0: } michael@0: if (unique) { michael@0: wlst[ns++] = guess[i]; michael@0: if (guessorig[i]) { michael@0: free(guess[i]); michael@0: wlst[ns-1] = guessorig[i]; michael@0: } michael@0: } else { michael@0: free(guess[i]); michael@0: if (guessorig[i]) free(guessorig[i]); michael@0: } michael@0: } else { michael@0: free(guess[i]); michael@0: if (guessorig[i]) free(guessorig[i]); michael@0: } michael@0: } michael@0: } michael@0: michael@0: oldns = ns; michael@0: if (ph) for (i=0; i < MAX_ROOTS; i++) { michael@0: if (rootsphon[i]) { michael@0: if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { michael@0: int unique = 1; michael@0: for (j = 0; j < ns; j++) { michael@0: // don't suggest previous suggestions or a previous suggestion with prefixes or affixes michael@0: if (strstr(rootsphon[i], wlst[j]) || michael@0: // check forbidden words michael@0: !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0; michael@0: } michael@0: if (unique) { michael@0: wlst[ns++] = mystrdup(rootsphon[i]); michael@0: if (!wlst[ns - 1]) return ns - 1; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (nonbmp) utf8 = 1; michael@0: return ns; michael@0: } michael@0: michael@0: michael@0: // see if a candidate suggestion is spelled correctly michael@0: // needs to check both root words and words with affixes michael@0: michael@0: // obsolote MySpell-HU modifications: michael@0: // return value 2 and 3 marks compounding with hyphen (-) michael@0: // `3' marks roots without suffix michael@0: int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit) michael@0: { michael@0: struct hentry * rv=NULL; michael@0: struct hentry * rv2=NULL; michael@0: int nosuffix = 0; michael@0: michael@0: // check time limit michael@0: if (timer) { michael@0: (*timer)--; michael@0: if (!(*timer) && timelimit) { michael@0: if ((clock() - *timelimit) > TIMELIMIT) return 0; michael@0: *timer = MAXPLUSTIMER; michael@0: } michael@0: } michael@0: michael@0: if (pAMgr) { michael@0: if (cpdsuggest==1) { michael@0: if (pAMgr->get_compound()) { michael@0: rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EXT michael@0: if (rv && (!(rv2 = pAMgr->lookup(word)) || !rv2->astr || michael@0: !(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) || michael@0: TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check? michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: rv = pAMgr->lookup(word); michael@0: michael@0: if (rv) { michael@0: if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) michael@0: || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; michael@0: while (rv) { michael@0: if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || michael@0: TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || michael@0: TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { michael@0: rv = rv->next_homonym; michael@0: } else break; michael@0: } michael@0: } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX michael@0: michael@0: if (rv) { michael@0: nosuffix=1; michael@0: } else { michael@0: rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix michael@0: } michael@0: michael@0: if (!rv && pAMgr->have_contclass()) { michael@0: rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL); michael@0: if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL); michael@0: } michael@0: michael@0: // check forbidden words michael@0: if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || michael@0: TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || michael@0: TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || michael@0: TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; michael@0: michael@0: if (rv) { // XXX obsolote michael@0: if ((pAMgr->get_compoundflag()) && michael@0: TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix; michael@0: return 1; michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: int SuggestMgr::check_forbidden(const char * word, int len) michael@0: { michael@0: struct hentry * rv = NULL; michael@0: michael@0: if (pAMgr) { michael@0: rv = pAMgr->lookup(word); michael@0: if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || michael@0: TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; michael@0: if (!(pAMgr->prefix_check(word,len,1))) michael@0: rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix michael@0: // check forbidden words michael@0: if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: #ifdef HUNSPELL_EXPERIMENTAL michael@0: // suggest possible stems michael@0: int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) michael@0: { michael@0: char ** wlst; michael@0: michael@0: struct hentry * rv = NULL; michael@0: michael@0: char w2[MAXSWUTF8L]; michael@0: const char * word = w; michael@0: michael@0: // word reversing wrapper for complex prefixes michael@0: if (complexprefixes) { michael@0: strcpy(w2, w); michael@0: if (utf8) reverseword_utf(w2); else reverseword(w2); michael@0: word = w2; michael@0: } michael@0: michael@0: int wl = strlen(word); michael@0: michael@0: michael@0: if (*slst) { michael@0: wlst = *slst; michael@0: } else { michael@0: wlst = (char **) calloc(maxSug, sizeof(char *)); michael@0: if (wlst == NULL) return -1; michael@0: } michael@0: michael@0: rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug); michael@0: michael@0: // delete dash from end of word michael@0: if (nsug > 0) { michael@0: for (int j=0; j < nsug; j++) { michael@0: if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0'; michael@0: } michael@0: } michael@0: michael@0: *slst = wlst; michael@0: return nsug; michael@0: } michael@0: #endif // END OF HUNSPELL_EXPERIMENTAL CODE michael@0: michael@0: michael@0: char * SuggestMgr::suggest_morph(const char * w) michael@0: { michael@0: char result[MAXLNLEN]; michael@0: char * r = (char *) result; michael@0: char * st; michael@0: michael@0: struct hentry * rv = NULL; michael@0: michael@0: *result = '\0'; michael@0: michael@0: if (! pAMgr) return NULL; michael@0: michael@0: char w2[MAXSWUTF8L]; michael@0: const char * word = w; michael@0: michael@0: // word reversing wrapper for complex prefixes michael@0: if (complexprefixes) { michael@0: strcpy(w2, w); michael@0: if (utf8) reverseword_utf(w2); else reverseword(w2); michael@0: word = w2; michael@0: } michael@0: michael@0: rv = pAMgr->lookup(word); michael@0: michael@0: while (rv) { michael@0: if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || michael@0: TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || michael@0: TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { michael@0: if (!HENTRY_FIND(rv, MORPH_STEM)) { michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, MORPH_STEM, MAXLNLEN); michael@0: mystrcat(result, word, MAXLNLEN); michael@0: } michael@0: if (HENTRY_DATA(rv)) { michael@0: mystrcat(result, " ", MAXLNLEN); michael@0: mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); michael@0: } michael@0: mystrcat(result, "\n", MAXLNLEN); michael@0: } michael@0: rv = rv->next_homonym; michael@0: } michael@0: michael@0: st = pAMgr->affix_check_morph(word,strlen(word)); michael@0: if (st) { michael@0: mystrcat(result, st, MAXLNLEN); michael@0: free(st); michael@0: } michael@0: michael@0: if (pAMgr->get_compound() && (*result == '\0')) michael@0: pAMgr->compound_check_morph(word, strlen(word), michael@0: 0, 0, 100, 0,NULL, 0, &r, NULL); michael@0: michael@0: return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL; michael@0: } michael@0: michael@0: #ifdef HUNSPELL_EXPERIMENTAL michael@0: char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) michael@0: { michael@0: char * p = NULL; michael@0: char ** wlst = (char **) calloc(maxSug, sizeof(char *)); michael@0: if (!**wlst) return NULL; michael@0: // we will use only the first suggestion michael@0: for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; michael@0: int ns = suggest(&wlst, word, maxSug - 1, NULL); michael@0: if (ns == maxSug) { michael@0: p = suggest_morph(wlst[maxSug - 1]); michael@0: free(wlst[maxSug - 1]); michael@0: } michael@0: if (wlst) free(wlst); michael@0: return p; michael@0: } michael@0: #endif // END OF HUNSPELL_EXPERIMENTAL CODE michael@0: michael@0: /* affixation */ michael@0: char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) michael@0: { michael@0: char result[MAXLNLEN]; michael@0: *result = '\0'; michael@0: int sfxcount = get_sfxcount(pattern); michael@0: michael@0: if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; michael@0: michael@0: if (HENTRY_DATA(rv)) { michael@0: char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, michael@0: HENTRY_DATA(rv), pattern, 0); michael@0: if (aff) { michael@0: mystrcat(result, aff, MAXLNLEN); michael@0: mystrcat(result, "\n", MAXLNLEN); michael@0: free(aff); michael@0: } michael@0: } michael@0: michael@0: // check all allomorphs michael@0: char allomorph[MAXLNLEN]; michael@0: char * p = NULL; michael@0: if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); michael@0: while (p) { michael@0: struct hentry * rv2 = NULL; michael@0: p += MORPH_TAG_LEN; michael@0: int plen = fieldlen(p); michael@0: strncpy(allomorph, p, plen); michael@0: allomorph[plen] = '\0'; michael@0: rv2 = pAMgr->lookup(allomorph); michael@0: while (rv2) { michael@0: // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) { michael@0: if (HENTRY_DATA(rv2)) { michael@0: char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM); michael@0: if (st && (strncmp(st + MORPH_TAG_LEN, michael@0: HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { michael@0: char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen, michael@0: HENTRY_DATA(rv2), pattern, 0); michael@0: if (aff) { michael@0: mystrcat(result, aff, MAXLNLEN); michael@0: mystrcat(result, "\n", MAXLNLEN); michael@0: free(aff); michael@0: } michael@0: } michael@0: } michael@0: rv2 = rv2->next_homonym; michael@0: } michael@0: p = strstr(p + plen, MORPH_ALLOMORPH); michael@0: } michael@0: michael@0: return (*result) ? mystrdup(result) : NULL; michael@0: } michael@0: michael@0: char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { michael@0: char result[MAXLNLEN]; michael@0: char result2[MAXLNLEN]; michael@0: char newpattern[MAXLNLEN]; michael@0: *newpattern = '\0'; michael@0: if (n == 0) return 0; michael@0: *result2 = '\0'; michael@0: struct hentry * rv = NULL; michael@0: if (!pAMgr) return NULL; michael@0: michael@0: // search affixed forms with and without derivational suffixes michael@0: while(1) { michael@0: michael@0: for (int k = 0; k < n; k++) { michael@0: *result = '\0'; michael@0: // add compound word parts (except the last one) michael@0: char * s = (char *) desc[k]; michael@0: char * part = strstr(s, MORPH_PART); michael@0: if (part) { michael@0: char * nextpart = strstr(part + 1, MORPH_PART); michael@0: while (nextpart) { michael@0: copy_field(result + strlen(result), part, MORPH_PART); michael@0: part = nextpart; michael@0: nextpart = strstr(part + 1, MORPH_PART); michael@0: } michael@0: s = part; michael@0: } michael@0: michael@0: char **pl; michael@0: char tok[MAXLNLEN]; michael@0: strcpy(tok, s); michael@0: char * alt = strstr(tok, " | "); michael@0: while (alt) { michael@0: alt[1] = MSEP_ALT; michael@0: alt = strstr(alt, " | "); michael@0: } michael@0: int pln = line_tok(tok, &pl, MSEP_ALT); michael@0: for (int i = 0; i < pln; i++) { michael@0: // remove inflectional and terminal suffixes michael@0: char * is = strstr(pl[i], MORPH_INFL_SFX); michael@0: if (is) *is = '\0'; michael@0: char * ts = strstr(pl[i], MORPH_TERM_SFX); michael@0: while (ts) { michael@0: *ts = '_'; michael@0: ts = strstr(pl[i], MORPH_TERM_SFX); michael@0: } michael@0: char * st = strstr(s, MORPH_STEM); michael@0: if (st) { michael@0: copy_field(tok, st, MORPH_STEM); michael@0: rv = pAMgr->lookup(tok); michael@0: while (rv) { michael@0: char newpat[MAXLNLEN]; michael@0: strcpy(newpat, pl[i]); michael@0: strcat(newpat, pattern); michael@0: char * sg = suggest_hentry_gen(rv, newpat); michael@0: if (!sg) sg = suggest_hentry_gen(rv, pattern); michael@0: if (sg) { michael@0: char ** gen; michael@0: int genl = line_tok(sg, &gen, MSEP_REC); michael@0: free(sg); michael@0: sg = NULL; michael@0: for (int j = 0; j < genl; j++) { michael@0: if (strstr(pl[i], MORPH_SURF_PFX)) { michael@0: int r2l = strlen(result2); michael@0: result2[r2l] = MSEP_REC; michael@0: strcpy(result2 + r2l + 1, result); michael@0: copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); michael@0: mystrcat(result2, gen[j], MAXLNLEN); michael@0: } else { michael@0: sprintf(result2 + strlen(result2), "%c%s%s", michael@0: MSEP_REC, result, gen[j]); michael@0: } michael@0: } michael@0: freelist(&gen, genl); michael@0: } michael@0: rv = rv->next_homonym; michael@0: } michael@0: } michael@0: } michael@0: freelist(&pl, pln); michael@0: } michael@0: michael@0: if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; michael@0: strcpy(newpattern, pattern); michael@0: pattern = newpattern; michael@0: char * ds = strstr(pattern, MORPH_DERI_SFX); michael@0: while (ds) { michael@0: strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); michael@0: ds = strstr(pattern, MORPH_DERI_SFX); michael@0: } michael@0: } michael@0: return (*result2 ? mystrdup(result2) : NULL); michael@0: } michael@0: michael@0: michael@0: // generate an n-gram score comparing s1 and s2 michael@0: int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) michael@0: { michael@0: int nscore = 0; michael@0: int ns; michael@0: int l1; michael@0: int l2; michael@0: int test = 0; michael@0: michael@0: if (utf8) { michael@0: w_char su1[MAXSWL]; michael@0: w_char su2[MAXSWL]; michael@0: l1 = u8_u16(su1, MAXSWL, s1); michael@0: l2 = u8_u16(su2, MAXSWL, s2); michael@0: if ((l2 <= 0) || (l1 == -1)) return 0; michael@0: // lowering dictionary word michael@0: if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum); michael@0: for (int j = 1; j <= n; j++) { michael@0: ns = 0; michael@0: for (int i = 0; i <= (l1-j); i++) { michael@0: int k = 0; michael@0: for (int l = 0; l <= (l2-j); l++) { michael@0: for (k = 0; k < j; k++) { michael@0: w_char * c1 = su1 + i + k; michael@0: w_char * c2 = su2 + l + k; michael@0: if ((c1->l != c2->l) || (c1->h != c2->h)) break; michael@0: } michael@0: if (k == j) { michael@0: ns++; michael@0: break; michael@0: } michael@0: } michael@0: if (k != j && opt & NGRAM_WEIGHTED) { michael@0: ns--; michael@0: test++; michael@0: if (i == 0 || i == l1-j) ns--; // side weight michael@0: } michael@0: } michael@0: nscore = nscore + ns; michael@0: if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break; michael@0: } michael@0: } else { michael@0: l2 = strlen(s2); michael@0: if (l2 == 0) return 0; michael@0: l1 = strlen(s1); michael@0: char *t = mystrdup(s2); michael@0: if (opt & NGRAM_LOWERING) mkallsmall(t, csconv); michael@0: for (int j = 1; j <= n; j++) { michael@0: ns = 0; michael@0: for (int i = 0; i <= (l1-j); i++) { michael@0: char c = *(s1 + i + j); michael@0: *(s1 + i + j) = '\0'; michael@0: if (strstr(t,(s1+i))) { michael@0: ns++; michael@0: } else if (opt & NGRAM_WEIGHTED) { michael@0: ns--; michael@0: test++; michael@0: if (i == 0 || i == l1-j) ns--; // side weight michael@0: } michael@0: *(s1 + i + j ) = c; michael@0: } michael@0: nscore = nscore + ns; michael@0: if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break; michael@0: } michael@0: free(t); michael@0: } michael@0: michael@0: ns = 0; michael@0: if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2; michael@0: if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; michael@0: ns = (nscore - ((ns > 0) ? ns : 0)); michael@0: return ns; michael@0: } michael@0: michael@0: // length of the left common substring of s1 and (decapitalised) s2 michael@0: int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { michael@0: if (utf8) { michael@0: w_char su1[MAXSWL]; michael@0: w_char su2[MAXSWL]; michael@0: su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0; michael@0: // decapitalize dictionary word michael@0: if (complexprefixes) { michael@0: int l1 = u8_u16(su1, MAXSWL, s1); michael@0: int l2 = u8_u16(su2, MAXSWL, s2); michael@0: if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; michael@0: } else { michael@0: int i; michael@0: u8_u16(su1, 1, s1); michael@0: u8_u16(su2, 1, s2); michael@0: unsigned short idx = (su2->h << 8) + su2->l; michael@0: unsigned short otheridx = (su1->h << 8) + su1->l; michael@0: if (otheridx != idx && michael@0: (otheridx != unicodetolower(idx, langnum))) return 0; michael@0: int l1 = u8_u16(su1, MAXSWL, s1); michael@0: int l2 = u8_u16(su2, MAXSWL, s2); michael@0: for(i = 1; (i < l1) && (i < l2) && michael@0: (su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++); michael@0: return i; michael@0: } michael@0: } else { michael@0: if (complexprefixes) { michael@0: int l1 = strlen(s1); michael@0: int l2 = strlen(s2); michael@0: if (*(s2+l1-1) == *(s2+l2-1)) return 1; michael@0: } else { michael@0: char * olds = s1; michael@0: // decapitalise dictionary word michael@0: if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0; michael@0: do { michael@0: s1++; s2++; michael@0: } while ((*s1 == *s2) && (*s1 != '\0')); michael@0: return (int)(s1 - olds); michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) { michael@0: int num = 0; michael@0: int diff = 0; michael@0: int diffpos[2]; michael@0: *is_swap = 0; michael@0: if (utf8) { michael@0: w_char su1[MAXSWL]; michael@0: w_char su2[MAXSWL]; michael@0: int l1 = u8_u16(su1, MAXSWL, s1); michael@0: int l2 = u8_u16(su2, MAXSWL, s2); michael@0: // decapitalize dictionary word michael@0: if (complexprefixes) { michael@0: mkallsmall_utf(su2+l2-1, 1, langnum); michael@0: } else { michael@0: mkallsmall_utf(su2, 1, langnum); michael@0: } michael@0: for (int i = 0; (i < l1) && (i < l2); i++) { michael@0: if (((short *) su1)[i] == ((short *) su2)[i]) { michael@0: num++; michael@0: } else { michael@0: if (diff < 2) diffpos[diff] = i; michael@0: diff++; michael@0: } michael@0: } michael@0: if ((diff == 2) && (l1 == l2) && michael@0: (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) && michael@0: (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1; michael@0: } else { michael@0: int i; michael@0: char t[MAXSWUTF8L]; michael@0: strcpy(t, s2); michael@0: // decapitalize dictionary word michael@0: if (complexprefixes) { michael@0: int l2 = strlen(t); michael@0: *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; michael@0: } else { michael@0: mkallsmall(t, csconv); michael@0: } michael@0: for (i = 0; (*(s1+i) != 0) && (*(t+i) != 0); i++) { michael@0: if (*(s1+i) == *(t+i)) { michael@0: num++; michael@0: } else { michael@0: if (diff < 2) diffpos[diff] = i; michael@0: diff++; michael@0: } michael@0: } michael@0: if ((diff == 2) && (*(s1+i) == 0) && (*(t+i) == 0) && michael@0: (*(s1+diffpos[0]) == *(t+diffpos[1])) && michael@0: (*(s1+diffpos[1]) == *(t+diffpos[0]))) *is_swap = 1; michael@0: } michael@0: return num; michael@0: } michael@0: michael@0: int SuggestMgr::mystrlen(const char * word) { michael@0: if (utf8) { michael@0: w_char w[MAXSWL]; michael@0: return u8_u16(w, MAXSWL, word); michael@0: } else return strlen(word); michael@0: } michael@0: michael@0: // sort in decreasing order of score michael@0: void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) michael@0: { michael@0: int m = 1; michael@0: while (m < n) { michael@0: int j = m; michael@0: while (j > 0) { michael@0: if (rsc[j-1] < rsc[j]) { michael@0: int sctmp = rsc[j-1]; michael@0: char * wdtmp = rword[j-1]; michael@0: rsc[j-1] = rsc[j]; michael@0: rword[j-1] = rword[j]; michael@0: rsc[j] = sctmp; michael@0: rword[j] = wdtmp; michael@0: if (rword2) { michael@0: wdtmp = rword2[j-1]; michael@0: rword2[j-1] = rword2[j]; michael@0: rword2[j] = wdtmp; michael@0: } michael@0: j--; michael@0: } else break; michael@0: } michael@0: m++; michael@0: } michael@0: return; michael@0: } michael@0: michael@0: // longest common subsequence michael@0: void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) { michael@0: int n, m; michael@0: w_char su[MAXSWL]; michael@0: w_char su2[MAXSWL]; michael@0: char * b; michael@0: char * c; michael@0: int i; michael@0: int j; michael@0: if (utf8) { michael@0: m = u8_u16(su, MAXSWL, s); michael@0: n = u8_u16(su2, MAXSWL, s2); michael@0: } else { michael@0: m = strlen(s); michael@0: n = strlen(s2); michael@0: } michael@0: c = (char *) malloc((m + 1) * (n + 1)); michael@0: b = (char *) malloc((m + 1) * (n + 1)); michael@0: if (!c || !b) { michael@0: if (c) free(c); michael@0: if (b) free(b); michael@0: *result = NULL; michael@0: return; michael@0: } michael@0: for (i = 1; i <= m; i++) c[i*(n+1)] = 0; michael@0: for (j = 0; j <= n; j++) c[j] = 0; michael@0: for (i = 1; i <= m; i++) { michael@0: for (j = 1; j <= n; j++) { michael@0: if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1))) michael@0: || ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) { michael@0: c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1; michael@0: b[i*(n+1) + j] = LCS_UPLEFT; michael@0: } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) { michael@0: c[i*(n+1) + j] = c[(i-1)*(n+1) + j]; michael@0: b[i*(n+1) + j] = LCS_UP; michael@0: } else { michael@0: c[i*(n+1) + j] = c[i*(n+1) + j-1]; michael@0: b[i*(n+1) + j] = LCS_LEFT; michael@0: } michael@0: } michael@0: } michael@0: *result = b; michael@0: free(c); michael@0: *l1 = m; michael@0: *l2 = n; michael@0: } michael@0: michael@0: int SuggestMgr::lcslen(const char * s, const char* s2) { michael@0: int m; michael@0: int n; michael@0: int i; michael@0: int j; michael@0: char * result; michael@0: int len = 0; michael@0: lcs(s, s2, &m, &n, &result); michael@0: if (!result) return 0; michael@0: i = m; michael@0: j = n; michael@0: while ((i != 0) && (j != 0)) { michael@0: if (result[i*(n+1) + j] == LCS_UPLEFT) { michael@0: len++; michael@0: i--; michael@0: j--; michael@0: } else if (result[i*(n+1) + j] == LCS_UP) { michael@0: i--; michael@0: } else j--; michael@0: } michael@0: free(result); michael@0: return len; michael@0: }