1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/spellcheck/hunspell/src/phonet.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,309 @@ 1.4 +/******* BEGIN LICENSE BLOCK ******* 1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 1.6 + * 1.7 + * The contents of this file are subject to the Mozilla Public License Version 1.8 + * 1.1 (the "License"); you may not use this file except in compliance with 1.9 + * the License. You may obtain a copy of the License at 1.10 + * http://www.mozilla.org/MPL/ 1.11 + * 1.12 + * Software distributed under the License is distributed on an "AS IS" basis, 1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 1.14 + * for the specific language governing rights and limitations under the 1.15 + * License. 1.16 + * 1.17 + * The Initial Developer of the Original Code is Björn Jacke. Portions created 1.18 + * by the Initial Developers are Copyright (C) 2000-2007 the Initial 1.19 + * Developers. All Rights Reserved. 1.20 + * 1.21 + * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de) 1.22 + * László Németh (nemethl@gyorsposta.hu) 1.23 + * Caolan McNamara (caolanm@redhat.com) 1.24 + * 1.25 + * Alternatively, the contents of this file may be used under the terms of 1.26 + * either the GNU General Public License Version 2 or later (the "GPL"), or 1.27 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 1.28 + * in which case the provisions of the GPL or the LGPL are applicable instead 1.29 + * of those above. If you wish to allow use of your version of this file only 1.30 + * under the terms of either the GPL or the LGPL, and not to allow others to 1.31 + * use your version of this file under the terms of the MPL, indicate your 1.32 + * decision by deleting the provisions above and replace them with the notice 1.33 + * and other provisions required by the GPL or the LGPL. If you do not delete 1.34 + * the provisions above, a recipient may use your version of this file under 1.35 + * the terms of any one of the MPL, the GPL or the LGPL. 1.36 + * 1.37 + * Changelog: 1.38 + * 2000-01-05 Björn Jacke <bjoern.jacke AT gmx.de> 1.39 + * Initial Release insprired by the article about phonetic 1.40 + * transformations out of c't 25/1999 1.41 + * 1.42 + * 2007-07-26 Björn Jacke <bjoern.jacke AT gmx.de> 1.43 + * Released under MPL/GPL/LGPL tri-license for Hunspell 1.44 + * 1.45 + * 2007-08-23 László Németh <nemeth at OOo> 1.46 + * Porting from Aspell to Hunspell using C-like structs 1.47 + * 1.48 + ******* END LICENSE BLOCK *******/ 1.49 + 1.50 +#include <stdlib.h> 1.51 +#include <string.h> 1.52 +#include <stdio.h> 1.53 +#include <ctype.h> 1.54 + 1.55 +#include "csutil.hxx" 1.56 +#include "phonet.hxx" 1.57 + 1.58 +void init_phonet_hash(phonetable & parms) 1.59 + { 1.60 + int i, k; 1.61 + 1.62 + for (i = 0; i < HASHSIZE; i++) { 1.63 + parms.hash[i] = -1; 1.64 + } 1.65 + 1.66 + for (i = 0; parms.rules[i][0] != '\0'; i += 2) { 1.67 + /** set hash value **/ 1.68 + k = (unsigned char) parms.rules[i][0]; 1.69 + 1.70 + if (parms.hash[k] < 0) { 1.71 + parms.hash[k] = i; 1.72 + } 1.73 + } 1.74 + } 1.75 + 1.76 +// like strcpy but safe if the strings overlap 1.77 +// but only if dest < src 1.78 +static inline void strmove(char * dest, char * src) { 1.79 + while (*src) 1.80 + *dest++ = *src++; 1.81 + *dest = '\0'; 1.82 +} 1.83 + 1.84 +static int myisalpha(char ch) { 1.85 + if ((unsigned char) ch < 128) return isalpha(ch); 1.86 + return 1; 1.87 +} 1.88 + 1.89 +/* phonetic transcription algorithm */ 1.90 +/* see: http://aspell.net/man-html/Phonetic-Code.html */ 1.91 +/* convert string to uppercase before this call */ 1.92 +int phonet (const char * inword, char * target, 1.93 + int len, 1.94 + phonetable & parms) 1.95 + { 1.96 + /** Do phonetic transformation. **/ 1.97 + /** "len" = length of "inword" incl. '\0'. **/ 1.98 + 1.99 + /** result: >= 0: length of "target" **/ 1.100 + /** otherwise: error **/ 1.101 + 1.102 + int i,j,k=0,n,p,z; 1.103 + int k0,n0,p0=-333,z0; 1.104 + char c, c0; 1.105 + const char * s; 1.106 + typedef unsigned char uchar; 1.107 + char word[MAXPHONETUTF8LEN + 1]; 1.108 + if (len == -1) len = strlen(inword); 1.109 + if (len > MAXPHONETUTF8LEN) return 0; 1.110 + strcpy(word, inword); 1.111 + 1.112 + /** check word **/ 1.113 + i = j = z = 0; 1.114 + while ((c = word[i]) != '\0') { 1.115 + n = parms.hash[(uchar) c]; 1.116 + z0 = 0; 1.117 + 1.118 + if (n >= 0) { 1.119 + /** check all rules for the same letter **/ 1.120 + while (parms.rules[n][0] == c) { 1.121 + 1.122 + /** check whole string **/ 1.123 + k = 1; /** number of found letters **/ 1.124 + p = 5; /** default priority **/ 1.125 + s = parms.rules[n]; 1.126 + s++; /** important for (see below) "*(s-1)" **/ 1.127 + 1.128 + while (*s != '\0' && word[i+k] == *s 1.129 + && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { 1.130 + k++; 1.131 + s++; 1.132 + } 1.133 + if (*s == '(') { 1.134 + /** check letters in "(..)" **/ 1.135 + if (myisalpha(word[i+k]) // ...could be implied? 1.136 + && strchr(s+1, word[i+k]) != NULL) { 1.137 + k++; 1.138 + while (*s != ')') 1.139 + s++; 1.140 + s++; 1.141 + } 1.142 + } 1.143 + p0 = (int) *s; 1.144 + k0 = k; 1.145 + while (*s == '-' && k > 1) { 1.146 + k--; 1.147 + s++; 1.148 + } 1.149 + if (*s == '<') 1.150 + s++; 1.151 + if (isdigit ((unsigned char) *s)) { 1.152 + /** determine priority **/ 1.153 + p = *s - '0'; 1.154 + s++; 1.155 + } 1.156 + if (*s == '^' && *(s+1) == '^') 1.157 + s++; 1.158 + 1.159 + if (*s == '\0' 1.160 + || (*s == '^' 1.161 + && (i == 0 || ! myisalpha(word[i-1])) 1.162 + && (*(s+1) != '$' 1.163 + || (! myisalpha(word[i+k0]) ))) 1.164 + || (*s == '$' && i > 0 1.165 + && myisalpha(word[i-1]) 1.166 + && (! myisalpha(word[i+k0]) ))) 1.167 + { 1.168 + /** search for followup rules, if: **/ 1.169 + /** parms.followup and k > 1 and NO '-' in searchstring **/ 1.170 + c0 = word[i+k-1]; 1.171 + n0 = parms.hash[(uchar) c0]; 1.172 + 1.173 +// if (parms.followup && k > 1 && n0 >= 0 1.174 + if (k > 1 && n0 >= 0 1.175 + && p0 != (int) '-' && word[i+k] != '\0') { 1.176 + /** test follow-up rule for "word[i+k]" **/ 1.177 + while (parms.rules[n0][0] == c0) { 1.178 + 1.179 + /** check whole string **/ 1.180 + k0 = k; 1.181 + p0 = 5; 1.182 + s = parms.rules[n0]; 1.183 + s++; 1.184 + while (*s != '\0' && word[i+k0] == *s 1.185 + && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { 1.186 + k0++; 1.187 + s++; 1.188 + } 1.189 + if (*s == '(') { 1.190 + /** check letters **/ 1.191 + if (myisalpha(word[i+k0]) 1.192 + && strchr (s+1, word[i+k0]) != NULL) { 1.193 + k0++; 1.194 + while (*s != ')' && *s != '\0') 1.195 + s++; 1.196 + if (*s == ')') 1.197 + s++; 1.198 + } 1.199 + } 1.200 + while (*s == '-') { 1.201 + /** "k0" gets NOT reduced **/ 1.202 + /** because "if (k0 == k)" **/ 1.203 + s++; 1.204 + } 1.205 + if (*s == '<') 1.206 + s++; 1.207 + if (isdigit ((unsigned char) *s)) { 1.208 + p0 = *s - '0'; 1.209 + s++; 1.210 + } 1.211 + 1.212 + if (*s == '\0' 1.213 + /** *s == '^' cuts **/ 1.214 + || (*s == '$' && ! myisalpha(word[i+k0]))) 1.215 + { 1.216 + if (k0 == k) { 1.217 + /** this is just a piece of the string **/ 1.218 + n0 += 2; 1.219 + continue; 1.220 + } 1.221 + 1.222 + if (p0 < p) { 1.223 + /** priority too low **/ 1.224 + n0 += 2; 1.225 + continue; 1.226 + } 1.227 + /** rule fits; stop search **/ 1.228 + break; 1.229 + } 1.230 + n0 += 2; 1.231 + } /** End of "while (parms.rules[n0][0] == c0)" **/ 1.232 + 1.233 + if (p0 >= p && parms.rules[n0][0] == c0) { 1.234 + n += 2; 1.235 + continue; 1.236 + } 1.237 + } /** end of follow-up stuff **/ 1.238 + 1.239 + /** replace string **/ 1.240 + s = parms.rules[n+1]; 1.241 + p0 = (parms.rules[n][0] != '\0' 1.242 + && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; 1.243 + if (p0 == 1 && z == 0) { 1.244 + /** rule with '<' is used **/ 1.245 + if (j > 0 && *s != '\0' 1.246 + && (target[j-1] == c || target[j-1] == *s)) { 1.247 + j--; 1.248 + } 1.249 + z0 = 1; 1.250 + z = 1; 1.251 + k0 = 0; 1.252 + while (*s != '\0' && word[i+k0] != '\0') { 1.253 + word[i+k0] = *s; 1.254 + k0++; 1.255 + s++; 1.256 + } 1.257 + if (k > k0) 1.258 + strmove (&word[0]+i+k0, &word[0]+i+k); 1.259 + 1.260 + /** new "actual letter" **/ 1.261 + c = word[i]; 1.262 + } 1.263 + else { /** no '<' rule used **/ 1.264 + i += k - 1; 1.265 + z = 0; 1.266 + while (*s != '\0' 1.267 + && *(s+1) != '\0' && j < len) { 1.268 + if (j == 0 || target[j-1] != *s) { 1.269 + target[j] = *s; 1.270 + j++; 1.271 + } 1.272 + s++; 1.273 + } 1.274 + /** new "actual letter" **/ 1.275 + c = *s; 1.276 + if (parms.rules[n][0] != '\0' 1.277 + && strstr (parms.rules[n]+1, "^^") != NULL) { 1.278 + if (c != '\0') { 1.279 + target[j] = c; 1.280 + j++; 1.281 + } 1.282 + strmove (&word[0], &word[0]+i+1); 1.283 + i = 0; 1.284 + z0 = 1; 1.285 + } 1.286 + } 1.287 + break; 1.288 + } /** end of follow-up stuff **/ 1.289 + n += 2; 1.290 + } /** end of while (parms.rules[n][0] == c) **/ 1.291 + } /** end of if (n >= 0) **/ 1.292 + if (z0 == 0) { 1.293 +// if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' 1.294 +// && (!parms.collapse_result || j == 0 || target[j-1] != c)){ 1.295 + if (k && !p0 && j < len && c != '\0' 1.296 + && (1 || j == 0 || target[j-1] != c)){ 1.297 + /** condense only double letters **/ 1.298 + target[j] = c; 1.299 + ///printf("\n setting \n"); 1.300 + j++; 1.301 + } 1.302 + 1.303 + i++; 1.304 + z = 0; 1.305 + k=0; 1.306 + } 1.307 + } /** end of while ((c = word[i]) != '\0') **/ 1.308 + 1.309 + target[j] = '\0'; 1.310 + return (j); 1.311 + 1.312 + } /** end of function "phonet" **/