|
1 /******* BEGIN LICENSE BLOCK ******* |
|
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
|
3 * |
|
4 * The contents of this file are subject to the Mozilla Public License Version |
|
5 * 1.1 (the "License"); you may not use this file except in compliance with |
|
6 * the License. You may obtain a copy of the License at |
|
7 * http://www.mozilla.org/MPL/ |
|
8 * |
|
9 * Software distributed under the License is distributed on an "AS IS" basis, |
|
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
|
11 * for the specific language governing rights and limitations under the |
|
12 * License. |
|
13 * |
|
14 * The Initial Developer of the Original Code is Björn Jacke. Portions created |
|
15 * by the Initial Developers are Copyright (C) 2000-2007 the Initial |
|
16 * Developers. All Rights Reserved. |
|
17 * |
|
18 * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de) |
|
19 * László Németh (nemethl@gyorsposta.hu) |
|
20 * Caolan McNamara (caolanm@redhat.com) |
|
21 * |
|
22 * Alternatively, the contents of this file may be used under the terms of |
|
23 * either the GNU General Public License Version 2 or later (the "GPL"), or |
|
24 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
|
25 * in which case the provisions of the GPL or the LGPL are applicable instead |
|
26 * of those above. If you wish to allow use of your version of this file only |
|
27 * under the terms of either the GPL or the LGPL, and not to allow others to |
|
28 * use your version of this file under the terms of the MPL, indicate your |
|
29 * decision by deleting the provisions above and replace them with the notice |
|
30 * and other provisions required by the GPL or the LGPL. If you do not delete |
|
31 * the provisions above, a recipient may use your version of this file under |
|
32 * the terms of any one of the MPL, the GPL or the LGPL. |
|
33 * |
|
34 * Changelog: |
|
35 * 2000-01-05 Björn Jacke <bjoern.jacke AT gmx.de> |
|
36 * Initial Release inspired by the article about phonetic |
|
37 * transformations out of c't 25/1999 |
|
38 * |
|
39 * 2007-07-26 Björn Jacke <bjoern.jacke AT gmx.de> |
|
40 * Released under MPL/GPL/LGPL tri-license for Hunspell |
|
41 * |
|
42 * 2007-08-23 László Németh <nemeth at OOo> |
|
43 * Porting from Aspell to Hunspell using C-like structs |
|
44 * |
|
45 ******* END LICENSE BLOCK *******/ |
|
46 |
|
47 #include <stdlib.h> |
|
48 #include <string.h> |
|
49 #include <stdio.h> |
|
50 #include <ctype.h> |
|
51 |
|
52 #include "csutil.hxx" |
|
53 #include "phonet.hxx" |
|
54 |
|
55 void init_phonet_hash(phonetable & parms) |
|
56 { |
|
57 int i, k; |
|
58 |
|
59 for (i = 0; i < HASHSIZE; i++) { |
|
60 parms.hash[i] = -1; |
|
61 } |
|
62 |
|
63 for (i = 0; parms.rules[i][0] != '\0'; i += 2) { |
|
64 /** set hash value **/ |
|
65 k = (unsigned char) parms.rules[i][0]; |
|
66 |
|
67 if (parms.hash[k] < 0) { |
|
68 parms.hash[k] = i; |
|
69 } |
|
70 } |
|
71 } |
|
72 |
|
// Copy the NUL-terminated string at src to dest, terminator included.
// Unlike strcpy, the two regions may overlap, in either direction: the
// length is measured first and the bytes are then moved with memmove, so
// no source byte is read after it has been overwritten.
static inline void strmove(char * dest, const char * src) {
  memmove(dest, src, strlen(src) + 1);
}
|
80 |
|
/* Letter test used by the phonetic matcher.
 * Bytes with a value of 128 or above are always reported as letters
 * (returns 1); 7-bit values are classified by isalpha(), whose result is
 * returned unchanged (non-zero for a letter, 0 otherwise).
 * NOTE(review): presumably the high-byte rule exists so that 8-bit or
 * multi-byte encoded letters count as alphabetic -- confirm with callers. */
static int myisalpha(char ch) {
  const unsigned char byte = (unsigned char) ch;

  if (byte >= 128) {
    return 1;
  }
  return isalpha(byte);
}
|
85 |
|
86 /* phonetic transcription algorithm */ |
|
87 /* see: http://aspell.net/man-html/Phonetic-Code.html */ |
|
88 /* convert string to uppercase before this call */ |
|
89 int phonet (const char * inword, char * target, |
|
90 int len, |
|
91 phonetable & parms) |
|
92 { |
|
93 /** Do phonetic transformation. **/ |
|
94 /** "len" = length of "inword" incl. '\0'. **/ |
|
95 |
|
96 /** result: >= 0: length of "target" **/ |
|
97 /** otherwise: error **/ |
|
98 |
|
99 int i,j,k=0,n,p,z; |
|
100 int k0,n0,p0=-333,z0; |
|
101 char c, c0; |
|
102 const char * s; |
|
103 typedef unsigned char uchar; |
|
104 char word[MAXPHONETUTF8LEN + 1]; |
|
105 if (len == -1) len = strlen(inword); |
|
106 if (len > MAXPHONETUTF8LEN) return 0; |
|
107 strcpy(word, inword); |
|
108 |
|
109 /** check word **/ |
|
110 i = j = z = 0; |
|
111 while ((c = word[i]) != '\0') { |
|
112 n = parms.hash[(uchar) c]; |
|
113 z0 = 0; |
|
114 |
|
115 if (n >= 0) { |
|
116 /** check all rules for the same letter **/ |
|
117 while (parms.rules[n][0] == c) { |
|
118 |
|
119 /** check whole string **/ |
|
120 k = 1; /** number of found letters **/ |
|
121 p = 5; /** default priority **/ |
|
122 s = parms.rules[n]; |
|
123 s++; /** important for (see below) "*(s-1)" **/ |
|
124 |
|
125 while (*s != '\0' && word[i+k] == *s |
|
126 && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { |
|
127 k++; |
|
128 s++; |
|
129 } |
|
130 if (*s == '(') { |
|
131 /** check letters in "(..)" **/ |
|
132 if (myisalpha(word[i+k]) // ...could be implied? |
|
133 && strchr(s+1, word[i+k]) != NULL) { |
|
134 k++; |
|
135 while (*s != ')') |
|
136 s++; |
|
137 s++; |
|
138 } |
|
139 } |
|
140 p0 = (int) *s; |
|
141 k0 = k; |
|
142 while (*s == '-' && k > 1) { |
|
143 k--; |
|
144 s++; |
|
145 } |
|
146 if (*s == '<') |
|
147 s++; |
|
148 if (isdigit ((unsigned char) *s)) { |
|
149 /** determine priority **/ |
|
150 p = *s - '0'; |
|
151 s++; |
|
152 } |
|
153 if (*s == '^' && *(s+1) == '^') |
|
154 s++; |
|
155 |
|
156 if (*s == '\0' |
|
157 || (*s == '^' |
|
158 && (i == 0 || ! myisalpha(word[i-1])) |
|
159 && (*(s+1) != '$' |
|
160 || (! myisalpha(word[i+k0]) ))) |
|
161 || (*s == '$' && i > 0 |
|
162 && myisalpha(word[i-1]) |
|
163 && (! myisalpha(word[i+k0]) ))) |
|
164 { |
|
165 /** search for followup rules, if: **/ |
|
166 /** parms.followup and k > 1 and NO '-' in searchstring **/ |
|
167 c0 = word[i+k-1]; |
|
168 n0 = parms.hash[(uchar) c0]; |
|
169 |
|
170 // if (parms.followup && k > 1 && n0 >= 0 |
|
171 if (k > 1 && n0 >= 0 |
|
172 && p0 != (int) '-' && word[i+k] != '\0') { |
|
173 /** test follow-up rule for "word[i+k]" **/ |
|
174 while (parms.rules[n0][0] == c0) { |
|
175 |
|
176 /** check whole string **/ |
|
177 k0 = k; |
|
178 p0 = 5; |
|
179 s = parms.rules[n0]; |
|
180 s++; |
|
181 while (*s != '\0' && word[i+k0] == *s |
|
182 && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { |
|
183 k0++; |
|
184 s++; |
|
185 } |
|
186 if (*s == '(') { |
|
187 /** check letters **/ |
|
188 if (myisalpha(word[i+k0]) |
|
189 && strchr (s+1, word[i+k0]) != NULL) { |
|
190 k0++; |
|
191 while (*s != ')' && *s != '\0') |
|
192 s++; |
|
193 if (*s == ')') |
|
194 s++; |
|
195 } |
|
196 } |
|
197 while (*s == '-') { |
|
198 /** "k0" gets NOT reduced **/ |
|
199 /** because "if (k0 == k)" **/ |
|
200 s++; |
|
201 } |
|
202 if (*s == '<') |
|
203 s++; |
|
204 if (isdigit ((unsigned char) *s)) { |
|
205 p0 = *s - '0'; |
|
206 s++; |
|
207 } |
|
208 |
|
209 if (*s == '\0' |
|
210 /** *s == '^' cuts **/ |
|
211 || (*s == '$' && ! myisalpha(word[i+k0]))) |
|
212 { |
|
213 if (k0 == k) { |
|
214 /** this is just a piece of the string **/ |
|
215 n0 += 2; |
|
216 continue; |
|
217 } |
|
218 |
|
219 if (p0 < p) { |
|
220 /** priority too low **/ |
|
221 n0 += 2; |
|
222 continue; |
|
223 } |
|
224 /** rule fits; stop search **/ |
|
225 break; |
|
226 } |
|
227 n0 += 2; |
|
228 } /** End of "while (parms.rules[n0][0] == c0)" **/ |
|
229 |
|
230 if (p0 >= p && parms.rules[n0][0] == c0) { |
|
231 n += 2; |
|
232 continue; |
|
233 } |
|
234 } /** end of follow-up stuff **/ |
|
235 |
|
236 /** replace string **/ |
|
237 s = parms.rules[n+1]; |
|
238 p0 = (parms.rules[n][0] != '\0' |
|
239 && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; |
|
240 if (p0 == 1 && z == 0) { |
|
241 /** rule with '<' is used **/ |
|
242 if (j > 0 && *s != '\0' |
|
243 && (target[j-1] == c || target[j-1] == *s)) { |
|
244 j--; |
|
245 } |
|
246 z0 = 1; |
|
247 z = 1; |
|
248 k0 = 0; |
|
249 while (*s != '\0' && word[i+k0] != '\0') { |
|
250 word[i+k0] = *s; |
|
251 k0++; |
|
252 s++; |
|
253 } |
|
254 if (k > k0) |
|
255 strmove (&word[0]+i+k0, &word[0]+i+k); |
|
256 |
|
257 /** new "actual letter" **/ |
|
258 c = word[i]; |
|
259 } |
|
260 else { /** no '<' rule used **/ |
|
261 i += k - 1; |
|
262 z = 0; |
|
263 while (*s != '\0' |
|
264 && *(s+1) != '\0' && j < len) { |
|
265 if (j == 0 || target[j-1] != *s) { |
|
266 target[j] = *s; |
|
267 j++; |
|
268 } |
|
269 s++; |
|
270 } |
|
271 /** new "actual letter" **/ |
|
272 c = *s; |
|
273 if (parms.rules[n][0] != '\0' |
|
274 && strstr (parms.rules[n]+1, "^^") != NULL) { |
|
275 if (c != '\0') { |
|
276 target[j] = c; |
|
277 j++; |
|
278 } |
|
279 strmove (&word[0], &word[0]+i+1); |
|
280 i = 0; |
|
281 z0 = 1; |
|
282 } |
|
283 } |
|
284 break; |
|
285 } /** end of follow-up stuff **/ |
|
286 n += 2; |
|
287 } /** end of while (parms.rules[n][0] == c) **/ |
|
288 } /** end of if (n >= 0) **/ |
|
289 if (z0 == 0) { |
|
290 // if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' |
|
291 // && (!parms.collapse_result || j == 0 || target[j-1] != c)){ |
|
292 if (k && !p0 && j < len && c != '\0' |
|
293 && (1 || j == 0 || target[j-1] != c)){ |
|
294 /** condense only double letters **/ |
|
295 target[j] = c; |
|
296 ///printf("\n setting \n"); |
|
297 j++; |
|
298 } |
|
299 |
|
300 i++; |
|
301 z = 0; |
|
302 k=0; |
|
303 } |
|
304 } /** end of while ((c = word[i]) != '\0') **/ |
|
305 |
|
306 target[j] = '\0'; |
|
307 return (j); |
|
308 |
|
309 } /** end of function "phonet" **/ |