michael@0: /******* BEGIN LICENSE BLOCK ******* michael@0: * Version: MPL 1.1/GPL 2.0/LGPL 2.1 michael@0: * michael@0: * The contents of this file are subject to the Mozilla Public License Version michael@0: * 1.1 (the "License"); you may not use this file except in compliance with michael@0: * the License. You may obtain a copy of the License at michael@0: * http://www.mozilla.org/MPL/ michael@0: * michael@0: * Software distributed under the License is distributed on an "AS IS" basis, michael@0: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License michael@0: * for the specific language governing rights and limitations under the michael@0: * License. michael@0: * michael@0: * The Initial Developer of the Original Code is Björn Jacke. Portions created michael@0: * by the Initial Developers are Copyright (C) 2000-2007 the Initial michael@0: * Developers. All Rights Reserved. michael@0: * michael@0: * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de) michael@0: * László Németh (nemethl@gyorsposta.hu) michael@0: * Caolan McNamara (caolanm@redhat.com) michael@0: * michael@0: * Alternatively, the contents of this file may be used under the terms of michael@0: * either the GNU General Public License Version 2 or later (the "GPL"), or michael@0: * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), michael@0: * in which case the provisions of the GPL or the LGPL are applicable instead michael@0: * of those above. If you wish to allow use of your version of this file only michael@0: * under the terms of either the GPL or the LGPL, and not to allow others to michael@0: * use your version of this file under the terms of the MPL, indicate your michael@0: * decision by deleting the provisions above and replace them with the notice michael@0: * and other provisions required by the GPL or the LGPL. If you do not delete michael@0: * the provisions above, a recipient may use your version of this file under michael@0: * the terms of any one of the MPL, the GPL or the LGPL. michael@0: * michael@0: * Changelog: michael@0: * 2000-01-05 Björn Jacke michael@0: * Initial Release insprired by the article about phonetic michael@0: * transformations out of c't 25/1999 michael@0: * michael@0: * 2007-07-26 Björn Jacke michael@0: * Released under MPL/GPL/LGPL tri-license for Hunspell michael@0: * michael@0: * 2007-08-23 László Németh michael@0: * Porting from Aspell to Hunspell using C-like structs michael@0: * michael@0: ******* END LICENSE BLOCK *******/ michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: #include "csutil.hxx" michael@0: #include "phonet.hxx" michael@0: michael@0: void init_phonet_hash(phonetable & parms) michael@0: { michael@0: int i, k; michael@0: michael@0: for (i = 0; i < HASHSIZE; i++) { michael@0: parms.hash[i] = -1; michael@0: } michael@0: michael@0: for (i = 0; parms.rules[i][0] != '\0'; i += 2) { michael@0: /** set hash value **/ michael@0: k = (unsigned char) parms.rules[i][0]; michael@0: michael@0: if (parms.hash[k] < 0) { michael@0: parms.hash[k] = i; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // like strcpy but safe if the strings overlap michael@0: // but only if dest < src michael@0: static inline void strmove(char * dest, char * src) { michael@0: while (*src) michael@0: *dest++ = *src++; michael@0: *dest = '\0'; michael@0: } michael@0: michael@0: static int myisalpha(char ch) { michael@0: if ((unsigned char) ch < 128) return isalpha(ch); michael@0: return 1; michael@0: } michael@0: michael@0: /* phonetic transcription algorithm */ michael@0: /* see: http://aspell.net/man-html/Phonetic-Code.html */ michael@0: /* convert string to uppercase before this call */ michael@0: int phonet (const char * inword, char * target, michael@0: int len, michael@0: phonetable & parms) michael@0: { michael@0: /** Do phonetic transformation. **/ michael@0: /** "len" = length of "inword" incl. '\0'. **/ michael@0: michael@0: /** result: >= 0: length of "target" **/ michael@0: /** otherwise: error **/ michael@0: michael@0: int i,j,k=0,n,p,z; michael@0: int k0,n0,p0=-333,z0; michael@0: char c, c0; michael@0: const char * s; michael@0: typedef unsigned char uchar; michael@0: char word[MAXPHONETUTF8LEN + 1]; michael@0: if (len == -1) len = strlen(inword); michael@0: if (len > MAXPHONETUTF8LEN) return 0; michael@0: strcpy(word, inword); michael@0: michael@0: /** check word **/ michael@0: i = j = z = 0; michael@0: while ((c = word[i]) != '\0') { michael@0: n = parms.hash[(uchar) c]; michael@0: z0 = 0; michael@0: michael@0: if (n >= 0) { michael@0: /** check all rules for the same letter **/ michael@0: while (parms.rules[n][0] == c) { michael@0: michael@0: /** check whole string **/ michael@0: k = 1; /** number of found letters **/ michael@0: p = 5; /** default priority **/ michael@0: s = parms.rules[n]; michael@0: s++; /** important for (see below) "*(s-1)" **/ michael@0: michael@0: while (*s != '\0' && word[i+k] == *s michael@0: && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { michael@0: k++; michael@0: s++; michael@0: } michael@0: if (*s == '(') { michael@0: /** check letters in "(..)" **/ michael@0: if (myisalpha(word[i+k]) // ...could be implied? michael@0: && strchr(s+1, word[i+k]) != NULL) { michael@0: k++; michael@0: while (*s != ')') michael@0: s++; michael@0: s++; michael@0: } michael@0: } michael@0: p0 = (int) *s; michael@0: k0 = k; michael@0: while (*s == '-' && k > 1) { michael@0: k--; michael@0: s++; michael@0: } michael@0: if (*s == '<') michael@0: s++; michael@0: if (isdigit ((unsigned char) *s)) { michael@0: /** determine priority **/ michael@0: p = *s - '0'; michael@0: s++; michael@0: } michael@0: if (*s == '^' && *(s+1) == '^') michael@0: s++; michael@0: michael@0: if (*s == '\0' michael@0: || (*s == '^' michael@0: && (i == 0 || ! myisalpha(word[i-1])) michael@0: && (*(s+1) != '$' michael@0: || (! myisalpha(word[i+k0]) ))) michael@0: || (*s == '$' && i > 0 michael@0: && myisalpha(word[i-1]) michael@0: && (! myisalpha(word[i+k0]) ))) michael@0: { michael@0: /** search for followup rules, if: **/ michael@0: /** parms.followup and k > 1 and NO '-' in searchstring **/ michael@0: c0 = word[i+k-1]; michael@0: n0 = parms.hash[(uchar) c0]; michael@0: michael@0: // if (parms.followup && k > 1 && n0 >= 0 michael@0: if (k > 1 && n0 >= 0 michael@0: && p0 != (int) '-' && word[i+k] != '\0') { michael@0: /** test follow-up rule for "word[i+k]" **/ michael@0: while (parms.rules[n0][0] == c0) { michael@0: michael@0: /** check whole string **/ michael@0: k0 = k; michael@0: p0 = 5; michael@0: s = parms.rules[n0]; michael@0: s++; michael@0: while (*s != '\0' && word[i+k0] == *s michael@0: && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { michael@0: k0++; michael@0: s++; michael@0: } michael@0: if (*s == '(') { michael@0: /** check letters **/ michael@0: if (myisalpha(word[i+k0]) michael@0: && strchr (s+1, word[i+k0]) != NULL) { michael@0: k0++; michael@0: while (*s != ')' && *s != '\0') michael@0: s++; michael@0: if (*s == ')') michael@0: s++; michael@0: } michael@0: } michael@0: while (*s == '-') { michael@0: /** "k0" gets NOT reduced **/ michael@0: /** because "if (k0 == k)" **/ michael@0: s++; michael@0: } michael@0: if (*s == '<') michael@0: s++; michael@0: if (isdigit ((unsigned char) *s)) { michael@0: p0 = *s - '0'; michael@0: s++; michael@0: } michael@0: michael@0: if (*s == '\0' michael@0: /** *s == '^' cuts **/ michael@0: || (*s == '$' && ! myisalpha(word[i+k0]))) michael@0: { michael@0: if (k0 == k) { michael@0: /** this is just a piece of the string **/ michael@0: n0 += 2; michael@0: continue; michael@0: } michael@0: michael@0: if (p0 < p) { michael@0: /** priority too low **/ michael@0: n0 += 2; michael@0: continue; michael@0: } michael@0: /** rule fits; stop search **/ michael@0: break; michael@0: } michael@0: n0 += 2; michael@0: } /** End of "while (parms.rules[n0][0] == c0)" **/ michael@0: michael@0: if (p0 >= p && parms.rules[n0][0] == c0) { michael@0: n += 2; michael@0: continue; michael@0: } michael@0: } /** end of follow-up stuff **/ michael@0: michael@0: /** replace string **/ michael@0: s = parms.rules[n+1]; michael@0: p0 = (parms.rules[n][0] != '\0' michael@0: && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; michael@0: if (p0 == 1 && z == 0) { michael@0: /** rule with '<' is used **/ michael@0: if (j > 0 && *s != '\0' michael@0: && (target[j-1] == c || target[j-1] == *s)) { michael@0: j--; michael@0: } michael@0: z0 = 1; michael@0: z = 1; michael@0: k0 = 0; michael@0: while (*s != '\0' && word[i+k0] != '\0') { michael@0: word[i+k0] = *s; michael@0: k0++; michael@0: s++; michael@0: } michael@0: if (k > k0) michael@0: strmove (&word[0]+i+k0, &word[0]+i+k); michael@0: michael@0: /** new "actual letter" **/ michael@0: c = word[i]; michael@0: } michael@0: else { /** no '<' rule used **/ michael@0: i += k - 1; michael@0: z = 0; michael@0: while (*s != '\0' michael@0: && *(s+1) != '\0' && j < len) { michael@0: if (j == 0 || target[j-1] != *s) { michael@0: target[j] = *s; michael@0: j++; michael@0: } michael@0: s++; michael@0: } michael@0: /** new "actual letter" **/ michael@0: c = *s; michael@0: if (parms.rules[n][0] != '\0' michael@0: && strstr (parms.rules[n]+1, "^^") != NULL) { michael@0: if (c != '\0') { michael@0: target[j] = c; michael@0: j++; michael@0: } michael@0: strmove (&word[0], &word[0]+i+1); michael@0: i = 0; michael@0: z0 = 1; michael@0: } michael@0: } michael@0: break; michael@0: } /** end of follow-up stuff **/ michael@0: n += 2; michael@0: } /** end of while (parms.rules[n][0] == c) **/ michael@0: } /** end of if (n >= 0) **/ michael@0: if (z0 == 0) { michael@0: // if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' michael@0: // && (!parms.collapse_result || j == 0 || target[j-1] != c)){ michael@0: if (k && !p0 && j < len && c != '\0' michael@0: && (1 || j == 0 || target[j-1] != c)){ michael@0: /** condense only double letters **/ michael@0: target[j] = c; michael@0: ///printf("\n setting \n"); michael@0: j++; michael@0: } michael@0: michael@0: i++; michael@0: z = 0; michael@0: k=0; michael@0: } michael@0: } /** end of while ((c = word[i]) != '\0') **/ michael@0: michael@0: target[j] = '\0'; michael@0: return (j); michael@0: michael@0: } /** end of function "phonet" **/