extensions/spellcheck/hunspell/src/phonet.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/spellcheck/hunspell/src/phonet.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,309 @@
     1.4 +/******* BEGIN LICENSE BLOCK *******
     1.5 + * Version: MPL 1.1/GPL 2.0/LGPL 2.1
     1.6 + * 
     1.7 + * The contents of this file are subject to the Mozilla Public License Version
     1.8 + * 1.1 (the "License"); you may not use this file except in compliance with
     1.9 + * the License. You may obtain a copy of the License at
    1.10 + * http://www.mozilla.org/MPL/
    1.11 + * 
    1.12 + * Software distributed under the License is distributed on an "AS IS" basis,
    1.13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
    1.14 + * for the specific language governing rights and limitations under the
    1.15 + * License.
    1.16 + * 
    1.17 + * The Initial Developer of the Original Code is Björn Jacke. Portions created
    1.18 + * by the Initial Developers are Copyright (C) 2000-2007 the Initial
    1.19 + * Developers. All Rights Reserved.
    1.20 + * 
    1.21 + * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de)
    1.22 + *                 László Németh (nemethl@gyorsposta.hu)
    1.23 + *                 Caolan McNamara (caolanm@redhat.com)
    1.24 + * 
    1.25 + * Alternatively, the contents of this file may be used under the terms of
    1.26 + * either the GNU General Public License Version 2 or later (the "GPL"), or
    1.27 + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
    1.28 + * in which case the provisions of the GPL or the LGPL are applicable instead
    1.29 + * of those above. If you wish to allow use of your version of this file only
    1.30 + * under the terms of either the GPL or the LGPL, and not to allow others to
    1.31 + * use your version of this file under the terms of the MPL, indicate your
    1.32 + * decision by deleting the provisions above and replace them with the notice
    1.33 + * and other provisions required by the GPL or the LGPL. If you do not delete
    1.34 + * the provisions above, a recipient may use your version of this file under
    1.35 + * the terms of any one of the MPL, the GPL or the LGPL.
    1.36 + *
    1.37 + * Changelog:
    1.38 + *  2000-01-05  Björn Jacke <bjoern.jacke AT gmx.de>
    1.39 + *              Initial Release inspired by the article about phonetic
    1.40 + *              transformations out of c't 25/1999
    1.41 + *
    1.42 + *  2007-07-26  Björn Jacke <bjoern.jacke AT gmx.de>
    1.43 + *              Released under MPL/GPL/LGPL tri-license for Hunspell
    1.44 + *
    1.45 + *  2007-08-23  László Németh <nemeth at OOo>
    1.46 + *              Porting from Aspell to Hunspell using C-like structs
    1.47 + *
    1.48 + ******* END LICENSE BLOCK *******/
    1.49 +
    1.50 +#include <stdlib.h> 
    1.51 +#include <string.h>
    1.52 +#include <stdio.h> 
    1.53 +#include <ctype.h>
    1.54 +
    1.55 +#include "csutil.hxx"
    1.56 +#include "phonet.hxx"
    1.57 +
    1.58 +void init_phonet_hash(phonetable & parms) 
    1.59 +  {
    1.60 +    int i, k;
    1.61 +
    1.62 +    for (i = 0; i < HASHSIZE; i++) {
    1.63 +      parms.hash[i] = -1;
    1.64 +    }
    1.65 +
    1.66 +    for (i = 0; parms.rules[i][0] != '\0'; i += 2) {
    1.67 +      /**  set hash value  **/
    1.68 +      k = (unsigned char) parms.rules[i][0];
    1.69 +
    1.70 +      if (parms.hash[k] < 0) {
    1.71 +	parms.hash[k] = i;
    1.72 +      }
    1.73 +    }
    1.74 +  }
    1.75 +
    1.76 +// like strcpy but safe if the strings overlap
    1.77 +//   but only if dest < src
    1.78 +static inline void strmove(char * dest, char * src) {
    1.79 +  while (*src) 
    1.80 +    *dest++ = *src++;
    1.81 +  *dest = '\0';
    1.82 +}
    1.83 +
    1.84 +static int myisalpha(char ch) {
    1.85 +  if ((unsigned char) ch < 128) return isalpha(ch);
    1.86 +  return 1;
    1.87 +}
    1.88 +
    1.89 +/*  phonetic transcription algorithm                   */
    1.90 +/*  see: http://aspell.net/man-html/Phonetic-Code.html */
    1.91 +/*  convert string to uppercase before this call       */
    1.92 +int phonet (const char * inword, char * target,
    1.93 +              int len,
    1.94 +	      phonetable & parms)
    1.95 +  {
    1.96 +    /**       Do phonetic transformation.       **/
    1.97 +    /**  "len" = length of "inword" incl. '\0'. **/
    1.98 +
    1.99 +    /**  result:  >= 0:  length of "target"    **/
   1.100 +    /**            otherwise:  error            **/
   1.101 +
   1.102 +    int  i,j,k=0,n,p,z;
   1.103 +    int  k0,n0,p0=-333,z0;
   1.104 +    char c, c0;
   1.105 +    const char * s;
   1.106 +    typedef unsigned char uchar;    
   1.107 +    char word[MAXPHONETUTF8LEN + 1];
   1.108 +    if (len == -1) len = strlen(inword);
   1.109 +    if (len > MAXPHONETUTF8LEN) return 0;
   1.110 +    strcpy(word, inword);
   1.111 +  
   1.112 +    /**  check word  **/
   1.113 +    i = j = z = 0;
   1.114 +    while ((c = word[i]) != '\0') {
   1.115 +      n = parms.hash[(uchar) c];
   1.116 +      z0 = 0;
   1.117 +
   1.118 +      if (n >= 0) {
   1.119 +        /**  check all rules for the same letter  **/
   1.120 +        while (parms.rules[n][0] == c) {
   1.121 +
   1.122 +          /**  check whole string  **/
   1.123 +          k = 1;   /** number of found letters  **/
   1.124 +          p = 5;   /** default priority  **/
   1.125 +          s = parms.rules[n];
   1.126 +          s++;     /**  important for (see below)  "*(s-1)"  **/
   1.127 +          
   1.128 +          while (*s != '\0'  &&  word[i+k] == *s
   1.129 +                 &&  !isdigit ((unsigned char) *s)  &&  strchr ("(-<^$", *s) == NULL) {
   1.130 +            k++;
   1.131 +            s++;
   1.132 +          }
   1.133 +          if (*s == '(') {
   1.134 +            /**  check letters in "(..)"  **/
   1.135 +            if (myisalpha(word[i+k])  // ...could be implied?
   1.136 +                && strchr(s+1, word[i+k]) != NULL) {
   1.137 +              k++;
   1.138 +              while (*s != ')')
   1.139 +                s++;
   1.140 +              s++;
   1.141 +            }
   1.142 +          }
   1.143 +          p0 = (int) *s;
   1.144 +          k0 = k;
   1.145 +          while (*s == '-'  &&  k > 1) {
   1.146 +            k--;
   1.147 +            s++;
   1.148 +          }
   1.149 +          if (*s == '<')
   1.150 +            s++;
   1.151 +          if (isdigit ((unsigned char) *s)) {
   1.152 +            /**  determine priority  **/
   1.153 +            p = *s - '0';
   1.154 +            s++;
   1.155 +          }
   1.156 +          if (*s == '^'  &&  *(s+1) == '^')
   1.157 +            s++;
   1.158 +
   1.159 +          if (*s == '\0'
   1.160 +              || (*s == '^'  
   1.161 +                  && (i == 0  ||  ! myisalpha(word[i-1]))
   1.162 +                  && (*(s+1) != '$'
   1.163 +                      || (! myisalpha(word[i+k0]) )))
   1.164 +              || (*s == '$'  &&  i > 0  
   1.165 +                  &&  myisalpha(word[i-1])
   1.166 +                  && (! myisalpha(word[i+k0]) ))) 
   1.167 +          {
   1.168 +            /**  search for followup rules, if:     **/
   1.169 +            /**  parms.followup and k > 1  and  NO '-' in searchstring **/
   1.170 +            c0 = word[i+k-1];
   1.171 +            n0 = parms.hash[(uchar) c0];
   1.172 +
   1.173 +//            if (parms.followup  &&  k > 1  &&  n0 >= 0
   1.174 +            if (k > 1  &&  n0 >= 0
   1.175 +                &&  p0 != (int) '-'  &&  word[i+k] != '\0') {
   1.176 +              /**  test follow-up rule for "word[i+k]"  **/
   1.177 +              while (parms.rules[n0][0] == c0) {
   1.178 +
   1.179 +                /**  check whole string  **/
   1.180 +                k0 = k;
   1.181 +                p0 = 5;
   1.182 +                s = parms.rules[n0];
   1.183 +                s++;
   1.184 +                while (*s != '\0'  &&  word[i+k0] == *s
   1.185 +                       && ! isdigit((unsigned char) *s)  &&  strchr("(-<^$",*s) == NULL) {
   1.186 +                  k0++;
   1.187 +                  s++;
   1.188 +                }
   1.189 +                if (*s == '(') {
   1.190 +                  /**  check letters  **/
   1.191 +                  if (myisalpha(word[i+k0])
   1.192 +                      &&  strchr (s+1, word[i+k0]) != NULL) {
   1.193 +                    k0++;
   1.194 +                    while (*s != ')'  &&  *s != '\0')
   1.195 +                      s++;
   1.196 +                    if (*s == ')')
   1.197 +                      s++;
   1.198 +                  }
   1.199 +                }
   1.200 +                while (*s == '-') {
   1.201 +                  /**  "k0" gets NOT reduced   **/
   1.202 +                  /**  because "if (k0 == k)"  **/
   1.203 +                  s++;
   1.204 +                }
   1.205 +                if (*s == '<')
   1.206 +                  s++;
   1.207 +                if (isdigit ((unsigned char) *s)) {
   1.208 +                  p0 = *s - '0';
   1.209 +                  s++;
   1.210 +                }
   1.211 +
   1.212 +                if (*s == '\0'
   1.213 +                    /**  *s == '^' cuts  **/
   1.214 +                    || (*s == '$'  &&  ! myisalpha(word[i+k0]))) 
   1.215 +                {
   1.216 +                  if (k0 == k) {
   1.217 +                    /**  this is just a piece of the string  **/
   1.218 +                    n0 += 2;
   1.219 +                    continue;
   1.220 +                  }
   1.221 +
   1.222 +                  if (p0 < p) {
   1.223 +                    /**  priority too low  **/
   1.224 +                    n0 += 2;
   1.225 +                    continue;
   1.226 +                  }
   1.227 +                  /**  rule fits; stop search  **/
   1.228 +                  break;
   1.229 +                }
   1.230 +                n0 += 2;
   1.231 +              } /**  End of "while (parms.rules[n0][0] == c0)"  **/
   1.232 +
   1.233 +              if (p0 >= p  && parms.rules[n0][0] == c0) {
   1.234 +                n += 2;
   1.235 +                continue;
   1.236 +              }
   1.237 +            } /** end of follow-up stuff **/
   1.238 +
   1.239 +            /**  replace string  **/
   1.240 +            s = parms.rules[n+1];
   1.241 +            p0 = (parms.rules[n][0] != '\0'
   1.242 +                 &&  strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
   1.243 +            if (p0 == 1 &&  z == 0) {
   1.244 +              /**  rule with '<' is used  **/
   1.245 +              if (j > 0  &&  *s != '\0'
   1.246 +                 && (target[j-1] == c  ||  target[j-1] == *s)) {
   1.247 +                j--;
   1.248 +              }
   1.249 +              z0 = 1;
   1.250 +              z = 1;
   1.251 +              k0 = 0;
   1.252 +              while (*s != '\0'  &&  word[i+k0] != '\0') {
   1.253 +                word[i+k0] = *s;
   1.254 +                k0++;
   1.255 +                s++;
   1.256 +              }
   1.257 +              if (k > k0)
   1.258 +                strmove (&word[0]+i+k0, &word[0]+i+k);
   1.259 +
   1.260 +              /**  new "actual letter"  **/
   1.261 +              c = word[i];
   1.262 +            }
   1.263 +            else { /** no '<' rule used **/
   1.264 +              i += k - 1;
   1.265 +              z = 0;
   1.266 +              while (*s != '\0'
   1.267 +                     &&  *(s+1) != '\0'  &&  j < len) {
   1.268 +                if (j == 0  ||  target[j-1] != *s) {
   1.269 +                  target[j] = *s;
   1.270 +                  j++;
   1.271 +                }
   1.272 +                s++;
   1.273 +              }
   1.274 +              /**  new "actual letter"  **/
   1.275 +              c = *s;
   1.276 +              if (parms.rules[n][0] != '\0'
   1.277 +                 &&  strstr (parms.rules[n]+1, "^^") != NULL) {
   1.278 +                if (c != '\0') {
   1.279 +                  target[j] = c;
   1.280 +                  j++;
   1.281 +                }
   1.282 +                strmove (&word[0], &word[0]+i+1);
   1.283 +                i = 0;
   1.284 +                z0 = 1;
   1.285 +              }
   1.286 +            }
   1.287 +            break;
   1.288 +          }  /** end of follow-up stuff **/
   1.289 +          n += 2;
   1.290 +        } /**  end of while (parms.rules[n][0] == c)  **/
   1.291 +      } /**  end of if (n >= 0)  **/
   1.292 +      if (z0 == 0) {
   1.293 +//        if (k && (assert(p0!=-333),!p0) &&  j < len &&  c != '\0'
   1.294 +//           && (!parms.collapse_result  ||  j == 0  ||  target[j-1] != c)){
   1.295 +        if (k && !p0 && j < len &&  c != '\0'
   1.296 +           && (1 || j == 0  ||  target[j-1] != c)){
   1.297 +           /**  condense only double letters  **/
   1.298 +          target[j] = c;
   1.299 +	  ///printf("\n setting \n");
   1.300 +          j++;
   1.301 +        }
   1.302 +
   1.303 +        i++;
   1.304 +        z = 0;
   1.305 +	k=0;
   1.306 +      }
   1.307 +    }  /**  end of   while ((c = word[i]) != '\0')  **/
   1.308 +
   1.309 +    target[j] = '\0';
   1.310 +    return (j);
   1.311 +
   1.312 +  }  /**  end of function "phonet"  **/

mercurial