intl/hyphenation/src/hyphen.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/hyphenation/src/hyphen.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1160 @@
     1.4 +/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
     1.5 + * licenses follows.
     1.6 + */
     1.7 +
     1.8 +/* LibHnj - a library for high quality hyphenation and justification
     1.9 + * Copyright (C) 1998 Raph Levien, 
    1.10 + * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
    1.11 + *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
    1.12 + *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
    1.13 + *
    1.14 + * This library is free software; you can redistribute it and/or
    1.15 + * modify it under the terms of the GNU Library General Public
    1.16 + * License as published by the Free Software Foundation; either
    1.17 + * version 2 of the License, or (at your option) any later version.
    1.18 + *
    1.19 + * This library is distributed in the hope that it will be useful,
    1.20 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.21 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.22 + * Library General Public License for more details.
    1.23 + *
    1.24 + * You should have received a copy of the GNU Library General Public
    1.25 + * License along with this library; if not, write to the 
    1.26 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
    1.27 + * Boston, MA  02111-1307  USA.
    1.28 +*/
    1.29 +
    1.30 +/*
    1.31 + * The contents of this file are subject to the Mozilla Public License
    1.32 + * Version 1.0 (the "MPL"); you may not use this file except in
    1.33 + * compliance with the MPL.  You may obtain a copy of the MPL at
    1.34 + * http://www.mozilla.org/MPL/
    1.35 + *
    1.36 + * Software distributed under the MPL is distributed on an "AS IS" basis,
    1.37 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
    1.38 + * for the specific language governing rights and limitations under the
    1.39 + * MPL.
    1.40 + *
    1.41 + */
    1.42 +#include <stdlib.h> /* for NULL, malloc */
    1.43 +#include <stdio.h>  /* for fprintf */
    1.44 +#include <string.h> /* for strdup */
    1.45 +
    1.46 +#ifdef UNX
    1.47 +#include <unistd.h> /* for exit */
    1.48 +#endif
    1.49 +
    /* Debug tracing is compiled out; the code below tests "#ifdef VERBOSE". */
    #define noVERBOSE

    /* LONG_LIGATURE is not defined here (note the "no" prefix).  When it is
     * defined, ligatures are counted with their long length (2 or 3
     * characters instead of 1 or 2) in the hyphenmin calculations, for
     * comparison with hyphenation without ligatures. */
    #define noLONG_LIGATURE

    /* Extra characters contributed by a two-letter (LIG_xx) and by a
     * three-letter (LIG_xxx) ligature. */
    #ifndef LONG_LIGATURE
    #define LIG_xx	0
    #define LIG_xxx	1
    #else
    #define LIG_xx	1
    #define LIG_xxx	2
    #endif
    1.63 +
    1.64 +#include "hnjalloc.h"
    1.65 +#include "hyphen.h"
    1.66 +
    /* Return a copy of the NUL-terminated string s in memory obtained from
     * hnj_malloc. */
    static char *
    hnj_strdup (const char *s)
    {
      int len = strlen (s);
      char *copy = hnj_malloc (len + 1);

      memcpy (copy, s, len);
      copy[len] = '\0';
      return copy;
    }
    1.79 +
    /* Remove one trailing line terminator ("\n", "\r" or "\r\n") from s, in
     * place.  A string that does not end in a line terminator is left
     * untouched; in particular a '\r' that is merely the second-to-last
     * character no longer truncates the string. */
    void hnj_strchomp(char * s)
    {
      size_t len = strlen(s);

      if (len == 0) return;

      if (s[len - 1] == '\n') {
        s[--len] = '\0';
        /* "\r\n": drop the carriage return that preceded the newline */
        if (len > 0 && s[len - 1] == '\r') s[len - 1] = '\0';
      } else if (s[len - 1] == '\r') {
        s[len - 1] = '\0';
      }
    }
    1.87 +
    /* A small chained hash table that maps strings to state numbers. */

    /* Number of buckets.  A cheap, but effective, hack. */
    #define HASH_SIZE 31627

    typedef struct _HashEntry HashEntry;
    typedef struct _HashTab HashTab;

    /* One key/value pair; entries that share a bucket are linked via next. */
    struct _HashEntry {
      HashEntry *next;
      char *key;
      int val;
    };

    /* The table itself: one list head per bucket. */
    struct _HashTab {
      HashEntry *entries[HASH_SIZE];
    };
   1.106 +
   /* A char* hash function from ASU, adapted from Gtk+.  The full hash
    * value is returned; callers reduce it modulo the table size. */
   static unsigned int
   hnj_string_hash (const char *s)
   {
     unsigned int hash = 0;

     while (*s != '\0')
       {
         unsigned int top;

         hash = (hash << 4) + *s++;
         /* fold the top four bits back into the low byte, then clear them */
         top = hash & 0xf0000000;
         if (top != 0)
           hash ^= (top >> 24) ^ top;
       }
     return hash;
   }
   1.122 +
   1.123 +static HashTab *
   1.124 +hnj_hash_new (void)
   1.125 +{
   1.126 +  HashTab *hashtab;
   1.127 +  int i;
   1.128 +
   1.129 +  hashtab = hnj_malloc (sizeof(HashTab));
   1.130 +  for (i = 0; i < HASH_SIZE; i++)
   1.131 +    hashtab->entries[i] = NULL;
   1.132 +
   1.133 +  return hashtab;
   1.134 +}
   1.135 +
   1.136 +static void
   1.137 +hnj_hash_free (HashTab *hashtab)
   1.138 +{
   1.139 +  int i;
   1.140 +  HashEntry *e, *next;
   1.141 +
   1.142 +  for (i = 0; i < HASH_SIZE; i++)
   1.143 +    for (e = hashtab->entries[i]; e; e = next)
   1.144 +      {
   1.145 +	next = e->next;
   1.146 +	hnj_free (e->key);
   1.147 +	hnj_free (e);
   1.148 +      }
   1.149 +
   1.150 +  hnj_free (hashtab);
   1.151 +}
   1.152 +
   1.153 +/* assumes that key is not already present! */
   1.154 +static void
   1.155 +hnj_hash_insert (HashTab *hashtab, const char *key, int val)
   1.156 +{
   1.157 +  int i;
   1.158 +  HashEntry *e;
   1.159 +
   1.160 +  i = hnj_string_hash (key) % HASH_SIZE;
   1.161 +  e = hnj_malloc (sizeof(HashEntry));
   1.162 +  e->next = hashtab->entries[i];
   1.163 +  e->key = hnj_strdup (key);
   1.164 +  e->val = val;
   1.165 +  hashtab->entries[i] = e;
   1.166 +}
   1.167 +
   1.168 +/* return val if found, otherwise -1 */
   1.169 +static int
   1.170 +hnj_hash_lookup (HashTab *hashtab, const char *key)
   1.171 +{
   1.172 +  int i;
   1.173 +  HashEntry *e;
   1.174 +  i = hnj_string_hash (key) % HASH_SIZE;
   1.175 +  for (e = hashtab->entries[i]; e; e = e->next)
   1.176 +    if (!strcmp (key, e->key))
   1.177 +      return e->val;
   1.178 +  return -1;
   1.179 +}
   1.180 +
   1.181 +/* Get the state number, allocating a new state if necessary. */
   1.182 +static int
   1.183 +hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
   1.184 +{
   1.185 +  int state_num;
   1.186 +
   1.187 +  state_num = hnj_hash_lookup (hashtab, string);
   1.188 +
   1.189 +  if (state_num >= 0)
   1.190 +    return state_num;
   1.191 +
   1.192 +  hnj_hash_insert (hashtab, string, dict->num_states);
   1.193 +  /* predicate is true if dict->num_states is a power of two */
   1.194 +  if (!(dict->num_states & (dict->num_states - 1)))
   1.195 +    {
   1.196 +      dict->states = hnj_realloc (dict->states,
   1.197 +				  (dict->num_states << 1) *
   1.198 +				  sizeof(HyphenState));
   1.199 +    }
   1.200 +  dict->states[dict->num_states].match = NULL;
   1.201 +  dict->states[dict->num_states].repl = NULL;
   1.202 +  dict->states[dict->num_states].fallback_state = -1;
   1.203 +  dict->states[dict->num_states].num_trans = 0;
   1.204 +  dict->states[dict->num_states].trans = NULL;
   1.205 +  return dict->num_states++;
   1.206 +}
   1.207 +
   1.208 +/* add a transition from state1 to state2 through ch - assumes that the
   1.209 +   transition does not already exist */
   1.210 +static void
   1.211 +hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
   1.212 +{
   1.213 +  int num_trans;
   1.214 +
   1.215 +  num_trans = dict->states[state1].num_trans;
   1.216 +  if (num_trans == 0)
   1.217 +    {
   1.218 +      dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
   1.219 +    }
   1.220 +  else if (!(num_trans & (num_trans - 1)))
   1.221 +    {
   1.222 +      dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
   1.223 +						(num_trans << 1) *
   1.224 +						sizeof(HyphenTrans));
   1.225 +    }
   1.226 +  dict->states[state1].trans[num_trans].ch = ch;
   1.227 +  dict->states[state1].trans[num_trans].new_state = state2;
   1.228 +  dict->states[state1].num_trans++;
   1.229 +}
   1.230 +
   #ifdef VERBOSE
   /* Hash tables kept for debug output, one per dictionary level.  Both
    * index 0 and index 1 are used by this file, so the array needs two
    * elements (it used to be declared with one). */
   HashTab *global[2];

   /* Debug helper: return the key string registered for state in the hash
    * table of the given level, or NULL when no entry has that value.  The
    * returned pointer refers to the table's own key; it is not a copy. */
   static char *
   get_state_str (int state, int level)
   {
     int i;
     HashEntry *e;

     for (i = 0; i < HASH_SIZE; i++)
       for (e = global[level]->entries[i]; e; e = e->next)
         if (e->val == state)
           return e->key;
     return NULL;
   }
   #endif
   1.247 +
   1.248 +void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
   1.249 +  int i, j;
   1.250 +  char word[MAX_CHARS];
   1.251 +  char pattern[MAX_CHARS];
   1.252 +  char * repl;
   1.253 +  signed char replindex;
   1.254 +  signed char replcut;
   1.255 +  int state_num = 0;
   1.256 +  int last_state;
   1.257 +  char ch;
   1.258 +  int found;
   1.259 +
   1.260 +	  if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
   1.261 +	    dict->lhmin = atoi(buf + 13);
   1.262 +	    return;
   1.263 +	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
   1.264 +	    dict->rhmin = atoi(buf + 14);
   1.265 +	    return;
   1.266 +	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
   1.267 +	    dict->clhmin = atoi(buf + 21);
   1.268 +	    return;
   1.269 +	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
   1.270 +	    dict->crhmin = atoi(buf + 22);
   1.271 +	    return;
   1.272 +	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
   1.273 +	    char * space = buf + 8;
   1.274 +	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
   1.275 +	    if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
   1.276 +	    if (dict->nohyphen) {
   1.277 +	        char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
   1.278 +	        *nhe = 0;
   1.279 +	        for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
   1.280 +	                if (*nhe == ',') {
   1.281 +	                    dict->nohyphenl++;
   1.282 +	                    *nhe = 0;
   1.283 +	                }
   1.284 +	        }
   1.285 +	    }
   1.286 +	    return;
   1.287 +	  } 
   1.288 +	  j = 0;
   1.289 +	  pattern[j] = '0';
   1.290 +          repl = strchr(buf, '/');
   1.291 +          replindex = 0;
   1.292 +          replcut = 0;
   1.293 +          if (repl) {
   1.294 +            char * index = strchr(repl + 1, ',');
   1.295 +            *repl = '\0';
   1.296 +            if (index) {
   1.297 +                char * index2 = strchr(index + 1, ',');
   1.298 +                *index = '\0';
   1.299 +                if (index2) {
   1.300 +                    *index2 = '\0';
   1.301 +                    replindex = (signed char) atoi(index + 1) - 1;
   1.302 +                    replcut = (signed char) atoi(index2 + 1);                
   1.303 +                }
   1.304 +            } else {
   1.305 +                hnj_strchomp(repl + 1);
   1.306 +                replindex = 0;
   1.307 +                replcut = (signed char) strlen(buf);
   1.308 +            }
   1.309 +            repl = hnj_strdup(repl + 1);
   1.310 +          }
   1.311 +	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
   1.312 +	    {
   1.313 +	      if (buf[i] >= '0' && buf[i] <= '9')
   1.314 +		pattern[j] = buf[i];
   1.315 +	      else
   1.316 +		{
   1.317 +		  word[j] = buf[i];
   1.318 +		  pattern[++j] = '0';
   1.319 +		}
   1.320 +	    }
   1.321 +	  word[j] = '\0';
   1.322 +	  pattern[j + 1] = '\0';
   1.323 +
   1.324 +          i = 0;
   1.325 +	  if (!repl) {
   1.326 +	    /* Optimize away leading zeroes */
   1.327 +            for (; pattern[i] == '0'; i++);
   1.328 +          } else {
   1.329 +            if (*word == '.') i++;
   1.330 +            /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
   1.331 +            if (dict->utf8) {
   1.332 +                int pu = -1;        /* unicode character position */
   1.333 +                int ps = -1;        /* unicode start position (original replindex) */
   1.334 +                int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
   1.335 +                for (; pc < (strlen(word) + 1); pc++) {
   1.336 +                /* beginning of an UTF-8 character (not '10' start bits) */
   1.337 +                    if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
   1.338 +                    if ((ps < 0) && (replindex == pu)) {
   1.339 +                        ps = replindex;
   1.340 +                        replindex = (signed char) pc;
   1.341 +                    }
   1.342 +                    if ((ps >= 0) && ((pu - ps) == replcut)) {
   1.343 +                        replcut = (signed char) (pc - replindex);
   1.344 +                        break;
   1.345 +                    }
   1.346 +                }
   1.347 +                if (*word == '.') replindex--;
   1.348 +            }
   1.349 +          }
   1.350 +
   1.351 +#ifdef VERBOSE
   1.352 +	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
   1.353 +#endif
   1.354 +	  found = hnj_hash_lookup (hashtab, word);
   1.355 +	  state_num = hnj_get_state (dict, hashtab, word);
   1.356 +	  dict->states[state_num].match = hnj_strdup (pattern + i);
   1.357 +	  dict->states[state_num].repl = repl;
   1.358 +	  dict->states[state_num].replindex = replindex;
   1.359 +          if (!replcut) {
   1.360 +            dict->states[state_num].replcut = (signed char) strlen(word);
   1.361 +          } else {
   1.362 +            dict->states[state_num].replcut = replcut;
   1.363 +          }
   1.364 +
   1.365 +	  /* now, put in the prefix transitions */
   1.366 +          for (; found < 0 ;j--)
   1.367 +	    {
   1.368 +	      last_state = state_num;
   1.369 +	      ch = word[j - 1];
   1.370 +	      word[j - 1] = '\0';
   1.371 +	      found = hnj_hash_lookup (hashtab, word);
   1.372 +	      state_num = hnj_get_state (dict, hashtab, word);
   1.373 +	      hnj_add_trans (dict, state_num, last_state, ch);
   1.374 +	    }
   1.375 +}
   1.376 +
   /*
    * Load a hyphenation dictionary from the file named fn.
    *
    * The first line of the file is read as the character set name.  The
    * following lines are passed to hnj_hyphen_load_line(), except lines that
    * start with '%', which are skipped.  A line starting with "NEXTLEVEL"
    * ends the first level; the lines after it form a second level.
    *
    * Returns NULL when the file cannot be opened.  Otherwise two dictionaries
    * are always built and linked through their nextlevel member:
    *  - with a NEXTLEVEL section, the first level is returned and the second
    *    level hangs off it;
    *  - without one, a built-in level (hyphen and apostrophe rules) is
    *    returned and the dictionary read from the file hangs off it.
    * hnj_hyphen_free() on the returned dictionary also frees the linked one.
    */
   1.377 +HyphenDict *
   1.378 +hnj_hyphen_load (const char *fn)
   1.379 +{
   1.380 +  HyphenDict *dict[2];
   1.381 +  HashTab *hashtab;
   1.382 +  FILE *f;
   1.383 +  char buf[MAX_CHARS];
   1.384 +  int nextlevel = 0;
   1.385 +  int i, j, k;
   1.386 +  HashEntry *e;
   1.387 +  int state_num = 0;
   1.388 +
   1.389 +  f = fopen (fn, "r");
   1.390 +  if (f == NULL)
   1.391 +    return NULL;
   1.392 +
   1.393 +// loading one or two dictionaries (separated by NEXTLEVEL keyword)
   1.394 +for (k = 0; k < 2; k++) { 
   1.395 +  hashtab = hnj_hash_new ();
   1.396 +#ifdef VERBOSE
   1.397 +  global[k] = hashtab;
   1.398 +#endif
          /* state 0 is the empty string: the root of the automaton */
   1.399 +  hnj_hash_insert (hashtab, "", 0);
   1.400 +  dict[k] = hnj_malloc (sizeof(HyphenDict));
   1.401 +  dict[k]->num_states = 1;
   1.402 +  dict[k]->states = hnj_malloc (sizeof(HyphenState));
   1.403 +  dict[k]->states[0].match = NULL;
   1.404 +  dict[k]->states[0].repl = NULL;
   1.405 +  dict[k]->states[0].fallback_state = -1;
   1.406 +  dict[k]->states[0].num_trans = 0;
   1.407 +  dict[k]->states[0].trans = NULL;
   1.408 +  dict[k]->nextlevel = NULL;
   1.409 +  dict[k]->lhmin = 0;
   1.410 +  dict[k]->rhmin = 0;
   1.411 +  dict[k]->clhmin = 0;
   1.412 +  dict[k]->crhmin = 0;
   1.413 +  dict[k]->nohyphen = NULL;
   1.414 +  dict[k]->nohyphenl = 0;
   1.415 +
   1.416 +  /* read in character set info */
          /* only level 0 reads it from the file; level 1 copies level 0 */
   1.417 +  if (k == 0) {
   1.418 +    for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
   1.419 +    if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
              /* cut the name at the first line-end character */
   1.420 +      for (i=0;i<MAX_NAME;i++)
   1.421 +        if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
   1.422 +          dict[k]->cset[i] = 0;
   1.423 +    } else {
   1.424 +      dict[k]->cset[0] = 0;
   1.425 +    }
   1.426 +    dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
   1.427 +  } else {
   1.428 +    strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
   1.429 +    dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
   1.430 +    dict[k]->utf8 = dict[0]->utf8;
   1.431 +  }
   1.432 +
          /* level 0 always comes from the file; level 1 does so only when a
             NEXTLEVEL line was seen, otherwise it gets the built-in rules */
   1.433 +  if (k == 0 || nextlevel) {
   1.434 +    while (fgets (buf, sizeof(buf), f) != NULL) {
   1.435 +      if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
   1.436 +	nextlevel = 1;
   1.437 +	break;
   1.438 +      } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
   1.439 +    }
   1.440 +  } else if (k == 1) {
   1.441 +    /* default first level: hyphen and ASCII apostrophe */
   1.442 +    if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
   1.443 +    else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
   1.444 +    strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
   1.445 +    buf[MAX_CHARS-1] = '\0';
   1.446 +    hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
   1.447 +    hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
   1.448 +    if (dict[0]->utf8) {
   1.449 +      hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
   1.450 +      hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
   1.451 +    }
   1.452 +  }
   1.453 +
   1.454 +  /* Could do unioning of matches here (instead of the preprocessor script).
   1.455 +     If we did, the pseudocode would look something like this:
   1.456 +
   1.457 +     foreach state in the hash table
   1.458 +        foreach i = [1..length(state) - 1]
   1.459 +           state to check is substr (state, i)
   1.460 +           look it up
   1.461 +           if found, and if there is a match, union the match in.
   1.462 +
   1.463 +     It's also possible to avoid the quadratic blowup by doing the
   1.464 +     search in order of increasing state string sizes - then you
   1.465 +     can break the loop after finding the first match.
   1.466 +
   1.467 +     This step should be optional in any case - if there is a
   1.468 +     preprocessed rule table, it's always faster to use that.
   1.469 +
   1.470 +*/
   1.471 +
   1.472 +  /* put in the fallback states */
          /* The fallback of a state is the state of the longest proper suffix
             of its string that is in the table.  The search always ends
             because the empty string (state 0) was inserted above.  For the
             empty key the inner loop is skipped and state_num keeps its
             previous value, but that value is not stored since e->val is 0. */
   1.473 +  for (i = 0; i < HASH_SIZE; i++)
   1.474 +    for (e = hashtab->entries[i]; e; e = e->next)
   1.475 +      {
   1.476 +	if (*(e->key)) for (j = 1; 1; j++)
   1.477 +	  {          
   1.478 +	    state_num = hnj_hash_lookup (hashtab, e->key + j);
   1.479 +	    if (state_num >= 0)
   1.480 +	      break;
   1.481 +	  }
   1.482 +        /* KBH: FIXME state 0 fallback_state should always be -1? */
   1.483 +	if (e->val)
   1.484 +	  dict[k]->states[e->val].fallback_state = state_num;
   1.485 +      }
   1.486 +#ifdef VERBOSE
   1.487 +  for (i = 0; i < HASH_SIZE; i++)
   1.488 +    for (e = hashtab->entries[i]; e; e = e->next)
   1.489 +      {
   1.490 +	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
   1.491 +		dict[k]->states[e->val].fallback_state);
   1.492 +	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
   1.493 +	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
   1.494 +		  dict[k]->states[e->val].trans[j].new_state);
   1.495 +      }
   1.496 +#endif
   1.497 +
          /* the table is only needed while loading; VERBOSE builds keep it
             (in global[]) for the debug output */
   1.498 +#ifndef VERBOSE
   1.499 +  hnj_hash_free (hashtab);
   1.500 +#endif
   1.501 +  state_num = 0;
   1.502 +}
   1.503 +  fclose(f);
   1.504 +  if (nextlevel) dict[0]->nextlevel = dict[1];
   1.505 +  else {
            /* No NEXTLEVEL section: the built-in level becomes the entry point.
               It takes over the hyphenmin values of the file's dictionary; its
               compound minimums fall back to lhmin/rhmin and then to 3. */
   1.506 +    dict[1] -> nextlevel = dict[0];
   1.507 +    dict[1]->lhmin = dict[0]->lhmin;
   1.508 +    dict[1]->rhmin = dict[0]->rhmin;
   1.509 +    dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
   1.510 +    dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
   1.511 +#ifdef VERBOSE
   1.512 +    HashTab *r = global[0];
   1.513 +    global[0] = global[1];
   1.514 +    global[1] = r;
   1.515 +#endif
   1.516 +    return dict[1];
   1.517 +  }
   1.518 +  return dict[0];
   1.519 +}
   1.520 +
   1.521 +void hnj_hyphen_free (HyphenDict *dict)
   1.522 +{
   1.523 +  int state_num;
   1.524 +  HyphenState *hstate;
   1.525 +
   1.526 +  for (state_num = 0; state_num < dict->num_states; state_num++)
   1.527 +    {
   1.528 +      hstate = &dict->states[state_num];
   1.529 +      if (hstate->match)
   1.530 +	hnj_free (hstate->match);
   1.531 +      if (hstate->repl)
   1.532 +	hnj_free (hstate->repl);
   1.533 +      if (hstate->trans)
   1.534 +	hnj_free (hstate->trans);
   1.535 +    }
   1.536 +  if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
   1.537 +
   1.538 +  if (dict->nohyphen) hnj_free(dict->nohyphen);
   1.539 +
   1.540 +  hnj_free (dict->states);
   1.541 +
   1.542 +  hnj_free (dict);
   1.543 +}
   1.544 +
   1.545 +#define MAX_WORD 256
   1.546 +
   /*
    * Run the pattern automaton of dict over word (word_size bytes) and store
    * the resulting per-position values in hyphens.  States that carry a
    * replacement string (repl) are ignored by this function.
    *
    * hyphens is first filled with '0' over word_size + 5 bytes, so the caller
    * must provide at least that much room; on return it holds word_size
    * characters followed by a NUL.  Each character is '0' or the largest
    * value that any matching pattern supplied for that position.
    * NOTE(review): odd values presumably mark permitted break points (see
    * the disabled "& 1" code near the end) - confirm against the callers.
    *
    * Always returns 0.
    */
   1.547 +int hnj_hyphen_hyphenate (HyphenDict *dict,
   1.548 +			   const char *word, int word_size,
   1.549 +			   char *hyphens)
   1.550 +{
   1.551 +  char *prep_word;
   1.552 +  int i, j, k;
   1.553 +  int state;
   1.554 +  char ch;
   1.555 +  HyphenState *hstate;
   1.556 +  char *match;
   1.557 +  int offset;
   1.558 +
          /* room for the word, a '.' on each side and the terminating NUL */
   1.559 +  prep_word = hnj_malloc (word_size + 3);
   1.560 +
   1.561 +  j = 0;
   1.562 +  prep_word[j++] = '.';
   1.563 +
          /* copy the word, replacing every ASCII digit with '.' */
   1.564 +  for (i = 0; i < word_size; i++) {
   1.565 +    if (word[i] <= '9' && word[i] >= '0') {
   1.566 +      prep_word[j++] = '.';
   1.567 +    } else {
   1.568 +      prep_word[j++] = word[i];
   1.569 +    }
   1.570 +  }
   1.571 +
   1.572 +  prep_word[j++] = '.';
   1.573 +  prep_word[j] = '\0';
   1.574 +
   1.575 +  for (i = 0; i < word_size + 5; i++)
   1.576 +    hyphens[i] = '0';
   1.577 +
   1.578 +#ifdef VERBOSE
   1.579 +  printf ("prep_word = %s\n", prep_word);
   1.580 +#endif
   1.581 +
   1.582 +  /* now, run the finite state machine */
   1.583 +  state = 0;
   1.584 +  for (i = 0; i < j; i++)
   1.585 +    {
   1.586 +      ch = prep_word[i];
              /* follow fallback links until a state with a transition on ch is
                 found, or until the fallback chain ends (state == -1) */
   1.587 +      for (;;)
   1.588 +	{
   1.589 +
   1.590 +	  if (state == -1) {
   1.591 +            /* return 1; */
   1.592 +	    /*  KBH: FIXME shouldn't this be as follows? */
   1.593 +            state = 0;
   1.594 +            goto try_next_letter;
   1.595 +          }          
   1.596 +
   1.597 +#ifdef VERBOSE
   1.598 +	  char *state_str;
   1.599 +	  state_str = get_state_str (state, 0);
   1.600 +
   1.601 +	  for (k = 0; k < i - strlen (state_str); k++)
   1.602 +	    putchar (' ');
   1.603 +	  printf ("%s", state_str);
   1.604 +#endif
   1.605 +
   1.606 +	  hstate = &dict->states[state];
   1.607 +	  for (k = 0; k < hstate->num_trans; k++)
   1.608 +	    if (hstate->trans[k].ch == ch)
   1.609 +	      {
   1.610 +		state = hstate->trans[k].new_state;
   1.611 +		goto found_state;
   1.612 +	      }
   1.613 +	  state = hstate->fallback_state;
   1.614 +#ifdef VERBOSE
   1.615 +	  printf (" falling back, fallback_state %d\n", state);
   1.616 +#endif
   1.617 +	}
   1.618 +    found_state:
   1.619 +#ifdef VERBOSE
   1.620 +      printf ("found state %d\n",state);
   1.621 +#endif
   1.622 +      /* Additional optimization is possible here - especially,
   1.623 +	 elimination of trailing zeroes from the match. Leading zeroes
   1.624 +	 have already been optimized. */
   1.625 +      match = dict->states[state].match;
   1.626 +      /* replacing rules not handled by hyphen_hyphenate() */
   1.627 +      if (match && !dict->states[state].repl)
   1.628 +	{
                  /* the match string is aligned so that its last character
                     falls on position i of hyphens */
   1.629 +	  offset = i + 1 - strlen (match);
   1.630 +#ifdef VERBOSE
   1.631 +	  for (k = 0; k < offset; k++)
   1.632 +	    putchar (' ');
   1.633 +	  printf ("%s\n", match);
   1.634 +#endif
   1.635 +	  /* This is a linear search because I tried a binary search and
   1.636 +	     found it to be just a teeny bit slower. */
                  /* keep the larger value at every position */
   1.637 +	  for (k = 0; match[k]; k++)
   1.638 +	    if (hyphens[offset + k] < match[k])
   1.639 +	      hyphens[offset + k] = match[k];
   1.640 +	}
   1.641 +
   1.642 +      /* KBH: we need this to make sure we keep looking in a word */
   1.643 +      /* for patterns even if the current character is not known in state 0 */
   1.644 +      /* since patterns for hyphenation may occur anywhere in the word */
   1.645 +      try_next_letter: ;
   1.646 +
   1.647 +    }
   1.648 +#ifdef VERBOSE
   1.649 +  for (i = 0; i < j; i++)
   1.650 +    putchar (hyphens[i]);
   1.651 +  putchar ('\n');
   1.652 +#endif
   1.653 +
          /* Shift the values down by one position; j is word_size + 2 here, so
             word_size - 2 values are moved.  Position 0 is then reset to '0',
             and so are the positions from word_size - 2 up to the NUL. */
   1.654 +  for (i = 0; i < j - 4; i++)
   1.655 +#if 0
   1.656 +    if (hyphens[i + 1] & 1)
   1.657 +      hyphens[i] = '-';
   1.658 +#else
   1.659 +    hyphens[i] = hyphens[i + 1];
   1.660 +#endif
   1.661 +  hyphens[0] = '0';
   1.662 +  for (; i < word_size; i++)
   1.663 +    hyphens[i] = '0';
   1.664 +  hyphens[word_size] = '\0';
   1.665 +
   1.666 +  hnj_free (prep_word);
   1.667 +    
   1.668 +  return 0;    
   1.669 +}
   1.670 +
   1.671 +/* Unicode ligature length */
   1.672 +int hnj_ligature(unsigned char c) {
   1.673 +    switch (c) {
   1.674 +        case 0x80:			/* ff */
   1.675 +        case 0x81:			/* fi */
   1.676 +        case 0x82: return LIG_xx;	/* fl */
   1.677 +        case 0x83:			/* ffi */
   1.678 +        case 0x84: return LIG_xxx;	/* ffl */
   1.679 +        case 0x85:			/* long st */
   1.680 +        case 0x86: return LIG_xx;	/* st */
   1.681 +    }
   1.682 +    return 0;
   1.683 +}
   1.684 +
   /* Character length of the first n bytes of word (stopping early at a
    * NUL).  With utf8 set, continuation bytes (10xxxxxx) are not counted on
    * their own and a ligature adds the value given by hnj_ligature(). */
   int hnj_hyphen_strnlen(const char * word, int n, int utf8)
   {
       int chars = 0;
       int at = 0;

       while (at < n && word[at] != '\0') {
         chars++;
         /* Unicode ligature support */
         if (utf8 && ((unsigned char) word[at] == 0xEF) &&
             ((unsigned char) word[at + 1] == 0xAC)) {
           chars += hnj_ligature(word[at + 2]);
         }
         /* step over the lead byte and, in UTF-8, its continuation bytes */
         at++;
         if (utf8) {
           while ((word[at] & 0xc0) == 0x80) at++;
         }
       }
       return chars;
   }
   1.700 +
   /* Clear the hyphenation points that would leave fewer than lhmin
    * characters at the start of word.  Leading ASCII digits are not counted.
    * Where a replacement is recorded in (*rep)[j], it is dropped (freed and
    * set to NULL) only if it contains '=' and the part of the word before
    * it plus the text before '=' is shorter than lhmin.  Always returns 0. */
   int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
   	char *** rep, int ** pos, int ** cut, int lhmin)
   {
       int count = 1;	/* characters counted so far */
       int at = 0;	/* byte position in word */
       int digits;

       /* Unicode ligature support */
       if (utf8 && ((unsigned char) word[0] == 0xEF) &&
           ((unsigned char) word[1] == 0xAC)) {
         count += hnj_ligature(word[2]);
       }

       /* ignore numbers */
       for (digits = 0; word[digits] >= '0' && word[digits] <= '9'; digits++)
         count--;

       while (count < lhmin && word[at] != '\0') {
         /* one pass of the inner loop per byte of the current character */
         do {
           if (*rep && *pos && *cut && (*rep)[at]) {
             /* check length of the non-standard part */
             char * eq = strchr((*rep)[at], '=');
             if (eq && (hnj_hyphen_strnlen(word, at - (*pos)[at] + 1, utf8) +
                 hnj_hyphen_strnlen((*rep)[at], eq - (*rep)[at], utf8)) < lhmin) {
               free((*rep)[at]);
               (*rep)[at] = NULL;
               hyphens[at] = '0';
             }
           } else {
             hyphens[at] = '0';
           }
           at++;

           /* Unicode ligature support */
           if (utf8 && ((unsigned char) word[at] == 0xEF) &&
               ((unsigned char) word[at + 1] == 0xAC)) {
             count += hnj_ligature(word[at + 2]);
           }
         } while (utf8 && (word[at] & 0xc0) == 0x80);
         count++;
       }
       return 0;
   }
   1.736 +
   /* Clear the hyphenation points that would leave fewer than rhmin
    * characters at the end of word, scanning backwards from the last byte
    * (position 0 is never visited).  Trailing ASCII digits are not counted.
    * Where a replacement is recorded in (*rep)[j], it is dropped (freed and
    * set to NULL) only if it contains '=' and the rest of the word after
    * the cut plus the text after '=' is shorter than rhmin.  Always
    * returns 0. */
   int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
   	char *** rep, int ** pos, int ** cut, int rhmin)
   {
       int count = 0;	/* characters counted so far */
       int at;

       /* ignore numbers */
       for (at = word_size - 1; at > 0 && word[at] >= '0' && word[at] <= '9'; at--)
         count--;

       at = word_size - 1;
       while (count < rhmin && at > 0) {
         if (*rep && *pos && *cut && (*rep)[at]) {
           /* check length of the non-standard part */
           char * eq = strchr((*rep)[at], '=');
           if (eq) {
             const char * tail = word + at - (*pos)[at] + (*cut)[at] + 1;
             if ((hnj_hyphen_strnlen(tail, 100, utf8) +
                  hnj_hyphen_strnlen(eq + 1, strlen(eq + 1), utf8)) < rhmin) {
               free((*rep)[at]);
               (*rep)[at] = NULL;
               hyphens[at] = '0';
             }
           }
         } else {
           hyphens[at] = '0';
         }
         /* in UTF-8, count a character only at its first byte */
         if (!utf8 || (word[at] & 0xc0) == 0xc0 || (word[at] & 0x80) != 0x80)
           count++;
         at--;
       }
       return 0;
   }
   1.763 +
   1.764 +// recursive function for compound level hyphenation
   1.765 +int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
   1.766 +    char * hyphens, char *** rep, int ** pos, int ** cut,
   1.767 +    int clhmin, int crhmin, int lend, int rend)
   1.768 +{
   1.769 +  char *prep_word;
   1.770 +  int i, j, k;
   1.771 +  int state;
   1.772 +  char ch;
   1.773 +  HyphenState *hstate;
   1.774 +  char *match;
   1.775 +  char *repl;
   1.776 +  signed char replindex;
   1.777 +  signed char replcut;
   1.778 +  int offset;
   1.779 +  int * matchlen;
   1.780 +  int * matchindex;
   1.781 +  char ** matchrepl;  
   1.782 +  int isrepl = 0;
   1.783 +  int nHyphCount;
   1.784 +
   1.785 +  size_t prep_word_size = word_size + 3;
   1.786 +  prep_word = hnj_malloc (prep_word_size);
   1.787 +  matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
   1.788 +  matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
   1.789 +  matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
   1.790 +
   1.791 +  j = 0;
   1.792 +  prep_word[j++] = '.';
   1.793 +  
   1.794 +  for (i = 0; i < word_size; i++) {
   1.795 +    if (word[i] <= '9' && word[i] >= '0') {
   1.796 +      prep_word[j++] = '.';
   1.797 +    } else {
   1.798 +      prep_word[j++] = word[i];
   1.799 +    }
   1.800 +  }
   1.801 +
   1.802 +
   1.803 +
   1.804 +  prep_word[j++] = '.';
   1.805 +  prep_word[j] = '\0';
   1.806 +
   1.807 +  for (i = 0; i < j; i++)
   1.808 +    hyphens[i] = '0';    
   1.809 +
   1.810 +#ifdef VERBOSE
   1.811 +  printf ("prep_word = %s\n", prep_word);
   1.812 +#endif
   1.813 +
   1.814 +  /* now, run the finite state machine */
   1.815 +  state = 0;
   1.816 +  for (i = 0; i < j; i++)
   1.817 +    {
   1.818 +      ch = prep_word[i];
   1.819 +      for (;;)
   1.820 +	{
   1.821 +
   1.822 +	  if (state == -1) {
   1.823 +            /* return 1; */
   1.824 +	    /*  KBH: FIXME shouldn't this be as follows? */
   1.825 +            state = 0;
   1.826 +            goto try_next_letter;
   1.827 +          }          
   1.828 +
   1.829 +#ifdef VERBOSE
   1.830 +	  char *state_str;
   1.831 +	  state_str = get_state_str (state, 1);
   1.832 +
   1.833 +	  for (k = 0; k < i - strlen (state_str); k++)
   1.834 +	    putchar (' ');
   1.835 +	  printf ("%s", state_str);
   1.836 +#endif
   1.837 +
   1.838 +	  hstate = &dict->states[state];
   1.839 +	  for (k = 0; k < hstate->num_trans; k++)
   1.840 +	    if (hstate->trans[k].ch == ch)
   1.841 +	      {
   1.842 +		state = hstate->trans[k].new_state;
   1.843 +		goto found_state;
   1.844 +	      }
   1.845 +	  state = hstate->fallback_state;
   1.846 +#ifdef VERBOSE
   1.847 +	  printf (" falling back, fallback_state %d\n", state);
   1.848 +#endif
   1.849 +	}
   1.850 +    found_state:
   1.851 +#ifdef VERBOSE
   1.852 +      printf ("found state %d\n",state);
   1.853 +#endif
   1.854 +      /* Additional optimization is possible here - especially,
   1.855 +	 elimination of trailing zeroes from the match. Leading zeroes
   1.856 +	 have already been optimized. */
   1.857 +      match = dict->states[state].match;
   1.858 +      repl = dict->states[state].repl;
   1.859 +      replindex = dict->states[state].replindex;
   1.860 +      replcut = dict->states[state].replcut;
   1.861 +      /* replacing rules not handled by hyphen_hyphenate() */
   1.862 +      if (match)
   1.863 +	{
   1.864 +	  offset = i + 1 - strlen (match);
   1.865 +#ifdef VERBOSE
   1.866 +	  for (k = 0; k < offset; k++)
   1.867 +	    putchar (' ');
   1.868 +	  printf ("%s (%s)\n", match, repl);
   1.869 +#endif
   1.870 +          if (repl) {
   1.871 +            if (!isrepl) for(; isrepl < word_size; isrepl++) {
   1.872 +                matchrepl[isrepl] = NULL;
   1.873 +                matchindex[isrepl] = -1;
   1.874 +            }
   1.875 +            matchlen[offset + replindex] = replcut;
   1.876 +          }
   1.877 +	  /* This is a linear search because I tried a binary search and
   1.878 +	     found it to be just a teeny bit slower. */
   1.879 +	  for (k = 0; match[k]; k++) {
   1.880 +	    if ((hyphens[offset + k] < match[k])) {
   1.881 +	      hyphens[offset + k] = match[k];
   1.882 +              if (match[k]&1) {
   1.883 +                matchrepl[offset + k] = repl;
   1.884 +                if (repl && (k >= replindex) && (k <= replindex + replcut)) {
   1.885 +                    matchindex[offset + replindex] = offset + k;
   1.886 +                }
   1.887 +              }
   1.888 +            }
   1.889 +          }
   1.890 +          
   1.891 +	}
   1.892 +
   1.893 +      /* KBH: we need this to make sure we keep looking in a word */
   1.894 +      /* for patterns even if the current character is not known in state 0 */
   1.895 +      /* since patterns for hyphenation may occur anywhere in the word */
   1.896 +      try_next_letter: ;
   1.897 +
   1.898 +    }
   1.899 +#ifdef VERBOSE
   1.900 +  for (i = 0; i < j; i++)
   1.901 +    putchar (hyphens[i]);
   1.902 +  putchar ('\n');
   1.903 +#endif
   1.904 +
   1.905 +  for (i = 0; i < j - 3; i++)
   1.906 +#if 0
   1.907 +    if (hyphens[i + 1] & 1)
   1.908 +      hyphens[i] = '-';
   1.909 +#else
   1.910 +    hyphens[i] = hyphens[i + 1];
   1.911 +#endif
   1.912 +  for (; i < word_size; i++)
   1.913 +    hyphens[i] = '0';
   1.914 +  hyphens[word_size] = '\0';
   1.915 +
   1.916 +       /* now create a new char string showing hyphenation positions */
   1.917 +       /* count the hyphens and allocate space for the new hyphenated string */
   1.918 +       nHyphCount = 0;
   1.919 +       for (i = 0; i < word_size; i++)
   1.920 +          if (hyphens[i]&1)
   1.921 +             nHyphCount++;
   1.922 +       j = 0;
   1.923 +       for (i = 0; i < word_size; i++) {
   1.924 +           if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 
   1.925 +                if (rep && pos && cut) {
   1.926 +                    if (!*rep)
   1.927 +                        *rep = (char **) calloc(word_size, sizeof(char *));
   1.928 +                    if (!*pos)
   1.929 +                        *pos = (int *) calloc(word_size, sizeof(int));
   1.930 +                    if (!*cut) {
   1.931 +                        *cut = (int *) calloc(word_size, sizeof(int));
   1.932 +                    }
   1.933 +                    (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
   1.934 +                    (*pos)[matchindex[i] - 1] = matchindex[i] - i;
   1.935 +                    (*cut)[matchindex[i] - 1] = matchlen[i];
   1.936 +                }
   1.937 +                j += strlen(matchrepl[matchindex[i]]);
   1.938 +                i += matchlen[i] - 1;
   1.939 +          }
   1.940 +       }
   1.941 +
   1.942 +  hnj_free (matchrepl);
   1.943 +  hnj_free (matchlen);
   1.944 +  hnj_free (matchindex);
   1.945 +
   1.946 +  // recursive hyphenation of the first (compound) level segments
   1.947 +  if (dict->nextlevel) {
   1.948 +     char ** rep2;
   1.949 +     int * pos2;
   1.950 +     int * cut2;
   1.951 +     char * hyphens2;
   1.952 +     int begin = 0;
   1.953 +
   1.954 +     rep2 = hnj_malloc (word_size * sizeof(char *));
   1.955 +     pos2 = hnj_malloc (word_size * sizeof(int));
   1.956 +     cut2 = hnj_malloc (word_size * sizeof(int));
   1.957 +     hyphens2 = hnj_malloc (word_size + 3);
   1.958 +     for (i = 0; i < word_size; i++) rep2[i] = NULL;
   1.959 +     for (i = 0; i < word_size; i++) if 
   1.960 +        (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
   1.961 +        if (i - begin > 1) {
   1.962 +            int hyph = 0;
   1.963 +            prep_word[i + 2] = '\0';
   1.964 +            /* non-standard hyphenation at compound boundary (Schiffahrt) */
   1.965 +            if (rep && *rep && *pos && *cut && (*rep)[i]) {
   1.966 +                char * l = strchr((*rep)[i], '=');
   1.967 +                size_t offset = 2 + i - (*pos)[i];
   1.968 +                strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
   1.969 +                prep_word[prep_word_size - 1] = '\0';
   1.970 +                if (l) {
   1.971 +                    hyph = (l - (*rep)[i]) - (*pos)[i];
   1.972 +                    prep_word[2 + i + hyph] = '\0';
   1.973 +                }
   1.974 +            }
   1.975 +            hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
   1.976 +                hyphens2, &rep2, &pos2, &cut2, clhmin,
   1.977 +                crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
   1.978 +            for (j = 0; j < i - begin - 1; j++) {
   1.979 +                hyphens[begin + j] = hyphens2[j];
   1.980 +                if (rep2[j] && rep && pos && cut) {
   1.981 +                    if (!*rep && !*pos && !*cut) {
   1.982 +                        int k;
   1.983 +                        *rep = (char **) malloc(sizeof(char *) * word_size);
   1.984 +                        *pos = (int *) malloc(sizeof(int) * word_size);
   1.985 +                        *cut = (int *) malloc(sizeof(int) * word_size);
   1.986 +                        for (k = 0; k < word_size; k++) {
   1.987 +                            (*rep)[k] = NULL;
   1.988 +                            (*pos)[k] = 0;
   1.989 +                            (*cut)[k] = 0;
   1.990 +                        }
   1.991 +                    }
   1.992 +                    (*rep)[begin + j] = rep2[j];
   1.993 +                    (*pos)[begin + j] = pos2[j];
   1.994 +                    (*cut)[begin + j] = cut2[j];
   1.995 +                }
   1.996 +            }
   1.997 +            prep_word[i + 2] = word[i + 1];
   1.998 +            if (*rep && *pos && *cut && (*rep)[i]) {
   1.999 +                size_t offset = 1;
  1.1000 +                strncpy(prep_word + offset, word, prep_word_size - offset - 1);
  1.1001 +                prep_word[prep_word_size - 1] = '\0';
  1.1002 +            }
  1.1003 +        }
  1.1004 +        begin = i + 1;
  1.1005 +        for (j = 0; j < word_size; j++) rep2[j] = NULL;
  1.1006 +     }
  1.1007 +     
  1.1008 +     // non-compound
  1.1009 +     if (begin == 0) {
  1.1010 +        hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
  1.1011 +            hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
  1.1012 +        if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  1.1013 +            rep, pos, cut, clhmin);
  1.1014 +        if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  1.1015 +            rep, pos, cut, crhmin);
  1.1016 +     }
  1.1017 +     
  1.1018 +     free(rep2);
  1.1019 +     free(cut2);
  1.1020 +     free(pos2);
  1.1021 +     free(hyphens2);
  1.1022 +  }
  1.1023 +
  1.1024 +  hnj_free (prep_word);
  1.1025 +  return 0;
  1.1026 +}
  1.1027 +
  1.1028 +/* UTF-8 normalization of hyphen and non-standard positions */
  1.1029 +int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
  1.1030 +	char *** rep, int ** pos, int ** cut)
  1.1031 +{
  1.1032 +  int i, j, k;
  1.1033 +  if ((((unsigned char) word[0]) >> 6) == 2) {
  1.1034 +    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
  1.1035 +    return 1;
  1.1036 +  }
  1.1037 +
  1.1038 +  /* calculate UTF-8 character positions */
  1.1039 +  for (i = 0, j = -1; i < word_size; i++) {
  1.1040 +    /* beginning of an UTF-8 character (not '10' start bits) */
  1.1041 +    if ((((unsigned char) word[i]) >> 6) != 2) j++;
  1.1042 +    hyphens[j] = hyphens[i];
  1.1043 +    if (rep && pos && cut && *rep && *pos && *cut) {
  1.1044 +        int l = (*pos)[i];
  1.1045 +        (*pos)[j] = 0;
  1.1046 +        for (k = 0; k < l; k++) {
  1.1047 +            if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
  1.1048 +        }
  1.1049 +        k = i - l + 1;
  1.1050 +        l = k + (*cut)[i];
  1.1051 +        (*cut)[j] = 0;        
  1.1052 +        for (; k < l; k++) {
  1.1053 +            if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
  1.1054 +        }
  1.1055 +        (*rep)[j] = (*rep)[i];
  1.1056 +        if (j < i) {
  1.1057 +            (*rep)[i] = NULL;
  1.1058 +            (*pos)[i] = 0;
  1.1059 +            (*cut)[i] = 0;
  1.1060 +        }
  1.1061 +    }
  1.1062 +  }
  1.1063 +  hyphens[j + 1] = '\0';
  1.1064 +#ifdef VERBOSE
  1.1065 +  printf ("nums: %s\n", hyphens);
  1.1066 +#endif
  1.1067 +  return 0;
  1.1068 +}
  1.1069 +
  1.1070 +/* get the word with all possible hyphenations (output: hyphword) */
  1.1071 +void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
  1.1072 +    char * hyphword, char *** rep, int ** pos, int ** cut)
  1.1073 +{
  1.1074 +  int hyphenslen = l + 5;
  1.1075 +
  1.1076 +  int i, j;
  1.1077 +  for (i = 0, j = 0; i < l; i++, j++) {
  1.1078 +    if (hyphens[i]&1) {
  1.1079 +      hyphword[j] = word[i];
  1.1080 +      if (*rep && *pos && *cut && (*rep)[i]) {
  1.1081 +        size_t offset = j - (*pos)[i] + 1;
  1.1082 +        strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
  1.1083 +        hyphword[hyphenslen-1] = '\0';
  1.1084 +        j += strlen((*rep)[i]) - (*pos)[i];
  1.1085 +        i += (*cut)[i] - (*pos)[i];
  1.1086 +      } else hyphword[++j] = '=';
  1.1087 +    } else hyphword[j] = word[i];
  1.1088 +  }
  1.1089 +  hyphword[j] = '\0';
  1.1090 +}
  1.1091 +
  1.1092 +
  1.1093 +/* main api function with default hyphenmin parameters */
  1.1094 +int hnj_hyphen_hyphenate2 (HyphenDict *dict,
  1.1095 +			   const char *word, int word_size, char * hyphens,
  1.1096 +			   char *hyphword, char *** rep, int ** pos, int ** cut)
  1.1097 +{
  1.1098 +  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  1.1099 +    dict->clhmin, dict->crhmin, 1, 1);
  1.1100 +  hnj_hyphen_lhmin(dict->utf8, word, word_size,
  1.1101 +    hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
  1.1102 +  hnj_hyphen_rhmin(dict->utf8, word, word_size,
  1.1103 +    hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
  1.1104 +
  1.1105 +  /* nohyphen */
  1.1106 +  if (dict->nohyphen) {
  1.1107 +    char * nh = dict->nohyphen;
  1.1108 +    int nhi;
  1.1109 +    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
  1.1110 +        char * nhy = (char *) strstr(word, nh);
  1.1111 +        while (nhy) {
  1.1112 +            hyphens[nhy - word + strlen(nh) - 1] = '0';
  1.1113 +            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
  1.1114 +            nhy = (char *) strstr(nhy + 1, nh);
  1.1115 +        }
  1.1116 +        nh = nh + strlen(nh) + 1;
  1.1117 +    }
  1.1118 +  }
  1.1119 +
  1.1120 +  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  1.1121 +  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  1.1122 +#ifdef VERBOSE
  1.1123 +  printf ("nums: %s\n", hyphens);
  1.1124 +#endif
  1.1125 +  return 0;
  1.1126 +}
  1.1127 +
  1.1128 +/* previous main api function with hyphenmin parameters */
  1.1129 +int hnj_hyphen_hyphenate3 (HyphenDict *dict,
  1.1130 +	const char *word, int word_size, char * hyphens,
  1.1131 +	char *hyphword, char *** rep, int ** pos, int ** cut,
  1.1132 +	int lhmin, int rhmin, int clhmin, int crhmin)
  1.1133 +{
  1.1134 +  lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
  1.1135 +  rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
  1.1136 +  clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
  1.1137 +  crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
  1.1138 +  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  1.1139 +    clhmin, crhmin, 1, 1);
  1.1140 +  hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  1.1141 +    rep, pos, cut, (lhmin > 0 ? lhmin : 2));
  1.1142 +  hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  1.1143 +    rep, pos, cut, (rhmin > 0 ? rhmin : 2));
  1.1144 +  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  1.1145 +
  1.1146 +  /* nohyphen */
  1.1147 +  if (dict->nohyphen) {
  1.1148 +    char * nh = dict->nohyphen;
  1.1149 +    int nhi;
  1.1150 +    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
  1.1151 +        char * nhy = (char *) strstr(word, nh);
  1.1152 +        while (nhy) {
  1.1153 +            hyphens[nhy - word + strlen(nh) - 1] = 0;
  1.1154 +            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
  1.1155 +            nhy = (char *) strstr(nhy + 1, nh);
  1.1156 +        }
  1.1157 +        nh = nh + strlen(nh) + 1;
  1.1158 +    }
  1.1159 +  }
  1.1160 +
  1.1161 +  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  1.1162 +  return 0;
  1.1163 +}

mercurial