michael@0: /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
michael@0:  * licenses follows.
michael@0:  */
michael@0: 
michael@0: /* LibHnj - a library for high quality hyphenation and justification
michael@0:  * Copyright (C) 1998 Raph Levien, 
michael@0:  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
michael@0:  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
michael@0:  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
michael@0:  *
michael@0:  * This library is free software; you can redistribute it and/or
michael@0:  * modify it under the terms of the GNU Library General Public
michael@0:  * License as published by the Free Software Foundation; either
michael@0:  * version 2 of the License, or (at your option) any later version.
michael@0:  *
michael@0:  * This library is distributed in the hope that it will be useful,
michael@0:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
michael@0:  * Library General Public License for more details.
michael@0:  *
michael@0:  * You should have received a copy of the GNU Library General Public
michael@0:  * License along with this library; if not, write to the 
michael@0:  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
michael@0:  * Boston, MA  02111-1307  USA.
michael@0: */
michael@0: 
michael@0: /*
michael@0:  * The contents of this file are subject to the Mozilla Public License
michael@0:  * Version 1.0 (the "MPL"); you may not use this file except in
michael@0:  * compliance with the MPL.  You may obtain a copy of the MPL at
michael@0:  * http://www.mozilla.org/MPL/
michael@0:  *
michael@0:  * Software distributed under the MPL is distributed on an "AS IS" basis,
michael@0:  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
michael@0:  * for the specific language governing rights and limitations under the
michael@0:  * MPL.
michael@0:  *
michael@0:  */
michael@0: #include <stdlib.h> /* for NULL, malloc */
michael@0: #include <stdio.h>  /* for fprintf */
michael@0: #include <string.h> /* for strdup */
michael@0: 
michael@0: #ifdef UNX
michael@0: #include <unistd.h> /* for exit */
michael@0: #endif
michael@0: 
michael@0: #define noVERBOSE
michael@0: 
michael@0: /* calculate hyphenmin values with long ligature length (2 or 3 characters
michael@0:  * instead of 1 or 2) for comparison with hyphenation without ligatures */
michael@0: #define noLONG_LIGATURE
michael@0: 
michael@0: #ifdef LONG_LIGATURE
michael@0: #define LIG_xx	1
michael@0: #define LIG_xxx	2
michael@0: #else
michael@0: #define LIG_xx	0
michael@0: #define LIG_xxx	1
michael@0: #endif
michael@0: 
michael@0: #include "hnjalloc.h"
michael@0: #include "hyphen.h"
michael@0: 
michael@0: static char *
michael@0: hnj_strdup (const char *s)
michael@0: {
michael@0:   char *new;
michael@0:   int l;
michael@0: 
michael@0:   l = strlen (s);
michael@0:   new = hnj_malloc (l + 1);
michael@0:   memcpy (new, s, l);
michael@0:   new[l] = 0;
michael@0:   return new;
michael@0: }
michael@0: 
michael@0: /* remove cross-platform text line end characters */
michael@0: void hnj_strchomp(char * s)
michael@0: {
michael@0:   int k = strlen(s);
michael@0:   if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
michael@0:   if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
michael@0: }
michael@0: 
michael@0: /* a little bit of a hash table implementation. This simply maps strings
michael@0:    to state numbers */
michael@0: 
michael@0: typedef struct _HashTab HashTab;
michael@0: typedef struct _HashEntry HashEntry;
michael@0: 
michael@0: /* A cheap, but effective, hack. */
michael@0: #define HASH_SIZE 31627
michael@0: 
michael@0: struct _HashTab {
michael@0:   HashEntry *entries[HASH_SIZE];
michael@0: };
michael@0: 
michael@0: struct _HashEntry {
michael@0:   HashEntry *next;
michael@0:   char *key;
michael@0:   int val;
michael@0: };
michael@0: 
michael@0: /* a char* hash function from ASU - adapted from Gtk+ */
michael@0: static unsigned int
michael@0: hnj_string_hash (const char *s)
michael@0: {
michael@0:   const char *p;
michael@0:   unsigned int h=0, g;
michael@0:   for(p = s; *p != '\0'; p += 1) {
michael@0:     h = ( h << 4 ) + *p;
michael@0:     if ( ( g = h & 0xf0000000 ) ) {
michael@0:       h = h ^ (g >> 24);
michael@0:       h = h ^ g;
michael@0:     }
michael@0:   }
michael@0:   return h /* % M */;
michael@0: }
michael@0: 
michael@0: static HashTab *
michael@0: hnj_hash_new (void)
michael@0: {
michael@0:   HashTab *hashtab;
michael@0:   int i;
michael@0: 
michael@0:   hashtab = hnj_malloc (sizeof(HashTab));
michael@0:   for (i = 0; i < HASH_SIZE; i++)
michael@0:     hashtab->entries[i] = NULL;
michael@0: 
michael@0:   return hashtab;
michael@0: }
michael@0: 
michael@0: static void
michael@0: hnj_hash_free (HashTab *hashtab)
michael@0: {
michael@0:   int i;
michael@0:   HashEntry *e, *next;
michael@0: 
michael@0:   for (i = 0; i < HASH_SIZE; i++)
michael@0:     for (e = hashtab->entries[i]; e; e = next)
michael@0:       {
michael@0: 	next = e->next;
michael@0: 	hnj_free (e->key);
michael@0: 	hnj_free (e);
michael@0:       }
michael@0: 
michael@0:   hnj_free (hashtab);
michael@0: }
michael@0: 
michael@0: /* assumes that key is not already present! */
michael@0: static void
michael@0: hnj_hash_insert (HashTab *hashtab, const char *key, int val)
michael@0: {
michael@0:   int i;
michael@0:   HashEntry *e;
michael@0: 
michael@0:   i = hnj_string_hash (key) % HASH_SIZE;
michael@0:   e = hnj_malloc (sizeof(HashEntry));
michael@0:   e->next = hashtab->entries[i];
michael@0:   e->key = hnj_strdup (key);
michael@0:   e->val = val;
michael@0:   hashtab->entries[i] = e;
michael@0: }
michael@0: 
michael@0: /* return val if found, otherwise -1 */
michael@0: static int
michael@0: hnj_hash_lookup (HashTab *hashtab, const char *key)
michael@0: {
michael@0:   int i;
michael@0:   HashEntry *e;
michael@0:   i = hnj_string_hash (key) % HASH_SIZE;
michael@0:   for (e = hashtab->entries[i]; e; e = e->next)
michael@0:     if (!strcmp (key, e->key))
michael@0:       return e->val;
michael@0:   return -1;
michael@0: }
michael@0: 
michael@0: /* Get the state number, allocating a new state if necessary. */
michael@0: static int
michael@0: hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
michael@0: {
michael@0:   int state_num;
michael@0: 
michael@0:   state_num = hnj_hash_lookup (hashtab, string);
michael@0: 
michael@0:   if (state_num >= 0)
michael@0:     return state_num;
michael@0: 
michael@0:   hnj_hash_insert (hashtab, string, dict->num_states);
michael@0:   /* predicate is true if dict->num_states is a power of two */
michael@0:   if (!(dict->num_states & (dict->num_states - 1)))
michael@0:     {
michael@0:       dict->states = hnj_realloc (dict->states,
michael@0: 				  (dict->num_states << 1) *
michael@0: 				  sizeof(HyphenState));
michael@0:     }
michael@0:   dict->states[dict->num_states].match = NULL;
michael@0:   dict->states[dict->num_states].repl = NULL;
michael@0:   dict->states[dict->num_states].fallback_state = -1;
michael@0:   dict->states[dict->num_states].num_trans = 0;
michael@0:   dict->states[dict->num_states].trans = NULL;
michael@0:   return dict->num_states++;
michael@0: }
michael@0: 
michael@0: /* add a transition from state1 to state2 through ch - assumes that the
michael@0:    transition does not already exist */
michael@0: static void
michael@0: hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
michael@0: {
michael@0:   int num_trans;
michael@0: 
michael@0:   num_trans = dict->states[state1].num_trans;
michael@0:   if (num_trans == 0)
michael@0:     {
michael@0:       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
michael@0:     }
michael@0:   else if (!(num_trans & (num_trans - 1)))
michael@0:     {
michael@0:       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
michael@0: 						(num_trans << 1) *
michael@0: 						sizeof(HyphenTrans));
michael@0:     }
michael@0:   dict->states[state1].trans[num_trans].ch = ch;
michael@0:   dict->states[state1].trans[num_trans].new_state = state2;
michael@0:   dict->states[state1].num_trans++;
michael@0: }
michael@0: 
michael@0: #ifdef VERBOSE
michael@0: HashTab *global[1];
michael@0: 
michael@0: static char *
michael@0: get_state_str (int state, int level)
michael@0: {
michael@0:   int i;
michael@0:   HashEntry *e;
michael@0: 
michael@0:   for (i = 0; i < HASH_SIZE; i++)
michael@0:     for (e = global[level]->entries[i]; e; e = e->next)
michael@0:       if (e->val == state)
michael@0: 	return e->key;
michael@0:   return NULL;
michael@0: }
michael@0: #endif
michael@0: 
michael@0: void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
michael@0:   int i, j;
michael@0:   char word[MAX_CHARS];
michael@0:   char pattern[MAX_CHARS];
michael@0:   char * repl;
michael@0:   signed char replindex;
michael@0:   signed char replcut;
michael@0:   int state_num = 0;
michael@0:   int last_state;
michael@0:   char ch;
michael@0:   int found;
michael@0: 
michael@0: 	  if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
michael@0: 	    dict->lhmin = atoi(buf + 13);
michael@0: 	    return;
michael@0: 	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
michael@0: 	    dict->rhmin = atoi(buf + 14);
michael@0: 	    return;
michael@0: 	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
michael@0: 	    dict->clhmin = atoi(buf + 21);
michael@0: 	    return;
michael@0: 	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
michael@0: 	    dict->crhmin = atoi(buf + 22);
michael@0: 	    return;
michael@0: 	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
michael@0: 	    char * space = buf + 8;
michael@0: 	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
michael@0: 	    if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
michael@0: 	    if (dict->nohyphen) {
michael@0: 	        char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
michael@0: 	        *nhe = 0;
michael@0: 	        for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
michael@0: 	                if (*nhe == ',') {
michael@0: 	                    dict->nohyphenl++;
michael@0: 	                    *nhe = 0;
michael@0: 	                }
michael@0: 	        }
michael@0: 	    }
michael@0: 	    return;
michael@0: 	  } 
michael@0: 	  j = 0;
michael@0: 	  pattern[j] = '0';
michael@0:           repl = strchr(buf, '/');
michael@0:           replindex = 0;
michael@0:           replcut = 0;
michael@0:           if (repl) {
michael@0:             char * index = strchr(repl + 1, ',');
michael@0:             *repl = '\0';
michael@0:             if (index) {
michael@0:                 char * index2 = strchr(index + 1, ',');
michael@0:                 *index = '\0';
michael@0:                 if (index2) {
michael@0:                     *index2 = '\0';
michael@0:                     replindex = (signed char) atoi(index + 1) - 1;
michael@0:                     replcut = (signed char) atoi(index2 + 1);                
michael@0:                 }
michael@0:             } else {
michael@0:                 hnj_strchomp(repl + 1);
michael@0:                 replindex = 0;
michael@0:                 replcut = (signed char) strlen(buf);
michael@0:             }
michael@0:             repl = hnj_strdup(repl + 1);
michael@0:           }
michael@0: 	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
michael@0: 	    {
michael@0: 	      if (buf[i] >= '0' && buf[i] <= '9')
michael@0: 		pattern[j] = buf[i];
michael@0: 	      else
michael@0: 		{
michael@0: 		  word[j] = buf[i];
michael@0: 		  pattern[++j] = '0';
michael@0: 		}
michael@0: 	    }
michael@0: 	  word[j] = '\0';
michael@0: 	  pattern[j + 1] = '\0';
michael@0: 
michael@0:           i = 0;
michael@0: 	  if (!repl) {
michael@0: 	    /* Optimize away leading zeroes */
michael@0:             for (; pattern[i] == '0'; i++);
michael@0:           } else {
michael@0:             if (*word == '.') i++;
michael@0:             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
michael@0:             if (dict->utf8) {
michael@0:                 int pu = -1;        /* unicode character position */
michael@0:                 int ps = -1;        /* unicode start position (original replindex) */
michael@0:                 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
michael@0:                 for (; pc < (strlen(word) + 1); pc++) {
michael@0:                 /* beginning of an UTF-8 character (not '10' start bits) */
michael@0:                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
michael@0:                     if ((ps < 0) && (replindex == pu)) {
michael@0:                         ps = replindex;
michael@0:                         replindex = (signed char) pc;
michael@0:                     }
michael@0:                     if ((ps >= 0) && ((pu - ps) == replcut)) {
michael@0:                         replcut = (signed char) (pc - replindex);
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:                 if (*word == '.') replindex--;
michael@0:             }
michael@0:           }
michael@0: 
michael@0: #ifdef VERBOSE
michael@0: 	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
michael@0: #endif
michael@0: 	  found = hnj_hash_lookup (hashtab, word);
michael@0: 	  state_num = hnj_get_state (dict, hashtab, word);
michael@0: 	  dict->states[state_num].match = hnj_strdup (pattern + i);
michael@0: 	  dict->states[state_num].repl = repl;
michael@0: 	  dict->states[state_num].replindex = replindex;
michael@0:           if (!replcut) {
michael@0:             dict->states[state_num].replcut = (signed char) strlen(word);
michael@0:           } else {
michael@0:             dict->states[state_num].replcut = replcut;
michael@0:           }
michael@0: 
michael@0: 	  /* now, put in the prefix transitions */
michael@0:           for (; found < 0 ;j--)
michael@0: 	    {
michael@0: 	      last_state = state_num;
michael@0: 	      ch = word[j - 1];
michael@0: 	      word[j - 1] = '\0';
michael@0: 	      found = hnj_hash_lookup (hashtab, word);
michael@0: 	      state_num = hnj_get_state (dict, hashtab, word);
michael@0: 	      hnj_add_trans (dict, state_num, last_state, ch);
michael@0: 	    }
michael@0: }
michael@0: 
michael@0: HyphenDict *
michael@0: hnj_hyphen_load (const char *fn)
michael@0: {
michael@0:   HyphenDict *dict[2];
michael@0:   HashTab *hashtab;
michael@0:   FILE *f;
michael@0:   char buf[MAX_CHARS];
michael@0:   int nextlevel = 0;
michael@0:   int i, j, k;
michael@0:   HashEntry *e;
michael@0:   int state_num = 0;
michael@0: 
michael@0:   f = fopen (fn, "r");
michael@0:   if (f == NULL)
michael@0:     return NULL;
michael@0: 
michael@0: // loading one or two dictionaries (separated by NEXTLEVEL keyword)
michael@0: for (k = 0; k < 2; k++) { 
michael@0:   hashtab = hnj_hash_new ();
michael@0: #ifdef VERBOSE
michael@0:   global[k] = hashtab;
michael@0: #endif
michael@0:   hnj_hash_insert (hashtab, "", 0);
michael@0:   dict[k] = hnj_malloc (sizeof(HyphenDict));
michael@0:   dict[k]->num_states = 1;
michael@0:   dict[k]->states = hnj_malloc (sizeof(HyphenState));
michael@0:   dict[k]->states[0].match = NULL;
michael@0:   dict[k]->states[0].repl = NULL;
michael@0:   dict[k]->states[0].fallback_state = -1;
michael@0:   dict[k]->states[0].num_trans = 0;
michael@0:   dict[k]->states[0].trans = NULL;
michael@0:   dict[k]->nextlevel = NULL;
michael@0:   dict[k]->lhmin = 0;
michael@0:   dict[k]->rhmin = 0;
michael@0:   dict[k]->clhmin = 0;
michael@0:   dict[k]->crhmin = 0;
michael@0:   dict[k]->nohyphen = NULL;
michael@0:   dict[k]->nohyphenl = 0;
michael@0: 
michael@0:   /* read in character set info */
michael@0:   if (k == 0) {
michael@0:     for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
michael@0:     if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
michael@0:       for (i=0;i<MAX_NAME;i++)
michael@0:         if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
michael@0:           dict[k]->cset[i] = 0;
michael@0:     } else {
michael@0:       dict[k]->cset[0] = 0;
michael@0:     }
michael@0:     dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
michael@0:   } else {
michael@0:     strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
michael@0:     dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
michael@0:     dict[k]->utf8 = dict[0]->utf8;
michael@0:   }
michael@0: 
michael@0:   if (k == 0 || nextlevel) {
michael@0:     while (fgets (buf, sizeof(buf), f) != NULL) {
michael@0:       if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
michael@0: 	nextlevel = 1;
michael@0: 	break;
michael@0:       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
michael@0:     }
michael@0:   } else if (k == 1) {
michael@0:     /* default first level: hyphen and ASCII apostrophe */
michael@0:     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
michael@0:     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
michael@0:     strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
michael@0:     buf[MAX_CHARS-1] = '\0';
michael@0:     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
michael@0:     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
michael@0:     if (dict[0]->utf8) {
michael@0:       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
michael@0:       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   /* Could do unioning of matches here (instead of the preprocessor script).
michael@0:      If we did, the pseudocode would look something like this:
michael@0: 
michael@0:      foreach state in the hash table
michael@0:         foreach i = [1..length(state) - 1]
michael@0:            state to check is substr (state, i)
michael@0:            look it up
michael@0:            if found, and if there is a match, union the match in.
michael@0: 
michael@0:      It's also possible to avoid the quadratic blowup by doing the
michael@0:      search in order of increasing state string sizes - then you
michael@0:      can break the loop after finding the first match.
michael@0: 
michael@0:      This step should be optional in any case - if there is a
michael@0:      preprocessed rule table, it's always faster to use that.
michael@0: 
michael@0: */
michael@0: 
michael@0:   /* put in the fallback states */
michael@0:   for (i = 0; i < HASH_SIZE; i++)
michael@0:     for (e = hashtab->entries[i]; e; e = e->next)
michael@0:       {
michael@0: 	if (*(e->key)) for (j = 1; 1; j++)
michael@0: 	  {          
michael@0: 	    state_num = hnj_hash_lookup (hashtab, e->key + j);
michael@0: 	    if (state_num >= 0)
michael@0: 	      break;
michael@0: 	  }
michael@0:         /* KBH: FIXME state 0 fallback_state should always be -1? */
michael@0: 	if (e->val)
michael@0: 	  dict[k]->states[e->val].fallback_state = state_num;
michael@0:       }
michael@0: #ifdef VERBOSE
michael@0:   for (i = 0; i < HASH_SIZE; i++)
michael@0:     for (e = hashtab->entries[i]; e; e = e->next)
michael@0:       {
michael@0: 	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
michael@0: 		dict[k]->states[e->val].fallback_state);
michael@0: 	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
michael@0: 	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
michael@0: 		  dict[k]->states[e->val].trans[j].new_state);
michael@0:       }
michael@0: #endif
michael@0: 
michael@0: #ifndef VERBOSE
michael@0:   hnj_hash_free (hashtab);
michael@0: #endif
michael@0:   state_num = 0;
michael@0: }
michael@0:   fclose(f);
michael@0:   if (nextlevel) dict[0]->nextlevel = dict[1];
michael@0:   else {
michael@0:     dict[1] -> nextlevel = dict[0];
michael@0:     dict[1]->lhmin = dict[0]->lhmin;
michael@0:     dict[1]->rhmin = dict[0]->rhmin;
michael@0:     dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
michael@0:     dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
michael@0: #ifdef VERBOSE
michael@0:     HashTab *r = global[0];
michael@0:     global[0] = global[1];
michael@0:     global[1] = r;
michael@0: #endif
michael@0:     return dict[1];
michael@0:   }
michael@0:   return dict[0];
michael@0: }
michael@0: 
michael@0: void hnj_hyphen_free (HyphenDict *dict)
michael@0: {
michael@0:   int state_num;
michael@0:   HyphenState *hstate;
michael@0: 
michael@0:   for (state_num = 0; state_num < dict->num_states; state_num++)
michael@0:     {
michael@0:       hstate = &dict->states[state_num];
michael@0:       if (hstate->match)
michael@0: 	hnj_free (hstate->match);
michael@0:       if (hstate->repl)
michael@0: 	hnj_free (hstate->repl);
michael@0:       if (hstate->trans)
michael@0: 	hnj_free (hstate->trans);
michael@0:     }
michael@0:   if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
michael@0: 
michael@0:   if (dict->nohyphen) hnj_free(dict->nohyphen);
michael@0: 
michael@0:   hnj_free (dict->states);
michael@0: 
michael@0:   hnj_free (dict);
michael@0: }
michael@0: 
michael@0: #define MAX_WORD 256
michael@0: 
michael@0: int hnj_hyphen_hyphenate (HyphenDict *dict,
michael@0: 			   const char *word, int word_size,
michael@0: 			   char *hyphens)
michael@0: {
michael@0:   char *prep_word;
michael@0:   int i, j, k;
michael@0:   int state;
michael@0:   char ch;
michael@0:   HyphenState *hstate;
michael@0:   char *match;
michael@0:   int offset;
michael@0: 
michael@0:   prep_word = hnj_malloc (word_size + 3);
michael@0: 
michael@0:   j = 0;
michael@0:   prep_word[j++] = '.';
michael@0: 
michael@0:   for (i = 0; i < word_size; i++) {
michael@0:     if (word[i] <= '9' && word[i] >= '0') {
michael@0:       prep_word[j++] = '.';
michael@0:     } else {
michael@0:       prep_word[j++] = word[i];
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   prep_word[j++] = '.';
michael@0:   prep_word[j] = '\0';
michael@0: 
michael@0:   for (i = 0; i < word_size + 5; i++)
michael@0:     hyphens[i] = '0';
michael@0: 
michael@0: #ifdef VERBOSE
michael@0:   printf ("prep_word = %s\n", prep_word);
michael@0: #endif
michael@0: 
michael@0:   /* now, run the finite state machine */
michael@0:   state = 0;
michael@0:   for (i = 0; i < j; i++)
michael@0:     {
michael@0:       ch = prep_word[i];
michael@0:       for (;;)
michael@0: 	{
michael@0: 
michael@0: 	  if (state == -1) {
michael@0:             /* return 1; */
michael@0: 	    /*  KBH: FIXME shouldn't this be as follows? */
michael@0:             state = 0;
michael@0:             goto try_next_letter;
michael@0:           }          
michael@0: 
michael@0: #ifdef VERBOSE
michael@0: 	  char *state_str;
michael@0: 	  state_str = get_state_str (state, 0);
michael@0: 
michael@0: 	  for (k = 0; k < i - strlen (state_str); k++)
michael@0: 	    putchar (' ');
michael@0: 	  printf ("%s", state_str);
michael@0: #endif
michael@0: 
michael@0: 	  hstate = &dict->states[state];
michael@0: 	  for (k = 0; k < hstate->num_trans; k++)
michael@0: 	    if (hstate->trans[k].ch == ch)
michael@0: 	      {
michael@0: 		state = hstate->trans[k].new_state;
michael@0: 		goto found_state;
michael@0: 	      }
michael@0: 	  state = hstate->fallback_state;
michael@0: #ifdef VERBOSE
michael@0: 	  printf (" falling back, fallback_state %d\n", state);
michael@0: #endif
michael@0: 	}
michael@0:     found_state:
michael@0: #ifdef VERBOSE
michael@0:       printf ("found state %d\n",state);
michael@0: #endif
michael@0:       /* Additional optimization is possible here - especially,
michael@0: 	 elimination of trailing zeroes from the match. Leading zeroes
michael@0: 	 have already been optimized. */
michael@0:       match = dict->states[state].match;
michael@0:       /* replacing rules not handled by hyphen_hyphenate() */
michael@0:       if (match && !dict->states[state].repl)
michael@0: 	{
michael@0: 	  offset = i + 1 - strlen (match);
michael@0: #ifdef VERBOSE
michael@0: 	  for (k = 0; k < offset; k++)
michael@0: 	    putchar (' ');
michael@0: 	  printf ("%s\n", match);
michael@0: #endif
michael@0: 	  /* This is a linear search because I tried a binary search and
michael@0: 	     found it to be just a teeny bit slower. */
michael@0: 	  for (k = 0; match[k]; k++)
michael@0: 	    if (hyphens[offset + k] < match[k])
michael@0: 	      hyphens[offset + k] = match[k];
michael@0: 	}
michael@0: 
michael@0:       /* KBH: we need this to make sure we keep looking in a word */
michael@0:       /* for patterns even if the current character is not known in state 0 */
michael@0:       /* since patterns for hyphenation may occur anywhere in the word */
michael@0:       try_next_letter: ;
michael@0: 
michael@0:     }
michael@0: #ifdef VERBOSE
michael@0:   for (i = 0; i < j; i++)
michael@0:     putchar (hyphens[i]);
michael@0:   putchar ('\n');
michael@0: #endif
michael@0: 
michael@0:   for (i = 0; i < j - 4; i++)
michael@0: #if 0
michael@0:     if (hyphens[i + 1] & 1)
michael@0:       hyphens[i] = '-';
michael@0: #else
michael@0:     hyphens[i] = hyphens[i + 1];
michael@0: #endif
michael@0:   hyphens[0] = '0';
michael@0:   for (; i < word_size; i++)
michael@0:     hyphens[i] = '0';
michael@0:   hyphens[word_size] = '\0';
michael@0: 
michael@0:   hnj_free (prep_word);
michael@0:     
michael@0:   return 0;    
michael@0: }
michael@0: 
michael@0: /* Unicode ligature length */
michael@0: int hnj_ligature(unsigned char c) {
michael@0:     switch (c) {
michael@0:         case 0x80:			/* ff */
michael@0:         case 0x81:			/* fi */
michael@0:         case 0x82: return LIG_xx;	/* fl */
michael@0:         case 0x83:			/* ffi */
michael@0:         case 0x84: return LIG_xxx;	/* ffl */
michael@0:         case 0x85:			/* long st */
michael@0:         case 0x86: return LIG_xx;	/* st */
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: /* character length of the first n byte of the input word */
michael@0: int hnj_hyphen_strnlen(const char * word, int n, int utf8)
michael@0: {
michael@0:     int i = 0;
michael@0:     int j = 0;
michael@0:     while (j < n && word[j] != '\0') {
michael@0:       i++;
michael@0:       // Unicode ligature support
michael@0:       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
michael@0:         i += hnj_ligature(word[j + 2]);
michael@0:       }
michael@0:       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
michael@0:     }
michael@0:     return i;
michael@0: }
michael@0: 
michael@0: int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
michael@0: 	char *** rep, int ** pos, int ** cut, int lhmin)
michael@0: {
michael@0:     int i = 1, j;
michael@0: 
michael@0:     // Unicode ligature support
michael@0:     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
michael@0:       i += hnj_ligature(word[2]);
michael@0:     }
michael@0: 
michael@0:     // ignore numbers
michael@0:     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
michael@0: 
michael@0:     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
michael@0:       // check length of the non-standard part
michael@0:       if (*rep && *pos && *cut && (*rep)[j]) {
michael@0:         char * rh = strchr((*rep)[j], '=');
michael@0:         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
michael@0:           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
michael@0:             free((*rep)[j]);
michael@0:             (*rep)[j] = NULL;
michael@0:             hyphens[j] = '0';
michael@0:           }
michael@0:        } else {
michael@0:          hyphens[j] = '0';
michael@0:        }
michael@0:        j++;
michael@0: 
michael@0:        // Unicode ligature support
michael@0:        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
michael@0:          i += hnj_ligature(word[j + 2]);
michael@0:        }
michael@0:     } while (utf8 && (word[j] & 0xc0) == 0x80);
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
michael@0: 	char *** rep, int ** pos, int ** cut, int rhmin)
michael@0: {
michael@0:     int i = 0;
michael@0:     int j;
michael@0: 
michael@0:     // ignore numbers
michael@0:     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
michael@0: 
michael@0:     for (j = word_size - 1; i < rhmin && j > 0; j--) {
michael@0:       // check length of the non-standard part
michael@0:       if (*rep && *pos && *cut && (*rep)[j]) {
michael@0:         char * rh = strchr((*rep)[j], '=');
michael@0:         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
michael@0:           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
michael@0:             free((*rep)[j]);
michael@0:             (*rep)[j] = NULL;
michael@0:             hyphens[j] = '0';
michael@0:           }
michael@0:        } else {
michael@0:          hyphens[j] = '0';
michael@0:        }
michael@0:        if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: // recursive function for compound level hyphenation
michael@0: int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
michael@0:     char * hyphens, char *** rep, int ** pos, int ** cut,
michael@0:     int clhmin, int crhmin, int lend, int rend)
michael@0: {
michael@0:   char *prep_word;
michael@0:   int i, j, k;
michael@0:   int state;
michael@0:   char ch;
michael@0:   HyphenState *hstate;
michael@0:   char *match;
michael@0:   char *repl;
michael@0:   signed char replindex;
michael@0:   signed char replcut;
michael@0:   int offset;
michael@0:   int * matchlen;
michael@0:   int * matchindex;
michael@0:   char ** matchrepl;  
michael@0:   int isrepl = 0;
michael@0:   int nHyphCount;
michael@0: 
michael@0:   size_t prep_word_size = word_size + 3;
michael@0:   prep_word = hnj_malloc (prep_word_size);
michael@0:   matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
michael@0:   matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
michael@0:   matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
michael@0: 
michael@0:   j = 0;
michael@0:   prep_word[j++] = '.';
michael@0:   
michael@0:   for (i = 0; i < word_size; i++) {
michael@0:     if (word[i] <= '9' && word[i] >= '0') {
michael@0:       prep_word[j++] = '.';
michael@0:     } else {
michael@0:       prep_word[j++] = word[i];
michael@0:     }
michael@0:   }
michael@0: 
michael@0: 
michael@0: 
michael@0:   prep_word[j++] = '.';
michael@0:   prep_word[j] = '\0';
michael@0: 
michael@0:   for (i = 0; i < j; i++)
michael@0:     hyphens[i] = '0';    
michael@0: 
michael@0: #ifdef VERBOSE
michael@0:   printf ("prep_word = %s\n", prep_word);
michael@0: #endif
michael@0: 
michael@0:   /* now, run the finite state machine */
michael@0:   state = 0;
michael@0:   for (i = 0; i < j; i++)
michael@0:     {
michael@0:       ch = prep_word[i];
michael@0:       for (;;)
michael@0: 	{
michael@0: 
michael@0: 	  if (state == -1) {
michael@0:             /* return 1; */
michael@0: 	    /*  KBH: FIXME shouldn't this be as follows? */
michael@0:             state = 0;
michael@0:             goto try_next_letter;
michael@0:           }          
michael@0: 
michael@0: #ifdef VERBOSE
michael@0: 	  char *state_str;
michael@0: 	  state_str = get_state_str (state, 1);
michael@0: 
michael@0: 	  for (k = 0; k < i - strlen (state_str); k++)
michael@0: 	    putchar (' ');
michael@0: 	  printf ("%s", state_str);
michael@0: #endif
michael@0: 
michael@0: 	  hstate = &dict->states[state];
michael@0: 	  for (k = 0; k < hstate->num_trans; k++)
michael@0: 	    if (hstate->trans[k].ch == ch)
michael@0: 	      {
michael@0: 		state = hstate->trans[k].new_state;
michael@0: 		goto found_state;
michael@0: 	      }
michael@0: 	  state = hstate->fallback_state;
michael@0: #ifdef VERBOSE
michael@0: 	  printf (" falling back, fallback_state %d\n", state);
michael@0: #endif
michael@0: 	}
michael@0:     found_state:
michael@0: #ifdef VERBOSE
michael@0:       printf ("found state %d\n",state);
michael@0: #endif
michael@0:       /* Additional optimization is possible here - especially,
michael@0: 	 elimination of trailing zeroes from the match. Leading zeroes
michael@0: 	 have already been optimized. */
michael@0:       match = dict->states[state].match;
michael@0:       repl = dict->states[state].repl;
michael@0:       replindex = dict->states[state].replindex;
michael@0:       replcut = dict->states[state].replcut;
michael@0:       /* replacing rules not handled by hyphen_hyphenate() */
michael@0:       if (match)
michael@0: 	{
michael@0: 	  offset = i + 1 - strlen (match);
michael@0: #ifdef VERBOSE
michael@0: 	  for (k = 0; k < offset; k++)
michael@0: 	    putchar (' ');
michael@0: 	  printf ("%s (%s)\n", match, repl);
michael@0: #endif
michael@0:           if (repl) {
michael@0:             if (!isrepl) for(; isrepl < word_size; isrepl++) {
michael@0:                 matchrepl[isrepl] = NULL;
michael@0:                 matchindex[isrepl] = -1;
michael@0:             }
michael@0:             matchlen[offset + replindex] = replcut;
michael@0:           }
michael@0: 	  /* This is a linear search because I tried a binary search and
michael@0: 	     found it to be just a teeny bit slower. */
michael@0: 	  for (k = 0; match[k]; k++) {
michael@0: 	    if ((hyphens[offset + k] < match[k])) {
michael@0: 	      hyphens[offset + k] = match[k];
michael@0:               if (match[k]&1) {
michael@0:                 matchrepl[offset + k] = repl;
michael@0:                 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
michael@0:                     matchindex[offset + replindex] = offset + k;
michael@0:                 }
michael@0:               }
michael@0:             }
michael@0:           }
michael@0:           
michael@0: 	}
michael@0: 
michael@0:       /* KBH: we need this to make sure we keep looking in a word */
michael@0:       /* for patterns even if the current character is not known in state 0 */
michael@0:       /* since patterns for hyphenation may occur anywhere in the word */
michael@0:       try_next_letter: ;
michael@0: 
michael@0:     }
michael@0: #ifdef VERBOSE
michael@0:   for (i = 0; i < j; i++)
michael@0:     putchar (hyphens[i]);
michael@0:   putchar ('\n');
michael@0: #endif
michael@0: 
michael@0:   for (i = 0; i < j - 3; i++)
michael@0: #if 0
michael@0:     if (hyphens[i + 1] & 1)
michael@0:       hyphens[i] = '-';
michael@0: #else
michael@0:     hyphens[i] = hyphens[i + 1];
michael@0: #endif
michael@0:   for (; i < word_size; i++)
michael@0:     hyphens[i] = '0';
michael@0:   hyphens[word_size] = '\0';
michael@0: 
michael@0:        /* now create a new char string showing hyphenation positions */
michael@0:        /* count the hyphens and allocate space for the new hyphenated string */
michael@0:        nHyphCount = 0;
michael@0:        for (i = 0; i < word_size; i++)
michael@0:           if (hyphens[i]&1)
michael@0:              nHyphCount++;
michael@0:        j = 0;
michael@0:        for (i = 0; i < word_size; i++) {
michael@0:            if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 
michael@0:                 if (rep && pos && cut) {
michael@0:                     if (!*rep)
michael@0:                         *rep = (char **) calloc(word_size, sizeof(char *));
michael@0:                     if (!*pos)
michael@0:                         *pos = (int *) calloc(word_size, sizeof(int));
michael@0:                     if (!*cut) {
michael@0:                         *cut = (int *) calloc(word_size, sizeof(int));
michael@0:                     }
michael@0:                     (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
michael@0:                     (*pos)[matchindex[i] - 1] = matchindex[i] - i;
michael@0:                     (*cut)[matchindex[i] - 1] = matchlen[i];
michael@0:                 }
michael@0:                 j += strlen(matchrepl[matchindex[i]]);
michael@0:                 i += matchlen[i] - 1;
michael@0:           }
michael@0:        }
michael@0: 
michael@0:   hnj_free (matchrepl);
michael@0:   hnj_free (matchlen);
michael@0:   hnj_free (matchindex);
michael@0: 
michael@0:   // recursive hyphenation of the first (compound) level segments
michael@0:   if (dict->nextlevel) {
michael@0:      char ** rep2;
michael@0:      int * pos2;
michael@0:      int * cut2;
michael@0:      char * hyphens2;
michael@0:      int begin = 0;
michael@0: 
michael@0:      rep2 = hnj_malloc (word_size * sizeof(char *));
michael@0:      pos2 = hnj_malloc (word_size * sizeof(int));
michael@0:      cut2 = hnj_malloc (word_size * sizeof(int));
michael@0:      hyphens2 = hnj_malloc (word_size + 3);
michael@0:      for (i = 0; i < word_size; i++) rep2[i] = NULL;
michael@0:      for (i = 0; i < word_size; i++) if 
michael@0:         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
michael@0:         if (i - begin > 1) {
michael@0:             int hyph = 0;
michael@0:             prep_word[i + 2] = '\0';
michael@0:             /* non-standard hyphenation at compound boundary (Schiffahrt) */
michael@0:             if (rep && *rep && *pos && *cut && (*rep)[i]) {
michael@0:                 char * l = strchr((*rep)[i], '=');
michael@0:                 size_t offset = 2 + i - (*pos)[i];
michael@0:                 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
michael@0:                 prep_word[prep_word_size - 1] = '\0';
michael@0:                 if (l) {
michael@0:                     hyph = (l - (*rep)[i]) - (*pos)[i];
michael@0:                     prep_word[2 + i + hyph] = '\0';
michael@0:                 }
michael@0:             }
michael@0:             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
michael@0:                 hyphens2, &rep2, &pos2, &cut2, clhmin,
michael@0:                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
michael@0:             for (j = 0; j < i - begin - 1; j++) {
michael@0:                 hyphens[begin + j] = hyphens2[j];
michael@0:                 if (rep2[j] && rep && pos && cut) {
michael@0:                     if (!*rep && !*pos && !*cut) {
michael@0:                         int k;
michael@0:                         *rep = (char **) malloc(sizeof(char *) * word_size);
michael@0:                         *pos = (int *) malloc(sizeof(int) * word_size);
michael@0:                         *cut = (int *) malloc(sizeof(int) * word_size);
michael@0:                         for (k = 0; k < word_size; k++) {
michael@0:                             (*rep)[k] = NULL;
michael@0:                             (*pos)[k] = 0;
michael@0:                             (*cut)[k] = 0;
michael@0:                         }
michael@0:                     }
michael@0:                     (*rep)[begin + j] = rep2[j];
michael@0:                     (*pos)[begin + j] = pos2[j];
michael@0:                     (*cut)[begin + j] = cut2[j];
michael@0:                 }
michael@0:             }
michael@0:             prep_word[i + 2] = word[i + 1];
michael@0:             if (*rep && *pos && *cut && (*rep)[i]) {
michael@0:                 size_t offset = 1;
michael@0:                 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
michael@0:                 prep_word[prep_word_size - 1] = '\0';
michael@0:             }
michael@0:         }
michael@0:         begin = i + 1;
michael@0:         for (j = 0; j < word_size; j++) rep2[j] = NULL;
michael@0:      }
michael@0:      
michael@0:      // non-compound
michael@0:      if (begin == 0) {
michael@0:         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
michael@0:             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
michael@0:         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
michael@0:             rep, pos, cut, clhmin);
michael@0:         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
michael@0:             rep, pos, cut, crhmin);
michael@0:      }
michael@0:      
michael@0:      free(rep2);
michael@0:      free(cut2);
michael@0:      free(pos2);
michael@0:      free(hyphens2);
michael@0:   }
michael@0: 
michael@0:   hnj_free (prep_word);
michael@0:   return 0;
michael@0: }
michael@0: 
michael@0: /* UTF-8 normalization of hyphen and non-standard positions */
michael@0: int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
michael@0: 	char *** rep, int ** pos, int ** cut)
michael@0: {
michael@0:   int i, j, k;
michael@0:   if ((((unsigned char) word[0]) >> 6) == 2) {
michael@0:     fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
michael@0:     return 1;
michael@0:   }
michael@0: 
michael@0:   /* calculate UTF-8 character positions */
michael@0:   for (i = 0, j = -1; i < word_size; i++) {
michael@0:     /* beginning of an UTF-8 character (not '10' start bits) */
michael@0:     if ((((unsigned char) word[i]) >> 6) != 2) j++;
michael@0:     hyphens[j] = hyphens[i];
michael@0:     if (rep && pos && cut && *rep && *pos && *cut) {
michael@0:         int l = (*pos)[i];
michael@0:         (*pos)[j] = 0;
michael@0:         for (k = 0; k < l; k++) {
michael@0:             if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
michael@0:         }
michael@0:         k = i - l + 1;
michael@0:         l = k + (*cut)[i];
michael@0:         (*cut)[j] = 0;        
michael@0:         for (; k < l; k++) {
michael@0:             if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
michael@0:         }
michael@0:         (*rep)[j] = (*rep)[i];
michael@0:         if (j < i) {
michael@0:             (*rep)[i] = NULL;
michael@0:             (*pos)[i] = 0;
michael@0:             (*cut)[i] = 0;
michael@0:         }
michael@0:     }
michael@0:   }
michael@0:   hyphens[j + 1] = '\0';
michael@0: #ifdef VERBOSE
michael@0:   printf ("nums: %s\n", hyphens);
michael@0: #endif
michael@0:   return 0;
michael@0: }
michael@0: 
michael@0: /* get the word with all possible hyphenations (output: hyphword) */
michael@0: void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
michael@0:     char * hyphword, char *** rep, int ** pos, int ** cut)
michael@0: {
michael@0:   int hyphenslen = l + 5;
michael@0: 
michael@0:   int i, j;
michael@0:   for (i = 0, j = 0; i < l; i++, j++) {
michael@0:     if (hyphens[i]&1) {
michael@0:       hyphword[j] = word[i];
michael@0:       if (*rep && *pos && *cut && (*rep)[i]) {
michael@0:         size_t offset = j - (*pos)[i] + 1;
michael@0:         strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
michael@0:         hyphword[hyphenslen-1] = '\0';
michael@0:         j += strlen((*rep)[i]) - (*pos)[i];
michael@0:         i += (*cut)[i] - (*pos)[i];
michael@0:       } else hyphword[++j] = '=';
michael@0:     } else hyphword[j] = word[i];
michael@0:   }
michael@0:   hyphword[j] = '\0';
michael@0: }
michael@0: 
michael@0: 
michael@0: /* main api function with default hyphenmin parameters */
michael@0: int hnj_hyphen_hyphenate2 (HyphenDict *dict,
michael@0: 			   const char *word, int word_size, char * hyphens,
michael@0: 			   char *hyphword, char *** rep, int ** pos, int ** cut)
michael@0: {
michael@0:   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
michael@0:     dict->clhmin, dict->crhmin, 1, 1);
michael@0:   hnj_hyphen_lhmin(dict->utf8, word, word_size,
michael@0:     hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
michael@0:   hnj_hyphen_rhmin(dict->utf8, word, word_size,
michael@0:     hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
michael@0: 
michael@0:   /* nohyphen */
michael@0:   if (dict->nohyphen) {
michael@0:     char * nh = dict->nohyphen;
michael@0:     int nhi;
michael@0:     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
michael@0:         char * nhy = (char *) strstr(word, nh);
michael@0:         while (nhy) {
michael@0:             hyphens[nhy - word + strlen(nh) - 1] = '0';
michael@0:             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
michael@0:             nhy = (char *) strstr(nhy + 1, nh);
michael@0:         }
michael@0:         nh = nh + strlen(nh) + 1;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
michael@0:   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
michael@0: #ifdef VERBOSE
michael@0:   printf ("nums: %s\n", hyphens);
michael@0: #endif
michael@0:   return 0;
michael@0: }
michael@0: 
michael@0: /* previous main api function with hyphenmin parameters */
michael@0: int hnj_hyphen_hyphenate3 (HyphenDict *dict,
michael@0: 	const char *word, int word_size, char * hyphens,
michael@0: 	char *hyphword, char *** rep, int ** pos, int ** cut,
michael@0: 	int lhmin, int rhmin, int clhmin, int crhmin)
michael@0: {
michael@0:   lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
michael@0:   rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
michael@0:   clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
michael@0:   crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
michael@0:   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
michael@0:     clhmin, crhmin, 1, 1);
michael@0:   hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
michael@0:     rep, pos, cut, (lhmin > 0 ? lhmin : 2));
michael@0:   hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
michael@0:     rep, pos, cut, (rhmin > 0 ? rhmin : 2));
michael@0:   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
michael@0: 
michael@0:   /* nohyphen */
michael@0:   if (dict->nohyphen) {
michael@0:     char * nh = dict->nohyphen;
michael@0:     int nhi;
michael@0:     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
michael@0:         char * nhy = (char *) strstr(word, nh);
michael@0:         while (nhy) {
michael@0:             hyphens[nhy - word + strlen(nh) - 1] = 0;
michael@0:             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
michael@0:             nhy = (char *) strstr(nhy + 1, nh);
michael@0:         }
michael@0:         nh = nh + strlen(nh) + 1;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
michael@0:   return 0;
michael@0: }