intl/hyphenation/src/hyphen.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
     2  * licenses follows.
     3  */
     5 /* LibHnj - a library for high quality hyphenation and justification
     6  * Copyright (C) 1998 Raph Levien, 
     7  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
     8  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
     9  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
    10  *
    11  * This library is free software; you can redistribute it and/or
    12  * modify it under the terms of the GNU Library General Public
    13  * License as published by the Free Software Foundation; either
    14  * version 2 of the License, or (at your option) any later version.
    15  *
    16  * This library is distributed in the hope that it will be useful,
    17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
    18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    19  * Library General Public License for more details.
    20  *
    21  * You should have received a copy of the GNU Library General Public
    22  * License along with this library; if not, write to the 
    23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
    24  * Boston, MA  02111-1307  USA.
    25 */
    27 /*
    28  * The contents of this file are subject to the Mozilla Public License
    29  * Version 1.0 (the "MPL"); you may not use this file except in
    30  * compliance with the MPL.  You may obtain a copy of the MPL at
    31  * http://www.mozilla.org/MPL/
    32  *
    33  * Software distributed under the MPL is distributed on an "AS IS" basis,
    34  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
    35  * for the specific language governing rights and limitations under the
    36  * MPL.
    37  *
    38  */
    39 #include <stdlib.h> /* for NULL, malloc */
    40 #include <stdio.h>  /* for fprintf */
    41 #include <string.h> /* for strdup */
    43 #ifdef UNX
    44 #include <unistd.h> /* for exit */
    45 #endif
    47 #define noVERBOSE
    49 /* calculate hyphenmin values with long ligature length (2 or 3 characters
    50  * instead of 1 or 2) for comparison with hyphenation without ligatures */
    51 #define noLONG_LIGATURE
    53 #ifdef LONG_LIGATURE
    54 #define LIG_xx	1
    55 #define LIG_xxx	2
    56 #else
    57 #define LIG_xx	0
    58 #define LIG_xxx	1
    59 #endif
    61 #include "hnjalloc.h"
    62 #include "hyphen.h"
    64 static char *
    65 hnj_strdup (const char *s)
    66 {
    67   char *new;
    68   int l;
    70   l = strlen (s);
    71   new = hnj_malloc (l + 1);
    72   memcpy (new, s, l);
    73   new[l] = 0;
    74   return new;
    75 }
    77 /* remove cross-platform text line end characters */
    78 void hnj_strchomp(char * s)
    79 {
    80   int k = strlen(s);
    81   if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
    82   if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
    83 }
    85 /* a little bit of a hash table implementation. This simply maps strings
    86    to state numbers */
    88 typedef struct _HashTab HashTab;
    89 typedef struct _HashEntry HashEntry;
    91 /* A cheap, but effective, hack. */
    92 #define HASH_SIZE 31627
    94 struct _HashTab {
    95   HashEntry *entries[HASH_SIZE];
    96 };
    98 struct _HashEntry {
    99   HashEntry *next;
   100   char *key;
   101   int val;
   102 };
   104 /* a char* hash function from ASU - adapted from Gtk+ */
   105 static unsigned int
   106 hnj_string_hash (const char *s)
   107 {
   108   const char *p;
   109   unsigned int h=0, g;
   110   for(p = s; *p != '\0'; p += 1) {
   111     h = ( h << 4 ) + *p;
   112     if ( ( g = h & 0xf0000000 ) ) {
   113       h = h ^ (g >> 24);
   114       h = h ^ g;
   115     }
   116   }
   117   return h /* % M */;
   118 }
   120 static HashTab *
   121 hnj_hash_new (void)
   122 {
   123   HashTab *hashtab;
   124   int i;
   126   hashtab = hnj_malloc (sizeof(HashTab));
   127   for (i = 0; i < HASH_SIZE; i++)
   128     hashtab->entries[i] = NULL;
   130   return hashtab;
   131 }
   133 static void
   134 hnj_hash_free (HashTab *hashtab)
   135 {
   136   int i;
   137   HashEntry *e, *next;
   139   for (i = 0; i < HASH_SIZE; i++)
   140     for (e = hashtab->entries[i]; e; e = next)
   141       {
   142 	next = e->next;
   143 	hnj_free (e->key);
   144 	hnj_free (e);
   145       }
   147   hnj_free (hashtab);
   148 }
   150 /* assumes that key is not already present! */
   151 static void
   152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
   153 {
   154   int i;
   155   HashEntry *e;
   157   i = hnj_string_hash (key) % HASH_SIZE;
   158   e = hnj_malloc (sizeof(HashEntry));
   159   e->next = hashtab->entries[i];
   160   e->key = hnj_strdup (key);
   161   e->val = val;
   162   hashtab->entries[i] = e;
   163 }
   165 /* return val if found, otherwise -1 */
   166 static int
   167 hnj_hash_lookup (HashTab *hashtab, const char *key)
   168 {
   169   int i;
   170   HashEntry *e;
   171   i = hnj_string_hash (key) % HASH_SIZE;
   172   for (e = hashtab->entries[i]; e; e = e->next)
   173     if (!strcmp (key, e->key))
   174       return e->val;
   175   return -1;
   176 }
   178 /* Get the state number, allocating a new state if necessary. */
   179 static int
   180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
   181 {
   182   int state_num;
   184   state_num = hnj_hash_lookup (hashtab, string);
   186   if (state_num >= 0)
   187     return state_num;
   189   hnj_hash_insert (hashtab, string, dict->num_states);
   190   /* predicate is true if dict->num_states is a power of two */
   191   if (!(dict->num_states & (dict->num_states - 1)))
   192     {
   193       dict->states = hnj_realloc (dict->states,
   194 				  (dict->num_states << 1) *
   195 				  sizeof(HyphenState));
   196     }
   197   dict->states[dict->num_states].match = NULL;
   198   dict->states[dict->num_states].repl = NULL;
   199   dict->states[dict->num_states].fallback_state = -1;
   200   dict->states[dict->num_states].num_trans = 0;
   201   dict->states[dict->num_states].trans = NULL;
   202   return dict->num_states++;
   203 }
   205 /* add a transition from state1 to state2 through ch - assumes that the
   206    transition does not already exist */
   207 static void
   208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
   209 {
   210   int num_trans;
   212   num_trans = dict->states[state1].num_trans;
   213   if (num_trans == 0)
   214     {
   215       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
   216     }
   217   else if (!(num_trans & (num_trans - 1)))
   218     {
   219       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
   220 						(num_trans << 1) *
   221 						sizeof(HyphenTrans));
   222     }
   223   dict->states[state1].trans[num_trans].ch = ch;
   224   dict->states[state1].trans[num_trans].new_state = state2;
   225   dict->states[state1].num_trans++;
   226 }
   228 #ifdef VERBOSE
   229 HashTab *global[1];
   231 static char *
   232 get_state_str (int state, int level)
   233 {
   234   int i;
   235   HashEntry *e;
   237   for (i = 0; i < HASH_SIZE; i++)
   238     for (e = global[level]->entries[i]; e; e = e->next)
   239       if (e->val == state)
   240 	return e->key;
   241   return NULL;
   242 }
   243 #endif
   245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
   246   int i, j;
   247   char word[MAX_CHARS];
   248   char pattern[MAX_CHARS];
   249   char * repl;
   250   signed char replindex;
   251   signed char replcut;
   252   int state_num = 0;
   253   int last_state;
   254   char ch;
   255   int found;
   257 	  if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
   258 	    dict->lhmin = atoi(buf + 13);
   259 	    return;
   260 	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
   261 	    dict->rhmin = atoi(buf + 14);
   262 	    return;
   263 	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
   264 	    dict->clhmin = atoi(buf + 21);
   265 	    return;
   266 	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
   267 	    dict->crhmin = atoi(buf + 22);
   268 	    return;
   269 	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
   270 	    char * space = buf + 8;
   271 	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
   272 	    if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
   273 	    if (dict->nohyphen) {
   274 	        char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
   275 	        *nhe = 0;
   276 	        for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
   277 	                if (*nhe == ',') {
   278 	                    dict->nohyphenl++;
   279 	                    *nhe = 0;
   280 	                }
   281 	        }
   282 	    }
   283 	    return;
   284 	  } 
   285 	  j = 0;
   286 	  pattern[j] = '0';
   287           repl = strchr(buf, '/');
   288           replindex = 0;
   289           replcut = 0;
   290           if (repl) {
   291             char * index = strchr(repl + 1, ',');
   292             *repl = '\0';
   293             if (index) {
   294                 char * index2 = strchr(index + 1, ',');
   295                 *index = '\0';
   296                 if (index2) {
   297                     *index2 = '\0';
   298                     replindex = (signed char) atoi(index + 1) - 1;
   299                     replcut = (signed char) atoi(index2 + 1);                
   300                 }
   301             } else {
   302                 hnj_strchomp(repl + 1);
   303                 replindex = 0;
   304                 replcut = (signed char) strlen(buf);
   305             }
   306             repl = hnj_strdup(repl + 1);
   307           }
   308 	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
   309 	    {
   310 	      if (buf[i] >= '0' && buf[i] <= '9')
   311 		pattern[j] = buf[i];
   312 	      else
   313 		{
   314 		  word[j] = buf[i];
   315 		  pattern[++j] = '0';
   316 		}
   317 	    }
   318 	  word[j] = '\0';
   319 	  pattern[j + 1] = '\0';
   321           i = 0;
   322 	  if (!repl) {
   323 	    /* Optimize away leading zeroes */
   324             for (; pattern[i] == '0'; i++);
   325           } else {
   326             if (*word == '.') i++;
   327             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
   328             if (dict->utf8) {
   329                 int pu = -1;        /* unicode character position */
   330                 int ps = -1;        /* unicode start position (original replindex) */
   331                 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
   332                 for (; pc < (strlen(word) + 1); pc++) {
   333                 /* beginning of an UTF-8 character (not '10' start bits) */
   334                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
   335                     if ((ps < 0) && (replindex == pu)) {
   336                         ps = replindex;
   337                         replindex = (signed char) pc;
   338                     }
   339                     if ((ps >= 0) && ((pu - ps) == replcut)) {
   340                         replcut = (signed char) (pc - replindex);
   341                         break;
   342                     }
   343                 }
   344                 if (*word == '.') replindex--;
   345             }
   346           }
   348 #ifdef VERBOSE
   349 	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
   350 #endif
   351 	  found = hnj_hash_lookup (hashtab, word);
   352 	  state_num = hnj_get_state (dict, hashtab, word);
   353 	  dict->states[state_num].match = hnj_strdup (pattern + i);
   354 	  dict->states[state_num].repl = repl;
   355 	  dict->states[state_num].replindex = replindex;
   356           if (!replcut) {
   357             dict->states[state_num].replcut = (signed char) strlen(word);
   358           } else {
   359             dict->states[state_num].replcut = replcut;
   360           }
   362 	  /* now, put in the prefix transitions */
   363           for (; found < 0 ;j--)
   364 	    {
   365 	      last_state = state_num;
   366 	      ch = word[j - 1];
   367 	      word[j - 1] = '\0';
   368 	      found = hnj_hash_lookup (hashtab, word);
   369 	      state_num = hnj_get_state (dict, hashtab, word);
   370 	      hnj_add_trans (dict, state_num, last_state, ch);
   371 	    }
   372 }
   374 HyphenDict *
   375 hnj_hyphen_load (const char *fn)
   376 {
   377   HyphenDict *dict[2];
   378   HashTab *hashtab;
   379   FILE *f;
   380   char buf[MAX_CHARS];
   381   int nextlevel = 0;
   382   int i, j, k;
   383   HashEntry *e;
   384   int state_num = 0;
   386   f = fopen (fn, "r");
   387   if (f == NULL)
   388     return NULL;
   390 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
   391 for (k = 0; k < 2; k++) { 
   392   hashtab = hnj_hash_new ();
   393 #ifdef VERBOSE
   394   global[k] = hashtab;
   395 #endif
   396   hnj_hash_insert (hashtab, "", 0);
   397   dict[k] = hnj_malloc (sizeof(HyphenDict));
   398   dict[k]->num_states = 1;
   399   dict[k]->states = hnj_malloc (sizeof(HyphenState));
   400   dict[k]->states[0].match = NULL;
   401   dict[k]->states[0].repl = NULL;
   402   dict[k]->states[0].fallback_state = -1;
   403   dict[k]->states[0].num_trans = 0;
   404   dict[k]->states[0].trans = NULL;
   405   dict[k]->nextlevel = NULL;
   406   dict[k]->lhmin = 0;
   407   dict[k]->rhmin = 0;
   408   dict[k]->clhmin = 0;
   409   dict[k]->crhmin = 0;
   410   dict[k]->nohyphen = NULL;
   411   dict[k]->nohyphenl = 0;
   413   /* read in character set info */
   414   if (k == 0) {
   415     for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
   416     if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
   417       for (i=0;i<MAX_NAME;i++)
   418         if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
   419           dict[k]->cset[i] = 0;
   420     } else {
   421       dict[k]->cset[0] = 0;
   422     }
   423     dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
   424   } else {
   425     strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
   426     dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
   427     dict[k]->utf8 = dict[0]->utf8;
   428   }
   430   if (k == 0 || nextlevel) {
   431     while (fgets (buf, sizeof(buf), f) != NULL) {
   432       if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
   433 	nextlevel = 1;
   434 	break;
   435       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
   436     }
   437   } else if (k == 1) {
   438     /* default first level: hyphen and ASCII apostrophe */
   439     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
   440     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
   441     strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
   442     buf[MAX_CHARS-1] = '\0';
   443     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
   444     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
   445     if (dict[0]->utf8) {
   446       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
   447       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
   448     }
   449   }
   451   /* Could do unioning of matches here (instead of the preprocessor script).
   452      If we did, the pseudocode would look something like this:
   454      foreach state in the hash table
   455         foreach i = [1..length(state) - 1]
   456            state to check is substr (state, i)
   457            look it up
   458            if found, and if there is a match, union the match in.
   460      It's also possible to avoid the quadratic blowup by doing the
   461      search in order of increasing state string sizes - then you
   462      can break the loop after finding the first match.
   464      This step should be optional in any case - if there is a
   465      preprocessed rule table, it's always faster to use that.
   467 */
   469   /* put in the fallback states */
   470   for (i = 0; i < HASH_SIZE; i++)
   471     for (e = hashtab->entries[i]; e; e = e->next)
   472       {
   473 	if (*(e->key)) for (j = 1; 1; j++)
   474 	  {          
   475 	    state_num = hnj_hash_lookup (hashtab, e->key + j);
   476 	    if (state_num >= 0)
   477 	      break;
   478 	  }
   479         /* KBH: FIXME state 0 fallback_state should always be -1? */
   480 	if (e->val)
   481 	  dict[k]->states[e->val].fallback_state = state_num;
   482       }
   483 #ifdef VERBOSE
   484   for (i = 0; i < HASH_SIZE; i++)
   485     for (e = hashtab->entries[i]; e; e = e->next)
   486       {
   487 	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
   488 		dict[k]->states[e->val].fallback_state);
   489 	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
   490 	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
   491 		  dict[k]->states[e->val].trans[j].new_state);
   492       }
   493 #endif
   495 #ifndef VERBOSE
   496   hnj_hash_free (hashtab);
   497 #endif
   498   state_num = 0;
   499 }
   500   fclose(f);
   501   if (nextlevel) dict[0]->nextlevel = dict[1];
   502   else {
   503     dict[1] -> nextlevel = dict[0];
   504     dict[1]->lhmin = dict[0]->lhmin;
   505     dict[1]->rhmin = dict[0]->rhmin;
   506     dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
   507     dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
   508 #ifdef VERBOSE
   509     HashTab *r = global[0];
   510     global[0] = global[1];
   511     global[1] = r;
   512 #endif
   513     return dict[1];
   514   }
   515   return dict[0];
   516 }
   518 void hnj_hyphen_free (HyphenDict *dict)
   519 {
   520   int state_num;
   521   HyphenState *hstate;
   523   for (state_num = 0; state_num < dict->num_states; state_num++)
   524     {
   525       hstate = &dict->states[state_num];
   526       if (hstate->match)
   527 	hnj_free (hstate->match);
   528       if (hstate->repl)
   529 	hnj_free (hstate->repl);
   530       if (hstate->trans)
   531 	hnj_free (hstate->trans);
   532     }
   533   if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
   535   if (dict->nohyphen) hnj_free(dict->nohyphen);
   537   hnj_free (dict->states);
   539   hnj_free (dict);
   540 }
   542 #define MAX_WORD 256
   544 int hnj_hyphen_hyphenate (HyphenDict *dict,
   545 			   const char *word, int word_size,
   546 			   char *hyphens)
   547 {
   548   char *prep_word;
   549   int i, j, k;
   550   int state;
   551   char ch;
   552   HyphenState *hstate;
   553   char *match;
   554   int offset;
   556   prep_word = hnj_malloc (word_size + 3);
   558   j = 0;
   559   prep_word[j++] = '.';
   561   for (i = 0; i < word_size; i++) {
   562     if (word[i] <= '9' && word[i] >= '0') {
   563       prep_word[j++] = '.';
   564     } else {
   565       prep_word[j++] = word[i];
   566     }
   567   }
   569   prep_word[j++] = '.';
   570   prep_word[j] = '\0';
   572   for (i = 0; i < word_size + 5; i++)
   573     hyphens[i] = '0';
   575 #ifdef VERBOSE
   576   printf ("prep_word = %s\n", prep_word);
   577 #endif
   579   /* now, run the finite state machine */
   580   state = 0;
   581   for (i = 0; i < j; i++)
   582     {
   583       ch = prep_word[i];
   584       for (;;)
   585 	{
   587 	  if (state == -1) {
   588             /* return 1; */
   589 	    /*  KBH: FIXME shouldn't this be as follows? */
   590             state = 0;
   591             goto try_next_letter;
   592           }          
   594 #ifdef VERBOSE
   595 	  char *state_str;
   596 	  state_str = get_state_str (state, 0);
   598 	  for (k = 0; k < i - strlen (state_str); k++)
   599 	    putchar (' ');
   600 	  printf ("%s", state_str);
   601 #endif
   603 	  hstate = &dict->states[state];
   604 	  for (k = 0; k < hstate->num_trans; k++)
   605 	    if (hstate->trans[k].ch == ch)
   606 	      {
   607 		state = hstate->trans[k].new_state;
   608 		goto found_state;
   609 	      }
   610 	  state = hstate->fallback_state;
   611 #ifdef VERBOSE
   612 	  printf (" falling back, fallback_state %d\n", state);
   613 #endif
   614 	}
   615     found_state:
   616 #ifdef VERBOSE
   617       printf ("found state %d\n",state);
   618 #endif
   619       /* Additional optimization is possible here - especially,
   620 	 elimination of trailing zeroes from the match. Leading zeroes
   621 	 have already been optimized. */
   622       match = dict->states[state].match;
   623       /* replacing rules not handled by hyphen_hyphenate() */
   624       if (match && !dict->states[state].repl)
   625 	{
   626 	  offset = i + 1 - strlen (match);
   627 #ifdef VERBOSE
   628 	  for (k = 0; k < offset; k++)
   629 	    putchar (' ');
   630 	  printf ("%s\n", match);
   631 #endif
   632 	  /* This is a linear search because I tried a binary search and
   633 	     found it to be just a teeny bit slower. */
   634 	  for (k = 0; match[k]; k++)
   635 	    if (hyphens[offset + k] < match[k])
   636 	      hyphens[offset + k] = match[k];
   637 	}
   639       /* KBH: we need this to make sure we keep looking in a word */
   640       /* for patterns even if the current character is not known in state 0 */
   641       /* since patterns for hyphenation may occur anywhere in the word */
   642       try_next_letter: ;
   644     }
   645 #ifdef VERBOSE
   646   for (i = 0; i < j; i++)
   647     putchar (hyphens[i]);
   648   putchar ('\n');
   649 #endif
   651   for (i = 0; i < j - 4; i++)
   652 #if 0
   653     if (hyphens[i + 1] & 1)
   654       hyphens[i] = '-';
   655 #else
   656     hyphens[i] = hyphens[i + 1];
   657 #endif
   658   hyphens[0] = '0';
   659   for (; i < word_size; i++)
   660     hyphens[i] = '0';
   661   hyphens[word_size] = '\0';
   663   hnj_free (prep_word);
   665   return 0;    
   666 }
   668 /* Unicode ligature length */
   669 int hnj_ligature(unsigned char c) {
   670     switch (c) {
   671         case 0x80:			/* ff */
   672         case 0x81:			/* fi */
   673         case 0x82: return LIG_xx;	/* fl */
   674         case 0x83:			/* ffi */
   675         case 0x84: return LIG_xxx;	/* ffl */
   676         case 0x85:			/* long st */
   677         case 0x86: return LIG_xx;	/* st */
   678     }
   679     return 0;
   680 }
   682 /* character length of the first n byte of the input word */
   683 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
   684 {
   685     int i = 0;
   686     int j = 0;
   687     while (j < n && word[j] != '\0') {
   688       i++;
   689       // Unicode ligature support
   690       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
   691         i += hnj_ligature(word[j + 2]);
   692       }
   693       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
   694     }
   695     return i;
   696 }
   698 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
   699 	char *** rep, int ** pos, int ** cut, int lhmin)
   700 {
   701     int i = 1, j;
   703     // Unicode ligature support
   704     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
   705       i += hnj_ligature(word[2]);
   706     }
   708     // ignore numbers
   709     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
   711     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
   712       // check length of the non-standard part
   713       if (*rep && *pos && *cut && (*rep)[j]) {
   714         char * rh = strchr((*rep)[j], '=');
   715         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
   716           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
   717             free((*rep)[j]);
   718             (*rep)[j] = NULL;
   719             hyphens[j] = '0';
   720           }
   721        } else {
   722          hyphens[j] = '0';
   723        }
   724        j++;
   726        // Unicode ligature support
   727        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
   728          i += hnj_ligature(word[j + 2]);
   729        }
   730     } while (utf8 && (word[j] & 0xc0) == 0x80);
   731     return 0;
   732 }
   734 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
   735 	char *** rep, int ** pos, int ** cut, int rhmin)
   736 {
   737     int i = 0;
   738     int j;
   740     // ignore numbers
   741     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
   743     for (j = word_size - 1; i < rhmin && j > 0; j--) {
   744       // check length of the non-standard part
   745       if (*rep && *pos && *cut && (*rep)[j]) {
   746         char * rh = strchr((*rep)[j], '=');
   747         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
   748           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
   749             free((*rep)[j]);
   750             (*rep)[j] = NULL;
   751             hyphens[j] = '0';
   752           }
   753        } else {
   754          hyphens[j] = '0';
   755        }
   756        if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
   757     }
   758     return 0;
   759 }
   761 // recursive function for compound level hyphenation
   762 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
   763     char * hyphens, char *** rep, int ** pos, int ** cut,
   764     int clhmin, int crhmin, int lend, int rend)
   765 {
   766   char *prep_word;
   767   int i, j, k;
   768   int state;
   769   char ch;
   770   HyphenState *hstate;
   771   char *match;
   772   char *repl;
   773   signed char replindex;
   774   signed char replcut;
   775   int offset;
   776   int * matchlen;
   777   int * matchindex;
   778   char ** matchrepl;  
   779   int isrepl = 0;
   780   int nHyphCount;
   782   size_t prep_word_size = word_size + 3;
   783   prep_word = hnj_malloc (prep_word_size);
   784   matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
   785   matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
   786   matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
   788   j = 0;
   789   prep_word[j++] = '.';
   791   for (i = 0; i < word_size; i++) {
   792     if (word[i] <= '9' && word[i] >= '0') {
   793       prep_word[j++] = '.';
   794     } else {
   795       prep_word[j++] = word[i];
   796     }
   797   }
   801   prep_word[j++] = '.';
   802   prep_word[j] = '\0';
   804   for (i = 0; i < j; i++)
   805     hyphens[i] = '0';    
   807 #ifdef VERBOSE
   808   printf ("prep_word = %s\n", prep_word);
   809 #endif
   811   /* now, run the finite state machine */
   812   state = 0;
   813   for (i = 0; i < j; i++)
   814     {
   815       ch = prep_word[i];
   816       for (;;)
   817 	{
   819 	  if (state == -1) {
   820             /* return 1; */
   821 	    /*  KBH: FIXME shouldn't this be as follows? */
   822             state = 0;
   823             goto try_next_letter;
   824           }          
   826 #ifdef VERBOSE
   827 	  char *state_str;
   828 	  state_str = get_state_str (state, 1);
   830 	  for (k = 0; k < i - strlen (state_str); k++)
   831 	    putchar (' ');
   832 	  printf ("%s", state_str);
   833 #endif
   835 	  hstate = &dict->states[state];
   836 	  for (k = 0; k < hstate->num_trans; k++)
   837 	    if (hstate->trans[k].ch == ch)
   838 	      {
   839 		state = hstate->trans[k].new_state;
   840 		goto found_state;
   841 	      }
   842 	  state = hstate->fallback_state;
   843 #ifdef VERBOSE
   844 	  printf (" falling back, fallback_state %d\n", state);
   845 #endif
   846 	}
   847     found_state:
   848 #ifdef VERBOSE
   849       printf ("found state %d\n",state);
   850 #endif
   851       /* Additional optimization is possible here - especially,
   852 	 elimination of trailing zeroes from the match. Leading zeroes
   853 	 have already been optimized. */
   854       match = dict->states[state].match;
   855       repl = dict->states[state].repl;
   856       replindex = dict->states[state].replindex;
   857       replcut = dict->states[state].replcut;
   858       /* replacing rules not handled by hyphen_hyphenate() */
   859       if (match)
   860 	{
   861 	  offset = i + 1 - strlen (match);
   862 #ifdef VERBOSE
   863 	  for (k = 0; k < offset; k++)
   864 	    putchar (' ');
   865 	  printf ("%s (%s)\n", match, repl);
   866 #endif
   867           if (repl) {
   868             if (!isrepl) for(; isrepl < word_size; isrepl++) {
   869                 matchrepl[isrepl] = NULL;
   870                 matchindex[isrepl] = -1;
   871             }
   872             matchlen[offset + replindex] = replcut;
   873           }
   874 	  /* This is a linear search because I tried a binary search and
   875 	     found it to be just a teeny bit slower. */
   876 	  for (k = 0; match[k]; k++) {
   877 	    if ((hyphens[offset + k] < match[k])) {
   878 	      hyphens[offset + k] = match[k];
   879               if (match[k]&1) {
   880                 matchrepl[offset + k] = repl;
   881                 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
   882                     matchindex[offset + replindex] = offset + k;
   883                 }
   884               }
   885             }
   886           }
   888 	}
   890       /* KBH: we need this to make sure we keep looking in a word */
   891       /* for patterns even if the current character is not known in state 0 */
   892       /* since patterns for hyphenation may occur anywhere in the word */
   893       try_next_letter: ;
   895     }
   896 #ifdef VERBOSE
   897   for (i = 0; i < j; i++)
   898     putchar (hyphens[i]);
   899   putchar ('\n');
   900 #endif
   902   for (i = 0; i < j - 3; i++)
   903 #if 0
   904     if (hyphens[i + 1] & 1)
   905       hyphens[i] = '-';
   906 #else
   907     hyphens[i] = hyphens[i + 1];
   908 #endif
   909   for (; i < word_size; i++)
   910     hyphens[i] = '0';
   911   hyphens[word_size] = '\0';
   913        /* now create a new char string showing hyphenation positions */
   914        /* count the hyphens and allocate space for the new hyphenated string */
   915        nHyphCount = 0;
   916        for (i = 0; i < word_size; i++)
   917           if (hyphens[i]&1)
   918              nHyphCount++;
   919        j = 0;
   920        for (i = 0; i < word_size; i++) {
   921            if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 
   922                 if (rep && pos && cut) {
   923                     if (!*rep)
   924                         *rep = (char **) calloc(word_size, sizeof(char *));
   925                     if (!*pos)
   926                         *pos = (int *) calloc(word_size, sizeof(int));
   927                     if (!*cut) {
   928                         *cut = (int *) calloc(word_size, sizeof(int));
   929                     }
   930                     (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
   931                     (*pos)[matchindex[i] - 1] = matchindex[i] - i;
   932                     (*cut)[matchindex[i] - 1] = matchlen[i];
   933                 }
   934                 j += strlen(matchrepl[matchindex[i]]);
   935                 i += matchlen[i] - 1;
   936           }
   937        }
   939   hnj_free (matchrepl);
   940   hnj_free (matchlen);
   941   hnj_free (matchindex);
   943   // recursive hyphenation of the first (compound) level segments
   944   if (dict->nextlevel) {
   945      char ** rep2;
   946      int * pos2;
   947      int * cut2;
   948      char * hyphens2;
   949      int begin = 0;
   951      rep2 = hnj_malloc (word_size * sizeof(char *));
   952      pos2 = hnj_malloc (word_size * sizeof(int));
   953      cut2 = hnj_malloc (word_size * sizeof(int));
   954      hyphens2 = hnj_malloc (word_size + 3);
   955      for (i = 0; i < word_size; i++) rep2[i] = NULL;
   956      for (i = 0; i < word_size; i++) if 
   957         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
   958         if (i - begin > 1) {
   959             int hyph = 0;
   960             prep_word[i + 2] = '\0';
   961             /* non-standard hyphenation at compound boundary (Schiffahrt) */
   962             if (rep && *rep && *pos && *cut && (*rep)[i]) {
   963                 char * l = strchr((*rep)[i], '=');
   964                 size_t offset = 2 + i - (*pos)[i];
   965                 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
   966                 prep_word[prep_word_size - 1] = '\0';
   967                 if (l) {
   968                     hyph = (l - (*rep)[i]) - (*pos)[i];
   969                     prep_word[2 + i + hyph] = '\0';
   970                 }
   971             }
   972             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
   973                 hyphens2, &rep2, &pos2, &cut2, clhmin,
   974                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
   975             for (j = 0; j < i - begin - 1; j++) {
   976                 hyphens[begin + j] = hyphens2[j];
   977                 if (rep2[j] && rep && pos && cut) {
   978                     if (!*rep && !*pos && !*cut) {
   979                         int k;
   980                         *rep = (char **) malloc(sizeof(char *) * word_size);
   981                         *pos = (int *) malloc(sizeof(int) * word_size);
   982                         *cut = (int *) malloc(sizeof(int) * word_size);
   983                         for (k = 0; k < word_size; k++) {
   984                             (*rep)[k] = NULL;
   985                             (*pos)[k] = 0;
   986                             (*cut)[k] = 0;
   987                         }
   988                     }
   989                     (*rep)[begin + j] = rep2[j];
   990                     (*pos)[begin + j] = pos2[j];
   991                     (*cut)[begin + j] = cut2[j];
   992                 }
   993             }
   994             prep_word[i + 2] = word[i + 1];
   995             if (*rep && *pos && *cut && (*rep)[i]) {
   996                 size_t offset = 1;
   997                 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
   998                 prep_word[prep_word_size - 1] = '\0';
   999             }
  1001         begin = i + 1;
  1002         for (j = 0; j < word_size; j++) rep2[j] = NULL;
  1005      // non-compound
  1006      if (begin == 0) {
  1007         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
  1008             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
  1009         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  1010             rep, pos, cut, clhmin);
  1011         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  1012             rep, pos, cut, crhmin);
  1015      free(rep2);
  1016      free(cut2);
  1017      free(pos2);
  1018      free(hyphens2);
  1021   hnj_free (prep_word);
  1022   return 0;
  1025 /* UTF-8 normalization of hyphen and non-standard positions */
  1026 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
  1027 	char *** rep, int ** pos, int ** cut)
  1029   int i, j, k;
  1030   if ((((unsigned char) word[0]) >> 6) == 2) {
  1031     fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
  1032     return 1;
  1035   /* calculate UTF-8 character positions */
  1036   for (i = 0, j = -1; i < word_size; i++) {
  1037     /* beginning of an UTF-8 character (not '10' start bits) */
  1038     if ((((unsigned char) word[i]) >> 6) != 2) j++;
  1039     hyphens[j] = hyphens[i];
  1040     if (rep && pos && cut && *rep && *pos && *cut) {
  1041         int l = (*pos)[i];
  1042         (*pos)[j] = 0;
  1043         for (k = 0; k < l; k++) {
  1044             if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
  1046         k = i - l + 1;
  1047         l = k + (*cut)[i];
  1048         (*cut)[j] = 0;        
  1049         for (; k < l; k++) {
  1050             if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
  1052         (*rep)[j] = (*rep)[i];
  1053         if (j < i) {
  1054             (*rep)[i] = NULL;
  1055             (*pos)[i] = 0;
  1056             (*cut)[i] = 0;
  1060   hyphens[j + 1] = '\0';
  1061 #ifdef VERBOSE
  1062   printf ("nums: %s\n", hyphens);
  1063 #endif
  1064   return 0;
  1067 /* get the word with all possible hyphenations (output: hyphword) */
  1068 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
  1069     char * hyphword, char *** rep, int ** pos, int ** cut)
  1071   int hyphenslen = l + 5;
  1073   int i, j;
  1074   for (i = 0, j = 0; i < l; i++, j++) {
  1075     if (hyphens[i]&1) {
  1076       hyphword[j] = word[i];
  1077       if (*rep && *pos && *cut && (*rep)[i]) {
  1078         size_t offset = j - (*pos)[i] + 1;
  1079         strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
  1080         hyphword[hyphenslen-1] = '\0';
  1081         j += strlen((*rep)[i]) - (*pos)[i];
  1082         i += (*cut)[i] - (*pos)[i];
  1083       } else hyphword[++j] = '=';
  1084     } else hyphword[j] = word[i];
  1086   hyphword[j] = '\0';
  1090 /* main api function with default hyphenmin parameters */
  1091 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
  1092 			   const char *word, int word_size, char * hyphens,
  1093 			   char *hyphword, char *** rep, int ** pos, int ** cut)
  1095   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  1096     dict->clhmin, dict->crhmin, 1, 1);
  1097   hnj_hyphen_lhmin(dict->utf8, word, word_size,
  1098     hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
  1099   hnj_hyphen_rhmin(dict->utf8, word, word_size,
  1100     hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
  1102   /* nohyphen */
  1103   if (dict->nohyphen) {
  1104     char * nh = dict->nohyphen;
  1105     int nhi;
  1106     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
  1107         char * nhy = (char *) strstr(word, nh);
  1108         while (nhy) {
  1109             hyphens[nhy - word + strlen(nh) - 1] = '0';
  1110             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
  1111             nhy = (char *) strstr(nhy + 1, nh);
  1113         nh = nh + strlen(nh) + 1;
  1117   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  1118   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  1119 #ifdef VERBOSE
  1120   printf ("nums: %s\n", hyphens);
  1121 #endif
  1122   return 0;
  1125 /* previous main api function with hyphenmin parameters */
  1126 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
  1127 	const char *word, int word_size, char * hyphens,
  1128 	char *hyphword, char *** rep, int ** pos, int ** cut,
  1129 	int lhmin, int rhmin, int clhmin, int crhmin)
  1131   lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
  1132   rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
  1133   clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
  1134   crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
  1135   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
  1136     clhmin, crhmin, 1, 1);
  1137   hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
  1138     rep, pos, cut, (lhmin > 0 ? lhmin : 2));
  1139   hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
  1140     rep, pos, cut, (rhmin > 0 ? rhmin : 2));
  1141   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
  1143   /* nohyphen */
  1144   if (dict->nohyphen) {
  1145     char * nh = dict->nohyphen;
  1146     int nhi;
  1147     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
  1148         char * nhy = (char *) strstr(word, nh);
  1149         while (nhy) {
  1150             hyphens[nhy - word + strlen(nh) - 1] = 0;
  1151             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
  1152             nhy = (char *) strstr(nhy + 1, nh);
  1154         nh = nh + strlen(nh) + 1;
  1158   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
  1159   return 0;

mercurial