The Tor Browser: intl/hyphenation/src/hyphen.c@fc2d59ddac77

     1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both

     2  * licenses follows.

     3  */

     5 /* LibHnj - a library for high quality hyphenation and justification

     6  * Copyright (C) 1998 Raph Levien,

     7  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),

     8  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)

     9  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)

    10  *

    11  * This library is free software; you can redistribute it and/or

    12  * modify it under the terms of the GNU Library General Public

    13  * License as published by the Free Software Foundation; either

    14  * version 2 of the License, or (at your option) any later version.

    15  *

    16  * This library is distributed in the hope that it will be useful,

    17  * but WITHOUT ANY WARRANTY; without even the implied warranty of

    18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

    19  * Library General Public License for more details.

    20  *

    21  * You should have received a copy of the GNU Library General Public

    22  * License along with this library; if not, write to the

    23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,

    24  * Boston, MA  02111-1307  USA.

    25 */

    27 /*

    28  * The contents of this file are subject to the Mozilla Public License

    29  * Version 1.0 (the "MPL"); you may not use this file except in

    30  * compliance with the MPL.  You may obtain a copy of the MPL at

    31  * http://www.mozilla.org/MPL/

    32  *

    33  * Software distributed under the MPL is distributed on an "AS IS" basis,

    34  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL

    35  * for the specific language governing rights and limitations under the

    36  * MPL.

    37  *

    38  */

    39 #include <stdlib.h> /* for NULL, malloc */

    40 #include <stdio.h>  /* for fprintf */

    41 #include <string.h> /* for strdup */

    43 #ifdef UNX

    44 #include <unistd.h> /* for exit */

    45 #endif

    47 #define noVERBOSE

    49 /* calculate hyphenmin values with long ligature length (2 or 3 characters

    50  * instead of 1 or 2) for comparison with hyphenation without ligatures */

    51 #define noLONG_LIGATURE

    53 #ifdef LONG_LIGATURE

    54 #define LIG_xx	1

    55 #define LIG_xxx	2

    56 #else

    57 #define LIG_xx	0

    58 #define LIG_xxx	1

    59 #endif

    61 #include "hnjalloc.h"

    62 #include "hyphen.h"

    64 static char *

    65 hnj_strdup (const char *s)

    66 {

    67   char *new;

    68   int l;

    70   l = strlen (s);

    71   new = hnj_malloc (l + 1);

    72   memcpy (new, s, l);

    73   new[l] = 0;

    74   return new;

    75 }

    77 /* remove cross-platform text line end characters */

    78 void hnj_strchomp(char * s)

    79 {

    80   int k = strlen(s);

    81   if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';

    82   if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';

    83 }

    85 /* a little bit of a hash table implementation. This simply maps strings

    86    to state numbers */

    88 typedef struct _HashTab HashTab;

    89 typedef struct _HashEntry HashEntry;

    91 /* A cheap, but effective, hack. */

    92 #define HASH_SIZE 31627

    94 struct _HashTab {

    95   HashEntry *entries[HASH_SIZE];

    96 };

    98 struct _HashEntry {

    99   HashEntry *next;

   100   char *key;

   101   int val;

   102 };

   104 /* a char* hash function from ASU - adapted from Gtk+ */

   105 static unsigned int

   106 hnj_string_hash (const char *s)

   107 {

   108   const char *p;

   109   unsigned int h=0, g;

   110   for(p = s; *p != '\0'; p += 1) {

   111     h = ( h << 4 ) + *p;

   112     if ( ( g = h & 0xf0000000 ) ) {

   113       h = h ^ (g >> 24);

   114       h = h ^ g;

   115     }

   116   }

   117   return h /* % M */;

   118 }

   120 static HashTab *

   121 hnj_hash_new (void)

   122 {

   123   HashTab *hashtab;

   124   int i;

   126   hashtab = hnj_malloc (sizeof(HashTab));

   127   for (i = 0; i < HASH_SIZE; i++)

   128     hashtab->entries[i] = NULL;

   130   return hashtab;

   131 }

   133 static void

   134 hnj_hash_free (HashTab *hashtab)

   135 {

   136   int i;

   137   HashEntry *e, *next;

   139   for (i = 0; i < HASH_SIZE; i++)

   140     for (e = hashtab->entries[i]; e; e = next)

   141       {

   142 	next = e->next;

   143 	hnj_free (e->key);

   144 	hnj_free (e);

   145       }

   147   hnj_free (hashtab);

   148 }

   150 /* assumes that key is not already present! */

   151 static void

   152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)

   153 {

   154   int i;

   155   HashEntry *e;

   157   i = hnj_string_hash (key) % HASH_SIZE;

   158   e = hnj_malloc (sizeof(HashEntry));

   159   e->next = hashtab->entries[i];

   160   e->key = hnj_strdup (key);

   161   e->val = val;

   162   hashtab->entries[i] = e;

   163 }

   165 /* return val if found, otherwise -1 */

   166 static int

   167 hnj_hash_lookup (HashTab *hashtab, const char *key)

   168 {

   169   int i;

   170   HashEntry *e;

   171   i = hnj_string_hash (key) % HASH_SIZE;

   172   for (e = hashtab->entries[i]; e; e = e->next)

   173     if (!strcmp (key, e->key))

   174       return e->val;

   175   return -1;

   176 }

   178 /* Get the state number, allocating a new state if necessary. */

   179 static int

   180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)

   181 {

   182   int state_num;

   184   state_num = hnj_hash_lookup (hashtab, string);

   186   if (state_num >= 0)

   187     return state_num;

   189   hnj_hash_insert (hashtab, string, dict->num_states);

   190   /* predicate is true if dict->num_states is a power of two */

   191   if (!(dict->num_states & (dict->num_states - 1)))

   192     {

   193       dict->states = hnj_realloc (dict->states,

   194 				  (dict->num_states << 1) *

   195 				  sizeof(HyphenState));

   196     }

   197   dict->states[dict->num_states].match = NULL;

   198   dict->states[dict->num_states].repl = NULL;

   199   dict->states[dict->num_states].fallback_state = -1;

   200   dict->states[dict->num_states].num_trans = 0;

   201   dict->states[dict->num_states].trans = NULL;

   202   return dict->num_states++;

   203 }

   205 /* add a transition from state1 to state2 through ch - assumes that the

   206    transition does not already exist */

   207 static void

   208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)

   209 {

   210   int num_trans;

   212   num_trans = dict->states[state1].num_trans;

   213   if (num_trans == 0)

   214     {

   215       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));

   216     }

   217   else if (!(num_trans & (num_trans - 1)))

   218     {

   219       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,

   220 						(num_trans << 1) *

   221 						sizeof(HyphenTrans));

   222     }

   223   dict->states[state1].trans[num_trans].ch = ch;

   224   dict->states[state1].trans[num_trans].new_state = state2;

   225   dict->states[state1].num_trans++;

   226 }

   228 #ifdef VERBOSE

   229 HashTab *global[1];

   231 static char *

   232 get_state_str (int state, int level)

   233 {

   234   int i;

   235   HashEntry *e;

   237   for (i = 0; i < HASH_SIZE; i++)

   238     for (e = global[level]->entries[i]; e; e = e->next)

   239       if (e->val == state)

   240 	return e->key;

   241   return NULL;

   242 }

   243 #endif

   245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {

   246   int i, j;

   247   char word[MAX_CHARS];

   248   char pattern[MAX_CHARS];

   249   char * repl;

   250   signed char replindex;

   251   signed char replcut;

   252   int state_num = 0;

   253   int last_state;

   254   char ch;

   255   int found;

   257 	  if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {

   258 	    dict->lhmin = atoi(buf + 13);

   259 	    return;

   260 	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {

   261 	    dict->rhmin = atoi(buf + 14);

   262 	    return;

   263 	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {

   264 	    dict->clhmin = atoi(buf + 21);

   265 	    return;

   266 	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {

   267 	    dict->crhmin = atoi(buf + 22);

   268 	    return;

   269 	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {

   270 	    char * space = buf + 8;

   271 	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;

   272 	    if (*buf != '\0') dict->nohyphen = hnj_strdup(space);

   273 	    if (dict->nohyphen) {

   274 	        char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;

   275 	        *nhe = 0;

   276 	        for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {

   277 	                if (*nhe == ',') {

   278 	                    dict->nohyphenl++;

   279 	                    *nhe = 0;

   280 	                }

   281 	        }

   282 	    }

   283 	    return;

   284 	  }

   285 	  j = 0;

   286 	  pattern[j] = '0';

   287           repl = strchr(buf, '/');

   288           replindex = 0;

   289           replcut = 0;

   290           if (repl) {

   291             char * index = strchr(repl + 1, ',');

   292             *repl = '\0';

   293             if (index) {

   294                 char * index2 = strchr(index + 1, ',');

   295                 *index = '\0';

   296                 if (index2) {

   297                     *index2 = '\0';

   298                     replindex = (signed char) atoi(index + 1) - 1;

   299                     replcut = (signed char) atoi(index2 + 1);

   300                 }

   301             } else {

   302                 hnj_strchomp(repl + 1);

   303                 replindex = 0;

   304                 replcut = (signed char) strlen(buf);

   305             }

   306             repl = hnj_strdup(repl + 1);

   307           }

   308 	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)

   309 	    {

   310 	      if (buf[i] >= '0' && buf[i] <= '9')

   311 		pattern[j] = buf[i];

   312 	      else

   313 		{

   314 		  word[j] = buf[i];

   315 		  pattern[++j] = '0';

   316 		}

   317 	    }

   318 	  word[j] = '\0';

   319 	  pattern[j + 1] = '\0';

   321           i = 0;

   322 	  if (!repl) {

   323 	    /* Optimize away leading zeroes */

   324             for (; pattern[i] == '0'; i++);

   325           } else {

   326             if (*word == '.') i++;

   327             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */

   328             if (dict->utf8) {

   329                 int pu = -1;        /* unicode character position */

   330                 int ps = -1;        /* unicode start position (original replindex) */

   331                 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */

   332                 for (; pc < (strlen(word) + 1); pc++) {

   333                 /* beginning of an UTF-8 character (not '10' start bits) */

   334                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;

   335                     if ((ps < 0) && (replindex == pu)) {

   336                         ps = replindex;

   337                         replindex = (signed char) pc;

   338                     }

   339                     if ((ps >= 0) && ((pu - ps) == replcut)) {

   340                         replcut = (signed char) (pc - replindex);

   341                         break;

   342                     }

   343                 }

   344                 if (*word == '.') replindex--;

   345             }

   346           }

   348 #ifdef VERBOSE

   349 	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);

   350 #endif

   351 	  found = hnj_hash_lookup (hashtab, word);

   352 	  state_num = hnj_get_state (dict, hashtab, word);

   353 	  dict->states[state_num].match = hnj_strdup (pattern + i);

   354 	  dict->states[state_num].repl = repl;

   355 	  dict->states[state_num].replindex = replindex;

   356           if (!replcut) {

   357             dict->states[state_num].replcut = (signed char) strlen(word);

   358           } else {

   359             dict->states[state_num].replcut = replcut;

   360           }

   362 	  /* now, put in the prefix transitions */

   363           for (; found < 0 ;j--)

   364 	    {

   365 	      last_state = state_num;

   366 	      ch = word[j - 1];

   367 	      word[j - 1] = '\0';

   368 	      found = hnj_hash_lookup (hashtab, word);

   369 	      state_num = hnj_get_state (dict, hashtab, word);

   370 	      hnj_add_trans (dict, state_num, last_state, ch);

   371 	    }

   372 }

   374 HyphenDict *

   375 hnj_hyphen_load (const char *fn)

   376 {

   377   HyphenDict *dict[2];

   378   HashTab *hashtab;

   379   FILE *f;

   380   char buf[MAX_CHARS];

   381   int nextlevel = 0;

   382   int i, j, k;

   383   HashEntry *e;

   384   int state_num = 0;

   386   f = fopen (fn, "r");

   387   if (f == NULL)

   388     return NULL;

   390 // loading one or two dictionaries (separated by NEXTLEVEL keyword)

   391 for (k = 0; k < 2; k++) {

   392   hashtab = hnj_hash_new ();

   393 #ifdef VERBOSE

   394   global[k] = hashtab;

   395 #endif

   396   hnj_hash_insert (hashtab, "", 0);

   397   dict[k] = hnj_malloc (sizeof(HyphenDict));

   398   dict[k]->num_states = 1;

   399   dict[k]->states = hnj_malloc (sizeof(HyphenState));

   400   dict[k]->states[0].match = NULL;

   401   dict[k]->states[0].repl = NULL;

   402   dict[k]->states[0].fallback_state = -1;

   403   dict[k]->states[0].num_trans = 0;

   404   dict[k]->states[0].trans = NULL;

   405   dict[k]->nextlevel = NULL;

   406   dict[k]->lhmin = 0;

   407   dict[k]->rhmin = 0;

   408   dict[k]->clhmin = 0;

   409   dict[k]->crhmin = 0;

   410   dict[k]->nohyphen = NULL;

   411   dict[k]->nohyphenl = 0;

   413   /* read in character set info */

   414   if (k == 0) {

   415     for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;

   416     if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {

   417       for (i=0;i<MAX_NAME;i++)

   418         if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))

   419           dict[k]->cset[i] = 0;

   420     } else {

   421       dict[k]->cset[0] = 0;

   422     }

   423     dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);

   424   } else {

   425     strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);

   426     dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';

   427     dict[k]->utf8 = dict[0]->utf8;

   428   }

   430   if (k == 0 || nextlevel) {

   431     while (fgets (buf, sizeof(buf), f) != NULL) {

   432       if (strncmp(buf, "NEXTLEVEL", 9) == 0) {

   433 	nextlevel = 1;

   434 	break;

   435       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);

   436     }

   437   } else if (k == 1) {

   438     /* default first level: hyphen and ASCII apostrophe */

   439     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);

   440     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);

   441     strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here

   442     buf[MAX_CHARS-1] = '\0';

   443     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */

   444     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */

   445     if (dict[0]->utf8) {

   446       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */

   447       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */

   448     }

   449   }

   451   /* Could do unioning of matches here (instead of the preprocessor script).

   452      If we did, the pseudocode would look something like this:

   454      foreach state in the hash table

   455         foreach i = [1..length(state) - 1]

   456            state to check is substr (state, i)

   457            look it up

   458            if found, and if there is a match, union the match in.

   460      It's also possible to avoid the quadratic blowup by doing the

   461      search in order of increasing state string sizes - then you

   462      can break the loop after finding the first match.

   464      This step should be optional in any case - if there is a

   465      preprocessed rule table, it's always faster to use that.

   467 */

   469   /* put in the fallback states */

   470   for (i = 0; i < HASH_SIZE; i++)

   471     for (e = hashtab->entries[i]; e; e = e->next)

   472       {

   473 	if (*(e->key)) for (j = 1; 1; j++)

   474 	  {

   475 	    state_num = hnj_hash_lookup (hashtab, e->key + j);

   476 	    if (state_num >= 0)

   477 	      break;

   478 	  }

   479         /* KBH: FIXME state 0 fallback_state should always be -1? */

   480 	if (e->val)

   481 	  dict[k]->states[e->val].fallback_state = state_num;

   482       }

   483 #ifdef VERBOSE

   484   for (i = 0; i < HASH_SIZE; i++)

   485     for (e = hashtab->entries[i]; e; e = e->next)

   486       {

   487 	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,

   488 		dict[k]->states[e->val].fallback_state);

   489 	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)

   490 	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,

   491 		  dict[k]->states[e->val].trans[j].new_state);

   492       }

   493 #endif

   495 #ifndef VERBOSE

   496   hnj_hash_free (hashtab);

   497 #endif

   498   state_num = 0;

   499 }

   500   fclose(f);

   501   if (nextlevel) dict[0]->nextlevel = dict[1];

   502   else {

   503     dict[1] -> nextlevel = dict[0];

   504     dict[1]->lhmin = dict[0]->lhmin;

   505     dict[1]->rhmin = dict[0]->rhmin;

   506     dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);

   507     dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);

   508 #ifdef VERBOSE

   509     HashTab *r = global[0];

   510     global[0] = global[1];

   511     global[1] = r;

   512 #endif

   513     return dict[1];

   514   }

   515   return dict[0];

   516 }

   518 void hnj_hyphen_free (HyphenDict *dict)

   519 {

   520   int state_num;

   521   HyphenState *hstate;

   523   for (state_num = 0; state_num < dict->num_states; state_num++)

   524     {

   525       hstate = &dict->states[state_num];

   526       if (hstate->match)

   527 	hnj_free (hstate->match);

   528       if (hstate->repl)

   529 	hnj_free (hstate->repl);

   530       if (hstate->trans)

   531 	hnj_free (hstate->trans);

   532     }

   533   if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

   535   if (dict->nohyphen) hnj_free(dict->nohyphen);

   537   hnj_free (dict->states);

   539   hnj_free (dict);

   540 }

   542 #define MAX_WORD 256

   544 int hnj_hyphen_hyphenate (HyphenDict *dict,

   545 			   const char *word, int word_size,

   546 			   char *hyphens)

   547 {

   548   char *prep_word;

   549   int i, j, k;

   550   int state;

   551   char ch;

   552   HyphenState *hstate;

   553   char *match;

   554   int offset;

   556   prep_word = hnj_malloc (word_size + 3);

   558   j = 0;

   559   prep_word[j++] = '.';

   561   for (i = 0; i < word_size; i++) {

   562     if (word[i] <= '9' && word[i] >= '0') {

   563       prep_word[j++] = '.';

   564     } else {

   565       prep_word[j++] = word[i];

   566     }

   567   }

   569   prep_word[j++] = '.';

   570   prep_word[j] = '\0';

   572   for (i = 0; i < word_size + 5; i++)

   573     hyphens[i] = '0';

   575 #ifdef VERBOSE

   576   printf ("prep_word = %s\n", prep_word);

   577 #endif

   579   /* now, run the finite state machine */

   580   state = 0;

   581   for (i = 0; i < j; i++)

   582     {

   583       ch = prep_word[i];

   584       for (;;)

   585 	{

   587 	  if (state == -1) {

   588             /* return 1; */

   589 	    /*  KBH: FIXME shouldn't this be as follows? */

   590             state = 0;

   591             goto try_next_letter;

   592           }

   594 #ifdef VERBOSE

   595 	  char *state_str;

   596 	  state_str = get_state_str (state, 0);

   598 	  for (k = 0; k < i - strlen (state_str); k++)

   599 	    putchar (' ');

   600 	  printf ("%s", state_str);

   601 #endif

   603 	  hstate = &dict->states[state];

   604 	  for (k = 0; k < hstate->num_trans; k++)

   605 	    if (hstate->trans[k].ch == ch)

   606 	      {

   607 		state = hstate->trans[k].new_state;

   608 		goto found_state;

   609 	      }

   610 	  state = hstate->fallback_state;

   611 #ifdef VERBOSE

   612 	  printf (" falling back, fallback_state %d\n", state);

   613 #endif

   614 	}

   615     found_state:

   616 #ifdef VERBOSE

   617       printf ("found state %d\n",state);

   618 #endif

   619       /* Additional optimization is possible here - especially,

   620 	 elimination of trailing zeroes from the match. Leading zeroes

   621 	 have already been optimized. */

   622       match = dict->states[state].match;

   623       /* replacing rules not handled by hyphen_hyphenate() */

   624       if (match && !dict->states[state].repl)

   625 	{

   626 	  offset = i + 1 - strlen (match);

   627 #ifdef VERBOSE

   628 	  for (k = 0; k < offset; k++)

   629 	    putchar (' ');

   630 	  printf ("%s\n", match);

   631 #endif

   632 	  /* This is a linear search because I tried a binary search and

   633 	     found it to be just a teeny bit slower. */

   634 	  for (k = 0; match[k]; k++)

   635 	    if (hyphens[offset + k] < match[k])

   636 	      hyphens[offset + k] = match[k];

   637 	}

   639       /* KBH: we need this to make sure we keep looking in a word */

   640       /* for patterns even if the current character is not known in state 0 */

   641       /* since patterns for hyphenation may occur anywhere in the word */

   642       try_next_letter: ;

   644     }

   645 #ifdef VERBOSE

   646   for (i = 0; i < j; i++)

   647     putchar (hyphens[i]);

   648   putchar ('\n');

   649 #endif

   651   for (i = 0; i < j - 4; i++)

   652 #if 0

   653     if (hyphens[i + 1] & 1)

   654       hyphens[i] = '-';

   655 #else

   656     hyphens[i] = hyphens[i + 1];

   657 #endif

   658   hyphens[0] = '0';

   659   for (; i < word_size; i++)

   660     hyphens[i] = '0';

   661   hyphens[word_size] = '\0';

   663   hnj_free (prep_word);

   665   return 0;

   666 }

   668 /* Unicode ligature length */

   669 int hnj_ligature(unsigned char c) {

   670     switch (c) {

   671         case 0x80:			/* ff */

   672         case 0x81:			/* fi */

   673         case 0x82: return LIG_xx;	/* fl */

   674         case 0x83:			/* ffi */

   675         case 0x84: return LIG_xxx;	/* ffl */

   676         case 0x85:			/* long st */

   677         case 0x86: return LIG_xx;	/* st */

   678     }

   679     return 0;

   680 }

   682 /* character length of the first n byte of the input word */

   683 int hnj_hyphen_strnlen(const char * word, int n, int utf8)

   684 {

   685     int i = 0;

   686     int j = 0;

   687     while (j < n && word[j] != '\0') {

   688       i++;

   689       // Unicode ligature support

   690       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {

   691         i += hnj_ligature(word[j + 2]);

   692       }

   693       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);

   694     }

   695     return i;

   696 }

   698 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,

   699 	char *** rep, int ** pos, int ** cut, int lhmin)

   700 {

   701     int i = 1, j;

   703     // Unicode ligature support

   704     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {

   705       i += hnj_ligature(word[2]);

   706     }

   708     // ignore numbers

   709     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;

   711     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {

   712       // check length of the non-standard part

   713       if (*rep && *pos && *cut && (*rep)[j]) {

   714         char * rh = strchr((*rep)[j], '=');

   715         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +

   716           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {

   717             free((*rep)[j]);

   718             (*rep)[j] = NULL;

   719             hyphens[j] = '0';

   720           }

   721        } else {

   722          hyphens[j] = '0';

   723        }

   724        j++;

   726        // Unicode ligature support

   727        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {

   728          i += hnj_ligature(word[j + 2]);

   729        }

   730     } while (utf8 && (word[j] & 0xc0) == 0x80);

   731     return 0;

   732 }

   734 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,

   735 	char *** rep, int ** pos, int ** cut, int rhmin)

   736 {

   737     int i = 0;

   738     int j;

   740     // ignore numbers

   741     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;

   743     for (j = word_size - 1; i < rhmin && j > 0; j--) {

   744       // check length of the non-standard part

   745       if (*rep && *pos && *cut && (*rep)[j]) {

   746         char * rh = strchr((*rep)[j], '=');

   747         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +

   748           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {

   749             free((*rep)[j]);

   750             (*rep)[j] = NULL;

   751             hyphens[j] = '0';

   752           }

   753        } else {

   754          hyphens[j] = '0';

   755        }

   756        if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;

   757     }

   758     return 0;

   759 }

   761 // recursive function for compound level hyphenation

   762 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,

   763     char * hyphens, char *** rep, int ** pos, int ** cut,

   764     int clhmin, int crhmin, int lend, int rend)

   765 {

   766   char *prep_word;

   767   int i, j, k;

   768   int state;

   769   char ch;

   770   HyphenState *hstate;

   771   char *match;

   772   char *repl;

   773   signed char replindex;

   774   signed char replcut;

   775   int offset;

   776   int * matchlen;

   777   int * matchindex;

   778   char ** matchrepl;

   779   int isrepl = 0;

   780   int nHyphCount;

   782   size_t prep_word_size = word_size + 3;

   783   prep_word = hnj_malloc (prep_word_size);

   784   matchlen = hnj_malloc ((word_size + 3) * sizeof(int));

   785   matchindex = hnj_malloc ((word_size + 3) * sizeof(int));

   786   matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));

   788   j = 0;

   789   prep_word[j++] = '.';

   791   for (i = 0; i < word_size; i++) {

   792     if (word[i] <= '9' && word[i] >= '0') {

   793       prep_word[j++] = '.';

   794     } else {

   795       prep_word[j++] = word[i];

   796     }

   797   }

   801   prep_word[j++] = '.';

   802   prep_word[j] = '\0';

   804   for (i = 0; i < j; i++)

   805     hyphens[i] = '0';

   807 #ifdef VERBOSE

   808   printf ("prep_word = %s\n", prep_word);

   809 #endif

   811   /* now, run the finite state machine */

   812   state = 0;

   813   for (i = 0; i < j; i++)

   814     {

   815       ch = prep_word[i];

   816       for (;;)

   817 	{

   819 	  if (state == -1) {

   820             /* return 1; */

   821 	    /*  KBH: FIXME shouldn't this be as follows? */

   822             state = 0;

   823             goto try_next_letter;

   824           }

   826 #ifdef VERBOSE

   827 	  char *state_str;

   828 	  state_str = get_state_str (state, 1);

   830 	  for (k = 0; k < i - strlen (state_str); k++)

   831 	    putchar (' ');

   832 	  printf ("%s", state_str);

   833 #endif

   835 	  hstate = &dict->states[state];

   836 	  for (k = 0; k < hstate->num_trans; k++)

   837 	    if (hstate->trans[k].ch == ch)

   838 	      {

   839 		state = hstate->trans[k].new_state;

   840 		goto found_state;

   841 	      }

   842 	  state = hstate->fallback_state;

   843 #ifdef VERBOSE

   844 	  printf (" falling back, fallback_state %d\n", state);

   845 #endif

   846 	}

   847     found_state:

   848 #ifdef VERBOSE

   849       printf ("found state %d\n",state);

   850 #endif

   851       /* Additional optimization is possible here - especially,

   852 	 elimination of trailing zeroes from the match. Leading zeroes

   853 	 have already been optimized. */

   854       match = dict->states[state].match;

   855       repl = dict->states[state].repl;

   856       replindex = dict->states[state].replindex;

   857       replcut = dict->states[state].replcut;

   858       /* replacing rules not handled by hyphen_hyphenate() */

   859       if (match)

   860 	{

   861 	  offset = i + 1 - strlen (match);

   862 #ifdef VERBOSE

   863 	  for (k = 0; k < offset; k++)

   864 	    putchar (' ');

   865 	  printf ("%s (%s)\n", match, repl);

   866 #endif

   867           if (repl) {

   868             if (!isrepl) for(; isrepl < word_size; isrepl++) {

   869                 matchrepl[isrepl] = NULL;

   870                 matchindex[isrepl] = -1;

   871             }

   872             matchlen[offset + replindex] = replcut;

   873           }

   874 	  /* This is a linear search because I tried a binary search and

   875 	     found it to be just a teeny bit slower. */

   876 	  for (k = 0; match[k]; k++) {

   877 	    if ((hyphens[offset + k] < match[k])) {

   878 	      hyphens[offset + k] = match[k];

   879               if (match[k]&1) {

   880                 matchrepl[offset + k] = repl;

   881                 if (repl && (k >= replindex) && (k <= replindex + replcut)) {

   882                     matchindex[offset + replindex] = offset + k;

   883                 }

   884               }

   885             }

   886           }

   888 	}

   890       /* KBH: we need this to make sure we keep looking in a word */

   891       /* for patterns even if the current character is not known in state 0 */

   892       /* since patterns for hyphenation may occur anywhere in the word */

   893       try_next_letter: ;

   895     }

   896 #ifdef VERBOSE

   897   for (i = 0; i < j; i++)

   898     putchar (hyphens[i]);

   899   putchar ('\n');

   900 #endif

   902   for (i = 0; i < j - 3; i++)

   903 #if 0

   904     if (hyphens[i + 1] & 1)

   905       hyphens[i] = '-';

   906 #else

   907     hyphens[i] = hyphens[i + 1];

   908 #endif

   909   for (; i < word_size; i++)

   910     hyphens[i] = '0';

   911   hyphens[word_size] = '\0';

   913        /* now create a new char string showing hyphenation positions */

   914        /* count the hyphens and allocate space for the new hyphenated string */

   915        nHyphCount = 0;

   916        for (i = 0; i < word_size; i++)

   917           if (hyphens[i]&1)

   918              nHyphCount++;

   919        j = 0;

   920        for (i = 0; i < word_size; i++) {

   921            if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {

   922                 if (rep && pos && cut) {

   923                     if (!*rep)

   924                         *rep = (char **) calloc(word_size, sizeof(char *));

   925                     if (!*pos)

   926                         *pos = (int *) calloc(word_size, sizeof(int));

   927                     if (!*cut) {

   928                         *cut = (int *) calloc(word_size, sizeof(int));

   929                     }

   930                     (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);

   931                     (*pos)[matchindex[i] - 1] = matchindex[i] - i;

   932                     (*cut)[matchindex[i] - 1] = matchlen[i];

   933                 }

   934                 j += strlen(matchrepl[matchindex[i]]);

   935                 i += matchlen[i] - 1;

   936           }

   937        }

   939   hnj_free (matchrepl);

   940   hnj_free (matchlen);

   941   hnj_free (matchindex);

   943   // recursive hyphenation of the first (compound) level segments

   944   if (dict->nextlevel) {

   945      char ** rep2;

   946      int * pos2;

   947      int * cut2;

   948      char * hyphens2;

   949      int begin = 0;

   951      rep2 = hnj_malloc (word_size * sizeof(char *));

   952      pos2 = hnj_malloc (word_size * sizeof(int));

   953      cut2 = hnj_malloc (word_size * sizeof(int));

   954      hyphens2 = hnj_malloc (word_size + 3);

   955      for (i = 0; i < word_size; i++) rep2[i] = NULL;

   956      for (i = 0; i < word_size; i++) if

   957         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {

   958         if (i - begin > 1) {

   959             int hyph = 0;

   960             prep_word[i + 2] = '\0';

   961             /* non-standard hyphenation at compound boundary (Schiffahrt) */

   962             if (rep && *rep && *pos && *cut && (*rep)[i]) {

   963                 char * l = strchr((*rep)[i], '=');

   964                 size_t offset = 2 + i - (*pos)[i];

   965                 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);

   966                 prep_word[prep_word_size - 1] = '\0';

   967                 if (l) {

   968                     hyph = (l - (*rep)[i]) - (*pos)[i];

   969                     prep_word[2 + i + hyph] = '\0';

   970                 }

   971             }

   972             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,

   973                 hyphens2, &rep2, &pos2, &cut2, clhmin,

   974                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));

   975             for (j = 0; j < i - begin - 1; j++) {

   976                 hyphens[begin + j] = hyphens2[j];

   977                 if (rep2[j] && rep && pos && cut) {

   978                     if (!*rep && !*pos && !*cut) {

   979                         int k;

   980                         *rep = (char **) malloc(sizeof(char *) * word_size);

   981                         *pos = (int *) malloc(sizeof(int) * word_size);

   982                         *cut = (int *) malloc(sizeof(int) * word_size);

   983                         for (k = 0; k < word_size; k++) {

   984                             (*rep)[k] = NULL;

   985                             (*pos)[k] = 0;

   986                             (*cut)[k] = 0;

   987                         }

   988                     }

   989                     (*rep)[begin + j] = rep2[j];

   990                     (*pos)[begin + j] = pos2[j];

   991                     (*cut)[begin + j] = cut2[j];

   992                 }

   993             }

   994             prep_word[i + 2] = word[i + 1];

   995             if (*rep && *pos && *cut && (*rep)[i]) {

   996                 size_t offset = 1;

   997                 strncpy(prep_word + offset, word, prep_word_size - offset - 1);

   998                 prep_word[prep_word_size - 1] = '\0';

   999             }

  1000         }

  1001         begin = i + 1;

  1002         for (j = 0; j < word_size; j++) rep2[j] = NULL;

  1003      }

  1005      // non-compound

  1006      if (begin == 0) {

  1007         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,

  1008             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);

  1009         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

  1010             rep, pos, cut, clhmin);

  1011         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

  1012             rep, pos, cut, crhmin);

  1013      }

  1015      free(rep2);

  1016      free(cut2);

  1017      free(pos2);

  1018      free(hyphens2);

  1019   }

  1021   hnj_free (prep_word);

  1022   return 0;

  1023 }

  1025 /* UTF-8 normalization of hyphen and non-standard positions */

  1026 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,

  1027 	char *** rep, int ** pos, int ** cut)

  1028 {

  1029   int i, j, k;

  1030   if ((((unsigned char) word[0]) >> 6) == 2) {

  1031     fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);

  1032     return 1;

  1033   }

  1035   /* calculate UTF-8 character positions */

  1036   for (i = 0, j = -1; i < word_size; i++) {

  1037     /* beginning of an UTF-8 character (not '10' start bits) */

  1038     if ((((unsigned char) word[i]) >> 6) != 2) j++;

  1039     hyphens[j] = hyphens[i];

  1040     if (rep && pos && cut && *rep && *pos && *cut) {

  1041         int l = (*pos)[i];

  1042         (*pos)[j] = 0;

  1043         for (k = 0; k < l; k++) {

  1044             if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;

  1045         }

  1046         k = i - l + 1;

  1047         l = k + (*cut)[i];

  1048         (*cut)[j] = 0;

  1049         for (; k < l; k++) {

  1050             if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;

  1051         }

  1052         (*rep)[j] = (*rep)[i];

  1053         if (j < i) {

  1054             (*rep)[i] = NULL;

  1055             (*pos)[i] = 0;

  1056             (*cut)[i] = 0;

  1057         }

  1058     }

  1059   }

  1060   hyphens[j + 1] = '\0';

  1061 #ifdef VERBOSE

  1062   printf ("nums: %s\n", hyphens);

  1063 #endif

  1064   return 0;

  1065 }

  1067 /* get the word with all possible hyphenations (output: hyphword) */

  1068 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,

  1069     char * hyphword, char *** rep, int ** pos, int ** cut)

  1070 {

  1071   int hyphenslen = l + 5;

  1073   int i, j;

  1074   for (i = 0, j = 0; i < l; i++, j++) {

  1075     if (hyphens[i]&1) {

  1076       hyphword[j] = word[i];

  1077       if (*rep && *pos && *cut && (*rep)[i]) {

  1078         size_t offset = j - (*pos)[i] + 1;

  1079         strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);

  1080         hyphword[hyphenslen-1] = '\0';

  1081         j += strlen((*rep)[i]) - (*pos)[i];

  1082         i += (*cut)[i] - (*pos)[i];

  1083       } else hyphword[++j] = '=';

  1084     } else hyphword[j] = word[i];

  1085   }

  1086   hyphword[j] = '\0';

  1087 }

  1090 /* main api function with default hyphenmin parameters */

  1091 int hnj_hyphen_hyphenate2 (HyphenDict *dict,

  1092 			   const char *word, int word_size, char * hyphens,

  1093 			   char *hyphword, char *** rep, int ** pos, int ** cut)

  1094 {

  1095   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

  1096     dict->clhmin, dict->crhmin, 1, 1);

  1097   hnj_hyphen_lhmin(dict->utf8, word, word_size,

  1098     hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));

  1099   hnj_hyphen_rhmin(dict->utf8, word, word_size,

  1100     hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));

  1102   /* nohyphen */

  1103   if (dict->nohyphen) {

  1104     char * nh = dict->nohyphen;

  1105     int nhi;

  1106     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {

  1107         char * nhy = (char *) strstr(word, nh);

  1108         while (nhy) {

  1109             hyphens[nhy - word + strlen(nh) - 1] = '0';

  1110             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';

  1111             nhy = (char *) strstr(nhy + 1, nh);

  1112         }

  1113         nh = nh + strlen(nh) + 1;

  1114     }

  1115   }

  1117   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

  1118   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

  1119 #ifdef VERBOSE

  1120   printf ("nums: %s\n", hyphens);

  1121 #endif

  1122   return 0;

  1123 }

  1125 /* previous main api function with hyphenmin parameters */

  1126 int hnj_hyphen_hyphenate3 (HyphenDict *dict,

  1127 	const char *word, int word_size, char * hyphens,

  1128 	char *hyphword, char *** rep, int ** pos, int ** cut,

  1129 	int lhmin, int rhmin, int clhmin, int crhmin)

  1130 {

  1131   lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;

  1132   rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;

  1133   clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;

  1134   crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;

  1135   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,

  1136     clhmin, crhmin, 1, 1);

  1137   hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,

  1138     rep, pos, cut, (lhmin > 0 ? lhmin : 2));

  1139   hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,

  1140     rep, pos, cut, (rhmin > 0 ? rhmin : 2));

  1141   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

  1143   /* nohyphen */

  1144   if (dict->nohyphen) {

  1145     char * nh = dict->nohyphen;

  1146     int nhi;

  1147     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {

  1148         char * nhy = (char *) strstr(word, nh);

  1149         while (nhy) {

  1150             hyphens[nhy - word + strlen(nh) - 1] = 0;

  1151             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;

  1152             nhy = (char *) strstr(nhy + 1, nh);

  1153         }

  1154         nh = nh + strlen(nh) + 1;

  1155     }

  1156   }

  1158   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);

  1159   return 0;

  1160 }

The Tor Browser / file revision

intl/hyphenation/src/hyphen.c@fc2d59ddac77

intl/hyphenation/src/hyphen.c