intl/hyphenation/src/hyphen.c

branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
equal deleted inserted replaced
-1:000000000000 0:0adaa130bec6
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2 * licenses follows.
3 */
4
5 /* LibHnj - a library for high quality hyphenation and justification
6 * Copyright (C) 1998 Raph Levien,
7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Library General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Library General Public License for more details.
20 *
21 * You should have received a copy of the GNU Library General Public
22 * License along with this library; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307 USA.
25 */
26
27 /*
28 * The contents of this file are subject to the Mozilla Public License
29 * Version 1.0 (the "MPL"); you may not use this file except in
30 * compliance with the MPL. You may obtain a copy of the MPL at
31 * http://www.mozilla.org/MPL/
32 *
33 * Software distributed under the MPL is distributed on an "AS IS" basis,
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35 * for the specific language governing rights and limitations under the
36 * MPL.
37 *
38 */
39 #include <stdlib.h> /* for NULL, malloc */
40 #include <stdio.h> /* for fprintf */
41 #include <string.h> /* for strdup */
42
43 #ifdef UNX
44 #include <unistd.h> /* for exit */
45 #endif
46
47 #define noVERBOSE
48
49 /* calculate hyphenmin values with long ligature length (2 or 3 characters
50 * instead of 1 or 2) for comparison with hyphenation without ligatures */
51 #define noLONG_LIGATURE
52
53 #ifdef LONG_LIGATURE
54 #define LIG_xx 1
55 #define LIG_xxx 2
56 #else
57 #define LIG_xx 0
58 #define LIG_xxx 1
59 #endif
60
61 #include "hnjalloc.h"
62 #include "hyphen.h"
63
64 static char *
65 hnj_strdup (const char *s)
66 {
67 char *new;
68 int l;
69
70 l = strlen (s);
71 new = hnj_malloc (l + 1);
72 memcpy (new, s, l);
73 new[l] = 0;
74 return new;
75 }
76
77 /* remove cross-platform text line end characters */
78 void hnj_strchomp(char * s)
79 {
80 int k = strlen(s);
81 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
82 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
83 }
84
85 /* a little bit of a hash table implementation. This simply maps strings
86 to state numbers */
87
88 typedef struct _HashTab HashTab;
89 typedef struct _HashEntry HashEntry;
90
91 /* A cheap, but effective, hack. */
92 #define HASH_SIZE 31627
93
94 struct _HashTab {
95 HashEntry *entries[HASH_SIZE];
96 };
97
98 struct _HashEntry {
99 HashEntry *next;
100 char *key;
101 int val;
102 };
103
104 /* a char* hash function from ASU - adapted from Gtk+ */
105 static unsigned int
106 hnj_string_hash (const char *s)
107 {
108 const char *p;
109 unsigned int h=0, g;
110 for(p = s; *p != '\0'; p += 1) {
111 h = ( h << 4 ) + *p;
112 if ( ( g = h & 0xf0000000 ) ) {
113 h = h ^ (g >> 24);
114 h = h ^ g;
115 }
116 }
117 return h /* % M */;
118 }
119
120 static HashTab *
121 hnj_hash_new (void)
122 {
123 HashTab *hashtab;
124 int i;
125
126 hashtab = hnj_malloc (sizeof(HashTab));
127 for (i = 0; i < HASH_SIZE; i++)
128 hashtab->entries[i] = NULL;
129
130 return hashtab;
131 }
132
133 static void
134 hnj_hash_free (HashTab *hashtab)
135 {
136 int i;
137 HashEntry *e, *next;
138
139 for (i = 0; i < HASH_SIZE; i++)
140 for (e = hashtab->entries[i]; e; e = next)
141 {
142 next = e->next;
143 hnj_free (e->key);
144 hnj_free (e);
145 }
146
147 hnj_free (hashtab);
148 }
149
150 /* assumes that key is not already present! */
151 static void
152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
153 {
154 int i;
155 HashEntry *e;
156
157 i = hnj_string_hash (key) % HASH_SIZE;
158 e = hnj_malloc (sizeof(HashEntry));
159 e->next = hashtab->entries[i];
160 e->key = hnj_strdup (key);
161 e->val = val;
162 hashtab->entries[i] = e;
163 }
164
165 /* return val if found, otherwise -1 */
166 static int
167 hnj_hash_lookup (HashTab *hashtab, const char *key)
168 {
169 int i;
170 HashEntry *e;
171 i = hnj_string_hash (key) % HASH_SIZE;
172 for (e = hashtab->entries[i]; e; e = e->next)
173 if (!strcmp (key, e->key))
174 return e->val;
175 return -1;
176 }
177
178 /* Get the state number, allocating a new state if necessary. */
179 static int
180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
181 {
182 int state_num;
183
184 state_num = hnj_hash_lookup (hashtab, string);
185
186 if (state_num >= 0)
187 return state_num;
188
189 hnj_hash_insert (hashtab, string, dict->num_states);
190 /* predicate is true if dict->num_states is a power of two */
191 if (!(dict->num_states & (dict->num_states - 1)))
192 {
193 dict->states = hnj_realloc (dict->states,
194 (dict->num_states << 1) *
195 sizeof(HyphenState));
196 }
197 dict->states[dict->num_states].match = NULL;
198 dict->states[dict->num_states].repl = NULL;
199 dict->states[dict->num_states].fallback_state = -1;
200 dict->states[dict->num_states].num_trans = 0;
201 dict->states[dict->num_states].trans = NULL;
202 return dict->num_states++;
203 }
204
205 /* add a transition from state1 to state2 through ch - assumes that the
206 transition does not already exist */
207 static void
208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
209 {
210 int num_trans;
211
212 num_trans = dict->states[state1].num_trans;
213 if (num_trans == 0)
214 {
215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
216 }
217 else if (!(num_trans & (num_trans - 1)))
218 {
219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
220 (num_trans << 1) *
221 sizeof(HyphenTrans));
222 }
223 dict->states[state1].trans[num_trans].ch = ch;
224 dict->states[state1].trans[num_trans].new_state = state2;
225 dict->states[state1].num_trans++;
226 }
227
228 #ifdef VERBOSE
229 HashTab *global[1];
230
231 static char *
232 get_state_str (int state, int level)
233 {
234 int i;
235 HashEntry *e;
236
237 for (i = 0; i < HASH_SIZE; i++)
238 for (e = global[level]->entries[i]; e; e = e->next)
239 if (e->val == state)
240 return e->key;
241 return NULL;
242 }
243 #endif
244
245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
246 int i, j;
247 char word[MAX_CHARS];
248 char pattern[MAX_CHARS];
249 char * repl;
250 signed char replindex;
251 signed char replcut;
252 int state_num = 0;
253 int last_state;
254 char ch;
255 int found;
256
257 if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
258 dict->lhmin = atoi(buf + 13);
259 return;
260 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
261 dict->rhmin = atoi(buf + 14);
262 return;
263 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
264 dict->clhmin = atoi(buf + 21);
265 return;
266 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
267 dict->crhmin = atoi(buf + 22);
268 return;
269 } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
270 char * space = buf + 8;
271 while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
272 if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
273 if (dict->nohyphen) {
274 char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
275 *nhe = 0;
276 for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
277 if (*nhe == ',') {
278 dict->nohyphenl++;
279 *nhe = 0;
280 }
281 }
282 }
283 return;
284 }
285 j = 0;
286 pattern[j] = '0';
287 repl = strchr(buf, '/');
288 replindex = 0;
289 replcut = 0;
290 if (repl) {
291 char * index = strchr(repl + 1, ',');
292 *repl = '\0';
293 if (index) {
294 char * index2 = strchr(index + 1, ',');
295 *index = '\0';
296 if (index2) {
297 *index2 = '\0';
298 replindex = (signed char) atoi(index + 1) - 1;
299 replcut = (signed char) atoi(index2 + 1);
300 }
301 } else {
302 hnj_strchomp(repl + 1);
303 replindex = 0;
304 replcut = (signed char) strlen(buf);
305 }
306 repl = hnj_strdup(repl + 1);
307 }
308 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
309 {
310 if (buf[i] >= '0' && buf[i] <= '9')
311 pattern[j] = buf[i];
312 else
313 {
314 word[j] = buf[i];
315 pattern[++j] = '0';
316 }
317 }
318 word[j] = '\0';
319 pattern[j + 1] = '\0';
320
321 i = 0;
322 if (!repl) {
323 /* Optimize away leading zeroes */
324 for (; pattern[i] == '0'; i++);
325 } else {
326 if (*word == '.') i++;
327 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
328 if (dict->utf8) {
329 int pu = -1; /* unicode character position */
330 int ps = -1; /* unicode start position (original replindex) */
331 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
332 for (; pc < (strlen(word) + 1); pc++) {
333 /* beginning of an UTF-8 character (not '10' start bits) */
334 if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
335 if ((ps < 0) && (replindex == pu)) {
336 ps = replindex;
337 replindex = (signed char) pc;
338 }
339 if ((ps >= 0) && ((pu - ps) == replcut)) {
340 replcut = (signed char) (pc - replindex);
341 break;
342 }
343 }
344 if (*word == '.') replindex--;
345 }
346 }
347
348 #ifdef VERBOSE
349 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
350 #endif
351 found = hnj_hash_lookup (hashtab, word);
352 state_num = hnj_get_state (dict, hashtab, word);
353 dict->states[state_num].match = hnj_strdup (pattern + i);
354 dict->states[state_num].repl = repl;
355 dict->states[state_num].replindex = replindex;
356 if (!replcut) {
357 dict->states[state_num].replcut = (signed char) strlen(word);
358 } else {
359 dict->states[state_num].replcut = replcut;
360 }
361
362 /* now, put in the prefix transitions */
363 for (; found < 0 ;j--)
364 {
365 last_state = state_num;
366 ch = word[j - 1];
367 word[j - 1] = '\0';
368 found = hnj_hash_lookup (hashtab, word);
369 state_num = hnj_get_state (dict, hashtab, word);
370 hnj_add_trans (dict, state_num, last_state, ch);
371 }
372 }
373
374 HyphenDict *
375 hnj_hyphen_load (const char *fn)
376 {
377 HyphenDict *dict[2];
378 HashTab *hashtab;
379 FILE *f;
380 char buf[MAX_CHARS];
381 int nextlevel = 0;
382 int i, j, k;
383 HashEntry *e;
384 int state_num = 0;
385
386 f = fopen (fn, "r");
387 if (f == NULL)
388 return NULL;
389
390 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
391 for (k = 0; k < 2; k++) {
392 hashtab = hnj_hash_new ();
393 #ifdef VERBOSE
394 global[k] = hashtab;
395 #endif
396 hnj_hash_insert (hashtab, "", 0);
397 dict[k] = hnj_malloc (sizeof(HyphenDict));
398 dict[k]->num_states = 1;
399 dict[k]->states = hnj_malloc (sizeof(HyphenState));
400 dict[k]->states[0].match = NULL;
401 dict[k]->states[0].repl = NULL;
402 dict[k]->states[0].fallback_state = -1;
403 dict[k]->states[0].num_trans = 0;
404 dict[k]->states[0].trans = NULL;
405 dict[k]->nextlevel = NULL;
406 dict[k]->lhmin = 0;
407 dict[k]->rhmin = 0;
408 dict[k]->clhmin = 0;
409 dict[k]->crhmin = 0;
410 dict[k]->nohyphen = NULL;
411 dict[k]->nohyphenl = 0;
412
413 /* read in character set info */
414 if (k == 0) {
415 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
416 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
417 for (i=0;i<MAX_NAME;i++)
418 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
419 dict[k]->cset[i] = 0;
420 } else {
421 dict[k]->cset[0] = 0;
422 }
423 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
424 } else {
425 strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
426 dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
427 dict[k]->utf8 = dict[0]->utf8;
428 }
429
430 if (k == 0 || nextlevel) {
431 while (fgets (buf, sizeof(buf), f) != NULL) {
432 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
433 nextlevel = 1;
434 break;
435 } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
436 }
437 } else if (k == 1) {
438 /* default first level: hyphen and ASCII apostrophe */
439 if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
440 else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
441 strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
442 buf[MAX_CHARS-1] = '\0';
443 hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
444 hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
445 if (dict[0]->utf8) {
446 hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
447 hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
448 }
449 }
450
451 /* Could do unioning of matches here (instead of the preprocessor script).
452 If we did, the pseudocode would look something like this:
453
454 foreach state in the hash table
455 foreach i = [1..length(state) - 1]
456 state to check is substr (state, i)
457 look it up
458 if found, and if there is a match, union the match in.
459
460 It's also possible to avoid the quadratic blowup by doing the
461 search in order of increasing state string sizes - then you
462 can break the loop after finding the first match.
463
464 This step should be optional in any case - if there is a
465 preprocessed rule table, it's always faster to use that.
466
467 */
468
469 /* put in the fallback states */
470 for (i = 0; i < HASH_SIZE; i++)
471 for (e = hashtab->entries[i]; e; e = e->next)
472 {
473 if (*(e->key)) for (j = 1; 1; j++)
474 {
475 state_num = hnj_hash_lookup (hashtab, e->key + j);
476 if (state_num >= 0)
477 break;
478 }
479 /* KBH: FIXME state 0 fallback_state should always be -1? */
480 if (e->val)
481 dict[k]->states[e->val].fallback_state = state_num;
482 }
483 #ifdef VERBOSE
484 for (i = 0; i < HASH_SIZE; i++)
485 for (e = hashtab->entries[i]; e; e = e->next)
486 {
487 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
488 dict[k]->states[e->val].fallback_state);
489 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
490 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
491 dict[k]->states[e->val].trans[j].new_state);
492 }
493 #endif
494
495 #ifndef VERBOSE
496 hnj_hash_free (hashtab);
497 #endif
498 state_num = 0;
499 }
500 fclose(f);
501 if (nextlevel) dict[0]->nextlevel = dict[1];
502 else {
503 dict[1] -> nextlevel = dict[0];
504 dict[1]->lhmin = dict[0]->lhmin;
505 dict[1]->rhmin = dict[0]->rhmin;
506 dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
507 dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
508 #ifdef VERBOSE
509 HashTab *r = global[0];
510 global[0] = global[1];
511 global[1] = r;
512 #endif
513 return dict[1];
514 }
515 return dict[0];
516 }
517
518 void hnj_hyphen_free (HyphenDict *dict)
519 {
520 int state_num;
521 HyphenState *hstate;
522
523 for (state_num = 0; state_num < dict->num_states; state_num++)
524 {
525 hstate = &dict->states[state_num];
526 if (hstate->match)
527 hnj_free (hstate->match);
528 if (hstate->repl)
529 hnj_free (hstate->repl);
530 if (hstate->trans)
531 hnj_free (hstate->trans);
532 }
533 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
534
535 if (dict->nohyphen) hnj_free(dict->nohyphen);
536
537 hnj_free (dict->states);
538
539 hnj_free (dict);
540 }
541
542 #define MAX_WORD 256
543
544 int hnj_hyphen_hyphenate (HyphenDict *dict,
545 const char *word, int word_size,
546 char *hyphens)
547 {
548 char *prep_word;
549 int i, j, k;
550 int state;
551 char ch;
552 HyphenState *hstate;
553 char *match;
554 int offset;
555
556 prep_word = hnj_malloc (word_size + 3);
557
558 j = 0;
559 prep_word[j++] = '.';
560
561 for (i = 0; i < word_size; i++) {
562 if (word[i] <= '9' && word[i] >= '0') {
563 prep_word[j++] = '.';
564 } else {
565 prep_word[j++] = word[i];
566 }
567 }
568
569 prep_word[j++] = '.';
570 prep_word[j] = '\0';
571
572 for (i = 0; i < word_size + 5; i++)
573 hyphens[i] = '0';
574
575 #ifdef VERBOSE
576 printf ("prep_word = %s\n", prep_word);
577 #endif
578
579 /* now, run the finite state machine */
580 state = 0;
581 for (i = 0; i < j; i++)
582 {
583 ch = prep_word[i];
584 for (;;)
585 {
586
587 if (state == -1) {
588 /* return 1; */
589 /* KBH: FIXME shouldn't this be as follows? */
590 state = 0;
591 goto try_next_letter;
592 }
593
594 #ifdef VERBOSE
595 char *state_str;
596 state_str = get_state_str (state, 0);
597
598 for (k = 0; k < i - strlen (state_str); k++)
599 putchar (' ');
600 printf ("%s", state_str);
601 #endif
602
603 hstate = &dict->states[state];
604 for (k = 0; k < hstate->num_trans; k++)
605 if (hstate->trans[k].ch == ch)
606 {
607 state = hstate->trans[k].new_state;
608 goto found_state;
609 }
610 state = hstate->fallback_state;
611 #ifdef VERBOSE
612 printf (" falling back, fallback_state %d\n", state);
613 #endif
614 }
615 found_state:
616 #ifdef VERBOSE
617 printf ("found state %d\n",state);
618 #endif
619 /* Additional optimization is possible here - especially,
620 elimination of trailing zeroes from the match. Leading zeroes
621 have already been optimized. */
622 match = dict->states[state].match;
623 /* replacing rules not handled by hyphen_hyphenate() */
624 if (match && !dict->states[state].repl)
625 {
626 offset = i + 1 - strlen (match);
627 #ifdef VERBOSE
628 for (k = 0; k < offset; k++)
629 putchar (' ');
630 printf ("%s\n", match);
631 #endif
632 /* This is a linear search because I tried a binary search and
633 found it to be just a teeny bit slower. */
634 for (k = 0; match[k]; k++)
635 if (hyphens[offset + k] < match[k])
636 hyphens[offset + k] = match[k];
637 }
638
639 /* KBH: we need this to make sure we keep looking in a word */
640 /* for patterns even if the current character is not known in state 0 */
641 /* since patterns for hyphenation may occur anywhere in the word */
642 try_next_letter: ;
643
644 }
645 #ifdef VERBOSE
646 for (i = 0; i < j; i++)
647 putchar (hyphens[i]);
648 putchar ('\n');
649 #endif
650
651 for (i = 0; i < j - 4; i++)
652 #if 0
653 if (hyphens[i + 1] & 1)
654 hyphens[i] = '-';
655 #else
656 hyphens[i] = hyphens[i + 1];
657 #endif
658 hyphens[0] = '0';
659 for (; i < word_size; i++)
660 hyphens[i] = '0';
661 hyphens[word_size] = '\0';
662
663 hnj_free (prep_word);
664
665 return 0;
666 }
667
668 /* Unicode ligature length */
669 int hnj_ligature(unsigned char c) {
670 switch (c) {
671 case 0x80: /* ff */
672 case 0x81: /* fi */
673 case 0x82: return LIG_xx; /* fl */
674 case 0x83: /* ffi */
675 case 0x84: return LIG_xxx; /* ffl */
676 case 0x85: /* long st */
677 case 0x86: return LIG_xx; /* st */
678 }
679 return 0;
680 }
681
682 /* character length of the first n byte of the input word */
683 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
684 {
685 int i = 0;
686 int j = 0;
687 while (j < n && word[j] != '\0') {
688 i++;
689 // Unicode ligature support
690 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
691 i += hnj_ligature(word[j + 2]);
692 }
693 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
694 }
695 return i;
696 }
697
698 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
699 char *** rep, int ** pos, int ** cut, int lhmin)
700 {
701 int i = 1, j;
702
703 // Unicode ligature support
704 if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
705 i += hnj_ligature(word[2]);
706 }
707
708 // ignore numbers
709 for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
710
711 for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
712 // check length of the non-standard part
713 if (*rep && *pos && *cut && (*rep)[j]) {
714 char * rh = strchr((*rep)[j], '=');
715 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
716 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
717 free((*rep)[j]);
718 (*rep)[j] = NULL;
719 hyphens[j] = '0';
720 }
721 } else {
722 hyphens[j] = '0';
723 }
724 j++;
725
726 // Unicode ligature support
727 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
728 i += hnj_ligature(word[j + 2]);
729 }
730 } while (utf8 && (word[j] & 0xc0) == 0x80);
731 return 0;
732 }
733
734 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
735 char *** rep, int ** pos, int ** cut, int rhmin)
736 {
737 int i = 0;
738 int j;
739
740 // ignore numbers
741 for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
742
743 for (j = word_size - 1; i < rhmin && j > 0; j--) {
744 // check length of the non-standard part
745 if (*rep && *pos && *cut && (*rep)[j]) {
746 char * rh = strchr((*rep)[j], '=');
747 if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
748 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
749 free((*rep)[j]);
750 (*rep)[j] = NULL;
751 hyphens[j] = '0';
752 }
753 } else {
754 hyphens[j] = '0';
755 }
756 if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
757 }
758 return 0;
759 }
760
761 // recursive function for compound level hyphenation
762 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
763 char * hyphens, char *** rep, int ** pos, int ** cut,
764 int clhmin, int crhmin, int lend, int rend)
765 {
766 char *prep_word;
767 int i, j, k;
768 int state;
769 char ch;
770 HyphenState *hstate;
771 char *match;
772 char *repl;
773 signed char replindex;
774 signed char replcut;
775 int offset;
776 int * matchlen;
777 int * matchindex;
778 char ** matchrepl;
779 int isrepl = 0;
780 int nHyphCount;
781
782 size_t prep_word_size = word_size + 3;
783 prep_word = hnj_malloc (prep_word_size);
784 matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
785 matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
786 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
787
788 j = 0;
789 prep_word[j++] = '.';
790
791 for (i = 0; i < word_size; i++) {
792 if (word[i] <= '9' && word[i] >= '0') {
793 prep_word[j++] = '.';
794 } else {
795 prep_word[j++] = word[i];
796 }
797 }
798
799
800
801 prep_word[j++] = '.';
802 prep_word[j] = '\0';
803
804 for (i = 0; i < j; i++)
805 hyphens[i] = '0';
806
807 #ifdef VERBOSE
808 printf ("prep_word = %s\n", prep_word);
809 #endif
810
811 /* now, run the finite state machine */
812 state = 0;
813 for (i = 0; i < j; i++)
814 {
815 ch = prep_word[i];
816 for (;;)
817 {
818
819 if (state == -1) {
820 /* return 1; */
821 /* KBH: FIXME shouldn't this be as follows? */
822 state = 0;
823 goto try_next_letter;
824 }
825
826 #ifdef VERBOSE
827 char *state_str;
828 state_str = get_state_str (state, 1);
829
830 for (k = 0; k < i - strlen (state_str); k++)
831 putchar (' ');
832 printf ("%s", state_str);
833 #endif
834
835 hstate = &dict->states[state];
836 for (k = 0; k < hstate->num_trans; k++)
837 if (hstate->trans[k].ch == ch)
838 {
839 state = hstate->trans[k].new_state;
840 goto found_state;
841 }
842 state = hstate->fallback_state;
843 #ifdef VERBOSE
844 printf (" falling back, fallback_state %d\n", state);
845 #endif
846 }
847 found_state:
848 #ifdef VERBOSE
849 printf ("found state %d\n",state);
850 #endif
851 /* Additional optimization is possible here - especially,
852 elimination of trailing zeroes from the match. Leading zeroes
853 have already been optimized. */
854 match = dict->states[state].match;
855 repl = dict->states[state].repl;
856 replindex = dict->states[state].replindex;
857 replcut = dict->states[state].replcut;
858 /* replacing rules not handled by hyphen_hyphenate() */
859 if (match)
860 {
861 offset = i + 1 - strlen (match);
862 #ifdef VERBOSE
863 for (k = 0; k < offset; k++)
864 putchar (' ');
865 printf ("%s (%s)\n", match, repl);
866 #endif
867 if (repl) {
868 if (!isrepl) for(; isrepl < word_size; isrepl++) {
869 matchrepl[isrepl] = NULL;
870 matchindex[isrepl] = -1;
871 }
872 matchlen[offset + replindex] = replcut;
873 }
874 /* This is a linear search because I tried a binary search and
875 found it to be just a teeny bit slower. */
876 for (k = 0; match[k]; k++) {
877 if ((hyphens[offset + k] < match[k])) {
878 hyphens[offset + k] = match[k];
879 if (match[k]&1) {
880 matchrepl[offset + k] = repl;
881 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
882 matchindex[offset + replindex] = offset + k;
883 }
884 }
885 }
886 }
887
888 }
889
890 /* KBH: we need this to make sure we keep looking in a word */
891 /* for patterns even if the current character is not known in state 0 */
892 /* since patterns for hyphenation may occur anywhere in the word */
893 try_next_letter: ;
894
895 }
896 #ifdef VERBOSE
897 for (i = 0; i < j; i++)
898 putchar (hyphens[i]);
899 putchar ('\n');
900 #endif
901
902 for (i = 0; i < j - 3; i++)
903 #if 0
904 if (hyphens[i + 1] & 1)
905 hyphens[i] = '-';
906 #else
907 hyphens[i] = hyphens[i + 1];
908 #endif
909 for (; i < word_size; i++)
910 hyphens[i] = '0';
911 hyphens[word_size] = '\0';
912
913 /* now create a new char string showing hyphenation positions */
914 /* count the hyphens and allocate space for the new hyphenated string */
915 nHyphCount = 0;
916 for (i = 0; i < word_size; i++)
917 if (hyphens[i]&1)
918 nHyphCount++;
919 j = 0;
920 for (i = 0; i < word_size; i++) {
921 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
922 if (rep && pos && cut) {
923 if (!*rep)
924 *rep = (char **) calloc(word_size, sizeof(char *));
925 if (!*pos)
926 *pos = (int *) calloc(word_size, sizeof(int));
927 if (!*cut) {
928 *cut = (int *) calloc(word_size, sizeof(int));
929 }
930 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
931 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
932 (*cut)[matchindex[i] - 1] = matchlen[i];
933 }
934 j += strlen(matchrepl[matchindex[i]]);
935 i += matchlen[i] - 1;
936 }
937 }
938
939 hnj_free (matchrepl);
940 hnj_free (matchlen);
941 hnj_free (matchindex);
942
943 // recursive hyphenation of the first (compound) level segments
944 if (dict->nextlevel) {
945 char ** rep2;
946 int * pos2;
947 int * cut2;
948 char * hyphens2;
949 int begin = 0;
950
951 rep2 = hnj_malloc (word_size * sizeof(char *));
952 pos2 = hnj_malloc (word_size * sizeof(int));
953 cut2 = hnj_malloc (word_size * sizeof(int));
954 hyphens2 = hnj_malloc (word_size + 3);
955 for (i = 0; i < word_size; i++) rep2[i] = NULL;
956 for (i = 0; i < word_size; i++) if
957 (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
958 if (i - begin > 1) {
959 int hyph = 0;
960 prep_word[i + 2] = '\0';
961 /* non-standard hyphenation at compound boundary (Schiffahrt) */
962 if (rep && *rep && *pos && *cut && (*rep)[i]) {
963 char * l = strchr((*rep)[i], '=');
964 size_t offset = 2 + i - (*pos)[i];
965 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
966 prep_word[prep_word_size - 1] = '\0';
967 if (l) {
968 hyph = (l - (*rep)[i]) - (*pos)[i];
969 prep_word[2 + i + hyph] = '\0';
970 }
971 }
972 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
973 hyphens2, &rep2, &pos2, &cut2, clhmin,
974 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
975 for (j = 0; j < i - begin - 1; j++) {
976 hyphens[begin + j] = hyphens2[j];
977 if (rep2[j] && rep && pos && cut) {
978 if (!*rep && !*pos && !*cut) {
979 int k;
980 *rep = (char **) malloc(sizeof(char *) * word_size);
981 *pos = (int *) malloc(sizeof(int) * word_size);
982 *cut = (int *) malloc(sizeof(int) * word_size);
983 for (k = 0; k < word_size; k++) {
984 (*rep)[k] = NULL;
985 (*pos)[k] = 0;
986 (*cut)[k] = 0;
987 }
988 }
989 (*rep)[begin + j] = rep2[j];
990 (*pos)[begin + j] = pos2[j];
991 (*cut)[begin + j] = cut2[j];
992 }
993 }
994 prep_word[i + 2] = word[i + 1];
995 if (*rep && *pos && *cut && (*rep)[i]) {
996 size_t offset = 1;
997 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
998 prep_word[prep_word_size - 1] = '\0';
999 }
1000 }
1001 begin = i + 1;
1002 for (j = 0; j < word_size; j++) rep2[j] = NULL;
1003 }
1004
1005 // non-compound
1006 if (begin == 0) {
1007 hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1008 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1009 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1010 rep, pos, cut, clhmin);
1011 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1012 rep, pos, cut, crhmin);
1013 }
1014
1015 free(rep2);
1016 free(cut2);
1017 free(pos2);
1018 free(hyphens2);
1019 }
1020
1021 hnj_free (prep_word);
1022 return 0;
1023 }
1024
1025 /* UTF-8 normalization of hyphen and non-standard positions */
1026 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
1027 char *** rep, int ** pos, int ** cut)
1028 {
1029 int i, j, k;
1030 if ((((unsigned char) word[0]) >> 6) == 2) {
1031 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
1032 return 1;
1033 }
1034
1035 /* calculate UTF-8 character positions */
1036 for (i = 0, j = -1; i < word_size; i++) {
1037 /* beginning of an UTF-8 character (not '10' start bits) */
1038 if ((((unsigned char) word[i]) >> 6) != 2) j++;
1039 hyphens[j] = hyphens[i];
1040 if (rep && pos && cut && *rep && *pos && *cut) {
1041 int l = (*pos)[i];
1042 (*pos)[j] = 0;
1043 for (k = 0; k < l; k++) {
1044 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1045 }
1046 k = i - l + 1;
1047 l = k + (*cut)[i];
1048 (*cut)[j] = 0;
1049 for (; k < l; k++) {
1050 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1051 }
1052 (*rep)[j] = (*rep)[i];
1053 if (j < i) {
1054 (*rep)[i] = NULL;
1055 (*pos)[i] = 0;
1056 (*cut)[i] = 0;
1057 }
1058 }
1059 }
1060 hyphens[j + 1] = '\0';
1061 #ifdef VERBOSE
1062 printf ("nums: %s\n", hyphens);
1063 #endif
1064 return 0;
1065 }
1066
1067 /* get the word with all possible hyphenations (output: hyphword) */
1068 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1069 char * hyphword, char *** rep, int ** pos, int ** cut)
1070 {
1071 int hyphenslen = l + 5;
1072
1073 int i, j;
1074 for (i = 0, j = 0; i < l; i++, j++) {
1075 if (hyphens[i]&1) {
1076 hyphword[j] = word[i];
1077 if (*rep && *pos && *cut && (*rep)[i]) {
1078 size_t offset = j - (*pos)[i] + 1;
1079 strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
1080 hyphword[hyphenslen-1] = '\0';
1081 j += strlen((*rep)[i]) - (*pos)[i];
1082 i += (*cut)[i] - (*pos)[i];
1083 } else hyphword[++j] = '=';
1084 } else hyphword[j] = word[i];
1085 }
1086 hyphword[j] = '\0';
1087 }
1088
1089
1090 /* main api function with default hyphenmin parameters */
1091 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1092 const char *word, int word_size, char * hyphens,
1093 char *hyphword, char *** rep, int ** pos, int ** cut)
1094 {
1095 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1096 dict->clhmin, dict->crhmin, 1, 1);
1097 hnj_hyphen_lhmin(dict->utf8, word, word_size,
1098 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1099 hnj_hyphen_rhmin(dict->utf8, word, word_size,
1100 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1101
1102 /* nohyphen */
1103 if (dict->nohyphen) {
1104 char * nh = dict->nohyphen;
1105 int nhi;
1106 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1107 char * nhy = (char *) strstr(word, nh);
1108 while (nhy) {
1109 hyphens[nhy - word + strlen(nh) - 1] = '0';
1110 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
1111 nhy = (char *) strstr(nhy + 1, nh);
1112 }
1113 nh = nh + strlen(nh) + 1;
1114 }
1115 }
1116
1117 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1118 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1119 #ifdef VERBOSE
1120 printf ("nums: %s\n", hyphens);
1121 #endif
1122 return 0;
1123 }
1124
1125 /* previous main api function with hyphenmin parameters */
1126 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1127 const char *word, int word_size, char * hyphens,
1128 char *hyphword, char *** rep, int ** pos, int ** cut,
1129 int lhmin, int rhmin, int clhmin, int crhmin)
1130 {
1131 lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1132 rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1133 clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1134 crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1135 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1136 clhmin, crhmin, 1, 1);
1137 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1138 rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1139 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1140 rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1141 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1142
1143 /* nohyphen */
1144 if (dict->nohyphen) {
1145 char * nh = dict->nohyphen;
1146 int nhi;
1147 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1148 char * nhy = (char *) strstr(word, nh);
1149 while (nhy) {
1150 hyphens[nhy - word + strlen(nh) - 1] = 0;
1151 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
1152 nhy = (char *) strstr(nhy + 1, nh);
1153 }
1154 nh = nh + strlen(nh) + 1;
1155 }
1156 }
1157
1158 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1159 return 0;
1160 }

mercurial