|
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both |
|
2 * licenses follows. |
|
3 */ |
|
4 |
|
5 /* LibHnj - a library for high quality hyphenation and justification |
|
6 * Copyright (C) 1998 Raph Levien, |
|
7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), |
|
8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) |
|
9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) |
|
10 * |
|
11 * This library is free software; you can redistribute it and/or |
|
12 * modify it under the terms of the GNU Library General Public |
|
13 * License as published by the Free Software Foundation; either |
|
14 * version 2 of the License, or (at your option) any later version. |
|
15 * |
|
16 * This library is distributed in the hope that it will be useful, |
|
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
19 * Library General Public License for more details. |
|
20 * |
|
21 * You should have received a copy of the GNU Library General Public |
|
22 * License along with this library; if not, write to the |
|
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
|
24 * Boston, MA 02111-1307 USA. |
|
25 */ |
|
26 |
|
27 /* |
|
28 * The contents of this file are subject to the Mozilla Public License |
|
29 * Version 1.0 (the "MPL"); you may not use this file except in |
|
30 * compliance with the MPL. You may obtain a copy of the MPL at |
|
31 * http://www.mozilla.org/MPL/ |
|
32 * |
|
33 * Software distributed under the MPL is distributed on an "AS IS" basis, |
|
34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL |
|
35 * for the specific language governing rights and limitations under the |
|
36 * MPL. |
|
37 * |
|
38 */ |
|
39 #include <stdlib.h> /* for NULL, malloc */ |
|
40 #include <stdio.h> /* for fprintf */ |
|
41 #include <string.h> /* for strdup */ |
|
42 |
|
43 #ifdef UNX |
|
44 #include <unistd.h> /* for exit */ |
|
45 #endif |
|
46 |
|
47 #define noVERBOSE |
|
48 |
|
49 /* calculate hyphenmin values with long ligature length (2 or 3 characters |
|
50 * instead of 1 or 2) for comparison with hyphenation without ligatures */ |
|
51 #define noLONG_LIGATURE |
|
52 |
|
53 #ifdef LONG_LIGATURE |
|
54 #define LIG_xx 1 |
|
55 #define LIG_xxx 2 |
|
56 #else |
|
57 #define LIG_xx 0 |
|
58 #define LIG_xxx 1 |
|
59 #endif |
|
60 |
|
61 #include "hnjalloc.h" |
|
62 #include "hyphen.h" |
|
63 |
|
64 static char * |
|
65 hnj_strdup (const char *s) |
|
66 { |
|
67 char *new; |
|
68 int l; |
|
69 |
|
70 l = strlen (s); |
|
71 new = hnj_malloc (l + 1); |
|
72 memcpy (new, s, l); |
|
73 new[l] = 0; |
|
74 return new; |
|
75 } |
|
76 |
|
77 /* remove cross-platform text line end characters */ |
|
78 void hnj_strchomp(char * s) |
|
79 { |
|
80 int k = strlen(s); |
|
81 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; |
|
82 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; |
|
83 } |
|
84 |
|
85 /* a little bit of a hash table implementation. This simply maps strings |
|
86 to state numbers */ |
|
87 |
|
88 typedef struct _HashTab HashTab; |
|
89 typedef struct _HashEntry HashEntry; |
|
90 |
|
91 /* A cheap, but effective, hack. */ |
|
92 #define HASH_SIZE 31627 |
|
93 |
|
94 struct _HashTab { |
|
95 HashEntry *entries[HASH_SIZE]; |
|
96 }; |
|
97 |
|
98 struct _HashEntry { |
|
99 HashEntry *next; |
|
100 char *key; |
|
101 int val; |
|
102 }; |
|
103 |
|
104 /* a char* hash function from ASU - adapted from Gtk+ */ |
|
105 static unsigned int |
|
106 hnj_string_hash (const char *s) |
|
107 { |
|
108 const char *p; |
|
109 unsigned int h=0, g; |
|
110 for(p = s; *p != '\0'; p += 1) { |
|
111 h = ( h << 4 ) + *p; |
|
112 if ( ( g = h & 0xf0000000 ) ) { |
|
113 h = h ^ (g >> 24); |
|
114 h = h ^ g; |
|
115 } |
|
116 } |
|
117 return h /* % M */; |
|
118 } |
|
119 |
|
120 static HashTab * |
|
121 hnj_hash_new (void) |
|
122 { |
|
123 HashTab *hashtab; |
|
124 int i; |
|
125 |
|
126 hashtab = hnj_malloc (sizeof(HashTab)); |
|
127 for (i = 0; i < HASH_SIZE; i++) |
|
128 hashtab->entries[i] = NULL; |
|
129 |
|
130 return hashtab; |
|
131 } |
|
132 |
|
133 static void |
|
134 hnj_hash_free (HashTab *hashtab) |
|
135 { |
|
136 int i; |
|
137 HashEntry *e, *next; |
|
138 |
|
139 for (i = 0; i < HASH_SIZE; i++) |
|
140 for (e = hashtab->entries[i]; e; e = next) |
|
141 { |
|
142 next = e->next; |
|
143 hnj_free (e->key); |
|
144 hnj_free (e); |
|
145 } |
|
146 |
|
147 hnj_free (hashtab); |
|
148 } |
|
149 |
|
150 /* assumes that key is not already present! */ |
|
151 static void |
|
152 hnj_hash_insert (HashTab *hashtab, const char *key, int val) |
|
153 { |
|
154 int i; |
|
155 HashEntry *e; |
|
156 |
|
157 i = hnj_string_hash (key) % HASH_SIZE; |
|
158 e = hnj_malloc (sizeof(HashEntry)); |
|
159 e->next = hashtab->entries[i]; |
|
160 e->key = hnj_strdup (key); |
|
161 e->val = val; |
|
162 hashtab->entries[i] = e; |
|
163 } |
|
164 |
|
165 /* return val if found, otherwise -1 */ |
|
166 static int |
|
167 hnj_hash_lookup (HashTab *hashtab, const char *key) |
|
168 { |
|
169 int i; |
|
170 HashEntry *e; |
|
171 i = hnj_string_hash (key) % HASH_SIZE; |
|
172 for (e = hashtab->entries[i]; e; e = e->next) |
|
173 if (!strcmp (key, e->key)) |
|
174 return e->val; |
|
175 return -1; |
|
176 } |
|
177 |
|
178 /* Get the state number, allocating a new state if necessary. */ |
|
179 static int |
|
180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) |
|
181 { |
|
182 int state_num; |
|
183 |
|
184 state_num = hnj_hash_lookup (hashtab, string); |
|
185 |
|
186 if (state_num >= 0) |
|
187 return state_num; |
|
188 |
|
189 hnj_hash_insert (hashtab, string, dict->num_states); |
|
190 /* predicate is true if dict->num_states is a power of two */ |
|
191 if (!(dict->num_states & (dict->num_states - 1))) |
|
192 { |
|
193 dict->states = hnj_realloc (dict->states, |
|
194 (dict->num_states << 1) * |
|
195 sizeof(HyphenState)); |
|
196 } |
|
197 dict->states[dict->num_states].match = NULL; |
|
198 dict->states[dict->num_states].repl = NULL; |
|
199 dict->states[dict->num_states].fallback_state = -1; |
|
200 dict->states[dict->num_states].num_trans = 0; |
|
201 dict->states[dict->num_states].trans = NULL; |
|
202 return dict->num_states++; |
|
203 } |
|
204 |
|
205 /* add a transition from state1 to state2 through ch - assumes that the |
|
206 transition does not already exist */ |
|
207 static void |
|
208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) |
|
209 { |
|
210 int num_trans; |
|
211 |
|
212 num_trans = dict->states[state1].num_trans; |
|
213 if (num_trans == 0) |
|
214 { |
|
215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); |
|
216 } |
|
217 else if (!(num_trans & (num_trans - 1))) |
|
218 { |
|
219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, |
|
220 (num_trans << 1) * |
|
221 sizeof(HyphenTrans)); |
|
222 } |
|
223 dict->states[state1].trans[num_trans].ch = ch; |
|
224 dict->states[state1].trans[num_trans].new_state = state2; |
|
225 dict->states[state1].num_trans++; |
|
226 } |
|
227 |
|
228 #ifdef VERBOSE |
|
229 HashTab *global[1]; |
|
230 |
|
231 static char * |
|
232 get_state_str (int state, int level) |
|
233 { |
|
234 int i; |
|
235 HashEntry *e; |
|
236 |
|
237 for (i = 0; i < HASH_SIZE; i++) |
|
238 for (e = global[level]->entries[i]; e; e = e->next) |
|
239 if (e->val == state) |
|
240 return e->key; |
|
241 return NULL; |
|
242 } |
|
243 #endif |
|
244 |
|
245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { |
|
246 int i, j; |
|
247 char word[MAX_CHARS]; |
|
248 char pattern[MAX_CHARS]; |
|
249 char * repl; |
|
250 signed char replindex; |
|
251 signed char replcut; |
|
252 int state_num = 0; |
|
253 int last_state; |
|
254 char ch; |
|
255 int found; |
|
256 |
|
257 if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { |
|
258 dict->lhmin = atoi(buf + 13); |
|
259 return; |
|
260 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { |
|
261 dict->rhmin = atoi(buf + 14); |
|
262 return; |
|
263 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { |
|
264 dict->clhmin = atoi(buf + 21); |
|
265 return; |
|
266 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { |
|
267 dict->crhmin = atoi(buf + 22); |
|
268 return; |
|
269 } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { |
|
270 char * space = buf + 8; |
|
271 while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; |
|
272 if (*buf != '\0') dict->nohyphen = hnj_strdup(space); |
|
273 if (dict->nohyphen) { |
|
274 char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; |
|
275 *nhe = 0; |
|
276 for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { |
|
277 if (*nhe == ',') { |
|
278 dict->nohyphenl++; |
|
279 *nhe = 0; |
|
280 } |
|
281 } |
|
282 } |
|
283 return; |
|
284 } |
|
285 j = 0; |
|
286 pattern[j] = '0'; |
|
287 repl = strchr(buf, '/'); |
|
288 replindex = 0; |
|
289 replcut = 0; |
|
290 if (repl) { |
|
291 char * index = strchr(repl + 1, ','); |
|
292 *repl = '\0'; |
|
293 if (index) { |
|
294 char * index2 = strchr(index + 1, ','); |
|
295 *index = '\0'; |
|
296 if (index2) { |
|
297 *index2 = '\0'; |
|
298 replindex = (signed char) atoi(index + 1) - 1; |
|
299 replcut = (signed char) atoi(index2 + 1); |
|
300 } |
|
301 } else { |
|
302 hnj_strchomp(repl + 1); |
|
303 replindex = 0; |
|
304 replcut = (signed char) strlen(buf); |
|
305 } |
|
306 repl = hnj_strdup(repl + 1); |
|
307 } |
|
308 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) |
|
309 { |
|
310 if (buf[i] >= '0' && buf[i] <= '9') |
|
311 pattern[j] = buf[i]; |
|
312 else |
|
313 { |
|
314 word[j] = buf[i]; |
|
315 pattern[++j] = '0'; |
|
316 } |
|
317 } |
|
318 word[j] = '\0'; |
|
319 pattern[j + 1] = '\0'; |
|
320 |
|
321 i = 0; |
|
322 if (!repl) { |
|
323 /* Optimize away leading zeroes */ |
|
324 for (; pattern[i] == '0'; i++); |
|
325 } else { |
|
326 if (*word == '.') i++; |
|
327 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ |
|
328 if (dict->utf8) { |
|
329 int pu = -1; /* unicode character position */ |
|
330 int ps = -1; /* unicode start position (original replindex) */ |
|
331 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ |
|
332 for (; pc < (strlen(word) + 1); pc++) { |
|
333 /* beginning of an UTF-8 character (not '10' start bits) */ |
|
334 if ((((unsigned char) word[pc]) >> 6) != 2) pu++; |
|
335 if ((ps < 0) && (replindex == pu)) { |
|
336 ps = replindex; |
|
337 replindex = (signed char) pc; |
|
338 } |
|
339 if ((ps >= 0) && ((pu - ps) == replcut)) { |
|
340 replcut = (signed char) (pc - replindex); |
|
341 break; |
|
342 } |
|
343 } |
|
344 if (*word == '.') replindex--; |
|
345 } |
|
346 } |
|
347 |
|
348 #ifdef VERBOSE |
|
349 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); |
|
350 #endif |
|
351 found = hnj_hash_lookup (hashtab, word); |
|
352 state_num = hnj_get_state (dict, hashtab, word); |
|
353 dict->states[state_num].match = hnj_strdup (pattern + i); |
|
354 dict->states[state_num].repl = repl; |
|
355 dict->states[state_num].replindex = replindex; |
|
356 if (!replcut) { |
|
357 dict->states[state_num].replcut = (signed char) strlen(word); |
|
358 } else { |
|
359 dict->states[state_num].replcut = replcut; |
|
360 } |
|
361 |
|
362 /* now, put in the prefix transitions */ |
|
363 for (; found < 0 ;j--) |
|
364 { |
|
365 last_state = state_num; |
|
366 ch = word[j - 1]; |
|
367 word[j - 1] = '\0'; |
|
368 found = hnj_hash_lookup (hashtab, word); |
|
369 state_num = hnj_get_state (dict, hashtab, word); |
|
370 hnj_add_trans (dict, state_num, last_state, ch); |
|
371 } |
|
372 } |
|
373 |
|
374 HyphenDict * |
|
375 hnj_hyphen_load (const char *fn) |
|
376 { |
|
377 HyphenDict *dict[2]; |
|
378 HashTab *hashtab; |
|
379 FILE *f; |
|
380 char buf[MAX_CHARS]; |
|
381 int nextlevel = 0; |
|
382 int i, j, k; |
|
383 HashEntry *e; |
|
384 int state_num = 0; |
|
385 |
|
386 f = fopen (fn, "r"); |
|
387 if (f == NULL) |
|
388 return NULL; |
|
389 |
|
390 // loading one or two dictionaries (separated by NEXTLEVEL keyword) |
|
391 for (k = 0; k < 2; k++) { |
|
392 hashtab = hnj_hash_new (); |
|
393 #ifdef VERBOSE |
|
394 global[k] = hashtab; |
|
395 #endif |
|
396 hnj_hash_insert (hashtab, "", 0); |
|
397 dict[k] = hnj_malloc (sizeof(HyphenDict)); |
|
398 dict[k]->num_states = 1; |
|
399 dict[k]->states = hnj_malloc (sizeof(HyphenState)); |
|
400 dict[k]->states[0].match = NULL; |
|
401 dict[k]->states[0].repl = NULL; |
|
402 dict[k]->states[0].fallback_state = -1; |
|
403 dict[k]->states[0].num_trans = 0; |
|
404 dict[k]->states[0].trans = NULL; |
|
405 dict[k]->nextlevel = NULL; |
|
406 dict[k]->lhmin = 0; |
|
407 dict[k]->rhmin = 0; |
|
408 dict[k]->clhmin = 0; |
|
409 dict[k]->crhmin = 0; |
|
410 dict[k]->nohyphen = NULL; |
|
411 dict[k]->nohyphenl = 0; |
|
412 |
|
413 /* read in character set info */ |
|
414 if (k == 0) { |
|
415 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; |
|
416 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { |
|
417 for (i=0;i<MAX_NAME;i++) |
|
418 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) |
|
419 dict[k]->cset[i] = 0; |
|
420 } else { |
|
421 dict[k]->cset[0] = 0; |
|
422 } |
|
423 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); |
|
424 } else { |
|
425 strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); |
|
426 dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; |
|
427 dict[k]->utf8 = dict[0]->utf8; |
|
428 } |
|
429 |
|
430 if (k == 0 || nextlevel) { |
|
431 while (fgets (buf, sizeof(buf), f) != NULL) { |
|
432 if (strncmp(buf, "NEXTLEVEL", 9) == 0) { |
|
433 nextlevel = 1; |
|
434 break; |
|
435 } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab); |
|
436 } |
|
437 } else if (k == 1) { |
|
438 /* default first level: hyphen and ASCII apostrophe */ |
|
439 if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); |
|
440 else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); |
|
441 strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here |
|
442 buf[MAX_CHARS-1] = '\0'; |
|
443 hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ |
|
444 hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ |
|
445 if (dict[0]->utf8) { |
|
446 hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ |
|
447 hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ |
|
448 } |
|
449 } |
|
450 |
|
451 /* Could do unioning of matches here (instead of the preprocessor script). |
|
452 If we did, the pseudocode would look something like this: |
|
453 |
|
454 foreach state in the hash table |
|
455 foreach i = [1..length(state) - 1] |
|
456 state to check is substr (state, i) |
|
457 look it up |
|
458 if found, and if there is a match, union the match in. |
|
459 |
|
460 It's also possible to avoid the quadratic blowup by doing the |
|
461 search in order of increasing state string sizes - then you |
|
462 can break the loop after finding the first match. |
|
463 |
|
464 This step should be optional in any case - if there is a |
|
465 preprocessed rule table, it's always faster to use that. |
|
466 |
|
467 */ |
|
468 |
|
469 /* put in the fallback states */ |
|
470 for (i = 0; i < HASH_SIZE; i++) |
|
471 for (e = hashtab->entries[i]; e; e = e->next) |
|
472 { |
|
473 if (*(e->key)) for (j = 1; 1; j++) |
|
474 { |
|
475 state_num = hnj_hash_lookup (hashtab, e->key + j); |
|
476 if (state_num >= 0) |
|
477 break; |
|
478 } |
|
479 /* KBH: FIXME state 0 fallback_state should always be -1? */ |
|
480 if (e->val) |
|
481 dict[k]->states[e->val].fallback_state = state_num; |
|
482 } |
|
483 #ifdef VERBOSE |
|
484 for (i = 0; i < HASH_SIZE; i++) |
|
485 for (e = hashtab->entries[i]; e; e = e->next) |
|
486 { |
|
487 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, |
|
488 dict[k]->states[e->val].fallback_state); |
|
489 for (j = 0; j < dict[k]->states[e->val].num_trans; j++) |
|
490 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, |
|
491 dict[k]->states[e->val].trans[j].new_state); |
|
492 } |
|
493 #endif |
|
494 |
|
495 #ifndef VERBOSE |
|
496 hnj_hash_free (hashtab); |
|
497 #endif |
|
498 state_num = 0; |
|
499 } |
|
500 fclose(f); |
|
501 if (nextlevel) dict[0]->nextlevel = dict[1]; |
|
502 else { |
|
503 dict[1] -> nextlevel = dict[0]; |
|
504 dict[1]->lhmin = dict[0]->lhmin; |
|
505 dict[1]->rhmin = dict[0]->rhmin; |
|
506 dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); |
|
507 dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3); |
|
508 #ifdef VERBOSE |
|
509 HashTab *r = global[0]; |
|
510 global[0] = global[1]; |
|
511 global[1] = r; |
|
512 #endif |
|
513 return dict[1]; |
|
514 } |
|
515 return dict[0]; |
|
516 } |
|
517 |
|
518 void hnj_hyphen_free (HyphenDict *dict) |
|
519 { |
|
520 int state_num; |
|
521 HyphenState *hstate; |
|
522 |
|
523 for (state_num = 0; state_num < dict->num_states; state_num++) |
|
524 { |
|
525 hstate = &dict->states[state_num]; |
|
526 if (hstate->match) |
|
527 hnj_free (hstate->match); |
|
528 if (hstate->repl) |
|
529 hnj_free (hstate->repl); |
|
530 if (hstate->trans) |
|
531 hnj_free (hstate->trans); |
|
532 } |
|
533 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); |
|
534 |
|
535 if (dict->nohyphen) hnj_free(dict->nohyphen); |
|
536 |
|
537 hnj_free (dict->states); |
|
538 |
|
539 hnj_free (dict); |
|
540 } |
|
541 |
|
542 #define MAX_WORD 256 |
|
543 |
|
544 int hnj_hyphen_hyphenate (HyphenDict *dict, |
|
545 const char *word, int word_size, |
|
546 char *hyphens) |
|
547 { |
|
548 char *prep_word; |
|
549 int i, j, k; |
|
550 int state; |
|
551 char ch; |
|
552 HyphenState *hstate; |
|
553 char *match; |
|
554 int offset; |
|
555 |
|
556 prep_word = hnj_malloc (word_size + 3); |
|
557 |
|
558 j = 0; |
|
559 prep_word[j++] = '.'; |
|
560 |
|
561 for (i = 0; i < word_size; i++) { |
|
562 if (word[i] <= '9' && word[i] >= '0') { |
|
563 prep_word[j++] = '.'; |
|
564 } else { |
|
565 prep_word[j++] = word[i]; |
|
566 } |
|
567 } |
|
568 |
|
569 prep_word[j++] = '.'; |
|
570 prep_word[j] = '\0'; |
|
571 |
|
572 for (i = 0; i < word_size + 5; i++) |
|
573 hyphens[i] = '0'; |
|
574 |
|
575 #ifdef VERBOSE |
|
576 printf ("prep_word = %s\n", prep_word); |
|
577 #endif |
|
578 |
|
579 /* now, run the finite state machine */ |
|
580 state = 0; |
|
581 for (i = 0; i < j; i++) |
|
582 { |
|
583 ch = prep_word[i]; |
|
584 for (;;) |
|
585 { |
|
586 |
|
587 if (state == -1) { |
|
588 /* return 1; */ |
|
589 /* KBH: FIXME shouldn't this be as follows? */ |
|
590 state = 0; |
|
591 goto try_next_letter; |
|
592 } |
|
593 |
|
594 #ifdef VERBOSE |
|
595 char *state_str; |
|
596 state_str = get_state_str (state, 0); |
|
597 |
|
598 for (k = 0; k < i - strlen (state_str); k++) |
|
599 putchar (' '); |
|
600 printf ("%s", state_str); |
|
601 #endif |
|
602 |
|
603 hstate = &dict->states[state]; |
|
604 for (k = 0; k < hstate->num_trans; k++) |
|
605 if (hstate->trans[k].ch == ch) |
|
606 { |
|
607 state = hstate->trans[k].new_state; |
|
608 goto found_state; |
|
609 } |
|
610 state = hstate->fallback_state; |
|
611 #ifdef VERBOSE |
|
612 printf (" falling back, fallback_state %d\n", state); |
|
613 #endif |
|
614 } |
|
615 found_state: |
|
616 #ifdef VERBOSE |
|
617 printf ("found state %d\n",state); |
|
618 #endif |
|
619 /* Additional optimization is possible here - especially, |
|
620 elimination of trailing zeroes from the match. Leading zeroes |
|
621 have already been optimized. */ |
|
622 match = dict->states[state].match; |
|
623 /* replacing rules not handled by hyphen_hyphenate() */ |
|
624 if (match && !dict->states[state].repl) |
|
625 { |
|
626 offset = i + 1 - strlen (match); |
|
627 #ifdef VERBOSE |
|
628 for (k = 0; k < offset; k++) |
|
629 putchar (' '); |
|
630 printf ("%s\n", match); |
|
631 #endif |
|
632 /* This is a linear search because I tried a binary search and |
|
633 found it to be just a teeny bit slower. */ |
|
634 for (k = 0; match[k]; k++) |
|
635 if (hyphens[offset + k] < match[k]) |
|
636 hyphens[offset + k] = match[k]; |
|
637 } |
|
638 |
|
639 /* KBH: we need this to make sure we keep looking in a word */ |
|
640 /* for patterns even if the current character is not known in state 0 */ |
|
641 /* since patterns for hyphenation may occur anywhere in the word */ |
|
642 try_next_letter: ; |
|
643 |
|
644 } |
|
645 #ifdef VERBOSE |
|
646 for (i = 0; i < j; i++) |
|
647 putchar (hyphens[i]); |
|
648 putchar ('\n'); |
|
649 #endif |
|
650 |
|
651 for (i = 0; i < j - 4; i++) |
|
652 #if 0 |
|
653 if (hyphens[i + 1] & 1) |
|
654 hyphens[i] = '-'; |
|
655 #else |
|
656 hyphens[i] = hyphens[i + 1]; |
|
657 #endif |
|
658 hyphens[0] = '0'; |
|
659 for (; i < word_size; i++) |
|
660 hyphens[i] = '0'; |
|
661 hyphens[word_size] = '\0'; |
|
662 |
|
663 hnj_free (prep_word); |
|
664 |
|
665 return 0; |
|
666 } |
|
667 |
|
668 /* Unicode ligature length */ |
|
669 int hnj_ligature(unsigned char c) { |
|
670 switch (c) { |
|
671 case 0x80: /* ff */ |
|
672 case 0x81: /* fi */ |
|
673 case 0x82: return LIG_xx; /* fl */ |
|
674 case 0x83: /* ffi */ |
|
675 case 0x84: return LIG_xxx; /* ffl */ |
|
676 case 0x85: /* long st */ |
|
677 case 0x86: return LIG_xx; /* st */ |
|
678 } |
|
679 return 0; |
|
680 } |
|
681 |
|
682 /* character length of the first n byte of the input word */ |
|
683 int hnj_hyphen_strnlen(const char * word, int n, int utf8) |
|
684 { |
|
685 int i = 0; |
|
686 int j = 0; |
|
687 while (j < n && word[j] != '\0') { |
|
688 i++; |
|
689 // Unicode ligature support |
|
690 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { |
|
691 i += hnj_ligature(word[j + 2]); |
|
692 } |
|
693 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); |
|
694 } |
|
695 return i; |
|
696 } |
|
697 |
|
698 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, |
|
699 char *** rep, int ** pos, int ** cut, int lhmin) |
|
700 { |
|
701 int i = 1, j; |
|
702 |
|
703 // Unicode ligature support |
|
704 if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { |
|
705 i += hnj_ligature(word[2]); |
|
706 } |
|
707 |
|
708 // ignore numbers |
|
709 for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--; |
|
710 |
|
711 for (j = 0; i < lhmin && word[j] != '\0'; i++) do { |
|
712 // check length of the non-standard part |
|
713 if (*rep && *pos && *cut && (*rep)[j]) { |
|
714 char * rh = strchr((*rep)[j], '='); |
|
715 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + |
|
716 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { |
|
717 free((*rep)[j]); |
|
718 (*rep)[j] = NULL; |
|
719 hyphens[j] = '0'; |
|
720 } |
|
721 } else { |
|
722 hyphens[j] = '0'; |
|
723 } |
|
724 j++; |
|
725 |
|
726 // Unicode ligature support |
|
727 if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { |
|
728 i += hnj_ligature(word[j + 2]); |
|
729 } |
|
730 } while (utf8 && (word[j] & 0xc0) == 0x80); |
|
731 return 0; |
|
732 } |
|
733 |
|
734 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, |
|
735 char *** rep, int ** pos, int ** cut, int rhmin) |
|
736 { |
|
737 int i = 0; |
|
738 int j; |
|
739 |
|
740 // ignore numbers |
|
741 for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--; |
|
742 |
|
743 for (j = word_size - 1; i < rhmin && j > 0; j--) { |
|
744 // check length of the non-standard part |
|
745 if (*rep && *pos && *cut && (*rep)[j]) { |
|
746 char * rh = strchr((*rep)[j], '='); |
|
747 if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + |
|
748 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { |
|
749 free((*rep)[j]); |
|
750 (*rep)[j] = NULL; |
|
751 hyphens[j] = '0'; |
|
752 } |
|
753 } else { |
|
754 hyphens[j] = '0'; |
|
755 } |
|
756 if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++; |
|
757 } |
|
758 return 0; |
|
759 } |
|
760 |
|
761 // recursive function for compound level hyphenation |
|
762 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, |
|
763 char * hyphens, char *** rep, int ** pos, int ** cut, |
|
764 int clhmin, int crhmin, int lend, int rend) |
|
765 { |
|
766 char *prep_word; |
|
767 int i, j, k; |
|
768 int state; |
|
769 char ch; |
|
770 HyphenState *hstate; |
|
771 char *match; |
|
772 char *repl; |
|
773 signed char replindex; |
|
774 signed char replcut; |
|
775 int offset; |
|
776 int * matchlen; |
|
777 int * matchindex; |
|
778 char ** matchrepl; |
|
779 int isrepl = 0; |
|
780 int nHyphCount; |
|
781 |
|
782 size_t prep_word_size = word_size + 3; |
|
783 prep_word = hnj_malloc (prep_word_size); |
|
784 matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); |
|
785 matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); |
|
786 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); |
|
787 |
|
788 j = 0; |
|
789 prep_word[j++] = '.'; |
|
790 |
|
791 for (i = 0; i < word_size; i++) { |
|
792 if (word[i] <= '9' && word[i] >= '0') { |
|
793 prep_word[j++] = '.'; |
|
794 } else { |
|
795 prep_word[j++] = word[i]; |
|
796 } |
|
797 } |
|
798 |
|
799 |
|
800 |
|
801 prep_word[j++] = '.'; |
|
802 prep_word[j] = '\0'; |
|
803 |
|
804 for (i = 0; i < j; i++) |
|
805 hyphens[i] = '0'; |
|
806 |
|
807 #ifdef VERBOSE |
|
808 printf ("prep_word = %s\n", prep_word); |
|
809 #endif |
|
810 |
|
811 /* now, run the finite state machine */ |
|
812 state = 0; |
|
813 for (i = 0; i < j; i++) |
|
814 { |
|
815 ch = prep_word[i]; |
|
816 for (;;) |
|
817 { |
|
818 |
|
819 if (state == -1) { |
|
820 /* return 1; */ |
|
821 /* KBH: FIXME shouldn't this be as follows? */ |
|
822 state = 0; |
|
823 goto try_next_letter; |
|
824 } |
|
825 |
|
826 #ifdef VERBOSE |
|
827 char *state_str; |
|
828 state_str = get_state_str (state, 1); |
|
829 |
|
830 for (k = 0; k < i - strlen (state_str); k++) |
|
831 putchar (' '); |
|
832 printf ("%s", state_str); |
|
833 #endif |
|
834 |
|
835 hstate = &dict->states[state]; |
|
836 for (k = 0; k < hstate->num_trans; k++) |
|
837 if (hstate->trans[k].ch == ch) |
|
838 { |
|
839 state = hstate->trans[k].new_state; |
|
840 goto found_state; |
|
841 } |
|
842 state = hstate->fallback_state; |
|
843 #ifdef VERBOSE |
|
844 printf (" falling back, fallback_state %d\n", state); |
|
845 #endif |
|
846 } |
|
847 found_state: |
|
848 #ifdef VERBOSE |
|
849 printf ("found state %d\n",state); |
|
850 #endif |
|
851 /* Additional optimization is possible here - especially, |
|
852 elimination of trailing zeroes from the match. Leading zeroes |
|
853 have already been optimized. */ |
|
854 match = dict->states[state].match; |
|
855 repl = dict->states[state].repl; |
|
856 replindex = dict->states[state].replindex; |
|
857 replcut = dict->states[state].replcut; |
|
858 /* replacing rules not handled by hyphen_hyphenate() */ |
|
859 if (match) |
|
860 { |
|
861 offset = i + 1 - strlen (match); |
|
862 #ifdef VERBOSE |
|
863 for (k = 0; k < offset; k++) |
|
864 putchar (' '); |
|
865 printf ("%s (%s)\n", match, repl); |
|
866 #endif |
|
867 if (repl) { |
|
868 if (!isrepl) for(; isrepl < word_size; isrepl++) { |
|
869 matchrepl[isrepl] = NULL; |
|
870 matchindex[isrepl] = -1; |
|
871 } |
|
872 matchlen[offset + replindex] = replcut; |
|
873 } |
|
874 /* This is a linear search because I tried a binary search and |
|
875 found it to be just a teeny bit slower. */ |
|
876 for (k = 0; match[k]; k++) { |
|
877 if ((hyphens[offset + k] < match[k])) { |
|
878 hyphens[offset + k] = match[k]; |
|
879 if (match[k]&1) { |
|
880 matchrepl[offset + k] = repl; |
|
881 if (repl && (k >= replindex) && (k <= replindex + replcut)) { |
|
882 matchindex[offset + replindex] = offset + k; |
|
883 } |
|
884 } |
|
885 } |
|
886 } |
|
887 |
|
888 } |
|
889 |
|
890 /* KBH: we need this to make sure we keep looking in a word */ |
|
891 /* for patterns even if the current character is not known in state 0 */ |
|
892 /* since patterns for hyphenation may occur anywhere in the word */ |
|
893 try_next_letter: ; |
|
894 |
|
895 } |
|
896 #ifdef VERBOSE |
|
897 for (i = 0; i < j; i++) |
|
898 putchar (hyphens[i]); |
|
899 putchar ('\n'); |
|
900 #endif |
|
901 |
|
902 for (i = 0; i < j - 3; i++) |
|
903 #if 0 |
|
904 if (hyphens[i + 1] & 1) |
|
905 hyphens[i] = '-'; |
|
906 #else |
|
907 hyphens[i] = hyphens[i + 1]; |
|
908 #endif |
|
909 for (; i < word_size; i++) |
|
910 hyphens[i] = '0'; |
|
911 hyphens[word_size] = '\0'; |
|
912 |
|
913 /* now create a new char string showing hyphenation positions */ |
|
914 /* count the hyphens and allocate space for the new hyphenated string */ |
|
915 nHyphCount = 0; |
|
916 for (i = 0; i < word_size; i++) |
|
917 if (hyphens[i]&1) |
|
918 nHyphCount++; |
|
919 j = 0; |
|
920 for (i = 0; i < word_size; i++) { |
|
921 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { |
|
922 if (rep && pos && cut) { |
|
923 if (!*rep) |
|
924 *rep = (char **) calloc(word_size, sizeof(char *)); |
|
925 if (!*pos) |
|
926 *pos = (int *) calloc(word_size, sizeof(int)); |
|
927 if (!*cut) { |
|
928 *cut = (int *) calloc(word_size, sizeof(int)); |
|
929 } |
|
930 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); |
|
931 (*pos)[matchindex[i] - 1] = matchindex[i] - i; |
|
932 (*cut)[matchindex[i] - 1] = matchlen[i]; |
|
933 } |
|
934 j += strlen(matchrepl[matchindex[i]]); |
|
935 i += matchlen[i] - 1; |
|
936 } |
|
937 } |
|
938 |
|
939 hnj_free (matchrepl); |
|
940 hnj_free (matchlen); |
|
941 hnj_free (matchindex); |
|
942 |
|
943 // recursive hyphenation of the first (compound) level segments |
|
944 if (dict->nextlevel) { |
|
945 char ** rep2; |
|
946 int * pos2; |
|
947 int * cut2; |
|
948 char * hyphens2; |
|
949 int begin = 0; |
|
950 |
|
951 rep2 = hnj_malloc (word_size * sizeof(char *)); |
|
952 pos2 = hnj_malloc (word_size * sizeof(int)); |
|
953 cut2 = hnj_malloc (word_size * sizeof(int)); |
|
954 hyphens2 = hnj_malloc (word_size + 3); |
|
955 for (i = 0; i < word_size; i++) rep2[i] = NULL; |
|
956 for (i = 0; i < word_size; i++) if |
|
957 (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { |
|
958 if (i - begin > 1) { |
|
959 int hyph = 0; |
|
960 prep_word[i + 2] = '\0'; |
|
961 /* non-standard hyphenation at compound boundary (Schiffahrt) */ |
|
962 if (rep && *rep && *pos && *cut && (*rep)[i]) { |
|
963 char * l = strchr((*rep)[i], '='); |
|
964 size_t offset = 2 + i - (*pos)[i]; |
|
965 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); |
|
966 prep_word[prep_word_size - 1] = '\0'; |
|
967 if (l) { |
|
968 hyph = (l - (*rep)[i]) - (*pos)[i]; |
|
969 prep_word[2 + i + hyph] = '\0'; |
|
970 } |
|
971 } |
|
972 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, |
|
973 hyphens2, &rep2, &pos2, &cut2, clhmin, |
|
974 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); |
|
975 for (j = 0; j < i - begin - 1; j++) { |
|
976 hyphens[begin + j] = hyphens2[j]; |
|
977 if (rep2[j] && rep && pos && cut) { |
|
978 if (!*rep && !*pos && !*cut) { |
|
979 int k; |
|
980 *rep = (char **) malloc(sizeof(char *) * word_size); |
|
981 *pos = (int *) malloc(sizeof(int) * word_size); |
|
982 *cut = (int *) malloc(sizeof(int) * word_size); |
|
983 for (k = 0; k < word_size; k++) { |
|
984 (*rep)[k] = NULL; |
|
985 (*pos)[k] = 0; |
|
986 (*cut)[k] = 0; |
|
987 } |
|
988 } |
|
989 (*rep)[begin + j] = rep2[j]; |
|
990 (*pos)[begin + j] = pos2[j]; |
|
991 (*cut)[begin + j] = cut2[j]; |
|
992 } |
|
993 } |
|
994 prep_word[i + 2] = word[i + 1]; |
|
995 if (*rep && *pos && *cut && (*rep)[i]) { |
|
996 size_t offset = 1; |
|
997 strncpy(prep_word + offset, word, prep_word_size - offset - 1); |
|
998 prep_word[prep_word_size - 1] = '\0'; |
|
999 } |
|
1000 } |
|
1001 begin = i + 1; |
|
1002 for (j = 0; j < word_size; j++) rep2[j] = NULL; |
|
1003 } |
|
1004 |
|
1005 // non-compound |
|
1006 if (begin == 0) { |
|
1007 hnj_hyphen_hyph_(dict->nextlevel, word, word_size, |
|
1008 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); |
|
1009 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
|
1010 rep, pos, cut, clhmin); |
|
1011 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
|
1012 rep, pos, cut, crhmin); |
|
1013 } |
|
1014 |
|
1015 free(rep2); |
|
1016 free(cut2); |
|
1017 free(pos2); |
|
1018 free(hyphens2); |
|
1019 } |
|
1020 |
|
1021 hnj_free (prep_word); |
|
1022 return 0; |
|
1023 } |
|
1024 |
|
1025 /* UTF-8 normalization of hyphen and non-standard positions */ |
|
1026 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, |
|
1027 char *** rep, int ** pos, int ** cut) |
|
1028 { |
|
1029 int i, j, k; |
|
1030 if ((((unsigned char) word[0]) >> 6) == 2) { |
|
1031 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); |
|
1032 return 1; |
|
1033 } |
|
1034 |
|
1035 /* calculate UTF-8 character positions */ |
|
1036 for (i = 0, j = -1; i < word_size; i++) { |
|
1037 /* beginning of an UTF-8 character (not '10' start bits) */ |
|
1038 if ((((unsigned char) word[i]) >> 6) != 2) j++; |
|
1039 hyphens[j] = hyphens[i]; |
|
1040 if (rep && pos && cut && *rep && *pos && *cut) { |
|
1041 int l = (*pos)[i]; |
|
1042 (*pos)[j] = 0; |
|
1043 for (k = 0; k < l; k++) { |
|
1044 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; |
|
1045 } |
|
1046 k = i - l + 1; |
|
1047 l = k + (*cut)[i]; |
|
1048 (*cut)[j] = 0; |
|
1049 for (; k < l; k++) { |
|
1050 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; |
|
1051 } |
|
1052 (*rep)[j] = (*rep)[i]; |
|
1053 if (j < i) { |
|
1054 (*rep)[i] = NULL; |
|
1055 (*pos)[i] = 0; |
|
1056 (*cut)[i] = 0; |
|
1057 } |
|
1058 } |
|
1059 } |
|
1060 hyphens[j + 1] = '\0'; |
|
1061 #ifdef VERBOSE |
|
1062 printf ("nums: %s\n", hyphens); |
|
1063 #endif |
|
1064 return 0; |
|
1065 } |
|
1066 |
|
1067 /* get the word with all possible hyphenations (output: hyphword) */ |
|
1068 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, |
|
1069 char * hyphword, char *** rep, int ** pos, int ** cut) |
|
1070 { |
|
1071 int hyphenslen = l + 5; |
|
1072 |
|
1073 int i, j; |
|
1074 for (i = 0, j = 0; i < l; i++, j++) { |
|
1075 if (hyphens[i]&1) { |
|
1076 hyphword[j] = word[i]; |
|
1077 if (*rep && *pos && *cut && (*rep)[i]) { |
|
1078 size_t offset = j - (*pos)[i] + 1; |
|
1079 strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1); |
|
1080 hyphword[hyphenslen-1] = '\0'; |
|
1081 j += strlen((*rep)[i]) - (*pos)[i]; |
|
1082 i += (*cut)[i] - (*pos)[i]; |
|
1083 } else hyphword[++j] = '='; |
|
1084 } else hyphword[j] = word[i]; |
|
1085 } |
|
1086 hyphword[j] = '\0'; |
|
1087 } |
|
1088 |
|
1089 |
|
1090 /* main api function with default hyphenmin parameters */ |
|
1091 int hnj_hyphen_hyphenate2 (HyphenDict *dict, |
|
1092 const char *word, int word_size, char * hyphens, |
|
1093 char *hyphword, char *** rep, int ** pos, int ** cut) |
|
1094 { |
|
1095 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
|
1096 dict->clhmin, dict->crhmin, 1, 1); |
|
1097 hnj_hyphen_lhmin(dict->utf8, word, word_size, |
|
1098 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); |
|
1099 hnj_hyphen_rhmin(dict->utf8, word, word_size, |
|
1100 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); |
|
1101 |
|
1102 /* nohyphen */ |
|
1103 if (dict->nohyphen) { |
|
1104 char * nh = dict->nohyphen; |
|
1105 int nhi; |
|
1106 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
|
1107 char * nhy = (char *) strstr(word, nh); |
|
1108 while (nhy) { |
|
1109 hyphens[nhy - word + strlen(nh) - 1] = '0'; |
|
1110 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; |
|
1111 nhy = (char *) strstr(nhy + 1, nh); |
|
1112 } |
|
1113 nh = nh + strlen(nh) + 1; |
|
1114 } |
|
1115 } |
|
1116 |
|
1117 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
|
1118 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
|
1119 #ifdef VERBOSE |
|
1120 printf ("nums: %s\n", hyphens); |
|
1121 #endif |
|
1122 return 0; |
|
1123 } |
|
1124 |
|
1125 /* previous main api function with hyphenmin parameters */ |
|
1126 int hnj_hyphen_hyphenate3 (HyphenDict *dict, |
|
1127 const char *word, int word_size, char * hyphens, |
|
1128 char *hyphword, char *** rep, int ** pos, int ** cut, |
|
1129 int lhmin, int rhmin, int clhmin, int crhmin) |
|
1130 { |
|
1131 lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; |
|
1132 rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; |
|
1133 clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; |
|
1134 crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; |
|
1135 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
|
1136 clhmin, crhmin, 1, 1); |
|
1137 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
|
1138 rep, pos, cut, (lhmin > 0 ? lhmin : 2)); |
|
1139 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
|
1140 rep, pos, cut, (rhmin > 0 ? rhmin : 2)); |
|
1141 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
|
1142 |
|
1143 /* nohyphen */ |
|
1144 if (dict->nohyphen) { |
|
1145 char * nh = dict->nohyphen; |
|
1146 int nhi; |
|
1147 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
|
1148 char * nhy = (char *) strstr(word, nh); |
|
1149 while (nhy) { |
|
1150 hyphens[nhy - word + strlen(nh) - 1] = 0; |
|
1151 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0; |
|
1152 nhy = (char *) strstr(nhy + 1, nh); |
|
1153 } |
|
1154 nh = nh + strlen(nh) + 1; |
|
1155 } |
|
1156 } |
|
1157 |
|
1158 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
|
1159 return 0; |
|
1160 } |