intl/hyphenation/src/hyphen.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
michael@0 2 * licenses follows.
michael@0 3 */
michael@0 4
michael@0 5 /* LibHnj - a library for high quality hyphenation and justification
michael@0 6 * Copyright (C) 1998 Raph Levien,
michael@0 7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
michael@0 8 * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
michael@0 9 * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
michael@0 10 *
michael@0 11 * This library is free software; you can redistribute it and/or
michael@0 12 * modify it under the terms of the GNU Library General Public
michael@0 13 * License as published by the Free Software Foundation; either
michael@0 14 * version 2 of the License, or (at your option) any later version.
michael@0 15 *
michael@0 16 * This library is distributed in the hope that it will be useful,
michael@0 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
michael@0 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
michael@0 19 * Library General Public License for more details.
michael@0 20 *
michael@0 21 * You should have received a copy of the GNU Library General Public
michael@0 22 * License along with this library; if not, write to the
michael@0 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
michael@0 24 * Boston, MA 02111-1307 USA.
michael@0 25 */
michael@0 26
michael@0 27 /*
michael@0 28 * The contents of this file are subject to the Mozilla Public License
michael@0 29 * Version 1.0 (the "MPL"); you may not use this file except in
michael@0 30 * compliance with the MPL. You may obtain a copy of the MPL at
michael@0 31 * http://www.mozilla.org/MPL/
michael@0 32 *
michael@0 33 * Software distributed under the MPL is distributed on an "AS IS" basis,
michael@0 34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
michael@0 35 * for the specific language governing rights and limitations under the
michael@0 36 * MPL.
michael@0 37 *
michael@0 38 */
michael@0 39 #include <stdlib.h> /* for NULL, malloc */
michael@0 40 #include <stdio.h> /* for fprintf */
michael@0 41 #include <string.h> /* for strdup */
michael@0 42
michael@0 43 #ifdef UNX
michael@0 44 #include <unistd.h> /* for exit */
michael@0 45 #endif
michael@0 46
michael@0 47 #define noVERBOSE
michael@0 48
michael@0 49 /* calculate hyphenmin values with long ligature length (2 or 3 characters
michael@0 50 * instead of 1 or 2) for comparison with hyphenation without ligatures */
michael@0 51 #define noLONG_LIGATURE
michael@0 52
michael@0 53 #ifdef LONG_LIGATURE
michael@0 54 #define LIG_xx 1
michael@0 55 #define LIG_xxx 2
michael@0 56 #else
michael@0 57 #define LIG_xx 0
michael@0 58 #define LIG_xxx 1
michael@0 59 #endif
michael@0 60
michael@0 61 #include "hnjalloc.h"
michael@0 62 #include "hyphen.h"
michael@0 63
/* Return a newly allocated copy of the NUL-terminated string s.
 * The copy is obtained from hnj_malloc and must be released with hnj_free.
 * The length is kept in a size_t so that it is never narrowed to int. */
static char *
hnj_strdup (const char *s)
{
  size_t len = strlen (s);
  char *copy = hnj_malloc (len + 1);

  /* len + 1 also copies the terminating NUL */
  memcpy (copy, s, len + 1);
  return copy;
}
michael@0 76
/* Remove a trailing line terminator from s in place.
 *
 * If the last character is '\n' or '\r' it is cut off; if the character
 * before it is '\r' (as in "\r\n"), that one is cut off as well.
 * A string that does not end in a line terminator is left untouched, even
 * when its second-to-last character happens to be '\r'. */
void hnj_strchomp(char * s)
{
  size_t len = strlen(s);

  if (len == 0 || (s[len - 1] != '\r' && s[len - 1] != '\n'))
    return;

  s[--len] = '\0';
  /* only look at the preceding '\r' once a terminator was really removed */
  if (len > 0 && s[len - 1] == '\r')
    s[len - 1] = '\0';
}
michael@0 84
michael@0 85 /* a little bit of a hash table implementation. This simply maps strings
michael@0 86 to state numbers */
michael@0 87
typedef struct _HashTab HashTab;
typedef struct _HashEntry HashEntry;

/* Fixed number of buckets in every hash table. */
#define HASH_SIZE 31627

/* One chained entry: a string key mapped to an integer value. */
struct _HashEntry {
  HashEntry *next;   /* next entry in the same bucket, or NULL */
  char *key;         /* key string */
  int val;           /* value stored for the key */
};

/* The table itself: an array of bucket heads. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
michael@0 103
/* Hash a NUL-terminated string (the char* hash from ASU, adapted from Gtk+).
 * The result is NOT reduced modulo the table size; callers do that. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;

  while (*s != '\0')
    {
      unsigned int top;

      hash = (hash << 4) + *s++;
      /* fold the top four bits back into the low byte and clear them */
      top = hash & 0xf0000000;
      if (top != 0)
        hash ^= (top >> 24) ^ top;
    }
  return hash;
}
michael@0 119
michael@0 120 static HashTab *
michael@0 121 hnj_hash_new (void)
michael@0 122 {
michael@0 123 HashTab *hashtab;
michael@0 124 int i;
michael@0 125
michael@0 126 hashtab = hnj_malloc (sizeof(HashTab));
michael@0 127 for (i = 0; i < HASH_SIZE; i++)
michael@0 128 hashtab->entries[i] = NULL;
michael@0 129
michael@0 130 return hashtab;
michael@0 131 }
michael@0 132
michael@0 133 static void
michael@0 134 hnj_hash_free (HashTab *hashtab)
michael@0 135 {
michael@0 136 int i;
michael@0 137 HashEntry *e, *next;
michael@0 138
michael@0 139 for (i = 0; i < HASH_SIZE; i++)
michael@0 140 for (e = hashtab->entries[i]; e; e = next)
michael@0 141 {
michael@0 142 next = e->next;
michael@0 143 hnj_free (e->key);
michael@0 144 hnj_free (e);
michael@0 145 }
michael@0 146
michael@0 147 hnj_free (hashtab);
michael@0 148 }
michael@0 149
michael@0 150 /* assumes that key is not already present! */
michael@0 151 static void
michael@0 152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
michael@0 153 {
michael@0 154 int i;
michael@0 155 HashEntry *e;
michael@0 156
michael@0 157 i = hnj_string_hash (key) % HASH_SIZE;
michael@0 158 e = hnj_malloc (sizeof(HashEntry));
michael@0 159 e->next = hashtab->entries[i];
michael@0 160 e->key = hnj_strdup (key);
michael@0 161 e->val = val;
michael@0 162 hashtab->entries[i] = e;
michael@0 163 }
michael@0 164
michael@0 165 /* return val if found, otherwise -1 */
michael@0 166 static int
michael@0 167 hnj_hash_lookup (HashTab *hashtab, const char *key)
michael@0 168 {
michael@0 169 int i;
michael@0 170 HashEntry *e;
michael@0 171 i = hnj_string_hash (key) % HASH_SIZE;
michael@0 172 for (e = hashtab->entries[i]; e; e = e->next)
michael@0 173 if (!strcmp (key, e->key))
michael@0 174 return e->val;
michael@0 175 return -1;
michael@0 176 }
michael@0 177
michael@0 178 /* Get the state number, allocating a new state if necessary. */
michael@0 179 static int
michael@0 180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
michael@0 181 {
michael@0 182 int state_num;
michael@0 183
michael@0 184 state_num = hnj_hash_lookup (hashtab, string);
michael@0 185
michael@0 186 if (state_num >= 0)
michael@0 187 return state_num;
michael@0 188
michael@0 189 hnj_hash_insert (hashtab, string, dict->num_states);
michael@0 190 /* predicate is true if dict->num_states is a power of two */
michael@0 191 if (!(dict->num_states & (dict->num_states - 1)))
michael@0 192 {
michael@0 193 dict->states = hnj_realloc (dict->states,
michael@0 194 (dict->num_states << 1) *
michael@0 195 sizeof(HyphenState));
michael@0 196 }
michael@0 197 dict->states[dict->num_states].match = NULL;
michael@0 198 dict->states[dict->num_states].repl = NULL;
michael@0 199 dict->states[dict->num_states].fallback_state = -1;
michael@0 200 dict->states[dict->num_states].num_trans = 0;
michael@0 201 dict->states[dict->num_states].trans = NULL;
michael@0 202 return dict->num_states++;
michael@0 203 }
michael@0 204
michael@0 205 /* add a transition from state1 to state2 through ch - assumes that the
michael@0 206 transition does not already exist */
michael@0 207 static void
michael@0 208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
michael@0 209 {
michael@0 210 int num_trans;
michael@0 211
michael@0 212 num_trans = dict->states[state1].num_trans;
michael@0 213 if (num_trans == 0)
michael@0 214 {
michael@0 215 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
michael@0 216 }
michael@0 217 else if (!(num_trans & (num_trans - 1)))
michael@0 218 {
michael@0 219 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
michael@0 220 (num_trans << 1) *
michael@0 221 sizeof(HyphenTrans));
michael@0 222 }
michael@0 223 dict->states[state1].trans[num_trans].ch = ch;
michael@0 224 dict->states[state1].trans[num_trans].new_state = state2;
michael@0 225 dict->states[state1].num_trans++;
michael@0 226 }
michael@0 227
#ifdef VERBOSE
/* Hash tables kept alive for debug output, one per dictionary level.
   hnj_hyphen_load stores into global[0] and global[1], so two slots are
   required. */
HashTab *global[2];

/* Debug helper: return the key string whose value is `state` in the hash
   table of the given level, or NULL when no entry has that value.
   This scans every bucket of the table. */
static char *
get_state_str (int state, int level)
{
  int i;
  HashEntry *e;

  for (i = 0; i < HASH_SIZE; i++)
    for (e = global[level]->entries[i]; e; e = e->next)
      if (e->val == state)
        return e->key;
  return NULL;
}
#endif
michael@0 244
michael@0 245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
michael@0 246 int i, j;
michael@0 247 char word[MAX_CHARS];
michael@0 248 char pattern[MAX_CHARS];
michael@0 249 char * repl;
michael@0 250 signed char replindex;
michael@0 251 signed char replcut;
michael@0 252 int state_num = 0;
michael@0 253 int last_state;
michael@0 254 char ch;
michael@0 255 int found;
michael@0 256
michael@0 257 if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
michael@0 258 dict->lhmin = atoi(buf + 13);
michael@0 259 return;
michael@0 260 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
michael@0 261 dict->rhmin = atoi(buf + 14);
michael@0 262 return;
michael@0 263 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
michael@0 264 dict->clhmin = atoi(buf + 21);
michael@0 265 return;
michael@0 266 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
michael@0 267 dict->crhmin = atoi(buf + 22);
michael@0 268 return;
michael@0 269 } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
michael@0 270 char * space = buf + 8;
michael@0 271 while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
michael@0 272 if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
michael@0 273 if (dict->nohyphen) {
michael@0 274 char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
michael@0 275 *nhe = 0;
michael@0 276 for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
michael@0 277 if (*nhe == ',') {
michael@0 278 dict->nohyphenl++;
michael@0 279 *nhe = 0;
michael@0 280 }
michael@0 281 }
michael@0 282 }
michael@0 283 return;
michael@0 284 }
michael@0 285 j = 0;
michael@0 286 pattern[j] = '0';
michael@0 287 repl = strchr(buf, '/');
michael@0 288 replindex = 0;
michael@0 289 replcut = 0;
michael@0 290 if (repl) {
michael@0 291 char * index = strchr(repl + 1, ',');
michael@0 292 *repl = '\0';
michael@0 293 if (index) {
michael@0 294 char * index2 = strchr(index + 1, ',');
michael@0 295 *index = '\0';
michael@0 296 if (index2) {
michael@0 297 *index2 = '\0';
michael@0 298 replindex = (signed char) atoi(index + 1) - 1;
michael@0 299 replcut = (signed char) atoi(index2 + 1);
michael@0 300 }
michael@0 301 } else {
michael@0 302 hnj_strchomp(repl + 1);
michael@0 303 replindex = 0;
michael@0 304 replcut = (signed char) strlen(buf);
michael@0 305 }
michael@0 306 repl = hnj_strdup(repl + 1);
michael@0 307 }
michael@0 308 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
michael@0 309 {
michael@0 310 if (buf[i] >= '0' && buf[i] <= '9')
michael@0 311 pattern[j] = buf[i];
michael@0 312 else
michael@0 313 {
michael@0 314 word[j] = buf[i];
michael@0 315 pattern[++j] = '0';
michael@0 316 }
michael@0 317 }
michael@0 318 word[j] = '\0';
michael@0 319 pattern[j + 1] = '\0';
michael@0 320
michael@0 321 i = 0;
michael@0 322 if (!repl) {
michael@0 323 /* Optimize away leading zeroes */
michael@0 324 for (; pattern[i] == '0'; i++);
michael@0 325 } else {
michael@0 326 if (*word == '.') i++;
michael@0 327 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
michael@0 328 if (dict->utf8) {
michael@0 329 int pu = -1; /* unicode character position */
michael@0 330 int ps = -1; /* unicode start position (original replindex) */
michael@0 331 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
michael@0 332 for (; pc < (strlen(word) + 1); pc++) {
michael@0 333 /* beginning of an UTF-8 character (not '10' start bits) */
michael@0 334 if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
michael@0 335 if ((ps < 0) && (replindex == pu)) {
michael@0 336 ps = replindex;
michael@0 337 replindex = (signed char) pc;
michael@0 338 }
michael@0 339 if ((ps >= 0) && ((pu - ps) == replcut)) {
michael@0 340 replcut = (signed char) (pc - replindex);
michael@0 341 break;
michael@0 342 }
michael@0 343 }
michael@0 344 if (*word == '.') replindex--;
michael@0 345 }
michael@0 346 }
michael@0 347
michael@0 348 #ifdef VERBOSE
michael@0 349 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
michael@0 350 #endif
michael@0 351 found = hnj_hash_lookup (hashtab, word);
michael@0 352 state_num = hnj_get_state (dict, hashtab, word);
michael@0 353 dict->states[state_num].match = hnj_strdup (pattern + i);
michael@0 354 dict->states[state_num].repl = repl;
michael@0 355 dict->states[state_num].replindex = replindex;
michael@0 356 if (!replcut) {
michael@0 357 dict->states[state_num].replcut = (signed char) strlen(word);
michael@0 358 } else {
michael@0 359 dict->states[state_num].replcut = replcut;
michael@0 360 }
michael@0 361
michael@0 362 /* now, put in the prefix transitions */
michael@0 363 for (; found < 0 ;j--)
michael@0 364 {
michael@0 365 last_state = state_num;
michael@0 366 ch = word[j - 1];
michael@0 367 word[j - 1] = '\0';
michael@0 368 found = hnj_hash_lookup (hashtab, word);
michael@0 369 state_num = hnj_get_state (dict, hashtab, word);
michael@0 370 hnj_add_trans (dict, state_num, last_state, ch);
michael@0 371 }
michael@0 372 }
michael@0 373
michael@0 374 HyphenDict *
michael@0 375 hnj_hyphen_load (const char *fn)
michael@0 376 {
michael@0 377 HyphenDict *dict[2];
michael@0 378 HashTab *hashtab;
michael@0 379 FILE *f;
michael@0 380 char buf[MAX_CHARS];
michael@0 381 int nextlevel = 0;
michael@0 382 int i, j, k;
michael@0 383 HashEntry *e;
michael@0 384 int state_num = 0;
michael@0 385
michael@0 386 f = fopen (fn, "r");
michael@0 387 if (f == NULL)
michael@0 388 return NULL;
michael@0 389
michael@0 390 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
michael@0 391 for (k = 0; k < 2; k++) {
michael@0 392 hashtab = hnj_hash_new ();
michael@0 393 #ifdef VERBOSE
michael@0 394 global[k] = hashtab;
michael@0 395 #endif
michael@0 396 hnj_hash_insert (hashtab, "", 0);
michael@0 397 dict[k] = hnj_malloc (sizeof(HyphenDict));
michael@0 398 dict[k]->num_states = 1;
michael@0 399 dict[k]->states = hnj_malloc (sizeof(HyphenState));
michael@0 400 dict[k]->states[0].match = NULL;
michael@0 401 dict[k]->states[0].repl = NULL;
michael@0 402 dict[k]->states[0].fallback_state = -1;
michael@0 403 dict[k]->states[0].num_trans = 0;
michael@0 404 dict[k]->states[0].trans = NULL;
michael@0 405 dict[k]->nextlevel = NULL;
michael@0 406 dict[k]->lhmin = 0;
michael@0 407 dict[k]->rhmin = 0;
michael@0 408 dict[k]->clhmin = 0;
michael@0 409 dict[k]->crhmin = 0;
michael@0 410 dict[k]->nohyphen = NULL;
michael@0 411 dict[k]->nohyphenl = 0;
michael@0 412
michael@0 413 /* read in character set info */
michael@0 414 if (k == 0) {
michael@0 415 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
michael@0 416 if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
michael@0 417 for (i=0;i<MAX_NAME;i++)
michael@0 418 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
michael@0 419 dict[k]->cset[i] = 0;
michael@0 420 } else {
michael@0 421 dict[k]->cset[0] = 0;
michael@0 422 }
michael@0 423 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
michael@0 424 } else {
michael@0 425 strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
michael@0 426 dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
michael@0 427 dict[k]->utf8 = dict[0]->utf8;
michael@0 428 }
michael@0 429
michael@0 430 if (k == 0 || nextlevel) {
michael@0 431 while (fgets (buf, sizeof(buf), f) != NULL) {
michael@0 432 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
michael@0 433 nextlevel = 1;
michael@0 434 break;
michael@0 435 } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
michael@0 436 }
michael@0 437 } else if (k == 1) {
michael@0 438 /* default first level: hyphen and ASCII apostrophe */
michael@0 439 if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
michael@0 440 else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
michael@0 441 strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
michael@0 442 buf[MAX_CHARS-1] = '\0';
michael@0 443 hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
michael@0 444 hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
michael@0 445 if (dict[0]->utf8) {
michael@0 446 hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
michael@0 447 hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
michael@0 448 }
michael@0 449 }
michael@0 450
michael@0 451 /* Could do unioning of matches here (instead of the preprocessor script).
michael@0 452 If we did, the pseudocode would look something like this:
michael@0 453
michael@0 454 foreach state in the hash table
michael@0 455 foreach i = [1..length(state) - 1]
michael@0 456 state to check is substr (state, i)
michael@0 457 look it up
michael@0 458 if found, and if there is a match, union the match in.
michael@0 459
michael@0 460 It's also possible to avoid the quadratic blowup by doing the
michael@0 461 search in order of increasing state string sizes - then you
michael@0 462 can break the loop after finding the first match.
michael@0 463
michael@0 464 This step should be optional in any case - if there is a
michael@0 465 preprocessed rule table, it's always faster to use that.
michael@0 466
michael@0 467 */
michael@0 468
michael@0 469 /* put in the fallback states */
michael@0 470 for (i = 0; i < HASH_SIZE; i++)
michael@0 471 for (e = hashtab->entries[i]; e; e = e->next)
michael@0 472 {
michael@0 473 if (*(e->key)) for (j = 1; 1; j++)
michael@0 474 {
michael@0 475 state_num = hnj_hash_lookup (hashtab, e->key + j);
michael@0 476 if (state_num >= 0)
michael@0 477 break;
michael@0 478 }
michael@0 479 /* KBH: FIXME state 0 fallback_state should always be -1? */
michael@0 480 if (e->val)
michael@0 481 dict[k]->states[e->val].fallback_state = state_num;
michael@0 482 }
michael@0 483 #ifdef VERBOSE
michael@0 484 for (i = 0; i < HASH_SIZE; i++)
michael@0 485 for (e = hashtab->entries[i]; e; e = e->next)
michael@0 486 {
michael@0 487 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
michael@0 488 dict[k]->states[e->val].fallback_state);
michael@0 489 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
michael@0 490 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
michael@0 491 dict[k]->states[e->val].trans[j].new_state);
michael@0 492 }
michael@0 493 #endif
michael@0 494
michael@0 495 #ifndef VERBOSE
michael@0 496 hnj_hash_free (hashtab);
michael@0 497 #endif
michael@0 498 state_num = 0;
michael@0 499 }
michael@0 500 fclose(f);
michael@0 501 if (nextlevel) dict[0]->nextlevel = dict[1];
michael@0 502 else {
michael@0 503 dict[1] -> nextlevel = dict[0];
michael@0 504 dict[1]->lhmin = dict[0]->lhmin;
michael@0 505 dict[1]->rhmin = dict[0]->rhmin;
michael@0 506 dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
michael@0 507 dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
michael@0 508 #ifdef VERBOSE
michael@0 509 HashTab *r = global[0];
michael@0 510 global[0] = global[1];
michael@0 511 global[1] = r;
michael@0 512 #endif
michael@0 513 return dict[1];
michael@0 514 }
michael@0 515 return dict[0];
michael@0 516 }
michael@0 517
michael@0 518 void hnj_hyphen_free (HyphenDict *dict)
michael@0 519 {
michael@0 520 int state_num;
michael@0 521 HyphenState *hstate;
michael@0 522
michael@0 523 for (state_num = 0; state_num < dict->num_states; state_num++)
michael@0 524 {
michael@0 525 hstate = &dict->states[state_num];
michael@0 526 if (hstate->match)
michael@0 527 hnj_free (hstate->match);
michael@0 528 if (hstate->repl)
michael@0 529 hnj_free (hstate->repl);
michael@0 530 if (hstate->trans)
michael@0 531 hnj_free (hstate->trans);
michael@0 532 }
michael@0 533 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
michael@0 534
michael@0 535 if (dict->nohyphen) hnj_free(dict->nohyphen);
michael@0 536
michael@0 537 hnj_free (dict->states);
michael@0 538
michael@0 539 hnj_free (dict);
michael@0 540 }
michael@0 541
michael@0 542 #define MAX_WORD 256
michael@0 543
michael@0 544 int hnj_hyphen_hyphenate (HyphenDict *dict,
michael@0 545 const char *word, int word_size,
michael@0 546 char *hyphens)
michael@0 547 {
michael@0 548 char *prep_word;
michael@0 549 int i, j, k;
michael@0 550 int state;
michael@0 551 char ch;
michael@0 552 HyphenState *hstate;
michael@0 553 char *match;
michael@0 554 int offset;
michael@0 555
michael@0 556 prep_word = hnj_malloc (word_size + 3);
michael@0 557
michael@0 558 j = 0;
michael@0 559 prep_word[j++] = '.';
michael@0 560
michael@0 561 for (i = 0; i < word_size; i++) {
michael@0 562 if (word[i] <= '9' && word[i] >= '0') {
michael@0 563 prep_word[j++] = '.';
michael@0 564 } else {
michael@0 565 prep_word[j++] = word[i];
michael@0 566 }
michael@0 567 }
michael@0 568
michael@0 569 prep_word[j++] = '.';
michael@0 570 prep_word[j] = '\0';
michael@0 571
michael@0 572 for (i = 0; i < word_size + 5; i++)
michael@0 573 hyphens[i] = '0';
michael@0 574
michael@0 575 #ifdef VERBOSE
michael@0 576 printf ("prep_word = %s\n", prep_word);
michael@0 577 #endif
michael@0 578
michael@0 579 /* now, run the finite state machine */
michael@0 580 state = 0;
michael@0 581 for (i = 0; i < j; i++)
michael@0 582 {
michael@0 583 ch = prep_word[i];
michael@0 584 for (;;)
michael@0 585 {
michael@0 586
michael@0 587 if (state == -1) {
michael@0 588 /* return 1; */
michael@0 589 /* KBH: FIXME shouldn't this be as follows? */
michael@0 590 state = 0;
michael@0 591 goto try_next_letter;
michael@0 592 }
michael@0 593
michael@0 594 #ifdef VERBOSE
michael@0 595 char *state_str;
michael@0 596 state_str = get_state_str (state, 0);
michael@0 597
michael@0 598 for (k = 0; k < i - strlen (state_str); k++)
michael@0 599 putchar (' ');
michael@0 600 printf ("%s", state_str);
michael@0 601 #endif
michael@0 602
michael@0 603 hstate = &dict->states[state];
michael@0 604 for (k = 0; k < hstate->num_trans; k++)
michael@0 605 if (hstate->trans[k].ch == ch)
michael@0 606 {
michael@0 607 state = hstate->trans[k].new_state;
michael@0 608 goto found_state;
michael@0 609 }
michael@0 610 state = hstate->fallback_state;
michael@0 611 #ifdef VERBOSE
michael@0 612 printf (" falling back, fallback_state %d\n", state);
michael@0 613 #endif
michael@0 614 }
michael@0 615 found_state:
michael@0 616 #ifdef VERBOSE
michael@0 617 printf ("found state %d\n",state);
michael@0 618 #endif
michael@0 619 /* Additional optimization is possible here - especially,
michael@0 620 elimination of trailing zeroes from the match. Leading zeroes
michael@0 621 have already been optimized. */
michael@0 622 match = dict->states[state].match;
michael@0 623 /* replacing rules not handled by hyphen_hyphenate() */
michael@0 624 if (match && !dict->states[state].repl)
michael@0 625 {
michael@0 626 offset = i + 1 - strlen (match);
michael@0 627 #ifdef VERBOSE
michael@0 628 for (k = 0; k < offset; k++)
michael@0 629 putchar (' ');
michael@0 630 printf ("%s\n", match);
michael@0 631 #endif
michael@0 632 /* This is a linear search because I tried a binary search and
michael@0 633 found it to be just a teeny bit slower. */
michael@0 634 for (k = 0; match[k]; k++)
michael@0 635 if (hyphens[offset + k] < match[k])
michael@0 636 hyphens[offset + k] = match[k];
michael@0 637 }
michael@0 638
michael@0 639 /* KBH: we need this to make sure we keep looking in a word */
michael@0 640 /* for patterns even if the current character is not known in state 0 */
michael@0 641 /* since patterns for hyphenation may occur anywhere in the word */
michael@0 642 try_next_letter: ;
michael@0 643
michael@0 644 }
michael@0 645 #ifdef VERBOSE
michael@0 646 for (i = 0; i < j; i++)
michael@0 647 putchar (hyphens[i]);
michael@0 648 putchar ('\n');
michael@0 649 #endif
michael@0 650
michael@0 651 for (i = 0; i < j - 4; i++)
michael@0 652 #if 0
michael@0 653 if (hyphens[i + 1] & 1)
michael@0 654 hyphens[i] = '-';
michael@0 655 #else
michael@0 656 hyphens[i] = hyphens[i + 1];
michael@0 657 #endif
michael@0 658 hyphens[0] = '0';
michael@0 659 for (; i < word_size; i++)
michael@0 660 hyphens[i] = '0';
michael@0 661 hyphens[word_size] = '\0';
michael@0 662
michael@0 663 hnj_free (prep_word);
michael@0 664
michael@0 665 return 0;
michael@0 666 }
michael@0 667
michael@0 668 /* Unicode ligature length */
michael@0 669 int hnj_ligature(unsigned char c) {
michael@0 670 switch (c) {
michael@0 671 case 0x80: /* ff */
michael@0 672 case 0x81: /* fi */
michael@0 673 case 0x82: return LIG_xx; /* fl */
michael@0 674 case 0x83: /* ffi */
michael@0 675 case 0x84: return LIG_xxx; /* ffl */
michael@0 676 case 0x85: /* long st */
michael@0 677 case 0x86: return LIG_xx; /* st */
michael@0 678 }
michael@0 679 return 0;
michael@0 680 }
michael@0 681
/* Count the characters in the first n bytes of word, stopping early at a
   NUL.  With utf8 set, continuation bytes (10xxxxxx) are not counted on
   their own and a ligature adds hnj_ligature() to the count. */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
  int chars = 0;
  int at;

  for (at = 0; at < n && word[at] != '\0'; ) {
    chars++;
    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[at] == 0xEF) &&
        ((unsigned char) word[at + 1] == 0xAC)) {
      chars += hnj_ligature(word[at + 2]);
    }
    /* step over the lead byte, then over its continuation bytes */
    at++;
    while (utf8 && (word[at] & 0xc0) == 0x80)
      at++;
  }
  return chars;
}
michael@0 697
/* Clear hyphenation points that would leave fewer than lhmin characters at
 * the start of word.
 *
 * For a position carrying a replacement in *rep, the length is measured
 * from the part of the replacement before '='; if that is too short the
 * replacement is freed and the point cleared.  Leading digits of word are
 * not counted.  word_size is not used.  Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
	char *** rep, int ** pos, int ** cut, int lhmin)
{
  int count = 1;
  int at;

  /* Unicode ligature support */
  if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
    count += hnj_ligature(word[2]);
  }

  /* ignore numbers */
  at = 0;
  while (word[at] >= '0' && word[at] <= '9') {
    count--;
    at++;
  }

  at = 0;
  while (count < lhmin && word[at] != '\0') {
    /* the inner loop handles all bytes of one (possibly multi-byte) character */
    do {
      /* check length of the non-standard part */
      if (*rep && *pos && *cut && (*rep)[at]) {
        char * rh = strchr((*rep)[at], '=');
        if (rh && (hnj_hyphen_strnlen(word, at - (*pos)[at] + 1, utf8) +
                   hnj_hyphen_strnlen((*rep)[at], rh - (*rep)[at], utf8)) < lhmin) {
          free((*rep)[at]);
          (*rep)[at] = NULL;
          hyphens[at] = '0';
        }
      } else {
        hyphens[at] = '0';
      }
      at++;

      /* Unicode ligature support */
      if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
        count += hnj_ligature(word[at + 2]);
      }
    } while (utf8 && (word[at] & 0xc0) == 0x80);
    count++;
  }
  return 0;
}
michael@0 733
/* Clear hyphenation points that would leave fewer than rhmin characters at
 * the end of word (word_size bytes).
 *
 * For a position carrying a replacement in *rep, the length is measured
 * from the part of the replacement after '='; if that is too short the
 * replacement is freed and the point cleared.  Trailing digits of word are
 * not counted.  Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
	char *** rep, int ** pos, int ** cut, int rhmin)
{
  int count = 0;
  int at;

  /* ignore numbers */
  at = word_size - 1;
  while (at > 0 && word[at] <= '9' && word[at] >= '0') {
    count--;
    at--;
  }

  at = word_size - 1;
  while (count < rhmin && at > 0) {
    /* check length of the non-standard part */
    if (*rep && *pos && *cut && (*rep)[at]) {
      char * rh = strchr((*rep)[at], '=');
      if (rh && (hnj_hyphen_strnlen(word + at - (*pos)[at] + (*cut)[at] + 1, 100, utf8) +
                 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
        free((*rep)[at]);
        (*rep)[at] = NULL;
        hyphens[at] = '0';
      }
    } else {
      hyphens[at] = '0';
    }
    /* a UTF-8 continuation byte (10xxxxxx) does not start a character */
    if (!(utf8 && (word[at] & 0xc0) == 0x80))
      count++;
    at--;
  }
  return 0;
}
michael@0 760
michael@0 761 // recursive function for compound level hyphenation
michael@0 762 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
michael@0 763 char * hyphens, char *** rep, int ** pos, int ** cut,
michael@0 764 int clhmin, int crhmin, int lend, int rend)
michael@0 765 {
michael@0 766 char *prep_word;
michael@0 767 int i, j, k;
michael@0 768 int state;
michael@0 769 char ch;
michael@0 770 HyphenState *hstate;
michael@0 771 char *match;
michael@0 772 char *repl;
michael@0 773 signed char replindex;
michael@0 774 signed char replcut;
michael@0 775 int offset;
michael@0 776 int * matchlen;
michael@0 777 int * matchindex;
michael@0 778 char ** matchrepl;
michael@0 779 int isrepl = 0;
michael@0 780 int nHyphCount;
michael@0 781
michael@0 782 size_t prep_word_size = word_size + 3;
michael@0 783 prep_word = hnj_malloc (prep_word_size);
michael@0 784 matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
michael@0 785 matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
michael@0 786 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
michael@0 787
michael@0 788 j = 0;
michael@0 789 prep_word[j++] = '.';
michael@0 790
michael@0 791 for (i = 0; i < word_size; i++) {
michael@0 792 if (word[i] <= '9' && word[i] >= '0') {
michael@0 793 prep_word[j++] = '.';
michael@0 794 } else {
michael@0 795 prep_word[j++] = word[i];
michael@0 796 }
michael@0 797 }
michael@0 798
michael@0 799
michael@0 800
michael@0 801 prep_word[j++] = '.';
michael@0 802 prep_word[j] = '\0';
michael@0 803
michael@0 804 for (i = 0; i < j; i++)
michael@0 805 hyphens[i] = '0';
michael@0 806
michael@0 807 #ifdef VERBOSE
michael@0 808 printf ("prep_word = %s\n", prep_word);
michael@0 809 #endif
michael@0 810
michael@0 811 /* now, run the finite state machine */
michael@0 812 state = 0;
michael@0 813 for (i = 0; i < j; i++)
michael@0 814 {
michael@0 815 ch = prep_word[i];
michael@0 816 for (;;)
michael@0 817 {
michael@0 818
michael@0 819 if (state == -1) {
michael@0 820 /* return 1; */
michael@0 821 /* KBH: FIXME shouldn't this be as follows? */
michael@0 822 state = 0;
michael@0 823 goto try_next_letter;
michael@0 824 }
michael@0 825
michael@0 826 #ifdef VERBOSE
michael@0 827 char *state_str;
michael@0 828 state_str = get_state_str (state, 1);
michael@0 829
michael@0 830 for (k = 0; k < i - strlen (state_str); k++)
michael@0 831 putchar (' ');
michael@0 832 printf ("%s", state_str);
michael@0 833 #endif
michael@0 834
michael@0 835 hstate = &dict->states[state];
michael@0 836 for (k = 0; k < hstate->num_trans; k++)
michael@0 837 if (hstate->trans[k].ch == ch)
michael@0 838 {
michael@0 839 state = hstate->trans[k].new_state;
michael@0 840 goto found_state;
michael@0 841 }
michael@0 842 state = hstate->fallback_state;
michael@0 843 #ifdef VERBOSE
michael@0 844 printf (" falling back, fallback_state %d\n", state);
michael@0 845 #endif
michael@0 846 }
michael@0 847 found_state:
michael@0 848 #ifdef VERBOSE
michael@0 849 printf ("found state %d\n",state);
michael@0 850 #endif
michael@0 851 /* Additional optimization is possible here - especially,
michael@0 852 elimination of trailing zeroes from the match. Leading zeroes
michael@0 853 have already been optimized. */
michael@0 854 match = dict->states[state].match;
michael@0 855 repl = dict->states[state].repl;
michael@0 856 replindex = dict->states[state].replindex;
michael@0 857 replcut = dict->states[state].replcut;
michael@0 858 /* replacing rules not handled by hyphen_hyphenate() */
michael@0 859 if (match)
michael@0 860 {
michael@0 861 offset = i + 1 - strlen (match);
michael@0 862 #ifdef VERBOSE
michael@0 863 for (k = 0; k < offset; k++)
michael@0 864 putchar (' ');
michael@0 865 printf ("%s (%s)\n", match, repl);
michael@0 866 #endif
michael@0 867 if (repl) {
michael@0 868 if (!isrepl) for(; isrepl < word_size; isrepl++) {
michael@0 869 matchrepl[isrepl] = NULL;
michael@0 870 matchindex[isrepl] = -1;
michael@0 871 }
michael@0 872 matchlen[offset + replindex] = replcut;
michael@0 873 }
michael@0 874 /* This is a linear search because I tried a binary search and
michael@0 875 found it to be just a teeny bit slower. */
michael@0 876 for (k = 0; match[k]; k++) {
michael@0 877 if ((hyphens[offset + k] < match[k])) {
michael@0 878 hyphens[offset + k] = match[k];
michael@0 879 if (match[k]&1) {
michael@0 880 matchrepl[offset + k] = repl;
michael@0 881 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
michael@0 882 matchindex[offset + replindex] = offset + k;
michael@0 883 }
michael@0 884 }
michael@0 885 }
michael@0 886 }
michael@0 887
michael@0 888 }
michael@0 889
michael@0 890 /* KBH: we need this to make sure we keep looking in a word */
michael@0 891 /* for patterns even if the current character is not known in state 0 */
michael@0 892 /* since patterns for hyphenation may occur anywhere in the word */
michael@0 893 try_next_letter: ;
michael@0 894
michael@0 895 }
michael@0 896 #ifdef VERBOSE
michael@0 897 for (i = 0; i < j; i++)
michael@0 898 putchar (hyphens[i]);
michael@0 899 putchar ('\n');
michael@0 900 #endif
michael@0 901
michael@0 902 for (i = 0; i < j - 3; i++)
michael@0 903 #if 0
michael@0 904 if (hyphens[i + 1] & 1)
michael@0 905 hyphens[i] = '-';
michael@0 906 #else
michael@0 907 hyphens[i] = hyphens[i + 1];
michael@0 908 #endif
michael@0 909 for (; i < word_size; i++)
michael@0 910 hyphens[i] = '0';
michael@0 911 hyphens[word_size] = '\0';
michael@0 912
michael@0 913 /* now create a new char string showing hyphenation positions */
michael@0 914 /* count the hyphens and allocate space for the new hyphenated string */
michael@0 915 nHyphCount = 0;
michael@0 916 for (i = 0; i < word_size; i++)
michael@0 917 if (hyphens[i]&1)
michael@0 918 nHyphCount++;
michael@0 919 j = 0;
michael@0 920 for (i = 0; i < word_size; i++) {
michael@0 921 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
michael@0 922 if (rep && pos && cut) {
michael@0 923 if (!*rep)
michael@0 924 *rep = (char **) calloc(word_size, sizeof(char *));
michael@0 925 if (!*pos)
michael@0 926 *pos = (int *) calloc(word_size, sizeof(int));
michael@0 927 if (!*cut) {
michael@0 928 *cut = (int *) calloc(word_size, sizeof(int));
michael@0 929 }
michael@0 930 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
michael@0 931 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
michael@0 932 (*cut)[matchindex[i] - 1] = matchlen[i];
michael@0 933 }
michael@0 934 j += strlen(matchrepl[matchindex[i]]);
michael@0 935 i += matchlen[i] - 1;
michael@0 936 }
michael@0 937 }
michael@0 938
michael@0 939 hnj_free (matchrepl);
michael@0 940 hnj_free (matchlen);
michael@0 941 hnj_free (matchindex);
michael@0 942
michael@0 943 // recursive hyphenation of the first (compound) level segments
michael@0 944 if (dict->nextlevel) {
michael@0 945 char ** rep2;
michael@0 946 int * pos2;
michael@0 947 int * cut2;
michael@0 948 char * hyphens2;
michael@0 949 int begin = 0;
michael@0 950
michael@0 951 rep2 = hnj_malloc (word_size * sizeof(char *));
michael@0 952 pos2 = hnj_malloc (word_size * sizeof(int));
michael@0 953 cut2 = hnj_malloc (word_size * sizeof(int));
michael@0 954 hyphens2 = hnj_malloc (word_size + 3);
michael@0 955 for (i = 0; i < word_size; i++) rep2[i] = NULL;
michael@0 956 for (i = 0; i < word_size; i++) if
michael@0 957 (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
michael@0 958 if (i - begin > 1) {
michael@0 959 int hyph = 0;
michael@0 960 prep_word[i + 2] = '\0';
michael@0 961 /* non-standard hyphenation at compound boundary (Schiffahrt) */
michael@0 962 if (rep && *rep && *pos && *cut && (*rep)[i]) {
michael@0 963 char * l = strchr((*rep)[i], '=');
michael@0 964 size_t offset = 2 + i - (*pos)[i];
michael@0 965 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
michael@0 966 prep_word[prep_word_size - 1] = '\0';
michael@0 967 if (l) {
michael@0 968 hyph = (l - (*rep)[i]) - (*pos)[i];
michael@0 969 prep_word[2 + i + hyph] = '\0';
michael@0 970 }
michael@0 971 }
michael@0 972 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
michael@0 973 hyphens2, &rep2, &pos2, &cut2, clhmin,
michael@0 974 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
michael@0 975 for (j = 0; j < i - begin - 1; j++) {
michael@0 976 hyphens[begin + j] = hyphens2[j];
michael@0 977 if (rep2[j] && rep && pos && cut) {
michael@0 978 if (!*rep && !*pos && !*cut) {
michael@0 979 int k;
michael@0 980 *rep = (char **) malloc(sizeof(char *) * word_size);
michael@0 981 *pos = (int *) malloc(sizeof(int) * word_size);
michael@0 982 *cut = (int *) malloc(sizeof(int) * word_size);
michael@0 983 for (k = 0; k < word_size; k++) {
michael@0 984 (*rep)[k] = NULL;
michael@0 985 (*pos)[k] = 0;
michael@0 986 (*cut)[k] = 0;
michael@0 987 }
michael@0 988 }
michael@0 989 (*rep)[begin + j] = rep2[j];
michael@0 990 (*pos)[begin + j] = pos2[j];
michael@0 991 (*cut)[begin + j] = cut2[j];
michael@0 992 }
michael@0 993 }
michael@0 994 prep_word[i + 2] = word[i + 1];
michael@0 995 if (*rep && *pos && *cut && (*rep)[i]) {
michael@0 996 size_t offset = 1;
michael@0 997 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
michael@0 998 prep_word[prep_word_size - 1] = '\0';
michael@0 999 }
michael@0 1000 }
michael@0 1001 begin = i + 1;
michael@0 1002 for (j = 0; j < word_size; j++) rep2[j] = NULL;
michael@0 1003 }
michael@0 1004
michael@0 1005 // non-compound
michael@0 1006 if (begin == 0) {
michael@0 1007 hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
michael@0 1008 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
michael@0 1009 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
michael@0 1010 rep, pos, cut, clhmin);
michael@0 1011 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
michael@0 1012 rep, pos, cut, crhmin);
michael@0 1013 }
michael@0 1014
michael@0 1015 free(rep2);
michael@0 1016 free(cut2);
michael@0 1017 free(pos2);
michael@0 1018 free(hyphens2);
michael@0 1019 }
michael@0 1020
michael@0 1021 hnj_free (prep_word);
michael@0 1022 return 0;
michael@0 1023 }
michael@0 1024
/*
 * UTF-8 normalization of hyphen and non-standard positions.
 *
 * On entry hyphens[] (and, when supplied, the *rep / *pos / *cut arrays)
 * are indexed by byte offset into word.  On return they are indexed by
 * UTF-8 character: entries are compacted in place towards the front and
 * the (*pos) / (*cut) values are re-expressed as character counts.
 *
 * word       input word
 * word_size  number of bytes of word to process
 * hyphens    hyphenation values, rewritten in place and NUL-terminated
 * rep, pos, cut
 *            optional non-standard hyphenation data; left untouched when
 *            any of the pointers, or any array they point to, is NULL
 *
 * Returns 1 (after printing a message to stderr) when word starts with a
 * UTF-8 continuation byte, 0 otherwise.
 */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut)
{
  const unsigned char *bytes = (const unsigned char *) word;
  int nonstd;
  int src;
  int dst = -1;
  int n;

  /* a byte of the form 10xxxxxx can only continue a character */
  if ((bytes[0] & 0xC0) == 0x80) {
    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    return 1;
  }

  /* none of these pointers change below, so test them once */
  nonstd = rep && pos && cut && *rep && *pos && *cut;

  /* calculate UTF-8 character positions */
  for (src = 0; src < word_size; src++) {
    /* every byte that is not a continuation byte opens a new character */
    if ((bytes[src] & 0xC0) != 0x80) dst++;

    /* the value stored for the last byte of a character is the one kept */
    hyphens[dst] = hyphens[src];

    if (nonstd) {
      int back = (*pos)[src];          /* bytes counted backwards from src */
      int first = src - back + 1;      /* first byte of the cut range */
      int last = first + (*cut)[src];  /* one past the end of the cut range */
      int chars;

      /* characters starting within the `back` bytes that end at src */
      chars = 0;
      for (n = 0; n < back; n++) {
        if ((bytes[src - n] & 0xC0) != 0x80) chars++;
      }
      (*pos)[dst] = chars;

      /* characters starting within the cut range */
      chars = 0;
      for (n = first; n < last; n++) {
        if ((bytes[n] & 0xC0) != 0x80) chars++;
      }
      (*cut)[dst] = chars;

      (*rep)[dst] = (*rep)[src];

      /* the entry was moved, so clear the byte-indexed slot it came from */
      if (dst < src) {
        (*rep)[src] = NULL;
        (*pos)[src] = 0;
        (*cut)[src] = 0;
      }
    }
  }
  hyphens[dst + 1] = '\0';
#ifdef VERBOSE
  printf ("nums: %s\n", hyphens);
#endif
  return 0;
}
michael@0 1066
/*
 * Get the word with all possible hyphenations (output: hyphword).
 *
 * word      input word
 * l         number of bytes of word to process
 * hyphens   per-byte hyphenation values; an odd value marks a break
 *           after that byte
 * hyphword  output buffer; receives word with '=' inserted at standard
 *           break points, or with the replacement text spliced in at
 *           non-standard ones
 * rep, pos, cut
 *           optional non-standard hyphenation data.  Any of the three
 *           pointers, or the arrays they point to, may be NULL, in which
 *           case every break is rendered as a plain '='.
 *
 * NOTE(review): replacement text is only copied within the first l + 5
 * bytes of hyphword and a terminator is forced at index l + 4, so
 * hyphword must hold at least l + 5 bytes (more when many '=' are
 * inserted) -- confirm the size callers actually allocate.
 */
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
{
  int hyphenslen = l + 5;
  /* the outer pointers may be NULL too, so check them before *rep etc. */
  int nonstandard = rep && pos && cut && *rep && *pos && *cut;
  int i, j;

  for (i = 0, j = 0; i < l; i++, j++) {
    hyphword[j] = word[i];
    if (!(hyphens[i] & 1))
      continue;

    if (nonstandard && (*rep)[i]) {
      /* the replacement starts (*pos)[i] bytes before the break point */
      size_t offset = j - (*pos)[i] + 1;
      /* Copy only when the start lies inside the bounded region.  A
         negative start wraps to a huge size_t and is rejected here as
         well; without this guard the length below would underflow. */
      if (offset < (size_t) (hyphenslen - 1))
        strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
      hyphword[hyphenslen - 1] = '\0';
      j += strlen((*rep)[i]) - (*pos)[i];
      /* skip the input bytes that the replacement stands in for */
      i += (*cut)[i] - (*pos)[i];
    } else {
      hyphword[++j] = '=';
    }
  }
  hyphword[j] = '\0';
}
michael@0 1088
michael@0 1089
michael@0 1090 /* main api function with default hyphenmin parameters */
michael@0 1091 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
michael@0 1092 const char *word, int word_size, char * hyphens,
michael@0 1093 char *hyphword, char *** rep, int ** pos, int ** cut)
michael@0 1094 {
michael@0 1095 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
michael@0 1096 dict->clhmin, dict->crhmin, 1, 1);
michael@0 1097 hnj_hyphen_lhmin(dict->utf8, word, word_size,
michael@0 1098 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
michael@0 1099 hnj_hyphen_rhmin(dict->utf8, word, word_size,
michael@0 1100 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
michael@0 1101
michael@0 1102 /* nohyphen */
michael@0 1103 if (dict->nohyphen) {
michael@0 1104 char * nh = dict->nohyphen;
michael@0 1105 int nhi;
michael@0 1106 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
michael@0 1107 char * nhy = (char *) strstr(word, nh);
michael@0 1108 while (nhy) {
michael@0 1109 hyphens[nhy - word + strlen(nh) - 1] = '0';
michael@0 1110 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
michael@0 1111 nhy = (char *) strstr(nhy + 1, nh);
michael@0 1112 }
michael@0 1113 nh = nh + strlen(nh) + 1;
michael@0 1114 }
michael@0 1115 }
michael@0 1116
michael@0 1117 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
michael@0 1118 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
michael@0 1119 #ifdef VERBOSE
michael@0 1120 printf ("nums: %s\n", hyphens);
michael@0 1121 #endif
michael@0 1122 return 0;
michael@0 1123 }
michael@0 1124
michael@0 1125 /* previous main api function with hyphenmin parameters */
michael@0 1126 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
michael@0 1127 const char *word, int word_size, char * hyphens,
michael@0 1128 char *hyphword, char *** rep, int ** pos, int ** cut,
michael@0 1129 int lhmin, int rhmin, int clhmin, int crhmin)
michael@0 1130 {
michael@0 1131 lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
michael@0 1132 rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
michael@0 1133 clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
michael@0 1134 crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
michael@0 1135 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
michael@0 1136 clhmin, crhmin, 1, 1);
michael@0 1137 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
michael@0 1138 rep, pos, cut, (lhmin > 0 ? lhmin : 2));
michael@0 1139 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
michael@0 1140 rep, pos, cut, (rhmin > 0 ? rhmin : 2));
michael@0 1141 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
michael@0 1142
michael@0 1143 /* nohyphen */
michael@0 1144 if (dict->nohyphen) {
michael@0 1145 char * nh = dict->nohyphen;
michael@0 1146 int nhi;
michael@0 1147 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
michael@0 1148 char * nhy = (char *) strstr(word, nh);
michael@0 1149 while (nhy) {
michael@0 1150 hyphens[nhy - word + strlen(nh) - 1] = 0;
michael@0 1151 if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
michael@0 1152 nhy = (char *) strstr(nhy + 1, nh);
michael@0 1153 }
michael@0 1154 nh = nh + strlen(nh) + 1;
michael@0 1155 }
michael@0 1156 }
michael@0 1157
michael@0 1158 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
michael@0 1159 return 0;
michael@0 1160 }

mercurial