Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.
michael@0 | 1 | /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both |
michael@0 | 2 | * licenses follows. |
michael@0 | 3 | */ |
michael@0 | 4 | |
michael@0 | 5 | /* LibHnj - a library for high quality hyphenation and justification |
michael@0 | 6 | * Copyright (C) 1998 Raph Levien, |
michael@0 | 7 | * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), |
michael@0 | 8 | * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) |
michael@0 | 9 | * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) |
michael@0 | 10 | * |
michael@0 | 11 | * This library is free software; you can redistribute it and/or |
michael@0 | 12 | * modify it under the terms of the GNU Library General Public |
michael@0 | 13 | * License as published by the Free Software Foundation; either |
michael@0 | 14 | * version 2 of the License, or (at your option) any later version. |
michael@0 | 15 | * |
michael@0 | 16 | * This library is distributed in the hope that it will be useful, |
michael@0 | 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
michael@0 | 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
michael@0 | 19 | * Library General Public License for more details. |
michael@0 | 20 | * |
michael@0 | 21 | * You should have received a copy of the GNU Library General Public |
michael@0 | 22 | * License along with this library; if not, write to the |
michael@0 | 23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
michael@0 | 24 | * Boston, MA 02111-1307 USA. |
michael@0 | 25 | */ |
michael@0 | 26 | |
michael@0 | 27 | /* |
michael@0 | 28 | * The contents of this file are subject to the Mozilla Public License |
michael@0 | 29 | * Version 1.0 (the "MPL"); you may not use this file except in |
michael@0 | 30 | * compliance with the MPL. You may obtain a copy of the MPL at |
michael@0 | 31 | * http://www.mozilla.org/MPL/ |
michael@0 | 32 | * |
michael@0 | 33 | * Software distributed under the MPL is distributed on an "AS IS" basis, |
michael@0 | 34 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL |
michael@0 | 35 | * for the specific language governing rights and limitations under the |
michael@0 | 36 | * MPL. |
michael@0 | 37 | * |
michael@0 | 38 | */ |
michael@0 | 39 | #include <stdlib.h> /* for NULL, malloc */ |
michael@0 | 40 | #include <stdio.h> /* for fprintf */ |
michael@0 | 41 | #include <string.h> /* for strdup */ |
michael@0 | 42 | |
michael@0 | 43 | #ifdef UNX |
michael@0 | 44 | #include <unistd.h> /* for exit */ |
michael@0 | 45 | #endif |
michael@0 | 46 | |
michael@0 | 47 | #define noVERBOSE |
michael@0 | 48 | |
michael@0 | 49 | /* calculate hyphenmin values with long ligature length (2 or 3 characters |
michael@0 | 50 | * instead of 1 or 2) for comparison with hyphenation without ligatures */ |
michael@0 | 51 | #define noLONG_LIGATURE |
michael@0 | 52 | |
michael@0 | 53 | #ifdef LONG_LIGATURE |
michael@0 | 54 | #define LIG_xx 1 |
michael@0 | 55 | #define LIG_xxx 2 |
michael@0 | 56 | #else |
michael@0 | 57 | #define LIG_xx 0 |
michael@0 | 58 | #define LIG_xxx 1 |
michael@0 | 59 | #endif |
michael@0 | 60 | |
michael@0 | 61 | #include "hnjalloc.h" |
michael@0 | 62 | #include "hyphen.h" |
michael@0 | 63 | |
/* Return a newly allocated copy of the NUL-terminated string s.
 * The copy comes from hnj_malloc(); the caller owns it and releases it
 * with hnj_free().
 * NOTE(review): the result of hnj_malloc() is used unchecked, as before --
 * confirm that the allocator never returns NULL. */
static char *
hnj_strdup (const char *s)
{
  /* size_t, not int: strlen() returns size_t and must not be narrowed */
  size_t len = strlen (s);
  char *copy = hnj_malloc (len + 1);

  /* len + 1 bytes so that the terminating NUL is copied as well */
  memcpy (copy, s, len + 1);
  return copy;
}
michael@0 | 76 | |
/* Remove a trailing line terminator from s, in place.
 *
 * s must be a writable, NUL-terminated string.  If the last character is
 * '\n' or '\r' it is removed; if the character in front of it is a '\r'
 * (as in a DOS "\r\n" ending) that one is removed as well.  A '\r' that is
 * followed by an ordinary character is left alone.
 */
void hnj_strchomp(char * s)
{
  size_t len = strlen(s);

  /* nothing to do unless the string really ends in a line-end character */
  if (len == 0 || (s[len - 1] != '\r' && s[len - 1] != '\n'))
    return;
  s[--len] = '\0';

  /* second half of a two-character line ending */
  if (len > 0 && s[len - 1] == '\r')
    s[len - 1] = '\0';
}
michael@0 | 84 | |
/* A small chained hash table that maps strings to state numbers. */

/* Number of buckets in every table. */
#define HASH_SIZE 31627

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* One key/value pair; entries that fall into the same bucket are linked
   through next. */
struct _HashEntry {
  HashEntry *next;
  char *key;
  int val;
};

/* The table itself: one list head per bucket. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
michael@0 | 103 | |
/* Hash a NUL-terminated string to an unsigned int.
 * The value is NOT reduced modulo the table size; callers do that. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;
  const char *cursor = s;

  while (*cursor != '\0') {
    unsigned int top;

    hash = (hash << 4) + *cursor;
    cursor++;

    /* fold the top four bits back into the low part and clear them */
    top = hash & 0xf0000000;
    if (top != 0) {
      hash ^= top >> 24;
      hash ^= top;
    }
  }
  return hash;
}
michael@0 | 119 | |
michael@0 | 120 | static HashTab * |
michael@0 | 121 | hnj_hash_new (void) |
michael@0 | 122 | { |
michael@0 | 123 | HashTab *hashtab; |
michael@0 | 124 | int i; |
michael@0 | 125 | |
michael@0 | 126 | hashtab = hnj_malloc (sizeof(HashTab)); |
michael@0 | 127 | for (i = 0; i < HASH_SIZE; i++) |
michael@0 | 128 | hashtab->entries[i] = NULL; |
michael@0 | 129 | |
michael@0 | 130 | return hashtab; |
michael@0 | 131 | } |
michael@0 | 132 | |
michael@0 | 133 | static void |
michael@0 | 134 | hnj_hash_free (HashTab *hashtab) |
michael@0 | 135 | { |
michael@0 | 136 | int i; |
michael@0 | 137 | HashEntry *e, *next; |
michael@0 | 138 | |
michael@0 | 139 | for (i = 0; i < HASH_SIZE; i++) |
michael@0 | 140 | for (e = hashtab->entries[i]; e; e = next) |
michael@0 | 141 | { |
michael@0 | 142 | next = e->next; |
michael@0 | 143 | hnj_free (e->key); |
michael@0 | 144 | hnj_free (e); |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | hnj_free (hashtab); |
michael@0 | 148 | } |
michael@0 | 149 | |
michael@0 | 150 | /* assumes that key is not already present! */ |
michael@0 | 151 | static void |
michael@0 | 152 | hnj_hash_insert (HashTab *hashtab, const char *key, int val) |
michael@0 | 153 | { |
michael@0 | 154 | int i; |
michael@0 | 155 | HashEntry *e; |
michael@0 | 156 | |
michael@0 | 157 | i = hnj_string_hash (key) % HASH_SIZE; |
michael@0 | 158 | e = hnj_malloc (sizeof(HashEntry)); |
michael@0 | 159 | e->next = hashtab->entries[i]; |
michael@0 | 160 | e->key = hnj_strdup (key); |
michael@0 | 161 | e->val = val; |
michael@0 | 162 | hashtab->entries[i] = e; |
michael@0 | 163 | } |
michael@0 | 164 | |
michael@0 | 165 | /* return val if found, otherwise -1 */ |
michael@0 | 166 | static int |
michael@0 | 167 | hnj_hash_lookup (HashTab *hashtab, const char *key) |
michael@0 | 168 | { |
michael@0 | 169 | int i; |
michael@0 | 170 | HashEntry *e; |
michael@0 | 171 | i = hnj_string_hash (key) % HASH_SIZE; |
michael@0 | 172 | for (e = hashtab->entries[i]; e; e = e->next) |
michael@0 | 173 | if (!strcmp (key, e->key)) |
michael@0 | 174 | return e->val; |
michael@0 | 175 | return -1; |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | /* Get the state number, allocating a new state if necessary. */ |
michael@0 | 179 | static int |
michael@0 | 180 | hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) |
michael@0 | 181 | { |
michael@0 | 182 | int state_num; |
michael@0 | 183 | |
michael@0 | 184 | state_num = hnj_hash_lookup (hashtab, string); |
michael@0 | 185 | |
michael@0 | 186 | if (state_num >= 0) |
michael@0 | 187 | return state_num; |
michael@0 | 188 | |
michael@0 | 189 | hnj_hash_insert (hashtab, string, dict->num_states); |
michael@0 | 190 | /* predicate is true if dict->num_states is a power of two */ |
michael@0 | 191 | if (!(dict->num_states & (dict->num_states - 1))) |
michael@0 | 192 | { |
michael@0 | 193 | dict->states = hnj_realloc (dict->states, |
michael@0 | 194 | (dict->num_states << 1) * |
michael@0 | 195 | sizeof(HyphenState)); |
michael@0 | 196 | } |
michael@0 | 197 | dict->states[dict->num_states].match = NULL; |
michael@0 | 198 | dict->states[dict->num_states].repl = NULL; |
michael@0 | 199 | dict->states[dict->num_states].fallback_state = -1; |
michael@0 | 200 | dict->states[dict->num_states].num_trans = 0; |
michael@0 | 201 | dict->states[dict->num_states].trans = NULL; |
michael@0 | 202 | return dict->num_states++; |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | /* add a transition from state1 to state2 through ch - assumes that the |
michael@0 | 206 | transition does not already exist */ |
michael@0 | 207 | static void |
michael@0 | 208 | hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) |
michael@0 | 209 | { |
michael@0 | 210 | int num_trans; |
michael@0 | 211 | |
michael@0 | 212 | num_trans = dict->states[state1].num_trans; |
michael@0 | 213 | if (num_trans == 0) |
michael@0 | 214 | { |
michael@0 | 215 | dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); |
michael@0 | 216 | } |
michael@0 | 217 | else if (!(num_trans & (num_trans - 1))) |
michael@0 | 218 | { |
michael@0 | 219 | dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, |
michael@0 | 220 | (num_trans << 1) * |
michael@0 | 221 | sizeof(HyphenTrans)); |
michael@0 | 222 | } |
michael@0 | 223 | dict->states[state1].trans[num_trans].ch = ch; |
michael@0 | 224 | dict->states[state1].trans[num_trans].new_state = state2; |
michael@0 | 225 | dict->states[state1].num_trans++; |
michael@0 | 226 | } |
michael@0 | 227 | |
#ifdef VERBOSE
/* Hash tables kept for debug output, one per dictionary level.
   hnj_hyphen_load() stores into index 0 and index 1, so two slots are
   required. */
HashTab *global[2];

/* Debug helper: return the key string registered for state number `state`
   in the table of the given level, or NULL when no entry has that value.
   This is a linear scan over the whole table. */
static char *
get_state_str (int state, int level)
{
  int i;
  HashEntry *e;

  for (i = 0; i < HASH_SIZE; i++)
    for (e = global[level]->entries[i]; e; e = e->next)
      if (e->val == state)
        return e->key;
  return NULL;
}
#endif
michael@0 | 244 | |
michael@0 | 245 | void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { |
michael@0 | 246 | int i, j; |
michael@0 | 247 | char word[MAX_CHARS]; |
michael@0 | 248 | char pattern[MAX_CHARS]; |
michael@0 | 249 | char * repl; |
michael@0 | 250 | signed char replindex; |
michael@0 | 251 | signed char replcut; |
michael@0 | 252 | int state_num = 0; |
michael@0 | 253 | int last_state; |
michael@0 | 254 | char ch; |
michael@0 | 255 | int found; |
michael@0 | 256 | |
michael@0 | 257 | if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { |
michael@0 | 258 | dict->lhmin = atoi(buf + 13); |
michael@0 | 259 | return; |
michael@0 | 260 | } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { |
michael@0 | 261 | dict->rhmin = atoi(buf + 14); |
michael@0 | 262 | return; |
michael@0 | 263 | } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { |
michael@0 | 264 | dict->clhmin = atoi(buf + 21); |
michael@0 | 265 | return; |
michael@0 | 266 | } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { |
michael@0 | 267 | dict->crhmin = atoi(buf + 22); |
michael@0 | 268 | return; |
michael@0 | 269 | } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { |
michael@0 | 270 | char * space = buf + 8; |
michael@0 | 271 | while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; |
michael@0 | 272 | if (*buf != '\0') dict->nohyphen = hnj_strdup(space); |
michael@0 | 273 | if (dict->nohyphen) { |
michael@0 | 274 | char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; |
michael@0 | 275 | *nhe = 0; |
michael@0 | 276 | for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { |
michael@0 | 277 | if (*nhe == ',') { |
michael@0 | 278 | dict->nohyphenl++; |
michael@0 | 279 | *nhe = 0; |
michael@0 | 280 | } |
michael@0 | 281 | } |
michael@0 | 282 | } |
michael@0 | 283 | return; |
michael@0 | 284 | } |
michael@0 | 285 | j = 0; |
michael@0 | 286 | pattern[j] = '0'; |
michael@0 | 287 | repl = strchr(buf, '/'); |
michael@0 | 288 | replindex = 0; |
michael@0 | 289 | replcut = 0; |
michael@0 | 290 | if (repl) { |
michael@0 | 291 | char * index = strchr(repl + 1, ','); |
michael@0 | 292 | *repl = '\0'; |
michael@0 | 293 | if (index) { |
michael@0 | 294 | char * index2 = strchr(index + 1, ','); |
michael@0 | 295 | *index = '\0'; |
michael@0 | 296 | if (index2) { |
michael@0 | 297 | *index2 = '\0'; |
michael@0 | 298 | replindex = (signed char) atoi(index + 1) - 1; |
michael@0 | 299 | replcut = (signed char) atoi(index2 + 1); |
michael@0 | 300 | } |
michael@0 | 301 | } else { |
michael@0 | 302 | hnj_strchomp(repl + 1); |
michael@0 | 303 | replindex = 0; |
michael@0 | 304 | replcut = (signed char) strlen(buf); |
michael@0 | 305 | } |
michael@0 | 306 | repl = hnj_strdup(repl + 1); |
michael@0 | 307 | } |
michael@0 | 308 | for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) |
michael@0 | 309 | { |
michael@0 | 310 | if (buf[i] >= '0' && buf[i] <= '9') |
michael@0 | 311 | pattern[j] = buf[i]; |
michael@0 | 312 | else |
michael@0 | 313 | { |
michael@0 | 314 | word[j] = buf[i]; |
michael@0 | 315 | pattern[++j] = '0'; |
michael@0 | 316 | } |
michael@0 | 317 | } |
michael@0 | 318 | word[j] = '\0'; |
michael@0 | 319 | pattern[j + 1] = '\0'; |
michael@0 | 320 | |
michael@0 | 321 | i = 0; |
michael@0 | 322 | if (!repl) { |
michael@0 | 323 | /* Optimize away leading zeroes */ |
michael@0 | 324 | for (; pattern[i] == '0'; i++); |
michael@0 | 325 | } else { |
michael@0 | 326 | if (*word == '.') i++; |
michael@0 | 327 | /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ |
michael@0 | 328 | if (dict->utf8) { |
michael@0 | 329 | int pu = -1; /* unicode character position */ |
michael@0 | 330 | int ps = -1; /* unicode start position (original replindex) */ |
michael@0 | 331 | int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ |
michael@0 | 332 | for (; pc < (strlen(word) + 1); pc++) { |
michael@0 | 333 | /* beginning of an UTF-8 character (not '10' start bits) */ |
michael@0 | 334 | if ((((unsigned char) word[pc]) >> 6) != 2) pu++; |
michael@0 | 335 | if ((ps < 0) && (replindex == pu)) { |
michael@0 | 336 | ps = replindex; |
michael@0 | 337 | replindex = (signed char) pc; |
michael@0 | 338 | } |
michael@0 | 339 | if ((ps >= 0) && ((pu - ps) == replcut)) { |
michael@0 | 340 | replcut = (signed char) (pc - replindex); |
michael@0 | 341 | break; |
michael@0 | 342 | } |
michael@0 | 343 | } |
michael@0 | 344 | if (*word == '.') replindex--; |
michael@0 | 345 | } |
michael@0 | 346 | } |
michael@0 | 347 | |
michael@0 | 348 | #ifdef VERBOSE |
michael@0 | 349 | printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); |
michael@0 | 350 | #endif |
michael@0 | 351 | found = hnj_hash_lookup (hashtab, word); |
michael@0 | 352 | state_num = hnj_get_state (dict, hashtab, word); |
michael@0 | 353 | dict->states[state_num].match = hnj_strdup (pattern + i); |
michael@0 | 354 | dict->states[state_num].repl = repl; |
michael@0 | 355 | dict->states[state_num].replindex = replindex; |
michael@0 | 356 | if (!replcut) { |
michael@0 | 357 | dict->states[state_num].replcut = (signed char) strlen(word); |
michael@0 | 358 | } else { |
michael@0 | 359 | dict->states[state_num].replcut = replcut; |
michael@0 | 360 | } |
michael@0 | 361 | |
michael@0 | 362 | /* now, put in the prefix transitions */ |
michael@0 | 363 | for (; found < 0 ;j--) |
michael@0 | 364 | { |
michael@0 | 365 | last_state = state_num; |
michael@0 | 366 | ch = word[j - 1]; |
michael@0 | 367 | word[j - 1] = '\0'; |
michael@0 | 368 | found = hnj_hash_lookup (hashtab, word); |
michael@0 | 369 | state_num = hnj_get_state (dict, hashtab, word); |
michael@0 | 370 | hnj_add_trans (dict, state_num, last_state, ch); |
michael@0 | 371 | } |
michael@0 | 372 | } |
michael@0 | 373 | |
michael@0 | 374 | HyphenDict * |
michael@0 | 375 | hnj_hyphen_load (const char *fn) |
michael@0 | 376 | { |
michael@0 | 377 | HyphenDict *dict[2]; |
michael@0 | 378 | HashTab *hashtab; |
michael@0 | 379 | FILE *f; |
michael@0 | 380 | char buf[MAX_CHARS]; |
michael@0 | 381 | int nextlevel = 0; |
michael@0 | 382 | int i, j, k; |
michael@0 | 383 | HashEntry *e; |
michael@0 | 384 | int state_num = 0; |
michael@0 | 385 | |
michael@0 | 386 | f = fopen (fn, "r"); |
michael@0 | 387 | if (f == NULL) |
michael@0 | 388 | return NULL; |
michael@0 | 389 | |
michael@0 | 390 | // loading one or two dictionaries (separated by NEXTLEVEL keyword) |
michael@0 | 391 | for (k = 0; k < 2; k++) { |
michael@0 | 392 | hashtab = hnj_hash_new (); |
michael@0 | 393 | #ifdef VERBOSE |
michael@0 | 394 | global[k] = hashtab; |
michael@0 | 395 | #endif |
michael@0 | 396 | hnj_hash_insert (hashtab, "", 0); |
michael@0 | 397 | dict[k] = hnj_malloc (sizeof(HyphenDict)); |
michael@0 | 398 | dict[k]->num_states = 1; |
michael@0 | 399 | dict[k]->states = hnj_malloc (sizeof(HyphenState)); |
michael@0 | 400 | dict[k]->states[0].match = NULL; |
michael@0 | 401 | dict[k]->states[0].repl = NULL; |
michael@0 | 402 | dict[k]->states[0].fallback_state = -1; |
michael@0 | 403 | dict[k]->states[0].num_trans = 0; |
michael@0 | 404 | dict[k]->states[0].trans = NULL; |
michael@0 | 405 | dict[k]->nextlevel = NULL; |
michael@0 | 406 | dict[k]->lhmin = 0; |
michael@0 | 407 | dict[k]->rhmin = 0; |
michael@0 | 408 | dict[k]->clhmin = 0; |
michael@0 | 409 | dict[k]->crhmin = 0; |
michael@0 | 410 | dict[k]->nohyphen = NULL; |
michael@0 | 411 | dict[k]->nohyphenl = 0; |
michael@0 | 412 | |
michael@0 | 413 | /* read in character set info */ |
michael@0 | 414 | if (k == 0) { |
michael@0 | 415 | for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; |
michael@0 | 416 | if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { |
michael@0 | 417 | for (i=0;i<MAX_NAME;i++) |
michael@0 | 418 | if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) |
michael@0 | 419 | dict[k]->cset[i] = 0; |
michael@0 | 420 | } else { |
michael@0 | 421 | dict[k]->cset[0] = 0; |
michael@0 | 422 | } |
michael@0 | 423 | dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); |
michael@0 | 424 | } else { |
michael@0 | 425 | strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); |
michael@0 | 426 | dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; |
michael@0 | 427 | dict[k]->utf8 = dict[0]->utf8; |
michael@0 | 428 | } |
michael@0 | 429 | |
michael@0 | 430 | if (k == 0 || nextlevel) { |
michael@0 | 431 | while (fgets (buf, sizeof(buf), f) != NULL) { |
michael@0 | 432 | if (strncmp(buf, "NEXTLEVEL", 9) == 0) { |
michael@0 | 433 | nextlevel = 1; |
michael@0 | 434 | break; |
michael@0 | 435 | } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab); |
michael@0 | 436 | } |
michael@0 | 437 | } else if (k == 1) { |
michael@0 | 438 | /* default first level: hyphen and ASCII apostrophe */ |
michael@0 | 439 | if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); |
michael@0 | 440 | else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); |
michael@0 | 441 | strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here |
michael@0 | 442 | buf[MAX_CHARS-1] = '\0'; |
michael@0 | 443 | hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ |
michael@0 | 444 | hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ |
michael@0 | 445 | if (dict[0]->utf8) { |
michael@0 | 446 | hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ |
michael@0 | 447 | hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ |
michael@0 | 448 | } |
michael@0 | 449 | } |
michael@0 | 450 | |
michael@0 | 451 | /* Could do unioning of matches here (instead of the preprocessor script). |
michael@0 | 452 | If we did, the pseudocode would look something like this: |
michael@0 | 453 | |
michael@0 | 454 | foreach state in the hash table |
michael@0 | 455 | foreach i = [1..length(state) - 1] |
michael@0 | 456 | state to check is substr (state, i) |
michael@0 | 457 | look it up |
michael@0 | 458 | if found, and if there is a match, union the match in. |
michael@0 | 459 | |
michael@0 | 460 | It's also possible to avoid the quadratic blowup by doing the |
michael@0 | 461 | search in order of increasing state string sizes - then you |
michael@0 | 462 | can break the loop after finding the first match. |
michael@0 | 463 | |
michael@0 | 464 | This step should be optional in any case - if there is a |
michael@0 | 465 | preprocessed rule table, it's always faster to use that. |
michael@0 | 466 | |
michael@0 | 467 | */ |
michael@0 | 468 | |
michael@0 | 469 | /* put in the fallback states */ |
michael@0 | 470 | for (i = 0; i < HASH_SIZE; i++) |
michael@0 | 471 | for (e = hashtab->entries[i]; e; e = e->next) |
michael@0 | 472 | { |
michael@0 | 473 | if (*(e->key)) for (j = 1; 1; j++) |
michael@0 | 474 | { |
michael@0 | 475 | state_num = hnj_hash_lookup (hashtab, e->key + j); |
michael@0 | 476 | if (state_num >= 0) |
michael@0 | 477 | break; |
michael@0 | 478 | } |
michael@0 | 479 | /* KBH: FIXME state 0 fallback_state should always be -1? */ |
michael@0 | 480 | if (e->val) |
michael@0 | 481 | dict[k]->states[e->val].fallback_state = state_num; |
michael@0 | 482 | } |
michael@0 | 483 | #ifdef VERBOSE |
michael@0 | 484 | for (i = 0; i < HASH_SIZE; i++) |
michael@0 | 485 | for (e = hashtab->entries[i]; e; e = e->next) |
michael@0 | 486 | { |
michael@0 | 487 | printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, |
michael@0 | 488 | dict[k]->states[e->val].fallback_state); |
michael@0 | 489 | for (j = 0; j < dict[k]->states[e->val].num_trans; j++) |
michael@0 | 490 | printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, |
michael@0 | 491 | dict[k]->states[e->val].trans[j].new_state); |
michael@0 | 492 | } |
michael@0 | 493 | #endif |
michael@0 | 494 | |
michael@0 | 495 | #ifndef VERBOSE |
michael@0 | 496 | hnj_hash_free (hashtab); |
michael@0 | 497 | #endif |
michael@0 | 498 | state_num = 0; |
michael@0 | 499 | } |
michael@0 | 500 | fclose(f); |
michael@0 | 501 | if (nextlevel) dict[0]->nextlevel = dict[1]; |
michael@0 | 502 | else { |
michael@0 | 503 | dict[1] -> nextlevel = dict[0]; |
michael@0 | 504 | dict[1]->lhmin = dict[0]->lhmin; |
michael@0 | 505 | dict[1]->rhmin = dict[0]->rhmin; |
michael@0 | 506 | dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); |
michael@0 | 507 | dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3); |
michael@0 | 508 | #ifdef VERBOSE |
michael@0 | 509 | HashTab *r = global[0]; |
michael@0 | 510 | global[0] = global[1]; |
michael@0 | 511 | global[1] = r; |
michael@0 | 512 | #endif |
michael@0 | 513 | return dict[1]; |
michael@0 | 514 | } |
michael@0 | 515 | return dict[0]; |
michael@0 | 516 | } |
michael@0 | 517 | |
michael@0 | 518 | void hnj_hyphen_free (HyphenDict *dict) |
michael@0 | 519 | { |
michael@0 | 520 | int state_num; |
michael@0 | 521 | HyphenState *hstate; |
michael@0 | 522 | |
michael@0 | 523 | for (state_num = 0; state_num < dict->num_states; state_num++) |
michael@0 | 524 | { |
michael@0 | 525 | hstate = &dict->states[state_num]; |
michael@0 | 526 | if (hstate->match) |
michael@0 | 527 | hnj_free (hstate->match); |
michael@0 | 528 | if (hstate->repl) |
michael@0 | 529 | hnj_free (hstate->repl); |
michael@0 | 530 | if (hstate->trans) |
michael@0 | 531 | hnj_free (hstate->trans); |
michael@0 | 532 | } |
michael@0 | 533 | if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); |
michael@0 | 534 | |
michael@0 | 535 | if (dict->nohyphen) hnj_free(dict->nohyphen); |
michael@0 | 536 | |
michael@0 | 537 | hnj_free (dict->states); |
michael@0 | 538 | |
michael@0 | 539 | hnj_free (dict); |
michael@0 | 540 | } |
michael@0 | 541 | |
michael@0 | 542 | #define MAX_WORD 256 |
michael@0 | 543 | |
michael@0 | 544 | int hnj_hyphen_hyphenate (HyphenDict *dict, |
michael@0 | 545 | const char *word, int word_size, |
michael@0 | 546 | char *hyphens) |
michael@0 | 547 | { |
michael@0 | 548 | char *prep_word; |
michael@0 | 549 | int i, j, k; |
michael@0 | 550 | int state; |
michael@0 | 551 | char ch; |
michael@0 | 552 | HyphenState *hstate; |
michael@0 | 553 | char *match; |
michael@0 | 554 | int offset; |
michael@0 | 555 | |
michael@0 | 556 | prep_word = hnj_malloc (word_size + 3); |
michael@0 | 557 | |
michael@0 | 558 | j = 0; |
michael@0 | 559 | prep_word[j++] = '.'; |
michael@0 | 560 | |
michael@0 | 561 | for (i = 0; i < word_size; i++) { |
michael@0 | 562 | if (word[i] <= '9' && word[i] >= '0') { |
michael@0 | 563 | prep_word[j++] = '.'; |
michael@0 | 564 | } else { |
michael@0 | 565 | prep_word[j++] = word[i]; |
michael@0 | 566 | } |
michael@0 | 567 | } |
michael@0 | 568 | |
michael@0 | 569 | prep_word[j++] = '.'; |
michael@0 | 570 | prep_word[j] = '\0'; |
michael@0 | 571 | |
michael@0 | 572 | for (i = 0; i < word_size + 5; i++) |
michael@0 | 573 | hyphens[i] = '0'; |
michael@0 | 574 | |
michael@0 | 575 | #ifdef VERBOSE |
michael@0 | 576 | printf ("prep_word = %s\n", prep_word); |
michael@0 | 577 | #endif |
michael@0 | 578 | |
michael@0 | 579 | /* now, run the finite state machine */ |
michael@0 | 580 | state = 0; |
michael@0 | 581 | for (i = 0; i < j; i++) |
michael@0 | 582 | { |
michael@0 | 583 | ch = prep_word[i]; |
michael@0 | 584 | for (;;) |
michael@0 | 585 | { |
michael@0 | 586 | |
michael@0 | 587 | if (state == -1) { |
michael@0 | 588 | /* return 1; */ |
michael@0 | 589 | /* KBH: FIXME shouldn't this be as follows? */ |
michael@0 | 590 | state = 0; |
michael@0 | 591 | goto try_next_letter; |
michael@0 | 592 | } |
michael@0 | 593 | |
michael@0 | 594 | #ifdef VERBOSE |
michael@0 | 595 | char *state_str; |
michael@0 | 596 | state_str = get_state_str (state, 0); |
michael@0 | 597 | |
michael@0 | 598 | for (k = 0; k < i - strlen (state_str); k++) |
michael@0 | 599 | putchar (' '); |
michael@0 | 600 | printf ("%s", state_str); |
michael@0 | 601 | #endif |
michael@0 | 602 | |
michael@0 | 603 | hstate = &dict->states[state]; |
michael@0 | 604 | for (k = 0; k < hstate->num_trans; k++) |
michael@0 | 605 | if (hstate->trans[k].ch == ch) |
michael@0 | 606 | { |
michael@0 | 607 | state = hstate->trans[k].new_state; |
michael@0 | 608 | goto found_state; |
michael@0 | 609 | } |
michael@0 | 610 | state = hstate->fallback_state; |
michael@0 | 611 | #ifdef VERBOSE |
michael@0 | 612 | printf (" falling back, fallback_state %d\n", state); |
michael@0 | 613 | #endif |
michael@0 | 614 | } |
michael@0 | 615 | found_state: |
michael@0 | 616 | #ifdef VERBOSE |
michael@0 | 617 | printf ("found state %d\n",state); |
michael@0 | 618 | #endif |
michael@0 | 619 | /* Additional optimization is possible here - especially, |
michael@0 | 620 | elimination of trailing zeroes from the match. Leading zeroes |
michael@0 | 621 | have already been optimized. */ |
michael@0 | 622 | match = dict->states[state].match; |
michael@0 | 623 | /* replacing rules not handled by hyphen_hyphenate() */ |
michael@0 | 624 | if (match && !dict->states[state].repl) |
michael@0 | 625 | { |
michael@0 | 626 | offset = i + 1 - strlen (match); |
michael@0 | 627 | #ifdef VERBOSE |
michael@0 | 628 | for (k = 0; k < offset; k++) |
michael@0 | 629 | putchar (' '); |
michael@0 | 630 | printf ("%s\n", match); |
michael@0 | 631 | #endif |
michael@0 | 632 | /* This is a linear search because I tried a binary search and |
michael@0 | 633 | found it to be just a teeny bit slower. */ |
michael@0 | 634 | for (k = 0; match[k]; k++) |
michael@0 | 635 | if (hyphens[offset + k] < match[k]) |
michael@0 | 636 | hyphens[offset + k] = match[k]; |
michael@0 | 637 | } |
michael@0 | 638 | |
michael@0 | 639 | /* KBH: we need this to make sure we keep looking in a word */ |
michael@0 | 640 | /* for patterns even if the current character is not known in state 0 */ |
michael@0 | 641 | /* since patterns for hyphenation may occur anywhere in the word */ |
michael@0 | 642 | try_next_letter: ; |
michael@0 | 643 | |
michael@0 | 644 | } |
michael@0 | 645 | #ifdef VERBOSE |
michael@0 | 646 | for (i = 0; i < j; i++) |
michael@0 | 647 | putchar (hyphens[i]); |
michael@0 | 648 | putchar ('\n'); |
michael@0 | 649 | #endif |
michael@0 | 650 | |
michael@0 | 651 | for (i = 0; i < j - 4; i++) |
michael@0 | 652 | #if 0 |
michael@0 | 653 | if (hyphens[i + 1] & 1) |
michael@0 | 654 | hyphens[i] = '-'; |
michael@0 | 655 | #else |
michael@0 | 656 | hyphens[i] = hyphens[i + 1]; |
michael@0 | 657 | #endif |
michael@0 | 658 | hyphens[0] = '0'; |
michael@0 | 659 | for (; i < word_size; i++) |
michael@0 | 660 | hyphens[i] = '0'; |
michael@0 | 661 | hyphens[word_size] = '\0'; |
michael@0 | 662 | |
michael@0 | 663 | hnj_free (prep_word); |
michael@0 | 664 | |
michael@0 | 665 | return 0; |
michael@0 | 666 | } |
michael@0 | 667 | |
michael@0 | 668 | /* Unicode ligature length */ |
michael@0 | 669 | int hnj_ligature(unsigned char c) { |
michael@0 | 670 | switch (c) { |
michael@0 | 671 | case 0x80: /* ff */ |
michael@0 | 672 | case 0x81: /* fi */ |
michael@0 | 673 | case 0x82: return LIG_xx; /* fl */ |
michael@0 | 674 | case 0x83: /* ffi */ |
michael@0 | 675 | case 0x84: return LIG_xxx; /* ffl */ |
michael@0 | 676 | case 0x85: /* long st */ |
michael@0 | 677 | case 0x86: return LIG_xx; /* st */ |
michael@0 | 678 | } |
michael@0 | 679 | return 0; |
michael@0 | 680 | } |
michael@0 | 681 | |
/* Number of characters in the first n bytes of word (stopping early at a
   NUL).  With utf8 set, continuation bytes are not counted on their own
   and a ligature adds hnj_ligature() to the count. */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
  int chars = 0;
  int at = 0;

  while (at < n && word[at] != '\0') {
    chars++;

    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
      chars += hnj_ligature(word[at + 2]);
    }

    /* step to the first byte of the next character */
    at++;
    if (utf8) {
      while ((word[at] & 0xc0) == 0x80)
        at++;
    }
  }
  return chars;
}
michael@0 | 697 | |
/* Clear hyphenation points that are closer than lhmin characters to the
 * start of word.
 *
 * utf8        non-zero when word is UTF-8 encoded.
 * word_size   not used by this function.
 * hyphens     hyphenation values, reset to '0' where a point is dropped.
 * rep/pos/cut optional non-standard hyphenation data (each may point to
 *             NULL); a replacement that ends up too short is freed and
 *             its slot set to NULL.
 *
 * Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
        char *** rep, int ** pos, int ** cut, int lhmin)
{
  int count = 1;
  int at;

  /* Unicode ligature support */
  if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
    count += hnj_ligature(word[2]);
  }

  /* ignore numbers */
  for (at = 0; word[at] <= '9' && word[at] >= '0'; at++)
    count--;

  at = 0;
  while (count < lhmin && word[at] != '\0') {
    /* walk over every byte of one character */
    do {
      /* check length of the non-standard part */
      if (*rep && *pos && *cut && (*rep)[at]) {
        char * eq = strchr((*rep)[at], '=');
        if (eq && (hnj_hyphen_strnlen(word, at - (*pos)[at] + 1, utf8) +
                   hnj_hyphen_strnlen((*rep)[at], eq - (*rep)[at], utf8)) < lhmin) {
          free((*rep)[at]);
          (*rep)[at] = NULL;
          hyphens[at] = '0';
        }
      } else {
        hyphens[at] = '0';
      }
      at++;

      /* Unicode ligature support */
      if (utf8 && ((unsigned char) word[at] == 0xEF) && ((unsigned char) word[at + 1] == 0xAC)) {
        count += hnj_ligature(word[at + 2]);
      }
    } while (utf8 && (word[at] & 0xc0) == 0x80);

    count++;
  }
  return 0;
}
michael@0 | 733 | |
/* Enforce the right hyphenmin: clear hyphenation points that would leave
 * fewer than rhmin characters after the break.
 *
 * Walks word backwards from byte word_size - 1 down to byte 1 until rhmin
 * characters have been counted.  Trailing ASCII digits are not counted.
 * For each visited byte j:
 *   - if a replacement (*rep)[j] exists and contains '=', it is freed and
 *     cleared (and hyphens[j] set to '0') only when the characters of word
 *     following the replaced span (at most 100 bytes are measured) plus the
 *     replacement text after '=' are fewer than rhmin;
 *   - if no replacement data exists for j, hyphens[j] is set to '0'.
 * In UTF-8 mode continuation bytes do not advance the character count.
 * Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut, int rhmin)
{
  int count = 0;
  int idx = word_size - 1;

  // ignore numbers
  while (idx > 0 && word[idx] >= '0' && word[idx] <= '9') {
    count--;
    idx--;
  }

  for (idx = word_size - 1; count < rhmin && idx > 0; idx--) {
    // check length of the non-standard part
    if (*rep && *pos && *cut && (*rep)[idx]) {
      char * eq = strchr((*rep)[idx], '=');
      if (eq) {
        const char * tail = word + idx - (*pos)[idx] + (*cut)[idx] + 1;
        int after = hnj_hyphen_strnlen(tail, 100, utf8);
        int in_repl = hnj_hyphen_strnlen(eq + 1, strlen(eq + 1), utf8);
        if (after + in_repl < rhmin) {
          free((*rep)[idx]);
          (*rep)[idx] = NULL;
          hyphens[idx] = '0';
        }
      }
    } else {
      hyphens[idx] = '0';
    }

    /* count every byte except UTF-8 continuation bytes (10xxxxxx) */
    if (!(utf8 && (word[idx] & 0xc0) == 0x80)) count++;
  }
  return 0;
}
michael@0 | 760 | |
michael@0 | 761 | // recursive function for compound level hyphenation |
michael@0 | 762 | int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, |
michael@0 | 763 | char * hyphens, char *** rep, int ** pos, int ** cut, |
michael@0 | 764 | int clhmin, int crhmin, int lend, int rend) |
michael@0 | 765 | { |
michael@0 | 766 | char *prep_word; |
michael@0 | 767 | int i, j, k; |
michael@0 | 768 | int state; |
michael@0 | 769 | char ch; |
michael@0 | 770 | HyphenState *hstate; |
michael@0 | 771 | char *match; |
michael@0 | 772 | char *repl; |
michael@0 | 773 | signed char replindex; |
michael@0 | 774 | signed char replcut; |
michael@0 | 775 | int offset; |
michael@0 | 776 | int * matchlen; |
michael@0 | 777 | int * matchindex; |
michael@0 | 778 | char ** matchrepl; |
michael@0 | 779 | int isrepl = 0; |
michael@0 | 780 | int nHyphCount; |
michael@0 | 781 | |
michael@0 | 782 | size_t prep_word_size = word_size + 3; |
michael@0 | 783 | prep_word = hnj_malloc (prep_word_size); |
michael@0 | 784 | matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); |
michael@0 | 785 | matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); |
michael@0 | 786 | matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); |
michael@0 | 787 | |
michael@0 | 788 | j = 0; |
michael@0 | 789 | prep_word[j++] = '.'; |
michael@0 | 790 | |
michael@0 | 791 | for (i = 0; i < word_size; i++) { |
michael@0 | 792 | if (word[i] <= '9' && word[i] >= '0') { |
michael@0 | 793 | prep_word[j++] = '.'; |
michael@0 | 794 | } else { |
michael@0 | 795 | prep_word[j++] = word[i]; |
michael@0 | 796 | } |
michael@0 | 797 | } |
michael@0 | 798 | |
michael@0 | 799 | |
michael@0 | 800 | |
michael@0 | 801 | prep_word[j++] = '.'; |
michael@0 | 802 | prep_word[j] = '\0'; |
michael@0 | 803 | |
michael@0 | 804 | for (i = 0; i < j; i++) |
michael@0 | 805 | hyphens[i] = '0'; |
michael@0 | 806 | |
michael@0 | 807 | #ifdef VERBOSE |
michael@0 | 808 | printf ("prep_word = %s\n", prep_word); |
michael@0 | 809 | #endif |
michael@0 | 810 | |
michael@0 | 811 | /* now, run the finite state machine */ |
michael@0 | 812 | state = 0; |
michael@0 | 813 | for (i = 0; i < j; i++) |
michael@0 | 814 | { |
michael@0 | 815 | ch = prep_word[i]; |
michael@0 | 816 | for (;;) |
michael@0 | 817 | { |
michael@0 | 818 | |
michael@0 | 819 | if (state == -1) { |
michael@0 | 820 | /* return 1; */ |
michael@0 | 821 | /* KBH: FIXME shouldn't this be as follows? */ |
michael@0 | 822 | state = 0; |
michael@0 | 823 | goto try_next_letter; |
michael@0 | 824 | } |
michael@0 | 825 | |
michael@0 | 826 | #ifdef VERBOSE |
michael@0 | 827 | char *state_str; |
michael@0 | 828 | state_str = get_state_str (state, 1); |
michael@0 | 829 | |
michael@0 | 830 | for (k = 0; k < i - strlen (state_str); k++) |
michael@0 | 831 | putchar (' '); |
michael@0 | 832 | printf ("%s", state_str); |
michael@0 | 833 | #endif |
michael@0 | 834 | |
michael@0 | 835 | hstate = &dict->states[state]; |
michael@0 | 836 | for (k = 0; k < hstate->num_trans; k++) |
michael@0 | 837 | if (hstate->trans[k].ch == ch) |
michael@0 | 838 | { |
michael@0 | 839 | state = hstate->trans[k].new_state; |
michael@0 | 840 | goto found_state; |
michael@0 | 841 | } |
michael@0 | 842 | state = hstate->fallback_state; |
michael@0 | 843 | #ifdef VERBOSE |
michael@0 | 844 | printf (" falling back, fallback_state %d\n", state); |
michael@0 | 845 | #endif |
michael@0 | 846 | } |
michael@0 | 847 | found_state: |
michael@0 | 848 | #ifdef VERBOSE |
michael@0 | 849 | printf ("found state %d\n",state); |
michael@0 | 850 | #endif |
michael@0 | 851 | /* Additional optimization is possible here - especially, |
michael@0 | 852 | elimination of trailing zeroes from the match. Leading zeroes |
michael@0 | 853 | have already been optimized. */ |
michael@0 | 854 | match = dict->states[state].match; |
michael@0 | 855 | repl = dict->states[state].repl; |
michael@0 | 856 | replindex = dict->states[state].replindex; |
michael@0 | 857 | replcut = dict->states[state].replcut; |
michael@0 | 858 | /* replacing rules not handled by hyphen_hyphenate() */ |
michael@0 | 859 | if (match) |
michael@0 | 860 | { |
michael@0 | 861 | offset = i + 1 - strlen (match); |
michael@0 | 862 | #ifdef VERBOSE |
michael@0 | 863 | for (k = 0; k < offset; k++) |
michael@0 | 864 | putchar (' '); |
michael@0 | 865 | printf ("%s (%s)\n", match, repl); |
michael@0 | 866 | #endif |
michael@0 | 867 | if (repl) { |
michael@0 | 868 | if (!isrepl) for(; isrepl < word_size; isrepl++) { |
michael@0 | 869 | matchrepl[isrepl] = NULL; |
michael@0 | 870 | matchindex[isrepl] = -1; |
michael@0 | 871 | } |
michael@0 | 872 | matchlen[offset + replindex] = replcut; |
michael@0 | 873 | } |
michael@0 | 874 | /* This is a linear search because I tried a binary search and |
michael@0 | 875 | found it to be just a teeny bit slower. */ |
michael@0 | 876 | for (k = 0; match[k]; k++) { |
michael@0 | 877 | if ((hyphens[offset + k] < match[k])) { |
michael@0 | 878 | hyphens[offset + k] = match[k]; |
michael@0 | 879 | if (match[k]&1) { |
michael@0 | 880 | matchrepl[offset + k] = repl; |
michael@0 | 881 | if (repl && (k >= replindex) && (k <= replindex + replcut)) { |
michael@0 | 882 | matchindex[offset + replindex] = offset + k; |
michael@0 | 883 | } |
michael@0 | 884 | } |
michael@0 | 885 | } |
michael@0 | 886 | } |
michael@0 | 887 | |
michael@0 | 888 | } |
michael@0 | 889 | |
michael@0 | 890 | /* KBH: we need this to make sure we keep looking in a word */ |
michael@0 | 891 | /* for patterns even if the current character is not known in state 0 */ |
michael@0 | 892 | /* since patterns for hyphenation may occur anywhere in the word */ |
michael@0 | 893 | try_next_letter: ; |
michael@0 | 894 | |
michael@0 | 895 | } |
michael@0 | 896 | #ifdef VERBOSE |
michael@0 | 897 | for (i = 0; i < j; i++) |
michael@0 | 898 | putchar (hyphens[i]); |
michael@0 | 899 | putchar ('\n'); |
michael@0 | 900 | #endif |
michael@0 | 901 | |
michael@0 | 902 | for (i = 0; i < j - 3; i++) |
michael@0 | 903 | #if 0 |
michael@0 | 904 | if (hyphens[i + 1] & 1) |
michael@0 | 905 | hyphens[i] = '-'; |
michael@0 | 906 | #else |
michael@0 | 907 | hyphens[i] = hyphens[i + 1]; |
michael@0 | 908 | #endif |
michael@0 | 909 | for (; i < word_size; i++) |
michael@0 | 910 | hyphens[i] = '0'; |
michael@0 | 911 | hyphens[word_size] = '\0'; |
michael@0 | 912 | |
michael@0 | 913 | /* now create a new char string showing hyphenation positions */ |
michael@0 | 914 | /* count the hyphens and allocate space for the new hyphenated string */ |
michael@0 | 915 | nHyphCount = 0; |
michael@0 | 916 | for (i = 0; i < word_size; i++) |
michael@0 | 917 | if (hyphens[i]&1) |
michael@0 | 918 | nHyphCount++; |
michael@0 | 919 | j = 0; |
michael@0 | 920 | for (i = 0; i < word_size; i++) { |
michael@0 | 921 | if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { |
michael@0 | 922 | if (rep && pos && cut) { |
michael@0 | 923 | if (!*rep) |
michael@0 | 924 | *rep = (char **) calloc(word_size, sizeof(char *)); |
michael@0 | 925 | if (!*pos) |
michael@0 | 926 | *pos = (int *) calloc(word_size, sizeof(int)); |
michael@0 | 927 | if (!*cut) { |
michael@0 | 928 | *cut = (int *) calloc(word_size, sizeof(int)); |
michael@0 | 929 | } |
michael@0 | 930 | (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); |
michael@0 | 931 | (*pos)[matchindex[i] - 1] = matchindex[i] - i; |
michael@0 | 932 | (*cut)[matchindex[i] - 1] = matchlen[i]; |
michael@0 | 933 | } |
michael@0 | 934 | j += strlen(matchrepl[matchindex[i]]); |
michael@0 | 935 | i += matchlen[i] - 1; |
michael@0 | 936 | } |
michael@0 | 937 | } |
michael@0 | 938 | |
michael@0 | 939 | hnj_free (matchrepl); |
michael@0 | 940 | hnj_free (matchlen); |
michael@0 | 941 | hnj_free (matchindex); |
michael@0 | 942 | |
michael@0 | 943 | // recursive hyphenation of the first (compound) level segments |
michael@0 | 944 | if (dict->nextlevel) { |
michael@0 | 945 | char ** rep2; |
michael@0 | 946 | int * pos2; |
michael@0 | 947 | int * cut2; |
michael@0 | 948 | char * hyphens2; |
michael@0 | 949 | int begin = 0; |
michael@0 | 950 | |
michael@0 | 951 | rep2 = hnj_malloc (word_size * sizeof(char *)); |
michael@0 | 952 | pos2 = hnj_malloc (word_size * sizeof(int)); |
michael@0 | 953 | cut2 = hnj_malloc (word_size * sizeof(int)); |
michael@0 | 954 | hyphens2 = hnj_malloc (word_size + 3); |
michael@0 | 955 | for (i = 0; i < word_size; i++) rep2[i] = NULL; |
michael@0 | 956 | for (i = 0; i < word_size; i++) if |
michael@0 | 957 | (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { |
michael@0 | 958 | if (i - begin > 1) { |
michael@0 | 959 | int hyph = 0; |
michael@0 | 960 | prep_word[i + 2] = '\0'; |
michael@0 | 961 | /* non-standard hyphenation at compound boundary (Schiffahrt) */ |
michael@0 | 962 | if (rep && *rep && *pos && *cut && (*rep)[i]) { |
michael@0 | 963 | char * l = strchr((*rep)[i], '='); |
michael@0 | 964 | size_t offset = 2 + i - (*pos)[i]; |
michael@0 | 965 | strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); |
michael@0 | 966 | prep_word[prep_word_size - 1] = '\0'; |
michael@0 | 967 | if (l) { |
michael@0 | 968 | hyph = (l - (*rep)[i]) - (*pos)[i]; |
michael@0 | 969 | prep_word[2 + i + hyph] = '\0'; |
michael@0 | 970 | } |
michael@0 | 971 | } |
michael@0 | 972 | hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, |
michael@0 | 973 | hyphens2, &rep2, &pos2, &cut2, clhmin, |
michael@0 | 974 | crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); |
michael@0 | 975 | for (j = 0; j < i - begin - 1; j++) { |
michael@0 | 976 | hyphens[begin + j] = hyphens2[j]; |
michael@0 | 977 | if (rep2[j] && rep && pos && cut) { |
michael@0 | 978 | if (!*rep && !*pos && !*cut) { |
michael@0 | 979 | int k; |
michael@0 | 980 | *rep = (char **) malloc(sizeof(char *) * word_size); |
michael@0 | 981 | *pos = (int *) malloc(sizeof(int) * word_size); |
michael@0 | 982 | *cut = (int *) malloc(sizeof(int) * word_size); |
michael@0 | 983 | for (k = 0; k < word_size; k++) { |
michael@0 | 984 | (*rep)[k] = NULL; |
michael@0 | 985 | (*pos)[k] = 0; |
michael@0 | 986 | (*cut)[k] = 0; |
michael@0 | 987 | } |
michael@0 | 988 | } |
michael@0 | 989 | (*rep)[begin + j] = rep2[j]; |
michael@0 | 990 | (*pos)[begin + j] = pos2[j]; |
michael@0 | 991 | (*cut)[begin + j] = cut2[j]; |
michael@0 | 992 | } |
michael@0 | 993 | } |
michael@0 | 994 | prep_word[i + 2] = word[i + 1]; |
michael@0 | 995 | if (*rep && *pos && *cut && (*rep)[i]) { |
michael@0 | 996 | size_t offset = 1; |
michael@0 | 997 | strncpy(prep_word + offset, word, prep_word_size - offset - 1); |
michael@0 | 998 | prep_word[prep_word_size - 1] = '\0'; |
michael@0 | 999 | } |
michael@0 | 1000 | } |
michael@0 | 1001 | begin = i + 1; |
michael@0 | 1002 | for (j = 0; j < word_size; j++) rep2[j] = NULL; |
michael@0 | 1003 | } |
michael@0 | 1004 | |
michael@0 | 1005 | // non-compound |
michael@0 | 1006 | if (begin == 0) { |
michael@0 | 1007 | hnj_hyphen_hyph_(dict->nextlevel, word, word_size, |
michael@0 | 1008 | hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); |
michael@0 | 1009 | if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
michael@0 | 1010 | rep, pos, cut, clhmin); |
michael@0 | 1011 | if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
michael@0 | 1012 | rep, pos, cut, crhmin); |
michael@0 | 1013 | } |
michael@0 | 1014 | |
michael@0 | 1015 | free(rep2); |
michael@0 | 1016 | free(cut2); |
michael@0 | 1017 | free(pos2); |
michael@0 | 1018 | free(hyphens2); |
michael@0 | 1019 | } |
michael@0 | 1020 | |
michael@0 | 1021 | hnj_free (prep_word); |
michael@0 | 1022 | return 0; |
michael@0 | 1023 | } |
michael@0 | 1024 | |
/* UTF-8 normalization of hyphen and non-standard positions.
 *
 * Rewrites, in place, results that are indexed by byte so that they are
 * indexed by character: hyphens[] is compacted so that entry c holds the
 * value stored at the last byte of character c, and is NUL-terminated
 * after the last character.  When rep, pos, cut and the arrays they point
 * to are all non-NULL, each (*pos)[] / (*cut)[] entry is converted from a
 * byte count to a character count and moved, together with its (*rep)[]
 * pointer, to the character index; vacated byte slots are reset to
 * NULL / 0 / 0.
 *
 * Returns 1 (after printing a message to stderr) if word starts with a
 * UTF-8 continuation byte, otherwise 0.
 */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    char *** rep, int ** pos, int ** cut)
{
  int src;
  int dst = -1;
  int k;

  if ((((unsigned char) word[0]) & 0xC0) == 0x80) {
    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    return 1;
  }

  /* calculate UTF-8 character positions */
  for (src = 0; src < word_size; src++) {
    int back, from, to, chars;

    /* beginning of an UTF-8 character (not '10' start bits) */
    if ((((unsigned char) word[src]) & 0xC0) != 0x80) dst++;
    hyphens[dst] = hyphens[src];

    if (!(rep && pos && cut && *rep && *pos && *cut)) continue;

    /* characters among the `back` bytes ending at src */
    back = (*pos)[src];
    chars = 0;
    for (k = 0; k < back; k++) {
      if ((((unsigned char) word[src - k]) & 0xC0) != 0x80) chars++;
    }
    (*pos)[dst] = chars;

    /* characters among the cut bytes starting where pos begins */
    from = src - back + 1;
    to = from + (*cut)[src];
    chars = 0;
    for (k = from; k < to; k++) {
      if ((((unsigned char) word[k]) & 0xC0) != 0x80) chars++;
    }
    (*cut)[dst] = chars;

    (*rep)[dst] = (*rep)[src];
    if (dst < src) {
      (*rep)[src] = NULL;
      (*pos)[src] = 0;
      (*cut)[src] = 0;
    }
  }
  hyphens[dst + 1] = '\0';
#ifdef VERBOSE
  printf ("nums: %s\n", hyphens);
#endif
  return 0;
}
michael@0 | 1066 | |
/* get the word with all possible hyphenations (output: hyphword)
 *
 * Copies the l bytes of word into hyphword, inserting '=' after every
 * byte whose hyphens[] value is odd.  Where a non-standard hyphenation
 * (*rep)[i] exists for such a byte, the replacement text is written
 * instead, starting (*pos)[i] - 1 bytes back in the output, and
 * (*cut)[i] - (*pos)[i] following input bytes are skipped.
 *
 * rep, pos and cut must be non-NULL pointers; the arrays they point to
 * may be NULL.  hyphword must be large enough for the result; its real
 * capacity is not visible here.  Replacement copies are limited to the
 * first l + 5 output bytes, as before.  If the output position has
 * already moved past that limit, there is no room for the replacement
 * and a plain '=' is emitted instead.  The remaining room is computed as
 * a signed value: doing the subtraction in size_t would wrap around and
 * make strncpy zero-fill far beyond the buffer.
 */
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
{
  const int hyphenslen = l + 5;
  int i, j;

  for (i = 0, j = 0; i < l; i++, j++) {
    hyphword[j] = word[i];
    if (!(hyphens[i]&1)) continue;

    if (*rep && *pos && *cut && (*rep)[i]) {
      const int offset = j - (*pos)[i] + 1;
      const int room = hyphenslen - offset - 1;
      if (offset >= 0 && room > 0) {
        strncpy(hyphword + offset, (*rep)[i], (size_t) room);
        hyphword[hyphenslen-1] = '\0';
        j += strlen((*rep)[i]) - (*pos)[i];
        i += (*cut)[i] - (*pos)[i];
        continue;
      }
      /* no room for the replacement: fall through to a standard hyphen */
    }
    hyphword[++j] = '=';
  }
  hyphword[j] = '\0';
}
michael@0 | 1088 | |
michael@0 | 1089 | |
michael@0 | 1090 | /* main api function with default hyphenmin parameters */ |
michael@0 | 1091 | int hnj_hyphen_hyphenate2 (HyphenDict *dict, |
michael@0 | 1092 | const char *word, int word_size, char * hyphens, |
michael@0 | 1093 | char *hyphword, char *** rep, int ** pos, int ** cut) |
michael@0 | 1094 | { |
michael@0 | 1095 | hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
michael@0 | 1096 | dict->clhmin, dict->crhmin, 1, 1); |
michael@0 | 1097 | hnj_hyphen_lhmin(dict->utf8, word, word_size, |
michael@0 | 1098 | hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); |
michael@0 | 1099 | hnj_hyphen_rhmin(dict->utf8, word, word_size, |
michael@0 | 1100 | hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); |
michael@0 | 1101 | |
michael@0 | 1102 | /* nohyphen */ |
michael@0 | 1103 | if (dict->nohyphen) { |
michael@0 | 1104 | char * nh = dict->nohyphen; |
michael@0 | 1105 | int nhi; |
michael@0 | 1106 | for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
michael@0 | 1107 | char * nhy = (char *) strstr(word, nh); |
michael@0 | 1108 | while (nhy) { |
michael@0 | 1109 | hyphens[nhy - word + strlen(nh) - 1] = '0'; |
michael@0 | 1110 | if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; |
michael@0 | 1111 | nhy = (char *) strstr(nhy + 1, nh); |
michael@0 | 1112 | } |
michael@0 | 1113 | nh = nh + strlen(nh) + 1; |
michael@0 | 1114 | } |
michael@0 | 1115 | } |
michael@0 | 1116 | |
michael@0 | 1117 | if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
michael@0 | 1118 | if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
michael@0 | 1119 | #ifdef VERBOSE |
michael@0 | 1120 | printf ("nums: %s\n", hyphens); |
michael@0 | 1121 | #endif |
michael@0 | 1122 | return 0; |
michael@0 | 1123 | } |
michael@0 | 1124 | |
michael@0 | 1125 | /* previous main api function with hyphenmin parameters */ |
michael@0 | 1126 | int hnj_hyphen_hyphenate3 (HyphenDict *dict, |
michael@0 | 1127 | const char *word, int word_size, char * hyphens, |
michael@0 | 1128 | char *hyphword, char *** rep, int ** pos, int ** cut, |
michael@0 | 1129 | int lhmin, int rhmin, int clhmin, int crhmin) |
michael@0 | 1130 | { |
michael@0 | 1131 | lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; |
michael@0 | 1132 | rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; |
michael@0 | 1133 | clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; |
michael@0 | 1134 | crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; |
michael@0 | 1135 | hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, |
michael@0 | 1136 | clhmin, crhmin, 1, 1); |
michael@0 | 1137 | hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, |
michael@0 | 1138 | rep, pos, cut, (lhmin > 0 ? lhmin : 2)); |
michael@0 | 1139 | hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, |
michael@0 | 1140 | rep, pos, cut, (rhmin > 0 ? rhmin : 2)); |
michael@0 | 1141 | if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); |
michael@0 | 1142 | |
michael@0 | 1143 | /* nohyphen */ |
michael@0 | 1144 | if (dict->nohyphen) { |
michael@0 | 1145 | char * nh = dict->nohyphen; |
michael@0 | 1146 | int nhi; |
michael@0 | 1147 | for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { |
michael@0 | 1148 | char * nhy = (char *) strstr(word, nh); |
michael@0 | 1149 | while (nhy) { |
michael@0 | 1150 | hyphens[nhy - word + strlen(nh) - 1] = 0; |
michael@0 | 1151 | if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0; |
michael@0 | 1152 | nhy = (char *) strstr(nhy + 1, nh); |
michael@0 | 1153 | } |
michael@0 | 1154 | nh = nh + strlen(nh) + 1; |
michael@0 | 1155 | } |
michael@0 | 1156 | } |
michael@0 | 1157 | |
michael@0 | 1158 | if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); |
michael@0 | 1159 | return 0; |
michael@0 | 1160 | } |