michael@0: /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both michael@0: * licenses follows. michael@0: */ michael@0: michael@0: /* LibHnj - a library for high quality hyphenation and justification michael@0: * Copyright (C) 1998 Raph Levien, michael@0: * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), michael@0: * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) michael@0: * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) michael@0: * michael@0: * This library is free software; you can redistribute it and/or michael@0: * modify it under the terms of the GNU Library General Public michael@0: * License as published by the Free Software Foundation; either michael@0: * version 2 of the License, or (at your option) any later version. michael@0: * michael@0: * This library is distributed in the hope that it will be useful, michael@0: * but WITHOUT ANY WARRANTY; without even the implied warranty of michael@0: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU michael@0: * Library General Public License for more details. michael@0: * michael@0: * You should have received a copy of the GNU Library General Public michael@0: * License along with this library; if not, write to the michael@0: * Free Software Foundation, Inc., 59 Temple Place - Suite 330, michael@0: * Boston, MA 02111-1307 USA. michael@0: */ michael@0: michael@0: /* michael@0: * The contents of this file are subject to the Mozilla Public License michael@0: * Version 1.0 (the "MPL"); you may not use this file except in michael@0: * compliance with the MPL. You may obtain a copy of the MPL at michael@0: * http://www.mozilla.org/MPL/ michael@0: * michael@0: * Software distributed under the MPL is distributed on an "AS IS" basis, michael@0: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL michael@0: * for the specific language governing rights and limitations under the michael@0: * MPL. michael@0: * michael@0: */ michael@0: #include /* for NULL, malloc */ michael@0: #include /* for fprintf */ michael@0: #include /* for strdup */ michael@0: michael@0: #ifdef UNX michael@0: #include /* for exit */ michael@0: #endif michael@0: michael@0: #define noVERBOSE michael@0: michael@0: /* calculate hyphenmin values with long ligature length (2 or 3 characters michael@0: * instead of 1 or 2) for comparison with hyphenation without ligatures */ michael@0: #define noLONG_LIGATURE michael@0: michael@0: #ifdef LONG_LIGATURE michael@0: #define LIG_xx 1 michael@0: #define LIG_xxx 2 michael@0: #else michael@0: #define LIG_xx 0 michael@0: #define LIG_xxx 1 michael@0: #endif michael@0: michael@0: #include "hnjalloc.h" michael@0: #include "hyphen.h" michael@0: michael@0: static char * michael@0: hnj_strdup (const char *s) michael@0: { michael@0: char *new; michael@0: int l; michael@0: michael@0: l = strlen (s); michael@0: new = hnj_malloc (l + 1); michael@0: memcpy (new, s, l); michael@0: new[l] = 0; michael@0: return new; michael@0: } michael@0: michael@0: /* remove cross-platform text line end characters */ michael@0: void hnj_strchomp(char * s) michael@0: { michael@0: int k = strlen(s); michael@0: if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; michael@0: if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; michael@0: } michael@0: michael@0: /* a little bit of a hash table implementation. This simply maps strings michael@0: to state numbers */ michael@0: michael@0: typedef struct _HashTab HashTab; michael@0: typedef struct _HashEntry HashEntry; michael@0: michael@0: /* A cheap, but effective, hack. */ michael@0: #define HASH_SIZE 31627 michael@0: michael@0: struct _HashTab { michael@0: HashEntry *entries[HASH_SIZE]; michael@0: }; michael@0: michael@0: struct _HashEntry { michael@0: HashEntry *next; michael@0: char *key; michael@0: int val; michael@0: }; michael@0: michael@0: /* a char* hash function from ASU - adapted from Gtk+ */ michael@0: static unsigned int michael@0: hnj_string_hash (const char *s) michael@0: { michael@0: const char *p; michael@0: unsigned int h=0, g; michael@0: for(p = s; *p != '\0'; p += 1) { michael@0: h = ( h << 4 ) + *p; michael@0: if ( ( g = h & 0xf0000000 ) ) { michael@0: h = h ^ (g >> 24); michael@0: h = h ^ g; michael@0: } michael@0: } michael@0: return h /* % M */; michael@0: } michael@0: michael@0: static HashTab * michael@0: hnj_hash_new (void) michael@0: { michael@0: HashTab *hashtab; michael@0: int i; michael@0: michael@0: hashtab = hnj_malloc (sizeof(HashTab)); michael@0: for (i = 0; i < HASH_SIZE; i++) michael@0: hashtab->entries[i] = NULL; michael@0: michael@0: return hashtab; michael@0: } michael@0: michael@0: static void michael@0: hnj_hash_free (HashTab *hashtab) michael@0: { michael@0: int i; michael@0: HashEntry *e, *next; michael@0: michael@0: for (i = 0; i < HASH_SIZE; i++) michael@0: for (e = hashtab->entries[i]; e; e = next) michael@0: { michael@0: next = e->next; michael@0: hnj_free (e->key); michael@0: hnj_free (e); michael@0: } michael@0: michael@0: hnj_free (hashtab); michael@0: } michael@0: michael@0: /* assumes that key is not already present! */ michael@0: static void michael@0: hnj_hash_insert (HashTab *hashtab, const char *key, int val) michael@0: { michael@0: int i; michael@0: HashEntry *e; michael@0: michael@0: i = hnj_string_hash (key) % HASH_SIZE; michael@0: e = hnj_malloc (sizeof(HashEntry)); michael@0: e->next = hashtab->entries[i]; michael@0: e->key = hnj_strdup (key); michael@0: e->val = val; michael@0: hashtab->entries[i] = e; michael@0: } michael@0: michael@0: /* return val if found, otherwise -1 */ michael@0: static int michael@0: hnj_hash_lookup (HashTab *hashtab, const char *key) michael@0: { michael@0: int i; michael@0: HashEntry *e; michael@0: i = hnj_string_hash (key) % HASH_SIZE; michael@0: for (e = hashtab->entries[i]; e; e = e->next) michael@0: if (!strcmp (key, e->key)) michael@0: return e->val; michael@0: return -1; michael@0: } michael@0: michael@0: /* Get the state number, allocating a new state if necessary. */ michael@0: static int michael@0: hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) michael@0: { michael@0: int state_num; michael@0: michael@0: state_num = hnj_hash_lookup (hashtab, string); michael@0: michael@0: if (state_num >= 0) michael@0: return state_num; michael@0: michael@0: hnj_hash_insert (hashtab, string, dict->num_states); michael@0: /* predicate is true if dict->num_states is a power of two */ michael@0: if (!(dict->num_states & (dict->num_states - 1))) michael@0: { michael@0: dict->states = hnj_realloc (dict->states, michael@0: (dict->num_states << 1) * michael@0: sizeof(HyphenState)); michael@0: } michael@0: dict->states[dict->num_states].match = NULL; michael@0: dict->states[dict->num_states].repl = NULL; michael@0: dict->states[dict->num_states].fallback_state = -1; michael@0: dict->states[dict->num_states].num_trans = 0; michael@0: dict->states[dict->num_states].trans = NULL; michael@0: return dict->num_states++; michael@0: } michael@0: michael@0: /* add a transition from state1 to state2 through ch - assumes that the michael@0: transition does not already exist */ michael@0: static void michael@0: hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) michael@0: { michael@0: int num_trans; michael@0: michael@0: num_trans = dict->states[state1].num_trans; michael@0: if (num_trans == 0) michael@0: { michael@0: dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); michael@0: } michael@0: else if (!(num_trans & (num_trans - 1))) michael@0: { michael@0: dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, michael@0: (num_trans << 1) * michael@0: sizeof(HyphenTrans)); michael@0: } michael@0: dict->states[state1].trans[num_trans].ch = ch; michael@0: dict->states[state1].trans[num_trans].new_state = state2; michael@0: dict->states[state1].num_trans++; michael@0: } michael@0: michael@0: #ifdef VERBOSE michael@0: HashTab *global[1]; michael@0: michael@0: static char * michael@0: get_state_str (int state, int level) michael@0: { michael@0: int i; michael@0: HashEntry *e; michael@0: michael@0: for (i = 0; i < HASH_SIZE; i++) michael@0: for (e = global[level]->entries[i]; e; e = e->next) michael@0: if (e->val == state) michael@0: return e->key; michael@0: return NULL; michael@0: } michael@0: #endif michael@0: michael@0: void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { michael@0: int i, j; michael@0: char word[MAX_CHARS]; michael@0: char pattern[MAX_CHARS]; michael@0: char * repl; michael@0: signed char replindex; michael@0: signed char replcut; michael@0: int state_num = 0; michael@0: int last_state; michael@0: char ch; michael@0: int found; michael@0: michael@0: if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { michael@0: dict->lhmin = atoi(buf + 13); michael@0: return; michael@0: } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { michael@0: dict->rhmin = atoi(buf + 14); michael@0: return; michael@0: } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { michael@0: dict->clhmin = atoi(buf + 21); michael@0: return; michael@0: } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { michael@0: dict->crhmin = atoi(buf + 22); michael@0: return; michael@0: } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { michael@0: char * space = buf + 8; michael@0: while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; michael@0: if (*buf != '\0') dict->nohyphen = hnj_strdup(space); michael@0: if (dict->nohyphen) { michael@0: char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; michael@0: *nhe = 0; michael@0: for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { michael@0: if (*nhe == ',') { michael@0: dict->nohyphenl++; michael@0: *nhe = 0; michael@0: } michael@0: } michael@0: } michael@0: return; michael@0: } michael@0: j = 0; michael@0: pattern[j] = '0'; michael@0: repl = strchr(buf, '/'); michael@0: replindex = 0; michael@0: replcut = 0; michael@0: if (repl) { michael@0: char * index = strchr(repl + 1, ','); michael@0: *repl = '\0'; michael@0: if (index) { michael@0: char * index2 = strchr(index + 1, ','); michael@0: *index = '\0'; michael@0: if (index2) { michael@0: *index2 = '\0'; michael@0: replindex = (signed char) atoi(index + 1) - 1; michael@0: replcut = (signed char) atoi(index2 + 1); michael@0: } michael@0: } else { michael@0: hnj_strchomp(repl + 1); michael@0: replindex = 0; michael@0: replcut = (signed char) strlen(buf); michael@0: } michael@0: repl = hnj_strdup(repl + 1); michael@0: } michael@0: for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) michael@0: { michael@0: if (buf[i] >= '0' && buf[i] <= '9') michael@0: pattern[j] = buf[i]; michael@0: else michael@0: { michael@0: word[j] = buf[i]; michael@0: pattern[++j] = '0'; michael@0: } michael@0: } michael@0: word[j] = '\0'; michael@0: pattern[j + 1] = '\0'; michael@0: michael@0: i = 0; michael@0: if (!repl) { michael@0: /* Optimize away leading zeroes */ michael@0: for (; pattern[i] == '0'; i++); michael@0: } else { michael@0: if (*word == '.') i++; michael@0: /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ michael@0: if (dict->utf8) { michael@0: int pu = -1; /* unicode character position */ michael@0: int ps = -1; /* unicode start position (original replindex) */ michael@0: int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ michael@0: for (; pc < (strlen(word) + 1); pc++) { michael@0: /* beginning of an UTF-8 character (not '10' start bits) */ michael@0: if ((((unsigned char) word[pc]) >> 6) != 2) pu++; michael@0: if ((ps < 0) && (replindex == pu)) { michael@0: ps = replindex; michael@0: replindex = (signed char) pc; michael@0: } michael@0: if ((ps >= 0) && ((pu - ps) == replcut)) { michael@0: replcut = (signed char) (pc - replindex); michael@0: break; michael@0: } michael@0: } michael@0: if (*word == '.') replindex--; michael@0: } michael@0: } michael@0: michael@0: #ifdef VERBOSE michael@0: printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); michael@0: #endif michael@0: found = hnj_hash_lookup (hashtab, word); michael@0: state_num = hnj_get_state (dict, hashtab, word); michael@0: dict->states[state_num].match = hnj_strdup (pattern + i); michael@0: dict->states[state_num].repl = repl; michael@0: dict->states[state_num].replindex = replindex; michael@0: if (!replcut) { michael@0: dict->states[state_num].replcut = (signed char) strlen(word); michael@0: } else { michael@0: dict->states[state_num].replcut = replcut; michael@0: } michael@0: michael@0: /* now, put in the prefix transitions */ michael@0: for (; found < 0 ;j--) michael@0: { michael@0: last_state = state_num; michael@0: ch = word[j - 1]; michael@0: word[j - 1] = '\0'; michael@0: found = hnj_hash_lookup (hashtab, word); michael@0: state_num = hnj_get_state (dict, hashtab, word); michael@0: hnj_add_trans (dict, state_num, last_state, ch); michael@0: } michael@0: } michael@0: michael@0: HyphenDict * michael@0: hnj_hyphen_load (const char *fn) michael@0: { michael@0: HyphenDict *dict[2]; michael@0: HashTab *hashtab; michael@0: FILE *f; michael@0: char buf[MAX_CHARS]; michael@0: int nextlevel = 0; michael@0: int i, j, k; michael@0: HashEntry *e; michael@0: int state_num = 0; michael@0: michael@0: f = fopen (fn, "r"); michael@0: if (f == NULL) michael@0: return NULL; michael@0: michael@0: // loading one or two dictionaries (separated by NEXTLEVEL keyword) michael@0: for (k = 0; k < 2; k++) { michael@0: hashtab = hnj_hash_new (); michael@0: #ifdef VERBOSE michael@0: global[k] = hashtab; michael@0: #endif michael@0: hnj_hash_insert (hashtab, "", 0); michael@0: dict[k] = hnj_malloc (sizeof(HyphenDict)); michael@0: dict[k]->num_states = 1; michael@0: dict[k]->states = hnj_malloc (sizeof(HyphenState)); michael@0: dict[k]->states[0].match = NULL; michael@0: dict[k]->states[0].repl = NULL; michael@0: dict[k]->states[0].fallback_state = -1; michael@0: dict[k]->states[0].num_trans = 0; michael@0: dict[k]->states[0].trans = NULL; michael@0: dict[k]->nextlevel = NULL; michael@0: dict[k]->lhmin = 0; michael@0: dict[k]->rhmin = 0; michael@0: dict[k]->clhmin = 0; michael@0: dict[k]->crhmin = 0; michael@0: dict[k]->nohyphen = NULL; michael@0: dict[k]->nohyphenl = 0; michael@0: michael@0: /* read in character set info */ michael@0: if (k == 0) { michael@0: for (i=0;icset[i]= 0; michael@0: if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { michael@0: for (i=0;icset[i] == '\r') || (dict[k]->cset[i] == '\n')) michael@0: dict[k]->cset[i] = 0; michael@0: } else { michael@0: dict[k]->cset[0] = 0; michael@0: } michael@0: dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); michael@0: } else { michael@0: strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); michael@0: dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; michael@0: dict[k]->utf8 = dict[0]->utf8; michael@0: } michael@0: michael@0: if (k == 0 || nextlevel) { michael@0: while (fgets (buf, sizeof(buf), f) != NULL) { michael@0: if (strncmp(buf, "NEXTLEVEL", 9) == 0) { michael@0: nextlevel = 1; michael@0: break; michael@0: } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab); michael@0: } michael@0: } else if (k == 1) { michael@0: /* default first level: hyphen and ASCII apostrophe */ michael@0: if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); michael@0: else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); michael@0: strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here michael@0: buf[MAX_CHARS-1] = '\0'; michael@0: hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ michael@0: hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ michael@0: if (dict[0]->utf8) { michael@0: hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ michael@0: hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ michael@0: } michael@0: } michael@0: michael@0: /* Could do unioning of matches here (instead of the preprocessor script). michael@0: If we did, the pseudocode would look something like this: michael@0: michael@0: foreach state in the hash table michael@0: foreach i = [1..length(state) - 1] michael@0: state to check is substr (state, i) michael@0: look it up michael@0: if found, and if there is a match, union the match in. michael@0: michael@0: It's also possible to avoid the quadratic blowup by doing the michael@0: search in order of increasing state string sizes - then you michael@0: can break the loop after finding the first match. michael@0: michael@0: This step should be optional in any case - if there is a michael@0: preprocessed rule table, it's always faster to use that. michael@0: michael@0: */ michael@0: michael@0: /* put in the fallback states */ michael@0: for (i = 0; i < HASH_SIZE; i++) michael@0: for (e = hashtab->entries[i]; e; e = e->next) michael@0: { michael@0: if (*(e->key)) for (j = 1; 1; j++) michael@0: { michael@0: state_num = hnj_hash_lookup (hashtab, e->key + j); michael@0: if (state_num >= 0) michael@0: break; michael@0: } michael@0: /* KBH: FIXME state 0 fallback_state should always be -1? */ michael@0: if (e->val) michael@0: dict[k]->states[e->val].fallback_state = state_num; michael@0: } michael@0: #ifdef VERBOSE michael@0: for (i = 0; i < HASH_SIZE; i++) michael@0: for (e = hashtab->entries[i]; e; e = e->next) michael@0: { michael@0: printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, michael@0: dict[k]->states[e->val].fallback_state); michael@0: for (j = 0; j < dict[k]->states[e->val].num_trans; j++) michael@0: printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, michael@0: dict[k]->states[e->val].trans[j].new_state); michael@0: } michael@0: #endif michael@0: michael@0: #ifndef VERBOSE michael@0: hnj_hash_free (hashtab); michael@0: #endif michael@0: state_num = 0; michael@0: } michael@0: fclose(f); michael@0: if (nextlevel) dict[0]->nextlevel = dict[1]; michael@0: else { michael@0: dict[1] -> nextlevel = dict[0]; michael@0: dict[1]->lhmin = dict[0]->lhmin; michael@0: dict[1]->rhmin = dict[0]->rhmin; michael@0: dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); michael@0: dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3); michael@0: #ifdef VERBOSE michael@0: HashTab *r = global[0]; michael@0: global[0] = global[1]; michael@0: global[1] = r; michael@0: #endif michael@0: return dict[1]; michael@0: } michael@0: return dict[0]; michael@0: } michael@0: michael@0: void hnj_hyphen_free (HyphenDict *dict) michael@0: { michael@0: int state_num; michael@0: HyphenState *hstate; michael@0: michael@0: for (state_num = 0; state_num < dict->num_states; state_num++) michael@0: { michael@0: hstate = &dict->states[state_num]; michael@0: if (hstate->match) michael@0: hnj_free (hstate->match); michael@0: if (hstate->repl) michael@0: hnj_free (hstate->repl); michael@0: if (hstate->trans) michael@0: hnj_free (hstate->trans); michael@0: } michael@0: if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); michael@0: michael@0: if (dict->nohyphen) hnj_free(dict->nohyphen); michael@0: michael@0: hnj_free (dict->states); michael@0: michael@0: hnj_free (dict); michael@0: } michael@0: michael@0: #define MAX_WORD 256 michael@0: michael@0: int hnj_hyphen_hyphenate (HyphenDict *dict, michael@0: const char *word, int word_size, michael@0: char *hyphens) michael@0: { michael@0: char *prep_word; michael@0: int i, j, k; michael@0: int state; michael@0: char ch; michael@0: HyphenState *hstate; michael@0: char *match; michael@0: int offset; michael@0: michael@0: prep_word = hnj_malloc (word_size + 3); michael@0: michael@0: j = 0; michael@0: prep_word[j++] = '.'; michael@0: michael@0: for (i = 0; i < word_size; i++) { michael@0: if (word[i] <= '9' && word[i] >= '0') { michael@0: prep_word[j++] = '.'; michael@0: } else { michael@0: prep_word[j++] = word[i]; michael@0: } michael@0: } michael@0: michael@0: prep_word[j++] = '.'; michael@0: prep_word[j] = '\0'; michael@0: michael@0: for (i = 0; i < word_size + 5; i++) michael@0: hyphens[i] = '0'; michael@0: michael@0: #ifdef VERBOSE michael@0: printf ("prep_word = %s\n", prep_word); michael@0: #endif michael@0: michael@0: /* now, run the finite state machine */ michael@0: state = 0; michael@0: for (i = 0; i < j; i++) michael@0: { michael@0: ch = prep_word[i]; michael@0: for (;;) michael@0: { michael@0: michael@0: if (state == -1) { michael@0: /* return 1; */ michael@0: /* KBH: FIXME shouldn't this be as follows? */ michael@0: state = 0; michael@0: goto try_next_letter; michael@0: } michael@0: michael@0: #ifdef VERBOSE michael@0: char *state_str; michael@0: state_str = get_state_str (state, 0); michael@0: michael@0: for (k = 0; k < i - strlen (state_str); k++) michael@0: putchar (' '); michael@0: printf ("%s", state_str); michael@0: #endif michael@0: michael@0: hstate = &dict->states[state]; michael@0: for (k = 0; k < hstate->num_trans; k++) michael@0: if (hstate->trans[k].ch == ch) michael@0: { michael@0: state = hstate->trans[k].new_state; michael@0: goto found_state; michael@0: } michael@0: state = hstate->fallback_state; michael@0: #ifdef VERBOSE michael@0: printf (" falling back, fallback_state %d\n", state); michael@0: #endif michael@0: } michael@0: found_state: michael@0: #ifdef VERBOSE michael@0: printf ("found state %d\n",state); michael@0: #endif michael@0: /* Additional optimization is possible here - especially, michael@0: elimination of trailing zeroes from the match. Leading zeroes michael@0: have already been optimized. */ michael@0: match = dict->states[state].match; michael@0: /* replacing rules not handled by hyphen_hyphenate() */ michael@0: if (match && !dict->states[state].repl) michael@0: { michael@0: offset = i + 1 - strlen (match); michael@0: #ifdef VERBOSE michael@0: for (k = 0; k < offset; k++) michael@0: putchar (' '); michael@0: printf ("%s\n", match); michael@0: #endif michael@0: /* This is a linear search because I tried a binary search and michael@0: found it to be just a teeny bit slower. */ michael@0: for (k = 0; match[k]; k++) michael@0: if (hyphens[offset + k] < match[k]) michael@0: hyphens[offset + k] = match[k]; michael@0: } michael@0: michael@0: /* KBH: we need this to make sure we keep looking in a word */ michael@0: /* for patterns even if the current character is not known in state 0 */ michael@0: /* since patterns for hyphenation may occur anywhere in the word */ michael@0: try_next_letter: ; michael@0: michael@0: } michael@0: #ifdef VERBOSE michael@0: for (i = 0; i < j; i++) michael@0: putchar (hyphens[i]); michael@0: putchar ('\n'); michael@0: #endif michael@0: michael@0: for (i = 0; i < j - 4; i++) michael@0: #if 0 michael@0: if (hyphens[i + 1] & 1) michael@0: hyphens[i] = '-'; michael@0: #else michael@0: hyphens[i] = hyphens[i + 1]; michael@0: #endif michael@0: hyphens[0] = '0'; michael@0: for (; i < word_size; i++) michael@0: hyphens[i] = '0'; michael@0: hyphens[word_size] = '\0'; michael@0: michael@0: hnj_free (prep_word); michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: /* Unicode ligature length */ michael@0: int hnj_ligature(unsigned char c) { michael@0: switch (c) { michael@0: case 0x80: /* ff */ michael@0: case 0x81: /* fi */ michael@0: case 0x82: return LIG_xx; /* fl */ michael@0: case 0x83: /* ffi */ michael@0: case 0x84: return LIG_xxx; /* ffl */ michael@0: case 0x85: /* long st */ michael@0: case 0x86: return LIG_xx; /* st */ michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: /* character length of the first n byte of the input word */ michael@0: int hnj_hyphen_strnlen(const char * word, int n, int utf8) michael@0: { michael@0: int i = 0; michael@0: int j = 0; michael@0: while (j < n && word[j] != '\0') { michael@0: i++; michael@0: // Unicode ligature support michael@0: if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { michael@0: i += hnj_ligature(word[j + 2]); michael@0: } michael@0: for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); michael@0: } michael@0: return i; michael@0: } michael@0: michael@0: int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, michael@0: char *** rep, int ** pos, int ** cut, int lhmin) michael@0: { michael@0: int i = 1, j; michael@0: michael@0: // Unicode ligature support michael@0: if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { michael@0: i += hnj_ligature(word[2]); michael@0: } michael@0: michael@0: // ignore numbers michael@0: for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--; michael@0: michael@0: for (j = 0; i < lhmin && word[j] != '\0'; i++) do { michael@0: // check length of the non-standard part michael@0: if (*rep && *pos && *cut && (*rep)[j]) { michael@0: char * rh = strchr((*rep)[j], '='); michael@0: if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + michael@0: hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { michael@0: free((*rep)[j]); michael@0: (*rep)[j] = NULL; michael@0: hyphens[j] = '0'; michael@0: } michael@0: } else { michael@0: hyphens[j] = '0'; michael@0: } michael@0: j++; michael@0: michael@0: // Unicode ligature support michael@0: if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { michael@0: i += hnj_ligature(word[j + 2]); michael@0: } michael@0: } while (utf8 && (word[j] & 0xc0) == 0x80); michael@0: return 0; michael@0: } michael@0: michael@0: int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, michael@0: char *** rep, int ** pos, int ** cut, int rhmin) michael@0: { michael@0: int i = 0; michael@0: int j; michael@0: michael@0: // ignore numbers michael@0: for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--; michael@0: michael@0: for (j = word_size - 1; i < rhmin && j > 0; j--) { michael@0: // check length of the non-standard part michael@0: if (*rep && *pos && *cut && (*rep)[j]) { michael@0: char * rh = strchr((*rep)[j], '='); michael@0: if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + michael@0: hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { michael@0: free((*rep)[j]); michael@0: (*rep)[j] = NULL; michael@0: hyphens[j] = '0'; michael@0: } michael@0: } else { michael@0: hyphens[j] = '0'; michael@0: } michael@0: if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: // recursive function for compound level hyphenation michael@0: int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, michael@0: char * hyphens, char *** rep, int ** pos, int ** cut, michael@0: int clhmin, int crhmin, int lend, int rend) michael@0: { michael@0: char *prep_word; michael@0: int i, j, k; michael@0: int state; michael@0: char ch; michael@0: HyphenState *hstate; michael@0: char *match; michael@0: char *repl; michael@0: signed char replindex; michael@0: signed char replcut; michael@0: int offset; michael@0: int * matchlen; michael@0: int * matchindex; michael@0: char ** matchrepl; michael@0: int isrepl = 0; michael@0: int nHyphCount; michael@0: michael@0: size_t prep_word_size = word_size + 3; michael@0: prep_word = hnj_malloc (prep_word_size); michael@0: matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); michael@0: matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); michael@0: matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); michael@0: michael@0: j = 0; michael@0: prep_word[j++] = '.'; michael@0: michael@0: for (i = 0; i < word_size; i++) { michael@0: if (word[i] <= '9' && word[i] >= '0') { michael@0: prep_word[j++] = '.'; michael@0: } else { michael@0: prep_word[j++] = word[i]; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: prep_word[j++] = '.'; michael@0: prep_word[j] = '\0'; michael@0: michael@0: for (i = 0; i < j; i++) michael@0: hyphens[i] = '0'; michael@0: michael@0: #ifdef VERBOSE michael@0: printf ("prep_word = %s\n", prep_word); michael@0: #endif michael@0: michael@0: /* now, run the finite state machine */ michael@0: state = 0; michael@0: for (i = 0; i < j; i++) michael@0: { michael@0: ch = prep_word[i]; michael@0: for (;;) michael@0: { michael@0: michael@0: if (state == -1) { michael@0: /* return 1; */ michael@0: /* KBH: FIXME shouldn't this be as follows? */ michael@0: state = 0; michael@0: goto try_next_letter; michael@0: } michael@0: michael@0: #ifdef VERBOSE michael@0: char *state_str; michael@0: state_str = get_state_str (state, 1); michael@0: michael@0: for (k = 0; k < i - strlen (state_str); k++) michael@0: putchar (' '); michael@0: printf ("%s", state_str); michael@0: #endif michael@0: michael@0: hstate = &dict->states[state]; michael@0: for (k = 0; k < hstate->num_trans; k++) michael@0: if (hstate->trans[k].ch == ch) michael@0: { michael@0: state = hstate->trans[k].new_state; michael@0: goto found_state; michael@0: } michael@0: state = hstate->fallback_state; michael@0: #ifdef VERBOSE michael@0: printf (" falling back, fallback_state %d\n", state); michael@0: #endif michael@0: } michael@0: found_state: michael@0: #ifdef VERBOSE michael@0: printf ("found state %d\n",state); michael@0: #endif michael@0: /* Additional optimization is possible here - especially, michael@0: elimination of trailing zeroes from the match. Leading zeroes michael@0: have already been optimized. */ michael@0: match = dict->states[state].match; michael@0: repl = dict->states[state].repl; michael@0: replindex = dict->states[state].replindex; michael@0: replcut = dict->states[state].replcut; michael@0: /* replacing rules not handled by hyphen_hyphenate() */ michael@0: if (match) michael@0: { michael@0: offset = i + 1 - strlen (match); michael@0: #ifdef VERBOSE michael@0: for (k = 0; k < offset; k++) michael@0: putchar (' '); michael@0: printf ("%s (%s)\n", match, repl); michael@0: #endif michael@0: if (repl) { michael@0: if (!isrepl) for(; isrepl < word_size; isrepl++) { michael@0: matchrepl[isrepl] = NULL; michael@0: matchindex[isrepl] = -1; michael@0: } michael@0: matchlen[offset + replindex] = replcut; michael@0: } michael@0: /* This is a linear search because I tried a binary search and michael@0: found it to be just a teeny bit slower. */ michael@0: for (k = 0; match[k]; k++) { michael@0: if ((hyphens[offset + k] < match[k])) { michael@0: hyphens[offset + k] = match[k]; michael@0: if (match[k]&1) { michael@0: matchrepl[offset + k] = repl; michael@0: if (repl && (k >= replindex) && (k <= replindex + replcut)) { michael@0: matchindex[offset + replindex] = offset + k; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: /* KBH: we need this to make sure we keep looking in a word */ michael@0: /* for patterns even if the current character is not known in state 0 */ michael@0: /* since patterns for hyphenation may occur anywhere in the word */ michael@0: try_next_letter: ; michael@0: michael@0: } michael@0: #ifdef VERBOSE michael@0: for (i = 0; i < j; i++) michael@0: putchar (hyphens[i]); michael@0: putchar ('\n'); michael@0: #endif michael@0: michael@0: for (i = 0; i < j - 3; i++) michael@0: #if 0 michael@0: if (hyphens[i + 1] & 1) michael@0: hyphens[i] = '-'; michael@0: #else michael@0: hyphens[i] = hyphens[i + 1]; michael@0: #endif michael@0: for (; i < word_size; i++) michael@0: hyphens[i] = '0'; michael@0: hyphens[word_size] = '\0'; michael@0: michael@0: /* now create a new char string showing hyphenation positions */ michael@0: /* count the hyphens and allocate space for the new hyphenated string */ michael@0: nHyphCount = 0; michael@0: for (i = 0; i < word_size; i++) michael@0: if (hyphens[i]&1) michael@0: nHyphCount++; michael@0: j = 0; michael@0: for (i = 0; i < word_size; i++) { michael@0: if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { michael@0: if (rep && pos && cut) { michael@0: if (!*rep) michael@0: *rep = (char **) calloc(word_size, sizeof(char *)); michael@0: if (!*pos) michael@0: *pos = (int *) calloc(word_size, sizeof(int)); michael@0: if (!*cut) { michael@0: *cut = (int *) calloc(word_size, sizeof(int)); michael@0: } michael@0: (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); michael@0: (*pos)[matchindex[i] - 1] = matchindex[i] - i; michael@0: (*cut)[matchindex[i] - 1] = matchlen[i]; michael@0: } michael@0: j += strlen(matchrepl[matchindex[i]]); michael@0: i += matchlen[i] - 1; michael@0: } michael@0: } michael@0: michael@0: hnj_free (matchrepl); michael@0: hnj_free (matchlen); michael@0: hnj_free (matchindex); michael@0: michael@0: // recursive hyphenation of the first (compound) level segments michael@0: if (dict->nextlevel) { michael@0: char ** rep2; michael@0: int * pos2; michael@0: int * cut2; michael@0: char * hyphens2; michael@0: int begin = 0; michael@0: michael@0: rep2 = hnj_malloc (word_size * sizeof(char *)); michael@0: pos2 = hnj_malloc (word_size * sizeof(int)); michael@0: cut2 = hnj_malloc (word_size * sizeof(int)); michael@0: hyphens2 = hnj_malloc (word_size + 3); michael@0: for (i = 0; i < word_size; i++) rep2[i] = NULL; michael@0: for (i = 0; i < word_size; i++) if michael@0: (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { michael@0: if (i - begin > 1) { michael@0: int hyph = 0; michael@0: prep_word[i + 2] = '\0'; michael@0: /* non-standard hyphenation at compound boundary (Schiffahrt) */ michael@0: if (rep && *rep && *pos && *cut && (*rep)[i]) { michael@0: char * l = strchr((*rep)[i], '='); michael@0: size_t offset = 2 + i - (*pos)[i]; michael@0: strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); michael@0: prep_word[prep_word_size - 1] = '\0'; michael@0: if (l) { michael@0: hyph = (l - (*rep)[i]) - (*pos)[i]; michael@0: prep_word[2 + i + hyph] = '\0'; michael@0: } michael@0: } michael@0: hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, michael@0: hyphens2, &rep2, &pos2, &cut2, clhmin, michael@0: crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); michael@0: for (j = 0; j < i - begin - 1; j++) { michael@0: hyphens[begin + j] = hyphens2[j]; michael@0: if (rep2[j] && rep && pos && cut) { michael@0: if (!*rep && !*pos && !*cut) { michael@0: int k; michael@0: *rep = (char **) malloc(sizeof(char *) * word_size); michael@0: *pos = (int *) malloc(sizeof(int) * word_size); michael@0: *cut = (int *) malloc(sizeof(int) * word_size); michael@0: for (k = 0; k < word_size; k++) { michael@0: (*rep)[k] = NULL; michael@0: (*pos)[k] = 0; michael@0: (*cut)[k] = 0; michael@0: } michael@0: } michael@0: (*rep)[begin + j] = rep2[j]; michael@0: (*pos)[begin + j] = pos2[j]; michael@0: (*cut)[begin + j] = cut2[j]; michael@0: } michael@0: } michael@0: prep_word[i + 2] = word[i + 1]; michael@0: if (*rep && *pos && *cut && (*rep)[i]) { michael@0: size_t offset = 1; michael@0: strncpy(prep_word + offset, word, prep_word_size - offset - 1); michael@0: prep_word[prep_word_size - 1] = '\0'; michael@0: } michael@0: } michael@0: begin = i + 1; michael@0: for (j = 0; j < word_size; j++) rep2[j] = NULL; michael@0: } michael@0: michael@0: // non-compound michael@0: if (begin == 0) { michael@0: hnj_hyphen_hyph_(dict->nextlevel, word, word_size, michael@0: hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); michael@0: if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, michael@0: rep, pos, cut, clhmin); michael@0: if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, michael@0: rep, pos, cut, crhmin); michael@0: } michael@0: michael@0: free(rep2); michael@0: free(cut2); michael@0: free(pos2); michael@0: free(hyphens2); michael@0: } michael@0: michael@0: hnj_free (prep_word); michael@0: return 0; michael@0: } michael@0: michael@0: /* UTF-8 normalization of hyphen and non-standard positions */ michael@0: int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, michael@0: char *** rep, int ** pos, int ** cut) michael@0: { michael@0: int i, j, k; michael@0: if ((((unsigned char) word[0]) >> 6) == 2) { michael@0: fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); michael@0: return 1; michael@0: } michael@0: michael@0: /* calculate UTF-8 character positions */ michael@0: for (i = 0, j = -1; i < word_size; i++) { michael@0: /* beginning of an UTF-8 character (not '10' start bits) */ michael@0: if ((((unsigned char) word[i]) >> 6) != 2) j++; michael@0: hyphens[j] = hyphens[i]; michael@0: if (rep && pos && cut && *rep && *pos && *cut) { michael@0: int l = (*pos)[i]; michael@0: (*pos)[j] = 0; michael@0: for (k = 0; k < l; k++) { michael@0: if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; michael@0: } michael@0: k = i - l + 1; michael@0: l = k + (*cut)[i]; michael@0: (*cut)[j] = 0; michael@0: for (; k < l; k++) { michael@0: if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; michael@0: } michael@0: (*rep)[j] = (*rep)[i]; michael@0: if (j < i) { michael@0: (*rep)[i] = NULL; michael@0: (*pos)[i] = 0; michael@0: (*cut)[i] = 0; michael@0: } michael@0: } michael@0: } michael@0: hyphens[j + 1] = '\0'; michael@0: #ifdef VERBOSE michael@0: printf ("nums: %s\n", hyphens); michael@0: #endif michael@0: return 0; michael@0: } michael@0: michael@0: /* get the word with all possible hyphenations (output: hyphword) */ michael@0: void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, michael@0: char * hyphword, char *** rep, int ** pos, int ** cut) michael@0: { michael@0: int hyphenslen = l + 5; michael@0: michael@0: int i, j; michael@0: for (i = 0, j = 0; i < l; i++, j++) { michael@0: if (hyphens[i]&1) { michael@0: hyphword[j] = word[i]; michael@0: if (*rep && *pos && *cut && (*rep)[i]) { michael@0: size_t offset = j - (*pos)[i] + 1; michael@0: strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1); michael@0: hyphword[hyphenslen-1] = '\0'; michael@0: j += strlen((*rep)[i]) - (*pos)[i]; michael@0: i += (*cut)[i] - (*pos)[i]; michael@0: } else hyphword[++j] = '='; michael@0: } else hyphword[j] = word[i]; michael@0: } michael@0: hyphword[j] = '\0'; michael@0: } michael@0: michael@0: michael@0: /* main api function with default hyphenmin parameters */ michael@0: int hnj_hyphen_hyphenate2 (HyphenDict *dict, michael@0: const char *word, int word_size, char * hyphens, michael@0: char *hyphword, char *** rep, int ** pos, int ** cut) michael@0: { michael@0: hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, michael@0: dict->clhmin, dict->crhmin, 1, 1); michael@0: hnj_hyphen_lhmin(dict->utf8, word, word_size, michael@0: hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); michael@0: hnj_hyphen_rhmin(dict->utf8, word, word_size, michael@0: hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); michael@0: michael@0: /* nohyphen */ michael@0: if (dict->nohyphen) { michael@0: char * nh = dict->nohyphen; michael@0: int nhi; michael@0: for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { michael@0: char * nhy = (char *) strstr(word, nh); michael@0: while (nhy) { michael@0: hyphens[nhy - word + strlen(nh) - 1] = '0'; michael@0: if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; michael@0: nhy = (char *) strstr(nhy + 1, nh); michael@0: } michael@0: nh = nh + strlen(nh) + 1; michael@0: } michael@0: } michael@0: michael@0: if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); michael@0: if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); michael@0: #ifdef VERBOSE michael@0: printf ("nums: %s\n", hyphens); michael@0: #endif michael@0: return 0; michael@0: } michael@0: michael@0: /* previous main api function with hyphenmin parameters */ michael@0: int hnj_hyphen_hyphenate3 (HyphenDict *dict, michael@0: const char *word, int word_size, char * hyphens, michael@0: char *hyphword, char *** rep, int ** pos, int ** cut, michael@0: int lhmin, int rhmin, int clhmin, int crhmin) michael@0: { michael@0: lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; michael@0: rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; michael@0: clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; michael@0: crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; michael@0: hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, michael@0: clhmin, crhmin, 1, 1); michael@0: hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, michael@0: rep, pos, cut, (lhmin > 0 ? lhmin : 2)); michael@0: hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, michael@0: rep, pos, cut, (rhmin > 0 ? rhmin : 2)); michael@0: if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); michael@0: michael@0: /* nohyphen */ michael@0: if (dict->nohyphen) { michael@0: char * nh = dict->nohyphen; michael@0: int nhi; michael@0: for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { michael@0: char * nhy = (char *) strstr(word, nh); michael@0: while (nhy) { michael@0: hyphens[nhy - word + strlen(nh) - 1] = 0; michael@0: if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0; michael@0: nhy = (char *) strstr(nhy + 1, nh); michael@0: } michael@0: nh = nh + strlen(nh) + 1; michael@0: } michael@0: } michael@0: michael@0: if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); michael@0: return 0; michael@0: }