intl/icu/source/tools/toolutil/ucmstate.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2003-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucmstate.c
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2003oct09
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * This file handles ICU .ucm file state information as part of the ucm module.
michael@0 17 * Most of this code used to be in makeconv.c.
michael@0 18 */
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21 #include "cstring.h"
michael@0 22 #include "cmemory.h"
michael@0 23 #include "uarrsort.h"
michael@0 24 #include "ucnvmbcs.h"
michael@0 25 #include "ucnv_ext.h"
michael@0 26 #include "uparse.h"
michael@0 27 #include "ucm.h"
michael@0 28 #include <stdio.h>
michael@0 29
michael@0 30 #if !UCONFIG_NO_CONVERSION
michael@0 31
michael@0 32 /* MBCS state handling ------------------------------------------------------ */
michael@0 33
michael@0 34 /*
michael@0 35 * state table row grammar (ebnf-style):
michael@0 36 * (whitespace is allowed between all tokens)
michael@0 37 *
michael@0 38 * row=[[firstentry ','] entry (',' entry)*]
michael@0 39 * firstentry="initial" | "surrogates"
michael@0 40 * (initial state (default for state 0), output is all surrogate pairs)
michael@0 41 * entry=range [':' nextstate] ['.' action]
michael@0 42 * range=number ['-' number]
michael@0 43 * nextstate=number
michael@0 44 * (0..7f)
michael@0 45 * action='u' | 's' | 'p' | 'i'
michael@0 46 * (unassigned, state change only, surrogate pair, illegal)
michael@0 47 * number=(1- or 2-digit hexadecimal number)
michael@0 48 */
michael@0 49 static const char *
michael@0 50 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
michael@0 51 const char *t;
michael@0 52 uint32_t start, end, i;
michael@0 53 int32_t entry;
michael@0 54
michael@0 55 /* initialize the state: all illegal with U+ffff */
michael@0 56 for(i=0; i<256; ++i) {
michael@0 57 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
michael@0 58 }
michael@0 59
michael@0 60 /* skip leading white space */
michael@0 61 s=u_skipWhitespace(s);
michael@0 62
michael@0 63 /* is there an "initial" or "surrogates" directive? */
michael@0 64 if(uprv_strncmp("initial", s, 7)==0) {
michael@0 65 *pFlags=MBCS_STATE_FLAG_DIRECT;
michael@0 66 s=u_skipWhitespace(s+7);
michael@0 67 if(*s++!=',') {
michael@0 68 return s-1;
michael@0 69 }
michael@0 70 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
michael@0 71 *pFlags=MBCS_STATE_FLAG_SURROGATES;
michael@0 72 s=u_skipWhitespace(s+10);
michael@0 73 if(*s++!=',') {
michael@0 74 return s-1;
michael@0 75 }
michael@0 76 } else if(*s==0) {
michael@0 77 /* empty state row: all-illegal */
michael@0 78 return NULL;
michael@0 79 }
michael@0 80
michael@0 81 for(;;) {
michael@0 82 /* read an entry, the start of the range first */
michael@0 83 s=u_skipWhitespace(s);
michael@0 84 start=uprv_strtoul(s, (char **)&t, 16);
michael@0 85 if(s==t || 0xff<start) {
michael@0 86 return s;
michael@0 87 }
michael@0 88 s=u_skipWhitespace(t);
michael@0 89
michael@0 90 /* read the end of the range if there is one */
michael@0 91 if(*s=='-') {
michael@0 92 s=u_skipWhitespace(s+1);
michael@0 93 end=uprv_strtoul(s, (char **)&t, 16);
michael@0 94 if(s==t || end<start || 0xff<end) {
michael@0 95 return s;
michael@0 96 }
michael@0 97 s=u_skipWhitespace(t);
michael@0 98 } else {
michael@0 99 end=start;
michael@0 100 }
michael@0 101
michael@0 102 /* determine the state entrys for this range */
michael@0 103 if(*s!=':' && *s!='.') {
michael@0 104 /* the default is: final state with valid entries */
michael@0 105 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
michael@0 106 } else {
michael@0 107 entry=MBCS_ENTRY_TRANSITION(0, 0);
michael@0 108 if(*s==':') {
michael@0 109 /* get the next state, default to 0 */
michael@0 110 s=u_skipWhitespace(s+1);
michael@0 111 i=uprv_strtoul(s, (char **)&t, 16);
michael@0 112 if(s!=t) {
michael@0 113 if(0x7f<i) {
michael@0 114 return s;
michael@0 115 }
michael@0 116 s=u_skipWhitespace(t);
michael@0 117 entry=MBCS_ENTRY_SET_STATE(entry, i);
michael@0 118 }
michael@0 119 }
michael@0 120
michael@0 121 /* get the state action, default to valid */
michael@0 122 if(*s=='.') {
michael@0 123 /* this is a final state */
michael@0 124 entry=MBCS_ENTRY_SET_FINAL(entry);
michael@0 125
michael@0 126 s=u_skipWhitespace(s+1);
michael@0 127 if(*s=='u') {
michael@0 128 /* unassigned set U+fffe */
michael@0 129 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
michael@0 130 s=u_skipWhitespace(s+1);
michael@0 131 } else if(*s=='p') {
michael@0 132 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
michael@0 133 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
michael@0 134 } else {
michael@0 135 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
michael@0 136 }
michael@0 137 s=u_skipWhitespace(s+1);
michael@0 138 } else if(*s=='s') {
michael@0 139 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
michael@0 140 s=u_skipWhitespace(s+1);
michael@0 141 } else if(*s=='i') {
michael@0 142 /* illegal set U+ffff */
michael@0 143 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
michael@0 144 s=u_skipWhitespace(s+1);
michael@0 145 } else {
michael@0 146 /* default to valid */
michael@0 147 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
michael@0 148 }
michael@0 149 } else {
michael@0 150 /* this is an intermediate state, nothing to do */
michael@0 151 }
michael@0 152 }
michael@0 153
michael@0 154 /* adjust "final valid" states according to the state flags */
michael@0 155 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
michael@0 156 switch(*pFlags) {
michael@0 157 case 0:
michael@0 158 /* no adjustment */
michael@0 159 break;
michael@0 160 case MBCS_STATE_FLAG_DIRECT:
michael@0 161 /* set the valid-direct code point to "unassigned"==0xfffe */
michael@0 162 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
michael@0 163 break;
michael@0 164 case MBCS_STATE_FLAG_SURROGATES:
michael@0 165 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
michael@0 166 break;
michael@0 167 default:
michael@0 168 break;
michael@0 169 }
michael@0 170 }
michael@0 171
michael@0 172 /* set this entry for the range */
michael@0 173 for(i=start; i<=end; ++i) {
michael@0 174 state[i]=entry;
michael@0 175 }
michael@0 176
michael@0 177 if(*s==',') {
michael@0 178 ++s;
michael@0 179 } else {
michael@0 180 return *s==0 ? NULL : s;
michael@0 181 }
michael@0 182 }
michael@0 183 }
michael@0 184
michael@0 185 U_CAPI void U_EXPORT2
michael@0 186 ucm_addState(UCMStates *states, const char *s) {
michael@0 187 const char *error;
michael@0 188
michael@0 189 if(states->countStates==MBCS_MAX_STATE_COUNT) {
michael@0 190 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
michael@0 191 exit(U_INVALID_TABLE_FORMAT);
michael@0 192 }
michael@0 193
michael@0 194 error=parseState(s, states->stateTable[states->countStates],
michael@0 195 &states->stateFlags[states->countStates]);
michael@0 196 if(error!=NULL) {
michael@0 197 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
michael@0 198 exit(U_INVALID_TABLE_FORMAT);
michael@0 199 }
michael@0 200
michael@0 201 ++states->countStates;
michael@0 202 }
michael@0 203
michael@0 204 U_CAPI UBool U_EXPORT2
michael@0 205 ucm_parseHeaderLine(UCMFile *ucm,
michael@0 206 char *line, char **pKey, char **pValue) {
michael@0 207 UCMStates *states;
michael@0 208 char *s, *end;
michael@0 209 char c;
michael@0 210
michael@0 211 states=&ucm->states;
michael@0 212
michael@0 213 /* remove comments and trailing CR and LF and remove whitespace from the end */
michael@0 214 for(end=line; (c=*end)!=0; ++end) {
michael@0 215 if(c=='#' || c=='\r' || c=='\n') {
michael@0 216 break;
michael@0 217 }
michael@0 218 }
michael@0 219 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
michael@0 220 --end;
michael@0 221 }
michael@0 222 *end=0;
michael@0 223
michael@0 224 /* skip leading white space and ignore empty lines */
michael@0 225 s=(char *)u_skipWhitespace(line);
michael@0 226 if(*s==0) {
michael@0 227 return TRUE;
michael@0 228 }
michael@0 229
michael@0 230 /* stop at the beginning of the mapping section */
michael@0 231 if(uprv_memcmp(s, "CHARMAP", 7)==0) {
michael@0 232 return FALSE;
michael@0 233 }
michael@0 234
michael@0 235 /* get the key name, bracketed in <> */
michael@0 236 if(*s!='<') {
michael@0 237 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
michael@0 238 exit(U_INVALID_TABLE_FORMAT);
michael@0 239 }
michael@0 240 *pKey=++s;
michael@0 241 while(*s!='>') {
michael@0 242 if(*s==0) {
michael@0 243 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
michael@0 244 exit(U_INVALID_TABLE_FORMAT);
michael@0 245 }
michael@0 246 ++s;
michael@0 247 }
michael@0 248 *s=0;
michael@0 249
michael@0 250 /* get the value string, possibly quoted */
michael@0 251 s=(char *)u_skipWhitespace(s+1);
michael@0 252 if(*s!='"') {
michael@0 253 *pValue=s;
michael@0 254 } else {
michael@0 255 /* remove the quotes */
michael@0 256 *pValue=s+1;
michael@0 257 if(end>*pValue && *(end-1)=='"') {
michael@0 258 *--end=0;
michael@0 259 }
michael@0 260 }
michael@0 261
michael@0 262 /* collect the information from the header field, ignore unknown keys */
michael@0 263 if(uprv_strcmp(*pKey, "uconv_class")==0) {
michael@0 264 if(uprv_strcmp(*pValue, "DBCS")==0) {
michael@0 265 states->conversionType=UCNV_DBCS;
michael@0 266 } else if(uprv_strcmp(*pValue, "SBCS")==0) {
michael@0 267 states->conversionType = UCNV_SBCS;
michael@0 268 } else if(uprv_strcmp(*pValue, "MBCS")==0) {
michael@0 269 states->conversionType = UCNV_MBCS;
michael@0 270 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
michael@0 271 states->conversionType = UCNV_EBCDIC_STATEFUL;
michael@0 272 } else {
michael@0 273 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
michael@0 274 exit(U_INVALID_TABLE_FORMAT);
michael@0 275 }
michael@0 276 return TRUE;
michael@0 277 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
michael@0 278 c=**pValue;
michael@0 279 if('1'<=c && c<='4' && (*pValue)[1]==0) {
michael@0 280 states->maxCharLength=(int8_t)(c-'0');
michael@0 281 states->outputType=(int8_t)(states->maxCharLength-1);
michael@0 282 } else {
michael@0 283 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
michael@0 284 exit(U_INVALID_TABLE_FORMAT);
michael@0 285 }
michael@0 286 return TRUE;
michael@0 287 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
michael@0 288 c=**pValue;
michael@0 289 if('1'<=c && c<='4' && (*pValue)[1]==0) {
michael@0 290 states->minCharLength=(int8_t)(c-'0');
michael@0 291 } else {
michael@0 292 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
michael@0 293 exit(U_INVALID_TABLE_FORMAT);
michael@0 294 }
michael@0 295 return TRUE;
michael@0 296 } else if(uprv_strcmp(*pKey, "icu:state")==0) {
michael@0 297 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
michael@0 298 switch(states->conversionType) {
michael@0 299 case UCNV_SBCS:
michael@0 300 case UCNV_DBCS:
michael@0 301 case UCNV_EBCDIC_STATEFUL:
michael@0 302 states->conversionType=UCNV_MBCS;
michael@0 303 break;
michael@0 304 case UCNV_MBCS:
michael@0 305 break;
michael@0 306 default:
michael@0 307 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
michael@0 308 exit(U_INVALID_TABLE_FORMAT);
michael@0 309 }
michael@0 310
michael@0 311 if(states->maxCharLength==0) {
michael@0 312 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
michael@0 313 exit(U_INVALID_TABLE_FORMAT);
michael@0 314 }
michael@0 315 ucm_addState(states, *pValue);
michael@0 316 return TRUE;
michael@0 317 } else if(uprv_strcmp(*pKey, "icu:base")==0) {
michael@0 318 if(**pValue==0) {
michael@0 319 fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
michael@0 320 exit(U_INVALID_TABLE_FORMAT);
michael@0 321 }
michael@0 322 uprv_strcpy(ucm->baseName, *pValue);
michael@0 323 return TRUE;
michael@0 324 }
michael@0 325
michael@0 326 return FALSE;
michael@0 327 }
michael@0 328
michael@0 329 /* post-processing ---------------------------------------------------------- */
michael@0 330
michael@0 331 static int32_t
michael@0 332 sumUpStates(UCMStates *states) {
michael@0 333 int32_t entry, sum, state, cell, count;
michael@0 334 UBool allStatesReady;
michael@0 335
michael@0 336 /*
michael@0 337 * Sum up the offsets for all states.
michael@0 338 * In each final state (where there are only final entries),
michael@0 339 * the offsets add up directly.
michael@0 340 * In all other state table rows, for each transition entry to another state,
michael@0 341 * the offsets sum of that state needs to be added.
michael@0 342 * This is achieved in at most countStates iterations.
michael@0 343 */
michael@0 344 allStatesReady=FALSE;
michael@0 345 for(count=states->countStates; !allStatesReady && count>=0; --count) {
michael@0 346 allStatesReady=TRUE;
michael@0 347 for(state=states->countStates-1; state>=0; --state) {
michael@0 348 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
michael@0 349 allStatesReady=FALSE;
michael@0 350 sum=0;
michael@0 351
michael@0 352 /* at first, add up only the final delta offsets to keep them <512 */
michael@0 353 for(cell=0; cell<256; ++cell) {
michael@0 354 entry=states->stateTable[state][cell];
michael@0 355 if(MBCS_ENTRY_IS_FINAL(entry)) {
michael@0 356 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 357 case MBCS_STATE_VALID_16:
michael@0 358 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
michael@0 359 sum+=1;
michael@0 360 break;
michael@0 361 case MBCS_STATE_VALID_16_PAIR:
michael@0 362 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
michael@0 363 sum+=2;
michael@0 364 break;
michael@0 365 default:
michael@0 366 /* no addition */
michael@0 367 break;
michael@0 368 }
michael@0 369 }
michael@0 370 }
michael@0 371
michael@0 372 /* now, add up the delta offsets for the transitional entries */
michael@0 373 for(cell=0; cell<256; ++cell) {
michael@0 374 entry=states->stateTable[state][cell];
michael@0 375 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 376 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
michael@0 377 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
michael@0 378 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
michael@0 379 } else {
michael@0 380 /* that next state does not have a sum yet, we cannot finish the one for this state */
michael@0 381 sum=-1;
michael@0 382 break;
michael@0 383 }
michael@0 384 }
michael@0 385 }
michael@0 386
michael@0 387 if(sum!=-1) {
michael@0 388 states->stateOffsetSum[state]=sum;
michael@0 389 states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
michael@0 390 }
michael@0 391 }
michael@0 392 }
michael@0 393 }
michael@0 394
michael@0 395 if(!allStatesReady) {
michael@0 396 fprintf(stderr, "ucm error: the state table contains loops\n");
michael@0 397 exit(U_INVALID_TABLE_FORMAT);
michael@0 398 }
michael@0 399
michael@0 400 /*
michael@0 401 * For all "direct" (i.e., initial) states>0,
michael@0 402 * the offsets need to be increased by the sum of
michael@0 403 * the previous initial states.
michael@0 404 */
michael@0 405 sum=states->stateOffsetSum[0];
michael@0 406 for(state=1; state<states->countStates; ++state) {
michael@0 407 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 408 int32_t sum2=sum;
michael@0 409 sum+=states->stateOffsetSum[state];
michael@0 410 for(cell=0; cell<256; ++cell) {
michael@0 411 entry=states->stateTable[state][cell];
michael@0 412 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 413 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
michael@0 414 }
michael@0 415 }
michael@0 416 }
michael@0 417 }
michael@0 418
michael@0 419 /* round up to the next even number to have the following data 32-bit-aligned */
michael@0 420 return states->countToUCodeUnits=(sum+1)&~1;
michael@0 421 }
michael@0 422
michael@0 423 U_CAPI void U_EXPORT2
michael@0 424 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
michael@0 425 int32_t entry, state, cell, count;
michael@0 426
michael@0 427 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
michael@0 428 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
michael@0 429 exit(U_INVALID_TABLE_FORMAT);
michael@0 430 }
michael@0 431
michael@0 432 if(states->countStates==0) {
michael@0 433 switch(states->conversionType) {
michael@0 434 case UCNV_SBCS:
michael@0 435 /* SBCS: use MBCS data structure with a default state table */
michael@0 436 if(states->maxCharLength!=1) {
michael@0 437 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
michael@0 438 exit(U_INVALID_TABLE_FORMAT);
michael@0 439 }
michael@0 440 states->conversionType=UCNV_MBCS;
michael@0 441 ucm_addState(states, "0-ff");
michael@0 442 break;
michael@0 443 case UCNV_MBCS:
michael@0 444 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
michael@0 445 exit(U_INVALID_TABLE_FORMAT);
michael@0 446 break;
michael@0 447 case UCNV_EBCDIC_STATEFUL:
michael@0 448 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
michael@0 449 if(states->minCharLength!=1 || states->maxCharLength!=2) {
michael@0 450 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
michael@0 451 exit(U_INVALID_TABLE_FORMAT);
michael@0 452 }
michael@0 453 states->conversionType=UCNV_MBCS;
michael@0 454 ucm_addState(states, "0-ff, e:1.s, f:0.s");
michael@0 455 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
michael@0 456 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
michael@0 457 ucm_addState(states, "0-ff:1.i, 40:1.");
michael@0 458 ucm_addState(states, "0-ff:1.i");
michael@0 459 break;
michael@0 460 case UCNV_DBCS:
michael@0 461 /* DBCS: use MBCS data structure with a default state table */
michael@0 462 if(states->minCharLength!=2 || states->maxCharLength!=2) {
michael@0 463 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
michael@0 464 exit(U_INVALID_TABLE_FORMAT);
michael@0 465 }
michael@0 466 states->conversionType = UCNV_MBCS;
michael@0 467 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
michael@0 468 ucm_addState(states, "41-fe");
michael@0 469 ucm_addState(states, "40");
michael@0 470 ucm_addState(states, "");
michael@0 471 break;
michael@0 472 default:
michael@0 473 fprintf(stderr, "ucm error: unknown charset structure\n");
michael@0 474 exit(U_INVALID_TABLE_FORMAT);
michael@0 475 break;
michael@0 476 }
michael@0 477 }
michael@0 478
michael@0 479 /*
michael@0 480 * check that the min/max character lengths are reasonable;
michael@0 481 * to do this right, all paths through the state table would have to be
michael@0 482 * recursively walked while keeping track of the sequence lengths,
michael@0 483 * but these simple checks cover most state tables in practice
michael@0 484 */
michael@0 485 if(states->maxCharLength<states->minCharLength) {
michael@0 486 fprintf(stderr, "ucm error: max B/char < min B/char\n");
michael@0 487 exit(U_INVALID_TABLE_FORMAT);
michael@0 488 }
michael@0 489
michael@0 490 /* count non-direct states and compare with max B/char */
michael@0 491 count=0;
michael@0 492 for(state=0; state<states->countStates; ++state) {
michael@0 493 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
michael@0 494 ++count;
michael@0 495 }
michael@0 496 }
michael@0 497 if(states->maxCharLength>count+1) {
michael@0 498 fprintf(stderr, "ucm error: max B/char too large\n");
michael@0 499 exit(U_INVALID_TABLE_FORMAT);
michael@0 500 }
michael@0 501
michael@0 502 if(states->minCharLength==1) {
michael@0 503 int32_t action;
michael@0 504
michael@0 505 /*
michael@0 506 * if there are single-byte characters,
michael@0 507 * then the initial state must have direct result states
michael@0 508 */
michael@0 509 for(cell=0; cell<256; ++cell) {
michael@0 510 entry=states->stateTable[0][cell];
michael@0 511 if( MBCS_ENTRY_IS_FINAL(entry) &&
michael@0 512 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
michael@0 513 action==MBCS_STATE_UNASSIGNED)
michael@0 514 ) {
michael@0 515 break;
michael@0 516 }
michael@0 517 }
michael@0 518
michael@0 519 if(cell==256) {
michael@0 520 fprintf(stderr, "ucm warning: min B/char too small\n");
michael@0 521 }
michael@0 522 }
michael@0 523
michael@0 524 /*
michael@0 525 * make sure that all "next state" values are within limits
michael@0 526 * and that all next states after final ones have the "direct"
michael@0 527 * flag of initial states
michael@0 528 */
michael@0 529 for(state=states->countStates-1; state>=0; --state) {
michael@0 530 for(cell=0; cell<256; ++cell) {
michael@0 531 entry=states->stateTable[state][cell];
michael@0 532 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
michael@0 533 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
michael@0 534 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
michael@0 535 exit(U_INVALID_TABLE_FORMAT);
michael@0 536 }
michael@0 537 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
michael@0 538 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
michael@0 539 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
michael@0 540 exit(U_INVALID_TABLE_FORMAT);
michael@0 541 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 542 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
michael@0 543 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
michael@0 544 exit(U_INVALID_TABLE_FORMAT);
michael@0 545 }
michael@0 546 }
michael@0 547 }
michael@0 548
michael@0 549 /* is this an SI/SO (like EBCDIC-stateful) state table? */
michael@0 550 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 551 if(states->maxCharLength!=2) {
michael@0 552 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
michael@0 553 exit(U_INVALID_TABLE_FORMAT);
michael@0 554 }
michael@0 555 if(states->countStates<3) {
michael@0 556 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
michael@0 557 exit(U_INVALID_TABLE_FORMAT);
michael@0 558 }
michael@0 559 /* are the SI/SO all in the right places? */
michael@0 560 if( ignoreSISOCheck ||
michael@0 561 (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
michael@0 562 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
michael@0 563 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
michael@0 564 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
michael@0 565 ) {
michael@0 566 states->outputType=MBCS_OUTPUT_2_SISO;
michael@0 567 } else {
michael@0 568 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
michael@0 569 exit(U_INVALID_TABLE_FORMAT);
michael@0 570 }
michael@0 571 state=2;
michael@0 572 } else {
michael@0 573 state=1;
michael@0 574 }
michael@0 575
michael@0 576 /* check that no unexpected state is a "direct" one */
michael@0 577 while(state<states->countStates) {
michael@0 578 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 579 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
michael@0 580 exit(U_INVALID_TABLE_FORMAT);
michael@0 581 }
michael@0 582 ++state;
michael@0 583 }
michael@0 584
michael@0 585 sumUpStates(states);
michael@0 586 }
michael@0 587
michael@0 588 /* find a fallback for this offset; return the index or -1 if not found */
michael@0 589 U_CAPI int32_t U_EXPORT2
michael@0 590 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
michael@0 591 uint32_t offset) {
michael@0 592 int32_t i;
michael@0 593
michael@0 594 if(countToUFallbacks==0) {
michael@0 595 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
michael@0 596 return -1;
michael@0 597 }
michael@0 598
michael@0 599 /* do a linear search for the fallback mapping (the table is not yet sorted) */
michael@0 600 for(i=0; i<countToUFallbacks; ++i) {
michael@0 601 if(offset==toUFallbacks[i].offset) {
michael@0 602 return i;
michael@0 603 }
michael@0 604 }
michael@0 605 return -1;
michael@0 606 }
michael@0 607
michael@0 608 /*
michael@0 609 * This function tries to compact toUnicode tables for 2-byte codepages
michael@0 610 * by finding lead bytes with all-unassigned trail bytes and adding another state
michael@0 611 * for them.
michael@0 612 */
michael@0 613 static void
michael@0 614 compactToUnicode2(UCMStates *states,
michael@0 615 uint16_t **pUnicodeCodeUnits,
michael@0 616 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
michael@0 617 UBool verbose) {
michael@0 618 int32_t (*oldStateTable)[256];
michael@0 619 uint16_t count[256];
michael@0 620 uint16_t *oldUnicodeCodeUnits;
michael@0 621 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
michael@0 622 int32_t i, j, leadState, trailState, newState, fallback;
michael@0 623 uint16_t unit;
michael@0 624
michael@0 625 /* find the lead state */
michael@0 626 if(states->outputType==MBCS_OUTPUT_2_SISO) {
michael@0 627 /* use the DBCS lead state for SI/SO codepages */
michael@0 628 leadState=1;
michael@0 629 } else {
michael@0 630 leadState=0;
michael@0 631 }
michael@0 632
michael@0 633 /* find the main trail state: the most used target state */
michael@0 634 uprv_memset(count, 0, sizeof(count));
michael@0 635 for(i=0; i<256; ++i) {
michael@0 636 entry=states->stateTable[leadState][i];
michael@0 637 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 638 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
michael@0 639 }
michael@0 640 }
michael@0 641 trailState=0;
michael@0 642 for(i=1; i<states->countStates; ++i) {
michael@0 643 if(count[i]>count[trailState]) {
michael@0 644 trailState=i;
michael@0 645 }
michael@0 646 }
michael@0 647
michael@0 648 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
michael@0 649 uprv_memset(count, 0, sizeof(count));
michael@0 650 savings=0;
michael@0 651 /* for each lead byte */
michael@0 652 for(i=0; i<256; ++i) {
michael@0 653 entry=states->stateTable[leadState][i];
michael@0 654 if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
michael@0 655 /* the offset is different for each lead byte */
michael@0 656 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
michael@0 657 /* for each trail byte for this lead byte */
michael@0 658 for(j=0; j<256; ++j) {
michael@0 659 entry=states->stateTable[trailState][j];
michael@0 660 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 661 case MBCS_STATE_VALID_16:
michael@0 662 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 663 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
michael@0 664 ++count[i];
michael@0 665 } else {
michael@0 666 j=999; /* do not count for this lead byte because there are assignments */
michael@0 667 }
michael@0 668 break;
michael@0 669 case MBCS_STATE_VALID_16_PAIR:
michael@0 670 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 671 if((*pUnicodeCodeUnits)[entry]==0xfffe) {
michael@0 672 count[i]+=2;
michael@0 673 } else {
michael@0 674 j=999; /* do not count for this lead byte because there are assignments */
michael@0 675 }
michael@0 676 break;
michael@0 677 default:
michael@0 678 break;
michael@0 679 }
michael@0 680 }
michael@0 681 if(j==256) {
michael@0 682 /* all trail bytes for this lead byte are unassigned */
michael@0 683 savings+=count[i];
michael@0 684 } else {
michael@0 685 count[i]=0;
michael@0 686 }
michael@0 687 }
michael@0 688 }
michael@0 689 /* subtract from the possible savings the cost of an additional state */
michael@0 690 savings=savings*2-1024; /* count bytes, not 16-bit words */
michael@0 691 if(savings<=0) {
michael@0 692 return;
michael@0 693 }
michael@0 694 if(verbose) {
michael@0 695 printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
michael@0 696 }
michael@0 697 if(states->countStates>=MBCS_MAX_STATE_COUNT) {
michael@0 698 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
michael@0 699 return;
michael@0 700 }
michael@0 701
michael@0 702 /* make a copy of the state table */
michael@0 703 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
michael@0 704 if(oldStateTable==NULL) {
michael@0 705 fprintf(stderr, "cannot compact toUnicode: out of memory\n");
michael@0 706 return;
michael@0 707 }
michael@0 708 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
michael@0 709
michael@0 710 /* add the new state */
michael@0 711 /*
michael@0 712 * this function does not catch the degenerate case where all lead bytes
michael@0 713 * have all-unassigned trail bytes and the lead state could be removed
michael@0 714 */
michael@0 715 newState=states->countStates++;
michael@0 716 states->stateFlags[newState]=0;
michael@0 717 /* copy the old trail state, turning all assigned states into unassigned ones */
michael@0 718 for(i=0; i<256; ++i) {
michael@0 719 entry=states->stateTable[trailState][i];
michael@0 720 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 721 case MBCS_STATE_VALID_16:
michael@0 722 case MBCS_STATE_VALID_16_PAIR:
michael@0 723 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
michael@0 724 break;
michael@0 725 default:
michael@0 726 states->stateTable[newState][i]=entry;
michael@0 727 break;
michael@0 728 }
michael@0 729 }
michael@0 730
michael@0 731 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
michael@0 732 for(i=0; i<256; ++i) {
michael@0 733 if(count[i]>0) {
michael@0 734 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
michael@0 735 }
michael@0 736 }
michael@0 737
michael@0 738 /* sum up the new state table */
michael@0 739 for(i=0; i<states->countStates; ++i) {
michael@0 740 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
michael@0 741 }
michael@0 742 sum=sumUpStates(states);
michael@0 743
michael@0 744 /* allocate a new, smaller code units array */
michael@0 745 oldUnicodeCodeUnits=*pUnicodeCodeUnits;
michael@0 746 if(sum==0) {
michael@0 747 *pUnicodeCodeUnits=NULL;
michael@0 748 if(oldUnicodeCodeUnits!=NULL) {
michael@0 749 uprv_free(oldUnicodeCodeUnits);
michael@0 750 }
michael@0 751 uprv_free(oldStateTable);
michael@0 752 return;
michael@0 753 }
michael@0 754 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
michael@0 755 if(*pUnicodeCodeUnits==NULL) {
michael@0 756 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
michael@0 757 (long)sum);
michael@0 758 /* revert to the old state table */
michael@0 759 *pUnicodeCodeUnits=oldUnicodeCodeUnits;
michael@0 760 --states->countStates;
michael@0 761 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
michael@0 762 uprv_free(oldStateTable);
michael@0 763 return;
michael@0 764 }
michael@0 765 for(i=0; i<sum; ++i) {
michael@0 766 (*pUnicodeCodeUnits)[i]=0xfffe;
michael@0 767 }
michael@0 768
michael@0 769 /* copy the code units for all assigned characters */
michael@0 770 /*
michael@0 771 * The old state table has the same lead _and_ trail states for assigned characters!
michael@0 772 * The differences are in the offsets, and in the trail states for some unassigned characters.
michael@0 773 * For each character with an assigned state in the new table, it was assigned in the old one.
michael@0 774 * Only still-assigned characters are copied.
michael@0 775 * Note that fallback mappings need to get their offset values adjusted.
michael@0 776 */
michael@0 777
michael@0 778 /* for each initial state */
michael@0 779 for(leadState=0; leadState<states->countStates; ++leadState) {
michael@0 780 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 781 /* for each lead byte from there */
michael@0 782 for(i=0; i<256; ++i) {
michael@0 783 entry=states->stateTable[leadState][i];
michael@0 784 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 785 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
michael@0 786 /* the new state does not have assigned states */
michael@0 787 if(trailState!=newState) {
michael@0 788 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
michael@0 789 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
michael@0 790 /* for each trail byte */
michael@0 791 for(j=0; j<256; ++j) {
michael@0 792 entry=states->stateTable[trailState][j];
michael@0 793 /* copy assigned-character code units and adjust fallback offsets */
michael@0 794 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 795 case MBCS_STATE_VALID_16:
michael@0 796 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 797 /* find the old offset according to the old state table */
michael@0 798 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
michael@0 799 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
michael@0 800 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
michael@0 801 toUFallbacks[fallback].offset=0x80000000|offset;
michael@0 802 }
michael@0 803 break;
michael@0 804 case MBCS_STATE_VALID_16_PAIR:
michael@0 805 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 806 /* find the old offset according to the old state table */
michael@0 807 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
michael@0 808 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
michael@0 809 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
michael@0 810 break;
michael@0 811 default:
michael@0 812 break;
michael@0 813 }
michael@0 814 }
michael@0 815 }
michael@0 816 }
michael@0 817 }
michael@0 818 }
michael@0 819 }
michael@0 820
michael@0 821 /* remove temporary flags from fallback offsets that protected them from being modified twice */
michael@0 822 for(i=0; i<countToUFallbacks; ++i) {
michael@0 823 toUFallbacks[i].offset&=0x7fffffff;
michael@0 824 }
michael@0 825
michael@0 826 /* free temporary memory */
michael@0 827 uprv_free(oldUnicodeCodeUnits);
michael@0 828 uprv_free(oldStateTable);
michael@0 829 }
michael@0 830
michael@0 831 /*
michael@0 832 * recursive sub-function of compactToUnicodeHelper()
michael@0 833 * returns:
michael@0 834 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
michael@0 835 * if all sequences from this state are unassigned, returns the
michael@0 836 * <0 there are assignments in unicodeCodeUnits[]
michael@0 837 * 0 no use of unicodeCodeUnits[]
michael@0 838 */
michael@0 839 static int32_t
michael@0 840 findUnassigned(UCMStates *states,
michael@0 841 uint16_t *unicodeCodeUnits,
michael@0 842 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
michael@0 843 int32_t state, int32_t offset, uint32_t b) {
michael@0 844 int32_t i, entry, savings, localSavings, belowSavings;
michael@0 845 UBool haveAssigned;
michael@0 846
michael@0 847 localSavings=belowSavings=0;
michael@0 848 haveAssigned=FALSE;
michael@0 849 for(i=0; i<256; ++i) {
michael@0 850 entry=states->stateTable[state][i];
michael@0 851 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 852 savings=findUnassigned(states,
michael@0 853 unicodeCodeUnits,
michael@0 854 toUFallbacks, countToUFallbacks,
michael@0 855 MBCS_ENTRY_TRANSITION_STATE(entry),
michael@0 856 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
michael@0 857 (b<<8)|(uint32_t)i);
michael@0 858 if(savings<0) {
michael@0 859 haveAssigned=TRUE;
michael@0 860 } else if(savings>0) {
michael@0 861 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
michael@0 862 (unsigned long)((b<<8)|i), (long)state, (long)savings);
michael@0 863 belowSavings+=savings;
michael@0 864 }
michael@0 865 } else if(!haveAssigned) {
michael@0 866 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 867 case MBCS_STATE_VALID_16:
michael@0 868 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 869 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
michael@0 870 localSavings+=2;
michael@0 871 } else {
michael@0 872 haveAssigned=TRUE;
michael@0 873 }
michael@0 874 break;
michael@0 875 case MBCS_STATE_VALID_16_PAIR:
michael@0 876 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
michael@0 877 if(unicodeCodeUnits[entry]==0xfffe) {
michael@0 878 localSavings+=4;
michael@0 879 } else {
michael@0 880 haveAssigned=TRUE;
michael@0 881 }
michael@0 882 break;
michael@0 883 default:
michael@0 884 break;
michael@0 885 }
michael@0 886 }
michael@0 887 }
michael@0 888 if(haveAssigned) {
michael@0 889 return -1;
michael@0 890 } else {
michael@0 891 return localSavings+belowSavings;
michael@0 892 }
michael@0 893 }
michael@0 894
michael@0 895 /* helper function for finding compaction opportunities */
michael@0 896 static void
michael@0 897 compactToUnicodeHelper(UCMStates *states,
michael@0 898 uint16_t *unicodeCodeUnits,
michael@0 899 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
michael@0 900 int32_t state, savings;
michael@0 901
michael@0 902 /* for each initial state */
michael@0 903 for(state=0; state<states->countStates; ++state) {
michael@0 904 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
michael@0 905 savings=findUnassigned(states,
michael@0 906 unicodeCodeUnits,
michael@0 907 toUFallbacks, countToUFallbacks,
michael@0 908 state, 0, 0);
michael@0 909 if(savings>0) {
michael@0 910 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
michael@0 911 (long)state, (long)savings);
michael@0 912 }
michael@0 913 }
michael@0 914 }
michael@0 915 }
michael@0 916
michael@0 917 static int32_t
michael@0 918 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
michael@0 919 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
michael@0 920 }
michael@0 921
michael@0 922 U_CAPI void U_EXPORT2
michael@0 923 ucm_optimizeStates(UCMStates *states,
michael@0 924 uint16_t **pUnicodeCodeUnits,
michael@0 925 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
michael@0 926 UBool verbose) {
michael@0 927 UErrorCode errorCode;
michael@0 928 int32_t state, cell, entry;
michael@0 929
michael@0 930 /* test each state table entry */
michael@0 931 for(state=0; state<states->countStates; ++state) {
michael@0 932 for(cell=0; cell<256; ++cell) {
michael@0 933 entry=states->stateTable[state][cell];
michael@0 934 /*
michael@0 935 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
michael@0 936 * and the code point is "unassigned" (0xfffe), then change it to
michael@0 937 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
michael@0 938 */
michael@0 939 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
michael@0 940 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
michael@0 941 }
michael@0 942 }
michael@0 943 }
michael@0 944
michael@0 945 /* try to compact the toUnicode tables */
michael@0 946 if(states->maxCharLength==2) {
michael@0 947 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
michael@0 948 } else if(states->maxCharLength>2) {
michael@0 949 if(verbose) {
michael@0 950 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
michael@0 951 }
michael@0 952 }
michael@0 953
michael@0 954 /* sort toUFallbacks */
michael@0 955 /*
michael@0 956 * It should be safe to sort them before compactToUnicode2() is called,
michael@0 957 * because it should not change the relative order of the offset values
michael@0 958 * that it adjusts, but they need to be sorted at some point, and
michael@0 959 * it is safest here.
michael@0 960 */
michael@0 961 if(countToUFallbacks>0) {
michael@0 962 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
michael@0 963 uprv_sortArray(toUFallbacks, countToUFallbacks,
michael@0 964 sizeof(_MBCSToUFallback),
michael@0 965 compareFallbacks, NULL, FALSE, &errorCode);
michael@0 966 }
michael@0 967 }
michael@0 968
michael@0 969 /* use a complete state table ----------------------------------------------- */
michael@0 970
michael@0 971 U_CAPI int32_t U_EXPORT2
michael@0 972 ucm_countChars(UCMStates *states,
michael@0 973 const uint8_t *bytes, int32_t length) {
michael@0 974 uint32_t offset;
michael@0 975 int32_t i, entry, count;
michael@0 976 uint8_t state;
michael@0 977
michael@0 978 offset=0;
michael@0 979 count=0;
michael@0 980 state=0;
michael@0 981
michael@0 982 if(states->countStates==0) {
michael@0 983 fprintf(stderr, "ucm error: there is no state information!\n");
michael@0 984 return -1;
michael@0 985 }
michael@0 986
michael@0 987 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
michael@0 988 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
michael@0 989 state=1;
michael@0 990 }
michael@0 991
michael@0 992 /*
michael@0 993 * Walk down the state table like in conversion,
michael@0 994 * much like getNextUChar().
michael@0 995 * We assume that c<=0x10ffff.
michael@0 996 */
michael@0 997 for(i=0; i<length; ++i) {
michael@0 998 entry=states->stateTable[state][bytes[i]];
michael@0 999 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
michael@0 1000 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
michael@0 1001 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
michael@0 1002 } else {
michael@0 1003 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
michael@0 1004 case MBCS_STATE_ILLEGAL:
michael@0 1005 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
michael@0 1006 return -1;
michael@0 1007 case MBCS_STATE_CHANGE_ONLY:
michael@0 1008 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
michael@0 1009 return -1;
michael@0 1010 case MBCS_STATE_UNASSIGNED:
michael@0 1011 case MBCS_STATE_FALLBACK_DIRECT_16:
michael@0 1012 case MBCS_STATE_VALID_DIRECT_16:
michael@0 1013 case MBCS_STATE_FALLBACK_DIRECT_20:
michael@0 1014 case MBCS_STATE_VALID_DIRECT_20:
michael@0 1015 case MBCS_STATE_VALID_16:
michael@0 1016 case MBCS_STATE_VALID_16_PAIR:
michael@0 1017 /* count a complete character and prepare for a new one */
michael@0 1018 ++count;
michael@0 1019 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
michael@0 1020 offset=0;
michael@0 1021 break;
michael@0 1022 default:
michael@0 1023 /* reserved, must never occur */
michael@0 1024 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
michael@0 1025 return -1;
michael@0 1026 }
michael@0 1027 }
michael@0 1028 }
michael@0 1029
michael@0 1030 if(offset!=0) {
michael@0 1031 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
michael@0 1032 return -1;
michael@0 1033 }
michael@0 1034
michael@0 1035 /*
michael@0 1036 * for SI/SO (like EBCDIC-stateful), multiple-character results
michael@0 1037 * must consist of only double-byte sequences
michael@0 1038 */
michael@0 1039 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
michael@0 1040 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
michael@0 1041 return -1;
michael@0 1042 }
michael@0 1043
michael@0 1044 return count;
michael@0 1045 }
michael@0 1046 #endif
michael@0 1047

mercurial