1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/toolutil/ucmstate.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1047 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2003-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucmstate.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2003oct09 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This file handles ICU .ucm file state information as part of the ucm module. 1.20 +* Most of this code used to be in makeconv.c. 1.21 +*/ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 +#include "cstring.h" 1.25 +#include "cmemory.h" 1.26 +#include "uarrsort.h" 1.27 +#include "ucnvmbcs.h" 1.28 +#include "ucnv_ext.h" 1.29 +#include "uparse.h" 1.30 +#include "ucm.h" 1.31 +#include <stdio.h> 1.32 + 1.33 +#if !UCONFIG_NO_CONVERSION 1.34 + 1.35 +/* MBCS state handling ------------------------------------------------------ */ 1.36 + 1.37 +/* 1.38 + * state table row grammar (ebnf-style): 1.39 + * (whitespace is allowed between all tokens) 1.40 + * 1.41 + * row=[[firstentry ','] entry (',' entry)*] 1.42 + * firstentry="initial" | "surrogates" 1.43 + * (initial state (default for state 0), output is all surrogate pairs) 1.44 + * entry=range [':' nextstate] ['.' action] 1.45 + * range=number ['-' number] 1.46 + * nextstate=number 1.47 + * (0..7f) 1.48 + * action='u' | 's' | 'p' | 'i' 1.49 + * (unassigned, state change only, surrogate pair, illegal) 1.50 + * number=(1- or 2-digit hexadecimal number) 1.51 + */ 1.52 +static const char * 1.53 +parseState(const char *s, int32_t state[256], uint32_t *pFlags) { 1.54 + const char *t; 1.55 + uint32_t start, end, i; 1.56 + int32_t entry; 1.57 + 1.58 + /* initialize the state: all illegal with U+ffff */ 1.59 + for(i=0; i<256; ++i) { 1.60 + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); 1.61 + } 1.62 + 1.63 + /* skip leading white space */ 1.64 + s=u_skipWhitespace(s); 1.65 + 1.66 + /* is there an "initial" or "surrogates" directive? */ 1.67 + if(uprv_strncmp("initial", s, 7)==0) { 1.68 + *pFlags=MBCS_STATE_FLAG_DIRECT; 1.69 + s=u_skipWhitespace(s+7); 1.70 + if(*s++!=',') { 1.71 + return s-1; 1.72 + } 1.73 + } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { 1.74 + *pFlags=MBCS_STATE_FLAG_SURROGATES; 1.75 + s=u_skipWhitespace(s+10); 1.76 + if(*s++!=',') { 1.77 + return s-1; 1.78 + } 1.79 + } else if(*s==0) { 1.80 + /* empty state row: all-illegal */ 1.81 + return NULL; 1.82 + } 1.83 + 1.84 + for(;;) { 1.85 + /* read an entry, the start of the range first */ 1.86 + s=u_skipWhitespace(s); 1.87 + start=uprv_strtoul(s, (char **)&t, 16); 1.88 + if(s==t || 0xff<start) { 1.89 + return s; 1.90 + } 1.91 + s=u_skipWhitespace(t); 1.92 + 1.93 + /* read the end of the range if there is one */ 1.94 + if(*s=='-') { 1.95 + s=u_skipWhitespace(s+1); 1.96 + end=uprv_strtoul(s, (char **)&t, 16); 1.97 + if(s==t || end<start || 0xff<end) { 1.98 + return s; 1.99 + } 1.100 + s=u_skipWhitespace(t); 1.101 + } else { 1.102 + end=start; 1.103 + } 1.104 + 1.105 + /* determine the state entrys for this range */ 1.106 + if(*s!=':' && *s!='.') { 1.107 + /* the default is: final state with valid entries */ 1.108 + entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0); 1.109 + } else { 1.110 + entry=MBCS_ENTRY_TRANSITION(0, 0); 1.111 + if(*s==':') { 1.112 + /* get the next state, default to 0 */ 1.113 + s=u_skipWhitespace(s+1); 1.114 + i=uprv_strtoul(s, (char **)&t, 16); 1.115 + if(s!=t) { 1.116 + if(0x7f<i) { 1.117 + return s; 1.118 + } 1.119 + s=u_skipWhitespace(t); 1.120 + entry=MBCS_ENTRY_SET_STATE(entry, i); 1.121 + } 1.122 + } 1.123 + 1.124 + /* get the state action, default to valid */ 1.125 + if(*s=='.') { 1.126 + /* this is a final state */ 1.127 + entry=MBCS_ENTRY_SET_FINAL(entry); 1.128 + 1.129 + s=u_skipWhitespace(s+1); 1.130 + if(*s=='u') { 1.131 + /* unassigned set U+fffe */ 1.132 + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); 1.133 + s=u_skipWhitespace(s+1); 1.134 + } else if(*s=='p') { 1.135 + if(*pFlags!=MBCS_STATE_FLAG_DIRECT) { 1.136 + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR); 1.137 + } else { 1.138 + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); 1.139 + } 1.140 + s=u_skipWhitespace(s+1); 1.141 + } else if(*s=='s') { 1.142 + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY); 1.143 + s=u_skipWhitespace(s+1); 1.144 + } else if(*s=='i') { 1.145 + /* illegal set U+ffff */ 1.146 + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff); 1.147 + s=u_skipWhitespace(s+1); 1.148 + } else { 1.149 + /* default to valid */ 1.150 + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); 1.151 + } 1.152 + } else { 1.153 + /* this is an intermediate state, nothing to do */ 1.154 + } 1.155 + } 1.156 + 1.157 + /* adjust "final valid" states according to the state flags */ 1.158 + if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) { 1.159 + switch(*pFlags) { 1.160 + case 0: 1.161 + /* no adjustment */ 1.162 + break; 1.163 + case MBCS_STATE_FLAG_DIRECT: 1.164 + /* set the valid-direct code point to "unassigned"==0xfffe */ 1.165 + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe); 1.166 + break; 1.167 + case MBCS_STATE_FLAG_SURROGATES: 1.168 + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0); 1.169 + break; 1.170 + default: 1.171 + break; 1.172 + } 1.173 + } 1.174 + 1.175 + /* set this entry for the range */ 1.176 + for(i=start; i<=end; ++i) { 1.177 + state[i]=entry; 1.178 + } 1.179 + 1.180 + if(*s==',') { 1.181 + ++s; 1.182 + } else { 1.183 + return *s==0 ? NULL : s; 1.184 + } 1.185 + } 1.186 +} 1.187 + 1.188 +U_CAPI void U_EXPORT2 1.189 +ucm_addState(UCMStates *states, const char *s) { 1.190 + const char *error; 1.191 + 1.192 + if(states->countStates==MBCS_MAX_STATE_COUNT) { 1.193 + fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); 1.194 + exit(U_INVALID_TABLE_FORMAT); 1.195 + } 1.196 + 1.197 + error=parseState(s, states->stateTable[states->countStates], 1.198 + &states->stateFlags[states->countStates]); 1.199 + if(error!=NULL) { 1.200 + fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); 1.201 + exit(U_INVALID_TABLE_FORMAT); 1.202 + } 1.203 + 1.204 + ++states->countStates; 1.205 +} 1.206 + 1.207 +U_CAPI UBool U_EXPORT2 1.208 +ucm_parseHeaderLine(UCMFile *ucm, 1.209 + char *line, char **pKey, char **pValue) { 1.210 + UCMStates *states; 1.211 + char *s, *end; 1.212 + char c; 1.213 + 1.214 + states=&ucm->states; 1.215 + 1.216 + /* remove comments and trailing CR and LF and remove whitespace from the end */ 1.217 + for(end=line; (c=*end)!=0; ++end) { 1.218 + if(c=='#' || c=='\r' || c=='\n') { 1.219 + break; 1.220 + } 1.221 + } 1.222 + while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { 1.223 + --end; 1.224 + } 1.225 + *end=0; 1.226 + 1.227 + /* skip leading white space and ignore empty lines */ 1.228 + s=(char *)u_skipWhitespace(line); 1.229 + if(*s==0) { 1.230 + return TRUE; 1.231 + } 1.232 + 1.233 + /* stop at the beginning of the mapping section */ 1.234 + if(uprv_memcmp(s, "CHARMAP", 7)==0) { 1.235 + return FALSE; 1.236 + } 1.237 + 1.238 + /* get the key name, bracketed in <> */ 1.239 + if(*s!='<') { 1.240 + fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line); 1.241 + exit(U_INVALID_TABLE_FORMAT); 1.242 + } 1.243 + *pKey=++s; 1.244 + while(*s!='>') { 1.245 + if(*s==0) { 1.246 + fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line); 1.247 + exit(U_INVALID_TABLE_FORMAT); 1.248 + } 1.249 + ++s; 1.250 + } 1.251 + *s=0; 1.252 + 1.253 + /* get the value string, possibly quoted */ 1.254 + s=(char *)u_skipWhitespace(s+1); 1.255 + if(*s!='"') { 1.256 + *pValue=s; 1.257 + } else { 1.258 + /* remove the quotes */ 1.259 + *pValue=s+1; 1.260 + if(end>*pValue && *(end-1)=='"') { 1.261 + *--end=0; 1.262 + } 1.263 + } 1.264 + 1.265 + /* collect the information from the header field, ignore unknown keys */ 1.266 + if(uprv_strcmp(*pKey, "uconv_class")==0) { 1.267 + if(uprv_strcmp(*pValue, "DBCS")==0) { 1.268 + states->conversionType=UCNV_DBCS; 1.269 + } else if(uprv_strcmp(*pValue, "SBCS")==0) { 1.270 + states->conversionType = UCNV_SBCS; 1.271 + } else if(uprv_strcmp(*pValue, "MBCS")==0) { 1.272 + states->conversionType = UCNV_MBCS; 1.273 + } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { 1.274 + states->conversionType = UCNV_EBCDIC_STATEFUL; 1.275 + } else { 1.276 + fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue); 1.277 + exit(U_INVALID_TABLE_FORMAT); 1.278 + } 1.279 + return TRUE; 1.280 + } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { 1.281 + c=**pValue; 1.282 + if('1'<=c && c<='4' && (*pValue)[1]==0) { 1.283 + states->maxCharLength=(int8_t)(c-'0'); 1.284 + states->outputType=(int8_t)(states->maxCharLength-1); 1.285 + } else { 1.286 + fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue); 1.287 + exit(U_INVALID_TABLE_FORMAT); 1.288 + } 1.289 + return TRUE; 1.290 + } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { 1.291 + c=**pValue; 1.292 + if('1'<=c && c<='4' && (*pValue)[1]==0) { 1.293 + states->minCharLength=(int8_t)(c-'0'); 1.294 + } else { 1.295 + fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue); 1.296 + exit(U_INVALID_TABLE_FORMAT); 1.297 + } 1.298 + return TRUE; 1.299 + } else if(uprv_strcmp(*pKey, "icu:state")==0) { 1.300 + /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ 1.301 + switch(states->conversionType) { 1.302 + case UCNV_SBCS: 1.303 + case UCNV_DBCS: 1.304 + case UCNV_EBCDIC_STATEFUL: 1.305 + states->conversionType=UCNV_MBCS; 1.306 + break; 1.307 + case UCNV_MBCS: 1.308 + break; 1.309 + default: 1.310 + fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n"); 1.311 + exit(U_INVALID_TABLE_FORMAT); 1.312 + } 1.313 + 1.314 + if(states->maxCharLength==0) { 1.315 + fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n"); 1.316 + exit(U_INVALID_TABLE_FORMAT); 1.317 + } 1.318 + ucm_addState(states, *pValue); 1.319 + return TRUE; 1.320 + } else if(uprv_strcmp(*pKey, "icu:base")==0) { 1.321 + if(**pValue==0) { 1.322 + fprintf(stderr, "ucm error: <icu:base> without a base table name\n"); 1.323 + exit(U_INVALID_TABLE_FORMAT); 1.324 + } 1.325 + uprv_strcpy(ucm->baseName, *pValue); 1.326 + return TRUE; 1.327 + } 1.328 + 1.329 + return FALSE; 1.330 +} 1.331 + 1.332 +/* post-processing ---------------------------------------------------------- */ 1.333 + 1.334 +static int32_t 1.335 +sumUpStates(UCMStates *states) { 1.336 + int32_t entry, sum, state, cell, count; 1.337 + UBool allStatesReady; 1.338 + 1.339 + /* 1.340 + * Sum up the offsets for all states. 1.341 + * In each final state (where there are only final entries), 1.342 + * the offsets add up directly. 1.343 + * In all other state table rows, for each transition entry to another state, 1.344 + * the offsets sum of that state needs to be added. 1.345 + * This is achieved in at most countStates iterations. 1.346 + */ 1.347 + allStatesReady=FALSE; 1.348 + for(count=states->countStates; !allStatesReady && count>=0; --count) { 1.349 + allStatesReady=TRUE; 1.350 + for(state=states->countStates-1; state>=0; --state) { 1.351 + if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { 1.352 + allStatesReady=FALSE; 1.353 + sum=0; 1.354 + 1.355 + /* at first, add up only the final delta offsets to keep them <512 */ 1.356 + for(cell=0; cell<256; ++cell) { 1.357 + entry=states->stateTable[state][cell]; 1.358 + if(MBCS_ENTRY_IS_FINAL(entry)) { 1.359 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.360 + case MBCS_STATE_VALID_16: 1.361 + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); 1.362 + sum+=1; 1.363 + break; 1.364 + case MBCS_STATE_VALID_16_PAIR: 1.365 + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); 1.366 + sum+=2; 1.367 + break; 1.368 + default: 1.369 + /* no addition */ 1.370 + break; 1.371 + } 1.372 + } 1.373 + } 1.374 + 1.375 + /* now, add up the delta offsets for the transitional entries */ 1.376 + for(cell=0; cell<256; ++cell) { 1.377 + entry=states->stateTable[state][cell]; 1.378 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.379 + if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { 1.380 + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); 1.381 + sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; 1.382 + } else { 1.383 + /* that next state does not have a sum yet, we cannot finish the one for this state */ 1.384 + sum=-1; 1.385 + break; 1.386 + } 1.387 + } 1.388 + } 1.389 + 1.390 + if(sum!=-1) { 1.391 + states->stateOffsetSum[state]=sum; 1.392 + states->stateFlags[state]|=MBCS_STATE_FLAG_READY; 1.393 + } 1.394 + } 1.395 + } 1.396 + } 1.397 + 1.398 + if(!allStatesReady) { 1.399 + fprintf(stderr, "ucm error: the state table contains loops\n"); 1.400 + exit(U_INVALID_TABLE_FORMAT); 1.401 + } 1.402 + 1.403 + /* 1.404 + * For all "direct" (i.e., initial) states>0, 1.405 + * the offsets need to be increased by the sum of 1.406 + * the previous initial states. 1.407 + */ 1.408 + sum=states->stateOffsetSum[0]; 1.409 + for(state=1; state<states->countStates; ++state) { 1.410 + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.411 + int32_t sum2=sum; 1.412 + sum+=states->stateOffsetSum[state]; 1.413 + for(cell=0; cell<256; ++cell) { 1.414 + entry=states->stateTable[state][cell]; 1.415 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.416 + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); 1.417 + } 1.418 + } 1.419 + } 1.420 + } 1.421 + 1.422 + /* round up to the next even number to have the following data 32-bit-aligned */ 1.423 + return states->countToUCodeUnits=(sum+1)&~1; 1.424 +} 1.425 + 1.426 +U_CAPI void U_EXPORT2 1.427 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { 1.428 + int32_t entry, state, cell, count; 1.429 + 1.430 + if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { 1.431 + fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); 1.432 + exit(U_INVALID_TABLE_FORMAT); 1.433 + } 1.434 + 1.435 + if(states->countStates==0) { 1.436 + switch(states->conversionType) { 1.437 + case UCNV_SBCS: 1.438 + /* SBCS: use MBCS data structure with a default state table */ 1.439 + if(states->maxCharLength!=1) { 1.440 + fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); 1.441 + exit(U_INVALID_TABLE_FORMAT); 1.442 + } 1.443 + states->conversionType=UCNV_MBCS; 1.444 + ucm_addState(states, "0-ff"); 1.445 + break; 1.446 + case UCNV_MBCS: 1.447 + fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n"); 1.448 + exit(U_INVALID_TABLE_FORMAT); 1.449 + break; 1.450 + case UCNV_EBCDIC_STATEFUL: 1.451 + /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ 1.452 + if(states->minCharLength!=1 || states->maxCharLength!=2) { 1.453 + fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); 1.454 + exit(U_INVALID_TABLE_FORMAT); 1.455 + } 1.456 + states->conversionType=UCNV_MBCS; 1.457 + ucm_addState(states, "0-ff, e:1.s, f:0.s"); 1.458 + ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); 1.459 + ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); 1.460 + ucm_addState(states, "0-ff:1.i, 40:1."); 1.461 + ucm_addState(states, "0-ff:1.i"); 1.462 + break; 1.463 + case UCNV_DBCS: 1.464 + /* DBCS: use MBCS data structure with a default state table */ 1.465 + if(states->minCharLength!=2 || states->maxCharLength!=2) { 1.466 + fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); 1.467 + exit(U_INVALID_TABLE_FORMAT); 1.468 + } 1.469 + states->conversionType = UCNV_MBCS; 1.470 + ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); 1.471 + ucm_addState(states, "41-fe"); 1.472 + ucm_addState(states, "40"); 1.473 + ucm_addState(states, ""); 1.474 + break; 1.475 + default: 1.476 + fprintf(stderr, "ucm error: unknown charset structure\n"); 1.477 + exit(U_INVALID_TABLE_FORMAT); 1.478 + break; 1.479 + } 1.480 + } 1.481 + 1.482 + /* 1.483 + * check that the min/max character lengths are reasonable; 1.484 + * to do this right, all paths through the state table would have to be 1.485 + * recursively walked while keeping track of the sequence lengths, 1.486 + * but these simple checks cover most state tables in practice 1.487 + */ 1.488 + if(states->maxCharLength<states->minCharLength) { 1.489 + fprintf(stderr, "ucm error: max B/char < min B/char\n"); 1.490 + exit(U_INVALID_TABLE_FORMAT); 1.491 + } 1.492 + 1.493 + /* count non-direct states and compare with max B/char */ 1.494 + count=0; 1.495 + for(state=0; state<states->countStates; ++state) { 1.496 + if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { 1.497 + ++count; 1.498 + } 1.499 + } 1.500 + if(states->maxCharLength>count+1) { 1.501 + fprintf(stderr, "ucm error: max B/char too large\n"); 1.502 + exit(U_INVALID_TABLE_FORMAT); 1.503 + } 1.504 + 1.505 + if(states->minCharLength==1) { 1.506 + int32_t action; 1.507 + 1.508 + /* 1.509 + * if there are single-byte characters, 1.510 + * then the initial state must have direct result states 1.511 + */ 1.512 + for(cell=0; cell<256; ++cell) { 1.513 + entry=states->stateTable[0][cell]; 1.514 + if( MBCS_ENTRY_IS_FINAL(entry) && 1.515 + ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || 1.516 + action==MBCS_STATE_UNASSIGNED) 1.517 + ) { 1.518 + break; 1.519 + } 1.520 + } 1.521 + 1.522 + if(cell==256) { 1.523 + fprintf(stderr, "ucm warning: min B/char too small\n"); 1.524 + } 1.525 + } 1.526 + 1.527 + /* 1.528 + * make sure that all "next state" values are within limits 1.529 + * and that all next states after final ones have the "direct" 1.530 + * flag of initial states 1.531 + */ 1.532 + for(state=states->countStates-1; state>=0; --state) { 1.533 + for(cell=0; cell<256; ++cell) { 1.534 + entry=states->stateTable[state][cell]; 1.535 + if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { 1.536 + fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", 1.537 + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 1.538 + exit(U_INVALID_TABLE_FORMAT); 1.539 + } 1.540 + if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { 1.541 + fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", 1.542 + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 1.543 + exit(U_INVALID_TABLE_FORMAT); 1.544 + } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.545 + fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", 1.546 + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 1.547 + exit(U_INVALID_TABLE_FORMAT); 1.548 + } 1.549 + } 1.550 + } 1.551 + 1.552 + /* is this an SI/SO (like EBCDIC-stateful) state table? */ 1.553 + if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.554 + if(states->maxCharLength!=2) { 1.555 + fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); 1.556 + exit(U_INVALID_TABLE_FORMAT); 1.557 + } 1.558 + if(states->countStates<3) { 1.559 + fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); 1.560 + exit(U_INVALID_TABLE_FORMAT); 1.561 + } 1.562 + /* are the SI/SO all in the right places? */ 1.563 + if( ignoreSISOCheck || 1.564 + (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && 1.565 + states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && 1.566 + states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && 1.567 + states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) 1.568 + ) { 1.569 + states->outputType=MBCS_OUTPUT_2_SISO; 1.570 + } else { 1.571 + fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); 1.572 + exit(U_INVALID_TABLE_FORMAT); 1.573 + } 1.574 + state=2; 1.575 + } else { 1.576 + state=1; 1.577 + } 1.578 + 1.579 + /* check that no unexpected state is a "direct" one */ 1.580 + while(state<states->countStates) { 1.581 + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.582 + fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); 1.583 + exit(U_INVALID_TABLE_FORMAT); 1.584 + } 1.585 + ++state; 1.586 + } 1.587 + 1.588 + sumUpStates(states); 1.589 +} 1.590 + 1.591 +/* find a fallback for this offset; return the index or -1 if not found */ 1.592 +U_CAPI int32_t U_EXPORT2 1.593 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.594 + uint32_t offset) { 1.595 + int32_t i; 1.596 + 1.597 + if(countToUFallbacks==0) { 1.598 + /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ 1.599 + return -1; 1.600 + } 1.601 + 1.602 + /* do a linear search for the fallback mapping (the table is not yet sorted) */ 1.603 + for(i=0; i<countToUFallbacks; ++i) { 1.604 + if(offset==toUFallbacks[i].offset) { 1.605 + return i; 1.606 + } 1.607 + } 1.608 + return -1; 1.609 +} 1.610 + 1.611 +/* 1.612 + * This function tries to compact toUnicode tables for 2-byte codepages 1.613 + * by finding lead bytes with all-unassigned trail bytes and adding another state 1.614 + * for them. 1.615 + */ 1.616 +static void 1.617 +compactToUnicode2(UCMStates *states, 1.618 + uint16_t **pUnicodeCodeUnits, 1.619 + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.620 + UBool verbose) { 1.621 + int32_t (*oldStateTable)[256]; 1.622 + uint16_t count[256]; 1.623 + uint16_t *oldUnicodeCodeUnits; 1.624 + int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; 1.625 + int32_t i, j, leadState, trailState, newState, fallback; 1.626 + uint16_t unit; 1.627 + 1.628 + /* find the lead state */ 1.629 + if(states->outputType==MBCS_OUTPUT_2_SISO) { 1.630 + /* use the DBCS lead state for SI/SO codepages */ 1.631 + leadState=1; 1.632 + } else { 1.633 + leadState=0; 1.634 + } 1.635 + 1.636 + /* find the main trail state: the most used target state */ 1.637 + uprv_memset(count, 0, sizeof(count)); 1.638 + for(i=0; i<256; ++i) { 1.639 + entry=states->stateTable[leadState][i]; 1.640 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.641 + ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; 1.642 + } 1.643 + } 1.644 + trailState=0; 1.645 + for(i=1; i<states->countStates; ++i) { 1.646 + if(count[i]>count[trailState]) { 1.647 + trailState=i; 1.648 + } 1.649 + } 1.650 + 1.651 + /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ 1.652 + uprv_memset(count, 0, sizeof(count)); 1.653 + savings=0; 1.654 + /* for each lead byte */ 1.655 + for(i=0; i<256; ++i) { 1.656 + entry=states->stateTable[leadState][i]; 1.657 + if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { 1.658 + /* the offset is different for each lead byte */ 1.659 + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.660 + /* for each trail byte for this lead byte */ 1.661 + for(j=0; j<256; ++j) { 1.662 + entry=states->stateTable[trailState][j]; 1.663 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.664 + case MBCS_STATE_VALID_16: 1.665 + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.666 + if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { 1.667 + ++count[i]; 1.668 + } else { 1.669 + j=999; /* do not count for this lead byte because there are assignments */ 1.670 + } 1.671 + break; 1.672 + case MBCS_STATE_VALID_16_PAIR: 1.673 + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.674 + if((*pUnicodeCodeUnits)[entry]==0xfffe) { 1.675 + count[i]+=2; 1.676 + } else { 1.677 + j=999; /* do not count for this lead byte because there are assignments */ 1.678 + } 1.679 + break; 1.680 + default: 1.681 + break; 1.682 + } 1.683 + } 1.684 + if(j==256) { 1.685 + /* all trail bytes for this lead byte are unassigned */ 1.686 + savings+=count[i]; 1.687 + } else { 1.688 + count[i]=0; 1.689 + } 1.690 + } 1.691 + } 1.692 + /* subtract from the possible savings the cost of an additional state */ 1.693 + savings=savings*2-1024; /* count bytes, not 16-bit words */ 1.694 + if(savings<=0) { 1.695 + return; 1.696 + } 1.697 + if(verbose) { 1.698 + printf("compacting toUnicode data saves %ld bytes\n", (long)savings); 1.699 + } 1.700 + if(states->countStates>=MBCS_MAX_STATE_COUNT) { 1.701 + fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); 1.702 + return; 1.703 + } 1.704 + 1.705 + /* make a copy of the state table */ 1.706 + oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); 1.707 + if(oldStateTable==NULL) { 1.708 + fprintf(stderr, "cannot compact toUnicode: out of memory\n"); 1.709 + return; 1.710 + } 1.711 + uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); 1.712 + 1.713 + /* add the new state */ 1.714 + /* 1.715 + * this function does not catch the degenerate case where all lead bytes 1.716 + * have all-unassigned trail bytes and the lead state could be removed 1.717 + */ 1.718 + newState=states->countStates++; 1.719 + states->stateFlags[newState]=0; 1.720 + /* copy the old trail state, turning all assigned states into unassigned ones */ 1.721 + for(i=0; i<256; ++i) { 1.722 + entry=states->stateTable[trailState][i]; 1.723 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.724 + case MBCS_STATE_VALID_16: 1.725 + case MBCS_STATE_VALID_16_PAIR: 1.726 + states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); 1.727 + break; 1.728 + default: 1.729 + states->stateTable[newState][i]=entry; 1.730 + break; 1.731 + } 1.732 + } 1.733 + 1.734 + /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ 1.735 + for(i=0; i<256; ++i) { 1.736 + if(count[i]>0) { 1.737 + states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); 1.738 + } 1.739 + } 1.740 + 1.741 + /* sum up the new state table */ 1.742 + for(i=0; i<states->countStates; ++i) { 1.743 + states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; 1.744 + } 1.745 + sum=sumUpStates(states); 1.746 + 1.747 + /* allocate a new, smaller code units array */ 1.748 + oldUnicodeCodeUnits=*pUnicodeCodeUnits; 1.749 + if(sum==0) { 1.750 + *pUnicodeCodeUnits=NULL; 1.751 + if(oldUnicodeCodeUnits!=NULL) { 1.752 + uprv_free(oldUnicodeCodeUnits); 1.753 + } 1.754 + uprv_free(oldStateTable); 1.755 + return; 1.756 + } 1.757 + *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); 1.758 + if(*pUnicodeCodeUnits==NULL) { 1.759 + fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", 1.760 + (long)sum); 1.761 + /* revert to the old state table */ 1.762 + *pUnicodeCodeUnits=oldUnicodeCodeUnits; 1.763 + --states->countStates; 1.764 + uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); 1.765 + uprv_free(oldStateTable); 1.766 + return; 1.767 + } 1.768 + for(i=0; i<sum; ++i) { 1.769 + (*pUnicodeCodeUnits)[i]=0xfffe; 1.770 + } 1.771 + 1.772 + /* copy the code units for all assigned characters */ 1.773 + /* 1.774 + * The old state table has the same lead _and_ trail states for assigned characters! 1.775 + * The differences are in the offsets, and in the trail states for some unassigned characters. 1.776 + * For each character with an assigned state in the new table, it was assigned in the old one. 1.777 + * Only still-assigned characters are copied. 1.778 + * Note that fallback mappings need to get their offset values adjusted. 1.779 + */ 1.780 + 1.781 + /* for each initial state */ 1.782 + for(leadState=0; leadState<states->countStates; ++leadState) { 1.783 + if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.784 + /* for each lead byte from there */ 1.785 + for(i=0; i<256; ++i) { 1.786 + entry=states->stateTable[leadState][i]; 1.787 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.788 + trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.789 + /* the new state does not have assigned states */ 1.790 + if(trailState!=newState) { 1.791 + trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.792 + oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); 1.793 + /* for each trail byte */ 1.794 + for(j=0; j<256; ++j) { 1.795 + entry=states->stateTable[trailState][j]; 1.796 + /* copy assigned-character code units and adjust fallback offsets */ 1.797 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.798 + case MBCS_STATE_VALID_16: 1.799 + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.800 + /* find the old offset according to the old state table */ 1.801 + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); 1.802 + unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; 1.803 + if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { 1.804 + toUFallbacks[fallback].offset=0x80000000|offset; 1.805 + } 1.806 + break; 1.807 + case MBCS_STATE_VALID_16_PAIR: 1.808 + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.809 + /* find the old offset according to the old state table */ 1.810 + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); 1.811 + (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; 1.812 + (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; 1.813 + break; 1.814 + default: 1.815 + break; 1.816 + } 1.817 + } 1.818 + } 1.819 + } 1.820 + } 1.821 + } 1.822 + } 1.823 + 1.824 + /* remove temporary flags from fallback offsets that protected them from being modified twice */ 1.825 + for(i=0; i<countToUFallbacks; ++i) { 1.826 + toUFallbacks[i].offset&=0x7fffffff; 1.827 + } 1.828 + 1.829 + /* free temporary memory */ 1.830 + uprv_free(oldUnicodeCodeUnits); 1.831 + uprv_free(oldStateTable); 1.832 +} 1.833 + 1.834 +/* 1.835 + * recursive sub-function of compactToUnicodeHelper() 1.836 + * returns: 1.837 + * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved, 1.838 + * if all sequences from this state are unassigned, returns the 1.839 + * <0 there are assignments in unicodeCodeUnits[] 1.840 + * 0 no use of unicodeCodeUnits[] 1.841 + */ 1.842 +static int32_t 1.843 +findUnassigned(UCMStates *states, 1.844 + uint16_t *unicodeCodeUnits, 1.845 + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.846 + int32_t state, int32_t offset, uint32_t b) { 1.847 + int32_t i, entry, savings, localSavings, belowSavings; 1.848 + UBool haveAssigned; 1.849 + 1.850 + localSavings=belowSavings=0; 1.851 + haveAssigned=FALSE; 1.852 + for(i=0; i<256; ++i) { 1.853 + entry=states->stateTable[state][i]; 1.854 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.855 + savings=findUnassigned(states, 1.856 + unicodeCodeUnits, 1.857 + toUFallbacks, countToUFallbacks, 1.858 + MBCS_ENTRY_TRANSITION_STATE(entry), 1.859 + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 1.860 + (b<<8)|(uint32_t)i); 1.861 + if(savings<0) { 1.862 + haveAssigned=TRUE; 1.863 + } else if(savings>0) { 1.864 + printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", 1.865 + (unsigned long)((b<<8)|i), (long)state, (long)savings); 1.866 + belowSavings+=savings; 1.867 + } 1.868 + } else if(!haveAssigned) { 1.869 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.870 + case MBCS_STATE_VALID_16: 1.871 + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.872 + if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { 1.873 + localSavings+=2; 1.874 + } else { 1.875 + haveAssigned=TRUE; 1.876 + } 1.877 + break; 1.878 + case MBCS_STATE_VALID_16_PAIR: 1.879 + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 1.880 + if(unicodeCodeUnits[entry]==0xfffe) { 1.881 + localSavings+=4; 1.882 + } else { 1.883 + haveAssigned=TRUE; 1.884 + } 1.885 + break; 1.886 + default: 1.887 + break; 1.888 + } 1.889 + } 1.890 + } 1.891 + if(haveAssigned) { 1.892 + return -1; 1.893 + } else { 1.894 + return localSavings+belowSavings; 1.895 + } 1.896 +} 1.897 + 1.898 +/* helper function for finding compaction opportunities */ 1.899 +static void 1.900 +compactToUnicodeHelper(UCMStates *states, 1.901 + uint16_t *unicodeCodeUnits, 1.902 + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { 1.903 + int32_t state, savings; 1.904 + 1.905 + /* for each initial state */ 1.906 + for(state=0; state<states->countStates; ++state) { 1.907 + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 1.908 + savings=findUnassigned(states, 1.909 + unicodeCodeUnits, 1.910 + toUFallbacks, countToUFallbacks, 1.911 + state, 0, 0); 1.912 + if(savings>0) { 1.913 + printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", 1.914 + (long)state, (long)savings); 1.915 + } 1.916 + } 1.917 + } 1.918 +} 1.919 + 1.920 +static int32_t 1.921 +compareFallbacks(const void *context, const void *fb1, const void *fb2) { 1.922 + return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; 1.923 +} 1.924 + 1.925 +U_CAPI void U_EXPORT2 1.926 +ucm_optimizeStates(UCMStates *states, 1.927 + uint16_t **pUnicodeCodeUnits, 1.928 + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 1.929 + UBool verbose) { 1.930 + UErrorCode errorCode; 1.931 + int32_t state, cell, entry; 1.932 + 1.933 + /* test each state table entry */ 1.934 + for(state=0; state<states->countStates; ++state) { 1.935 + for(cell=0; cell<256; ++cell) { 1.936 + entry=states->stateTable[state][cell]; 1.937 + /* 1.938 + * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code 1.939 + * and the code point is "unassigned" (0xfffe), then change it to 1.940 + * the "unassigned" action code with bits 26..23 set to zero and U+fffe. 1.941 + */ 1.942 + if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { 1.943 + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); 1.944 + } 1.945 + } 1.946 + } 1.947 + 1.948 + /* try to compact the toUnicode tables */ 1.949 + if(states->maxCharLength==2) { 1.950 + compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); 1.951 + } else if(states->maxCharLength>2) { 1.952 + if(verbose) { 1.953 + compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); 1.954 + } 1.955 + } 1.956 + 1.957 + /* sort toUFallbacks */ 1.958 + /* 1.959 + * It should be safe to sort them before compactToUnicode2() is called, 1.960 + * because it should not change the relative order of the offset values 1.961 + * that it adjusts, but they need to be sorted at some point, and 1.962 + * it is safest here. 1.963 + */ 1.964 + if(countToUFallbacks>0) { 1.965 + errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ 1.966 + uprv_sortArray(toUFallbacks, countToUFallbacks, 1.967 + sizeof(_MBCSToUFallback), 1.968 + compareFallbacks, NULL, FALSE, &errorCode); 1.969 + } 1.970 +} 1.971 + 1.972 +/* use a complete state table ----------------------------------------------- */ 1.973 + 1.974 +U_CAPI int32_t U_EXPORT2 1.975 +ucm_countChars(UCMStates *states, 1.976 + const uint8_t *bytes, int32_t length) { 1.977 + uint32_t offset; 1.978 + int32_t i, entry, count; 1.979 + uint8_t state; 1.980 + 1.981 + offset=0; 1.982 + count=0; 1.983 + state=0; 1.984 + 1.985 + if(states->countStates==0) { 1.986 + fprintf(stderr, "ucm error: there is no state information!\n"); 1.987 + return -1; 1.988 + } 1.989 + 1.990 + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ 1.991 + if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { 1.992 + state=1; 1.993 + } 1.994 + 1.995 + /* 1.996 + * Walk down the state table like in conversion, 1.997 + * much like getNextUChar(). 1.998 + * We assume that c<=0x10ffff. 1.999 + */ 1.1000 + for(i=0; i<length; ++i) { 1.1001 + entry=states->stateTable[state][bytes[i]]; 1.1002 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1.1003 + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1.1004 + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1.1005 + } else { 1.1006 + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1.1007 + case MBCS_STATE_ILLEGAL: 1.1008 + fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); 1.1009 + return -1; 1.1010 + case MBCS_STATE_CHANGE_ONLY: 1.1011 + fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); 1.1012 + return -1; 1.1013 + case MBCS_STATE_UNASSIGNED: 1.1014 + case MBCS_STATE_FALLBACK_DIRECT_16: 1.1015 + case MBCS_STATE_VALID_DIRECT_16: 1.1016 + case MBCS_STATE_FALLBACK_DIRECT_20: 1.1017 + case MBCS_STATE_VALID_DIRECT_20: 1.1018 + case MBCS_STATE_VALID_16: 1.1019 + case MBCS_STATE_VALID_16_PAIR: 1.1020 + /* count a complete character and prepare for a new one */ 1.1021 + ++count; 1.1022 + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1.1023 + offset=0; 1.1024 + break; 1.1025 + default: 1.1026 + /* reserved, must never occur */ 1.1027 + fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); 1.1028 + return -1; 1.1029 + } 1.1030 + } 1.1031 + } 1.1032 + 1.1033 + if(offset!=0) { 1.1034 + fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); 1.1035 + return -1; 1.1036 + } 1.1037 + 1.1038 + /* 1.1039 + * for SI/SO (like EBCDIC-stateful), multiple-character results 1.1040 + * must consist of only double-byte sequences 1.1041 + */ 1.1042 + if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { 1.1043 + fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); 1.1044 + return -1; 1.1045 + } 1.1046 + 1.1047 + return count; 1.1048 +} 1.1049 +#endif 1.1050 +