michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2003-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: ucmstate.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2003oct09 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This file handles ICU .ucm file state information as part of the ucm module. michael@0: * Most of this code used to be in makeconv.c. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "cstring.h" michael@0: #include "cmemory.h" michael@0: #include "uarrsort.h" michael@0: #include "ucnvmbcs.h" michael@0: #include "ucnv_ext.h" michael@0: #include "uparse.h" michael@0: #include "ucm.h" michael@0: #include michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: /* MBCS state handling ------------------------------------------------------ */ michael@0: michael@0: /* michael@0: * state table row grammar (ebnf-style): michael@0: * (whitespace is allowed between all tokens) michael@0: * michael@0: * row=[[firstentry ','] entry (',' entry)*] michael@0: * firstentry="initial" | "surrogates" michael@0: * (initial state (default for state 0), output is all surrogate pairs) michael@0: * entry=range [':' nextstate] ['.' action] michael@0: * range=number ['-' number] michael@0: * nextstate=number michael@0: * (0..7f) michael@0: * action='u' | 's' | 'p' | 'i' michael@0: * (unassigned, state change only, surrogate pair, illegal) michael@0: * number=(1- or 2-digit hexadecimal number) michael@0: */ michael@0: static const char * michael@0: parseState(const char *s, int32_t state[256], uint32_t *pFlags) { michael@0: const char *t; michael@0: uint32_t start, end, i; michael@0: int32_t entry; michael@0: michael@0: /* initialize the state: all illegal with U+ffff */ michael@0: for(i=0; i<256; ++i) { michael@0: state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); michael@0: } michael@0: michael@0: /* skip leading white space */ michael@0: s=u_skipWhitespace(s); michael@0: michael@0: /* is there an "initial" or "surrogates" directive? */ michael@0: if(uprv_strncmp("initial", s, 7)==0) { michael@0: *pFlags=MBCS_STATE_FLAG_DIRECT; michael@0: s=u_skipWhitespace(s+7); michael@0: if(*s++!=',') { michael@0: return s-1; michael@0: } michael@0: } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { michael@0: *pFlags=MBCS_STATE_FLAG_SURROGATES; michael@0: s=u_skipWhitespace(s+10); michael@0: if(*s++!=',') { michael@0: return s-1; michael@0: } michael@0: } else if(*s==0) { michael@0: /* empty state row: all-illegal */ michael@0: return NULL; michael@0: } michael@0: michael@0: for(;;) { michael@0: /* read an entry, the start of the range first */ michael@0: s=u_skipWhitespace(s); michael@0: start=uprv_strtoul(s, (char **)&t, 16); michael@0: if(s==t || 0xffcountStates==MBCS_MAX_STATE_COUNT) { michael@0: fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: error=parseState(s, states->stateTable[states->countStates], michael@0: &states->stateFlags[states->countStates]); michael@0: if(error!=NULL) { michael@0: fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: ++states->countStates; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: ucm_parseHeaderLine(UCMFile *ucm, michael@0: char *line, char **pKey, char **pValue) { michael@0: UCMStates *states; michael@0: char *s, *end; michael@0: char c; michael@0: michael@0: states=&ucm->states; michael@0: michael@0: /* remove comments and trailing CR and LF and remove whitespace from the end */ michael@0: for(end=line; (c=*end)!=0; ++end) { michael@0: if(c=='#' || c=='\r' || c=='\n') { michael@0: break; michael@0: } michael@0: } michael@0: while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { michael@0: --end; michael@0: } michael@0: *end=0; michael@0: michael@0: /* skip leading white space and ignore empty lines */ michael@0: s=(char *)u_skipWhitespace(line); michael@0: if(*s==0) { michael@0: return TRUE; michael@0: } michael@0: michael@0: /* stop at the beginning of the mapping section */ michael@0: if(uprv_memcmp(s, "CHARMAP", 7)==0) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* get the key name, bracketed in <> */ michael@0: if(*s!='<') { michael@0: fprintf(stderr, "ucm error: no header field in line \"%s\"\n", line); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: *pKey=++s; michael@0: while(*s!='>') { michael@0: if(*s==0) { michael@0: fprintf(stderr, "ucm error: incomplete header field in line \"%s\"\n", line); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: ++s; michael@0: } michael@0: *s=0; michael@0: michael@0: /* get the value string, possibly quoted */ michael@0: s=(char *)u_skipWhitespace(s+1); michael@0: if(*s!='"') { michael@0: *pValue=s; michael@0: } else { michael@0: /* remove the quotes */ michael@0: *pValue=s+1; michael@0: if(end>*pValue && *(end-1)=='"') { michael@0: *--end=0; michael@0: } michael@0: } michael@0: michael@0: /* collect the information from the header field, ignore unknown keys */ michael@0: if(uprv_strcmp(*pKey, "uconv_class")==0) { michael@0: if(uprv_strcmp(*pValue, "DBCS")==0) { michael@0: states->conversionType=UCNV_DBCS; michael@0: } else if(uprv_strcmp(*pValue, "SBCS")==0) { michael@0: states->conversionType = UCNV_SBCS; michael@0: } else if(uprv_strcmp(*pValue, "MBCS")==0) { michael@0: states->conversionType = UCNV_MBCS; michael@0: } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { michael@0: states->conversionType = UCNV_EBCDIC_STATEFUL; michael@0: } else { michael@0: fprintf(stderr, "ucm error: unknown %s\n", *pValue); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: return TRUE; michael@0: } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { michael@0: c=**pValue; michael@0: if('1'<=c && c<='4' && (*pValue)[1]==0) { michael@0: states->maxCharLength=(int8_t)(c-'0'); michael@0: states->outputType=(int8_t)(states->maxCharLength-1); michael@0: } else { michael@0: fprintf(stderr, "ucm error: illegal %s\n", *pValue); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: return TRUE; michael@0: } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { michael@0: c=**pValue; michael@0: if('1'<=c && c<='4' && (*pValue)[1]==0) { michael@0: states->minCharLength=(int8_t)(c-'0'); michael@0: } else { michael@0: fprintf(stderr, "ucm error: illegal %s\n", *pValue); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: return TRUE; michael@0: } else if(uprv_strcmp(*pKey, "icu:state")==0) { michael@0: /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ michael@0: switch(states->conversionType) { michael@0: case UCNV_SBCS: michael@0: case UCNV_DBCS: michael@0: case UCNV_EBCDIC_STATEFUL: michael@0: states->conversionType=UCNV_MBCS; michael@0: break; michael@0: case UCNV_MBCS: michael@0: break; michael@0: default: michael@0: fprintf(stderr, "ucm error: entry for non-MBCS table or before the line\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: if(states->maxCharLength==0) { michael@0: fprintf(stderr, "ucm error: before the line\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: ucm_addState(states, *pValue); michael@0: return TRUE; michael@0: } else if(uprv_strcmp(*pKey, "icu:base")==0) { michael@0: if(**pValue==0) { michael@0: fprintf(stderr, "ucm error: without a base table name\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: uprv_strcpy(ucm->baseName, *pValue); michael@0: return TRUE; michael@0: } michael@0: michael@0: return FALSE; michael@0: } michael@0: michael@0: /* post-processing ---------------------------------------------------------- */ michael@0: michael@0: static int32_t michael@0: sumUpStates(UCMStates *states) { michael@0: int32_t entry, sum, state, cell, count; michael@0: UBool allStatesReady; michael@0: michael@0: /* michael@0: * Sum up the offsets for all states. michael@0: * In each final state (where there are only final entries), michael@0: * the offsets add up directly. michael@0: * In all other state table rows, for each transition entry to another state, michael@0: * the offsets sum of that state needs to be added. michael@0: * This is achieved in at most countStates iterations. michael@0: */ michael@0: allStatesReady=FALSE; michael@0: for(count=states->countStates; !allStatesReady && count>=0; --count) { michael@0: allStatesReady=TRUE; michael@0: for(state=states->countStates-1; state>=0; --state) { michael@0: if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { michael@0: allStatesReady=FALSE; michael@0: sum=0; michael@0: michael@0: /* at first, add up only the final delta offsets to keep them <512 */ michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[state][cell]; michael@0: if(MBCS_ENTRY_IS_FINAL(entry)) { michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_VALID_16: michael@0: states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); michael@0: sum+=1; michael@0: break; michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); michael@0: sum+=2; michael@0: break; michael@0: default: michael@0: /* no addition */ michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* now, add up the delta offsets for the transitional entries */ michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[state][cell]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { michael@0: states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); michael@0: sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; michael@0: } else { michael@0: /* that next state does not have a sum yet, we cannot finish the one for this state */ michael@0: sum=-1; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(sum!=-1) { michael@0: states->stateOffsetSum[state]=sum; michael@0: states->stateFlags[state]|=MBCS_STATE_FLAG_READY; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(!allStatesReady) { michael@0: fprintf(stderr, "ucm error: the state table contains loops\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: /* michael@0: * For all "direct" (i.e., initial) states>0, michael@0: * the offsets need to be increased by the sum of michael@0: * the previous initial states. michael@0: */ michael@0: sum=states->stateOffsetSum[0]; michael@0: for(state=1; statecountStates; ++state) { michael@0: if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: int32_t sum2=sum; michael@0: sum+=states->stateOffsetSum[state]; michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[state][cell]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* round up to the next even number to have the following data 32-bit-aligned */ michael@0: return states->countToUCodeUnits=(sum+1)&~1; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { michael@0: int32_t entry, state, cell, count; michael@0: michael@0: if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { michael@0: fprintf(stderr, "ucm error: missing conversion type ()\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: if(states->countStates==0) { michael@0: switch(states->conversionType) { michael@0: case UCNV_SBCS: michael@0: /* SBCS: use MBCS data structure with a default state table */ michael@0: if(states->maxCharLength!=1) { michael@0: fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: states->conversionType=UCNV_MBCS; michael@0: ucm_addState(states, "0-ff"); michael@0: break; michael@0: case UCNV_MBCS: michael@0: fprintf(stderr, "ucm error: missing state table information () for MBCS\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: break; michael@0: case UCNV_EBCDIC_STATEFUL: michael@0: /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ michael@0: if(states->minCharLength!=1 || states->maxCharLength!=2) { michael@0: fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: states->conversionType=UCNV_MBCS; michael@0: ucm_addState(states, "0-ff, e:1.s, f:0.s"); michael@0: ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); michael@0: ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); michael@0: ucm_addState(states, "0-ff:1.i, 40:1."); michael@0: ucm_addState(states, "0-ff:1.i"); michael@0: break; michael@0: case UCNV_DBCS: michael@0: /* DBCS: use MBCS data structure with a default state table */ michael@0: if(states->minCharLength!=2 || states->maxCharLength!=2) { michael@0: fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: states->conversionType = UCNV_MBCS; michael@0: ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); michael@0: ucm_addState(states, "41-fe"); michael@0: ucm_addState(states, "40"); michael@0: ucm_addState(states, ""); michael@0: break; michael@0: default: michael@0: fprintf(stderr, "ucm error: unknown charset structure\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * check that the min/max character lengths are reasonable; michael@0: * to do this right, all paths through the state table would have to be michael@0: * recursively walked while keeping track of the sequence lengths, michael@0: * but these simple checks cover most state tables in practice michael@0: */ michael@0: if(states->maxCharLengthminCharLength) { michael@0: fprintf(stderr, "ucm error: max B/char < min B/char\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: /* count non-direct states and compare with max B/char */ michael@0: count=0; michael@0: for(state=0; statecountStates; ++state) { michael@0: if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { michael@0: ++count; michael@0: } michael@0: } michael@0: if(states->maxCharLength>count+1) { michael@0: fprintf(stderr, "ucm error: max B/char too large\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: michael@0: if(states->minCharLength==1) { michael@0: int32_t action; michael@0: michael@0: /* michael@0: * if there are single-byte characters, michael@0: * then the initial state must have direct result states michael@0: */ michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[0][cell]; michael@0: if( MBCS_ENTRY_IS_FINAL(entry) && michael@0: ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || michael@0: action==MBCS_STATE_UNASSIGNED) michael@0: ) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if(cell==256) { michael@0: fprintf(stderr, "ucm warning: min B/char too small\n"); michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * make sure that all "next state" values are within limits michael@0: * and that all next states after final ones have the "direct" michael@0: * flag of initial states michael@0: */ michael@0: for(state=states->countStates-1; state>=0; --state) { michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[state][cell]; michael@0: if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { michael@0: fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", michael@0: (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { michael@0: fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", michael@0: (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", michael@0: (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* is this an SI/SO (like EBCDIC-stateful) state table? */ michael@0: if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: if(states->maxCharLength!=2) { michael@0: fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: if(states->countStates<3) { michael@0: fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: /* are the SI/SO all in the right places? */ michael@0: if( ignoreSISOCheck || michael@0: (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && michael@0: states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && michael@0: states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && michael@0: states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) michael@0: ) { michael@0: states->outputType=MBCS_OUTPUT_2_SISO; michael@0: } else { michael@0: fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: state=2; michael@0: } else { michael@0: state=1; michael@0: } michael@0: michael@0: /* check that no unexpected state is a "direct" one */ michael@0: while(statecountStates) { michael@0: if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); michael@0: exit(U_INVALID_TABLE_FORMAT); michael@0: } michael@0: ++state; michael@0: } michael@0: michael@0: sumUpStates(states); michael@0: } michael@0: michael@0: /* find a fallback for this offset; return the index or -1 if not found */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, michael@0: uint32_t offset) { michael@0: int32_t i; michael@0: michael@0: if(countToUFallbacks==0) { michael@0: /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ michael@0: return -1; michael@0: } michael@0: michael@0: /* do a linear search for the fallback mapping (the table is not yet sorted) */ michael@0: for(i=0; ioutputType==MBCS_OUTPUT_2_SISO) { michael@0: /* use the DBCS lead state for SI/SO codepages */ michael@0: leadState=1; michael@0: } else { michael@0: leadState=0; michael@0: } michael@0: michael@0: /* find the main trail state: the most used target state */ michael@0: uprv_memset(count, 0, sizeof(count)); michael@0: for(i=0; i<256; ++i) { michael@0: entry=states->stateTable[leadState][i]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; michael@0: } michael@0: } michael@0: trailState=0; michael@0: for(i=1; icountStates; ++i) { michael@0: if(count[i]>count[trailState]) { michael@0: trailState=i; michael@0: } michael@0: } michael@0: michael@0: /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ michael@0: uprv_memset(count, 0, sizeof(count)); michael@0: savings=0; michael@0: /* for each lead byte */ michael@0: for(i=0; i<256; ++i) { michael@0: entry=states->stateTable[leadState][i]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { michael@0: /* the offset is different for each lead byte */ michael@0: offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); michael@0: /* for each trail byte for this lead byte */ michael@0: for(j=0; j<256; ++j) { michael@0: entry=states->stateTable[trailState][j]; michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_VALID_16: michael@0: entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { michael@0: ++count[i]; michael@0: } else { michael@0: j=999; /* do not count for this lead byte because there are assignments */ michael@0: } michael@0: break; michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: if((*pUnicodeCodeUnits)[entry]==0xfffe) { michael@0: count[i]+=2; michael@0: } else { michael@0: j=999; /* do not count for this lead byte because there are assignments */ michael@0: } michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: if(j==256) { michael@0: /* all trail bytes for this lead byte are unassigned */ michael@0: savings+=count[i]; michael@0: } else { michael@0: count[i]=0; michael@0: } michael@0: } michael@0: } michael@0: /* subtract from the possible savings the cost of an additional state */ michael@0: savings=savings*2-1024; /* count bytes, not 16-bit words */ michael@0: if(savings<=0) { michael@0: return; michael@0: } michael@0: if(verbose) { michael@0: printf("compacting toUnicode data saves %ld bytes\n", (long)savings); michael@0: } michael@0: if(states->countStates>=MBCS_MAX_STATE_COUNT) { michael@0: fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); michael@0: return; michael@0: } michael@0: michael@0: /* make a copy of the state table */ michael@0: oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); michael@0: if(oldStateTable==NULL) { michael@0: fprintf(stderr, "cannot compact toUnicode: out of memory\n"); michael@0: return; michael@0: } michael@0: uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); michael@0: michael@0: /* add the new state */ michael@0: /* michael@0: * this function does not catch the degenerate case where all lead bytes michael@0: * have all-unassigned trail bytes and the lead state could be removed michael@0: */ michael@0: newState=states->countStates++; michael@0: states->stateFlags[newState]=0; michael@0: /* copy the old trail state, turning all assigned states into unassigned ones */ michael@0: for(i=0; i<256; ++i) { michael@0: entry=states->stateTable[trailState][i]; michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_VALID_16: michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); michael@0: break; michael@0: default: michael@0: states->stateTable[newState][i]=entry; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ michael@0: for(i=0; i<256; ++i) { michael@0: if(count[i]>0) { michael@0: states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); michael@0: } michael@0: } michael@0: michael@0: /* sum up the new state table */ michael@0: for(i=0; icountStates; ++i) { michael@0: states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; michael@0: } michael@0: sum=sumUpStates(states); michael@0: michael@0: /* allocate a new, smaller code units array */ michael@0: oldUnicodeCodeUnits=*pUnicodeCodeUnits; michael@0: if(sum==0) { michael@0: *pUnicodeCodeUnits=NULL; michael@0: if(oldUnicodeCodeUnits!=NULL) { michael@0: uprv_free(oldUnicodeCodeUnits); michael@0: } michael@0: uprv_free(oldStateTable); michael@0: return; michael@0: } michael@0: *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); michael@0: if(*pUnicodeCodeUnits==NULL) { michael@0: fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", michael@0: (long)sum); michael@0: /* revert to the old state table */ michael@0: *pUnicodeCodeUnits=oldUnicodeCodeUnits; michael@0: --states->countStates; michael@0: uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); michael@0: uprv_free(oldStateTable); michael@0: return; michael@0: } michael@0: for(i=0; icountStates; ++leadState) { michael@0: if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: /* for each lead byte from there */ michael@0: for(i=0; i<256; ++i) { michael@0: entry=states->stateTable[leadState][i]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); michael@0: /* the new state does not have assigned states */ michael@0: if(trailState!=newState) { michael@0: trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); michael@0: oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); michael@0: /* for each trail byte */ michael@0: for(j=0; j<256; ++j) { michael@0: entry=states->stateTable[trailState][j]; michael@0: /* copy assigned-character code units and adjust fallback offsets */ michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_VALID_16: michael@0: offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: /* find the old offset according to the old state table */ michael@0: oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); michael@0: unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; michael@0: if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { michael@0: toUFallbacks[fallback].offset=0x80000000|offset; michael@0: } michael@0: break; michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: /* find the old offset according to the old state table */ michael@0: oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); michael@0: (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; michael@0: (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* remove temporary flags from fallback offsets that protected them from being modified twice */ michael@0: for(i=0; i0 number of bytes that are used in unicodeCodeUnits[] that could be saved, michael@0: * if all sequences from this state are unassigned, returns the michael@0: * <0 there are assignments in unicodeCodeUnits[] michael@0: * 0 no use of unicodeCodeUnits[] michael@0: */ michael@0: static int32_t michael@0: findUnassigned(UCMStates *states, michael@0: uint16_t *unicodeCodeUnits, michael@0: _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, michael@0: int32_t state, int32_t offset, uint32_t b) { michael@0: int32_t i, entry, savings, localSavings, belowSavings; michael@0: UBool haveAssigned; michael@0: michael@0: localSavings=belowSavings=0; michael@0: haveAssigned=FALSE; michael@0: for(i=0; i<256; ++i) { michael@0: entry=states->stateTable[state][i]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: savings=findUnassigned(states, michael@0: unicodeCodeUnits, michael@0: toUFallbacks, countToUFallbacks, michael@0: MBCS_ENTRY_TRANSITION_STATE(entry), michael@0: offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), michael@0: (b<<8)|(uint32_t)i); michael@0: if(savings<0) { michael@0: haveAssigned=TRUE; michael@0: } else if(savings>0) { michael@0: printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", michael@0: (unsigned long)((b<<8)|i), (long)state, (long)savings); michael@0: belowSavings+=savings; michael@0: } michael@0: } else if(!haveAssigned) { michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_VALID_16: michael@0: entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { michael@0: localSavings+=2; michael@0: } else { michael@0: haveAssigned=TRUE; michael@0: } michael@0: break; michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); michael@0: if(unicodeCodeUnits[entry]==0xfffe) { michael@0: localSavings+=4; michael@0: } else { michael@0: haveAssigned=TRUE; michael@0: } michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: if(haveAssigned) { michael@0: return -1; michael@0: } else { michael@0: return localSavings+belowSavings; michael@0: } michael@0: } michael@0: michael@0: /* helper function for finding compaction opportunities */ michael@0: static void michael@0: compactToUnicodeHelper(UCMStates *states, michael@0: uint16_t *unicodeCodeUnits, michael@0: _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { michael@0: int32_t state, savings; michael@0: michael@0: /* for each initial state */ michael@0: for(state=0; statecountStates; ++state) { michael@0: if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { michael@0: savings=findUnassigned(states, michael@0: unicodeCodeUnits, michael@0: toUFallbacks, countToUFallbacks, michael@0: state, 0, 0); michael@0: if(savings>0) { michael@0: printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", michael@0: (long)state, (long)savings); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: static int32_t michael@0: compareFallbacks(const void *context, const void *fb1, const void *fb2) { michael@0: return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: ucm_optimizeStates(UCMStates *states, michael@0: uint16_t **pUnicodeCodeUnits, michael@0: _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, michael@0: UBool verbose) { michael@0: UErrorCode errorCode; michael@0: int32_t state, cell, entry; michael@0: michael@0: /* test each state table entry */ michael@0: for(state=0; statecountStates; ++state) { michael@0: for(cell=0; cell<256; ++cell) { michael@0: entry=states->stateTable[state][cell]; michael@0: /* michael@0: * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code michael@0: * and the code point is "unassigned" (0xfffe), then change it to michael@0: * the "unassigned" action code with bits 26..23 set to zero and U+fffe. michael@0: */ michael@0: if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { michael@0: states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* try to compact the toUnicode tables */ michael@0: if(states->maxCharLength==2) { michael@0: compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); michael@0: } else if(states->maxCharLength>2) { michael@0: if(verbose) { michael@0: compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); michael@0: } michael@0: } michael@0: michael@0: /* sort toUFallbacks */ michael@0: /* michael@0: * It should be safe to sort them before compactToUnicode2() is called, michael@0: * because it should not change the relative order of the offset values michael@0: * that it adjusts, but they need to be sorted at some point, and michael@0: * it is safest here. michael@0: */ michael@0: if(countToUFallbacks>0) { michael@0: errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ michael@0: uprv_sortArray(toUFallbacks, countToUFallbacks, michael@0: sizeof(_MBCSToUFallback), michael@0: compareFallbacks, NULL, FALSE, &errorCode); michael@0: } michael@0: } michael@0: michael@0: /* use a complete state table ----------------------------------------------- */ michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucm_countChars(UCMStates *states, michael@0: const uint8_t *bytes, int32_t length) { michael@0: uint32_t offset; michael@0: int32_t i, entry, count; michael@0: uint8_t state; michael@0: michael@0: offset=0; michael@0: count=0; michael@0: state=0; michael@0: michael@0: if(states->countStates==0) { michael@0: fprintf(stderr, "ucm error: there is no state information!\n"); michael@0: return -1; michael@0: } michael@0: michael@0: /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ michael@0: if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { michael@0: state=1; michael@0: } michael@0: michael@0: /* michael@0: * Walk down the state table like in conversion, michael@0: * much like getNextUChar(). michael@0: * We assume that c<=0x10ffff. michael@0: */ michael@0: for(i=0; istateTable[state][bytes[i]]; michael@0: if(MBCS_ENTRY_IS_TRANSITION(entry)) { michael@0: state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); michael@0: offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); michael@0: } else { michael@0: switch(MBCS_ENTRY_FINAL_ACTION(entry)) { michael@0: case MBCS_STATE_ILLEGAL: michael@0: fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); michael@0: return -1; michael@0: case MBCS_STATE_CHANGE_ONLY: michael@0: fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); michael@0: return -1; michael@0: case MBCS_STATE_UNASSIGNED: michael@0: case MBCS_STATE_FALLBACK_DIRECT_16: michael@0: case MBCS_STATE_VALID_DIRECT_16: michael@0: case MBCS_STATE_FALLBACK_DIRECT_20: michael@0: case MBCS_STATE_VALID_DIRECT_20: michael@0: case MBCS_STATE_VALID_16: michael@0: case MBCS_STATE_VALID_16_PAIR: michael@0: /* count a complete character and prepare for a new one */ michael@0: ++count; michael@0: state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); michael@0: offset=0; michael@0: break; michael@0: default: michael@0: /* reserved, must never occur */ michael@0: fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); michael@0: return -1; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(offset!=0) { michael@0: fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); michael@0: return -1; michael@0: } michael@0: michael@0: /* michael@0: * for SI/SO (like EBCDIC-stateful), multiple-character results michael@0: * must consist of only double-byte sequences michael@0: */ michael@0: if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { michael@0: fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); michael@0: return -1; michael@0: } michael@0: michael@0: return count; michael@0: } michael@0: #endif michael@0: