intl/icu/source/tools/toolutil/ucmstate.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/toolutil/ucmstate.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1047 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2003-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucmstate.c
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2003oct09
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   This file handles ICU .ucm file state information as part of the ucm module.
    1.20 +*   Most of this code used to be in makeconv.c.
    1.21 +*/
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +#include "cstring.h"
    1.25 +#include "cmemory.h"
    1.26 +#include "uarrsort.h"
    1.27 +#include "ucnvmbcs.h"
    1.28 +#include "ucnv_ext.h"
    1.29 +#include "uparse.h"
    1.30 +#include "ucm.h"
    1.31 +#include <stdio.h>
    1.32 +
    1.33 +#if !UCONFIG_NO_CONVERSION
    1.34 +
    1.35 +/* MBCS state handling ------------------------------------------------------ */
    1.36 +
    1.37 +/*
    1.38 + * state table row grammar (ebnf-style):
    1.39 + * (whitespace is allowed between all tokens)
    1.40 + *
    1.41 + * row=[[firstentry ','] entry (',' entry)*]
    1.42 + * firstentry="initial" | "surrogates"
    1.43 + *            (initial state (default for state 0), output is all surrogate pairs)
    1.44 + * entry=range [':' nextstate] ['.' action]
    1.45 + * range=number ['-' number]
    1.46 + * nextstate=number
    1.47 + *           (0..7f)
    1.48 + * action='u' | 's' | 'p' | 'i'
    1.49 + *        (unassigned, state change only, surrogate pair, illegal)
    1.50 + * number=(1- or 2-digit hexadecimal number)
    1.51 + */
    1.52 +static const char *
    1.53 +parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
    1.54 +    const char *t;
    1.55 +    uint32_t start, end, i;
    1.56 +    int32_t entry;
    1.57 +
    1.58 +    /* initialize the state: all illegal with U+ffff */
    1.59 +    for(i=0; i<256; ++i) {
    1.60 +        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
    1.61 +    }
    1.62 +
    1.63 +    /* skip leading white space */
    1.64 +    s=u_skipWhitespace(s);
    1.65 +
    1.66 +    /* is there an "initial" or "surrogates" directive? */
    1.67 +    if(uprv_strncmp("initial", s, 7)==0) {
    1.68 +        *pFlags=MBCS_STATE_FLAG_DIRECT;
    1.69 +        s=u_skipWhitespace(s+7);
    1.70 +        if(*s++!=',') {
    1.71 +            return s-1;
    1.72 +        }
    1.73 +    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
    1.74 +        *pFlags=MBCS_STATE_FLAG_SURROGATES;
    1.75 +        s=u_skipWhitespace(s+10);
    1.76 +        if(*s++!=',') {
    1.77 +            return s-1;
    1.78 +        }
    1.79 +    } else if(*s==0) {
    1.80 +        /* empty state row: all-illegal */
    1.81 +        return NULL;
    1.82 +    }
    1.83 +
    1.84 +    for(;;) {
    1.85 +        /* read an entry, the start of the range first */
    1.86 +        s=u_skipWhitespace(s);
    1.87 +        start=uprv_strtoul(s, (char **)&t, 16);
    1.88 +        if(s==t || 0xff<start) {
    1.89 +            return s;
    1.90 +        }
    1.91 +        s=u_skipWhitespace(t);
    1.92 +
    1.93 +        /* read the end of the range if there is one */
    1.94 +        if(*s=='-') {
    1.95 +            s=u_skipWhitespace(s+1);
    1.96 +            end=uprv_strtoul(s, (char **)&t, 16);
    1.97 +            if(s==t || end<start || 0xff<end) {
    1.98 +                return s;
    1.99 +            }
   1.100 +            s=u_skipWhitespace(t);
   1.101 +        } else {
   1.102 +            end=start;
   1.103 +        }
   1.104 +
   1.105 +        /* determine the state entrys for this range */
   1.106 +        if(*s!=':' && *s!='.') {
   1.107 +            /* the default is: final state with valid entries */
   1.108 +            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
   1.109 +        } else {
   1.110 +            entry=MBCS_ENTRY_TRANSITION(0, 0);
   1.111 +            if(*s==':') {
   1.112 +                /* get the next state, default to 0 */
   1.113 +                s=u_skipWhitespace(s+1);
   1.114 +                i=uprv_strtoul(s, (char **)&t, 16);
   1.115 +                if(s!=t) {
   1.116 +                    if(0x7f<i) {
   1.117 +                        return s;
   1.118 +                    }
   1.119 +                    s=u_skipWhitespace(t);
   1.120 +                    entry=MBCS_ENTRY_SET_STATE(entry, i);
   1.121 +                }
   1.122 +            }
   1.123 +
   1.124 +            /* get the state action, default to valid */
   1.125 +            if(*s=='.') {
   1.126 +                /* this is a final state */
   1.127 +                entry=MBCS_ENTRY_SET_FINAL(entry);
   1.128 +
   1.129 +                s=u_skipWhitespace(s+1);
   1.130 +                if(*s=='u') {
   1.131 +                    /* unassigned set U+fffe */
   1.132 +                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
   1.133 +                    s=u_skipWhitespace(s+1);
   1.134 +                } else if(*s=='p') {
   1.135 +                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
   1.136 +                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
   1.137 +                    } else {
   1.138 +                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
   1.139 +                    }
   1.140 +                    s=u_skipWhitespace(s+1);
   1.141 +                } else if(*s=='s') {
   1.142 +                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
   1.143 +                    s=u_skipWhitespace(s+1);
   1.144 +                } else if(*s=='i') {
   1.145 +                    /* illegal set U+ffff */
   1.146 +                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
   1.147 +                    s=u_skipWhitespace(s+1);
   1.148 +                } else {
   1.149 +                    /* default to valid */
   1.150 +                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
   1.151 +                }
   1.152 +            } else {
   1.153 +                /* this is an intermediate state, nothing to do */
   1.154 +            }
   1.155 +        }
   1.156 +
   1.157 +        /* adjust "final valid" states according to the state flags */
   1.158 +        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
   1.159 +            switch(*pFlags) {
   1.160 +            case 0:
   1.161 +                /* no adjustment */
   1.162 +                break;
   1.163 +            case MBCS_STATE_FLAG_DIRECT:
   1.164 +                /* set the valid-direct code point to "unassigned"==0xfffe */
   1.165 +                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
   1.166 +                break;
   1.167 +            case MBCS_STATE_FLAG_SURROGATES:
   1.168 +                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
   1.169 +                break;
   1.170 +            default:
   1.171 +                break;
   1.172 +            }
   1.173 +        }
   1.174 +
   1.175 +        /* set this entry for the range */
   1.176 +        for(i=start; i<=end; ++i) {
   1.177 +            state[i]=entry;
   1.178 +        }
   1.179 +
   1.180 +        if(*s==',') {
   1.181 +            ++s;
   1.182 +        } else {
   1.183 +            return *s==0 ? NULL : s;
   1.184 +        }
   1.185 +    }
   1.186 +}
   1.187 +
   1.188 +U_CAPI void U_EXPORT2
   1.189 +ucm_addState(UCMStates *states, const char *s) {
   1.190 +    const char *error;
   1.191 +
   1.192 +    if(states->countStates==MBCS_MAX_STATE_COUNT) {
   1.193 +        fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
   1.194 +        exit(U_INVALID_TABLE_FORMAT);
   1.195 +    }
   1.196 +
   1.197 +    error=parseState(s, states->stateTable[states->countStates],
   1.198 +                       &states->stateFlags[states->countStates]);
   1.199 +    if(error!=NULL) {
   1.200 +        fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
   1.201 +        exit(U_INVALID_TABLE_FORMAT);
   1.202 +    }
   1.203 +
   1.204 +    ++states->countStates;
   1.205 +}
   1.206 +
   1.207 +U_CAPI UBool U_EXPORT2
   1.208 +ucm_parseHeaderLine(UCMFile *ucm,
   1.209 +                    char *line, char **pKey, char **pValue) {
   1.210 +    UCMStates *states;
   1.211 +    char *s, *end;
   1.212 +    char c;
   1.213 +
   1.214 +    states=&ucm->states;
   1.215 +
   1.216 +    /* remove comments and trailing CR and LF and remove whitespace from the end */
   1.217 +    for(end=line; (c=*end)!=0; ++end) {
   1.218 +        if(c=='#' || c=='\r' || c=='\n') {
   1.219 +            break;
   1.220 +        }
   1.221 +    }
   1.222 +    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
   1.223 +        --end;
   1.224 +    }
   1.225 +    *end=0;
   1.226 +
   1.227 +    /* skip leading white space and ignore empty lines */
   1.228 +    s=(char *)u_skipWhitespace(line);
   1.229 +    if(*s==0) {
   1.230 +        return TRUE;
   1.231 +    }
   1.232 +
   1.233 +    /* stop at the beginning of the mapping section */
   1.234 +    if(uprv_memcmp(s, "CHARMAP", 7)==0) {
   1.235 +        return FALSE;
   1.236 +    }
   1.237 +
   1.238 +    /* get the key name, bracketed in <> */
   1.239 +    if(*s!='<') {
   1.240 +        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
   1.241 +        exit(U_INVALID_TABLE_FORMAT);
   1.242 +    }
   1.243 +    *pKey=++s;
   1.244 +    while(*s!='>') {
   1.245 +        if(*s==0) {
   1.246 +            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
   1.247 +            exit(U_INVALID_TABLE_FORMAT);
   1.248 +        }
   1.249 +        ++s;
   1.250 +    }
   1.251 +    *s=0;
   1.252 +
   1.253 +    /* get the value string, possibly quoted */
   1.254 +    s=(char *)u_skipWhitespace(s+1);
   1.255 +    if(*s!='"') {
   1.256 +        *pValue=s;
   1.257 +    } else {
   1.258 +        /* remove the quotes */
   1.259 +        *pValue=s+1;
   1.260 +        if(end>*pValue && *(end-1)=='"') {
   1.261 +            *--end=0;
   1.262 +        }
   1.263 +    }
   1.264 +
   1.265 +    /* collect the information from the header field, ignore unknown keys */
   1.266 +    if(uprv_strcmp(*pKey, "uconv_class")==0) {
   1.267 +        if(uprv_strcmp(*pValue, "DBCS")==0) {
   1.268 +            states->conversionType=UCNV_DBCS;
   1.269 +        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
   1.270 +            states->conversionType = UCNV_SBCS;
   1.271 +        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
   1.272 +            states->conversionType = UCNV_MBCS;
   1.273 +        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
   1.274 +            states->conversionType = UCNV_EBCDIC_STATEFUL;
   1.275 +        } else {
   1.276 +            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
   1.277 +            exit(U_INVALID_TABLE_FORMAT);
   1.278 +        }
   1.279 +        return TRUE;
   1.280 +    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
   1.281 +        c=**pValue;
   1.282 +        if('1'<=c && c<='4' && (*pValue)[1]==0) {
   1.283 +            states->maxCharLength=(int8_t)(c-'0');
   1.284 +            states->outputType=(int8_t)(states->maxCharLength-1);
   1.285 +        } else {
   1.286 +            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
   1.287 +            exit(U_INVALID_TABLE_FORMAT);
   1.288 +        }
   1.289 +        return TRUE;
   1.290 +    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
   1.291 +        c=**pValue;
   1.292 +        if('1'<=c && c<='4' && (*pValue)[1]==0) {
   1.293 +            states->minCharLength=(int8_t)(c-'0');
   1.294 +        } else {
   1.295 +            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
   1.296 +            exit(U_INVALID_TABLE_FORMAT);
   1.297 +        }
   1.298 +        return TRUE;
   1.299 +    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
   1.300 +        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
   1.301 +        switch(states->conversionType) {
   1.302 +        case UCNV_SBCS:
   1.303 +        case UCNV_DBCS:
   1.304 +        case UCNV_EBCDIC_STATEFUL:
   1.305 +            states->conversionType=UCNV_MBCS;
   1.306 +            break;
   1.307 +        case UCNV_MBCS:
   1.308 +            break;
   1.309 +        default:
   1.310 +            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
   1.311 +            exit(U_INVALID_TABLE_FORMAT);
   1.312 +        }
   1.313 +
   1.314 +        if(states->maxCharLength==0) {
   1.315 +            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
   1.316 +            exit(U_INVALID_TABLE_FORMAT);
   1.317 +        }
   1.318 +        ucm_addState(states, *pValue);
   1.319 +        return TRUE;
   1.320 +    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
   1.321 +        if(**pValue==0) {
   1.322 +            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
   1.323 +            exit(U_INVALID_TABLE_FORMAT);
   1.324 +        }
   1.325 +        uprv_strcpy(ucm->baseName, *pValue);
   1.326 +        return TRUE;
   1.327 +    }
   1.328 +
   1.329 +    return FALSE;
   1.330 +}
   1.331 +
   1.332 +/* post-processing ---------------------------------------------------------- */
   1.333 +
   1.334 +static int32_t
   1.335 +sumUpStates(UCMStates *states) {
   1.336 +    int32_t entry, sum, state, cell, count;
   1.337 +    UBool allStatesReady;
   1.338 +
   1.339 +    /*
   1.340 +     * Sum up the offsets for all states.
   1.341 +     * In each final state (where there are only final entries),
   1.342 +     * the offsets add up directly.
   1.343 +     * In all other state table rows, for each transition entry to another state,
   1.344 +     * the offsets sum of that state needs to be added.
   1.345 +     * This is achieved in at most countStates iterations.
   1.346 +     */
   1.347 +    allStatesReady=FALSE;
   1.348 +    for(count=states->countStates; !allStatesReady && count>=0; --count) {
   1.349 +        allStatesReady=TRUE;
   1.350 +        for(state=states->countStates-1; state>=0; --state) {
   1.351 +            if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
   1.352 +                allStatesReady=FALSE;
   1.353 +                sum=0;
   1.354 +
   1.355 +                /* at first, add up only the final delta offsets to keep them <512 */
   1.356 +                for(cell=0; cell<256; ++cell) {
   1.357 +                    entry=states->stateTable[state][cell];
   1.358 +                    if(MBCS_ENTRY_IS_FINAL(entry)) {
   1.359 +                        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1.360 +                        case MBCS_STATE_VALID_16:
   1.361 +                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
   1.362 +                            sum+=1;
   1.363 +                            break;
   1.364 +                        case MBCS_STATE_VALID_16_PAIR:
   1.365 +                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
   1.366 +                            sum+=2;
   1.367 +                            break;
   1.368 +                        default:
   1.369 +                            /* no addition */
   1.370 +                            break;
   1.371 +                        }
   1.372 +                    }
   1.373 +                }
   1.374 +
   1.375 +                /* now, add up the delta offsets for the transitional entries */
   1.376 +                for(cell=0; cell<256; ++cell) {
   1.377 +                    entry=states->stateTable[state][cell];
   1.378 +                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1.379 +                        if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
   1.380 +                            states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
   1.381 +                            sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
   1.382 +                        } else {
   1.383 +                            /* that next state does not have a sum yet, we cannot finish the one for this state */
   1.384 +                            sum=-1;
   1.385 +                            break;
   1.386 +                        }
   1.387 +                    }
   1.388 +                }
   1.389 +
   1.390 +                if(sum!=-1) {
   1.391 +                    states->stateOffsetSum[state]=sum;
   1.392 +                    states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
   1.393 +                }
   1.394 +            }
   1.395 +        }
   1.396 +    }
   1.397 +
   1.398 +    if(!allStatesReady) {
   1.399 +        fprintf(stderr, "ucm error: the state table contains loops\n");
   1.400 +        exit(U_INVALID_TABLE_FORMAT);
   1.401 +    }
   1.402 +
   1.403 +    /*
   1.404 +     * For all "direct" (i.e., initial) states>0,
   1.405 +     * the offsets need to be increased by the sum of
   1.406 +     * the previous initial states.
   1.407 +     */
   1.408 +    sum=states->stateOffsetSum[0];
   1.409 +    for(state=1; state<states->countStates; ++state) {
   1.410 +        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.411 +            int32_t sum2=sum;
   1.412 +            sum+=states->stateOffsetSum[state];
   1.413 +            for(cell=0; cell<256; ++cell) {
   1.414 +                entry=states->stateTable[state][cell];
   1.415 +                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1.416 +                    states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
   1.417 +                }
   1.418 +            }
   1.419 +        }
   1.420 +    }
   1.421 +
   1.422 +    /* round up to the next even number to have the following data 32-bit-aligned */
   1.423 +    return states->countToUCodeUnits=(sum+1)&~1;
   1.424 +}
   1.425 +
   1.426 +U_CAPI void U_EXPORT2
   1.427 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
   1.428 +    int32_t entry, state, cell, count;
   1.429 +
   1.430 +    if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
   1.431 +        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
   1.432 +        exit(U_INVALID_TABLE_FORMAT);
   1.433 +    }
   1.434 +
   1.435 +    if(states->countStates==0) {
   1.436 +        switch(states->conversionType) {
   1.437 +        case UCNV_SBCS:
   1.438 +            /* SBCS: use MBCS data structure with a default state table */
   1.439 +            if(states->maxCharLength!=1) {
   1.440 +                fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
   1.441 +                exit(U_INVALID_TABLE_FORMAT);
   1.442 +            }
   1.443 +            states->conversionType=UCNV_MBCS;
   1.444 +            ucm_addState(states, "0-ff");
   1.445 +            break;
   1.446 +        case UCNV_MBCS:
   1.447 +            fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
   1.448 +            exit(U_INVALID_TABLE_FORMAT);
   1.449 +            break;
   1.450 +        case UCNV_EBCDIC_STATEFUL:
   1.451 +            /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
   1.452 +            if(states->minCharLength!=1 || states->maxCharLength!=2) {
   1.453 +                fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
   1.454 +                exit(U_INVALID_TABLE_FORMAT);
   1.455 +            }
   1.456 +            states->conversionType=UCNV_MBCS;
   1.457 +            ucm_addState(states, "0-ff, e:1.s, f:0.s");
   1.458 +            ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
   1.459 +            ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
   1.460 +            ucm_addState(states, "0-ff:1.i, 40:1.");
   1.461 +            ucm_addState(states, "0-ff:1.i");
   1.462 +            break;
   1.463 +        case UCNV_DBCS:
   1.464 +            /* DBCS: use MBCS data structure with a default state table */
   1.465 +            if(states->minCharLength!=2 || states->maxCharLength!=2) {
   1.466 +                fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
   1.467 +                exit(U_INVALID_TABLE_FORMAT);
   1.468 +            }
   1.469 +            states->conversionType = UCNV_MBCS;
   1.470 +            ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
   1.471 +            ucm_addState(states, "41-fe");
   1.472 +            ucm_addState(states, "40");
   1.473 +            ucm_addState(states, "");
   1.474 +            break;
   1.475 +        default:
   1.476 +            fprintf(stderr, "ucm error: unknown charset structure\n");
   1.477 +            exit(U_INVALID_TABLE_FORMAT);
   1.478 +            break;
   1.479 +        }
   1.480 +    }
   1.481 +
   1.482 +    /*
   1.483 +     * check that the min/max character lengths are reasonable;
   1.484 +     * to do this right, all paths through the state table would have to be
   1.485 +     * recursively walked while keeping track of the sequence lengths,
   1.486 +     * but these simple checks cover most state tables in practice
   1.487 +     */
   1.488 +    if(states->maxCharLength<states->minCharLength) {
   1.489 +        fprintf(stderr, "ucm error: max B/char < min B/char\n");
   1.490 +        exit(U_INVALID_TABLE_FORMAT);
   1.491 +    }
   1.492 +
   1.493 +    /* count non-direct states and compare with max B/char */
   1.494 +    count=0;
   1.495 +    for(state=0; state<states->countStates; ++state) {
   1.496 +        if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
   1.497 +            ++count;
   1.498 +        }
   1.499 +    }
   1.500 +    if(states->maxCharLength>count+1) {
   1.501 +        fprintf(stderr, "ucm error: max B/char too large\n");
   1.502 +        exit(U_INVALID_TABLE_FORMAT);
   1.503 +    }
   1.504 +
   1.505 +    if(states->minCharLength==1) {
   1.506 +        int32_t action;
   1.507 +
   1.508 +        /*
   1.509 +         * if there are single-byte characters,
   1.510 +         * then the initial state must have direct result states
   1.511 +         */
   1.512 +        for(cell=0; cell<256; ++cell) {
   1.513 +            entry=states->stateTable[0][cell];
   1.514 +            if( MBCS_ENTRY_IS_FINAL(entry) &&
   1.515 +                ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
   1.516 +                 action==MBCS_STATE_UNASSIGNED)
   1.517 +            ) {
   1.518 +                break;
   1.519 +            }
   1.520 +        }
   1.521 +
   1.522 +        if(cell==256) {
   1.523 +            fprintf(stderr, "ucm warning: min B/char too small\n");
   1.524 +        }
   1.525 +    }
   1.526 +
   1.527 +    /*
   1.528 +     * make sure that all "next state" values are within limits
   1.529 +     * and that all next states after final ones have the "direct"
   1.530 +     * flag of initial states
   1.531 +     */
   1.532 +    for(state=states->countStates-1; state>=0; --state) {
   1.533 +        for(cell=0; cell<256; ++cell) {
   1.534 +            entry=states->stateTable[state][cell];
   1.535 +            if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
   1.536 +                fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
   1.537 +                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   1.538 +                exit(U_INVALID_TABLE_FORMAT);
   1.539 +            }
   1.540 +            if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
   1.541 +                fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
   1.542 +                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   1.543 +                exit(U_INVALID_TABLE_FORMAT);
   1.544 +            } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.545 +                fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
   1.546 +                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   1.547 +                exit(U_INVALID_TABLE_FORMAT);
   1.548 +            }
   1.549 +        }
   1.550 +    }
   1.551 +
   1.552 +    /* is this an SI/SO (like EBCDIC-stateful) state table? */
   1.553 +    if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.554 +        if(states->maxCharLength!=2) {
   1.555 +            fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
   1.556 +            exit(U_INVALID_TABLE_FORMAT);
   1.557 +        }
   1.558 +        if(states->countStates<3) {
   1.559 +            fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
   1.560 +            exit(U_INVALID_TABLE_FORMAT);
   1.561 +        }
   1.562 +        /* are the SI/SO all in the right places? */
   1.563 +        if( ignoreSISOCheck ||
   1.564 +           (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
   1.565 +            states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
   1.566 +            states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
   1.567 +            states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
   1.568 +        ) {
   1.569 +            states->outputType=MBCS_OUTPUT_2_SISO;
   1.570 +        } else {
   1.571 +            fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
   1.572 +            exit(U_INVALID_TABLE_FORMAT);
   1.573 +        }
   1.574 +        state=2;
   1.575 +    } else {
   1.576 +        state=1;
   1.577 +    }
   1.578 +
   1.579 +    /* check that no unexpected state is a "direct" one */
   1.580 +    while(state<states->countStates) {
   1.581 +        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.582 +            fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
   1.583 +            exit(U_INVALID_TABLE_FORMAT);
   1.584 +        }
   1.585 +        ++state;
   1.586 +    }
   1.587 +
   1.588 +    sumUpStates(states);
   1.589 +}
   1.590 +
   1.591 +/* find a fallback for this offset; return the index or -1 if not found */
   1.592 +U_CAPI int32_t U_EXPORT2
   1.593 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   1.594 +                 uint32_t offset) {
   1.595 +    int32_t i;
   1.596 +
   1.597 +    if(countToUFallbacks==0) {
   1.598 +        /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
   1.599 +        return -1;
   1.600 +    }
   1.601 +
   1.602 +    /* do a linear search for the fallback mapping (the table is not yet sorted) */
   1.603 +    for(i=0; i<countToUFallbacks; ++i) {
   1.604 +        if(offset==toUFallbacks[i].offset) {
   1.605 +            return i;
   1.606 +        }
   1.607 +    }
   1.608 +    return -1;
   1.609 +}
   1.610 +
   1.611 +/*
   1.612 + * This function tries to compact toUnicode tables for 2-byte codepages
   1.613 + * by finding lead bytes with all-unassigned trail bytes and adding another state
   1.614 + * for them.
   1.615 + */
   1.616 +static void
   1.617 +compactToUnicode2(UCMStates *states,
   1.618 +                  uint16_t **pUnicodeCodeUnits,
   1.619 +                  _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   1.620 +                  UBool verbose) {
   1.621 +    int32_t (*oldStateTable)[256];
   1.622 +    uint16_t count[256];
   1.623 +    uint16_t *oldUnicodeCodeUnits;
   1.624 +    int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
   1.625 +    int32_t i, j, leadState, trailState, newState, fallback;
   1.626 +    uint16_t unit;
   1.627 +
   1.628 +    /* find the lead state */
   1.629 +    if(states->outputType==MBCS_OUTPUT_2_SISO) {
   1.630 +        /* use the DBCS lead state for SI/SO codepages */
   1.631 +        leadState=1;
   1.632 +    } else {
   1.633 +        leadState=0;
   1.634 +    }
   1.635 +
   1.636 +    /* find the main trail state: the most used target state */
   1.637 +    uprv_memset(count, 0, sizeof(count));
   1.638 +    for(i=0; i<256; ++i) {
   1.639 +        entry=states->stateTable[leadState][i];
   1.640 +        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1.641 +            ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
   1.642 +        }
   1.643 +    }
   1.644 +    trailState=0;
   1.645 +    for(i=1; i<states->countStates; ++i) {
   1.646 +        if(count[i]>count[trailState]) {
   1.647 +            trailState=i;
   1.648 +        }
   1.649 +    }
   1.650 +
   1.651 +    /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
   1.652 +    uprv_memset(count, 0, sizeof(count));
   1.653 +    savings=0;
   1.654 +    /* for each lead byte */
   1.655 +    for(i=0; i<256; ++i) {
   1.656 +        entry=states->stateTable[leadState][i];
   1.657 +        if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
   1.658 +            /* the offset is different for each lead byte */
   1.659 +            offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   1.660 +            /* for each trail byte for this lead byte */
   1.661 +            for(j=0; j<256; ++j) {
   1.662 +                entry=states->stateTable[trailState][j];
   1.663 +                switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1.664 +                case MBCS_STATE_VALID_16:
   1.665 +                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.666 +                    if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
   1.667 +                        ++count[i];
   1.668 +                    } else {
   1.669 +                        j=999; /* do not count for this lead byte because there are assignments */
   1.670 +                    }
   1.671 +                    break;
   1.672 +                case MBCS_STATE_VALID_16_PAIR:
   1.673 +                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.674 +                    if((*pUnicodeCodeUnits)[entry]==0xfffe) {
   1.675 +                        count[i]+=2;
   1.676 +                    } else {
   1.677 +                        j=999; /* do not count for this lead byte because there are assignments */
   1.678 +                    }
   1.679 +                    break;
   1.680 +                default:
   1.681 +                    break;
   1.682 +                }
   1.683 +            }
   1.684 +            if(j==256) {
   1.685 +                /* all trail bytes for this lead byte are unassigned */
   1.686 +                savings+=count[i];
   1.687 +            } else {
   1.688 +                count[i]=0;
   1.689 +            }
   1.690 +        }
   1.691 +    }
   1.692 +    /* subtract from the possible savings the cost of an additional state */
   1.693 +    savings=savings*2-1024; /* count bytes, not 16-bit words */
   1.694 +    if(savings<=0) {
   1.695 +        return;
   1.696 +    }
   1.697 +    if(verbose) {
   1.698 +        printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
   1.699 +    }
   1.700 +    if(states->countStates>=MBCS_MAX_STATE_COUNT) {
   1.701 +        fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
   1.702 +        return;
   1.703 +    }
   1.704 +
   1.705 +    /* make a copy of the state table */
   1.706 +    oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
   1.707 +    if(oldStateTable==NULL) {
   1.708 +        fprintf(stderr, "cannot compact toUnicode: out of memory\n");
   1.709 +        return;
   1.710 +    }
   1.711 +    uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
   1.712 +
   1.713 +    /* add the new state */
   1.714 +    /*
   1.715 +     * this function does not catch the degenerate case where all lead bytes
   1.716 +     * have all-unassigned trail bytes and the lead state could be removed
   1.717 +     */
   1.718 +    newState=states->countStates++;
   1.719 +    states->stateFlags[newState]=0;
   1.720 +    /* copy the old trail state, turning all assigned states into unassigned ones */
   1.721 +    for(i=0; i<256; ++i) {
   1.722 +        entry=states->stateTable[trailState][i];
   1.723 +        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1.724 +        case MBCS_STATE_VALID_16:
   1.725 +        case MBCS_STATE_VALID_16_PAIR:
   1.726 +            states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
   1.727 +            break;
   1.728 +        default:
   1.729 +            states->stateTable[newState][i]=entry;
   1.730 +            break;
   1.731 +        }
   1.732 +    }
   1.733 +
   1.734 +    /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
   1.735 +    for(i=0; i<256; ++i) {
   1.736 +        if(count[i]>0) {
   1.737 +            states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
   1.738 +        }
   1.739 +    }
   1.740 +
   1.741 +    /* sum up the new state table */
   1.742 +    for(i=0; i<states->countStates; ++i) {
   1.743 +        states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
   1.744 +    }
   1.745 +    sum=sumUpStates(states);
   1.746 +
   1.747 +    /* allocate a new, smaller code units array */
   1.748 +    oldUnicodeCodeUnits=*pUnicodeCodeUnits;
   1.749 +    if(sum==0) {
   1.750 +        *pUnicodeCodeUnits=NULL;
   1.751 +        if(oldUnicodeCodeUnits!=NULL) {
   1.752 +            uprv_free(oldUnicodeCodeUnits);
   1.753 +        }
   1.754 +        uprv_free(oldStateTable);
   1.755 +        return;
   1.756 +    }
   1.757 +    *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
   1.758 +    if(*pUnicodeCodeUnits==NULL) {
   1.759 +        fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
   1.760 +            (long)sum);
   1.761 +        /* revert to the old state table */
   1.762 +        *pUnicodeCodeUnits=oldUnicodeCodeUnits;
   1.763 +        --states->countStates;
   1.764 +        uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
   1.765 +        uprv_free(oldStateTable);
   1.766 +        return;
   1.767 +    }
   1.768 +    for(i=0; i<sum; ++i) {
   1.769 +        (*pUnicodeCodeUnits)[i]=0xfffe;
   1.770 +    }
   1.771 +
   1.772 +    /* copy the code units for all assigned characters */
   1.773 +    /*
   1.774 +     * The old state table has the same lead _and_ trail states for assigned characters!
   1.775 +     * The differences are in the offsets, and in the trail states for some unassigned characters.
   1.776 +     * For each character with an assigned state in the new table, it was assigned in the old one.
   1.777 +     * Only still-assigned characters are copied.
   1.778 +     * Note that fallback mappings need to get their offset values adjusted.
   1.779 +     */
   1.780 +
   1.781 +    /* for each initial state */
   1.782 +    for(leadState=0; leadState<states->countStates; ++leadState) {
   1.783 +        if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.784 +            /* for each lead byte from there */
   1.785 +            for(i=0; i<256; ++i) {
   1.786 +                entry=states->stateTable[leadState][i];
   1.787 +                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1.788 +                    trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   1.789 +                    /* the new state does not have assigned states */
   1.790 +                    if(trailState!=newState) {
   1.791 +                        trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   1.792 +                        oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
   1.793 +                        /* for each trail byte */
   1.794 +                        for(j=0; j<256; ++j) {
   1.795 +                            entry=states->stateTable[trailState][j];
   1.796 +                            /* copy assigned-character code units and adjust fallback offsets */
   1.797 +                            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1.798 +                            case MBCS_STATE_VALID_16:
   1.799 +                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.800 +                                /* find the old offset according to the old state table */
   1.801 +                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
   1.802 +                                unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
   1.803 +                                if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
   1.804 +                                    toUFallbacks[fallback].offset=0x80000000|offset;
   1.805 +                                }
   1.806 +                                break;
   1.807 +                            case MBCS_STATE_VALID_16_PAIR:
   1.808 +                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.809 +                                /* find the old offset according to the old state table */
   1.810 +                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
   1.811 +                                (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
   1.812 +                                (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
   1.813 +                                break;
   1.814 +                            default:
   1.815 +                                break;
   1.816 +                            }
   1.817 +                        }
   1.818 +                    }
   1.819 +                }
   1.820 +            }
   1.821 +        }
   1.822 +    }
   1.823 +
   1.824 +    /* remove temporary flags from fallback offsets that protected them from being modified twice */
   1.825 +    for(i=0; i<countToUFallbacks; ++i) {
   1.826 +        toUFallbacks[i].offset&=0x7fffffff;
   1.827 +    }
   1.828 +
   1.829 +    /* free temporary memory */
   1.830 +    uprv_free(oldUnicodeCodeUnits);
   1.831 +    uprv_free(oldStateTable);
   1.832 +}
   1.833 +
   1.834 +/*
   1.835 + * recursive sub-function of compactToUnicodeHelper()
   1.836 + * returns:
   1.837 + * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
   1.838 + *    if all sequences from this state are unassigned, returns the
   1.839 + * <0 there are assignments in unicodeCodeUnits[]
   1.840 + * 0  no use of unicodeCodeUnits[]
   1.841 + */
   1.842 +static int32_t
   1.843 +findUnassigned(UCMStates *states,
   1.844 +               uint16_t *unicodeCodeUnits,
   1.845 +               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   1.846 +               int32_t state, int32_t offset, uint32_t b) {
   1.847 +    int32_t i, entry, savings, localSavings, belowSavings;
   1.848 +    UBool haveAssigned;
   1.849 +
   1.850 +    localSavings=belowSavings=0;
   1.851 +    haveAssigned=FALSE;
   1.852 +    for(i=0; i<256; ++i) {
   1.853 +        entry=states->stateTable[state][i];
   1.854 +        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1.855 +            savings=findUnassigned(states,
   1.856 +                        unicodeCodeUnits,
   1.857 +                        toUFallbacks, countToUFallbacks,
   1.858 +                        MBCS_ENTRY_TRANSITION_STATE(entry),
   1.859 +                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
   1.860 +                        (b<<8)|(uint32_t)i);
   1.861 +            if(savings<0) {
   1.862 +                haveAssigned=TRUE;
   1.863 +            } else if(savings>0) {
   1.864 +                printf("    all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
   1.865 +                    (unsigned long)((b<<8)|i), (long)state, (long)savings);
   1.866 +                belowSavings+=savings;
   1.867 +            }
   1.868 +        } else if(!haveAssigned) {
   1.869 +            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1.870 +            case MBCS_STATE_VALID_16:
   1.871 +                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.872 +                if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
   1.873 +                    localSavings+=2;
   1.874 +                } else {
   1.875 +                    haveAssigned=TRUE;
   1.876 +                }
   1.877 +                break;
   1.878 +            case MBCS_STATE_VALID_16_PAIR:
   1.879 +                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   1.880 +                if(unicodeCodeUnits[entry]==0xfffe) {
   1.881 +                    localSavings+=4;
   1.882 +                } else {
   1.883 +                    haveAssigned=TRUE;
   1.884 +                }
   1.885 +                break;
   1.886 +            default:
   1.887 +                break;
   1.888 +            }
   1.889 +        }
   1.890 +    }
   1.891 +    if(haveAssigned) {
   1.892 +        return -1;
   1.893 +    } else {
   1.894 +        return localSavings+belowSavings;
   1.895 +    }
   1.896 +}
   1.897 +
   1.898 +/* helper function for finding compaction opportunities */
   1.899 +static void
   1.900 +compactToUnicodeHelper(UCMStates *states,
   1.901 +                       uint16_t *unicodeCodeUnits,
   1.902 +                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
   1.903 +    int32_t state, savings;
   1.904 +
   1.905 +    /* for each initial state */
   1.906 +    for(state=0; state<states->countStates; ++state) {
   1.907 +        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   1.908 +            savings=findUnassigned(states,
   1.909 +                        unicodeCodeUnits,
   1.910 +                        toUFallbacks, countToUFallbacks,
   1.911 +                        state, 0, 0);
   1.912 +            if(savings>0) {
   1.913 +                printf("    all-unassigned sequences from initial state %ld use %ld bytes\n",
   1.914 +                    (long)state, (long)savings);
   1.915 +            }
   1.916 +        }
   1.917 +    }
   1.918 +}
   1.919 +
   1.920 +static int32_t
   1.921 +compareFallbacks(const void *context, const void *fb1, const void *fb2) {
   1.922 +    return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
   1.923 +}
   1.924 +
   1.925 +U_CAPI void U_EXPORT2
   1.926 +ucm_optimizeStates(UCMStates *states,
   1.927 +                   uint16_t **pUnicodeCodeUnits,
   1.928 +                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   1.929 +                   UBool verbose) {
   1.930 +    UErrorCode errorCode;
   1.931 +    int32_t state, cell, entry;
   1.932 +
   1.933 +    /* test each state table entry */
   1.934 +    for(state=0; state<states->countStates; ++state) {
   1.935 +        for(cell=0; cell<256; ++cell) {
   1.936 +            entry=states->stateTable[state][cell];
   1.937 +            /*
   1.938 +             * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
   1.939 +             * and the code point is "unassigned" (0xfffe), then change it to
   1.940 +             * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
   1.941 +             */
   1.942 +            if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
   1.943 +                states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
   1.944 +            }
   1.945 +        }
   1.946 +    }
   1.947 +
   1.948 +    /* try to compact the toUnicode tables */
   1.949 +    if(states->maxCharLength==2) {
   1.950 +        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
   1.951 +    } else if(states->maxCharLength>2) {
   1.952 +        if(verbose) {
   1.953 +            compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
   1.954 +        }
   1.955 +    }
   1.956 +
   1.957 +    /* sort toUFallbacks */
   1.958 +    /*
   1.959 +     * It should be safe to sort them before compactToUnicode2() is called,
   1.960 +     * because it should not change the relative order of the offset values
   1.961 +     * that it adjusts, but they need to be sorted at some point, and
   1.962 +     * it is safest here.
   1.963 +     */
   1.964 +    if(countToUFallbacks>0) {
   1.965 +        errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
   1.966 +        uprv_sortArray(toUFallbacks, countToUFallbacks,
   1.967 +                       sizeof(_MBCSToUFallback),
   1.968 +                       compareFallbacks, NULL, FALSE, &errorCode);
   1.969 +    }
   1.970 +}
   1.971 +
   1.972 +/* use a complete state table ----------------------------------------------- */
   1.973 +
   1.974 +U_CAPI int32_t U_EXPORT2
   1.975 +ucm_countChars(UCMStates *states,
   1.976 +               const uint8_t *bytes, int32_t length) {
   1.977 +    uint32_t offset;
   1.978 +    int32_t i, entry, count;
   1.979 +    uint8_t state;
   1.980 +
   1.981 +    offset=0;
   1.982 +    count=0;
   1.983 +    state=0;
   1.984 +
   1.985 +    if(states->countStates==0) {
   1.986 +        fprintf(stderr, "ucm error: there is no state information!\n");
   1.987 +        return -1;
   1.988 +    }
   1.989 +
   1.990 +    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
   1.991 +    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
   1.992 +        state=1;
   1.993 +    }
   1.994 +
   1.995 +    /*
   1.996 +     * Walk down the state table like in conversion,
   1.997 +     * much like getNextUChar().
   1.998 +     * We assume that c<=0x10ffff.
   1.999 +     */
  1.1000 +    for(i=0; i<length; ++i) {
  1.1001 +        entry=states->stateTable[state][bytes[i]];
  1.1002 +        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
  1.1003 +            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
  1.1004 +            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
  1.1005 +        } else {
  1.1006 +            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
  1.1007 +            case MBCS_STATE_ILLEGAL:
  1.1008 +                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
  1.1009 +                return -1;
  1.1010 +            case MBCS_STATE_CHANGE_ONLY:
  1.1011 +                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
  1.1012 +                return -1;
  1.1013 +            case MBCS_STATE_UNASSIGNED:
  1.1014 +            case MBCS_STATE_FALLBACK_DIRECT_16:
  1.1015 +            case MBCS_STATE_VALID_DIRECT_16:
  1.1016 +            case MBCS_STATE_FALLBACK_DIRECT_20:
  1.1017 +            case MBCS_STATE_VALID_DIRECT_20:
  1.1018 +            case MBCS_STATE_VALID_16:
  1.1019 +            case MBCS_STATE_VALID_16_PAIR:
  1.1020 +                /* count a complete character and prepare for a new one */
  1.1021 +                ++count;
  1.1022 +                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
  1.1023 +                offset=0;
  1.1024 +                break;
  1.1025 +            default:
  1.1026 +                /* reserved, must never occur */
  1.1027 +                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
  1.1028 +                return -1;
  1.1029 +            }
  1.1030 +        }
  1.1031 +    }
  1.1032 +
  1.1033 +    if(offset!=0) {
  1.1034 +        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
  1.1035 +        return -1;
  1.1036 +    }
  1.1037 +
  1.1038 +    /*
  1.1039 +     * for SI/SO (like EBCDIC-stateful), multiple-character results
  1.1040 +     * must consist of only double-byte sequences
  1.1041 +     */
  1.1042 +    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
  1.1043 +        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
  1.1044 +        return -1;
  1.1045 +    }
  1.1046 +
  1.1047 +    return count;
  1.1048 +}
  1.1049 +#endif
  1.1050 +

mercurial