intl/icu/source/tools/toolutil/ucmstate.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2003-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucmstate.c
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2003oct09
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This file handles ICU .ucm file state information as part of the ucm module.
    17 *   Most of this code used to be in makeconv.c.
    18 */
    20 #include "unicode/utypes.h"
    21 #include "cstring.h"
    22 #include "cmemory.h"
    23 #include "uarrsort.h"
    24 #include "ucnvmbcs.h"
    25 #include "ucnv_ext.h"
    26 #include "uparse.h"
    27 #include "ucm.h"
    28 #include <stdio.h>
    30 #if !UCONFIG_NO_CONVERSION
    32 /* MBCS state handling ------------------------------------------------------ */
    34 /*
    35  * state table row grammar (ebnf-style):
    36  * (whitespace is allowed between all tokens)
    37  *
    38  * row=[[firstentry ','] entry (',' entry)*]
    39  * firstentry="initial" | "surrogates"
    40  *            (initial state (default for state 0), output is all surrogate pairs)
    41  * entry=range [':' nextstate] ['.' action]
    42  * range=number ['-' number]
    43  * nextstate=number
    44  *           (0..7f)
    45  * action='u' | 's' | 'p' | 'i'
    46  *        (unassigned, state change only, surrogate pair, illegal)
    47  * number=(1- or 2-digit hexadecimal number)
    48  */
    49 static const char *
    50 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
    51     const char *t;
    52     uint32_t start, end, i;
    53     int32_t entry;
    55     /* initialize the state: all illegal with U+ffff */
    56     for(i=0; i<256; ++i) {
    57         state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
    58     }
    60     /* skip leading white space */
    61     s=u_skipWhitespace(s);
    63     /* is there an "initial" or "surrogates" directive? */
    64     if(uprv_strncmp("initial", s, 7)==0) {
    65         *pFlags=MBCS_STATE_FLAG_DIRECT;
    66         s=u_skipWhitespace(s+7);
    67         if(*s++!=',') {
    68             return s-1;
    69         }
    70     } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
    71         *pFlags=MBCS_STATE_FLAG_SURROGATES;
    72         s=u_skipWhitespace(s+10);
    73         if(*s++!=',') {
    74             return s-1;
    75         }
    76     } else if(*s==0) {
    77         /* empty state row: all-illegal */
    78         return NULL;
    79     }
    81     for(;;) {
    82         /* read an entry, the start of the range first */
    83         s=u_skipWhitespace(s);
    84         start=uprv_strtoul(s, (char **)&t, 16);
    85         if(s==t || 0xff<start) {
    86             return s;
    87         }
    88         s=u_skipWhitespace(t);
    90         /* read the end of the range if there is one */
    91         if(*s=='-') {
    92             s=u_skipWhitespace(s+1);
    93             end=uprv_strtoul(s, (char **)&t, 16);
    94             if(s==t || end<start || 0xff<end) {
    95                 return s;
    96             }
    97             s=u_skipWhitespace(t);
    98         } else {
    99             end=start;
   100         }
   102         /* determine the state entrys for this range */
   103         if(*s!=':' && *s!='.') {
   104             /* the default is: final state with valid entries */
   105             entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
   106         } else {
   107             entry=MBCS_ENTRY_TRANSITION(0, 0);
   108             if(*s==':') {
   109                 /* get the next state, default to 0 */
   110                 s=u_skipWhitespace(s+1);
   111                 i=uprv_strtoul(s, (char **)&t, 16);
   112                 if(s!=t) {
   113                     if(0x7f<i) {
   114                         return s;
   115                     }
   116                     s=u_skipWhitespace(t);
   117                     entry=MBCS_ENTRY_SET_STATE(entry, i);
   118                 }
   119             }
   121             /* get the state action, default to valid */
   122             if(*s=='.') {
   123                 /* this is a final state */
   124                 entry=MBCS_ENTRY_SET_FINAL(entry);
   126                 s=u_skipWhitespace(s+1);
   127                 if(*s=='u') {
   128                     /* unassigned set U+fffe */
   129                     entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
   130                     s=u_skipWhitespace(s+1);
   131                 } else if(*s=='p') {
   132                     if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
   133                         entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
   134                     } else {
   135                         entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
   136                     }
   137                     s=u_skipWhitespace(s+1);
   138                 } else if(*s=='s') {
   139                     entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
   140                     s=u_skipWhitespace(s+1);
   141                 } else if(*s=='i') {
   142                     /* illegal set U+ffff */
   143                     entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
   144                     s=u_skipWhitespace(s+1);
   145                 } else {
   146                     /* default to valid */
   147                     entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
   148                 }
   149             } else {
   150                 /* this is an intermediate state, nothing to do */
   151             }
   152         }
   154         /* adjust "final valid" states according to the state flags */
   155         if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
   156             switch(*pFlags) {
   157             case 0:
   158                 /* no adjustment */
   159                 break;
   160             case MBCS_STATE_FLAG_DIRECT:
   161                 /* set the valid-direct code point to "unassigned"==0xfffe */
   162                 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
   163                 break;
   164             case MBCS_STATE_FLAG_SURROGATES:
   165                 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
   166                 break;
   167             default:
   168                 break;
   169             }
   170         }
   172         /* set this entry for the range */
   173         for(i=start; i<=end; ++i) {
   174             state[i]=entry;
   175         }
   177         if(*s==',') {
   178             ++s;
   179         } else {
   180             return *s==0 ? NULL : s;
   181         }
   182     }
   183 }
   185 U_CAPI void U_EXPORT2
   186 ucm_addState(UCMStates *states, const char *s) {
   187     const char *error;
   189     if(states->countStates==MBCS_MAX_STATE_COUNT) {
   190         fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
   191         exit(U_INVALID_TABLE_FORMAT);
   192     }
   194     error=parseState(s, states->stateTable[states->countStates],
   195                        &states->stateFlags[states->countStates]);
   196     if(error!=NULL) {
   197         fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
   198         exit(U_INVALID_TABLE_FORMAT);
   199     }
   201     ++states->countStates;
   202 }
   204 U_CAPI UBool U_EXPORT2
   205 ucm_parseHeaderLine(UCMFile *ucm,
   206                     char *line, char **pKey, char **pValue) {
   207     UCMStates *states;
   208     char *s, *end;
   209     char c;
   211     states=&ucm->states;
   213     /* remove comments and trailing CR and LF and remove whitespace from the end */
   214     for(end=line; (c=*end)!=0; ++end) {
   215         if(c=='#' || c=='\r' || c=='\n') {
   216             break;
   217         }
   218     }
   219     while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
   220         --end;
   221     }
   222     *end=0;
   224     /* skip leading white space and ignore empty lines */
   225     s=(char *)u_skipWhitespace(line);
   226     if(*s==0) {
   227         return TRUE;
   228     }
   230     /* stop at the beginning of the mapping section */
   231     if(uprv_memcmp(s, "CHARMAP", 7)==0) {
   232         return FALSE;
   233     }
   235     /* get the key name, bracketed in <> */
   236     if(*s!='<') {
   237         fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
   238         exit(U_INVALID_TABLE_FORMAT);
   239     }
   240     *pKey=++s;
   241     while(*s!='>') {
   242         if(*s==0) {
   243             fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
   244             exit(U_INVALID_TABLE_FORMAT);
   245         }
   246         ++s;
   247     }
   248     *s=0;
   250     /* get the value string, possibly quoted */
   251     s=(char *)u_skipWhitespace(s+1);
   252     if(*s!='"') {
   253         *pValue=s;
   254     } else {
   255         /* remove the quotes */
   256         *pValue=s+1;
   257         if(end>*pValue && *(end-1)=='"') {
   258             *--end=0;
   259         }
   260     }
   262     /* collect the information from the header field, ignore unknown keys */
   263     if(uprv_strcmp(*pKey, "uconv_class")==0) {
   264         if(uprv_strcmp(*pValue, "DBCS")==0) {
   265             states->conversionType=UCNV_DBCS;
   266         } else if(uprv_strcmp(*pValue, "SBCS")==0) {
   267             states->conversionType = UCNV_SBCS;
   268         } else if(uprv_strcmp(*pValue, "MBCS")==0) {
   269             states->conversionType = UCNV_MBCS;
   270         } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
   271             states->conversionType = UCNV_EBCDIC_STATEFUL;
   272         } else {
   273             fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
   274             exit(U_INVALID_TABLE_FORMAT);
   275         }
   276         return TRUE;
   277     } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
   278         c=**pValue;
   279         if('1'<=c && c<='4' && (*pValue)[1]==0) {
   280             states->maxCharLength=(int8_t)(c-'0');
   281             states->outputType=(int8_t)(states->maxCharLength-1);
   282         } else {
   283             fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
   284             exit(U_INVALID_TABLE_FORMAT);
   285         }
   286         return TRUE;
   287     } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
   288         c=**pValue;
   289         if('1'<=c && c<='4' && (*pValue)[1]==0) {
   290             states->minCharLength=(int8_t)(c-'0');
   291         } else {
   292             fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
   293             exit(U_INVALID_TABLE_FORMAT);
   294         }
   295         return TRUE;
   296     } else if(uprv_strcmp(*pKey, "icu:state")==0) {
   297         /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
   298         switch(states->conversionType) {
   299         case UCNV_SBCS:
   300         case UCNV_DBCS:
   301         case UCNV_EBCDIC_STATEFUL:
   302             states->conversionType=UCNV_MBCS;
   303             break;
   304         case UCNV_MBCS:
   305             break;
   306         default:
   307             fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
   308             exit(U_INVALID_TABLE_FORMAT);
   309         }
   311         if(states->maxCharLength==0) {
   312             fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
   313             exit(U_INVALID_TABLE_FORMAT);
   314         }
   315         ucm_addState(states, *pValue);
   316         return TRUE;
   317     } else if(uprv_strcmp(*pKey, "icu:base")==0) {
   318         if(**pValue==0) {
   319             fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
   320             exit(U_INVALID_TABLE_FORMAT);
   321         }
   322         uprv_strcpy(ucm->baseName, *pValue);
   323         return TRUE;
   324     }
   326     return FALSE;
   327 }
   329 /* post-processing ---------------------------------------------------------- */
   331 static int32_t
   332 sumUpStates(UCMStates *states) {
   333     int32_t entry, sum, state, cell, count;
   334     UBool allStatesReady;
   336     /*
   337      * Sum up the offsets for all states.
   338      * In each final state (where there are only final entries),
   339      * the offsets add up directly.
   340      * In all other state table rows, for each transition entry to another state,
   341      * the offsets sum of that state needs to be added.
   342      * This is achieved in at most countStates iterations.
   343      */
   344     allStatesReady=FALSE;
   345     for(count=states->countStates; !allStatesReady && count>=0; --count) {
   346         allStatesReady=TRUE;
   347         for(state=states->countStates-1; state>=0; --state) {
   348             if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
   349                 allStatesReady=FALSE;
   350                 sum=0;
   352                 /* at first, add up only the final delta offsets to keep them <512 */
   353                 for(cell=0; cell<256; ++cell) {
   354                     entry=states->stateTable[state][cell];
   355                     if(MBCS_ENTRY_IS_FINAL(entry)) {
   356                         switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   357                         case MBCS_STATE_VALID_16:
   358                             states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
   359                             sum+=1;
   360                             break;
   361                         case MBCS_STATE_VALID_16_PAIR:
   362                             states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
   363                             sum+=2;
   364                             break;
   365                         default:
   366                             /* no addition */
   367                             break;
   368                         }
   369                     }
   370                 }
   372                 /* now, add up the delta offsets for the transitional entries */
   373                 for(cell=0; cell<256; ++cell) {
   374                     entry=states->stateTable[state][cell];
   375                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   376                         if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
   377                             states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
   378                             sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
   379                         } else {
   380                             /* that next state does not have a sum yet, we cannot finish the one for this state */
   381                             sum=-1;
   382                             break;
   383                         }
   384                     }
   385                 }
   387                 if(sum!=-1) {
   388                     states->stateOffsetSum[state]=sum;
   389                     states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
   390                 }
   391             }
   392         }
   393     }
   395     if(!allStatesReady) {
   396         fprintf(stderr, "ucm error: the state table contains loops\n");
   397         exit(U_INVALID_TABLE_FORMAT);
   398     }
   400     /*
   401      * For all "direct" (i.e., initial) states>0,
   402      * the offsets need to be increased by the sum of
   403      * the previous initial states.
   404      */
   405     sum=states->stateOffsetSum[0];
   406     for(state=1; state<states->countStates; ++state) {
   407         if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   408             int32_t sum2=sum;
   409             sum+=states->stateOffsetSum[state];
   410             for(cell=0; cell<256; ++cell) {
   411                 entry=states->stateTable[state][cell];
   412                 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   413                     states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
   414                 }
   415             }
   416         }
   417     }
   419     /* round up to the next even number to have the following data 32-bit-aligned */
   420     return states->countToUCodeUnits=(sum+1)&~1;
   421 }
   423 U_CAPI void U_EXPORT2
   424 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
   425     int32_t entry, state, cell, count;
   427     if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
   428         fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
   429         exit(U_INVALID_TABLE_FORMAT);
   430     }
   432     if(states->countStates==0) {
   433         switch(states->conversionType) {
   434         case UCNV_SBCS:
   435             /* SBCS: use MBCS data structure with a default state table */
   436             if(states->maxCharLength!=1) {
   437                 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
   438                 exit(U_INVALID_TABLE_FORMAT);
   439             }
   440             states->conversionType=UCNV_MBCS;
   441             ucm_addState(states, "0-ff");
   442             break;
   443         case UCNV_MBCS:
   444             fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
   445             exit(U_INVALID_TABLE_FORMAT);
   446             break;
   447         case UCNV_EBCDIC_STATEFUL:
   448             /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
   449             if(states->minCharLength!=1 || states->maxCharLength!=2) {
   450                 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
   451                 exit(U_INVALID_TABLE_FORMAT);
   452             }
   453             states->conversionType=UCNV_MBCS;
   454             ucm_addState(states, "0-ff, e:1.s, f:0.s");
   455             ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
   456             ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
   457             ucm_addState(states, "0-ff:1.i, 40:1.");
   458             ucm_addState(states, "0-ff:1.i");
   459             break;
   460         case UCNV_DBCS:
   461             /* DBCS: use MBCS data structure with a default state table */
   462             if(states->minCharLength!=2 || states->maxCharLength!=2) {
   463                 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
   464                 exit(U_INVALID_TABLE_FORMAT);
   465             }
   466             states->conversionType = UCNV_MBCS;
   467             ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
   468             ucm_addState(states, "41-fe");
   469             ucm_addState(states, "40");
   470             ucm_addState(states, "");
   471             break;
   472         default:
   473             fprintf(stderr, "ucm error: unknown charset structure\n");
   474             exit(U_INVALID_TABLE_FORMAT);
   475             break;
   476         }
   477     }
   479     /*
   480      * check that the min/max character lengths are reasonable;
   481      * to do this right, all paths through the state table would have to be
   482      * recursively walked while keeping track of the sequence lengths,
   483      * but these simple checks cover most state tables in practice
   484      */
   485     if(states->maxCharLength<states->minCharLength) {
   486         fprintf(stderr, "ucm error: max B/char < min B/char\n");
   487         exit(U_INVALID_TABLE_FORMAT);
   488     }
   490     /* count non-direct states and compare with max B/char */
   491     count=0;
   492     for(state=0; state<states->countStates; ++state) {
   493         if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
   494             ++count;
   495         }
   496     }
   497     if(states->maxCharLength>count+1) {
   498         fprintf(stderr, "ucm error: max B/char too large\n");
   499         exit(U_INVALID_TABLE_FORMAT);
   500     }
   502     if(states->minCharLength==1) {
   503         int32_t action;
   505         /*
   506          * if there are single-byte characters,
   507          * then the initial state must have direct result states
   508          */
   509         for(cell=0; cell<256; ++cell) {
   510             entry=states->stateTable[0][cell];
   511             if( MBCS_ENTRY_IS_FINAL(entry) &&
   512                 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
   513                  action==MBCS_STATE_UNASSIGNED)
   514             ) {
   515                 break;
   516             }
   517         }
   519         if(cell==256) {
   520             fprintf(stderr, "ucm warning: min B/char too small\n");
   521         }
   522     }
   524     /*
   525      * make sure that all "next state" values are within limits
   526      * and that all next states after final ones have the "direct"
   527      * flag of initial states
   528      */
   529     for(state=states->countStates-1; state>=0; --state) {
   530         for(cell=0; cell<256; ++cell) {
   531             entry=states->stateTable[state][cell];
   532             if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
   533                 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
   534                     (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   535                 exit(U_INVALID_TABLE_FORMAT);
   536             }
   537             if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
   538                 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
   539                     (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   540                 exit(U_INVALID_TABLE_FORMAT);
   541             } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   542                 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
   543                     (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
   544                 exit(U_INVALID_TABLE_FORMAT);
   545             }
   546         }
   547     }
   549     /* is this an SI/SO (like EBCDIC-stateful) state table? */
   550     if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   551         if(states->maxCharLength!=2) {
   552             fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
   553             exit(U_INVALID_TABLE_FORMAT);
   554         }
   555         if(states->countStates<3) {
   556             fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
   557             exit(U_INVALID_TABLE_FORMAT);
   558         }
   559         /* are the SI/SO all in the right places? */
   560         if( ignoreSISOCheck ||
   561            (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
   562             states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
   563             states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
   564             states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
   565         ) {
   566             states->outputType=MBCS_OUTPUT_2_SISO;
   567         } else {
   568             fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
   569             exit(U_INVALID_TABLE_FORMAT);
   570         }
   571         state=2;
   572     } else {
   573         state=1;
   574     }
   576     /* check that no unexpected state is a "direct" one */
   577     while(state<states->countStates) {
   578         if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   579             fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
   580             exit(U_INVALID_TABLE_FORMAT);
   581         }
   582         ++state;
   583     }
   585     sumUpStates(states);
   586 }
   588 /* find a fallback for this offset; return the index or -1 if not found */
   589 U_CAPI int32_t U_EXPORT2
   590 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   591                  uint32_t offset) {
   592     int32_t i;
   594     if(countToUFallbacks==0) {
   595         /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
   596         return -1;
   597     }
   599     /* do a linear search for the fallback mapping (the table is not yet sorted) */
   600     for(i=0; i<countToUFallbacks; ++i) {
   601         if(offset==toUFallbacks[i].offset) {
   602             return i;
   603         }
   604     }
   605     return -1;
   606 }
   608 /*
   609  * This function tries to compact toUnicode tables for 2-byte codepages
   610  * by finding lead bytes with all-unassigned trail bytes and adding another state
   611  * for them.
   612  */
   613 static void
   614 compactToUnicode2(UCMStates *states,
   615                   uint16_t **pUnicodeCodeUnits,
   616                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   617                   UBool verbose) {
   618     int32_t (*oldStateTable)[256];
   619     uint16_t count[256];
   620     uint16_t *oldUnicodeCodeUnits;
   621     int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
   622     int32_t i, j, leadState, trailState, newState, fallback;
   623     uint16_t unit;
   625     /* find the lead state */
   626     if(states->outputType==MBCS_OUTPUT_2_SISO) {
   627         /* use the DBCS lead state for SI/SO codepages */
   628         leadState=1;
   629     } else {
   630         leadState=0;
   631     }
   633     /* find the main trail state: the most used target state */
   634     uprv_memset(count, 0, sizeof(count));
   635     for(i=0; i<256; ++i) {
   636         entry=states->stateTable[leadState][i];
   637         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   638             ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
   639         }
   640     }
   641     trailState=0;
   642     for(i=1; i<states->countStates; ++i) {
   643         if(count[i]>count[trailState]) {
   644             trailState=i;
   645         }
   646     }
   648     /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
   649     uprv_memset(count, 0, sizeof(count));
   650     savings=0;
   651     /* for each lead byte */
   652     for(i=0; i<256; ++i) {
   653         entry=states->stateTable[leadState][i];
   654         if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
   655             /* the offset is different for each lead byte */
   656             offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   657             /* for each trail byte for this lead byte */
   658             for(j=0; j<256; ++j) {
   659                 entry=states->stateTable[trailState][j];
   660                 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   661                 case MBCS_STATE_VALID_16:
   662                     entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   663                     if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
   664                         ++count[i];
   665                     } else {
   666                         j=999; /* do not count for this lead byte because there are assignments */
   667                     }
   668                     break;
   669                 case MBCS_STATE_VALID_16_PAIR:
   670                     entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   671                     if((*pUnicodeCodeUnits)[entry]==0xfffe) {
   672                         count[i]+=2;
   673                     } else {
   674                         j=999; /* do not count for this lead byte because there are assignments */
   675                     }
   676                     break;
   677                 default:
   678                     break;
   679                 }
   680             }
   681             if(j==256) {
   682                 /* all trail bytes for this lead byte are unassigned */
   683                 savings+=count[i];
   684             } else {
   685                 count[i]=0;
   686             }
   687         }
   688     }
   689     /* subtract from the possible savings the cost of an additional state */
   690     savings=savings*2-1024; /* count bytes, not 16-bit words */
   691     if(savings<=0) {
   692         return;
   693     }
   694     if(verbose) {
   695         printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
   696     }
   697     if(states->countStates>=MBCS_MAX_STATE_COUNT) {
   698         fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
   699         return;
   700     }
   702     /* make a copy of the state table */
   703     oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
   704     if(oldStateTable==NULL) {
   705         fprintf(stderr, "cannot compact toUnicode: out of memory\n");
   706         return;
   707     }
   708     uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
   710     /* add the new state */
   711     /*
   712      * this function does not catch the degenerate case where all lead bytes
   713      * have all-unassigned trail bytes and the lead state could be removed
   714      */
   715     newState=states->countStates++;
   716     states->stateFlags[newState]=0;
   717     /* copy the old trail state, turning all assigned states into unassigned ones */
   718     for(i=0; i<256; ++i) {
   719         entry=states->stateTable[trailState][i];
   720         switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   721         case MBCS_STATE_VALID_16:
   722         case MBCS_STATE_VALID_16_PAIR:
   723             states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
   724             break;
   725         default:
   726             states->stateTable[newState][i]=entry;
   727             break;
   728         }
   729     }
   731     /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
   732     for(i=0; i<256; ++i) {
   733         if(count[i]>0) {
   734             states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
   735         }
   736     }
   738     /* sum up the new state table */
   739     for(i=0; i<states->countStates; ++i) {
   740         states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
   741     }
   742     sum=sumUpStates(states);
   744     /* allocate a new, smaller code units array */
   745     oldUnicodeCodeUnits=*pUnicodeCodeUnits;
   746     if(sum==0) {
   747         *pUnicodeCodeUnits=NULL;
   748         if(oldUnicodeCodeUnits!=NULL) {
   749             uprv_free(oldUnicodeCodeUnits);
   750         }
   751         uprv_free(oldStateTable);
   752         return;
   753     }
   754     *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
   755     if(*pUnicodeCodeUnits==NULL) {
   756         fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
   757             (long)sum);
   758         /* revert to the old state table */
   759         *pUnicodeCodeUnits=oldUnicodeCodeUnits;
   760         --states->countStates;
   761         uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
   762         uprv_free(oldStateTable);
   763         return;
   764     }
   765     for(i=0; i<sum; ++i) {
   766         (*pUnicodeCodeUnits)[i]=0xfffe;
   767     }
   769     /* copy the code units for all assigned characters */
   770     /*
   771      * The old state table has the same lead _and_ trail states for assigned characters!
   772      * The differences are in the offsets, and in the trail states for some unassigned characters.
   773      * For each character with an assigned state in the new table, it was assigned in the old one.
   774      * Only still-assigned characters are copied.
   775      * Note that fallback mappings need to get their offset values adjusted.
   776      */
   778     /* for each initial state */
   779     for(leadState=0; leadState<states->countStates; ++leadState) {
   780         if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   781             /* for each lead byte from there */
   782             for(i=0; i<256; ++i) {
   783                 entry=states->stateTable[leadState][i];
   784                 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   785                     trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   786                     /* the new state does not have assigned states */
   787                     if(trailState!=newState) {
   788                         trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   789                         oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
   790                         /* for each trail byte */
   791                         for(j=0; j<256; ++j) {
   792                             entry=states->stateTable[trailState][j];
   793                             /* copy assigned-character code units and adjust fallback offsets */
   794                             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   795                             case MBCS_STATE_VALID_16:
   796                                 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   797                                 /* find the old offset according to the old state table */
   798                                 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
   799                                 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
   800                                 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
   801                                     toUFallbacks[fallback].offset=0x80000000|offset;
   802                                 }
   803                                 break;
   804                             case MBCS_STATE_VALID_16_PAIR:
   805                                 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   806                                 /* find the old offset according to the old state table */
   807                                 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
   808                                 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
   809                                 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
   810                                 break;
   811                             default:
   812                                 break;
   813                             }
   814                         }
   815                     }
   816                 }
   817             }
   818         }
   819     }
   821     /* remove temporary flags from fallback offsets that protected them from being modified twice */
   822     for(i=0; i<countToUFallbacks; ++i) {
   823         toUFallbacks[i].offset&=0x7fffffff;
   824     }
   826     /* free temporary memory */
   827     uprv_free(oldUnicodeCodeUnits);
   828     uprv_free(oldStateTable);
   829 }
   831 /*
   832  * recursive sub-function of compactToUnicodeHelper()
   833  * returns:
   834  * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
   835  *    if all sequences from this state are unassigned, returns the
   836  * <0 there are assignments in unicodeCodeUnits[]
   837  * 0  no use of unicodeCodeUnits[]
   838  */
   839 static int32_t
   840 findUnassigned(UCMStates *states,
   841                uint16_t *unicodeCodeUnits,
   842                _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   843                int32_t state, int32_t offset, uint32_t b) {
   844     int32_t i, entry, savings, localSavings, belowSavings;
   845     UBool haveAssigned;
   847     localSavings=belowSavings=0;
   848     haveAssigned=FALSE;
   849     for(i=0; i<256; ++i) {
   850         entry=states->stateTable[state][i];
   851         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   852             savings=findUnassigned(states,
   853                         unicodeCodeUnits,
   854                         toUFallbacks, countToUFallbacks,
   855                         MBCS_ENTRY_TRANSITION_STATE(entry),
   856                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
   857                         (b<<8)|(uint32_t)i);
   858             if(savings<0) {
   859                 haveAssigned=TRUE;
   860             } else if(savings>0) {
   861                 printf("    all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
   862                     (unsigned long)((b<<8)|i), (long)state, (long)savings);
   863                 belowSavings+=savings;
   864             }
   865         } else if(!haveAssigned) {
   866             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   867             case MBCS_STATE_VALID_16:
   868                 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   869                 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
   870                     localSavings+=2;
   871                 } else {
   872                     haveAssigned=TRUE;
   873                 }
   874                 break;
   875             case MBCS_STATE_VALID_16_PAIR:
   876                 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
   877                 if(unicodeCodeUnits[entry]==0xfffe) {
   878                     localSavings+=4;
   879                 } else {
   880                     haveAssigned=TRUE;
   881                 }
   882                 break;
   883             default:
   884                 break;
   885             }
   886         }
   887     }
   888     if(haveAssigned) {
   889         return -1;
   890     } else {
   891         return localSavings+belowSavings;
   892     }
   893 }
   895 /* helper function for finding compaction opportunities */
   896 static void
   897 compactToUnicodeHelper(UCMStates *states,
   898                        uint16_t *unicodeCodeUnits,
   899                        _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
   900     int32_t state, savings;
   902     /* for each initial state */
   903     for(state=0; state<states->countStates; ++state) {
   904         if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
   905             savings=findUnassigned(states,
   906                         unicodeCodeUnits,
   907                         toUFallbacks, countToUFallbacks,
   908                         state, 0, 0);
   909             if(savings>0) {
   910                 printf("    all-unassigned sequences from initial state %ld use %ld bytes\n",
   911                     (long)state, (long)savings);
   912             }
   913         }
   914     }
   915 }
   917 static int32_t
   918 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
   919     return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
   920 }
   922 U_CAPI void U_EXPORT2
   923 ucm_optimizeStates(UCMStates *states,
   924                    uint16_t **pUnicodeCodeUnits,
   925                    _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
   926                    UBool verbose) {
   927     UErrorCode errorCode;
   928     int32_t state, cell, entry;
   930     /* test each state table entry */
   931     for(state=0; state<states->countStates; ++state) {
   932         for(cell=0; cell<256; ++cell) {
   933             entry=states->stateTable[state][cell];
   934             /*
   935              * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
   936              * and the code point is "unassigned" (0xfffe), then change it to
   937              * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
   938              */
   939             if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
   940                 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
   941             }
   942         }
   943     }
   945     /* try to compact the toUnicode tables */
   946     if(states->maxCharLength==2) {
   947         compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
   948     } else if(states->maxCharLength>2) {
   949         if(verbose) {
   950             compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
   951         }
   952     }
   954     /* sort toUFallbacks */
   955     /*
   956      * It should be safe to sort them before compactToUnicode2() is called,
   957      * because it should not change the relative order of the offset values
   958      * that it adjusts, but they need to be sorted at some point, and
   959      * it is safest here.
   960      */
   961     if(countToUFallbacks>0) {
   962         errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
   963         uprv_sortArray(toUFallbacks, countToUFallbacks,
   964                        sizeof(_MBCSToUFallback),
   965                        compareFallbacks, NULL, FALSE, &errorCode);
   966     }
   967 }
   969 /* use a complete state table ----------------------------------------------- */
   971 U_CAPI int32_t U_EXPORT2
   972 ucm_countChars(UCMStates *states,
   973                const uint8_t *bytes, int32_t length) {
   974     uint32_t offset;
   975     int32_t i, entry, count;
   976     uint8_t state;
   978     offset=0;
   979     count=0;
   980     state=0;
   982     if(states->countStates==0) {
   983         fprintf(stderr, "ucm error: there is no state information!\n");
   984         return -1;
   985     }
   987     /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
   988     if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
   989         state=1;
   990     }
   992     /*
   993      * Walk down the state table like in conversion,
   994      * much like getNextUChar().
   995      * We assume that c<=0x10ffff.
   996      */
   997     for(i=0; i<length; ++i) {
   998         entry=states->stateTable[state][bytes[i]];
   999         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
  1000             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
  1001             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
  1002         } else {
  1003             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
  1004             case MBCS_STATE_ILLEGAL:
  1005                 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
  1006                 return -1;
  1007             case MBCS_STATE_CHANGE_ONLY:
  1008                 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
  1009                 return -1;
  1010             case MBCS_STATE_UNASSIGNED:
  1011             case MBCS_STATE_FALLBACK_DIRECT_16:
  1012             case MBCS_STATE_VALID_DIRECT_16:
  1013             case MBCS_STATE_FALLBACK_DIRECT_20:
  1014             case MBCS_STATE_VALID_DIRECT_20:
  1015             case MBCS_STATE_VALID_16:
  1016             case MBCS_STATE_VALID_16_PAIR:
  1017                 /* count a complete character and prepare for a new one */
  1018                 ++count;
  1019                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
  1020                 offset=0;
  1021                 break;
  1022             default:
  1023                 /* reserved, must never occur */
  1024                 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
  1025                 return -1;
  1030     if(offset!=0) {
  1031         fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
  1032         return -1;
  1035     /*
  1036      * for SI/SO (like EBCDIC-stateful), multiple-character results
  1037      * must consist of only double-byte sequences
  1038      */
  1039     if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
  1040         fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
  1041         return -1;
  1044     return count;
  1046 #endif

mercurial