intl/icu/source/tools/toolutil/ucm.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2003-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucm.c
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2003jun20
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This file reads a .ucm file, stores its mappings and sorts them.
    17 *   It implements handling of Unicode conversion mappings from .ucm files
    18 *   for makeconv, canonucm, rptp2ucm, etc.
    19 *
    20 *   Unicode code point sequences with a length of more than 1,
    21 *   as well as byte sequences with more than 4 bytes or more than one complete
    22 *   character sequence are handled to support m:n mappings.
    23 */
    25 #include "unicode/utypes.h"
    26 #include "unicode/ustring.h"
    27 #include "cstring.h"
    28 #include "cmemory.h"
    29 #include "filestrm.h"
    30 #include "uarrsort.h"
    31 #include "ucnvmbcs.h"
    32 #include "ucnv_bld.h"
    33 #include "ucnv_ext.h"
    34 #include "uparse.h"
    35 #include "ucm.h"
    36 #include <stdio.h>
    38 #if !UCONFIG_NO_CONVERSION
    40 /* -------------------------------------------------------------------------- */
    42 static void
    43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
    44     int32_t j;
    46     for(j=0; j<m->uLen; ++j) {
    47         fprintf(f, "<U%04lX>", (long)codePoints[j]);
    48     }
    50     fputc(' ', f);
    52     for(j=0; j<m->bLen; ++j) {
    53         fprintf(f, "\\x%02X", bytes[j]);
    54     }
    56     if(m->f>=0) {
    57         fprintf(f, " |%u\n", m->f);
    58     } else {
    59         fputs("\n", f);
    60     }
    61 }
    63 U_CAPI void U_EXPORT2
    64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    65     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
    66 }
    68 U_CAPI void U_EXPORT2
    69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
    70     UCMapping *m;
    71     int32_t i, length;
    73     m=table->mappings;
    74     length=table->mappingsLength;
    75     if(byUnicode) {
    76         for(i=0; i<length; ++m, ++i) {
    77             ucm_printMapping(table, m, f);
    78         }
    79     } else {
    80         const int32_t *map=table->reverseMap;
    81         for(i=0; i<length; ++i) {
    82             ucm_printMapping(table, m+map[i], f);
    83         }
    84     }
    85 }
    87 /* mapping comparisons ------------------------------------------------------ */
    89 static int32_t
    90 compareUnicode(UCMTable *lTable, const UCMapping *l,
    91                UCMTable *rTable, const UCMapping *r) {
    92     const UChar32 *lu, *ru;
    93     int32_t result, i, length;
    95     if(l->uLen==1 && r->uLen==1) {
    96         /* compare two single code points */
    97         return l->u-r->u;
    98     }
   100     /* get pointers to the code point sequences */
   101     lu=UCM_GET_CODE_POINTS(lTable, l);
   102     ru=UCM_GET_CODE_POINTS(rTable, r);
   104     /* get the minimum length */
   105     if(l->uLen<=r->uLen) {
   106         length=l->uLen;
   107     } else {
   108         length=r->uLen;
   109     }
   111     /* compare the code points */
   112     for(i=0; i<length; ++i) {
   113         result=lu[i]-ru[i];
   114         if(result!=0) {
   115             return result;
   116         }
   117     }
   119     /* compare the lengths */
   120     return l->uLen-r->uLen;
   121 }
   123 static int32_t
   124 compareBytes(UCMTable *lTable, const UCMapping *l,
   125              UCMTable *rTable, const UCMapping *r,
   126              UBool lexical) {
   127     const uint8_t *lb, *rb;
   128     int32_t result, i, length;
   130     /*
   131      * A lexical comparison is used for sorting in the builder, to allow
   132      * an efficient search for a byte sequence that could be a prefix
   133      * of a previously entered byte sequence.
   134      *
   135      * Comparing by lengths first is for compatibility with old .ucm tools
   136      * like canonucm and rptp2ucm.
   137      */
   138     if(lexical) {
   139         /* get the minimum length and continue */
   140         if(l->bLen<=r->bLen) {
   141             length=l->bLen;
   142         } else {
   143             length=r->bLen;
   144         }
   145     } else {
   146         /* compare lengths first */
   147         result=l->bLen-r->bLen;
   148         if(result!=0) {
   149             return result;
   150         } else {
   151             length=l->bLen;
   152         }
   153     }
   155     /* get pointers to the byte sequences */
   156     lb=UCM_GET_BYTES(lTable, l);
   157     rb=UCM_GET_BYTES(rTable, r);
   159     /* compare the bytes */
   160     for(i=0; i<length; ++i) {
   161         result=lb[i]-rb[i];
   162         if(result!=0) {
   163             return result;
   164         }
   165     }
   167     /* compare the lengths */
   168     return l->bLen-r->bLen;
   169 }
   171 /* compare UCMappings for sorting */
   172 static int32_t
   173 compareMappings(UCMTable *lTable, const UCMapping *l,
   174                 UCMTable *rTable, const UCMapping *r,
   175                 UBool uFirst) {
   176     int32_t result;
   178     /* choose which side to compare first */
   179     if(uFirst) {
   180         /* Unicode then bytes */
   181         result=compareUnicode(lTable, l, rTable, r);
   182         if(result==0) {
   183             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
   184         }
   185     } else {
   186         /* bytes then Unicode */
   187         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
   188         if(result==0) {
   189             result=compareUnicode(lTable, l, rTable, r);
   190         }
   191     }
   193     if(result!=0) {
   194         return result;
   195     }
   197     /* compare the flags */
   198     return l->f-r->f;
   199 }
   201 /* sorting by Unicode first sorts mappings directly */
   202 static int32_t
   203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
   204     return compareMappings(
   205         (UCMTable *)context, (const UCMapping *)left,
   206         (UCMTable *)context, (const UCMapping *)right, TRUE);
   207 }
   209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
   210 static int32_t
   211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
   212     UCMTable *table=(UCMTable *)context;
   213     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
   214     return compareMappings(
   215         table, table->mappings+l,
   216         table, table->mappings+r, FALSE);
   217 }
   219 U_CAPI void U_EXPORT2
   220 ucm_sortTable(UCMTable *t) {
   221     UErrorCode errorCode;
   222     int32_t i;
   224     if(t->isSorted) {
   225         return;
   226     }
   228     errorCode=U_ZERO_ERROR;
   230     /* 1. sort by Unicode first */
   231     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
   232                    compareMappingsUnicodeFirst, t,
   233                    FALSE, &errorCode);
   235     /* build the reverseMap */
   236     if(t->reverseMap==NULL) {
   237         /*
   238          * allocate mappingsCapacity instead of mappingsLength so that
   239          * if mappings are added, the reverseMap need not be
   240          * reallocated each time
   241          * (see ucm_moveMappings() and ucm_addMapping())
   242          */
   243         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
   244         if(t->reverseMap==NULL) {
   245             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
   246             exit(U_MEMORY_ALLOCATION_ERROR);
   247         }
   248     }
   249     for(i=0; i<t->mappingsLength; ++i) {
   250         t->reverseMap[i]=i;
   251     }
   253     /* 2. sort reverseMap by mappings bytes first */
   254     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
   255                    compareMappingsBytesFirst, t,
   256                    FALSE, &errorCode);
   258     if(U_FAILURE(errorCode)) {
   259         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
   260                 u_errorName(errorCode));
   261         exit(errorCode);
   262     }
   264     t->isSorted=TRUE;
   265 }
   267 /*
   268  * remove mappings with their move flag set from the base table
   269  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
   270  */
   271 U_CAPI void U_EXPORT2
   272 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
   273     UCMapping *mb, *mbLimit;
   274     int8_t flag;
   276     mb=base->mappings;
   277     mbLimit=mb+base->mappingsLength;
   279     while(mb<mbLimit) {
   280         flag=mb->moveFlag;
   281         if(flag!=0) {
   282             /* reset the move flag */
   283             mb->moveFlag=0;
   285             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
   286                 /* add the mapping to the extension table */
   287                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
   288             }
   290             /* remove this mapping: move the last base mapping down and overwrite the current one */
   291             if(mb<(mbLimit-1)) {
   292                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
   293             }
   294             --mbLimit;
   295             --base->mappingsLength;
   296             base->isSorted=FALSE;
   297         } else {
   298             ++mb;
   299         }
   300     }
   301 }
   303 enum {
   304     NEEDS_MOVE=1,
   305     HAS_ERRORS=2
   306 };
   308 static uint8_t
   309 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
   310                     UBool moveToExt, UBool intersectBase) {
   311     UCMapping *mb, *me, *mbLimit, *meLimit;
   312     int32_t cmp;
   313     uint8_t result;
   315     mb=base->mappings;
   316     mbLimit=mb+base->mappingsLength;
   318     me=ext->mappings;
   319     meLimit=me+ext->mappingsLength;
   321     result=0;
   323     for(;;) {
   324         /* skip irrelevant mappings on both sides */
   325         for(;;) {
   326             if(mb==mbLimit) {
   327                 return result;
   328             }
   330             if((0<=mb->f && mb->f<=2) || mb->f==4) {
   331                 break;
   332             }
   334             ++mb;
   335         }
   337         for(;;) {
   338             if(me==meLimit) {
   339                 return result;
   340             }
   342             if((0<=me->f && me->f<=2) || me->f==4) {
   343                 break;
   344             }
   346             ++me;
   347         }
   349         /* compare the base and extension mappings */
   350         cmp=compareUnicode(base, mb, ext, me);
   351         if(cmp<0) {
   352             if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
   353                 /*
   354                  * mapping in base but not in ext, move it
   355                  *
   356                  * if ext is DBCS, move DBCS mappings here
   357                  * and check SBCS ones for Unicode prefix below
   358                  */
   359                 mb->moveFlag|=UCM_MOVE_TO_EXT;
   360                 result|=NEEDS_MOVE;
   362             /* does mb map from an input sequence that is a prefix of me's? */
   363             } else if( mb->uLen<me->uLen &&
   364                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
   365             ) {
   366                 if(moveToExt) {
   367                     /* mark this mapping to be moved to the extension table */
   368                     mb->moveFlag|=UCM_MOVE_TO_EXT;
   369                     result|=NEEDS_MOVE;
   370                 } else {
   371                     fprintf(stderr,
   372                             "ucm error: the base table contains a mapping whose input sequence\n"
   373                             "           is a prefix of the input sequence of an extension mapping\n");
   374                     ucm_printMapping(base, mb, stderr);
   375                     ucm_printMapping(ext, me, stderr);
   376                     result|=HAS_ERRORS;
   377                 }
   378             }
   380             ++mb;
   381         } else if(cmp==0) {
   382             /*
   383              * same output: remove the extension mapping,
   384              * otherwise treat as an error
   385              */
   386             if( mb->f==me->f && mb->bLen==me->bLen &&
   387                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
   388             ) {
   389                 me->moveFlag|=UCM_REMOVE_MAPPING;
   390                 result|=NEEDS_MOVE;
   391             } else if(intersectBase) {
   392                 /* mapping in base but not in ext, move it */
   393                 mb->moveFlag|=UCM_MOVE_TO_EXT;
   394                 result|=NEEDS_MOVE;
   395             } else {
   396                 fprintf(stderr,
   397                         "ucm error: the base table contains a mapping whose input sequence\n"
   398                         "           is the same as the input sequence of an extension mapping\n"
   399                         "           but it maps differently\n");
   400                 ucm_printMapping(base, mb, stderr);
   401                 ucm_printMapping(ext, me, stderr);
   402                 result|=HAS_ERRORS;
   403             }
   405             ++mb;
   406         } else /* cmp>0 */ {
   407             ++me;
   408         }
   409     }
   410 }
   412 static uint8_t
   413 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
   414                   UBool moveToExt, UBool intersectBase) {
   415     UCMapping *mb, *me;
   416     int32_t *baseMap, *extMap;
   417     int32_t b, e, bLimit, eLimit, cmp;
   418     uint8_t result;
   419     UBool isSISO;
   421     baseMap=base->reverseMap;
   422     extMap=ext->reverseMap;
   424     b=e=0;
   425     bLimit=base->mappingsLength;
   426     eLimit=ext->mappingsLength;
   428     result=0;
   430     isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
   432     for(;;) {
   433         /* skip irrelevant mappings on both sides */
   434         for(;; ++b) {
   435             if(b==bLimit) {
   436                 return result;
   437             }
   438             mb=base->mappings+baseMap[b];
   440             if(intersectBase==2 && mb->bLen==1) {
   441                 /*
   442                  * comparing a base against a DBCS extension:
   443                  * leave SBCS base mappings alone
   444                  */
   445                 continue;
   446             }
   448             if(mb->f==0 || mb->f==3) {
   449                 break;
   450             }
   451         }
   453         for(;;) {
   454             if(e==eLimit) {
   455                 return result;
   456             }
   457             me=ext->mappings+extMap[e];
   459             if(me->f==0 || me->f==3) {
   460                 break;
   461             }
   463             ++e;
   464         }
   466         /* compare the base and extension mappings */
   467         cmp=compareBytes(base, mb, ext, me, TRUE);
   468         if(cmp<0) {
   469             if(intersectBase) {
   470                 /* mapping in base but not in ext, move it */
   471                 mb->moveFlag|=UCM_MOVE_TO_EXT;
   472                 result|=NEEDS_MOVE;
   474             /*
   475              * does mb map from an input sequence that is a prefix of me's?
   476              * for SI/SO tables, a single byte is never a prefix because it
   477              * occurs in a separate single-byte state
   478              */
   479             } else if( mb->bLen<me->bLen &&
   480                 (!isSISO || mb->bLen>1) &&
   481                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
   482             ) {
   483                 if(moveToExt) {
   484                     /* mark this mapping to be moved to the extension table */
   485                     mb->moveFlag|=UCM_MOVE_TO_EXT;
   486                     result|=NEEDS_MOVE;
   487                 } else {
   488                     fprintf(stderr,
   489                             "ucm error: the base table contains a mapping whose input sequence\n"
   490                             "           is a prefix of the input sequence of an extension mapping\n");
   491                     ucm_printMapping(base, mb, stderr);
   492                     ucm_printMapping(ext, me, stderr);
   493                     result|=HAS_ERRORS;
   494                 }
   495             }
   497             ++b;
   498         } else if(cmp==0) {
   499             /*
   500              * same output: remove the extension mapping,
   501              * otherwise treat as an error
   502              */
   503             if( mb->f==me->f && mb->uLen==me->uLen &&
   504                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
   505             ) {
   506                 me->moveFlag|=UCM_REMOVE_MAPPING;
   507                 result|=NEEDS_MOVE;
   508             } else if(intersectBase) {
   509                 /* mapping in base but not in ext, move it */
   510                 mb->moveFlag|=UCM_MOVE_TO_EXT;
   511                 result|=NEEDS_MOVE;
   512             } else {
   513                 fprintf(stderr,
   514                         "ucm error: the base table contains a mapping whose input sequence\n"
   515                         "           is the same as the input sequence of an extension mapping\n"
   516                         "           but it maps differently\n");
   517                 ucm_printMapping(base, mb, stderr);
   518                 ucm_printMapping(ext, me, stderr);
   519                 result|=HAS_ERRORS;
   520             }
   522             ++b;
   523         } else /* cmp>0 */ {
   524             ++e;
   525         }
   526     }
   527 }
   529 U_CAPI UBool U_EXPORT2
   530 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
   531     UCMapping *m, *mLimit;
   532     int32_t count;
   533     UBool isOK;
   535     m=table->mappings;
   536     mLimit=m+table->mappingsLength;
   537     isOK=TRUE;
   539     while(m<mLimit) {
   540         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
   541         if(count<1) {
   542             ucm_printMapping(table, m, stderr);
   543             isOK=FALSE;
   544         }
   545         ++m;
   546     }
   548     return isOK;
   549 }
   551 U_CAPI UBool U_EXPORT2
   552 ucm_checkBaseExt(UCMStates *baseStates,
   553                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
   554                  UBool intersectBase) {
   555     uint8_t result;
   557     /* if we have an extension table, we must always use precision flags */
   558     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
   559         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
   560         return FALSE;
   561     }
   562     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
   563         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
   564         return FALSE;
   565     }
   567     /* checking requires both tables to be sorted */
   568     ucm_sortTable(base);
   569     ucm_sortTable(ext);
   571     /* check */
   572     result=
   573         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
   574         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
   576     if(result&HAS_ERRORS) {
   577         return FALSE;
   578     }
   580     if(result&NEEDS_MOVE) {
   581         ucm_moveMappings(ext, NULL);
   582         ucm_moveMappings(base, moveTarget);
   583         ucm_sortTable(base);
   584         ucm_sortTable(ext);
   585         if(moveTarget!=NULL) {
   586             ucm_sortTable(moveTarget);
   587         }
   588     }
   590     return TRUE;
   591 }
   593 /* merge tables for rptp2ucm ------------------------------------------------ */
   595 U_CAPI void U_EXPORT2
   596 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
   597                 const uint8_t *subchar, int32_t subcharLength,
   598                 uint8_t subchar1) {
   599     UCMapping *fromUMapping, *toUMapping;
   600     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
   602     ucm_sortTable(fromUTable);
   603     ucm_sortTable(toUTable);
   605     fromUMapping=fromUTable->mappings;
   606     toUMapping=toUTable->mappings;
   608     fromUTop=fromUTable->mappingsLength;
   609     toUTop=toUTable->mappingsLength;
   611     fromUIndex=toUIndex=0;
   613     while(fromUIndex<fromUTop && toUIndex<toUTop) {
   614         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
   615         if(cmp==0) {
   616             /* equal: roundtrip, nothing to do (flags are initially 0) */
   617             ++fromUMapping;
   618             ++toUMapping;
   620             ++fromUIndex;
   621             ++toUIndex;
   622         } else if(cmp<0) {
   623             /*
   624              * the fromU mapping does not have a toU counterpart:
   625              * fallback Unicode->codepage
   626              */
   627             if( (fromUMapping->bLen==subcharLength &&
   628                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
   629                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
   630             ) {
   631                 fromUMapping->f=2; /* SUB mapping */
   632             } else {
   633                 fromUMapping->f=1; /* normal fallback */
   634             }
   636             ++fromUMapping;
   637             ++fromUIndex;
   638         } else {
   639             /*
   640              * the toU mapping does not have a fromU counterpart:
   641              * (reverse) fallback codepage->Unicode, copy it to the fromU table
   642              */
   644             /* ignore reverse fallbacks to Unicode SUB */
   645             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
   646                 toUMapping->f=3; /* reverse fallback */
   647                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
   649                 /* the table may have been reallocated */
   650                 fromUMapping=fromUTable->mappings+fromUIndex;
   651             }
   653             ++toUMapping;
   654             ++toUIndex;
   655         }
   656     }
   658     /* either one or both tables are exhausted */
   659     while(fromUIndex<fromUTop) {
   660         /* leftover fromU mappings are fallbacks */
   661         if( (fromUMapping->bLen==subcharLength &&
   662              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
   663             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
   664         ) {
   665             fromUMapping->f=2; /* SUB mapping */
   666         } else {
   667             fromUMapping->f=1; /* normal fallback */
   668         }
   670         ++fromUMapping;
   671         ++fromUIndex;
   672     }
   674     while(toUIndex<toUTop) {
   675         /* leftover toU mappings are reverse fallbacks */
   677         /* ignore reverse fallbacks to Unicode SUB */
   678         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
   679             toUMapping->f=3; /* reverse fallback */
   680             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
   681         }
   683         ++toUMapping;
   684         ++toUIndex;
   685     }
   687     fromUTable->isSorted=FALSE;
   688 }
   690 /* separate extension mappings out of base table for rptp2ucm --------------- */
   692 U_CAPI UBool U_EXPORT2
   693 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
   694     UCMTable *table;
   695     UCMapping *m, *mLimit;
   696     int32_t type;
   697     UBool needsMove, isOK;
   699     table=ucm->base;
   700     m=table->mappings;
   701     mLimit=m+table->mappingsLength;
   703     needsMove=FALSE;
   704     isOK=TRUE;
   706     for(; m<mLimit; ++m) {
   707         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
   708             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
   709             ucm_printMapping(table, m, stderr);
   710             m->moveFlag|=UCM_REMOVE_MAPPING;
   711             needsMove=TRUE;
   712             continue;
   713         }
   715         type=ucm_mappingType(
   716                 &ucm->states, m,
   717                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
   718         if(type<0) {
   719             /* illegal byte sequence */
   720             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
   721             isOK=FALSE;
   722         } else if(type>0) {
   723             m->moveFlag|=UCM_MOVE_TO_EXT;
   724             needsMove=TRUE;
   725         }
   726     }
   728     if(!isOK) {
   729         return FALSE;
   730     }
   731     if(needsMove) {
   732         ucm_moveMappings(ucm->base, ucm->ext);
   733         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
   734     } else {
   735         ucm_sortTable(ucm->base);
   736         return TRUE;
   737     }
   738 }
   740 /* ucm parser --------------------------------------------------------------- */
   742 U_CAPI int8_t U_EXPORT2
   743 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
   744     const char *s=*ps;
   745     char *end;
   746     uint8_t byte;
   747     int8_t bLen;
   749     bLen=0;
   750     for(;;) {
   751         /* skip an optional plus sign */
   752         if(bLen>0 && *s=='+') {
   753             ++s;
   754         }
   755         if(*s!='\\') {
   756             break;
   757         }
   759         if( s[1]!='x' ||
   760             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
   761         ) {
   762             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
   763             return -1;
   764         }
   766         if(bLen==UCNV_EXT_MAX_BYTES) {
   767             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
   768             return -1;
   769         }
   770         bytes[bLen++]=byte;
   771         s=end;
   772     }
   774     *ps=s;
   775     return bLen;
   776 }
   778 /* parse a mapping line; must not be empty */
   779 U_CAPI UBool U_EXPORT2
   780 ucm_parseMappingLine(UCMapping *m,
   781                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   782                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
   783                      const char *line) {
   784     const char *s;
   785     char *end;
   786     UChar32 cp;
   787     int32_t u16Length;
   788     int8_t uLen, bLen, f;
   790     s=line;
   791     uLen=bLen=0;
   793     /* parse code points */
   794     for(;;) {
   795         /* skip an optional plus sign */
   796         if(uLen>0 && *s=='+') {
   797             ++s;
   798         }
   799         if(*s!='<') {
   800             break;
   801         }
   803         if( s[1]!='U' ||
   804             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
   805             *end!='>'
   806         ) {
   807             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
   808             return FALSE;
   809         }
   810         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
   811             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
   812             return FALSE;
   813         }
   815         if(uLen==UCNV_EXT_MAX_UCHARS) {
   816             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
   817             return FALSE;
   818         }
   819         codePoints[uLen++]=cp;
   820         s=end+1;
   821     }
   823     if(uLen==0) {
   824         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
   825         return FALSE;
   826     } else if(uLen==1) {
   827         m->u=codePoints[0];
   828     } else {
   829         UErrorCode errorCode=U_ZERO_ERROR;
   830         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
   831         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
   832             u16Length>UCNV_EXT_MAX_UCHARS
   833         ) {
   834             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
   835             return FALSE;
   836         }
   837     }
   839     s=u_skipWhitespace(s);
   841     /* parse bytes */
   842     bLen=ucm_parseBytes(bytes, line, &s);
   844     if(bLen<0) {
   845         return FALSE;
   846     } else if(bLen==0) {
   847         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
   848         return FALSE;
   849     } else if(bLen<=4) {
   850         uprv_memcpy(m->b.bytes, bytes, bLen);
   851     }
   853     /* skip everything until the fallback indicator, even the start of a comment */
   854     for(;;) {
   855         if(*s==0) {
   856             f=-1; /* no fallback indicator */
   857             break;
   858         } else if(*s=='|') {
   859             f=(int8_t)(s[1]-'0');
   860             if((uint8_t)f>4) {
   861                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
   862                 return FALSE;
   863             }
   864             break;
   865         }
   866         ++s;
   867     }
   869     m->uLen=uLen;
   870     m->bLen=bLen;
   871     m->f=f;
   872     return TRUE;
   873 }
   875 /* general APIs ------------------------------------------------------------- */
   877 U_CAPI UCMTable * U_EXPORT2
   878 ucm_openTable() {
   879     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
   880     if(table==NULL) {
   881         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
   882         exit(U_MEMORY_ALLOCATION_ERROR);
   883     }
   885     memset(table, 0, sizeof(UCMTable));
   886     return table;
   887 }
   889 U_CAPI void U_EXPORT2
   890 ucm_closeTable(UCMTable *table) {
   891     if(table!=NULL) {
   892         uprv_free(table->mappings);
   893         uprv_free(table->codePoints);
   894         uprv_free(table->bytes);
   895         uprv_free(table->reverseMap);
   896         uprv_free(table);
   897     }
   898 }
   900 U_CAPI void U_EXPORT2
   901 ucm_resetTable(UCMTable *table) {
   902     if(table!=NULL) {
   903         table->mappingsLength=0;
   904         table->flagsType=0;
   905         table->unicodeMask=0;
   906         table->bytesLength=table->codePointsLength=0;
   907         table->isSorted=FALSE;
   908     }
   909 }
   911 U_CAPI void U_EXPORT2
   912 ucm_addMapping(UCMTable *table,
   913                UCMapping *m,
   914                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   915                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   916     UCMapping *tm;
   917     UChar32 c;
   918     int32_t idx;
   920     if(table->mappingsLength>=table->mappingsCapacity) {
   921         /* make the mappings array larger */
   922         if(table->mappingsCapacity==0) {
   923             table->mappingsCapacity=1000;
   924         } else {
   925             table->mappingsCapacity*=10;
   926         }
   927         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
   928                                              table->mappingsCapacity*sizeof(UCMapping));
   929         if(table->mappings==NULL) {
   930             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
   931                             (int)table->mappingsCapacity);
   932             exit(U_MEMORY_ALLOCATION_ERROR);
   933         }
   935         if(table->reverseMap!=NULL) {
   936             /* the reverseMap must be reallocated in a new sort */
   937             uprv_free(table->reverseMap);
   938             table->reverseMap=NULL;
   939         }
   940     }
   942     if(m->uLen>1 && table->codePointsCapacity==0) {
   943         table->codePointsCapacity=10000;
   944         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
   945         if(table->codePoints==NULL) {
   946             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
   947                             (int)table->codePointsCapacity);
   948             exit(U_MEMORY_ALLOCATION_ERROR);
   949         }
   950     }
   952     if(m->bLen>4 && table->bytesCapacity==0) {
   953         table->bytesCapacity=10000;
   954         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
   955         if(table->bytes==NULL) {
   956             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
   957                             (int)table->bytesCapacity);
   958             exit(U_MEMORY_ALLOCATION_ERROR);
   959         }
   960     }
   962     if(m->uLen>1) {
   963         idx=table->codePointsLength;
   964         table->codePointsLength+=m->uLen;
   965         if(table->codePointsLength>table->codePointsCapacity) {
   966             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
   967             exit(U_MEMORY_ALLOCATION_ERROR);
   968         }
   970         uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
   971         m->u=idx;
   972     }
   974     if(m->bLen>4) {
   975         idx=table->bytesLength;
   976         table->bytesLength+=m->bLen;
   977         if(table->bytesLength>table->bytesCapacity) {
   978             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
   979             exit(U_MEMORY_ALLOCATION_ERROR);
   980         }
   982         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
   983         m->b.idx=idx;
   984     }
   986     /* set unicodeMask */
   987     for(idx=0; idx<m->uLen; ++idx) {
   988         c=codePoints[idx];
   989         if(c>=0x10000) {
   990             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
   991         } else if(U_IS_SURROGATE(c)) {
   992             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
   993         }
   994     }
   996     /* set flagsType */
   997     if(m->f<0) {
   998         table->flagsType|=UCM_FLAGS_IMPLICIT;
   999     } else {
  1000         table->flagsType|=UCM_FLAGS_EXPLICIT;
  1003     tm=table->mappings+table->mappingsLength++;
  1004     uprv_memcpy(tm, m, sizeof(UCMapping));
  1006     table->isSorted=FALSE;
  1009 U_CAPI UCMFile * U_EXPORT2
  1010 ucm_open() {
  1011     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
  1012     if(ucm==NULL) {
  1013         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
  1014         exit(U_MEMORY_ALLOCATION_ERROR);
  1017     memset(ucm, 0, sizeof(UCMFile));
  1019     ucm->base=ucm_openTable();
  1020     ucm->ext=ucm_openTable();
  1022     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
  1023     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
  1024     ucm->states.outputType=-1;
  1025     ucm->states.minCharLength=ucm->states.maxCharLength=1;
  1027     return ucm;
  1030 U_CAPI void U_EXPORT2
  1031 ucm_close(UCMFile *ucm) {
  1032     if(ucm!=NULL) {
  1033         ucm_closeTable(ucm->base);
  1034         ucm_closeTable(ucm->ext);
  1035         uprv_free(ucm);
  1039 U_CAPI int32_t U_EXPORT2
  1040 ucm_mappingType(UCMStates *baseStates,
  1041                 UCMapping *m,
  1042                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
  1043                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
  1044     /* check validity of the bytes and count the characters in them */
  1045     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
  1046     if(count<1) {
  1047         /* illegal byte sequence */
  1048         return -1;
  1051     /*
  1052      * Suitable for an ICU conversion base table means:
  1053      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
  1054      * - precision flag 0..3
  1055      * - SBCS: any 1:1 mapping
  1056      *         (the table stores additional bits to distinguish mapping types)
  1057      * - MBCS: not a |2 SUB mapping for <subchar1>
  1058      * - MBCS: not a |1 fallback to 0x00
  1059      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
  1061      * Further restrictions for fromUnicode tables
  1062      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
  1064      * All of the MBCS fromUnicode specific tests could be removed from here,
  1065      * but the ones above are for unusual mappings, and removing the tests
  1066      * from here would change canonucm output which seems gratuitous.
  1067      * (Markus Scherer 2006-nov-28)
  1069      * Exception: All implicit mappings (f<0) that need to be moved
  1070      * because of fromUnicode restrictions _must_ be moved here because
  1071      * makeconv uses a hack for moving mappings only for the fromUnicode table
  1072      * that only works with non-negative values of f.
  1073      */
  1074     if( m->uLen==1 && count==1 && m->f<=3 &&
  1075         (baseStates->maxCharLength==1 ||
  1076             !((m->f==2 && m->bLen==1) ||
  1077               (m->f==1 && bytes[0]==0) ||
  1078               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
  1079     ) {
  1080         return 0; /* suitable for a base table */
  1081     } else {
  1082         return 1; /* needs to go into an extension table */
  1086 U_CAPI UBool U_EXPORT2
  1087 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
  1088                    UCMapping *m,
  1089                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
  1090                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
  1091     int32_t type;
  1093     if(m->f==2 && m->uLen>1) {
  1094         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
  1095         printMapping(m, codePoints, bytes, stderr);
  1096         return FALSE;
  1099     if(baseStates!=NULL) {
  1100         /* check validity of the bytes and count the characters in them */
  1101         type=ucm_mappingType(baseStates, m, codePoints, bytes);
  1102         if(type<0) {
  1103             /* illegal byte sequence */
  1104             printMapping(m, codePoints, bytes, stderr);
  1105             return FALSE;
  1107     } else {
  1108         /* not used - adding a mapping for an extension-only table before its base table is read */
  1109         type=1;
  1112     /*
  1113      * Add the mapping to the base table if this is requested and suitable.
  1114      * Otherwise, add it to the extension table.
  1115      */
  1116     if(forBase && type==0) {
  1117         ucm_addMapping(ucm->base, m, codePoints, bytes);
  1118     } else {
  1119         ucm_addMapping(ucm->ext, m, codePoints, bytes);
  1122     return TRUE;
  1125 U_CAPI UBool U_EXPORT2
  1126 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
  1127     UCMapping m={ 0 };
  1128     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
  1129     uint8_t bytes[UCNV_EXT_MAX_BYTES];
  1131     const char *s;
  1133     /* ignore empty and comment lines */
  1134     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
  1135         return TRUE;
  1138     return
  1139         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
  1140         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
  1143 U_CAPI void U_EXPORT2
  1144 ucm_readTable(UCMFile *ucm, FileStream* convFile,
  1145               UBool forBase, UCMStates *baseStates,
  1146               UErrorCode *pErrorCode) {
  1147     char line[500];
  1148     char *end;
  1149     UBool isOK;
  1151     if(U_FAILURE(*pErrorCode)) {
  1152         return;
  1155     isOK=TRUE;
  1157     for(;;) {
  1158         /* read the next line */
  1159         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
  1160             fprintf(stderr, "incomplete charmap section\n");
  1161             isOK=FALSE;
  1162             break;
  1165         /* remove CR LF */
  1166         end=uprv_strchr(line, 0);
  1167         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
  1168             --end;
  1170         *end=0;
  1172         /* ignore empty and comment lines */
  1173         if(line[0]==0 || line[0]=='#') {
  1174             continue;
  1177         /* stop at the end of the mapping table */
  1178         if(0==uprv_strcmp(line, "END CHARMAP")) {
  1179             break;
  1182         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
  1185     if(!isOK) {
  1186         *pErrorCode=U_INVALID_TABLE_FORMAT;
  1189 #endif

mercurial