intl/icu/source/common/utrie2_builder.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2001-2011, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  utrie2_builder.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2008sep26 (split off from utrie2.c)
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This is a common implementation of a Unicode trie.
    17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
    18 *   Unicode code points (0..0x10ffff).
    19 *   This is the second common version of a Unicode trie (hence the name UTrie2).
    20 *   See utrie2.h for a comparison.
    21 *
    22 *   This file contains only the builder code.
    23 *   See utrie2.c for the runtime and enumeration code.
    24 */
    25 #ifdef UTRIE2_DEBUG
    26 #   include <stdio.h>
    27 #endif
    29 #include "unicode/utypes.h"
    30 #include "cmemory.h"
    31 #include "utrie2.h"
    32 #include "utrie2_impl.h"
    34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
    36 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    38 /* Implementation notes ----------------------------------------------------- */
    40 /*
    41  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
    42  * have been chosen to minimize trie sizes overall.
    43  * Most of the code is flexible enough to work with a range of values,
    44  * within certain limits.
    45  *
    46  * Exception: Support for separate values for lead surrogate code _units_
    47  * vs. code _points_ was added after the constants were fixed,
    48  * and has not been tested nor particularly designed for different constant values.
    49  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
    50  * part and back.)
    51  *
    52  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
    53  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
    54  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
    55  * remapping) stops working.
    56  *
    57  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
    58  * assumes that a single index-2 block is used for 0x400 code points
    59  * corresponding to one lead surrogate.
    60  *
    61  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
    62  * more than one Unicode plane, and the split of the index-2 table into a BMP
    63  * part and a supplementary part, with a gap in between, would not work.
    64  *
    65  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
    66  * there is data with more than 64k distinct values,
    67  * for example for Unihan collation with a separate collation weight per
    68  * Han character.
    69  */
    71 /* Building a trie ----------------------------------------------------------*/
    73 enum {
    74     /** The null index-2 block, following the gap in the index-2 table. */
    75     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
    77     /** The start of allocated index-2 blocks. */
    78     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
    80     /**
    81      * The null data block.
    82      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
    83      * to work with 6-bit trail bytes from 2-byte UTF-8.
    84      */
    85     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
    87     /** The start of allocated data blocks. */
    88     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
    90     /**
    91      * The start of data blocks for U+0800 and above.
    92      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
    93      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
    94      * Data values for 0x780 code points beyond ASCII.
    95      */
    96     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
    97 };
    99 /* Start with allocation of 16k data entries. */
   100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
   102 /* Grow about 8x each time. */
   103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
   105 static int32_t
   106 allocIndex2Block(UNewTrie2 *trie);
   108 U_CAPI UTrie2 * U_EXPORT2
   109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
   110     UTrie2 *trie;
   111     UNewTrie2 *newTrie;
   112     uint32_t *data;
   113     int32_t i, j;
   115     if(U_FAILURE(*pErrorCode)) {
   116         return NULL;
   117     }
   119     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
   120     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
   121     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
   122     if(trie==NULL || newTrie==NULL || data==NULL) {
   123         uprv_free(trie);
   124         uprv_free(newTrie);
   125         uprv_free(data);
   126         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   127         return 0;
   128     }
   130     uprv_memset(trie, 0, sizeof(UTrie2));
   131     trie->initialValue=initialValue;
   132     trie->errorValue=errorValue;
   133     trie->highStart=0x110000;
   134     trie->newTrie=newTrie;
   136     newTrie->data=data;
   137     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
   138     newTrie->initialValue=initialValue;
   139     newTrie->errorValue=errorValue;
   140     newTrie->highStart=0x110000;
   141     newTrie->firstFreeBlock=0;  /* no free block in the list */
   142     newTrie->isCompacted=FALSE;
   144     /*
   145      * preallocate and reset
   146      * - ASCII
   147      * - the bad-UTF-8-data block
   148      * - the null data block
   149      */
   150     for(i=0; i<0x80; ++i) {
   151         newTrie->data[i]=initialValue;
   152     }
   153     for(; i<0xc0; ++i) {
   154         newTrie->data[i]=errorValue;
   155     }
   156     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
   157         newTrie->data[i]=initialValue;
   158     }
   159     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
   160     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
   162     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
   163     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
   164         newTrie->index2[i]=j;
   165         newTrie->map[i]=1;
   166     }
   167     /* reference counts for the bad-UTF-8-data block */
   168     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
   169         newTrie->map[i]=0;
   170     }
   171     /*
   172      * Reference counts for the null data block: all blocks except for the ASCII blocks.
   173      * Plus 1 so that we don't drop this block during compaction.
   174      * Plus as many as needed for lead surrogate code points.
   175      */
   176     /* i==newTrie->dataNullOffset */
   177     newTrie->map[i++]=
   178         (0x110000>>UTRIE2_SHIFT_2)-
   179         (0x80>>UTRIE2_SHIFT_2)+
   180         1+
   181         UTRIE2_LSCP_INDEX_2_LENGTH;
   182     j+=UTRIE2_DATA_BLOCK_LENGTH;
   183     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
   184         newTrie->map[i]=0;
   185     }
   187     /*
   188      * set the remaining indexes in the BMP index-2 block
   189      * to the null data block
   190      */
   191     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
   192         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
   193     }
   195     /*
   196      * Fill the index gap with impossible values so that compaction
   197      * does not overlap other index-2 blocks with the gap.
   198      */
   199     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
   200         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
   201     }
   203     /* set the indexes in the null index-2 block */
   204     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
   205         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
   206     }
   207     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
   208     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
   210     /* set the index-1 indexes for the linear index-2 block */
   211     for(i=0, j=0;
   212         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
   213         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
   214     ) {
   215         newTrie->index1[i]=j;
   216     }
   218     /* set the remaining index-1 indexes to the null index-2 block */
   219     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
   220         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
   221     }
   223     /*
   224      * Preallocate and reset data for U+0080..U+07ff,
   225      * for 2-byte UTF-8 which will be compacted in 64-blocks
   226      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
   227      */
   228     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
   229         utrie2_set32(trie, i, initialValue, pErrorCode);
   230     }
   232     return trie;
   233 }
   235 static UNewTrie2 *
   236 cloneBuilder(const UNewTrie2 *other) {
   237     UNewTrie2 *trie;
   239     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
   240     if(trie==NULL) {
   241         return NULL;
   242     }
   244     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
   245     if(trie->data==NULL) {
   246         uprv_free(trie);
   247         return NULL;
   248     }
   249     trie->dataCapacity=other->dataCapacity;
   251     /* clone data */
   252     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
   253     uprv_memcpy(trie->index2, other->index2, other->index2Length*4);
   254     trie->index2NullOffset=other->index2NullOffset;
   255     trie->index2Length=other->index2Length;
   257     uprv_memcpy(trie->data, other->data, other->dataLength*4);
   258     trie->dataNullOffset=other->dataNullOffset;
   259     trie->dataLength=other->dataLength;
   261     /* reference counters */
   262     if(other->isCompacted) {
   263         trie->firstFreeBlock=0;
   264     } else {
   265         uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4);
   266         trie->firstFreeBlock=other->firstFreeBlock;
   267     }
   269     trie->initialValue=other->initialValue;
   270     trie->errorValue=other->errorValue;
   271     trie->highStart=other->highStart;
   272     trie->isCompacted=other->isCompacted;
   274     return trie;
   275 }
   277 U_CAPI UTrie2 * U_EXPORT2
   278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
   279     UTrie2 *trie;
   281     if(U_FAILURE(*pErrorCode)) {
   282         return NULL;
   283     }
   284     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
   285         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   286         return NULL;
   287     }
   289     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
   290     if(trie==NULL) {
   291         return NULL;
   292     }
   293     uprv_memcpy(trie, other, sizeof(UTrie2));
   295     if(other->memory!=NULL) {
   296         trie->memory=uprv_malloc(other->length);
   297         if(trie->memory!=NULL) {
   298             trie->isMemoryOwned=TRUE;
   299             uprv_memcpy(trie->memory, other->memory, other->length);
   301             /* make the clone's pointers point to its own memory */
   302             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
   303             if(other->data16!=NULL) {
   304                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
   305             }
   306             if(other->data32!=NULL) {
   307                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
   308             }
   309         }
   310     } else /* other->newTrie!=NULL */ {
   311         trie->newTrie=cloneBuilder(other->newTrie);
   312     }
   314     if(trie->memory==NULL && trie->newTrie==NULL) {
   315         uprv_free(trie);
   316         trie=NULL;
   317     }
   318     return trie;
   319 }
   321 typedef struct NewTrieAndStatus {
   322     UTrie2 *trie;
   323     UErrorCode errorCode;
   324     UBool exclusiveLimit;  /* rather than inclusive range end */
   325 } NewTrieAndStatus;
   327 static UBool U_CALLCONV
   328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   329     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
   330     if(value!=nt->trie->initialValue) {
   331         if(nt->exclusiveLimit) {
   332             --end;
   333         }
   334         if(start==end) {
   335             utrie2_set32(nt->trie, start, value, &nt->errorCode);
   336         } else {
   337             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
   338         }
   339         return U_SUCCESS(nt->errorCode);
   340     } else {
   341         return TRUE;
   342     }
   343 }
   345 #ifdef UTRIE2_DEBUG
   346 static void
   347 utrie_printLengths(const UTrie *trie) {
   348     long indexLength=trie->indexLength;
   349     long dataLength=(long)trie->dataLength;
   350     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
   351     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
   352            indexLength, dataLength, totalLength);
   353 }
   355 static void
   356 utrie2_printLengths(const UTrie2 *trie, const char *which) {
   357     long indexLength=trie->indexLength;
   358     long dataLength=(long)trie->dataLength;
   359     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
   360     printf("**UTrie2Lengths(%s)** index:%6ld  data:%6ld  serialized:%6ld\n",
   361            which, indexLength, dataLength, totalLength);
   362 }
   363 #endif
   365 U_CAPI UTrie2 * U_EXPORT2
   366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
   367     NewTrieAndStatus context;
   368     UChar lead;
   370     if(U_FAILURE(*pErrorCode)) {
   371         return NULL;
   372     }
   373     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
   374         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   375         return NULL;
   376     }
   377     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
   378         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
   379     }
   381     /* Clone the frozen trie by enumerating it and building a new one. */
   382     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
   383     if(U_FAILURE(*pErrorCode)) {
   384         return NULL;
   385     }
   386     context.exclusiveLimit=FALSE;
   387     context.errorCode=*pErrorCode;
   388     utrie2_enum(other, NULL, copyEnumRange, &context);
   389     *pErrorCode=context.errorCode;
   390     for(lead=0xd800; lead<0xdc00; ++lead) {
   391         uint32_t value;
   392         if(other->data32==NULL) {
   393             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
   394         } else {
   395             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
   396         }
   397         if(value!=other->initialValue) {
   398             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
   399         }
   400     }
   401     if(U_FAILURE(*pErrorCode)) {
   402         utrie2_close(context.trie);
   403         context.trie=NULL;
   404     }
   405     return context.trie;
   406 }
   408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
   409 U_CAPI UTrie2 * U_EXPORT2
   410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
   411     NewTrieAndStatus context;
   412     UChar lead;
   414     if(U_FAILURE(*pErrorCode)) {
   415         return NULL;
   416     }
   417     if(trie1==NULL) {
   418         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   419         return NULL;
   420     }
   421     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
   422     if(U_FAILURE(*pErrorCode)) {
   423         return NULL;
   424     }
   425     context.exclusiveLimit=TRUE;
   426     context.errorCode=*pErrorCode;
   427     utrie_enum(trie1, NULL, copyEnumRange, &context);
   428     *pErrorCode=context.errorCode;
   429     for(lead=0xd800; lead<0xdc00; ++lead) {
   430         uint32_t value;
   431         if(trie1->data32==NULL) {
   432             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
   433         } else {
   434             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
   435         }
   436         if(value!=trie1->initialValue) {
   437             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
   438         }
   439     }
   440     if(U_SUCCESS(*pErrorCode)) {
   441         utrie2_freeze(context.trie,
   442                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
   443                       pErrorCode);
   444     }
   445 #ifdef UTRIE2_DEBUG
   446     if(U_SUCCESS(*pErrorCode)) {
   447         utrie_printLengths(trie1);
   448         utrie2_printLengths(context.trie, "fromUTrie");
   449     }
   450 #endif
   451     if(U_FAILURE(*pErrorCode)) {
   452         utrie2_close(context.trie);
   453         context.trie=NULL;
   454     }
   455     return context.trie;
   456 }
   458 static inline UBool
   459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
   460     int32_t i2, block;
   462     if(U_IS_LEAD(c) && forLSCP) {
   463         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
   464             (c>>UTRIE2_SHIFT_2);
   465     } else {
   466         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
   467             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
   468     }
   469     block=trie->index2[i2];
   470     return (UBool)(block==trie->dataNullOffset);
   471 }
   473 static int32_t
   474 allocIndex2Block(UNewTrie2 *trie) {
   475     int32_t newBlock, newTop;
   477     newBlock=trie->index2Length;
   478     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
   479     if(newTop>LENGTHOF(trie->index2)) {
   480         /*
   481          * Should never occur.
   482          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
   483          * or the code writes more values than should be possible.
   484          */
   485         return -1;
   486     }
   487     trie->index2Length=newTop;
   488     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
   489     return newBlock;
   490 }
   492 static int32_t
   493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
   494     int32_t i1, i2;
   496     if(U_IS_LEAD(c) && forLSCP) {
   497         return UTRIE2_LSCP_INDEX_2_OFFSET;
   498     }
   500     i1=c>>UTRIE2_SHIFT_1;
   501     i2=trie->index1[i1];
   502     if(i2==trie->index2NullOffset) {
   503         i2=allocIndex2Block(trie);
   504         if(i2<0) {
   505             return -1;  /* program error */
   506         }
   507         trie->index1[i1]=i2;
   508     }
   509     return i2;
   510 }
   512 static int32_t
   513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
   514     int32_t newBlock, newTop;
   516     if(trie->firstFreeBlock!=0) {
   517         /* get the first free block */
   518         newBlock=trie->firstFreeBlock;
   519         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
   520     } else {
   521         /* get a new block from the high end */
   522         newBlock=trie->dataLength;
   523         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
   524         if(newTop>trie->dataCapacity) {
   525             /* out of memory in the data array */
   526             int32_t capacity;
   527             uint32_t *data;
   529             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
   530                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
   531             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
   532                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
   533             } else {
   534                 /*
   535                  * Should never occur.
   536                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
   537                  * or the code writes more values than should be possible.
   538                  */
   539                 return -1;
   540             }
   541             data=(uint32_t *)uprv_malloc(capacity*4);
   542             if(data==NULL) {
   543                 return -1;
   544             }
   545             uprv_memcpy(data, trie->data, trie->dataLength*4);
   546             uprv_free(trie->data);
   547             trie->data=data;
   548             trie->dataCapacity=capacity;
   549         }
   550         trie->dataLength=newTop;
   551     }
   552     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
   553     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
   554     return newBlock;
   555 }
   557 /* call when the block's reference counter reaches 0 */
   558 static void
   559 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
   560     /* put this block at the front of the free-block chain */
   561     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
   562     trie->firstFreeBlock=block;
   563 }
   565 static inline UBool
   566 isWritableBlock(UNewTrie2 *trie, int32_t block) {
   567     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
   568 }
   570 static inline void
   571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
   572     int32_t oldBlock;
   573     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
   574     oldBlock=trie->index2[i2];
   575     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
   576         releaseDataBlock(trie, oldBlock);
   577     }
   578     trie->index2[i2]=block;
   579 }
   581 /**
   582  * No error checking for illegal arguments.
   583  *
   584  * @return -1 if no new data block available (out of memory in data array)
   585  * @internal
   586  */
   587 static int32_t
   588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
   589     int32_t i2, oldBlock, newBlock;
   591     i2=getIndex2Block(trie, c, forLSCP);
   592     if(i2<0) {
   593         return -1;  /* program error */
   594     }
   596     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
   597     oldBlock=trie->index2[i2];
   598     if(isWritableBlock(trie, oldBlock)) {
   599         return oldBlock;
   600     }
   602     /* allocate a new data block */
   603     newBlock=allocDataBlock(trie, oldBlock);
   604     if(newBlock<0) {
   605         /* out of memory in the data array */
   606         return -1;
   607     }
   608     setIndex2Entry(trie, i2, newBlock);
   609     return newBlock;
   610 }
   612 /**
   613  * @return TRUE if the value was successfully set
   614  */
   615 static void
   616 set32(UNewTrie2 *trie,
   617       UChar32 c, UBool forLSCP, uint32_t value,
   618       UErrorCode *pErrorCode) {
   619     int32_t block;
   621     if(trie==NULL || trie->isCompacted) {
   622         *pErrorCode=U_NO_WRITE_PERMISSION;
   623         return;
   624     }
   626     block=getDataBlock(trie, c, forLSCP);
   627     if(block<0) {
   628         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   629         return;
   630     }
   632     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
   633 }
   635 U_CAPI void U_EXPORT2
   636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
   637     if(U_FAILURE(*pErrorCode)) {
   638         return;
   639     }
   640     if((uint32_t)c>0x10ffff) {
   641         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   642         return;
   643     }
   644     set32(trie->newTrie, c, TRUE, value, pErrorCode);
   645 }
   647 U_CAPI void U_EXPORT2
   648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
   649                                      UChar32 c, uint32_t value,
   650                                      UErrorCode *pErrorCode) {
   651     if(U_FAILURE(*pErrorCode)) {
   652         return;
   653     }
   654     if(!U_IS_LEAD(c)) {
   655         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   656         return;
   657     }
   658     set32(trie->newTrie, c, FALSE, value, pErrorCode);
   659 }
   661 static void
   662 writeBlock(uint32_t *block, uint32_t value) {
   663     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
   664     while(block<limit) {
   665         *block++=value;
   666     }
   667 }
   669 /**
   670  * initialValue is ignored if overwrite=TRUE
   671  * @internal
   672  */
   673 static void
   674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
   675           uint32_t value, uint32_t initialValue, UBool overwrite) {
   676     uint32_t *pLimit;
   678     pLimit=block+limit;
   679     block+=start;
   680     if(overwrite) {
   681         while(block<pLimit) {
   682             *block++=value;
   683         }
   684     } else {
   685         while(block<pLimit) {
   686             if(*block==initialValue) {
   687                 *block=value;
   688             }
   689             ++block;
   690         }
   691     }
   692 }
   694 U_CAPI void U_EXPORT2
   695 utrie2_setRange32(UTrie2 *trie,
   696                   UChar32 start, UChar32 end,
   697                   uint32_t value, UBool overwrite,
   698                   UErrorCode *pErrorCode) {
   699     /*
   700      * repeat value in [start..end]
   701      * mark index values for repeat-data blocks by setting bit 31 of the index values
   702      * fill around existing values if any, if(overwrite)
   703      */
   704     UNewTrie2 *newTrie;
   705     int32_t block, rest, repeatBlock;
   706     UChar32 limit;
   708     if(U_FAILURE(*pErrorCode)) {
   709         return;
   710     }
   711     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
   712         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   713         return;
   714     }
   715     newTrie=trie->newTrie;
   716     if(newTrie==NULL || newTrie->isCompacted) {
   717         *pErrorCode=U_NO_WRITE_PERMISSION;
   718         return;
   719     }
   720     if(!overwrite && value==newTrie->initialValue) {
   721         return; /* nothing to do */
   722     }
   724     limit=end+1;
   725     if(start&UTRIE2_DATA_MASK) {
   726         UChar32 nextStart;
   728         /* set partial block at [start..following block boundary[ */
   729         block=getDataBlock(newTrie, start, TRUE);
   730         if(block<0) {
   731             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   732             return;
   733         }
   735         nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
   736         if(nextStart<=limit) {
   737             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
   738                       value, newTrie->initialValue, overwrite);
   739             start=nextStart;
   740         } else {
   741             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
   742                       value, newTrie->initialValue, overwrite);
   743             return;
   744         }
   745     }
   747     /* number of positions in the last, partial block */
   748     rest=limit&UTRIE2_DATA_MASK;
   750     /* round down limit to a block boundary */
   751     limit&=~UTRIE2_DATA_MASK;
   753     /* iterate over all-value blocks */
   754     if(value==newTrie->initialValue) {
   755         repeatBlock=newTrie->dataNullOffset;
   756     } else {
   757         repeatBlock=-1;
   758     }
   760     while(start<limit) {
   761         int32_t i2;
   762         UBool setRepeatBlock=FALSE;
   764         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
   765             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
   766             continue;
   767         }
   769         /* get index value */
   770         i2=getIndex2Block(newTrie, start, TRUE);
   771         if(i2<0) {
   772             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
   773             return;
   774         }
   775         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
   776         block=newTrie->index2[i2];
   777         if(isWritableBlock(newTrie, block)) {
   778             /* already allocated */
   779             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
   780                 /*
   781                  * We overwrite all values, and it's not a
   782                  * protected (ASCII-linear or 2-byte UTF-8) block:
   783                  * replace with the repeatBlock.
   784                  */
   785                 setRepeatBlock=TRUE;
   786             } else {
   787                 /* !overwrite, or protected block: just write the values into this block */
   788                 fillBlock(newTrie->data+block,
   789                           0, UTRIE2_DATA_BLOCK_LENGTH,
   790                           value, newTrie->initialValue, overwrite);
   791             }
   792         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
   793             /*
   794              * Set the repeatBlock instead of the null block or previous repeat block:
   795              *
   796              * If !isWritableBlock() then all entries in the block have the same value
   797              * because it's the null block or a range block (the repeatBlock from a previous
   798              * call to utrie2_setRange32()).
   799              * No other blocks are used multiple times before compacting.
   800              *
   801              * The null block is the only non-writable block with the initialValue because
   802              * of the repeatBlock initialization above. (If value==initialValue, then
   803              * the repeatBlock will be the null data block.)
   804              *
   805              * We set our repeatBlock if the desired value differs from the block's value,
   806              * and if we overwrite any data or if the data is all initial values
   807              * (which is the same as the block being the null block, see above).
   808              */
   809             setRepeatBlock=TRUE;
   810         }
   811         if(setRepeatBlock) {
   812             if(repeatBlock>=0) {
   813                 setIndex2Entry(newTrie, i2, repeatBlock);
   814             } else {
   815                 /* create and set and fill the repeatBlock */
   816                 repeatBlock=getDataBlock(newTrie, start, TRUE);
   817                 if(repeatBlock<0) {
   818                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   819                     return;
   820                 }
   821                 writeBlock(newTrie->data+repeatBlock, value);
   822             }
   823         }
   825         start+=UTRIE2_DATA_BLOCK_LENGTH;
   826     }
   828     if(rest>0) {
   829         /* set partial block at [last block boundary..limit[ */
   830         block=getDataBlock(newTrie, start, TRUE);
   831         if(block<0) {
   832             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   833             return;
   834         }
   836         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
   837     }
   839     return;
   840 }
   842 /* compaction --------------------------------------------------------------- */
   844 static inline UBool
   845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
   846     while(length>0 && *s==*t) {
   847         ++s;
   848         ++t;
   849         --length;
   850     }
   851     return (UBool)(length==0);
   852 }
   854 static inline UBool
   855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
   856     while(length>0 && *s==*t) {
   857         ++s;
   858         ++t;
   859         --length;
   860     }
   861     return (UBool)(length==0);
   862 }
   864 static int32_t
   865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
   866     int32_t block;
   868     /* ensure that we do not even partially get past index2Length */
   869     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
   871     for(block=0; block<=index2Length; ++block) {
   872         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
   873             return block;
   874         }
   875     }
   876     return -1;
   877 }
   879 static int32_t
   880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
   881     int32_t block;
   883     /* ensure that we do not even partially get past dataLength */
   884     dataLength-=blockLength;
   886     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
   887         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
   888             return block;
   889         }
   890     }
   891     return -1;
   892 }
   894 /*
   895  * Find the start of the last range in the trie by enumerating backward.
   896  * Indexes for supplementary code points higher than this will be omitted.
   897  */
   898 static UChar32
   899 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
   900     const uint32_t *data32;
   902     uint32_t value, initialValue;
   903     UChar32 c, prev;
   904     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
   906     data32=trie->data;
   907     initialValue=trie->initialValue;
   909     index2NullOffset=trie->index2NullOffset;
   910     nullBlock=trie->dataNullOffset;
   912     /* set variables for previous range */
   913     if(highValue==initialValue) {
   914         prevI2Block=index2NullOffset;
   915         prevBlock=nullBlock;
   916     } else {
   917         prevI2Block=-1;
   918         prevBlock=-1;
   919     }
   920     prev=0x110000;
   922     /* enumerate index-2 blocks */
   923     i1=UNEWTRIE2_INDEX_1_LENGTH;
   924     c=prev;
   925     while(c>0) {
   926         i2Block=trie->index1[--i1];
   927         if(i2Block==prevI2Block) {
   928             /* the index-2 block is the same as the previous one, and filled with highValue */
   929             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
   930             continue;
   931         }
   932         prevI2Block=i2Block;
   933         if(i2Block==index2NullOffset) {
   934             /* this is the null index-2 block */
   935             if(highValue!=initialValue) {
   936                 return c;
   937             }
   938             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
   939         } else {
   940             /* enumerate data blocks for one index-2 block */
   941             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
   942                 block=trie->index2[i2Block+ --i2];
   943                 if(block==prevBlock) {
   944                     /* the block is the same as the previous one, and filled with highValue */
   945                     c-=UTRIE2_DATA_BLOCK_LENGTH;
   946                     continue;
   947                 }
   948                 prevBlock=block;
   949                 if(block==nullBlock) {
   950                     /* this is the null data block */
   951                     if(highValue!=initialValue) {
   952                         return c;
   953                     }
   954                     c-=UTRIE2_DATA_BLOCK_LENGTH;
   955                 } else {
   956                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
   957                         value=data32[block+ --j];
   958                         if(value!=highValue) {
   959                             return c;
   960                         }
   961                         --c;
   962                     }
   963                 }
   964             }
   965         }
   966     }
   968     /* deliver last range */
   969     return 0;
   970 }
   972 /*
   973  * Compact a build-time trie.
   974  *
   975  * The compaction
   976  * - removes blocks that are identical with earlier ones
   977  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
   978  * - moves blocks in steps of the data granularity
   979  * - moves and overlaps blocks that overlap with multiple values in the overlap region
   980  *
   981  * It does not
   982  * - try to move and overlap blocks that are not already adjacent
   983  */
   984 static void
   985 compactData(UNewTrie2 *trie) {
   986     int32_t start, newStart, movedStart;
   987     int32_t blockLength, overlap;
   988     int32_t i, mapIndex, blockCount;
   990     /* do not compact linear-ASCII data */
   991     newStart=UTRIE2_DATA_START_OFFSET;
   992     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
   993         trie->map[i]=start;
   994     }
   996     /*
   997      * Start with a block length of 64 for 2-byte UTF-8,
   998      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
   999      */
  1000     blockLength=64;
  1001     blockCount=blockLength>>UTRIE2_SHIFT_2;
  1002     for(start=newStart; start<trie->dataLength;) {
  1003         /*
  1004          * start: index of first entry of current block
  1005          * newStart: index where the current block is to be moved
  1006          *           (right after current end of already-compacted data)
  1007          */
  1008         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
  1009             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
  1010             blockCount=1;
  1013         /* skip blocks that are not used */
  1014         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
  1015             /* advance start to the next block */
  1016             start+=blockLength;
  1018             /* leave newStart with the previous block! */
  1019             continue;
  1022         /* search for an identical block */
  1023         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
  1024              >=0
  1025         ) {
  1026             /* found an identical block, set the other block's index value for the current block */
  1027             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  1028                 trie->map[mapIndex++]=movedStart;
  1029                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
  1032             /* advance start to the next block */
  1033             start+=blockLength;
  1035             /* leave newStart with the previous block! */
  1036             continue;
  1039         /* see if the beginning of this block can be overlapped with the end of the previous block */
  1040         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
  1041         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
  1042             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
  1043             overlap-=UTRIE2_DATA_GRANULARITY) {}
  1045         if(overlap>0 || newStart<start) {
  1046             /* some overlap, or just move the whole block */
  1047             movedStart=newStart-overlap;
  1048             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  1049                 trie->map[mapIndex++]=movedStart;
  1050                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
  1053             /* move the non-overlapping indexes to their new positions */
  1054             start+=overlap;
  1055             for(i=blockLength-overlap; i>0; --i) {
  1056                 trie->data[newStart++]=trie->data[start++];
  1058         } else /* no overlap && newStart==start */ {
  1059             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  1060                 trie->map[mapIndex++]=start;
  1061                 start+=UTRIE2_DATA_BLOCK_LENGTH;
  1063             newStart=start;
  1067     /* now adjust the index-2 table */
  1068     for(i=0; i<trie->index2Length; ++i) {
  1069         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
  1070             /* Gap indexes are invalid (-1). Skip over the gap. */
  1071             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
  1073         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
  1075     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
  1077     /* ensure dataLength alignment */
  1078     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
  1079         trie->data[newStart++]=trie->initialValue;
  1082 #ifdef UTRIE2_DEBUG
  1083     /* we saved some space */
  1084     printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
  1085             (long)trie->dataLength, (long)newStart);
  1086 #endif
  1088     trie->dataLength=newStart;
  1091 static void
  1092 compactIndex2(UNewTrie2 *trie) {
  1093     int32_t i, start, newStart, movedStart, overlap;
  1095     /* do not compact linear-BMP index-2 blocks */
  1096     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
  1097     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
  1098         trie->map[i]=start;
  1101     /* Reduce the index table gap to what will be needed at runtime. */
  1102     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
  1104     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
  1105         /*
  1106          * start: index of first entry of current block
  1107          * newStart: index where the current block is to be moved
  1108          *           (right after current end of already-compacted data)
  1109          */
  1111         /* search for an identical block */
  1112         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
  1113              >=0
  1114         ) {
  1115             /* found an identical block, set the other block's index value for the current block */
  1116             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
  1118             /* advance start to the next block */
  1119             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
  1121             /* leave newStart with the previous block! */
  1122             continue;
  1125         /* see if the beginning of this block can be overlapped with the end of the previous block */
  1126         /* look for maximum overlap with the previous, adjacent block */
  1127         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
  1128             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
  1129             --overlap) {}
  1131         if(overlap>0 || newStart<start) {
  1132             /* some overlap, or just move the whole block */
  1133             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
  1135             /* move the non-overlapping indexes to their new positions */
  1136             start+=overlap;
  1137             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
  1138                 trie->index2[newStart++]=trie->index2[start++];
  1140         } else /* no overlap && newStart==start */ {
  1141             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
  1142             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
  1143             newStart=start;
  1147     /* now adjust the index-1 table */
  1148     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
  1149         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
  1151     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
  1153     /*
  1154      * Ensure data table alignment:
  1155      * Needs to be granularity-aligned for 16-bit trie
  1156      * (so that dataMove will be down-shiftable),
  1157      * and 2-aligned for uint32_t data.
  1158      */
  1159     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
  1160         /* Arbitrary value: 0x3fffc not possible for real data. */
  1161         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
  1164 #ifdef UTRIE2_DEBUG
  1165     /* we saved some space */
  1166     printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
  1167             (long)trie->index2Length, (long)newStart);
  1168 #endif
  1170     trie->index2Length=newStart;
  1173 static void
  1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
  1175     UNewTrie2 *newTrie;
  1176     UChar32 highStart, suppHighStart;
  1177     uint32_t highValue;
  1179     newTrie=trie->newTrie;
  1181     /* find highStart and round it up */
  1182     highValue=utrie2_get32(trie, 0x10ffff);
  1183     highStart=findHighStart(newTrie, highValue);
  1184     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
  1185     if(highStart==0x110000) {
  1186         highValue=trie->errorValue;
  1189     /*
  1190      * Set trie->highStart only after utrie2_get32(trie, highStart).
  1191      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
  1192      */
  1193     trie->highStart=newTrie->highStart=highStart;
  1195 #ifdef UTRIE2_DEBUG
  1196     printf("UTrie2: highStart U+%04lx  highValue 0x%lx  initialValue 0x%lx\n",
  1197             (long)highStart, (long)highValue, (long)trie->initialValue);
  1198 #endif
  1200     if(highStart<0x110000) {
  1201         /* Blank out [highStart..10ffff] to release associated data blocks. */
  1202         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
  1203         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
  1204         if(U_FAILURE(*pErrorCode)) {
  1205             return;
  1209     compactData(newTrie);
  1210     if(highStart>0x10000) {
  1211         compactIndex2(newTrie);
  1212 #ifdef UTRIE2_DEBUG
  1213     } else {
  1214         printf("UTrie2: highStart U+%04lx  count of 16-bit index-2 words %lu->%lu\n",
  1215                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
  1216 #endif
  1219     /*
  1220      * Store the highValue in the data array and round up the dataLength.
  1221      * Must be done after compactData() because that assumes that dataLength
  1222      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
  1223      */
  1224     newTrie->data[newTrie->dataLength++]=highValue;
  1225     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
  1226         newTrie->data[newTrie->dataLength++]=trie->initialValue;
  1229     newTrie->isCompacted=TRUE;
  1232 /* serialization ------------------------------------------------------------ */
  1234 /**
  1235  * Maximum length of the runtime index array.
  1236  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
  1237  * (The actual maximum length is lower,
  1238  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
  1239  */
  1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
  1242 /**
  1243  * Maximum length of the runtime data array.
  1244  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
  1245  * and by uint16_t UTrie2Header.shiftedDataLength.
  1246  */
  1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
  1249 /* Compact and internally serialize the trie. */
  1250 U_CAPI void U_EXPORT2
  1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
  1252     UNewTrie2 *newTrie;
  1253     UTrie2Header *header;
  1254     uint32_t *p;
  1255     uint16_t *dest16;
  1256     int32_t i, length;
  1257     int32_t allIndexesLength;
  1258     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
  1259     UChar32 highStart;
  1261     /* argument check */
  1262     if(U_FAILURE(*pErrorCode)) {
  1263         return;
  1265     if( trie==NULL ||
  1266         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
  1267     ) {
  1268         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1269         return;
  1271     newTrie=trie->newTrie;
  1272     if(newTrie==NULL) {
  1273         /* already frozen */
  1274         UTrie2ValueBits frozenValueBits=
  1275             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
  1276         if(valueBits!=frozenValueBits) {
  1277             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1279         return;
  1282     /* compact if necessary */
  1283     if(!newTrie->isCompacted) {
  1284         compactTrie(trie, pErrorCode);
  1285         if(U_FAILURE(*pErrorCode)) {
  1286             return;
  1289     highStart=trie->highStart;
  1291     if(highStart<=0x10000) {
  1292         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
  1293     } else {
  1294         allIndexesLength=newTrie->index2Length;
  1296     if(valueBits==UTRIE2_16_VALUE_BITS) {
  1297         dataMove=allIndexesLength;
  1298     } else {
  1299         dataMove=0;
  1302     /* are indexLength and dataLength within limits? */
  1303     if( /* for unshifted indexLength */
  1304         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
  1305         /* for unshifted dataNullOffset */
  1306         (dataMove+newTrie->dataNullOffset)>0xffff ||
  1307         /* for unshifted 2-byte UTF-8 index-2 values */
  1308         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
  1309         /* for shiftedDataLength */
  1310         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
  1311     ) {
  1312         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1313         return;
  1316     /* calculate the total serialized length */
  1317     length=sizeof(UTrie2Header)+allIndexesLength*2;
  1318     if(valueBits==UTRIE2_16_VALUE_BITS) {
  1319         length+=newTrie->dataLength*2;
  1320     } else {
  1321         length+=newTrie->dataLength*4;
  1324     trie->memory=uprv_malloc(length);
  1325     if(trie->memory==NULL) {
  1326         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  1327         return;
  1329     trie->length=length;
  1330     trie->isMemoryOwned=TRUE;
  1332     trie->indexLength=allIndexesLength;
  1333     trie->dataLength=newTrie->dataLength;
  1334     if(highStart<=0x10000) {
  1335         trie->index2NullOffset=0xffff;
  1336     } else {
  1337         trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
  1339     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
  1340     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
  1342     /* set the header fields */
  1343     header=(UTrie2Header *)trie->memory;
  1345     header->signature=UTRIE2_SIG; /* "Tri2" */
  1346     header->options=(uint16_t)valueBits;
  1348     header->indexLength=(uint16_t)trie->indexLength;
  1349     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
  1350     header->index2NullOffset=trie->index2NullOffset;
  1351     header->dataNullOffset=trie->dataNullOffset;
  1352     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
  1354     /* fill the index and data arrays */
  1355     dest16=(uint16_t *)(header+1);
  1356     trie->index=dest16;
  1358     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
  1359     p=(uint32_t *)newTrie->index2;
  1360     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
  1361         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
  1364     /* write UTF-8 2-byte index-2 values, not right-shifted */
  1365     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
  1366         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
  1368     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
  1369         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
  1372     if(highStart>0x10000) {
  1373         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
  1374         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
  1376         /* write 16-bit index-1 values for supplementary code points */
  1377         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
  1378         for(i=index1Length; i>0; --i) {
  1379             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
  1382         /*
  1383          * write the index-2 array values for supplementary code points,
  1384          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
  1385          */
  1386         p=(uint32_t *)newTrie->index2+index2Offset;
  1387         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
  1388             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
  1392     /* write the 16/32-bit data array */
  1393     switch(valueBits) {
  1394     case UTRIE2_16_VALUE_BITS:
  1395         /* write 16-bit data values */
  1396         trie->data16=dest16;
  1397         trie->data32=NULL;
  1398         p=newTrie->data;
  1399         for(i=newTrie->dataLength; i>0; --i) {
  1400             *dest16++=(uint16_t)*p++;
  1402         break;
  1403     case UTRIE2_32_VALUE_BITS:
  1404         /* write 32-bit data values */
  1405         trie->data16=NULL;
  1406         trie->data32=(uint32_t *)dest16;
  1407         uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4);
  1408         break;
  1409     default:
  1410         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1411         return;
  1414     /* Delete the UNewTrie2. */
  1415     uprv_free(newTrie->data);
  1416     uprv_free(newTrie);
  1417     trie->newTrie=NULL;
  1420 U_CAPI UBool U_EXPORT2
  1421 utrie2_isFrozen(const UTrie2 *trie) {
  1422     return (UBool)(trie->newTrie==NULL);
  1425 U_CAPI int32_t U_EXPORT2
  1426 utrie2_serialize(UTrie2 *trie,
  1427                  void *data, int32_t capacity,
  1428                  UErrorCode *pErrorCode) {
  1429     /* argument check */
  1430     if(U_FAILURE(*pErrorCode)) {
  1431         return 0;
  1434     if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL ||
  1435         capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0)))
  1436     ) {
  1437         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1438         return 0;
  1441     if(capacity>=trie->length) {
  1442         uprv_memcpy(data, trie->memory, trie->length);
  1443     } else {
  1444         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1446     return trie->length;
  1449 /*
  1450  * This is here to avoid a dependency from utrie2.cpp on utrie.c.
  1451  * This file already depends on utrie.c.
  1452  * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
  1453  */
  1454 U_CAPI int32_t U_EXPORT2
  1455 utrie2_swapAnyVersion(const UDataSwapper *ds,
  1456                       const void *inData, int32_t length, void *outData,
  1457                       UErrorCode *pErrorCode) {
  1458     if(U_SUCCESS(*pErrorCode)) {
  1459         switch(utrie2_getVersion(inData, length, TRUE)) {
  1460         case 1:
  1461             return utrie_swap(ds, inData, length, outData, pErrorCode);
  1462         case 2:
  1463             return utrie2_swap(ds, inData, length, outData, pErrorCode);
  1464         default:
  1465             *pErrorCode=U_INVALID_FORMAT_ERROR;
  1466             return 0;
  1469     return 0;

mercurial