intl/icu/source/common/utrie.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2001-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  utrie.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2001oct20
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This is a common implementation of a "folded" trie.
    17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
    18 *   Unicode code points (0..0x10ffff).
    19 */
    21 #ifdef UTRIE_DEBUG
    22 #   include <stdio.h>
    23 #endif
    25 #include "unicode/utypes.h"
    26 #include "cmemory.h"
    27 #include "utrie.h"
    29 /* miscellaneous ------------------------------------------------------------ */
    31 #undef ABS
    32 #define ABS(x) ((x)>=0 ? (x) : -(x))
    34 static inline UBool
    35 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
    36     while(length>0 && *s==*t) {
    37         ++s;
    38         ++t;
    39         --length;
    40     }
    41     return (UBool)(length==0);
    42 }
    44 /* Building a trie ----------------------------------------------------------*/
    46 U_CAPI UNewTrie * U_EXPORT2
    47 utrie_open(UNewTrie *fillIn,
    48            uint32_t *aliasData, int32_t maxDataLength,
    49            uint32_t initialValue, uint32_t leadUnitValue,
    50            UBool latin1Linear) {
    51     UNewTrie *trie;
    52     int32_t i, j;
    54     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
    55         (latin1Linear && maxDataLength<1024)
    56     ) {
    57         return NULL;
    58     }
    60     if(fillIn!=NULL) {
    61         trie=fillIn;
    62     } else {
    63         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
    64         if(trie==NULL) {
    65             return NULL;
    66         }
    67     }
    68     uprv_memset(trie, 0, sizeof(UNewTrie));
    69     trie->isAllocated= (UBool)(fillIn==NULL);
    71     if(aliasData!=NULL) {
    72         trie->data=aliasData;
    73         trie->isDataAllocated=FALSE;
    74     } else {
    75         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
    76         if(trie->data==NULL) {
    77             uprv_free(trie);
    78             return NULL;
    79         }
    80         trie->isDataAllocated=TRUE;
    81     }
    83     /* preallocate and reset the first data block (block index 0) */
    84     j=UTRIE_DATA_BLOCK_LENGTH;
    86     if(latin1Linear) {
    87         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
    88         /* made sure above that maxDataLength>=1024 */
    90         /* set indexes to point to consecutive data blocks */
    91         i=0;
    92         do {
    93             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
    94             trie->index[i++]=j;
    95             j+=UTRIE_DATA_BLOCK_LENGTH;
    96         } while(i<(256>>UTRIE_SHIFT));
    97     }
    99     /* reset the initially allocated blocks to the initial value */
   100     trie->dataLength=j;
   101     while(j>0) {
   102         trie->data[--j]=initialValue;
   103     }
   105     trie->leadUnitValue=leadUnitValue;
   106     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
   107     trie->dataCapacity=maxDataLength;
   108     trie->isLatin1Linear=latin1Linear;
   109     trie->isCompacted=FALSE;
   110     return trie;
   111 }
   113 U_CAPI UNewTrie * U_EXPORT2
   114 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
   115     UNewTrie *trie;
   116     UBool isDataAllocated;
   118     /* do not clone if other is not valid or already compacted */
   119     if(other==NULL || other->data==NULL || other->isCompacted) {
   120         return NULL;
   121     }
   123     /* clone data */
   124     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
   125         isDataAllocated=FALSE;
   126     } else {
   127         aliasDataCapacity=other->dataCapacity;
   128         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
   129         if(aliasData==NULL) {
   130             return NULL;
   131         }
   132         isDataAllocated=TRUE;
   133     }
   135     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
   136                     other->data[0], other->leadUnitValue,
   137                     other->isLatin1Linear);
   138     if(trie==NULL) {
   139         uprv_free(aliasData);
   140     } else {
   141         uprv_memcpy(trie->index, other->index, sizeof(trie->index));
   142         uprv_memcpy(trie->data, other->data, other->dataLength*4);
   143         trie->dataLength=other->dataLength;
   144         trie->isDataAllocated=isDataAllocated;
   145     }
   147     return trie;
   148 }
   150 U_CAPI void U_EXPORT2
   151 utrie_close(UNewTrie *trie) {
   152     if(trie!=NULL) {
   153         if(trie->isDataAllocated) {
   154             uprv_free(trie->data);
   155             trie->data=NULL;
   156         }
   157         if(trie->isAllocated) {
   158             uprv_free(trie);
   159         }
   160     }
   161 }
   163 U_CAPI uint32_t * U_EXPORT2
   164 utrie_getData(UNewTrie *trie, int32_t *pLength) {
   165     if(trie==NULL || pLength==NULL) {
   166         return NULL;
   167     }
   169     *pLength=trie->dataLength;
   170     return trie->data;
   171 }
   173 static int32_t
   174 utrie_allocDataBlock(UNewTrie *trie) {
   175     int32_t newBlock, newTop;
   177     newBlock=trie->dataLength;
   178     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
   179     if(newTop>trie->dataCapacity) {
   180         /* out of memory in the data array */
   181         return -1;
   182     }
   183     trie->dataLength=newTop;
   184     return newBlock;
   185 }
   187 /**
   188  * No error checking for illegal arguments.
   189  *
   190  * @return -1 if no new data block available (out of memory in data array)
   191  * @internal
   192  */
   193 static int32_t
   194 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
   195     int32_t indexValue, newBlock;
   197     c>>=UTRIE_SHIFT;
   198     indexValue=trie->index[c];
   199     if(indexValue>0) {
   200         return indexValue;
   201     }
   203     /* allocate a new data block */
   204     newBlock=utrie_allocDataBlock(trie);
   205     if(newBlock<0) {
   206         /* out of memory in the data array */
   207         return -1;
   208     }
   209     trie->index[c]=newBlock;
   211     /* copy-on-write for a block from a setRange() */
   212     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
   213     return newBlock;
   214 }
   216 /**
   217  * @return TRUE if the value was successfully set
   218  */
   219 U_CAPI UBool U_EXPORT2
   220 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
   221     int32_t block;
   223     /* valid, uncompacted trie and valid c? */
   224     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
   225         return FALSE;
   226     }
   228     block=utrie_getDataBlock(trie, c);
   229     if(block<0) {
   230         return FALSE;
   231     }
   233     trie->data[block+(c&UTRIE_MASK)]=value;
   234     return TRUE;
   235 }
   237 U_CAPI uint32_t U_EXPORT2
   238 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
   239     int32_t block;
   241     /* valid, uncompacted trie and valid c? */
   242     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
   243         if(pInBlockZero!=NULL) {
   244             *pInBlockZero=TRUE;
   245         }
   246         return 0;
   247     }
   249     block=trie->index[c>>UTRIE_SHIFT];
   250     if(pInBlockZero!=NULL) {
   251         *pInBlockZero= (UBool)(block==0);
   252     }
   254     return trie->data[ABS(block)+(c&UTRIE_MASK)];
   255 }
   257 /**
   258  * @internal
   259  */
   260 static void
   261 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
   262                 uint32_t value, uint32_t initialValue, UBool overwrite) {
   263     uint32_t *pLimit;
   265     pLimit=block+limit;
   266     block+=start;
   267     if(overwrite) {
   268         while(block<pLimit) {
   269             *block++=value;
   270         }
   271     } else {
   272         while(block<pLimit) {
   273             if(*block==initialValue) {
   274                 *block=value;
   275             }
   276             ++block;
   277         }
   278     }
   279 }
   281 U_CAPI UBool U_EXPORT2
   282 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
   283     /*
   284      * repeat value in [start..limit[
   285      * mark index values for repeat-data blocks by setting bit 31 of the index values
   286      * fill around existing values if any, if(overwrite)
   287      */
   288     uint32_t initialValue;
   289     int32_t block, rest, repeatBlock;
   291     /* valid, uncompacted trie and valid indexes? */
   292     if( trie==NULL || trie->isCompacted ||
   293         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
   294     ) {
   295         return FALSE;
   296     }
   297     if(start==limit) {
   298         return TRUE; /* nothing to do */
   299     }
   301     initialValue=trie->data[0];
   302     if(start&UTRIE_MASK) {
   303         UChar32 nextStart;
   305         /* set partial block at [start..following block boundary[ */
   306         block=utrie_getDataBlock(trie, start);
   307         if(block<0) {
   308             return FALSE;
   309         }
   311         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
   312         if(nextStart<=limit) {
   313             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
   314                             value, initialValue, overwrite);
   315             start=nextStart;
   316         } else {
   317             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
   318                             value, initialValue, overwrite);
   319             return TRUE;
   320         }
   321     }
   323     /* number of positions in the last, partial block */
   324     rest=limit&UTRIE_MASK;
   326     /* round down limit to a block boundary */
   327     limit&=~UTRIE_MASK;
   329     /* iterate over all-value blocks */
   330     if(value==initialValue) {
   331         repeatBlock=0;
   332     } else {
   333         repeatBlock=-1;
   334     }
   335     while(start<limit) {
   336         /* get index value */
   337         block=trie->index[start>>UTRIE_SHIFT];
   338         if(block>0) {
   339             /* already allocated, fill in value */
   340             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
   341         } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
   342             /* set the repeatBlock instead of the current block 0 or range block */
   343             if(repeatBlock>=0) {
   344                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
   345             } else {
   346                 /* create and set and fill the repeatBlock */
   347                 repeatBlock=utrie_getDataBlock(trie, start);
   348                 if(repeatBlock<0) {
   349                     return FALSE;
   350                 }
   352                 /* set the negative block number to indicate that it is a repeat block */
   353                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
   354                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
   355             }
   356         }
   358         start+=UTRIE_DATA_BLOCK_LENGTH;
   359     }
   361     if(rest>0) {
   362         /* set partial block at [last block boundary..limit[ */
   363         block=utrie_getDataBlock(trie, start);
   364         if(block<0) {
   365             return FALSE;
   366         }
   368         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
   369     }
   371     return TRUE;
   372 }
   374 static int32_t
   375 _findSameIndexBlock(const int32_t *idx, int32_t indexLength,
   376                     int32_t otherBlock) {
   377     int32_t block, i;
   379     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
   380         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
   381             if(idx[block+i]!=idx[otherBlock+i]) {
   382                 break;
   383             }
   384         }
   385         if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
   386             return block;
   387         }
   388     }
   389     return indexLength;
   390 }
   392 /*
   393  * Fold the normalization data for supplementary code points into
   394  * a compact area on top of the BMP-part of the trie index,
   395  * with the lead surrogates indexing this compact area.
   396  *
   397  * Duplicate the index values for lead surrogates:
   398  * From inside the BMP area, where some may be overridden with folded values,
   399  * to just after the BMP area, where they can be retrieved for
   400  * code point lookups.
   401  */
   402 static void
   403 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
   404     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
   405     int32_t *idx;
   406     uint32_t value;
   407     UChar32 c;
   408     int32_t indexLength, block;
   409 #ifdef UTRIE_DEBUG
   410     int countLeadCUWithData=0;
   411 #endif
   413     idx=trie->index;
   415     /* copy the lead surrogate indexes into a temporary array */
   416     uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
   418     /*
   419      * set all values for lead surrogate code *units* to leadUnitValue
   420      * so that, by default, runtime lookups will find no data for associated
   421      * supplementary code points, unless there is data for such code points
   422      * which will result in a non-zero folding value below that is set for
   423      * the respective lead units
   424      *
   425      * the above saved the indexes for surrogate code *points*
   426      * fill the indexes with simplified code from utrie_setRange32()
   427      */
   428     if(trie->leadUnitValue==trie->data[0]) {
   429         block=0; /* leadUnitValue==initialValue, use all-initial-value block */
   430     } else {
   431         /* create and fill the repeatBlock */
   432         block=utrie_allocDataBlock(trie);
   433         if(block<0) {
   434             /* data table overflow */
   435             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   436             return;
   437         }
   438         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
   439         block=-block; /* negative block number to indicate that it is a repeat block */
   440     }
   441     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
   442         trie->index[c]=block;
   443     }
   445     /*
   446      * Fold significant index values into the area just after the BMP indexes.
   447      * In case the first lead surrogate has significant data,
   448      * its index block must be used first (in which case the folding is a no-op).
   449      * Later all folded index blocks are moved up one to insert the copied
   450      * lead surrogate indexes.
   451      */
   452     indexLength=UTRIE_BMP_INDEX_LENGTH;
   454     /* search for any index (stage 1) entries for supplementary code points */
   455     for(c=0x10000; c<0x110000;) {
   456         if(idx[c>>UTRIE_SHIFT]!=0) {
   457             /* there is data, treat the full block for a lead surrogate */
   458             c&=~0x3ff;
   460 #ifdef UTRIE_DEBUG
   461             ++countLeadCUWithData;
   462             /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
   463 #endif
   465             /* is there an identical index block? */
   466             block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
   468             /*
   469              * get a folded value for [c..c+0x400[ and,
   470              * if different from the value for the lead surrogate code point,
   471              * set it for the lead surrogate code unit
   472              */
   473             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
   474             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
   475                 if(!utrie_set32(trie, U16_LEAD(c), value)) {
   476                     /* data table overflow */
   477                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   478                     return;
   479                 }
   481                 /* if we did not find an identical index block... */
   482                 if(block==indexLength) {
   483                     /* move the actual index (stage 1) entries from the supplementary position to the new one */
   484                     uprv_memmove(idx+indexLength,
   485                                  idx+(c>>UTRIE_SHIFT),
   486                                  4*UTRIE_SURROGATE_BLOCK_COUNT);
   487                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
   488                 }
   489             }
   490             c+=0x400;
   491         } else {
   492             c+=UTRIE_DATA_BLOCK_LENGTH;
   493         }
   494     }
   495 #ifdef UTRIE_DEBUG
   496     if(countLeadCUWithData>0) {
   497         printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
   498     }
   499 #endif
   501     /*
   502      * index array overflow?
   503      * This is to guarantee that a folding offset is of the form
   504      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
   505      * If the index is too large, then n>=1024 and more than 10 bits are necessary.
   506      *
   507      * In fact, it can only ever become n==1024 with completely unfoldable data and
   508      * the additional block of duplicated values for lead surrogates.
   509      */
   510     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
   511         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   512         return;
   513     }
   515     /*
   516      * make space for the lead surrogate index block and
   517      * insert it between the BMP indexes and the folded ones
   518      */
   519     uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
   520                  idx+UTRIE_BMP_INDEX_LENGTH,
   521                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
   522     uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
   523                 leadIndexes,
   524                 4*UTRIE_SURROGATE_BLOCK_COUNT);
   525     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
   527 #ifdef UTRIE_DEBUG
   528     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
   529            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
   530 #endif
   532     trie->indexLength=indexLength;
   533 }
   535 /*
   536  * Set a value in the trie index map to indicate which data block
   537  * is referenced and which one is not.
   538  * utrie_compact() will remove data blocks that are not used at all.
   539  * Set
   540  * - 0 if it is used
   541  * - -1 if it is not used
   542  */
   543 static void
   544 _findUnusedBlocks(UNewTrie *trie) {
   545     int32_t i;
   547     /* fill the entire map with "not used" */
   548     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
   550     /* mark each block that _is_ used with 0 */
   551     for(i=0; i<trie->indexLength; ++i) {
   552         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
   553     }
   555     /* never move the all-initial-value block 0 */
   556     trie->map[0]=0;
   557 }
   559 static int32_t
   560 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
   561                    int32_t otherBlock, int32_t step) {
   562     int32_t block;
   564     /* ensure that we do not even partially get past dataLength */
   565     dataLength-=UTRIE_DATA_BLOCK_LENGTH;
   567     for(block=0; block<=dataLength; block+=step) {
   568         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
   569             return block;
   570         }
   571     }
   572     return -1;
   573 }
   575 /*
   576  * Compact a folded build-time trie.
   577  *
   578  * The compaction
   579  * - removes blocks that are identical with earlier ones
   580  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
   581  * - moves blocks in steps of the data granularity
   582  * - moves and overlaps blocks that overlap with multiple values in the overlap region
   583  *
   584  * It does not
   585  * - try to move and overlap blocks that are not already adjacent
   586  */
   587 static void
   588 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
   589     int32_t i, start, newStart, overlapStart;
   591     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   592         return;
   593     }
   595     /* valid, uncompacted trie? */
   596     if(trie==NULL) {
   597         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   598         return;
   599     }
   600     if(trie->isCompacted) {
   601         return; /* nothing left to do */
   602     }
   604     /* compaction */
   606     /* initialize the index map with "block is used/unused" flags */
   607     _findUnusedBlocks(trie);
   609     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
   610     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
   611         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
   612     } else {
   613         overlapStart=UTRIE_DATA_BLOCK_LENGTH;
   614     }
   616     newStart=UTRIE_DATA_BLOCK_LENGTH;
   617     for(start=newStart; start<trie->dataLength;) {
   618         /*
   619          * start: index of first entry of current block
   620          * newStart: index where the current block is to be moved
   621          *           (right after current end of already-compacted data)
   622          */
   624         /* skip blocks that are not used */
   625         if(trie->map[start>>UTRIE_SHIFT]<0) {
   626             /* advance start to the next block */
   627             start+=UTRIE_DATA_BLOCK_LENGTH;
   629             /* leave newStart with the previous block! */
   630             continue;
   631         }
   633         /* search for an identical block */
   634         if( start>=overlapStart &&
   635             (i=_findSameDataBlock(trie->data, newStart, start,
   636                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
   637              >=0
   638         ) {
   639             /* found an identical block, set the other block's index value for the current block */
   640             trie->map[start>>UTRIE_SHIFT]=i;
   642             /* advance start to the next block */
   643             start+=UTRIE_DATA_BLOCK_LENGTH;
   645             /* leave newStart with the previous block! */
   646             continue;
   647         }
   649         /* see if the beginning of this block can be overlapped with the end of the previous block */
   650         if(overlap && start>=overlapStart) {
   651             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
   652             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
   653                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
   654                 i-=UTRIE_DATA_GRANULARITY) {}
   655         } else {
   656             i=0;
   657         }
   659         if(i>0) {
   660             /* some overlap */
   661             trie->map[start>>UTRIE_SHIFT]=newStart-i;
   663             /* move the non-overlapping indexes to their new positions */
   664             start+=i;
   665             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
   666                 trie->data[newStart++]=trie->data[start++];
   667             }
   668         } else if(newStart<start) {
   669             /* no overlap, just move the indexes to their new positions */
   670             trie->map[start>>UTRIE_SHIFT]=newStart;
   671             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
   672                 trie->data[newStart++]=trie->data[start++];
   673             }
   674         } else /* no overlap && newStart==start */ {
   675             trie->map[start>>UTRIE_SHIFT]=start;
   676             newStart+=UTRIE_DATA_BLOCK_LENGTH;
   677             start=newStart;
   678         }
   679     }
   681     /* now adjust the index (stage 1) table */
   682     for(i=0; i<trie->indexLength; ++i) {
   683         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
   684     }
   686 #ifdef UTRIE_DEBUG
   687     /* we saved some space */
   688     printf("compacting trie: count of 32-bit words %lu->%lu\n",
   689             (long)trie->dataLength, (long)newStart);
   690 #endif
   692     trie->dataLength=newStart;
   693 }
   695 /* serialization ------------------------------------------------------------ */
   697 /*
   698  * Default function for the folding value:
   699  * Just store the offset (16 bits) if there is any non-initial-value entry.
   700  *
   701  * The offset parameter is never 0.
   702  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
   703  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
   704  * which fits into 16-bit trie values;
   705  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
   706  *
   707  * Theoretically, it would be safer for all possible UTRIE_SHIFT including
   708  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
   709  * which would always result in a value of 0x40..0x43f
   710  * (start/end 1k blocks of supplementary Unicode code points).
   711  * However, this would be uglier, and would not work for some existing
   712  * binary data file formats.
   713  *
   714  * Also, we do not plan to change UTRIE_SHIFT because it would change binary
   715  * data file formats, and we would probably not make it smaller because of
   716  * the then even larger BMP index length even for empty tries.
   717  */
   718 static uint32_t U_CALLCONV
   719 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
   720     uint32_t value, initialValue;
   721     UChar32 limit;
   722     UBool inBlockZero;
   724     initialValue=trie->data[0];
   725     limit=start+0x400;
   726     while(start<limit) {
   727         value=utrie_get32(trie, start, &inBlockZero);
   728         if(inBlockZero) {
   729             start+=UTRIE_DATA_BLOCK_LENGTH;
   730         } else if(value!=initialValue) {
   731             return (uint32_t)offset;
   732         } else {
   733             ++start;
   734         }
   735     }
   736     return 0;
   737 }
   739 U_CAPI int32_t U_EXPORT2
   740 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
   741                 UNewTrieGetFoldedValue *getFoldedValue,
   742                 UBool reduceTo16Bits,
   743                 UErrorCode *pErrorCode) {
   744     UTrieHeader *header;
   745     uint32_t *p;
   746     uint16_t *dest16;
   747     int32_t i, length;
   748     uint8_t* data = NULL;
   750     /* argument check */
   751     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   752         return 0;
   753     }
   755     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
   756         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   757         return 0;
   758     }
   759     if(getFoldedValue==NULL) {
   760         getFoldedValue=defaultGetFoldedValue;
   761     }
   763     data = (uint8_t*)dt;
   764     /* fold and compact if necessary, also checks that indexLength is within limits */
   765     if(!trie->isCompacted) {
   766         /* compact once without overlap to improve folding */
   767         utrie_compact(trie, FALSE, pErrorCode);
   769         /* fold the supplementary part of the index array */
   770         utrie_fold(trie, getFoldedValue, pErrorCode);
   772         /* compact again with overlap for minimum data array length */
   773         utrie_compact(trie, TRUE, pErrorCode);
   775         trie->isCompacted=TRUE;
   776         if(U_FAILURE(*pErrorCode)) {
   777             return 0;
   778         }
   779     }
   781     /* is dataLength within limits? */
   782     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
   783         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   784     }
   786     length=sizeof(UTrieHeader)+2*trie->indexLength;
   787     if(reduceTo16Bits) {
   788         length+=2*trie->dataLength;
   789     } else {
   790         length+=4*trie->dataLength;
   791     }
   793     if(length>capacity) {
   794         return length; /* preflighting */
   795     }
   797 #ifdef UTRIE_DEBUG
   798     printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n",
   799            (long)trie->indexLength, (long)trie->dataLength, (long)length);
   800 #endif
   802     /* set the header fields */
   803     header=(UTrieHeader *)data;
   804     data+=sizeof(UTrieHeader);
   806     header->signature=0x54726965; /* "Trie" */
   807     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
   809     if(!reduceTo16Bits) {
   810         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
   811     }
   812     if(trie->isLatin1Linear) {
   813         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
   814     }
   816     header->indexLength=trie->indexLength;
   817     header->dataLength=trie->dataLength;
   819     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
   820     if(reduceTo16Bits) {
   821         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
   822         p=(uint32_t *)trie->index;
   823         dest16=(uint16_t *)data;
   824         for(i=trie->indexLength; i>0; --i) {
   825             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
   826         }
   828         /* write 16-bit data values */
   829         p=trie->data;
   830         for(i=trie->dataLength; i>0; --i) {
   831             *dest16++=(uint16_t)*p++;
   832         }
   833     } else {
   834         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
   835         p=(uint32_t *)trie->index;
   836         dest16=(uint16_t *)data;
   837         for(i=trie->indexLength; i>0; --i) {
   838             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
   839         }
   841         /* write 32-bit data values */
   842         uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
   843     }
   845     return length;
   846 }
   848 /* inverse to defaultGetFoldedValue() */
   849 U_CAPI int32_t U_EXPORT2
   850 utrie_defaultGetFoldingOffset(uint32_t data) {
   851     return (int32_t)data;
   852 }
   854 U_CAPI int32_t U_EXPORT2
   855 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
   856     const UTrieHeader *header;
   857     const uint16_t *p16;
   858     uint32_t options;
   860     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   861         return -1;
   862     }
   864     /* enough data for a trie header? */
   865     if(length<(int32_t)sizeof(UTrieHeader)) {
   866         *pErrorCode=U_INVALID_FORMAT_ERROR;
   867         return -1;
   868     }
   870     /* check the signature */
   871     header=(const UTrieHeader *)data;
   872     if(header->signature!=0x54726965) {
   873         *pErrorCode=U_INVALID_FORMAT_ERROR;
   874         return -1;
   875     }
   877     /* get the options and check the shift values */
   878     options=header->options;
   879     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
   880         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
   881     ) {
   882         *pErrorCode=U_INVALID_FORMAT_ERROR;
   883         return -1;
   884     }
   885     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
   887     /* get the length values */
   888     trie->indexLength=header->indexLength;
   889     trie->dataLength=header->dataLength;
   891     length-=(int32_t)sizeof(UTrieHeader);
   893     /* enough data for the index? */
   894     if(length<2*trie->indexLength) {
   895         *pErrorCode=U_INVALID_FORMAT_ERROR;
   896         return -1;
   897     }
   898     p16=(const uint16_t *)(header+1);
   899     trie->index=p16;
   900     p16+=trie->indexLength;
   901     length-=2*trie->indexLength;
   903     /* get the data */
   904     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
   905         if(length<4*trie->dataLength) {
   906             *pErrorCode=U_INVALID_FORMAT_ERROR;
   907             return -1;
   908         }
   909         trie->data32=(const uint32_t *)p16;
   910         trie->initialValue=trie->data32[0];
   911         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
   912     } else {
   913         if(length<2*trie->dataLength) {
   914             *pErrorCode=U_INVALID_FORMAT_ERROR;
   915             return -1;
   916         }
   918         /* the "data16" data is used via the index pointer */
   919         trie->data32=NULL;
   920         trie->initialValue=trie->index[trie->indexLength];
   921         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
   922     }
   924     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
   926     return length;
   927 }
   929 U_CAPI int32_t U_EXPORT2
   930 utrie_unserializeDummy(UTrie *trie,
   931                        void *data, int32_t length,
   932                        uint32_t initialValue, uint32_t leadUnitValue,
   933                        UBool make16BitTrie,
   934                        UErrorCode *pErrorCode) {
   935     uint16_t *p16;
   936     int32_t actualLength, latin1Length, i, limit;
   937     uint16_t block;
   939     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   940         return -1;
   941     }
   943     /* calculate the actual size of the dummy trie data */
   945     /* max(Latin-1, block 0) */
   946     latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/
   948     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
   949     trie->dataLength=latin1Length;
   950     if(leadUnitValue!=initialValue) {
   951         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
   952     }
   954     actualLength=trie->indexLength*2;
   955     if(make16BitTrie) {
   956         actualLength+=trie->dataLength*2;
   957     } else {
   958         actualLength+=trie->dataLength*4;
   959     }
   961     /* enough space for the dummy trie? */
   962     if(length<actualLength) {
   963         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   964         return actualLength;
   965     }
   967     trie->isLatin1Linear=TRUE;
   968     trie->initialValue=initialValue;
   970     /* fill the index and data arrays */
   971     p16=(uint16_t *)data;
   972     trie->index=p16;
   974     if(make16BitTrie) {
   975         /* indexes to block 0 */
   976         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
   977         limit=trie->indexLength;
   978         for(i=0; i<limit; ++i) {
   979             p16[i]=block;
   980         }
   982         if(leadUnitValue!=initialValue) {
   983             /* indexes for lead surrogate code units to the block after Latin-1 */
   984             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
   985             i=0xd800>>UTRIE_SHIFT;
   986             limit=0xdc00>>UTRIE_SHIFT;
   987             for(; i<limit; ++i) {
   988                 p16[i]=block;
   989             }
   990         }
   992         trie->data32=NULL;
   994         /* Latin-1 data */
   995         p16+=trie->indexLength;
   996         for(i=0; i<latin1Length; ++i) {
   997             p16[i]=(uint16_t)initialValue;
   998         }
  1000         /* data for lead surrogate code units */
  1001         if(leadUnitValue!=initialValue) {
  1002             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
  1003             for(/* i=latin1Length */; i<limit; ++i) {
  1004                 p16[i]=(uint16_t)leadUnitValue;
  1007     } else {
  1008         uint32_t *p32;
  1010         /* indexes to block 0 */
  1011         uprv_memset(p16, 0, trie->indexLength*2);
  1013         if(leadUnitValue!=initialValue) {
  1014             /* indexes for lead surrogate code units to the block after Latin-1 */
  1015             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
  1016             i=0xd800>>UTRIE_SHIFT;
  1017             limit=0xdc00>>UTRIE_SHIFT;
  1018             for(; i<limit; ++i) {
  1019                 p16[i]=block;
  1023         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
  1025         /* Latin-1 data */
  1026         for(i=0; i<latin1Length; ++i) {
  1027             p32[i]=initialValue;
  1030         /* data for lead surrogate code units */
  1031         if(leadUnitValue!=initialValue) {
  1032             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
  1033             for(/* i=latin1Length */; i<limit; ++i) {
  1034                 p32[i]=leadUnitValue;
  1039     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
  1041     return actualLength;
  1044 /* enumeration -------------------------------------------------------------- */
  1046 /* default UTrieEnumValue() returns the input value itself */
  1047 static uint32_t U_CALLCONV
  1048 enumSameValue(const void * /*context*/, uint32_t value) {
  1049     return value;
  1052 /**
  1053  * Enumerate all ranges of code points with the same relevant values.
  1054  * The values are transformed from the raw trie entries by the enumValue function.
  1055  */
  1056 U_CAPI void U_EXPORT2
  1057 utrie_enum(const UTrie *trie,
  1058            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
  1059     const uint32_t *data32;
  1060     const uint16_t *idx;
  1062     uint32_t value, prevValue, initialValue;
  1063     UChar32 c, prev;
  1064     int32_t l, i, j, block, prevBlock, nullBlock, offset;
  1066     /* check arguments */
  1067     if(trie==NULL || trie->index==NULL || enumRange==NULL) {
  1068         return;
  1070     if(enumValue==NULL) {
  1071         enumValue=enumSameValue;
  1074     idx=trie->index;
  1075     data32=trie->data32;
  1077     /* get the enumeration value that corresponds to an initial-value trie data entry */
  1078     initialValue=enumValue(context, trie->initialValue);
  1080     if(data32==NULL) {
  1081         nullBlock=trie->indexLength;
  1082     } else {
  1083         nullBlock=0;
  1086     /* set variables for previous range */
  1087     prevBlock=nullBlock;
  1088     prev=0;
  1089     prevValue=initialValue;
  1091     /* enumerate BMP - the main loop enumerates data blocks */
  1092     for(i=0, c=0; c<=0xffff; ++i) {
  1093         if(c==0xd800) {
  1094             /* skip lead surrogate code _units_, go to lead surr. code _points_ */
  1095             i=UTRIE_BMP_INDEX_LENGTH;
  1096         } else if(c==0xdc00) {
  1097             /* go back to regular BMP code points */
  1098             i=c>>UTRIE_SHIFT;
  1101         block=idx[i]<<UTRIE_INDEX_SHIFT;
  1102         if(block==prevBlock) {
  1103             /* the block is the same as the previous one, and filled with value */
  1104             c+=UTRIE_DATA_BLOCK_LENGTH;
  1105         } else if(block==nullBlock) {
  1106             /* this is the all-initial-value block */
  1107             if(prevValue!=initialValue) {
  1108                 if(prev<c) {
  1109                     if(!enumRange(context, prev, c, prevValue)) {
  1110                         return;
  1113                 prevBlock=nullBlock;
  1114                 prev=c;
  1115                 prevValue=initialValue;
  1117             c+=UTRIE_DATA_BLOCK_LENGTH;
  1118         } else {
  1119             prevBlock=block;
  1120             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
  1121                 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
  1122                 if(value!=prevValue) {
  1123                     if(prev<c) {
  1124                         if(!enumRange(context, prev, c, prevValue)) {
  1125                             return;
  1128                     if(j>0) {
  1129                         /* the block is not filled with all the same value */
  1130                         prevBlock=-1;
  1132                     prev=c;
  1133                     prevValue=value;
  1135                 ++c;
  1140     /* enumerate supplementary code points */
  1141     for(l=0xd800; l<0xdc00;) {
  1142         /* lead surrogate access */
  1143         offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
  1144         if(offset==nullBlock) {
  1145             /* no entries for a whole block of lead surrogates */
  1146             if(prevValue!=initialValue) {
  1147                 if(prev<c) {
  1148                     if(!enumRange(context, prev, c, prevValue)) {
  1149                         return;
  1152                 prevBlock=nullBlock;
  1153                 prev=c;
  1154                 prevValue=initialValue;
  1157             l+=UTRIE_DATA_BLOCK_LENGTH;
  1158             c+=UTRIE_DATA_BLOCK_LENGTH<<10;
  1159             continue;
  1162         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
  1164         /* enumerate trail surrogates for this lead surrogate */
  1165         offset=trie->getFoldingOffset(value);
  1166         if(offset<=0) {
  1167             /* no data for this lead surrogate */
  1168             if(prevValue!=initialValue) {
  1169                 if(prev<c) {
  1170                     if(!enumRange(context, prev, c, prevValue)) {
  1171                         return;
  1174                 prevBlock=nullBlock;
  1175                 prev=c;
  1176                 prevValue=initialValue;
  1179             /* nothing else to do for the supplementary code points for this lead surrogate */
  1180             c+=0x400;
  1181         } else {
  1182             /* enumerate code points for this lead surrogate */
  1183             i=offset;
  1184             offset+=UTRIE_SURROGATE_BLOCK_COUNT;
  1185             do {
  1186                 /* copy of most of the body of the BMP loop */
  1187                 block=idx[i]<<UTRIE_INDEX_SHIFT;
  1188                 if(block==prevBlock) {
  1189                     /* the block is the same as the previous one, and filled with value */
  1190                     c+=UTRIE_DATA_BLOCK_LENGTH;
  1191                 } else if(block==nullBlock) {
  1192                     /* this is the all-initial-value block */
  1193                     if(prevValue!=initialValue) {
  1194                         if(prev<c) {
  1195                             if(!enumRange(context, prev, c, prevValue)) {
  1196                                 return;
  1199                         prevBlock=nullBlock;
  1200                         prev=c;
  1201                         prevValue=initialValue;
  1203                     c+=UTRIE_DATA_BLOCK_LENGTH;
  1204                 } else {
  1205                     prevBlock=block;
  1206                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
  1207                         value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
  1208                         if(value!=prevValue) {
  1209                             if(prev<c) {
  1210                                 if(!enumRange(context, prev, c, prevValue)) {
  1211                                     return;
  1214                             if(j>0) {
  1215                                 /* the block is not filled with all the same value */
  1216                                 prevBlock=-1;
  1218                             prev=c;
  1219                             prevValue=value;
  1221                         ++c;
  1224             } while(++i<offset);
  1227         ++l;
  1230     /* deliver last range */
  1231     enumRange(context, prev, c, prevValue);

mercurial