intl/icu/source/i18n/ucol_bld.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2001-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucol_bld.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created 02/22/2001
    14 *   created by: Vladimir Weinstein
    15 *
    16 * This module builds a collator based on the rule set.
    17 *
    18 */
    20 #include "unicode/utypes.h"
    22 #if !UCONFIG_NO_COLLATION
    24 #include "unicode/ucoleitr.h"
    25 #include "unicode/udata.h"
    26 #include "unicode/uchar.h"
    27 #include "unicode/uniset.h"
    28 #include "unicode/uscript.h"
    29 #include "unicode/ustring.h"
    30 #include "unicode/utf16.h"
    31 #include "normalizer2impl.h"
    32 #include "uassert.h"
    33 #include "ucol_bld.h"
    34 #include "ucol_elm.h"
    35 #include "ucol_cnt.h"
    36 #include "ucln_in.h"
    37 #include "umutex.h"
    38 #include "cmemory.h"
    39 #include "cstring.h"
    41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    43 static const InverseUCATableHeader* _staticInvUCA = NULL;
    44 static UDataMemory* invUCA_DATA_MEM = NULL;
    45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
    47 U_CDECL_BEGIN
    48 static UBool U_CALLCONV
    49 isAcceptableInvUCA(void * /*context*/,
    50                    const char * /*type*/, const char * /*name*/,
    51                    const UDataInfo *pInfo)
    52 {
    53     /* context, type & name are intentionally not used */
    54     if( pInfo->size>=20 &&
    55         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
    56         pInfo->charsetFamily==U_CHARSET_FAMILY &&
    57         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
    58         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
    59         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
    60         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
    61         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
    62         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
    63         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
    64         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
    65         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
    66         )
    67     {
    68         // TODO: Check that the invuca data version (pInfo->dataVersion)
    69         // matches the ucadata version.
    70         return TRUE;
    71     } else {
    72         return FALSE;
    73     }
    74 }
    75 U_CDECL_END
    77 /*
    78 * Takes two CEs (lead and continuation) and
    79 * compares them as CEs should be compared:
    80 * primary vs. primary, secondary vs. secondary
    81 * tertiary vs. tertiary
    82 */
    83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
    84     uint32_t s1 = source0, s2, t1 = target0, t2;
    85     if(isContinuation(source1)) {
    86         s2 = source1;
    87     } else {
    88         s2 = 0;
    89     }
    90     if(isContinuation(target1)) {
    91         t2 = target1;
    92     } else {
    93         t2 = 0;
    94     }
    96     uint32_t s = 0, t = 0;
    97     if(s1 == t1 && s2 == t2) {
    98         return 0;
    99     }
   100     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
   101     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
   102     if(s < t) {
   103         return -1;
   104     } else if(s > t) {
   105         return 1;
   106     } else {
   107         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
   108         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
   109         if(s < t) {
   110             return -1;
   111         } else if(s > t) {
   112             return 1;
   113         } else {
   114             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
   115             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
   116             if(s < t) {
   117                 return -1;
   118             } else {
   119                 return 1;
   120             }
   121         }
   122     }
   123 }
   125 static
   126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
   127     uint32_t bottom = 0, top = src->invUCA->tableSize;
   128     uint32_t i = 0;
   129     uint32_t first = 0, second = 0;
   130     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   131     int32_t res = 0;
   133     while(bottom < top-1) {
   134         i = (top+bottom)/2;
   135         first = *(CETable+3*i);
   136         second = *(CETable+3*i+1);
   137         res = compareCEs(first, second, CE, SecondCE);
   138         if(res > 0) {
   139             top = i;
   140         } else if(res < 0) {
   141             bottom = i;
   142         } else {
   143             break;
   144         }
   145     }
   147     /* weiv:                                                  */
   148     /* in searching for elements, I have removed the failure  */
   149     /* The reason for this is that the builder does not rely  */
   150     /* on search mechanism telling it that it didn't find an  */
   151     /* element. However, indirect positioning relies on being */
   152     /* able to find the elements around any CE, even if it is */
   153     /* not defined in the UCA. */
   154     return i;
   155     /*
   156     if((first == CE && second == SecondCE)) {
   157     return i;
   158     } else {
   159     return -1;
   160     }
   161     */
   162 }
   164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
           /* Bits of a 32-bit CE word that are significant at a given strength. */
           /* Indexed in this file with UCOL_PRIMARY, UCOL_SECONDARY or a */
           /* caller-supplied strength (assumes primary/secondary/tertiary = 0/1/2 - confirm). */
   165     0xFFFF0000, /* upper 16 bits only */
   166     0xFFFFFF00, /* upper 24 bits */
   167     0xFFFFFFFF  /* the whole word */
   168 };
   170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
   171                                             uint32_t CE, uint32_t contCE,
   172                                             uint32_t *nextCE, uint32_t *nextContCE,
   173                                             uint32_t strength)
   174 {
   175     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   176     int32_t iCE;
   178     iCE = ucol_inv_findCE(src, CE, contCE);
   180     if(iCE<0) {
   181         *nextCE = UCOL_NOT_FOUND;
   182         return -1;
   183     }
   185     CE &= strengthMask[strength];
   186     contCE &= strengthMask[strength];
   188     *nextCE = CE;
   189     *nextContCE = contCE;
   191     while((*nextCE  & strengthMask[strength]) == CE
   192         && (*nextContCE  & strengthMask[strength]) == contCE)
   193     {
   194         *nextCE = (*(CETable+3*(++iCE)));
   195         *nextContCE = (*(CETable+3*(iCE)+1));
   196     }
   198     return iCE;
   199 }
   201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
   202                                             uint32_t CE, uint32_t contCE,
   203                                             uint32_t *prevCE, uint32_t *prevContCE,
   204                                             uint32_t strength)
   205 {
   206     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   207     int32_t iCE;
   209     iCE = ucol_inv_findCE(src, CE, contCE);
   211     if(iCE<0) {
   212         *prevCE = UCOL_NOT_FOUND;
   213         return -1;
   214     }
   216     CE &= strengthMask[strength];
   217     contCE &= strengthMask[strength];
   219     *prevCE = CE;
   220     *prevContCE = contCE;
   222     while((*prevCE  & strengthMask[strength]) == CE
   223         && (*prevContCE  & strengthMask[strength])== contCE
   224         && iCE > 0) /* this condition should prevent falling off the edge of the world */
   225     {
   226         /* here, we end up in a singularity - zero */
   227         *prevCE = (*(CETable+3*(--iCE)));
   228         *prevContCE = (*(CETable+3*(iCE)+1));
   229     }
   231     return iCE;
   232 }
   234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
   235                                                        uint32_t prevCE, uint32_t prevContCE)
   236 {
   237     if(prevCE == CE && prevContCE == contCE) {
   238         return UCOL_IDENTICAL;
   239     }
   240     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
   241         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
   242     {
   243         return UCOL_PRIMARY;
   244     }
   245     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
   246         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
   247     {
   248         return UCOL_SECONDARY;
   249     }
   250     return UCOL_TERTIARY;
   251 }
   254 /*static
   255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
   257     uint32_t CE = lh->baseCE;
   258     uint32_t SecondCE = lh->baseContCE;
   260     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   261     uint32_t previousCE, previousContCE;
   262     int32_t iCE;
   264     iCE = ucol_inv_findCE(src, CE, SecondCE);
   266     if(iCE<0) {
   267         return -1;
   268     }
   270     CE &= strengthMask[strength];
   271     SecondCE &= strengthMask[strength];
   273     previousCE = CE;
   274     previousContCE = SecondCE;
   276     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
   277         previousCE = (*(CETable+3*(--iCE)));
   278         previousContCE = (*(CETable+3*(iCE)+1));
   279     }
   280     lh->previousCE = previousCE;
   281     lh->previousContCE = previousContCE;
   283     return iCE;
   284 }*/
   286 static
   287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
   288     uint32_t CE = lh->baseCE;
   289     uint32_t SecondCE = lh->baseContCE;
   291     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   292     uint32_t nextCE, nextContCE;
   293     int32_t iCE;
   295     iCE = ucol_inv_findCE(src, CE, SecondCE);
   297     if(iCE<0) {
   298         return -1;
   299     }
   301     CE &= strengthMask[strength];
   302     SecondCE &= strengthMask[strength];
   304     nextCE = CE;
   305     nextContCE = SecondCE;
   307     while((nextCE  & strengthMask[strength]) == CE
   308         && (nextContCE  & strengthMask[strength]) == SecondCE)
   309     {
   310         nextCE = (*(CETable+3*(++iCE)));
   311         nextContCE = (*(CETable+3*(iCE)+1));
   312     }
   314     lh->nextCE = nextCE;
   315     lh->nextContCE = nextContCE;
   317     return iCE;
   318 }
   320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
           /*
            * Computes the weight gaps available for the tokens of list lh.
            * gapsLo / gapsHi hold three words per slot (index 3*slot + 0/1/2),
            * packed from a lead CE (t1) and its continuation (t2) using the
            * primary mask, the secondary mask and the tertiary bits respectively.
            * Three cases are handled:
            *   1. the base CE's top byte lies in the implicit primary range,
            *   2. the list is an indirect reset with a known next CE,
            *   3. otherwise the upper bounds are looked up in the inverse UCA table.
            * status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext()
            * returns a negative position in case 3.
            */
   321     /* reset all the gaps */
   322     int32_t i = 0;
   323     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   324     uint32_t st = 0;
   325     uint32_t t1, t2;
   326     int32_t pos;
   328     UColToken *tok = lh->first;
   329     uint32_t tokStrength = tok->strength;
   331     for(i = 0; i<3; i++) {
   332         lh->gapsHi[3*i] = 0;
   333         lh->gapsHi[3*i+1] = 0;
   334         lh->gapsHi[3*i+2] = 0;
   335         lh->gapsLo[3*i] = 0;
   336         lh->gapsLo[3*i+1] = 0;
   337         lh->gapsLo[3*i+2] = 0;
   338         lh->numStr[i] = 0;
   339         lh->fStrToken[i] = NULL;
   340         lh->lStrToken[i] = NULL;
   341         lh->pos[i] = -1;
   342     }
   344     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
           /* Case 1: only slot 0 is filled; low bound = base CE, high bound = the following implicit primary. */
   346     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   347         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
   348         lh->pos[0] = 0;
   349         t1 = lh->baseCE;
   350         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
   351         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   352         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   353         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
               /* Convert the implicit primary to its raw form, step to the next one and convert back. */
   354         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
   355         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
               /* Split the new primary back into lead/continuation words; 0x0505 is */
               /* presumably the common secondary and tertiary weight - TODO confirm. */
   357         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
   358         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
   360         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   361         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   362         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
           /* Case 2: only slot 0 is filled; bounds are taken directly from the base CE and lh->nextCE. */
   363     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
   364         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
   365         lh->pos[0] = 0;
   366         t1 = lh->baseCE;
   367         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
   368         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   369         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   370         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   371         t1 = lh->nextCE;
   372         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
   373         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   374         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   375         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   376     } else {
               /* Case 3: walk the token list in runs; each run starts at the current token's strength. */
   377         for(;;) {
                   /* Record the inverse-table position of the next CE at this strength and the run's first token. */
   378             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
   379                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
   380                     lh->fStrToken[tokStrength] = tok;
   381                 } else { /* The CE must be implicit, since it's not in the table */
   382                     /* Error */
   383                     *status = U_INTERNAL_PROGRAM_ERROR;
   384                 }
   385             }
                   /* Skip over all tokens whose strength value is >= the run's strength, remembering the last one. */
   387             while(tok != NULL && tok->strength >= tokStrength) {
   388                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
   389                     lh->lStrToken[tokStrength] = tok;
   390                 }
   391                 tok = tok->next;
   392             }
   393             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
   394                 /* check if previous interval is the same and merge the intervals if it is so */
   395                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
   396                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
   397                     lh->fStrToken[tokStrength+1] = NULL;
   398                     lh->lStrToken[tokStrength+1] = NULL;
   399                     lh->pos[tokStrength+1] = -1;
   400                 }
   401             }
   402             if(tok != NULL) {
   403                 tokStrength = tok->strength;
   404             } else {
   405                 break;
   406             }
   407         }
               /* For every strength with a recorded position: high bound from the table entry, low bound from the base CE. */
   408         for(st = 0; st < 3; st++) {
   409             if((pos = lh->pos[st]) >= 0) {
   410                 t1 = *(CETable+3*(pos));
   411                 t2 = *(CETable+3*(pos)+1);
   412                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   413                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   414                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
                       /* Only the low 6 bits of each word are used for the tertiary part here. */
   415                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
   416                 //pos--;
   417                 //t1 = *(CETable+3*(pos));
   418                 //t2 = *(CETable+3*(pos)+1);
   419                 t1 = lh->baseCE;
   420                 t2 = lh->baseContCE;
   421                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   422                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   423                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
   424             }
   425         }
   426     }
   427 }
   /*
    * Sets noOfBytes to the number of bytes, counted from the most significant
    * one, up to and including the last non-zero byte of the 32-bit value
    * (0 for a zero value). Both arguments are evaluated more than once.
    */
   #define ucol_countBytes(value, noOfBytes)              \
   {                                                      \
       uint32_t mask;                                     \
       (noOfBytes) = 0;                                   \
       for(mask = 0xFFFFFFFF; mask != 0; mask >>= 8) {    \
           if(((value) & mask) != 0) {                    \
               (noOfBytes)++;                             \
           }                                              \
       }                                                  \
   }
   442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
   443     if(U_SUCCESS(*status)) {
   444         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   445     }
   446     return g->current;
   447 }
   449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
   450     /* TODO: rename to enum names */
   451     uint32_t high, low, count=1;
   452     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
   454     if(strength == UCOL_SECONDARY) {
   455         low = UCOL_COMMON_TOP2<<24;
   456         high = 0xFFFFFFFF;
   457         count = 0xFF - UCOL_COMMON_TOP2;
   458     } else {
   459         low = UCOL_BYTE_COMMON << 24; //0x05000000;
   460         high = 0x40000000;
   461         count = 0x40 - UCOL_BYTE_COMMON;
   462     }
   464     if(tok->next != NULL && tok->next->strength == strength) {
   465         count = tok->next->toInsert;
   466     }
   468     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
   469     g->current = UCOL_BYTE_COMMON<<24;
   471     if(g->noOfRanges == 0) {
   472         *status = U_INTERNAL_PROGRAM_ERROR;
   473     }
   474     return g->current;
   475 }
   477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
   478     uint32_t strength = tok->strength;
   479     uint32_t low = lows[fStrength*3+strength];
   480     uint32_t high = highs[fStrength*3+strength];
   481     uint32_t maxByte = 0;
   482     if(strength == UCOL_TERTIARY) {
   483         maxByte = 0x3F;
   484     } else if(strength == UCOL_PRIMARY) {
   485         maxByte = 0xFE;
   486     } else {
   487         maxByte = 0xFF;
   488     }
   490     uint32_t count = tok->toInsert;
   492     if(low >= high && strength > UCOL_PRIMARY) {
   493         int32_t s = strength;
   494         for(;;) {
   495             s--;
   496             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
   497                 if(strength == UCOL_SECONDARY) {
   498                     if (low < UCOL_COMMON_TOP2<<24 ) {
   499                        // Override if low range is less than UCOL_COMMON_TOP2.
   500 		        low = UCOL_COMMON_TOP2<<24;
   501                     }
   502                     high = 0xFFFFFFFF;
   503                 } else {
   504                     // Override if low range is less than UCOL_COMMON_BOT3.
   505 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
   506                         low = UCOL_COMMON_BOT3<<24;
   507 		    }
   508                     high = 0x40000000;
   509                 }
   510                 break;
   511             }
   512             if(s<0) {
   513                 *status = U_INTERNAL_PROGRAM_ERROR;
   514                 return 0;
   515             }
   516         }
   517     }
   519     if(low < 0x02000000) {
   520         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
   521         // See http://site.icu-project.org/design/collation/bytes
   522         low = 0x02000000;
   523     }
   525     if(strength == UCOL_SECONDARY) { /* similar as simple */
   526         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
   527             low = UCOL_COMMON_TOP2<<24;
   528         }
   529         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
   530             high = UCOL_COMMON_TOP2<<24;
   531         }
   532         if(low < (UCOL_COMMON_BOT2<<24)) {
   533             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
   534             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   535             //g->current = UCOL_COMMON_BOT2<<24;
   536             return g->current;
   537         }
   538     }
   540     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
   541     if(g->noOfRanges == 0) {
   542         *status = U_INTERNAL_PROGRAM_ERROR;
   543     }
   544     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   545     return g->current;
   546 }
   548 static
   549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
   550     uint32_t i = 0;
   551     UChar c;
   553     if(U_FAILURE(*status)) {
   554         return 0;
   555     }
   557     if(sourceLen > resLen) {
   558         *status = U_MEMORY_ALLOCATION_ERROR;
   559         return 0;
   560     }
   562     for(i = 0; i < sourceLen; i++) {
   563         c = source[i];
   564         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
   565             switch(c - 0x3000) {
   566             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
   567             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
   568                 c++;
   569                 break;
   570             case 0xF5:
   571                 c = 0x30AB;
   572                 break;
   573             case 0xF6:
   574                 c = 0x30B1;
   575                 break;
   576             }
   577         }
   578         resBuf[i] = c;
   579     }
   580     return sourceLen;
   581 }
   583 static
   584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
   585     uint32_t i = 0;
   586     UChar c;
   588     if(U_FAILURE(*status)) {
   589         return 0;
   590     }
   592     if(sourceLen > resLen) {
   593         *status = U_MEMORY_ALLOCATION_ERROR;
   594         return 0;
   595     }
   597     for(i = 0; i < sourceLen; i++) {
   598         c = source[i];
   599         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
   600             switch(c - 0x3000) {
   601             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
   602             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
   603                 c--;
   604                 break;
   605             case 0xAB:
   606                 c = 0x30F5;
   607                 break;
   608             case 0xB1:
   609                 c = 0x30F6;
   610                 break;
   611             }
   612         }
   613         resBuf[i] = c;
   614     }
   615     return sourceLen;
   616 }
   618 U_NAMESPACE_BEGIN
   620 static
   621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
   622     uint32_t i = 0;
   623     UChar n[128];
   624     uint32_t nLen = 0;
   625     uint32_t uCount = 0, lCount = 0;
   627     collIterate s;
   628     uint32_t order = 0;
   630     if(U_FAILURE(*status)) {
   631         return UCOL_LOWER_CASE;
   632     }
   634     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
   635     if(U_SUCCESS(*status)) {
   636         for(i = 0; i < nLen; i++) {
   637             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
   638             order = ucol_getNextCE(UCA, &s, status);
   639             if(isContinuation(order)) {
   640                 *status = U_INTERNAL_PROGRAM_ERROR;
   641                 return UCOL_LOWER_CASE;
   642             }
   643             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
   644                 uCount++;
   645             } else {
   646                 if(u_islower(n[i])) {
   647                     lCount++;
   648                 } else if(U_SUCCESS(*status)) {
   649                     UChar sk[1], lk[1];
   650                     u_toSmallKana(&n[i], 1, sk, 1, status);
   651                     u_toLargeKana(&n[i], 1, lk, 1, status);
   652                     if(sk[0] == n[i] && lk[0] != n[i]) {
   653                         lCount++;
   654                     }
   655                 }
   656             }
   657         }
   658     }
   660     if(uCount != 0 && lCount != 0) {
   661         return UCOL_MIXED_CASE;
   662     } else if(uCount != 0) {
   663         return UCOL_UPPER_CASE;
   664     } else {
   665         return UCOL_LOWER_CASE;
   666     }
   667 }
   670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
           /*
            * Packs the three weight words in CEparts (indices 0, 1, 2, each
            * left-aligned in 32 bits) into tok->CEs / tok->noOfCEs, then sets
            * the case bits of the first CE.
            * Each produced CE takes 16 bits from CEparts[0], 8 bits from
            * CEparts[1] and 6 bits from CEparts[2]; every CE after the first is
            * tagged with UCOL_CONTINUATION_MARKER.
            *   src     - parser supplying the rule text (src->source) and the UCA collator
            *   CEparts - the three weight words to pack
            *   tok     - token that receives the CEs; tok->source encodes the length of
            *             its text in the top byte and the offset in the low 24 bits
            *   status  - passed on to the case-bit lookups
            */
   671     /* this one makes the table and stuff */
   672     uint32_t noOfBytes[3];
   673     uint32_t i;
   675     for(i = 0; i<3; i++) {
   676         ucol_countBytes(CEparts[i], noOfBytes[i]);
   677     }
   679     /* Here we have to pack CEs from parts */
   681     uint32_t CEi = 0;
   682     uint32_t value = 0;
           /* Keep emitting CEs while any of the three weight words still has bytes left. */
   684     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
   685         if(CEi > 0) {
   686             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
   687         } else {
   688             value = 0;
   689         }
               /* CEparts[0] contributes two bytes per CE, the other two words one byte each. */
   691         if(2*CEi<noOfBytes[0]) {
   692             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
   693         }
   694         if(CEi<noOfBytes[1]) {
   695             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
   696         }
   697         if(CEi<noOfBytes[2]) {
   698             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
   699         }
   700         tok->CEs[CEi] = value;
   701         CEi++;
   702     }
   703     if(CEi == 0) { /* totally ignorable */
   704         tok->noOfCEs = 1;
   705         tok->CEs[0] = 0;
   706     } else { /* there is at least something */
   707         tok->noOfCEs = CEi;
   708     }
   711     // we want to set case bits here and now, not later.
   712     // Case bits handling
   713     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
   714         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
               /* Decode the token's text: length from the top byte, offset into src->source from the rest. */
   715         int32_t cSize = (tok->source & 0xFF000000) >> 24;
   716         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
   718         if(cSize > 1) {
   719             // Do it manually
   720             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
   721         } else {
   722             // Copy it from the UCA
   723             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
   724             tok->CEs[0] |= (caseCE & 0xC0);
   725         }
   726     }
   728 #if UCOL_DEBUG==2
   729     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
   730     for(i = 0; i<tok->noOfCEs; i++) {
   731         fprintf(stderr, "%08X ", tok->CEs[i]);
   732     }
   733     fprintf(stderr, "\n");
   734 #endif
   735 }
   737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
   738     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
   739     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
   741     UColToken *tok = lh->last;
   742     uint32_t t[UCOL_STRENGTH_LIMIT];
   744     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
   746     /* must initialize ranges to avoid memory check warnings */
   747     for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
   748         uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
   749     }
   751     tok->toInsert = 1;
   752     t[tok->strength] = 1;
   754     while(tok->previous != NULL) {
   755         if(tok->previous->strength < tok->strength) { /* going up */
   756             t[tok->strength] = 0;
   757             t[tok->previous->strength]++;
   758         } else if(tok->previous->strength > tok->strength) { /* going down */
   759             t[tok->previous->strength] = 1;
   760         } else {
   761             t[tok->strength]++;
   762         }
   763         tok=tok->previous;
   764         tok->toInsert = t[tok->strength];
   765     }
   767     tok->toInsert = t[tok->strength];
   768     ucol_inv_getGapPositions(src, lh, status);
   770 #if UCOL_DEBUG
   771     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
   772     int32_t j = 2;
   773     for(j = 2; j >= 0; j--) {
   774         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
   775         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
   776     }
   777     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
   779     do {
   780         fprintf(stderr,"%i", tok->strength);
   781         tok = tok->next;
   782     } while(tok != NULL);
   783     fprintf(stderr, "\n");
   785     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
   787     do {
   788         fprintf(stderr,"%i", tok->toInsert);
   789         tok = tok->next;
   790     } while(tok != NULL);
   791 #endif
   793     tok = lh->first;
   794     uint32_t fStrength = UCOL_IDENTICAL;
   795     uint32_t initStrength = UCOL_IDENTICAL;
   798     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
   799     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
   800     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
   802     while (tok != NULL && U_SUCCESS(*status)) {
   803         fStrength = tok->strength;
   804         if(fStrength < initStrength) {
   805             initStrength = fStrength;
   806             if(lh->pos[fStrength] == -1) {
   807                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
   808                     fStrength--;
   809                 }
   810                 if(lh->pos[fStrength] == -1) {
   811                     *status = U_INTERNAL_PROGRAM_ERROR;
   812                     return;
   813                 }
   814             }
   815             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
   816                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
   817                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
   818                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
   819                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
   820             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
   821                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
   822                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
   823                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
   824                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   825             } else { /* primaries */
   826                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
   827                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
   828                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
   829                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   830             }
   831         } else {
   832             if(tok->strength == UCOL_TERTIARY) {
   833                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
   834             } else if(tok->strength == UCOL_SECONDARY) {
   835                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
   836                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   837             } else if(tok->strength == UCOL_PRIMARY) {
   838                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
   839                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
   840                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   841             }
   842         }
   843         ucol_doCE(src, CEparts, tok, status);
   844         tok = tok->next;
   845     }
   846 }
   /*
    * Walks the token list of one list header and, for every token, builds a
    * UCAElements record (CEs, optional prefix, source code units) and adds it
    * to the temporary table `t` with uprv_uca_addAnElement().
    *
    * For tokens with an expansion, the expansion CEs are resolved first:
    * each stretch of the expansion text is looked up in src->tailored
    * (longest match first); stretches that are not tailored get their CEs
    * from src->UCA one code unit at a time.
    *
    * src    - parser state: rule text (src->source), tailored-token hash,
    *          UCA collator, buildCCTabFlag (may be set here)
    * t      - temporary table receiving the elements; t->image->jamoSpecial
    *          may be set here
    * lh     - list header whose tokens are processed
    * status - in/out error code; the loop stops as soon as it is a failure
    *
    * NOTE(review): el.isThai is never assigned in this function, although the
    * other element builders in this file set it to FALSE - confirm whether
    * uprv_uca_addAnElement() reads it.
    */
   848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
   849     UCAElements el;
   850     UColToken *tok = lh->first;
   851     UColToken *expt = NULL;
   852     uint32_t i = 0, j = 0;
   853     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
   855     while(tok != NULL && U_SUCCESS(*status)) {
   856         /* first, check if there are any expansions */
   857         /* if there are expansions, we need to do a little bit more processing */
   858         /* since parts of expansion can be tailored, while others are not */
   859         if(tok->expansion != 0) {
               /* tok->expansion packs the length in the top 8 bits and the */
               /* offset into src->source in the low 24 bits */
   860             uint32_t len = tok->expansion >> 24;
   861             uint32_t currentSequenceLen = len;
   862             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
   863             //uint32_t exp = currentSequenceLen | expOffset;
               /* exp is only a lookup key for the src->tailored hash. The value */
               /* stored in exp.source here lacks the <<24 shift, but it is */
               /* overwritten inside the inner loop before the first lookup. */
   864             UColToken exp;
   865             exp.source = currentSequenceLen | expOffset;
   866             exp.rulesToParseHdl = &(src->source);
   868             while(len > 0) {
                   /* try the longest remaining stretch first, then shorter ones */
   869                 currentSequenceLen = len;
   870                 while(currentSequenceLen > 0) {
   871                     exp.source = (currentSequenceLen << 24) | expOffset;
   872                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
   873                         uint32_t noOfCEsToCopy = expt->noOfCEs;
                           /* NOTE(review): no bound check against the capacity of */
                           /* tok->expCEs is visible here - confirm the array is large enough. */
   874                         for(j = 0; j<noOfCEsToCopy; j++) {
   875                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
   876                         }
   877                         tok->noOfExpCEs += noOfCEsToCopy;
   878                         // Smart people never try to add codepoints and CEs.
   879                         // For some odd reason, it won't work.
                           /* advance by the number of code units consumed, not by the CE count */
   880                         expOffset += currentSequenceLen; //noOfCEsToCopy;
   881                         len -= currentSequenceLen; //noOfCEsToCopy;
   882                         break;
   883                     } else {
   884                         currentSequenceLen--;
   885                     }
   886                 }
   887                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
   888                     /* will have to get one from UCA */
   889                     /* first, get the UChars from the rules */
   890                     /* then pick CEs out until there is no more and stuff them into expansion */
                       /* NOTE(review): src->UCA is used here unconditionally, while the */
                       /* code further down guards on src->UCA != NULL - confirm it cannot be NULL here. */
   891                     collIterate s;
   892                     uint32_t order = 0;
   893                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
   895                     for(;;) {
   896                         order = ucol_getNextCE(src->UCA, &s, status);
   897                         if(order == UCOL_NO_MORE_CES) {
   898                             break;
   899                         }
   900                         tok->expCEs[tok->noOfExpCEs++] = order;
   901                     }
                       /* exactly one code unit was consumed from the expansion text */
   902                     expOffset++;
   903                     len--;
   904                 }
   905             }
   906         } else {
   907             tok->noOfExpCEs = 0;
   908         }
   910         /* set the ucaelement with obtained values */
   911         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
   912         /* copy CEs */
           /* the token's own CEs come first, the expansion CEs are appended after them */
   913         for(i = 0; i<tok->noOfCEs; i++) {
   914             el.CEs[i] = tok->CEs[i];
   915         }
   916         for(i = 0; i<tok->noOfExpCEs; i++) {
   917             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
   918         }
   920         /* copy UChars */
   921         // We kept prefix and source kind of together, as it is a kind of a contraction.
   922         // However, now we have to slice the prefix off the main thing -
   923         el.prefix = el.prefixChars;
   924         el.cPoints = el.uchars;
   925         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
   926             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
   927             // decomposed elements to the unsaf table.
               /* tok->prefix and tok->source use the same packing: length in the */
               /* top 8 bits, offset into src->source in the low 24 bits */
   928             el.prefixSize = tok->prefix>>24;
   929             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
               /* the source span includes the prefix, so skip over it */
   931             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
   932             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
   933         } else {
   934             el.prefixSize = 0;
   935             *el.prefix = 0;
   937             el.cSize = (tok->source >> 24);
   938             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
   939         }
   940         if(src->UCA != NULL) {
               /* any Jamo code unit in the tailored string marks the image as jamoSpecial */
   941             for(i = 0; i<el.cSize; i++) {
   942                 if(UCOL_ISJAMO(el.cPoints[i])) {
   943                     t->image->jamoSpecial = TRUE;
   944                 }
   945             }
   946             if (!src->buildCCTabFlag && el.cSize > 0) {
   947                 // Check the trailing canonical combining class (tccc) of the last character.
   948                 const UChar *s = el.cPoints + el.cSize;
   949                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
                   /* the low byte of the value is tested; nonzero sets buildCCTabFlag */
   950                 if ((fcd & 0xff) != 0) {
   951                     src->buildCCTabFlag = TRUE;
   952                 }
   953             }
   954         }
   956         /* and then, add it */
   957 #if UCOL_DEBUG==2
   958         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
   959 #endif
   960         uprv_uca_addAnElement(t, &el, status);
   962 #if UCOL_DEBUG_DUPLICATES
           /* debug builds only: report any non-zero status and reset it so the loop keeps going */
   963         if(*status != U_ZERO_ERROR) {
   964             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
   965             *status = U_ZERO_ERROR;
   966         }
   967 #endif
   969         tok = tok->next;
   970     }
   971 }
   973 U_CDECL_BEGIN
   974 static UBool U_CALLCONV
   975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
   976     UErrorCode status = U_ZERO_ERROR;
   977     tempUCATable *t = (tempUCATable *)context;
   978     if(value == 0) {
   979         while(start < limit) {
   980             uint32_t CE = utrie_get32(t->mapping, start, NULL);
   981             if(CE == UCOL_NOT_FOUND) {
   982                 UCAElements el;
   983                 el.isThai = FALSE;
   984                 el.prefixSize = 0;
   985                 el.prefixChars[0] = 0;
   986                 el.prefix = el.prefixChars;
   987                 el.cPoints = el.uchars;
   989                 el.cSize = 0;
   990                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
   992                 el.noOfCEs = 1;
   993                 el.CEs[0] = 0;
   994                 uprv_uca_addAnElement(t, &el, &status);
   996             }
   997             start++;
   998         }
   999     }
  1000     if(U_FAILURE(status)) {
  1001         return FALSE;
  1002     } else {
  1003         return TRUE;
  1006 U_CDECL_END
  1008 static void
  1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
  1010                                UChar32 start, UChar32 end,
  1011                                UErrorCode *status)
  1013     //UChar decomp[256];
  1014     uint32_t CE = UCOL_NOT_FOUND;
  1015     UChar32 u = 0;
  1016     UCAElements el;
  1017     el.isThai = FALSE;
  1018     el.prefixSize = 0;
  1019     el.prefixChars[0] = 0;
  1020     collIterate colIt;
  1022     if(U_SUCCESS(*status)) {
  1023         for(u = start; u<=end; u++) {
  1024             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
  1025                 /* this test is for contractions that are missing the starting element. */
  1026                 || ((isCntTableElement(CE)) &&
  1027                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
  1030                 el.cSize = 0;
  1031                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
  1032                 //decomp[0] = (UChar)u;
  1033                 //el.uchars[0] = (UChar)u;
  1034                 el.cPoints = el.uchars;
  1035                 //el.cSize = 1;
  1036                 el.noOfCEs = 0;
  1037                 el.prefix = el.prefixChars;
  1038                 el.prefixSize = 0;
  1039                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
  1040                 // We actually want to check whether this element is a special
  1041                 // If it is an implicit element (hangul, CJK - we want to copy the
  1042                 // special, not the resolved CEs) - for hangul, copying resolved
  1043                 // would just make things the same (there is an expansion and it
  1044                 // takes approximately the same amount of time to resolve as
  1045                 // falling back to the UCA).
  1046                 /*
  1047                 UTRIE_GET32(src->UCA->mapping, u, CE);
  1048                 tag = getCETag(CE);
  1049                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
  1050                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
  1051                 || tag == LEAD_SURROGATE_TAG) {
  1052                 el.CEs[el.noOfCEs++] = CE;
  1053                 } else {
  1054                 */
  1055                 // It turns out that it does not make sense to keep implicits
  1056                 // unresolved. The cost of resolving them is big enough so that
  1057                 // it doesn't make any difference whether we have to go to the UCA
  1058                 // or not.
  1060                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
  1061                     while(CE != UCOL_NO_MORE_CES) {
  1062                         CE = ucol_getNextCE(src->UCA, &colIt, status);
  1063                         if(CE != UCOL_NO_MORE_CES) {
  1064                             el.CEs[el.noOfCEs++] = CE;
  1068                 uprv_uca_addAnElement(t, &el, status);
  1074 U_NAMESPACE_END
  1076 U_CFUNC UCATableHeader *
  1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
  1078     U_NAMESPACE_USE
  1080     uint32_t i = 0;
  1081     if(U_FAILURE(*status)) {
  1082         return NULL;
  1084     /*
  1085     2.  Eliminate the negative lists by doing the following for each non-null negative list:
  1086     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
  1087     create new ListHeader X
  1088     o   reverse the list, add to the end of X's positive list. Reset the strength of the
  1089     first item you add, based on the stronger strength levels of the two lists.
  1090     */
  1091     /*
  1092     3.  For each ListHeader with a non-null positive list:
  1093     */
  1094     /*
  1095     o   Find all character strings with CEs between the baseCE and the
  1096     next/previous CE, at the strength of the first token. Add these to the
  1097     tailoring.
  1098     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
  1099     tailoring has & x < z...
  1100     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
  1101     */
  1102     /* It is possible that this part should be done even while constructing list */
  1103     /* The problem is that it is unknown what is going to be the strongest weight */
  1104     /* So we might as well do it here */
  1106     /*
  1107     o   Allocate CEs for each token in the list, based on the total number N of the
  1108     largest level difference, and the gap G between baseCE and nextCE at that
  1109     level. The relation * between the last item and nextCE is the same as the
  1110     strongest strength.
  1111     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
  1112     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
  1113     Then fit b and c into the secondary gap between a and d, then fit q
  1114     into the tertiary gap between b and c.
  1116     o   Example: baseCE << b <<< q << c * nextCE(X,2)
  1117     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
  1118     Then fit q into the tertiary gap between b and c.
  1119     o   When incrementing primary values, we will not cross high byte
  1120     boundaries except where there is only a single-byte primary. That is to
  1121     ensure that the script reordering will continue to work.
  1122     */
  1123     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
  1124     /* test for NULL */
  1125     if (image == NULL) {
  1126         *status = U_MEMORY_ALLOCATION_ERROR;
  1127         return NULL;
  1129     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
  1131     for(i = 0; i<src->resultLen; i++) {
  1132         /* now we need to generate the CEs */
  1133         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
  1134         /* According to strength                                                          */
  1135         if(U_SUCCESS(*status)) {
  1136             if(src->lh[i].first) { // if there are any elements
  1137                 // due to the way parser works, subsequent tailorings
  1138                 // may remove all the elements from a sequence, therefore
  1139                 // leaving an empty tailoring sequence.
  1140                 ucol_initBuffers(src, &src->lh[i], status);
  1143         if(U_FAILURE(*status)) {
  1144             uprv_free(image);
  1145             return NULL;
  1149     if(src->varTop != NULL) { /* stuff the variable top value */
  1150         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
  1151         /* remove it from the list */
  1152         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
  1153             src->varTop->listHeader->first = src->varTop->next;
  1155         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
  1156             src->varTop->listHeader->last = src->varTop->previous;
  1158         if(src->varTop->next != NULL) {
  1159             src->varTop->next->previous = src->varTop->previous;
  1161         if(src->varTop->previous != NULL) {
  1162             src->varTop->previous->next = src->varTop->next;
  1167     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
  1168     if(U_FAILURE(*status)) {
  1169         uprv_free(image);
  1170         return NULL;
  1174     /* After this, we have assigned CE values to all regular CEs      */
  1175     /* now we will go through list once more and resolve expansions,  */
  1176     /* make UCAElements structs and add them to table                 */
  1177     for(i = 0; i<src->resultLen; i++) {
  1178         /* now we need to generate the CEs */
  1179         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
  1180         /* According to strength                                                          */
  1181         if(U_SUCCESS(*status)) {
  1182             ucol_createElements(src, t, &src->lh[i], status);
  1186     UCAElements el;
  1187     el.isThai = FALSE;
  1188     el.prefixSize = 0;
  1189     el.prefixChars[0] = 0;
  1191     /* add latin-1 stuff */
  1192     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
  1194     /* add stuff for copying */
  1195     if(src->copySet != NULL) {
  1196         int32_t i = 0;
  1197         UnicodeSet *set = (UnicodeSet *)src->copySet;
  1198         for(i = 0; i < set->getRangeCount(); i++) {
  1199             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
  1203     if(U_SUCCESS(*status)) {
  1204         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
  1206         uint32_t tailoredCE = UCOL_NOT_FOUND;
  1207         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
  1208         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
  1209         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
  1210         // Check for null pointer
  1211         if (ucaEl == NULL) {
  1212             *status = U_MEMORY_ALLOCATION_ERROR;
  1213             return NULL;
  1215         while(*conts != 0) {
  1216             // A continuation is NUL-terminated and NUL-padded
  1217             // except if it has the maximum length.
  1218             int32_t contractionLength = maxUCAContractionLength;
  1219             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
  1220                 --contractionLength;
  1222             UChar32 first;
  1223             int32_t firstLength = 0;
  1224             U16_NEXT(conts, firstLength, contractionLength, first);
  1225             tailoredCE = utrie_get32(t->mapping, first, NULL);
  1226             if(tailoredCE != UCOL_NOT_FOUND) {
  1227                 UBool needToAdd = TRUE;
  1228                 if(isCntTableElement(tailoredCE)) {
  1229                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
  1230                         needToAdd = FALSE;
  1233                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
  1234                     UCAElements elm;
  1235                     elm.cPoints = el.uchars;
  1236                     elm.noOfCEs = 0;
  1237                     elm.uchars[0] = *conts;
  1238                     elm.uchars[1] = 0;
  1239                     elm.cSize = 1;
  1240                     elm.prefixChars[0] = *(conts+2);
  1241                     elm.isThai = FALSE;
  1242                     elm.prefix = elm.prefixChars;
  1243                     elm.prefixSize = 1;
  1244                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
  1245                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
  1246                         needToAdd = TRUE;
  1249                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
  1250                     needToAdd = FALSE;
  1253                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
  1254                     if (*(conts+1) != 0) {  // contractions
  1255                         el.prefix = el.prefixChars;
  1256                         el.prefixSize = 0;
  1257                         el.cPoints = el.uchars;
  1258                         el.noOfCEs = 0;
  1259                         u_memcpy(el.uchars, conts, contractionLength);
  1260                         el.cSize = contractionLength;
  1261                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
  1263                     else { // pre-context character
  1264                         UChar str[4] = { 0 };
  1265                         int32_t len=0;
  1266                         int32_t preKeyLen=0;
  1268                         el.cPoints = el.uchars;
  1269                         el.noOfCEs = 0;
  1270                         el.uchars[0] = *conts;
  1271                         el.uchars[1] = 0;
  1272                         el.cSize = 1;
  1273                         el.prefixChars[0] = *(conts+2);
  1274                         el.prefix = el.prefixChars;
  1275                         el.prefixSize = 1;
  1276                         if (el.prefixChars[0]!=0) {
  1277                             // get CE of prefix character first
  1278                             str[0]=el.prefixChars[0];
  1279                             str[1]=0;
  1280                             ucol_setText(ucaEl, str, 1, status);
  1281                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
  1282                                     != UCOL_NULLORDER) {
  1283                                 preKeyLen++;  // count number of keys for prefix character
  1285                             str[len++] = el.prefixChars[0];
  1288                         str[len++] = el.uchars[0];
  1289                         str[len]=0;
  1290                         ucol_setText(ucaEl, str, len, status);
  1291                         // Skip the keys for prefix character, then copy the rest to el.
  1292                         while ((preKeyLen-->0) && 
  1293                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
  1294                             continue;
  1298                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
  1299                         el.noOfCEs++;
  1301                     uprv_uca_addAnElement(t, &el, status);
  1304             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
  1305                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
  1307             conts+=maxUCAContractionLength;
  1309         ucol_closeElements(ucaEl);
  1312     // Add completely ignorable elements
  1313     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
  1315     // add tailoring characters related canonical closures
  1316     uprv_uca_canonicalClosure(t, src, NULL, status);
  1318     /* still need to produce compatibility closure */
  1320     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
  1322     uprv_uca_closeTempTable(t);
  1323     uprv_free(image);
  1325     return myData;
  1328 U_CDECL_BEGIN
  /*
   * Cleanup callback for this module; initInverseUCA() registers it with
   * ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ...).
   * Closes the inverse-UCA data memory, clears both static pointers and
   * resets the init-once guard so the data can be loaded again later.
   * Always returns TRUE.
   */
  1329 static UBool U_CALLCONV
  1330 ucol_bld_cleanup(void)
  1332     udata_close(invUCA_DATA_MEM);
  1333     invUCA_DATA_MEM = NULL;
       /* _staticInvUCA was obtained from invUCA_DATA_MEM, so it is only cleared, not freed */
  1334     _staticInvUCA = NULL;
  1335     gStaticInvUCAInitOnce.reset();
  1336     return TRUE;
  1338 U_CDECL_END
  1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
  1341     U_ASSERT(invUCA_DATA_MEM == NULL);
  1342     U_ASSERT(_staticInvUCA == NULL);
  1343     ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
  1344     InverseUCATableHeader *newInvUCA = NULL;
  1345     UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
  1347     if(U_FAILURE(status)) {
  1348         if (result) {
  1349             udata_close(result);
  1351         // This is not needed, as we are talking about
  1352         // memory we got from UData
  1353         //uprv_free(newInvUCA);
  1354         return;
  1357     if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
  1358         newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
  1359         UCollator *UCA = ucol_initUCA(&status);
  1360         // UCA versions of UCA and inverse UCA should match
  1361         if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
  1362             status = U_INVALID_FORMAT_ERROR;
  1363             udata_close(result);
  1364             return;
  1367         invUCA_DATA_MEM = result;
  1368         _staticInvUCA = newInvUCA;
  /*
   * Returns the inverse UCA table, loading it on the first call.
   * The load itself is done by initInverseUCA(), run through umtx_initOnce()
   * with the gStaticInvUCAInitOnce guard.
   *
   * status - in/out error code; it is dereferenced unconditionally, so it
   *          must not be NULL.
   * Returns _staticInvUCA, which is still NULL if the data was not found or
   * could not be loaded.
   */
  1373 U_CAPI const InverseUCATableHeader * U_EXPORT2
  1374 ucol_initInverseUCA(UErrorCode *status)
  1376     umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
  1377     return _staticInvUCA;
  1380 /* Names of the non-script reordering codes. These _must_ be kept
  1381  * in the order in which they are to be applied as defaults, and in sync with the UColReorderCode enum:
        * ucol_findReorderingEntry() returns UCOL_REORDER_CODE_FIRST plus the array index of the match.
  1382  */
  1383 static const char * const ReorderingTokenNames[] = {
  1384     "SPACE",
  1385     "PUNCT",
  1386     "SYMBOL",
  1387     "CURRENCY",
  1388     "DIGIT"
  1389 };
  1391 static void toUpper(const char* src, char* dst, uint32_t length) {
  1392    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
  1393        *dst = uprv_toupper(*src);
  1395    *dst = '\0';
  1398 U_INTERNAL int32_t U_EXPORT2 
  1399 ucol_findReorderingEntry(const char* name) {
  1400     char buffer[32];
  1401     toUpper(name, buffer, 32);
  1402     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
  1403         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
  1404             return entry + UCOL_REORDER_CODE_FIRST;
  1407     return USCRIPT_INVALID_CODE;
  1410 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial