intl/icu/source/i18n/ucol_bld.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/ucol_bld.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1410 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2001-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucol_bld.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created 02/22/2001
    1.17 +*   created by: Vladimir Weinstein
    1.18 +*
    1.19 +* This module builds a collator based on the rule set.
    1.20 +*
    1.21 +*/
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +
    1.25 +#if !UCONFIG_NO_COLLATION
    1.26 +
    1.27 +#include "unicode/ucoleitr.h"
    1.28 +#include "unicode/udata.h"
    1.29 +#include "unicode/uchar.h"
    1.30 +#include "unicode/uniset.h"
    1.31 +#include "unicode/uscript.h"
    1.32 +#include "unicode/ustring.h"
    1.33 +#include "unicode/utf16.h"
    1.34 +#include "normalizer2impl.h"
    1.35 +#include "uassert.h"
    1.36 +#include "ucol_bld.h"
    1.37 +#include "ucol_elm.h"
    1.38 +#include "ucol_cnt.h"
    1.39 +#include "ucln_in.h"
    1.40 +#include "umutex.h"
    1.41 +#include "cmemory.h"
    1.42 +#include "cstring.h"
    1.43 +/* LENGTHOF(array): element count of a true array (not a pointer), cast to int32_t. */
    1.44 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.45 +/* File-scope state for the inverse UCA data. NOTE(review): the code that loads and releases these lies outside this excerpt -- confirm usage there. */
    1.46 +static const InverseUCATableHeader* _staticInvUCA = NULL;
    1.47 +static UDataMemory* invUCA_DATA_MEM = NULL;
    1.48 +static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
    1.49 +/* Data-acceptance predicate: returns TRUE only if pInfo describes inverse-UCA ("InvC") data matching this build's byte order, charset family and format version. */
    1.50 +U_CDECL_BEGIN
    1.51 +static UBool U_CALLCONV
    1.52 +isAcceptableInvUCA(void * /*context*/,
    1.53 +                   const char * /*type*/, const char * /*name*/,
    1.54 +                   const UDataInfo *pInfo)
    1.55 +{
    1.56 +    /* context, type & name are intentionally not used */
    1.57 +    if( pInfo->size>=20 &&
    1.58 +        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&   /* byte order must match this build */
    1.59 +        pInfo->charsetFamily==U_CHARSET_FAMILY &&   /* charset family must match this build */
    1.60 +        pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
    1.61 +        pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
    1.62 +        pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
    1.63 +        pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
    1.64 +        pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&   /* first version field must match exactly */
    1.65 +        pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&   (second version field may be equal or newer)
    1.66 +        //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
    1.67 +        //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
    1.68 +        //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
    1.69 +        )
    1.70 +    {
    1.71 +        // TODO: Check that the invuca data version (pInfo->dataVersion)
    1.72 +        // matches the ucadata version.
    1.73 +        return TRUE;
    1.74 +    } else {
    1.75 +        return FALSE;
    1.76 +    }
    1.77 +}
    1.78 +U_CDECL_END
    1.79 +
    1.80 +/*
    1.81 +* Compares two CE pairs (lead word plus second word) level by level:
    1.82 +* top 16 bits of each word first, then bits 8-15, then the low byte.
    1.83 +* A second word that is not a continuation is treated as 0.
    1.84 +* Returns 0 if the pairs are equal, -1 if source sorts first, else 1.
    1.85 +*/
    1.86 +static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
    1.87 +    uint32_t s1 = source0, s2, t1 = target0, t2;
    1.88 +    if(isContinuation(source1)) { // only use the second word when it is flagged as a continuation
    1.89 +        s2 = source1;
    1.90 +    } else {
    1.91 +        s2 = 0;
    1.92 +    }
    1.93 +    if(isContinuation(target1)) {
    1.94 +        t2 = target1;
    1.95 +    } else {
    1.96 +        t2 = 0;
    1.97 +    }
    1.98 +
    1.99 +    uint32_t s = 0, t = 0;
   1.100 +    if(s1 == t1 && s2 == t2) { // equal after non-continuation second words were replaced by 0
   1.101 +        return 0;
   1.102 +    }
   1.103 +    s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); // first level: lead's top 16 bits, then the continuation's
   1.104 +    t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
   1.105 +    if(s < t) {
   1.106 +        return -1;
   1.107 +    } else if(s > t) {
   1.108 +        return 1;
   1.109 +    } else {
   1.110 +        s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; // second level; '>>' binds tighter than '|', so only the s2 byte is shifted
   1.111 +        t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
   1.112 +        if(s < t) {
   1.113 +            return -1;
   1.114 +        } else if(s > t) {
   1.115 +            return 1;
   1.116 +        } else {
   1.117 +            s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); // third level: low byte of each word
   1.118 +            t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
   1.119 +            if(s < t) {
   1.120 +                return -1;
   1.121 +            } else { // would also be taken for s == t, but fully equal pairs already returned 0 above
   1.122 +                return 1;
   1.123 +            }
   1.124 +        }
   1.125 +    }
   1.126 +}
   1.127 +/* Binary search of the inverse UCA table for (CE, SecondCE). Returns the last index probed (0 if the loop never runs); an exact match is NOT guaranteed. */
   1.128 +static
   1.129 +int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
   1.130 +    uint32_t bottom = 0, top = src->invUCA->tableSize;
   1.131 +    uint32_t i = 0;
   1.132 +    uint32_t first = 0, second = 0;
   1.133 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); // 'table' is a byte offset from the header start
   1.134 +    int32_t res = 0;
   1.135 +
   1.136 +    while(bottom < top-1) { // NOTE(review): top-1 wraps around (unsigned) if tableSize is 0 -- confirm the table is never empty
   1.137 +        i = (top+bottom)/2;
   1.138 +        first = *(CETable+3*i); // each table entry is three uint32_t words; word 0 and word 1 are compared here
   1.139 +        second = *(CETable+3*i+1);
   1.140 +        res = compareCEs(first, second, CE, SecondCE);
   1.141 +        if(res > 0) {
   1.142 +            top = i;
   1.143 +        } else if(res < 0) {
   1.144 +            bottom = i;
   1.145 +        } else {
   1.146 +            break;
   1.147 +        }
   1.148 +    }
   1.149 +
   1.150 +    /* weiv:                                                  */
   1.151 +    /* in searching for elements, I have removed the failure  */
   1.152 +    /* The reason for this is that the builder does not rely  */
   1.153 +    /* on search mechanism telling it that it didn't find an  */
   1.154 +    /* element. However, indirect positioning relies on being */
   1.155 +    /* able to find the elements around any CE, even if it is */
   1.156 +    /* not defined in the UCA. */
   1.157 +    return i; // uint32_t index returned as int32_t; no "not found" (-1) value is produced any more
   1.158 +    /*
   1.159 +    if((first == CE && second == SecondCE)) {
   1.160 +    return i;
   1.161 +    } else {
   1.162 +    return -1;
   1.163 +    }
   1.164 +    */
   1.165 +}
   1.166 +/* CE bit masks indexed by strength (e.g. strengthMask[UCOL_PRIMARY]): top 16 bits, top 24 bits, all 32 bits. */
   1.167 +static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
   1.168 +    0xFFFF0000,
   1.169 +    0xFFFFFF00,
   1.170 +    0xFFFFFFFF
   1.171 +};
   1.172 +/* Locates (CE, contCE) in the inverse table, scans forward to the first entry that differs at 'strength', stores it in *nextCE / *nextContCE and returns its index. */
   1.173 +U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
   1.174 +                                            uint32_t CE, uint32_t contCE,
   1.175 +                                            uint32_t *nextCE, uint32_t *nextContCE,
   1.176 +                                            uint32_t strength)
   1.177 +{
   1.178 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1.179 +    int32_t iCE;
   1.180 +
   1.181 +    iCE = ucol_inv_findCE(src, CE, contCE);
   1.182 +
   1.183 +    if(iCE<0) { // NOTE(review): ucol_inv_findCE no longer returns -1, so this branch looks dead
   1.184 +        *nextCE = UCOL_NOT_FOUND;
   1.185 +        return -1;
   1.186 +    }
   1.187 +
   1.188 +    CE &= strengthMask[strength]; // strength is used as an array index: it must be below UCOL_CE_STRENGTH_LIMIT
   1.189 +    contCE &= strengthMask[strength];
   1.190 +
   1.191 +    *nextCE = CE; // seed with the masked input so the loop body runs at least once
   1.192 +    *nextContCE = contCE;
   1.193 +
   1.194 +    while((*nextCE  & strengthMask[strength]) == CE
   1.195 +        && (*nextContCE  & strengthMask[strength]) == contCE)
   1.196 +    {
   1.197 +        *nextCE = (*(CETable+3*(++iCE))); // NOTE(review): no upper bound check against tableSize in this loop
   1.198 +        *nextContCE = (*(CETable+3*(iCE)+1));
   1.199 +    }
   1.200 +
   1.201 +    return iCE;
   1.202 +}
   1.203 +/* Locates (CE, contCE) in the inverse table, scans backward to the first entry that differs at 'strength' (stopping at index 0), stores it in *prevCE / *prevContCE and returns its index. */
   1.204 +U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
   1.205 +                                            uint32_t CE, uint32_t contCE,
   1.206 +                                            uint32_t *prevCE, uint32_t *prevContCE,
   1.207 +                                            uint32_t strength)
   1.208 +{
   1.209 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1.210 +    int32_t iCE;
   1.211 +
   1.212 +    iCE = ucol_inv_findCE(src, CE, contCE);
   1.213 +
   1.214 +    if(iCE<0) { // NOTE(review): ucol_inv_findCE no longer returns -1, so this branch looks dead
   1.215 +        *prevCE = UCOL_NOT_FOUND;
   1.216 +        return -1;
   1.217 +    }
   1.218 +
   1.219 +    CE &= strengthMask[strength]; // strength is used as an array index: it must be below UCOL_CE_STRENGTH_LIMIT
   1.220 +    contCE &= strengthMask[strength];
   1.221 +
   1.222 +    *prevCE = CE; // seed with the masked input so the loop body runs unless iCE is already 0
   1.223 +    *prevContCE = contCE;
   1.224 +
   1.225 +    while((*prevCE  & strengthMask[strength]) == CE
   1.226 +        && (*prevContCE  & strengthMask[strength])== contCE
   1.227 +        && iCE > 0) /* this condition should prevent falling off the edge of the world */
   1.228 +    {
   1.229 +        /* here, we end up in a singularity - zero */
   1.230 +        *prevCE = (*(CETable+3*(--iCE)));
   1.231 +        *prevContCE = (*(CETable+3*(iCE)+1));
   1.232 +    }
   1.233 +
   1.234 +    return iCE;
   1.235 +}
   1.236 +/* Returns the strongest level at which (CE, contCE) and (prevCE, prevContCE) differ: UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY, or UCOL_IDENTICAL if they are equal. */
   1.237 +U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
   1.238 +                                                       uint32_t prevCE, uint32_t prevContCE)
   1.239 +{
   1.240 +    if(prevCE == CE && prevContCE == contCE) {
   1.241 +        return UCOL_IDENTICAL;
   1.242 +    }
   1.243 +    if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
   1.244 +        || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
   1.245 +    {
   1.246 +        return UCOL_PRIMARY;
   1.247 +    }
   1.248 +    if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
   1.249 +        || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
   1.250 +    {
   1.251 +        return UCOL_SECONDARY;
   1.252 +    }
   1.253 +    return UCOL_TERTIARY; // words differ, but not within the primary or secondary masks
   1.254 +}
   1.255 +
   1.256 +
   1.257 +/*static
   1.258 +inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
   1.259 +
   1.260 +    uint32_t CE = lh->baseCE;
   1.261 +    uint32_t SecondCE = lh->baseContCE;
   1.262 +
   1.263 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1.264 +    uint32_t previousCE, previousContCE;
   1.265 +    int32_t iCE;
   1.266 +
   1.267 +    iCE = ucol_inv_findCE(src, CE, SecondCE);
   1.268 +
   1.269 +    if(iCE<0) {
   1.270 +        return -1;
   1.271 +    }
   1.272 +
   1.273 +    CE &= strengthMask[strength];
   1.274 +    SecondCE &= strengthMask[strength];
   1.275 +
   1.276 +    previousCE = CE;
   1.277 +    previousContCE = SecondCE;
   1.278 +
   1.279 +    while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
   1.280 +        previousCE = (*(CETable+3*(--iCE)));
   1.281 +        previousContCE = (*(CETable+3*(iCE)+1));
   1.282 +    }
   1.283 +    lh->previousCE = previousCE;
   1.284 +    lh->previousContCE = previousContCE;
   1.285 +
   1.286 +    return iCE;
   1.287 +}*/
   1.288 +/* Forward scan from lh->baseCE / lh->baseContCE to the first table entry that differs at 'strength'; stores it in lh->nextCE / lh->nextContCE and returns its index. */
   1.289 +static
   1.290 +inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
   1.291 +    uint32_t CE = lh->baseCE;
   1.292 +    uint32_t SecondCE = lh->baseContCE;
   1.293 +
   1.294 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1.295 +    uint32_t nextCE, nextContCE;
   1.296 +    int32_t iCE;
   1.297 +
   1.298 +    iCE = ucol_inv_findCE(src, CE, SecondCE);
   1.299 +
   1.300 +    if(iCE<0) { // NOTE(review): ucol_inv_findCE no longer returns -1, so this branch looks dead
   1.301 +        return -1;
   1.302 +    }
   1.303 +
   1.304 +    CE &= strengthMask[strength]; // strength is used as an array index: it must be below UCOL_CE_STRENGTH_LIMIT
   1.305 +    SecondCE &= strengthMask[strength];
   1.306 +
   1.307 +    nextCE = CE; // seed with the masked base so the loop body runs at least once
   1.308 +    nextContCE = SecondCE;
   1.309 +
   1.310 +    while((nextCE  & strengthMask[strength]) == CE
   1.311 +        && (nextContCE  & strengthMask[strength]) == SecondCE)
   1.312 +    {
   1.313 +        nextCE = (*(CETable+3*(++iCE))); // NOTE(review): no upper bound check against tableSize in this loop
   1.314 +        nextContCE = (*(CETable+3*(iCE)+1));
   1.315 +    }
   1.316 +
   1.317 +    lh->nextCE = nextCE;
   1.318 +    lh->nextContCE = nextContCE;
   1.319 +
   1.320 +    return iCE;
   1.321 +}
   1.322 +/* Clears lh's gap/position bookkeeping, then fills lh->gapsLo / lh->gapsHi (three words per strength: primary, secondary, tertiary) from lh's base CE and the CE that follows it. */
   1.323 +static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
   1.324 +    /* reset all the gaps */
   1.325 +    int32_t i = 0;
   1.326 +    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
   1.327 +    uint32_t st = 0;
   1.328 +    uint32_t t1, t2;
   1.329 +    int32_t pos;
   1.330 +
   1.331 +    UColToken *tok = lh->first;
   1.332 +    uint32_t tokStrength = tok->strength;
   1.333 +
   1.334 +    for(i = 0; i<3; i++) {
   1.335 +        lh->gapsHi[3*i] = 0;
   1.336 +        lh->gapsHi[3*i+1] = 0;
   1.337 +        lh->gapsHi[3*i+2] = 0;
   1.338 +        lh->gapsLo[3*i] = 0;
   1.339 +        lh->gapsLo[3*i+1] = 0;
   1.340 +        lh->gapsLo[3*i+2] = 0;
   1.341 +        lh->numStr[i] = 0;
   1.342 +        lh->fStrToken[i] = NULL;
   1.343 +        lh->lStrToken[i] = NULL;
   1.344 +        lh->pos[i] = -1; // -1 marks "no position recorded for this strength"
   1.345 +    }
   1.346 +
   1.347 +    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); // UCAConsts is a byte offset into the image
   1.348 +
   1.349 +    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
   1.350 +        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
   1.351 +        lh->pos[0] = 0;
   1.352 +        t1 = lh->baseCE;
   1.353 +        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
   1.354 +        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; // low bound = the base CE, split into one word per level
   1.355 +        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.356 +        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   1.357 +        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
   1.358 +        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); // high bound = the implicit primary for raw value + 1
   1.359 +
   1.360 +        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; // 0x05 in both the secondary and the tertiary byte
   1.361 +        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
   1.362 +
   1.363 +        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   1.364 +        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.365 +        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   1.366 +    } else if(lh->indirect == TRUE && lh->nextCE != 0) { // indirect reset with a known next CE: bounds are base CE .. lh->nextCE
   1.367 +        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
   1.368 +        lh->pos[0] = 0;
   1.369 +        t1 = lh->baseCE;
   1.370 +        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
   1.371 +        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   1.372 +        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.373 +        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   1.374 +        t1 = lh->nextCE;
   1.375 +        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
   1.376 +        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   1.377 +        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.378 +        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   1.379 +    } else { // general case: walk the token list one run of tokens at a time
   1.380 +        for(;;) {
   1.381 +            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
   1.382 +                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
   1.383 +                    lh->fStrToken[tokStrength] = tok; // first token of this strength
   1.384 +                } else { /* The CE must be implicit, since it's not in the table */
   1.385 +                    /* Error */
   1.386 +                    *status = U_INTERNAL_PROGRAM_ERROR; // note: the loop keeps going after recording the error
   1.387 +                }
   1.388 +            }
   1.389 +
   1.390 +            while(tok != NULL && tok->strength >= tokStrength) { // skip tokens whose strength value is >= tokStrength, remembering the last one
   1.391 +                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
   1.392 +                    lh->lStrToken[tokStrength] = tok;
   1.393 +                }
   1.394 +                tok = tok->next;
   1.395 +            }
   1.396 +            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
   1.397 +                /* check if previous interval is the same and merge the intervals if it is so */
   1.398 +                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
   1.399 +                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
   1.400 +                    lh->fStrToken[tokStrength+1] = NULL;
   1.401 +                    lh->lStrToken[tokStrength+1] = NULL;
   1.402 +                    lh->pos[tokStrength+1] = -1;
   1.403 +                }
   1.404 +            }
   1.405 +            if(tok != NULL) {
   1.406 +                tokStrength = tok->strength;
   1.407 +            } else {
   1.408 +                break;
   1.409 +            }
   1.410 +        }
   1.411 +        for(st = 0; st < 3; st++) {
   1.412 +            if((pos = lh->pos[st]) >= 0) {
   1.413 +                t1 = *(CETable+3*(pos)); // high bound: the table entry found by ucol_inv_getNext for this strength
   1.414 +                t2 = *(CETable+3*(pos)+1);
   1.415 +                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   1.416 +                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.417 +                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
   1.418 +                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; // only the low 6 bits are kept for the tertiary word here
   1.419 +                //pos--;
   1.420 +                //t1 = *(CETable+3*(pos));
   1.421 +                //t2 = *(CETable+3*(pos)+1);
   1.422 +                t1 = lh->baseCE; // low bound: always the base CE, not the preceding table entry
   1.423 +                t2 = lh->baseContCE;
   1.424 +                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
   1.425 +                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
   1.426 +                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
   1.427 +            }
   1.428 +        }
   1.429 +    }
   1.430 +}
   1.431 +
   1.432 +/* ucol_countBytes(value, noOfBytes): sets noOfBytes to the position (1-4, counted from the top byte) of the lowest non-zero byte of value, or 0 if value is 0. Expands to a bare { } block and evaluates 'value' several times. */
   1.433 +#define ucol_countBytes(value, noOfBytes)   \
   1.434 +{                               \
   1.435 +    uint32_t mask = 0xFFFFFFFF;   \
   1.436 +    (noOfBytes) = 0;              \
   1.437 +    while(mask != 0) {            \
   1.438 +    if(((value) & mask) != 0) { \
   1.439 +    (noOfBytes)++;            \
   1.440 +    }                           \
   1.441 +    mask >>= 8;                 \
   1.442 +    }                             \
   1.443 +}
   1.444 +/* Advances g to its next weight via ucol_nextWeight unless *status already indicates failure; returns g->current either way. */
   1.445 +static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
   1.446 +    if(U_SUCCESS(*status)) {
   1.447 +        g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   1.448 +    }
   1.449 +    return g->current; // unchanged previous value when *status was already a failure
   1.450 +}
   1.451 +/* Sets up g with weight ranges from a fixed interval chosen by 'strength' and returns UCOL_BYTE_COMMON<<24 as the current weight; sets U_INTERNAL_PROGRAM_ERROR if no range could be allocated. */
   1.452 +static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
   1.453 +    /* TODO: rename to enum names */
   1.454 +    uint32_t high, low, count=1;
   1.455 +    uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
   1.456 +
   1.457 +    if(strength == UCOL_SECONDARY) { // secondary: allocate above UCOL_COMMON_TOP2
   1.458 +        low = UCOL_COMMON_TOP2<<24;
   1.459 +        high = 0xFFFFFFFF;
   1.460 +        count = 0xFF - UCOL_COMMON_TOP2;
   1.461 +    } else { // any other strength: allocate between UCOL_BYTE_COMMON and 0x40
   1.462 +        low = UCOL_BYTE_COMMON << 24; //0x05000000;
   1.463 +        high = 0x40000000;
   1.464 +        count = 0x40 - UCOL_BYTE_COMMON;
   1.465 +    }
   1.466 +
   1.467 +    if(tok->next != NULL && tok->next->strength == strength) { // a following token of the same strength dictates how many weights are needed
   1.468 +        count = tok->next->toInsert;
   1.469 +    }
   1.470 +
   1.471 +    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
   1.472 +    g->current = UCOL_BYTE_COMMON<<24;
   1.473 +
   1.474 +    if(g->noOfRanges == 0) {
   1.475 +        *status = U_INTERNAL_PROGRAM_ERROR;
   1.476 +    }
   1.477 +    return g->current;
   1.478 +}
   1.479 +/* Sets up g to hand out tok->toInsert weights of tok's strength between lows/highs[fStrength*3+strength] and returns the first one; returns 0 with U_INTERNAL_PROGRAM_ERROR if that gap is empty and no stronger level has one. */
   1.480 +static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
   1.481 +    uint32_t strength = tok->strength;
   1.482 +    uint32_t low = lows[fStrength*3+strength];
   1.483 +    uint32_t high = highs[fStrength*3+strength];
   1.484 +    uint32_t maxByte = 0;
   1.485 +    if(strength == UCOL_TERTIARY) {
   1.486 +        maxByte = 0x3F;
   1.487 +    } else if(strength == UCOL_PRIMARY) {
   1.488 +        maxByte = 0xFE;
   1.489 +    } else {
   1.490 +        maxByte = 0xFF;
   1.491 +    }
   1.492 +
   1.493 +    uint32_t count = tok->toInsert;
   1.494 +
   1.495 +    if(low >= high && strength > UCOL_PRIMARY) { // no room at this level: look for a gap at a stronger level
   1.496 +        int32_t s = strength;
   1.497 +        for(;;) {
   1.498 +            s--;
   1.499 +            if(s<0) { // must be tested BEFORE indexing: fStrength*3+s is computed unsigned, so s == -1 would read far out of bounds
   1.500 +                *status = U_INTERNAL_PROGRAM_ERROR;
   1.501 +                return 0;
   1.502 +            }
   1.503 +            if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
   1.504 +                if(strength == UCOL_SECONDARY) {
   1.505 +                    if (low < UCOL_COMMON_TOP2<<24 ) {
   1.506 +                        // Override if low range is less than UCOL_COMMON_TOP2.
   1.507 +                        low = UCOL_COMMON_TOP2<<24;
   1.508 +                    }
   1.509 +                    high = 0xFFFFFFFF;
   1.510 +                } else {
   1.511 +                    // Override if low range is less than UCOL_COMMON_BOT3.
   1.512 +                    if ( low < UCOL_COMMON_BOT3<<24 ) {
   1.513 +                        low = UCOL_COMMON_BOT3<<24;
   1.514 +                    }
   1.515 +                    high = 0x40000000;
   1.516 +                }
   1.517 +                break;
   1.518 +            }
   1.519 +        }
   1.520 +    }
   1.521 +
   1.522 +    if(low < 0x02000000) {
   1.523 +        // We must not use CE weight byte 02, so we set it as the minimum lower bound.
   1.524 +        // See http://site.icu-project.org/design/collation/bytes
   1.525 +        low = 0x02000000;
   1.526 +    }
   1.527 +
   1.528 +    if(strength == UCOL_SECONDARY) { /* similar as simple */
   1.529 +        if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { // keep both bounds out of the COMMON_BOT2..COMMON_TOP2 band
   1.530 +            low = UCOL_COMMON_TOP2<<24;
   1.531 +        }
   1.532 +        if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
   1.533 +            high = UCOL_COMMON_TOP2<<24;
   1.534 +        }
   1.535 +        if(low < (UCOL_COMMON_BOT2<<24)) { // NOTE(review): unlike the general path below, an empty allocation is not reported via *status here
   1.536 +            g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
   1.537 +            g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   1.538 +            //g->current = UCOL_COMMON_BOT2<<24;
   1.539 +            return g->current;
   1.540 +        }
   1.541 +    }
   1.542 +
   1.543 +    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
   1.544 +    if(g->noOfRanges == 0) {
   1.545 +        *status = U_INTERNAL_PROGRAM_ERROR;
   1.546 +    }
   1.547 +    g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
   1.548 +    return g->current;
   1.549 +}
   1.550 +/* Copies sourceLen UChars into resBuf, replacing the code points listed in the switch (small kana, per the function name) with c+1, U+30F5 with U+30AB and U+30F6 with U+30B1. Returns sourceLen, or 0 on error. */
   1.551 +static
   1.552 +uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
   1.553 +    uint32_t i = 0;
   1.554 +    UChar c;
   1.555 +
   1.556 +    if(U_FAILURE(*status)) {
   1.557 +        return 0;
   1.558 +    }
   1.559 +
   1.560 +    if(sourceLen > resLen) { // output buffer too small; reported as a memory allocation error
   1.561 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.562 +        return 0;
   1.563 +    }
   1.564 +
   1.565 +    for(i = 0; i < sourceLen; i++) {
   1.566 +        c = source[i];
   1.567 +        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
   1.568 +            switch(c - 0x3000) {
   1.569 +            case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
   1.570 +            case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
   1.571 +                c++; // the replacement is the next code point
   1.572 +                break;
   1.573 +            case 0xF5:
   1.574 +                c = 0x30AB;
   1.575 +                break;
   1.576 +            case 0xF6:
   1.577 +                c = 0x30B1;
   1.578 +                break;
   1.579 +            }
   1.580 +        }
   1.581 +        resBuf[i] = c; // every other code unit is copied unchanged
   1.582 +    }
   1.583 +    return sourceLen;
   1.584 +}
   1.585 +/* Copies sourceLen UChars into resBuf, replacing the code points listed in the switch with c-1, U+30AB with U+30F5 and U+30B1 with U+30F6 (the inverse of u_toLargeKana's table). Returns sourceLen, or 0 on error. */
   1.586 +static
   1.587 +uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
   1.588 +    uint32_t i = 0;
   1.589 +    UChar c;
   1.590 +
   1.591 +    if(U_FAILURE(*status)) {
   1.592 +        return 0;
   1.593 +    }
   1.594 +
   1.595 +    if(sourceLen > resLen) { // output buffer too small; reported as a memory allocation error
   1.596 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.597 +        return 0;
   1.598 +    }
   1.599 +
   1.600 +    for(i = 0; i < sourceLen; i++) {
   1.601 +        c = source[i];
   1.602 +        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
   1.603 +            switch(c - 0x3000) {
   1.604 +            case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
   1.605 +            case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
   1.606 +                c--; // the replacement is the preceding code point
   1.607 +                break;
   1.608 +            case 0xAB:
   1.609 +                c = 0x30F5;
   1.610 +                break;
   1.611 +            case 0xB1:
   1.612 +                c = 0x30F6;
   1.613 +                break;
   1.614 +            }
   1.615 +        }
   1.616 +        resBuf[i] = c; // every other code unit is copied unchanged
   1.617 +    }
   1.618 +    return sourceLen;
   1.619 +}
   1.620 +
   1.621 +U_NAMESPACE_BEGIN
   1.622 +/* Classifies src[0..len) as UCOL_UPPER_CASE, UCOL_LOWER_CASE or UCOL_MIXED_CASE by inspecting each code unit of its NFKD form; returns UCOL_LOWER_CASE when *status is, or becomes, a failure. */
   1.623 +static
   1.624 +uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
   1.625 +    uint32_t i = 0;
   1.626 +    UChar n[128];
   1.627 +    uint32_t nLen = 0;
   1.628 +    uint32_t uCount = 0, lCount = 0;
   1.629 +
   1.630 +    collIterate s;
   1.631 +    uint32_t order = 0;
   1.632 +
   1.633 +    if(U_FAILURE(*status)) {
   1.634 +        return UCOL_LOWER_CASE;
   1.635 +    }
   1.636 +
   1.637 +    nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); // normalized text must fit the 128-UChar stack buffer
   1.638 +    if(U_SUCCESS(*status)) {
   1.639 +        for(i = 0; i < nLen; i++) {
   1.640 +            uprv_init_collIterate(UCA, &n[i], 1, &s, status); // each code unit is looked up on its own
   1.641 +            order = ucol_getNextCE(UCA, &s, status);
   1.642 +            if(isContinuation(order)) { // the first CE of a code unit must not be a continuation
   1.643 +                *status = U_INTERNAL_PROGRAM_ERROR;
   1.644 +                return UCOL_LOWER_CASE;
   1.645 +            }
   1.646 +            if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
   1.647 +                uCount++;
   1.648 +            } else {
   1.649 +                if(u_islower(n[i])) {
   1.650 +                    lCount++;
   1.651 +                } else if(U_SUCCESS(*status)) {
   1.652 +                    UChar sk[1], lk[1];
   1.653 +                    u_toSmallKana(&n[i], 1, sk, 1, status);
   1.654 +                    u_toLargeKana(&n[i], 1, lk, 1, status);
   1.655 +                    if(sk[0] == n[i] && lk[0] != n[i]) { // unchanged by toSmall but changed by toLarge: counted as lower case
   1.656 +                        lCount++;
   1.657 +                    }
   1.658 +                }
   1.659 +            }
   1.660 +        }
   1.661 +    }
   1.662 +
   1.663 +    if(uCount != 0 && lCount != 0) {
   1.664 +        return UCOL_MIXED_CASE;
   1.665 +    } else if(uCount != 0) {
   1.666 +        return UCOL_UPPER_CASE;
   1.667 +    } else { // also reached when normalization failed and nothing was counted
   1.668 +        return UCOL_LOWER_CASE;
   1.669 +    }
   1.670 +}
   1.671 +
   1.672 +/* Packs the three weight words in CEparts (primary, secondary, tertiary; most significant byte first) into tok->CEs / tok->noOfCEs, then sets the case bits of tok->CEs[0] for non-ignorable results. */
   1.673 +U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
   1.674 +    /* this one makes the table and stuff */
   1.675 +    uint32_t noOfBytes[3];
   1.676 +    uint32_t i;
   1.677 +
   1.678 +    for(i = 0; i<3; i++) {
   1.679 +        ucol_countBytes(CEparts[i], noOfBytes[i]); // bytes in use per level, counted from the top byte
   1.680 +    }
   1.681 +
   1.682 +    /* Here we have to pack CEs from parts */
   1.683 +
   1.684 +    uint32_t CEi = 0;
   1.685 +    uint32_t value = 0;
   1.686 +
   1.687 +    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { // each CE holds 2 primary bytes, 1 secondary byte, 1 tertiary byte
   1.688 +        if(CEi > 0) {
   1.689 +            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
   1.690 +        } else {
   1.691 +            value = 0;
   1.692 +        }
   1.693 +
   1.694 +        if(2*CEi<noOfBytes[0]) {
   1.695 +            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
   1.696 +        }
   1.697 +        if(CEi<noOfBytes[1]) {
   1.698 +            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
   1.699 +        }
   1.700 +        if(CEi<noOfBytes[2]) {
   1.701 +            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); // only 6 tertiary bits are stored; the top two bits are the case field
   1.702 +        }
   1.703 +        tok->CEs[CEi] = value; // NOTE(review): capacity of tok->CEs is not checked here (up to 4 entries can be written) -- confirm
   1.704 +        CEi++;
   1.705 +    }
   1.706 +    if(CEi == 0) { /* totally ignorable */
   1.707 +        tok->noOfCEs = 1;
   1.708 +        tok->CEs[0] = 0;
   1.709 +    } else { /* there is at least something */
   1.710 +        tok->noOfCEs = CEi;
   1.711 +    }
   1.712 +
   1.713 +
   1.714 +    // we want to set case bits here and now, not later.
   1.715 +    // Case bits handling
   1.716 +    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
   1.717 +        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
   1.718 +        int32_t cSize = (tok->source & 0xFF000000) >> 24; // tok->source packs a length (top 8 bits) and an offset into src->source (low 24 bits)
   1.719 +        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
   1.720 +
   1.721 +        if(cSize > 1) {
   1.722 +            // Do it manually
   1.723 +            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
   1.724 +        } else {
   1.725 +            // Copy it from the UCA
   1.726 +            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
   1.727 +            tok->CEs[0] |= (caseCE & 0xC0);
   1.728 +        }
   1.729 +    }
   1.730 +
   1.731 +#if UCOL_DEBUG==2
   1.732 +    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
   1.733 +    for(i = 0; i<tok->noOfCEs; i++) {
   1.734 +        fprintf(stderr, "%08X ", tok->CEs[i]);
   1.735 +    }
   1.736 +    fprintf(stderr, "\n");
   1.737 +#endif
   1.738 +}
   1.739 +
   1.740 +U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
   1.741 +    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
   1.742 +    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
   1.743 +
   1.744 +    UColToken *tok = lh->last;
   1.745 +    uint32_t t[UCOL_STRENGTH_LIMIT];
   1.746 +
   1.747 +    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
   1.748 +
   1.749 +    /* must initialize ranges to avoid memory check warnings */
   1.750 +    for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
   1.751 +        uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
   1.752 +    }
   1.753 +
   1.754 +    tok->toInsert = 1;
   1.755 +    t[tok->strength] = 1;
   1.756 +
   1.757 +    while(tok->previous != NULL) {
   1.758 +        if(tok->previous->strength < tok->strength) { /* going up */
   1.759 +            t[tok->strength] = 0;
   1.760 +            t[tok->previous->strength]++;
   1.761 +        } else if(tok->previous->strength > tok->strength) { /* going down */
   1.762 +            t[tok->previous->strength] = 1;
   1.763 +        } else {
   1.764 +            t[tok->strength]++;
   1.765 +        }
   1.766 +        tok=tok->previous;
   1.767 +        tok->toInsert = t[tok->strength];
   1.768 +    }
   1.769 +
   1.770 +    tok->toInsert = t[tok->strength];
   1.771 +    ucol_inv_getGapPositions(src, lh, status);
   1.772 +
   1.773 +#if UCOL_DEBUG
   1.774 +    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
   1.775 +    int32_t j = 2;
   1.776 +    for(j = 2; j >= 0; j--) {
   1.777 +        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
   1.778 +        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
   1.779 +    }
   1.780 +    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
   1.781 +
   1.782 +    do {
   1.783 +        fprintf(stderr,"%i", tok->strength);
   1.784 +        tok = tok->next;
   1.785 +    } while(tok != NULL);
   1.786 +    fprintf(stderr, "\n");
   1.787 +
   1.788 +    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
   1.789 +
   1.790 +    do {
   1.791 +        fprintf(stderr,"%i", tok->toInsert);
   1.792 +        tok = tok->next;
   1.793 +    } while(tok != NULL);
   1.794 +#endif
   1.795 +
   1.796 +    tok = lh->first;
   1.797 +    uint32_t fStrength = UCOL_IDENTICAL;
   1.798 +    uint32_t initStrength = UCOL_IDENTICAL;
   1.799 +
   1.800 +
   1.801 +    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
   1.802 +    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
   1.803 +    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
   1.804 +
   1.805 +    while (tok != NULL && U_SUCCESS(*status)) {
   1.806 +        fStrength = tok->strength;
   1.807 +        if(fStrength < initStrength) {
   1.808 +            initStrength = fStrength;
   1.809 +            if(lh->pos[fStrength] == -1) {
   1.810 +                while(lh->pos[fStrength] == -1 && fStrength > 0) {
   1.811 +                    fStrength--;
   1.812 +                }
   1.813 +                if(lh->pos[fStrength] == -1) {
   1.814 +                    *status = U_INTERNAL_PROGRAM_ERROR;
   1.815 +                    return;
   1.816 +                }
   1.817 +            }
   1.818 +            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
   1.819 +                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
   1.820 +                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
   1.821 +                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
   1.822 +                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
   1.823 +            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
   1.824 +                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
   1.825 +                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
   1.826 +                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
   1.827 +                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   1.828 +            } else { /* primaries */
   1.829 +                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
   1.830 +                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
   1.831 +                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
   1.832 +                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   1.833 +            }
   1.834 +        } else {
   1.835 +            if(tok->strength == UCOL_TERTIARY) {
   1.836 +                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
   1.837 +            } else if(tok->strength == UCOL_SECONDARY) {
   1.838 +                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
   1.839 +                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   1.840 +            } else if(tok->strength == UCOL_PRIMARY) {
   1.841 +                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
   1.842 +                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
   1.843 +                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
   1.844 +            }
   1.845 +        }
   1.846 +        ucol_doCE(src, CEparts, tok, status);
   1.847 +        tok = tok->next;
   1.848 +    }
   1.849 +}
   1.850 +
   1.851 +U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
   1.852 +    UCAElements el;
   1.853 +    UColToken *tok = lh->first;
   1.854 +    UColToken *expt = NULL;
   1.855 +    uint32_t i = 0, j = 0;
   1.856 +    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
   1.857 +
   1.858 +    while(tok != NULL && U_SUCCESS(*status)) {
   1.859 +        /* first, check if there are any expansions */
   1.860 +        /* if there are expansions, we need to do a little bit more processing */
   1.861 +        /* since parts of expansion can be tailored, while others are not */
   1.862 +        if(tok->expansion != 0) {
   1.863 +            uint32_t len = tok->expansion >> 24;
   1.864 +            uint32_t currentSequenceLen = len;
   1.865 +            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
   1.866 +            //uint32_t exp = currentSequenceLen | expOffset;
   1.867 +            UColToken exp;
   1.868 +            exp.source = currentSequenceLen | expOffset;
   1.869 +            exp.rulesToParseHdl = &(src->source);
   1.870 +
   1.871 +            while(len > 0) {
   1.872 +                currentSequenceLen = len;
   1.873 +                while(currentSequenceLen > 0) {
   1.874 +                    exp.source = (currentSequenceLen << 24) | expOffset;
   1.875 +                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
   1.876 +                        uint32_t noOfCEsToCopy = expt->noOfCEs;
   1.877 +                        for(j = 0; j<noOfCEsToCopy; j++) {
   1.878 +                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
   1.879 +                        }
   1.880 +                        tok->noOfExpCEs += noOfCEsToCopy;
   1.881 +                        // Smart people never try to add codepoints and CEs.
   1.882 +                        // For some odd reason, it won't work.
   1.883 +                        expOffset += currentSequenceLen; //noOfCEsToCopy;
   1.884 +                        len -= currentSequenceLen; //noOfCEsToCopy;
   1.885 +                        break;
   1.886 +                    } else {
   1.887 +                        currentSequenceLen--;
   1.888 +                    }
   1.889 +                }
   1.890 +                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
   1.891 +                    /* will have to get one from UCA */
   1.892 +                    /* first, get the UChars from the rules */
   1.893 +                    /* then pick CEs out until there is no more and stuff them into expansion */
   1.894 +                    collIterate s;
   1.895 +                    uint32_t order = 0;
   1.896 +                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
   1.897 +
   1.898 +                    for(;;) {
   1.899 +                        order = ucol_getNextCE(src->UCA, &s, status);
   1.900 +                        if(order == UCOL_NO_MORE_CES) {
   1.901 +                            break;
   1.902 +                        }
   1.903 +                        tok->expCEs[tok->noOfExpCEs++] = order;
   1.904 +                    }
   1.905 +                    expOffset++;
   1.906 +                    len--;
   1.907 +                }
   1.908 +            }
   1.909 +        } else {
   1.910 +            tok->noOfExpCEs = 0;
   1.911 +        }
   1.912 +
   1.913 +        /* set the ucaelement with obtained values */
   1.914 +        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
   1.915 +        /* copy CEs */
   1.916 +        for(i = 0; i<tok->noOfCEs; i++) {
   1.917 +            el.CEs[i] = tok->CEs[i];
   1.918 +        }
   1.919 +        for(i = 0; i<tok->noOfExpCEs; i++) {
   1.920 +            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
   1.921 +        }
   1.922 +
   1.923 +        /* copy UChars */
   1.924 +        // We kept prefix and source kind of together, as it is a kind of a contraction.
   1.925 +        // However, now we have to slice the prefix off the main thing -
   1.926 +        el.prefix = el.prefixChars;
   1.927 +        el.cPoints = el.uchars;
   1.928 +        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
   1.929 +            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
   1.930 +            // decomposed elements to the unsaf table.
   1.931 +            el.prefixSize = tok->prefix>>24;
   1.932 +            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
   1.933 +
   1.934 +            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
   1.935 +            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
   1.936 +        } else {
   1.937 +            el.prefixSize = 0;
   1.938 +            *el.prefix = 0;
   1.939 +
   1.940 +            el.cSize = (tok->source >> 24);
   1.941 +            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
   1.942 +        }
   1.943 +        if(src->UCA != NULL) {
   1.944 +            for(i = 0; i<el.cSize; i++) {
   1.945 +                if(UCOL_ISJAMO(el.cPoints[i])) {
   1.946 +                    t->image->jamoSpecial = TRUE;
   1.947 +                }
   1.948 +            }
   1.949 +            if (!src->buildCCTabFlag && el.cSize > 0) {
   1.950 +                // Check the trailing canonical combining class (tccc) of the last character.
   1.951 +                const UChar *s = el.cPoints + el.cSize;
   1.952 +                uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
   1.953 +                if ((fcd & 0xff) != 0) {
   1.954 +                    src->buildCCTabFlag = TRUE;
   1.955 +                }
   1.956 +            }
   1.957 +        }
   1.958 +
   1.959 +        /* and then, add it */
   1.960 +#if UCOL_DEBUG==2
   1.961 +        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
   1.962 +#endif
   1.963 +        uprv_uca_addAnElement(t, &el, status);
   1.964 +
   1.965 +#if UCOL_DEBUG_DUPLICATES
   1.966 +        if(*status != U_ZERO_ERROR) {
   1.967 +            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
   1.968 +            *status = U_ZERO_ERROR;
   1.969 +        }
   1.970 +#endif
   1.971 +
   1.972 +        tok = tok->next;
   1.973 +    }
   1.974 +}
   1.975 +
   1.976 +U_CDECL_BEGIN
   1.977 +static UBool U_CALLCONV
   1.978 +_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
   1.979 +    UErrorCode status = U_ZERO_ERROR;
   1.980 +    tempUCATable *t = (tempUCATable *)context;
   1.981 +    if(value == 0) {
   1.982 +        while(start < limit) {
   1.983 +            uint32_t CE = utrie_get32(t->mapping, start, NULL);
   1.984 +            if(CE == UCOL_NOT_FOUND) {
   1.985 +                UCAElements el;
   1.986 +                el.isThai = FALSE;
   1.987 +                el.prefixSize = 0;
   1.988 +                el.prefixChars[0] = 0;
   1.989 +                el.prefix = el.prefixChars;
   1.990 +                el.cPoints = el.uchars;
   1.991 +
   1.992 +                el.cSize = 0;
   1.993 +                U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
   1.994 +
   1.995 +                el.noOfCEs = 1;
   1.996 +                el.CEs[0] = 0;
   1.997 +                uprv_uca_addAnElement(t, &el, &status);
   1.998 +
   1.999 +            }
  1.1000 +            start++;
  1.1001 +        }
  1.1002 +    }
  1.1003 +    if(U_FAILURE(status)) {
  1.1004 +        return FALSE;
  1.1005 +    } else {
  1.1006 +        return TRUE;
  1.1007 +    }
  1.1008 +}
  1.1009 +U_CDECL_END
  1.1010 +
  1.1011 +static void
  1.1012 +ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
  1.1013 +                               UChar32 start, UChar32 end,
  1.1014 +                               UErrorCode *status)
  1.1015 +{
  1.1016 +    //UChar decomp[256];
  1.1017 +    uint32_t CE = UCOL_NOT_FOUND;
  1.1018 +    UChar32 u = 0;
  1.1019 +    UCAElements el;
  1.1020 +    el.isThai = FALSE;
  1.1021 +    el.prefixSize = 0;
  1.1022 +    el.prefixChars[0] = 0;
  1.1023 +    collIterate colIt;
  1.1024 +
  1.1025 +    if(U_SUCCESS(*status)) {
  1.1026 +        for(u = start; u<=end; u++) {
  1.1027 +            if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
  1.1028 +                /* this test is for contractions that are missing the starting element. */
  1.1029 +                || ((isCntTableElement(CE)) &&
  1.1030 +                (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
  1.1031 +                )
  1.1032 +            {
  1.1033 +                el.cSize = 0;
  1.1034 +                U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
  1.1035 +                //decomp[0] = (UChar)u;
  1.1036 +                //el.uchars[0] = (UChar)u;
  1.1037 +                el.cPoints = el.uchars;
  1.1038 +                //el.cSize = 1;
  1.1039 +                el.noOfCEs = 0;
  1.1040 +                el.prefix = el.prefixChars;
  1.1041 +                el.prefixSize = 0;
  1.1042 +                //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
  1.1043 +                // We actually want to check whether this element is a special
  1.1044 +                // If it is an implicit element (hangul, CJK - we want to copy the
  1.1045 +                // special, not the resolved CEs) - for hangul, copying resolved
  1.1046 +                // would just make things the same (there is an expansion and it
  1.1047 +                // takes approximately the same amount of time to resolve as
  1.1048 +                // falling back to the UCA).
  1.1049 +                /*
  1.1050 +                UTRIE_GET32(src->UCA->mapping, u, CE);
  1.1051 +                tag = getCETag(CE);
  1.1052 +                if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
  1.1053 +                || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
  1.1054 +                || tag == LEAD_SURROGATE_TAG) {
  1.1055 +                el.CEs[el.noOfCEs++] = CE;
  1.1056 +                } else {
  1.1057 +                */
  1.1058 +                // It turns out that it does not make sense to keep implicits
  1.1059 +                // unresolved. The cost of resolving them is big enough so that
  1.1060 +                // it doesn't make any difference whether we have to go to the UCA
  1.1061 +                // or not.
  1.1062 +                {
  1.1063 +                    uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
  1.1064 +                    while(CE != UCOL_NO_MORE_CES) {
  1.1065 +                        CE = ucol_getNextCE(src->UCA, &colIt, status);
  1.1066 +                        if(CE != UCOL_NO_MORE_CES) {
  1.1067 +                            el.CEs[el.noOfCEs++] = CE;
  1.1068 +                        }
  1.1069 +                    }
  1.1070 +                }
  1.1071 +                uprv_uca_addAnElement(t, &el, status);
  1.1072 +            }
  1.1073 +        }
  1.1074 +    }
  1.1075 +}
  1.1076 +
  1.1077 +U_NAMESPACE_END
  1.1078 +
  1.1079 +U_CFUNC UCATableHeader *
  1.1080 +ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
  1.1081 +    U_NAMESPACE_USE
  1.1082 +
  1.1083 +    uint32_t i = 0;
  1.1084 +    if(U_FAILURE(*status)) {
  1.1085 +        return NULL;
  1.1086 +    }
  1.1087 +    /*
  1.1088 +    2.  Eliminate the negative lists by doing the following for each non-null negative list:
  1.1089 +    o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
  1.1090 +    create new ListHeader X
  1.1091 +    o   reverse the list, add to the end of X's positive list. Reset the strength of the
  1.1092 +    first item you add, based on the stronger strength levels of the two lists.
  1.1093 +    */
  1.1094 +    /*
  1.1095 +    3.  For each ListHeader with a non-null positive list:
  1.1096 +    */
  1.1097 +    /*
  1.1098 +    o   Find all character strings with CEs between the baseCE and the
  1.1099 +    next/previous CE, at the strength of the first token. Add these to the
  1.1100 +    tailoring.
  1.1101 +    ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
  1.1102 +    tailoring has & x < z...
  1.1103 +    ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
  1.1104 +    */
  1.1105 +    /* It is possible that this part should be done even while constructing list */
  1.1106 +    /* The problem is that it is unknown what is going to be the strongest weight */
  1.1107 +    /* So we might as well do it here */
  1.1108 +
  1.1109 +    /*
  1.1110 +    o   Allocate CEs for each token in the list, based on the total number N of the
  1.1111 +    largest level difference, and the gap G between baseCE and nextCE at that
  1.1112 +    level. The relation * between the last item and nextCE is the same as the
  1.1113 +    strongest strength.
  1.1114 +    o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
  1.1115 +    ? There are 3 primary items: a, d, e. Fit them into the primary gap.
  1.1116 +    Then fit b and c into the secondary gap between a and d, then fit q
  1.1117 +    into the tertiary gap between b and c.
  1.1118 +
  1.1119 +    o   Example: baseCE << b <<< q << c * nextCE(X,2)
  1.1120 +    ? There are 2 secondary items: b, c. Fit them into the secondary gap.
  1.1121 +    Then fit q into the tertiary gap between b and c.
  1.1122 +    o   When incrementing primary values, we will not cross high byte
  1.1123 +    boundaries except where there is only a single-byte primary. That is to
  1.1124 +    ensure that the script reordering will continue to work.
  1.1125 +    */
  1.1126 +    UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
  1.1127 +    /* test for NULL */
  1.1128 +    if (image == NULL) {
  1.1129 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.1130 +        return NULL;
  1.1131 +    }
  1.1132 +    uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
  1.1133 +
  1.1134 +    for(i = 0; i<src->resultLen; i++) {
  1.1135 +        /* now we need to generate the CEs */
  1.1136 +        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
  1.1137 +        /* According to strength                                                          */
  1.1138 +        if(U_SUCCESS(*status)) {
  1.1139 +            if(src->lh[i].first) { // if there are any elements
  1.1140 +                // due to the way parser works, subsequent tailorings
  1.1141 +                // may remove all the elements from a sequence, therefore
  1.1142 +                // leaving an empty tailoring sequence.
  1.1143 +                ucol_initBuffers(src, &src->lh[i], status);
  1.1144 +            }
  1.1145 +        }
  1.1146 +        if(U_FAILURE(*status)) {
  1.1147 +            uprv_free(image);
  1.1148 +            return NULL;
  1.1149 +        }
  1.1150 +    }
  1.1151 +
  1.1152 +    if(src->varTop != NULL) { /* stuff the variable top value */
  1.1153 +        src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
  1.1154 +        /* remove it from the list */
  1.1155 +        if(src->varTop->listHeader->first == src->varTop) { /* first in list */
  1.1156 +            src->varTop->listHeader->first = src->varTop->next;
  1.1157 +        }
  1.1158 +        if(src->varTop->listHeader->last == src->varTop) { /* first in list */
  1.1159 +            src->varTop->listHeader->last = src->varTop->previous;
  1.1160 +        }
  1.1161 +        if(src->varTop->next != NULL) {
  1.1162 +            src->varTop->next->previous = src->varTop->previous;
  1.1163 +        }
  1.1164 +        if(src->varTop->previous != NULL) {
  1.1165 +            src->varTop->previous->next = src->varTop->next;
  1.1166 +        }
  1.1167 +    }
  1.1168 +
  1.1169 +
  1.1170 +    tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
  1.1171 +    if(U_FAILURE(*status)) {
  1.1172 +        uprv_free(image);
  1.1173 +        return NULL;
  1.1174 +    }
  1.1175 +
  1.1176 +
  1.1177 +    /* After this, we have assigned CE values to all regular CEs      */
  1.1178 +    /* now we will go through list once more and resolve expansions,  */
  1.1179 +    /* make UCAElements structs and add them to table                 */
  1.1180 +    for(i = 0; i<src->resultLen; i++) {
  1.1181 +        /* now we need to generate the CEs */
  1.1182 +        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
  1.1183 +        /* According to strength                                                          */
  1.1184 +        if(U_SUCCESS(*status)) {
  1.1185 +            ucol_createElements(src, t, &src->lh[i], status);
  1.1186 +        }
  1.1187 +    }
  1.1188 +
  1.1189 +    UCAElements el;
  1.1190 +    el.isThai = FALSE;
  1.1191 +    el.prefixSize = 0;
  1.1192 +    el.prefixChars[0] = 0;
  1.1193 +
  1.1194 +    /* add latin-1 stuff */
  1.1195 +    ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
  1.1196 +
  1.1197 +    /* add stuff for copying */
  1.1198 +    if(src->copySet != NULL) {
  1.1199 +        int32_t i = 0;
  1.1200 +        UnicodeSet *set = (UnicodeSet *)src->copySet;
  1.1201 +        for(i = 0; i < set->getRangeCount(); i++) {
  1.1202 +            ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
  1.1203 +        }
  1.1204 +    }
  1.1205 +
  1.1206 +    if(U_SUCCESS(*status)) {
  1.1207 +        /* copy contractions from the UCA - this is felt mostly for cyrillic*/
  1.1208 +
  1.1209 +        uint32_t tailoredCE = UCOL_NOT_FOUND;
  1.1210 +        UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
  1.1211 +        int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
  1.1212 +        UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
  1.1213 +        // Check for null pointer
  1.1214 +        if (ucaEl == NULL) {
  1.1215 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.1216 +            return NULL;
  1.1217 +        }
  1.1218 +        while(*conts != 0) {
  1.1219 +            // A continuation is NUL-terminated and NUL-padded
  1.1220 +            // except if it has the maximum length.
  1.1221 +            int32_t contractionLength = maxUCAContractionLength;
  1.1222 +            while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
  1.1223 +                --contractionLength;
  1.1224 +            }
  1.1225 +            UChar32 first;
  1.1226 +            int32_t firstLength = 0;
  1.1227 +            U16_NEXT(conts, firstLength, contractionLength, first);
  1.1228 +            tailoredCE = utrie_get32(t->mapping, first, NULL);
  1.1229 +            if(tailoredCE != UCOL_NOT_FOUND) {
  1.1230 +                UBool needToAdd = TRUE;
  1.1231 +                if(isCntTableElement(tailoredCE)) {
  1.1232 +                    if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
  1.1233 +                        needToAdd = FALSE;
  1.1234 +                    }
  1.1235 +                }
  1.1236 +                if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
  1.1237 +                    UCAElements elm;
  1.1238 +                    elm.cPoints = el.uchars;
  1.1239 +                    elm.noOfCEs = 0;
  1.1240 +                    elm.uchars[0] = *conts;
  1.1241 +                    elm.uchars[1] = 0;
  1.1242 +                    elm.cSize = 1;
  1.1243 +                    elm.prefixChars[0] = *(conts+2);
  1.1244 +                    elm.isThai = FALSE;
  1.1245 +                    elm.prefix = elm.prefixChars;
  1.1246 +                    elm.prefixSize = 1;
  1.1247 +                    UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
  1.1248 +                    if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
  1.1249 +                        needToAdd = TRUE;
  1.1250 +                    }
  1.1251 +                }
  1.1252 +                if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
  1.1253 +                    needToAdd = FALSE;
  1.1254 +                }
  1.1255 +
  1.1256 +                if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
  1.1257 +                    if (*(conts+1) != 0) {  // contractions
  1.1258 +                        el.prefix = el.prefixChars;
  1.1259 +                        el.prefixSize = 0;
  1.1260 +                        el.cPoints = el.uchars;
  1.1261 +                        el.noOfCEs = 0;
  1.1262 +                        u_memcpy(el.uchars, conts, contractionLength);
  1.1263 +                        el.cSize = contractionLength;
  1.1264 +                        ucol_setText(ucaEl, el.uchars, el.cSize, status);
  1.1265 +                    }
  1.1266 +                    else { // pre-context character
  1.1267 +                        UChar str[4] = { 0 };
  1.1268 +                        int32_t len=0;
  1.1269 +                        int32_t preKeyLen=0;
  1.1270 +                        
  1.1271 +                        el.cPoints = el.uchars;
  1.1272 +                        el.noOfCEs = 0;
  1.1273 +                        el.uchars[0] = *conts;
  1.1274 +                        el.uchars[1] = 0;
  1.1275 +                        el.cSize = 1;
  1.1276 +                        el.prefixChars[0] = *(conts+2);
  1.1277 +                        el.prefix = el.prefixChars;
  1.1278 +                        el.prefixSize = 1;
  1.1279 +                        if (el.prefixChars[0]!=0) {
  1.1280 +                            // get CE of prefix character first
  1.1281 +                            str[0]=el.prefixChars[0];
  1.1282 +                            str[1]=0;
  1.1283 +                            ucol_setText(ucaEl, str, 1, status);
  1.1284 +                            while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
  1.1285 +                                    != UCOL_NULLORDER) {
  1.1286 +                                preKeyLen++;  // count number of keys for prefix character
  1.1287 +                            }
  1.1288 +                            str[len++] = el.prefixChars[0];
  1.1289 +                        }
  1.1290 +
  1.1291 +                        str[len++] = el.uchars[0];
  1.1292 +                        str[len]=0;
  1.1293 +                        ucol_setText(ucaEl, str, len, status);
  1.1294 +                        // Skip the keys for prefix character, then copy the rest to el.
  1.1295 +                        while ((preKeyLen-->0) && 
  1.1296 +                               (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
  1.1297 +                            continue;
  1.1298 +                        }
  1.1299 +                           
  1.1300 +                    }
  1.1301 +                    while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
  1.1302 +                        el.noOfCEs++;
  1.1303 +                    }
  1.1304 +                    uprv_uca_addAnElement(t, &el, status);
  1.1305 +                }
  1.1306 +
  1.1307 +            } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
  1.1308 +                ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
  1.1309 +            }
  1.1310 +            conts+=maxUCAContractionLength;
  1.1311 +        }
  1.1312 +        ucol_closeElements(ucaEl);
  1.1313 +    }
  1.1314 +
  1.1315 +    // Add completely ignorable elements
  1.1316 +    utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
  1.1317 +
  1.1318 +    // add tailoring characters related canonical closures
  1.1319 +    uprv_uca_canonicalClosure(t, src, NULL, status);
  1.1320 +
  1.1321 +    /* still need to produce compatibility closure */
  1.1322 +
  1.1323 +    UCATableHeader *myData = uprv_uca_assembleTable(t, status);
  1.1324 +
  1.1325 +    uprv_uca_closeTempTable(t);
  1.1326 +    uprv_free(image);
  1.1327 +
  1.1328 +    return myData;
  1.1329 +}
  1.1330 +
  1.1331 +U_CDECL_BEGIN
  1.1332 +static UBool U_CALLCONV
  1.1333 +ucol_bld_cleanup(void)
  1.1334 +{
  1.1335 +    udata_close(invUCA_DATA_MEM);
  1.1336 +    invUCA_DATA_MEM = NULL;
  1.1337 +    _staticInvUCA = NULL;
  1.1338 +    gStaticInvUCAInitOnce.reset();
  1.1339 +    return TRUE;
  1.1340 +}
  1.1341 +U_CDECL_END
  1.1342 +
  1.1343 +static void U_CALLCONV initInverseUCA(UErrorCode &status) {
  1.1344 +    U_ASSERT(invUCA_DATA_MEM == NULL);
  1.1345 +    U_ASSERT(_staticInvUCA == NULL);
  1.1346 +    ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
  1.1347 +    InverseUCATableHeader *newInvUCA = NULL;
  1.1348 +    UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
  1.1349 +
  1.1350 +    if(U_FAILURE(status)) {
  1.1351 +        if (result) {
  1.1352 +            udata_close(result);
  1.1353 +        }
  1.1354 +        // This is not needed, as we are talking about
  1.1355 +        // memory we got from UData
  1.1356 +        //uprv_free(newInvUCA);
  1.1357 +        return;
  1.1358 +    }
  1.1359 +
  1.1360 +    if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
  1.1361 +        newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
  1.1362 +        UCollator *UCA = ucol_initUCA(&status);
  1.1363 +        // UCA versions of UCA and inverse UCA should match
  1.1364 +        if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
  1.1365 +            status = U_INVALID_FORMAT_ERROR;
  1.1366 +            udata_close(result);
  1.1367 +            return;
  1.1368 +        }
  1.1369 +
  1.1370 +        invUCA_DATA_MEM = result;
  1.1371 +        _staticInvUCA = newInvUCA;
  1.1372 +    }
  1.1373 +}
  1.1374 +
  1.1375 +
  1.1376 +U_CAPI const InverseUCATableHeader * U_EXPORT2
  1.1377 +ucol_initInverseUCA(UErrorCode *status)
  1.1378 +{
  1.1379 +    umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
  1.1380 +    return _staticInvUCA;
  1.1381 +}
  1.1382 +
/* Names of the non-script reordering codes. The entries _must_ be kept in the order in which
 * they are applied as defaults, and in sync with the UColReorderCode enum.
 */
  1.1386 +static const char * const ReorderingTokenNames[] = {
  1.1387 +    "SPACE",
  1.1388 +    "PUNCT",
  1.1389 +    "SYMBOL",
  1.1390 +    "CURRENCY",
  1.1391 +    "DIGIT"
  1.1392 +};
  1.1393 +
  1.1394 +static void toUpper(const char* src, char* dst, uint32_t length) {
  1.1395 +   for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
  1.1396 +       *dst = uprv_toupper(*src);
  1.1397 +   }
  1.1398 +   *dst = '\0';
  1.1399 +}
  1.1400 +
  1.1401 +U_INTERNAL int32_t U_EXPORT2 
  1.1402 +ucol_findReorderingEntry(const char* name) {
  1.1403 +    char buffer[32];
  1.1404 +    toUpper(name, buffer, 32);
  1.1405 +    for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
  1.1406 +        if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
  1.1407 +            return entry + UCOL_REORDER_CODE_FIRST;
  1.1408 +        }
  1.1409 +    }
  1.1410 +    return USCRIPT_INVALID_CODE;
  1.1411 +}
  1.1412 +
  1.1413 +#endif /* #if !UCONFIG_NO_COLLATION */

mercurial