1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol_bld.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1410 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2001-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucol_bld.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created 02/22/2001 1.17 +* created by: Vladimir Weinstein 1.18 +* 1.19 +* This module builds a collator based on the rule set. 1.20 +* 1.21 +*/ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 + 1.25 +#if !UCONFIG_NO_COLLATION 1.26 + 1.27 +#include "unicode/ucoleitr.h" 1.28 +#include "unicode/udata.h" 1.29 +#include "unicode/uchar.h" 1.30 +#include "unicode/uniset.h" 1.31 +#include "unicode/uscript.h" 1.32 +#include "unicode/ustring.h" 1.33 +#include "unicode/utf16.h" 1.34 +#include "normalizer2impl.h" 1.35 +#include "uassert.h" 1.36 +#include "ucol_bld.h" 1.37 +#include "ucol_elm.h" 1.38 +#include "ucol_cnt.h" 1.39 +#include "ucln_in.h" 1.40 +#include "umutex.h" 1.41 +#include "cmemory.h" 1.42 +#include "cstring.h" 1.43 + 1.44 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.45 + 1.46 +static const InverseUCATableHeader* _staticInvUCA = NULL; 1.47 +static UDataMemory* invUCA_DATA_MEM = NULL; 1.48 +static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; 1.49 + 1.50 +U_CDECL_BEGIN 1.51 +static UBool U_CALLCONV 1.52 +isAcceptableInvUCA(void * /*context*/, 1.53 + const char * /*type*/, const char * /*name*/, 1.54 + const UDataInfo *pInfo) 1.55 +{ 1.56 + /* context, type & name are intentionally not used */ 1.57 + if( pInfo->size>=20 && 1.58 + pInfo->isBigEndian==U_IS_BIG_ENDIAN && 1.59 + pInfo->charsetFamily==U_CHARSET_FAMILY && 1.60 + pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 1.61 + pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 1.62 + pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 1.63 + pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 1.64 + pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 1.65 + pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 1.66 + //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 1.67 + //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 1.68 + //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 1.69 + ) 1.70 + { 1.71 + // TODO: Check that the invuca data version (pInfo->dataVersion) 1.72 + // matches the ucadata version. 1.73 + return TRUE; 1.74 + } else { 1.75 + return FALSE; 1.76 + } 1.77 +} 1.78 +U_CDECL_END 1.79 + 1.80 +/* 1.81 +* Takes two CEs (lead and continuation) and 1.82 +* compares them as CEs should be compared: 1.83 +* primary vs. primary, secondary vs. secondary 1.84 +* tertiary vs. tertiary 1.85 +*/ 1.86 +static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 1.87 + uint32_t s1 = source0, s2, t1 = target0, t2; 1.88 + if(isContinuation(source1)) { 1.89 + s2 = source1; 1.90 + } else { 1.91 + s2 = 0; 1.92 + } 1.93 + if(isContinuation(target1)) { 1.94 + t2 = target1; 1.95 + } else { 1.96 + t2 = 0; 1.97 + } 1.98 + 1.99 + uint32_t s = 0, t = 0; 1.100 + if(s1 == t1 && s2 == t2) { 1.101 + return 0; 1.102 + } 1.103 + s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 1.104 + t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 1.105 + if(s < t) { 1.106 + return -1; 1.107 + } else if(s > t) { 1.108 + return 1; 1.109 + } else { 1.110 + s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 1.111 + t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 1.112 + if(s < t) { 1.113 + return -1; 1.114 + } else if(s > t) { 1.115 + return 1; 1.116 + } else { 1.117 + s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 1.118 + t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 1.119 + if(s < t) { 1.120 + return -1; 1.121 + } else { 1.122 + return 1; 1.123 + } 1.124 + } 1.125 + } 1.126 +} 1.127 + 1.128 +static 1.129 +int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 1.130 + uint32_t bottom = 0, top = src->invUCA->tableSize; 1.131 + uint32_t i = 0; 1.132 + uint32_t first = 0, second = 0; 1.133 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.134 + int32_t res = 0; 1.135 + 1.136 + while(bottom < top-1) { 1.137 + i = (top+bottom)/2; 1.138 + first = *(CETable+3*i); 1.139 + second = *(CETable+3*i+1); 1.140 + res = compareCEs(first, second, CE, SecondCE); 1.141 + if(res > 0) { 1.142 + top = i; 1.143 + } else if(res < 0) { 1.144 + bottom = i; 1.145 + } else { 1.146 + break; 1.147 + } 1.148 + } 1.149 + 1.150 + /* weiv: */ 1.151 + /* in searching for elements, I have removed the failure */ 1.152 + /* The reason for this is that the builder does not rely */ 1.153 + /* on search mechanism telling it that it didn't find an */ 1.154 + /* element. However, indirect positioning relies on being */ 1.155 + /* able to find the elements around any CE, even if it is */ 1.156 + /* not defined in the UCA. */ 1.157 + return i; 1.158 + /* 1.159 + if((first == CE && second == SecondCE)) { 1.160 + return i; 1.161 + } else { 1.162 + return -1; 1.163 + } 1.164 + */ 1.165 +} 1.166 + 1.167 +static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 1.168 + 0xFFFF0000, 1.169 + 0xFFFFFF00, 1.170 + 0xFFFFFFFF 1.171 +}; 1.172 + 1.173 +U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 1.174 + uint32_t CE, uint32_t contCE, 1.175 + uint32_t *nextCE, uint32_t *nextContCE, 1.176 + uint32_t strength) 1.177 +{ 1.178 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.179 + int32_t iCE; 1.180 + 1.181 + iCE = ucol_inv_findCE(src, CE, contCE); 1.182 + 1.183 + if(iCE<0) { 1.184 + *nextCE = UCOL_NOT_FOUND; 1.185 + return -1; 1.186 + } 1.187 + 1.188 + CE &= strengthMask[strength]; 1.189 + contCE &= strengthMask[strength]; 1.190 + 1.191 + *nextCE = CE; 1.192 + *nextContCE = contCE; 1.193 + 1.194 + while((*nextCE & strengthMask[strength]) == CE 1.195 + && (*nextContCE & strengthMask[strength]) == contCE) 1.196 + { 1.197 + *nextCE = (*(CETable+3*(++iCE))); 1.198 + *nextContCE = (*(CETable+3*(iCE)+1)); 1.199 + } 1.200 + 1.201 + return iCE; 1.202 +} 1.203 + 1.204 +U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 1.205 + uint32_t CE, uint32_t contCE, 1.206 + uint32_t *prevCE, uint32_t *prevContCE, 1.207 + uint32_t strength) 1.208 +{ 1.209 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.210 + int32_t iCE; 1.211 + 1.212 + iCE = ucol_inv_findCE(src, CE, contCE); 1.213 + 1.214 + if(iCE<0) { 1.215 + *prevCE = UCOL_NOT_FOUND; 1.216 + return -1; 1.217 + } 1.218 + 1.219 + CE &= strengthMask[strength]; 1.220 + contCE &= strengthMask[strength]; 1.221 + 1.222 + *prevCE = CE; 1.223 + *prevContCE = contCE; 1.224 + 1.225 + while((*prevCE & strengthMask[strength]) == CE 1.226 + && (*prevContCE & strengthMask[strength])== contCE 1.227 + && iCE > 0) /* this condition should prevent falling off the edge of the world */ 1.228 + { 1.229 + /* here, we end up in a singularity - zero */ 1.230 + *prevCE = (*(CETable+3*(--iCE))); 1.231 + *prevContCE = (*(CETable+3*(iCE)+1)); 1.232 + } 1.233 + 1.234 + return iCE; 1.235 +} 1.236 + 1.237 +U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 1.238 + uint32_t prevCE, uint32_t prevContCE) 1.239 +{ 1.240 + if(prevCE == CE && prevContCE == contCE) { 1.241 + return UCOL_IDENTICAL; 1.242 + } 1.243 + if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 1.244 + || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 1.245 + { 1.246 + return UCOL_PRIMARY; 1.247 + } 1.248 + if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 1.249 + || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 1.250 + { 1.251 + return UCOL_SECONDARY; 1.252 + } 1.253 + return UCOL_TERTIARY; 1.254 +} 1.255 + 1.256 + 1.257 +/*static 1.258 +inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 1.259 + 1.260 + uint32_t CE = lh->baseCE; 1.261 + uint32_t SecondCE = lh->baseContCE; 1.262 + 1.263 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.264 + uint32_t previousCE, previousContCE; 1.265 + int32_t iCE; 1.266 + 1.267 + iCE = ucol_inv_findCE(src, CE, SecondCE); 1.268 + 1.269 + if(iCE<0) { 1.270 + return -1; 1.271 + } 1.272 + 1.273 + CE &= strengthMask[strength]; 1.274 + SecondCE &= strengthMask[strength]; 1.275 + 1.276 + previousCE = CE; 1.277 + previousContCE = SecondCE; 1.278 + 1.279 + while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 1.280 + previousCE = (*(CETable+3*(--iCE))); 1.281 + previousContCE = (*(CETable+3*(iCE)+1)); 1.282 + } 1.283 + lh->previousCE = previousCE; 1.284 + lh->previousContCE = previousContCE; 1.285 + 1.286 + return iCE; 1.287 +}*/ 1.288 + 1.289 +static 1.290 +inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 1.291 + uint32_t CE = lh->baseCE; 1.292 + uint32_t SecondCE = lh->baseContCE; 1.293 + 1.294 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.295 + uint32_t nextCE, nextContCE; 1.296 + int32_t iCE; 1.297 + 1.298 + iCE = ucol_inv_findCE(src, CE, SecondCE); 1.299 + 1.300 + if(iCE<0) { 1.301 + return -1; 1.302 + } 1.303 + 1.304 + CE &= strengthMask[strength]; 1.305 + SecondCE &= strengthMask[strength]; 1.306 + 1.307 + nextCE = CE; 1.308 + nextContCE = SecondCE; 1.309 + 1.310 + while((nextCE & strengthMask[strength]) == CE 1.311 + && (nextContCE & strengthMask[strength]) == SecondCE) 1.312 + { 1.313 + nextCE = (*(CETable+3*(++iCE))); 1.314 + nextContCE = (*(CETable+3*(iCE)+1)); 1.315 + } 1.316 + 1.317 + lh->nextCE = nextCE; 1.318 + lh->nextContCE = nextContCE; 1.319 + 1.320 + return iCE; 1.321 +} 1.322 + 1.323 +static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 1.324 + /* reset all the gaps */ 1.325 + int32_t i = 0; 1.326 + uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.327 + uint32_t st = 0; 1.328 + uint32_t t1, t2; 1.329 + int32_t pos; 1.330 + 1.331 + UColToken *tok = lh->first; 1.332 + uint32_t tokStrength = tok->strength; 1.333 + 1.334 + for(i = 0; i<3; i++) { 1.335 + lh->gapsHi[3*i] = 0; 1.336 + lh->gapsHi[3*i+1] = 0; 1.337 + lh->gapsHi[3*i+2] = 0; 1.338 + lh->gapsLo[3*i] = 0; 1.339 + lh->gapsLo[3*i+1] = 0; 1.340 + lh->gapsLo[3*i+2] = 0; 1.341 + lh->numStr[i] = 0; 1.342 + lh->fStrToken[i] = NULL; 1.343 + lh->lStrToken[i] = NULL; 1.344 + lh->pos[i] = -1; 1.345 + } 1.346 + 1.347 + UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1.348 + 1.349 + if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 1.350 + //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 1.351 + lh->pos[0] = 0; 1.352 + t1 = lh->baseCE; 1.353 + t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; 1.354 + lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.355 + lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.356 + lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 1.357 + uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); 1.358 + primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); 1.359 + 1.360 + t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 1.361 + t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; 1.362 + 1.363 + lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.364 + lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.365 + lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 1.366 + } else if(lh->indirect == TRUE && lh->nextCE != 0) { 1.367 + //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { 1.368 + lh->pos[0] = 0; 1.369 + t1 = lh->baseCE; 1.370 + t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; 1.371 + lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.372 + lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.373 + lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 1.374 + t1 = lh->nextCE; 1.375 + t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; 1.376 + lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.377 + lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.378 + lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 1.379 + } else { 1.380 + for(;;) { 1.381 + if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 1.382 + if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { 1.383 + lh->fStrToken[tokStrength] = tok; 1.384 + } else { /* The CE must be implicit, since it's not in the table */ 1.385 + /* Error */ 1.386 + *status = U_INTERNAL_PROGRAM_ERROR; 1.387 + } 1.388 + } 1.389 + 1.390 + while(tok != NULL && tok->strength >= tokStrength) { 1.391 + if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 1.392 + lh->lStrToken[tokStrength] = tok; 1.393 + } 1.394 + tok = tok->next; 1.395 + } 1.396 + if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { 1.397 + /* check if previous interval is the same and merge the intervals if it is so */ 1.398 + if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { 1.399 + lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; 1.400 + lh->fStrToken[tokStrength+1] = NULL; 1.401 + lh->lStrToken[tokStrength+1] = NULL; 1.402 + lh->pos[tokStrength+1] = -1; 1.403 + } 1.404 + } 1.405 + if(tok != NULL) { 1.406 + tokStrength = tok->strength; 1.407 + } else { 1.408 + break; 1.409 + } 1.410 + } 1.411 + for(st = 0; st < 3; st++) { 1.412 + if((pos = lh->pos[st]) >= 0) { 1.413 + t1 = *(CETable+3*(pos)); 1.414 + t2 = *(CETable+3*(pos)+1); 1.415 + lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.416 + lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.417 + //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 1.418 + lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 1.419 + //pos--; 1.420 + //t1 = *(CETable+3*(pos)); 1.421 + //t2 = *(CETable+3*(pos)+1); 1.422 + t1 = lh->baseCE; 1.423 + t2 = lh->baseContCE; 1.424 + lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 1.425 + lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 1.426 + lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 1.427 + } 1.428 + } 1.429 + } 1.430 +} 1.431 + 1.432 + 1.433 +#define ucol_countBytes(value, noOfBytes) \ 1.434 +{ \ 1.435 + uint32_t mask = 0xFFFFFFFF; \ 1.436 + (noOfBytes) = 0; \ 1.437 + while(mask != 0) { \ 1.438 + if(((value) & mask) != 0) { \ 1.439 + (noOfBytes)++; \ 1.440 + } \ 1.441 + mask >>= 8; \ 1.442 + } \ 1.443 +} 1.444 + 1.445 +static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { 1.446 + if(U_SUCCESS(*status)) { 1.447 + g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 1.448 + } 1.449 + return g->current; 1.450 +} 1.451 + 1.452 +static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { 1.453 + /* TODO: rename to enum names */ 1.454 + uint32_t high, low, count=1; 1.455 + uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; 1.456 + 1.457 + if(strength == UCOL_SECONDARY) { 1.458 + low = UCOL_COMMON_TOP2<<24; 1.459 + high = 0xFFFFFFFF; 1.460 + count = 0xFF - UCOL_COMMON_TOP2; 1.461 + } else { 1.462 + low = UCOL_BYTE_COMMON << 24; //0x05000000; 1.463 + high = 0x40000000; 1.464 + count = 0x40 - UCOL_BYTE_COMMON; 1.465 + } 1.466 + 1.467 + if(tok->next != NULL && tok->next->strength == strength) { 1.468 + count = tok->next->toInsert; 1.469 + } 1.470 + 1.471 + g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 1.472 + g->current = UCOL_BYTE_COMMON<<24; 1.473 + 1.474 + if(g->noOfRanges == 0) { 1.475 + *status = U_INTERNAL_PROGRAM_ERROR; 1.476 + } 1.477 + return g->current; 1.478 +} 1.479 + 1.480 +static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 1.481 + uint32_t strength = tok->strength; 1.482 + uint32_t low = lows[fStrength*3+strength]; 1.483 + uint32_t high = highs[fStrength*3+strength]; 1.484 + uint32_t maxByte = 0; 1.485 + if(strength == UCOL_TERTIARY) { 1.486 + maxByte = 0x3F; 1.487 + } else if(strength == UCOL_PRIMARY) { 1.488 + maxByte = 0xFE; 1.489 + } else { 1.490 + maxByte = 0xFF; 1.491 + } 1.492 + 1.493 + uint32_t count = tok->toInsert; 1.494 + 1.495 + if(low >= high && strength > UCOL_PRIMARY) { 1.496 + int32_t s = strength; 1.497 + for(;;) { 1.498 + s--; 1.499 + if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 1.500 + if(strength == UCOL_SECONDARY) { 1.501 + if (low < UCOL_COMMON_TOP2<<24 ) { 1.502 + // Override if low range is less than UCOL_COMMON_TOP2. 1.503 + low = UCOL_COMMON_TOP2<<24; 1.504 + } 1.505 + high = 0xFFFFFFFF; 1.506 + } else { 1.507 + // Override if low range is less than UCOL_COMMON_BOT3. 1.508 + if ( low < UCOL_COMMON_BOT3<<24 ) { 1.509 + low = UCOL_COMMON_BOT3<<24; 1.510 + } 1.511 + high = 0x40000000; 1.512 + } 1.513 + break; 1.514 + } 1.515 + if(s<0) { 1.516 + *status = U_INTERNAL_PROGRAM_ERROR; 1.517 + return 0; 1.518 + } 1.519 + } 1.520 + } 1.521 + 1.522 + if(low < 0x02000000) { 1.523 + // We must not use CE weight byte 02, so we set it as the minimum lower bound. 1.524 + // See http://site.icu-project.org/design/collation/bytes 1.525 + low = 0x02000000; 1.526 + } 1.527 + 1.528 + if(strength == UCOL_SECONDARY) { /* similar as simple */ 1.529 + if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 1.530 + low = UCOL_COMMON_TOP2<<24; 1.531 + } 1.532 + if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 1.533 + high = UCOL_COMMON_TOP2<<24; 1.534 + } 1.535 + if(low < (UCOL_COMMON_BOT2<<24)) { 1.536 + g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 1.537 + g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 1.538 + //g->current = UCOL_COMMON_BOT2<<24; 1.539 + return g->current; 1.540 + } 1.541 + } 1.542 + 1.543 + g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 1.544 + if(g->noOfRanges == 0) { 1.545 + *status = U_INTERNAL_PROGRAM_ERROR; 1.546 + } 1.547 + g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 1.548 + return g->current; 1.549 +} 1.550 + 1.551 +static 1.552 +uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 1.553 + uint32_t i = 0; 1.554 + UChar c; 1.555 + 1.556 + if(U_FAILURE(*status)) { 1.557 + return 0; 1.558 + } 1.559 + 1.560 + if(sourceLen > resLen) { 1.561 + *status = U_MEMORY_ALLOCATION_ERROR; 1.562 + return 0; 1.563 + } 1.564 + 1.565 + for(i = 0; i < sourceLen; i++) { 1.566 + c = source[i]; 1.567 + if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 1.568 + switch(c - 0x3000) { 1.569 + case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 1.570 + case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 1.571 + c++; 1.572 + break; 1.573 + case 0xF5: 1.574 + c = 0x30AB; 1.575 + break; 1.576 + case 0xF6: 1.577 + c = 0x30B1; 1.578 + break; 1.579 + } 1.580 + } 1.581 + resBuf[i] = c; 1.582 + } 1.583 + return sourceLen; 1.584 +} 1.585 + 1.586 +static 1.587 +uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 1.588 + uint32_t i = 0; 1.589 + UChar c; 1.590 + 1.591 + if(U_FAILURE(*status)) { 1.592 + return 0; 1.593 + } 1.594 + 1.595 + if(sourceLen > resLen) { 1.596 + *status = U_MEMORY_ALLOCATION_ERROR; 1.597 + return 0; 1.598 + } 1.599 + 1.600 + for(i = 0; i < sourceLen; i++) { 1.601 + c = source[i]; 1.602 + if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 1.603 + switch(c - 0x3000) { 1.604 + case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 1.605 + case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 1.606 + c--; 1.607 + break; 1.608 + case 0xAB: 1.609 + c = 0x30F5; 1.610 + break; 1.611 + case 0xB1: 1.612 + c = 0x30F6; 1.613 + break; 1.614 + } 1.615 + } 1.616 + resBuf[i] = c; 1.617 + } 1.618 + return sourceLen; 1.619 +} 1.620 + 1.621 +U_NAMESPACE_BEGIN 1.622 + 1.623 +static 1.624 +uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 1.625 + uint32_t i = 0; 1.626 + UChar n[128]; 1.627 + uint32_t nLen = 0; 1.628 + uint32_t uCount = 0, lCount = 0; 1.629 + 1.630 + collIterate s; 1.631 + uint32_t order = 0; 1.632 + 1.633 + if(U_FAILURE(*status)) { 1.634 + return UCOL_LOWER_CASE; 1.635 + } 1.636 + 1.637 + nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 1.638 + if(U_SUCCESS(*status)) { 1.639 + for(i = 0; i < nLen; i++) { 1.640 + uprv_init_collIterate(UCA, &n[i], 1, &s, status); 1.641 + order = ucol_getNextCE(UCA, &s, status); 1.642 + if(isContinuation(order)) { 1.643 + *status = U_INTERNAL_PROGRAM_ERROR; 1.644 + return UCOL_LOWER_CASE; 1.645 + } 1.646 + if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 1.647 + uCount++; 1.648 + } else { 1.649 + if(u_islower(n[i])) { 1.650 + lCount++; 1.651 + } else if(U_SUCCESS(*status)) { 1.652 + UChar sk[1], lk[1]; 1.653 + u_toSmallKana(&n[i], 1, sk, 1, status); 1.654 + u_toLargeKana(&n[i], 1, lk, 1, status); 1.655 + if(sk[0] == n[i] && lk[0] != n[i]) { 1.656 + lCount++; 1.657 + } 1.658 + } 1.659 + } 1.660 + } 1.661 + } 1.662 + 1.663 + if(uCount != 0 && lCount != 0) { 1.664 + return UCOL_MIXED_CASE; 1.665 + } else if(uCount != 0) { 1.666 + return UCOL_UPPER_CASE; 1.667 + } else { 1.668 + return UCOL_LOWER_CASE; 1.669 + } 1.670 +} 1.671 + 1.672 + 1.673 +U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 1.674 + /* this one makes the table and stuff */ 1.675 + uint32_t noOfBytes[3]; 1.676 + uint32_t i; 1.677 + 1.678 + for(i = 0; i<3; i++) { 1.679 + ucol_countBytes(CEparts[i], noOfBytes[i]); 1.680 + } 1.681 + 1.682 + /* Here we have to pack CEs from parts */ 1.683 + 1.684 + uint32_t CEi = 0; 1.685 + uint32_t value = 0; 1.686 + 1.687 + while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { 1.688 + if(CEi > 0) { 1.689 + value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 1.690 + } else { 1.691 + value = 0; 1.692 + } 1.693 + 1.694 + if(2*CEi<noOfBytes[0]) { 1.695 + value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; 1.696 + } 1.697 + if(CEi<noOfBytes[1]) { 1.698 + value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; 1.699 + } 1.700 + if(CEi<noOfBytes[2]) { 1.701 + value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); 1.702 + } 1.703 + tok->CEs[CEi] = value; 1.704 + CEi++; 1.705 + } 1.706 + if(CEi == 0) { /* totally ignorable */ 1.707 + tok->noOfCEs = 1; 1.708 + tok->CEs[0] = 0; 1.709 + } else { /* there is at least something */ 1.710 + tok->noOfCEs = CEi; 1.711 + } 1.712 + 1.713 + 1.714 + // we want to set case bits here and now, not later. 1.715 + // Case bits handling 1.716 + if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables 1.717 + tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field 1.718 + int32_t cSize = (tok->source & 0xFF000000) >> 24; 1.719 + UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; 1.720 + 1.721 + if(cSize > 1) { 1.722 + // Do it manually 1.723 + tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); 1.724 + } else { 1.725 + // Copy it from the UCA 1.726 + uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); 1.727 + tok->CEs[0] |= (caseCE & 0xC0); 1.728 + } 1.729 + } 1.730 + 1.731 +#if UCOL_DEBUG==2 1.732 + fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); 1.733 + for(i = 0; i<tok->noOfCEs; i++) { 1.734 + fprintf(stderr, "%08X ", tok->CEs[i]); 1.735 + } 1.736 + fprintf(stderr, "\n"); 1.737 +#endif 1.738 +} 1.739 + 1.740 +U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 1.741 + ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; 1.742 + uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; 1.743 + 1.744 + UColToken *tok = lh->last; 1.745 + uint32_t t[UCOL_STRENGTH_LIMIT]; 1.746 + 1.747 + uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); 1.748 + 1.749 + /* must initialize ranges to avoid memory check warnings */ 1.750 + for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { 1.751 + uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); 1.752 + } 1.753 + 1.754 + tok->toInsert = 1; 1.755 + t[tok->strength] = 1; 1.756 + 1.757 + while(tok->previous != NULL) { 1.758 + if(tok->previous->strength < tok->strength) { /* going up */ 1.759 + t[tok->strength] = 0; 1.760 + t[tok->previous->strength]++; 1.761 + } else if(tok->previous->strength > tok->strength) { /* going down */ 1.762 + t[tok->previous->strength] = 1; 1.763 + } else { 1.764 + t[tok->strength]++; 1.765 + } 1.766 + tok=tok->previous; 1.767 + tok->toInsert = t[tok->strength]; 1.768 + } 1.769 + 1.770 + tok->toInsert = t[tok->strength]; 1.771 + ucol_inv_getGapPositions(src, lh, status); 1.772 + 1.773 +#if UCOL_DEBUG 1.774 + fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); 1.775 + int32_t j = 2; 1.776 + for(j = 2; j >= 0; j--) { 1.777 + fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); 1.778 + fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); 1.779 + } 1.780 + tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 1.781 + 1.782 + do { 1.783 + fprintf(stderr,"%i", tok->strength); 1.784 + tok = tok->next; 1.785 + } while(tok != NULL); 1.786 + fprintf(stderr, "\n"); 1.787 + 1.788 + tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 1.789 + 1.790 + do { 1.791 + fprintf(stderr,"%i", tok->toInsert); 1.792 + tok = tok->next; 1.793 + } while(tok != NULL); 1.794 +#endif 1.795 + 1.796 + tok = lh->first; 1.797 + uint32_t fStrength = UCOL_IDENTICAL; 1.798 + uint32_t initStrength = UCOL_IDENTICAL; 1.799 + 1.800 + 1.801 + CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; 1.802 + CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; 1.803 + CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; 1.804 + 1.805 + while (tok != NULL && U_SUCCESS(*status)) { 1.806 + fStrength = tok->strength; 1.807 + if(fStrength < initStrength) { 1.808 + initStrength = fStrength; 1.809 + if(lh->pos[fStrength] == -1) { 1.810 + while(lh->pos[fStrength] == -1 && fStrength > 0) { 1.811 + fStrength--; 1.812 + } 1.813 + if(lh->pos[fStrength] == -1) { 1.814 + *status = U_INTERNAL_PROGRAM_ERROR; 1.815 + return; 1.816 + } 1.817 + } 1.818 + if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ 1.819 + CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 1.820 + CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; 1.821 + /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ 1.822 + CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 1.823 + } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ 1.824 + CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 1.825 + /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ 1.826 + CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 1.827 + CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 1.828 + } else { /* primaries */ 1.829 + /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ 1.830 + CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 1.831 + CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 1.832 + CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 1.833 + } 1.834 + } else { 1.835 + if(tok->strength == UCOL_TERTIARY) { 1.836 + CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); 1.837 + } else if(tok->strength == UCOL_SECONDARY) { 1.838 + CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); 1.839 + CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 1.840 + } else if(tok->strength == UCOL_PRIMARY) { 1.841 + CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); 1.842 + CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 1.843 + CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 1.844 + } 1.845 + } 1.846 + ucol_doCE(src, CEparts, tok, status); 1.847 + tok = tok->next; 1.848 + } 1.849 +} 1.850 + 1.851 +U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { 1.852 + UCAElements el; 1.853 + UColToken *tok = lh->first; 1.854 + UColToken *expt = NULL; 1.855 + uint32_t i = 0, j = 0; 1.856 + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.857 + 1.858 + while(tok != NULL && U_SUCCESS(*status)) { 1.859 + /* first, check if there are any expansions */ 1.860 + /* if there are expansions, we need to do a little bit more processing */ 1.861 + /* since parts of expansion can be tailored, while others are not */ 1.862 + if(tok->expansion != 0) { 1.863 + uint32_t len = tok->expansion >> 24; 1.864 + uint32_t currentSequenceLen = len; 1.865 + uint32_t expOffset = tok->expansion & 0x00FFFFFF; 1.866 + //uint32_t exp = currentSequenceLen | expOffset; 1.867 + UColToken exp; 1.868 + exp.source = currentSequenceLen | expOffset; 1.869 + exp.rulesToParseHdl = &(src->source); 1.870 + 1.871 + while(len > 0) { 1.872 + currentSequenceLen = len; 1.873 + while(currentSequenceLen > 0) { 1.874 + exp.source = (currentSequenceLen << 24) | expOffset; 1.875 + if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ 1.876 + uint32_t noOfCEsToCopy = expt->noOfCEs; 1.877 + for(j = 0; j<noOfCEsToCopy; j++) { 1.878 + tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; 1.879 + } 1.880 + tok->noOfExpCEs += noOfCEsToCopy; 1.881 + // Smart people never try to add codepoints and CEs. 1.882 + // For some odd reason, it won't work. 1.883 + expOffset += currentSequenceLen; //noOfCEsToCopy; 1.884 + len -= currentSequenceLen; //noOfCEsToCopy; 1.885 + break; 1.886 + } else { 1.887 + currentSequenceLen--; 1.888 + } 1.889 + } 1.890 + if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ 1.891 + /* will have to get one from UCA */ 1.892 + /* first, get the UChars from the rules */ 1.893 + /* then pick CEs out until there is no more and stuff them into expansion */ 1.894 + collIterate s; 1.895 + uint32_t order = 0; 1.896 + uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); 1.897 + 1.898 + for(;;) { 1.899 + order = ucol_getNextCE(src->UCA, &s, status); 1.900 + if(order == UCOL_NO_MORE_CES) { 1.901 + break; 1.902 + } 1.903 + tok->expCEs[tok->noOfExpCEs++] = order; 1.904 + } 1.905 + expOffset++; 1.906 + len--; 1.907 + } 1.908 + } 1.909 + } else { 1.910 + tok->noOfExpCEs = 0; 1.911 + } 1.912 + 1.913 + /* set the ucaelement with obtained values */ 1.914 + el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; 1.915 + /* copy CEs */ 1.916 + for(i = 0; i<tok->noOfCEs; i++) { 1.917 + el.CEs[i] = tok->CEs[i]; 1.918 + } 1.919 + for(i = 0; i<tok->noOfExpCEs; i++) { 1.920 + el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; 1.921 + } 1.922 + 1.923 + /* copy UChars */ 1.924 + // We kept prefix and source kind of together, as it is a kind of a contraction. 1.925 + // However, now we have to slice the prefix off the main thing - 1.926 + el.prefix = el.prefixChars; 1.927 + el.cPoints = el.uchars; 1.928 + if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the 1.929 + // addPrefix function in ucol_elm. The reason is that we need to add both composed AND 1.930 + // decomposed elements to the unsaf table. 1.931 + el.prefixSize = tok->prefix>>24; 1.932 + uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 1.933 + 1.934 + el.cSize = (tok->source >> 24)-(tok->prefix>>24); 1.935 + uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 1.936 + } else { 1.937 + el.prefixSize = 0; 1.938 + *el.prefix = 0; 1.939 + 1.940 + el.cSize = (tok->source >> 24); 1.941 + uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 1.942 + } 1.943 + if(src->UCA != NULL) { 1.944 + for(i = 0; i<el.cSize; i++) { 1.945 + if(UCOL_ISJAMO(el.cPoints[i])) { 1.946 + t->image->jamoSpecial = TRUE; 1.947 + } 1.948 + } 1.949 + if (!src->buildCCTabFlag && el.cSize > 0) { 1.950 + // Check the trailing canonical combining class (tccc) of the last character. 1.951 + const UChar *s = el.cPoints + el.cSize; 1.952 + uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); 1.953 + if ((fcd & 0xff) != 0) { 1.954 + src->buildCCTabFlag = TRUE; 1.955 + } 1.956 + } 1.957 + } 1.958 + 1.959 + /* and then, add it */ 1.960 +#if UCOL_DEBUG==2 1.961 + fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); 1.962 +#endif 1.963 + uprv_uca_addAnElement(t, &el, status); 1.964 + 1.965 +#if UCOL_DEBUG_DUPLICATES 1.966 + if(*status != U_ZERO_ERROR) { 1.967 + fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); 1.968 + *status = U_ZERO_ERROR; 1.969 + } 1.970 +#endif 1.971 + 1.972 + tok = tok->next; 1.973 + } 1.974 +} 1.975 + 1.976 +U_CDECL_BEGIN 1.977 +static UBool U_CALLCONV 1.978 +_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 1.979 + UErrorCode status = U_ZERO_ERROR; 1.980 + tempUCATable *t = (tempUCATable *)context; 1.981 + if(value == 0) { 1.982 + while(start < limit) { 1.983 + uint32_t CE = utrie_get32(t->mapping, start, NULL); 1.984 + if(CE == UCOL_NOT_FOUND) { 1.985 + UCAElements el; 1.986 + el.isThai = FALSE; 1.987 + el.prefixSize = 0; 1.988 + el.prefixChars[0] = 0; 1.989 + el.prefix = el.prefixChars; 1.990 + el.cPoints = el.uchars; 1.991 + 1.992 + el.cSize = 0; 1.993 + U16_APPEND_UNSAFE(el.uchars, el.cSize, start); 1.994 + 1.995 + el.noOfCEs = 1; 1.996 + el.CEs[0] = 0; 1.997 + uprv_uca_addAnElement(t, &el, &status); 1.998 + 1.999 + } 1.1000 + start++; 1.1001 + } 1.1002 + } 1.1003 + if(U_FAILURE(status)) { 1.1004 + return FALSE; 1.1005 + } else { 1.1006 + return TRUE; 1.1007 + } 1.1008 +} 1.1009 +U_CDECL_END 1.1010 + 1.1011 +static void 1.1012 +ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 1.1013 + UChar32 start, UChar32 end, 1.1014 + UErrorCode *status) 1.1015 +{ 1.1016 + //UChar decomp[256]; 1.1017 + uint32_t CE = UCOL_NOT_FOUND; 1.1018 + UChar32 u = 0; 1.1019 + UCAElements el; 1.1020 + el.isThai = FALSE; 1.1021 + el.prefixSize = 0; 1.1022 + el.prefixChars[0] = 0; 1.1023 + collIterate colIt; 1.1024 + 1.1025 + if(U_SUCCESS(*status)) { 1.1026 + for(u = start; u<=end; u++) { 1.1027 + if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1.1028 + /* this test is for contractions that are missing the starting element. */ 1.1029 + || ((isCntTableElement(CE)) && 1.1030 + (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1.1031 + ) 1.1032 + { 1.1033 + el.cSize = 0; 1.1034 + U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1.1035 + //decomp[0] = (UChar)u; 1.1036 + //el.uchars[0] = (UChar)u; 1.1037 + el.cPoints = el.uchars; 1.1038 + //el.cSize = 1; 1.1039 + el.noOfCEs = 0; 1.1040 + el.prefix = el.prefixChars; 1.1041 + el.prefixSize = 0; 1.1042 + //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1.1043 + // We actually want to check whether this element is a special 1.1044 + // If it is an implicit element (hangul, CJK - we want to copy the 1.1045 + // special, not the resolved CEs) - for hangul, copying resolved 1.1046 + // would just make things the same (there is an expansion and it 1.1047 + // takes approximately the same amount of time to resolve as 1.1048 + // falling back to the UCA). 1.1049 + /* 1.1050 + UTRIE_GET32(src->UCA->mapping, u, CE); 1.1051 + tag = getCETag(CE); 1.1052 + if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1.1053 + || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1.1054 + || tag == LEAD_SURROGATE_TAG) { 1.1055 + el.CEs[el.noOfCEs++] = CE; 1.1056 + } else { 1.1057 + */ 1.1058 + // It turns out that it does not make sense to keep implicits 1.1059 + // unresolved. The cost of resolving them is big enough so that 1.1060 + // it doesn't make any difference whether we have to go to the UCA 1.1061 + // or not. 1.1062 + { 1.1063 + uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); 1.1064 + while(CE != UCOL_NO_MORE_CES) { 1.1065 + CE = ucol_getNextCE(src->UCA, &colIt, status); 1.1066 + if(CE != UCOL_NO_MORE_CES) { 1.1067 + el.CEs[el.noOfCEs++] = CE; 1.1068 + } 1.1069 + } 1.1070 + } 1.1071 + uprv_uca_addAnElement(t, &el, status); 1.1072 + } 1.1073 + } 1.1074 + } 1.1075 +} 1.1076 + 1.1077 +U_NAMESPACE_END 1.1078 + 1.1079 +U_CFUNC UCATableHeader * 1.1080 +ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1.1081 + U_NAMESPACE_USE 1.1082 + 1.1083 + uint32_t i = 0; 1.1084 + if(U_FAILURE(*status)) { 1.1085 + return NULL; 1.1086 + } 1.1087 + /* 1.1088 + 2. Eliminate the negative lists by doing the following for each non-null negative list: 1.1089 + o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1.1090 + create new ListHeader X 1.1091 + o reverse the list, add to the end of X's positive list. Reset the strength of the 1.1092 + first item you add, based on the stronger strength levels of the two lists. 1.1093 + */ 1.1094 + /* 1.1095 + 3. For each ListHeader with a non-null positive list: 1.1096 + */ 1.1097 + /* 1.1098 + o Find all character strings with CEs between the baseCE and the 1.1099 + next/previous CE, at the strength of the first token. Add these to the 1.1100 + tailoring. 1.1101 + ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1.1102 + tailoring has & x < z... 1.1103 + ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 1.1104 + */ 1.1105 + /* It is possible that this part should be done even while constructing list */ 1.1106 + /* The problem is that it is unknown what is going to be the strongest weight */ 1.1107 + /* So we might as well do it here */ 1.1108 + 1.1109 + /* 1.1110 + o Allocate CEs for each token in the list, based on the total number N of the 1.1111 + largest level difference, and the gap G between baseCE and nextCE at that 1.1112 + level. The relation * between the last item and nextCE is the same as the 1.1113 + strongest strength. 1.1114 + o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1.1115 + ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1.1116 + Then fit b and c into the secondary gap between a and d, then fit q 1.1117 + into the tertiary gap between b and c. 1.1118 + 1.1119 + o Example: baseCE << b <<< q << c * nextCE(X,2) 1.1120 + ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1.1121 + Then fit q into the tertiary gap between b and c. 1.1122 + o When incrementing primary values, we will not cross high byte 1.1123 + boundaries except where there is only a single-byte primary. That is to 1.1124 + ensure that the script reordering will continue to work. 1.1125 + */ 1.1126 + UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1.1127 + /* test for NULL */ 1.1128 + if (image == NULL) { 1.1129 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1130 + return NULL; 1.1131 + } 1.1132 + uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1.1133 + 1.1134 + for(i = 0; i<src->resultLen; i++) { 1.1135 + /* now we need to generate the CEs */ 1.1136 + /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1.1137 + /* According to strength */ 1.1138 + if(U_SUCCESS(*status)) { 1.1139 + if(src->lh[i].first) { // if there are any elements 1.1140 + // due to the way parser works, subsequent tailorings 1.1141 + // may remove all the elements from a sequence, therefore 1.1142 + // leaving an empty tailoring sequence. 1.1143 + ucol_initBuffers(src, &src->lh[i], status); 1.1144 + } 1.1145 + } 1.1146 + if(U_FAILURE(*status)) { 1.1147 + uprv_free(image); 1.1148 + return NULL; 1.1149 + } 1.1150 + } 1.1151 + 1.1152 + if(src->varTop != NULL) { /* stuff the variable top value */ 1.1153 + src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1.1154 + /* remove it from the list */ 1.1155 + if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1.1156 + src->varTop->listHeader->first = src->varTop->next; 1.1157 + } 1.1158 + if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1.1159 + src->varTop->listHeader->last = src->varTop->previous; 1.1160 + } 1.1161 + if(src->varTop->next != NULL) { 1.1162 + src->varTop->next->previous = src->varTop->previous; 1.1163 + } 1.1164 + if(src->varTop->previous != NULL) { 1.1165 + src->varTop->previous->next = src->varTop->next; 1.1166 + } 1.1167 + } 1.1168 + 1.1169 + 1.1170 + tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1.1171 + if(U_FAILURE(*status)) { 1.1172 + uprv_free(image); 1.1173 + return NULL; 1.1174 + } 1.1175 + 1.1176 + 1.1177 + /* After this, we have assigned CE values to all regular CEs */ 1.1178 + /* now we will go through list once more and resolve expansions, */ 1.1179 + /* make UCAElements structs and add them to table */ 1.1180 + for(i = 0; i<src->resultLen; i++) { 1.1181 + /* now we need to generate the CEs */ 1.1182 + /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1.1183 + /* According to strength */ 1.1184 + if(U_SUCCESS(*status)) { 1.1185 + ucol_createElements(src, t, &src->lh[i], status); 1.1186 + } 1.1187 + } 1.1188 + 1.1189 + UCAElements el; 1.1190 + el.isThai = FALSE; 1.1191 + el.prefixSize = 0; 1.1192 + el.prefixChars[0] = 0; 1.1193 + 1.1194 + /* add latin-1 stuff */ 1.1195 + ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1.1196 + 1.1197 + /* add stuff for copying */ 1.1198 + if(src->copySet != NULL) { 1.1199 + int32_t i = 0; 1.1200 + UnicodeSet *set = (UnicodeSet *)src->copySet; 1.1201 + for(i = 0; i < set->getRangeCount(); i++) { 1.1202 + ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); 1.1203 + } 1.1204 + } 1.1205 + 1.1206 + if(U_SUCCESS(*status)) { 1.1207 + /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1.1208 + 1.1209 + uint32_t tailoredCE = UCOL_NOT_FOUND; 1.1210 + UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1.1211 + int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; 1.1212 + UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1.1213 + // Check for null pointer 1.1214 + if (ucaEl == NULL) { 1.1215 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1216 + return NULL; 1.1217 + } 1.1218 + while(*conts != 0) { 1.1219 + // A continuation is NUL-terminated and NUL-padded 1.1220 + // except if it has the maximum length. 1.1221 + int32_t contractionLength = maxUCAContractionLength; 1.1222 + while(contractionLength > 0 && conts[contractionLength - 1] == 0) { 1.1223 + --contractionLength; 1.1224 + } 1.1225 + UChar32 first; 1.1226 + int32_t firstLength = 0; 1.1227 + U16_NEXT(conts, firstLength, contractionLength, first); 1.1228 + tailoredCE = utrie_get32(t->mapping, first, NULL); 1.1229 + if(tailoredCE != UCOL_NOT_FOUND) { 1.1230 + UBool needToAdd = TRUE; 1.1231 + if(isCntTableElement(tailoredCE)) { 1.1232 + if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { 1.1233 + needToAdd = FALSE; 1.1234 + } 1.1235 + } 1.1236 + if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1.1237 + UCAElements elm; 1.1238 + elm.cPoints = el.uchars; 1.1239 + elm.noOfCEs = 0; 1.1240 + elm.uchars[0] = *conts; 1.1241 + elm.uchars[1] = 0; 1.1242 + elm.cSize = 1; 1.1243 + elm.prefixChars[0] = *(conts+2); 1.1244 + elm.isThai = FALSE; 1.1245 + elm.prefix = elm.prefixChars; 1.1246 + elm.prefixSize = 1; 1.1247 + UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1.1248 + if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1.1249 + needToAdd = TRUE; 1.1250 + } 1.1251 + } 1.1252 + if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1.1253 + needToAdd = FALSE; 1.1254 + } 1.1255 + 1.1256 + if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 1.1257 + if (*(conts+1) != 0) { // contractions 1.1258 + el.prefix = el.prefixChars; 1.1259 + el.prefixSize = 0; 1.1260 + el.cPoints = el.uchars; 1.1261 + el.noOfCEs = 0; 1.1262 + u_memcpy(el.uchars, conts, contractionLength); 1.1263 + el.cSize = contractionLength; 1.1264 + ucol_setText(ucaEl, el.uchars, el.cSize, status); 1.1265 + } 1.1266 + else { // pre-context character 1.1267 + UChar str[4] = { 0 }; 1.1268 + int32_t len=0; 1.1269 + int32_t preKeyLen=0; 1.1270 + 1.1271 + el.cPoints = el.uchars; 1.1272 + el.noOfCEs = 0; 1.1273 + el.uchars[0] = *conts; 1.1274 + el.uchars[1] = 0; 1.1275 + el.cSize = 1; 1.1276 + el.prefixChars[0] = *(conts+2); 1.1277 + el.prefix = el.prefixChars; 1.1278 + el.prefixSize = 1; 1.1279 + if (el.prefixChars[0]!=0) { 1.1280 + // get CE of prefix character first 1.1281 + str[0]=el.prefixChars[0]; 1.1282 + str[1]=0; 1.1283 + ucol_setText(ucaEl, str, 1, status); 1.1284 + while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1.1285 + != UCOL_NULLORDER) { 1.1286 + preKeyLen++; // count number of keys for prefix character 1.1287 + } 1.1288 + str[len++] = el.prefixChars[0]; 1.1289 + } 1.1290 + 1.1291 + str[len++] = el.uchars[0]; 1.1292 + str[len]=0; 1.1293 + ucol_setText(ucaEl, str, len, status); 1.1294 + // Skip the keys for prefix character, then copy the rest to el. 1.1295 + while ((preKeyLen-->0) && 1.1296 + (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1.1297 + continue; 1.1298 + } 1.1299 + 1.1300 + } 1.1301 + while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1.1302 + el.noOfCEs++; 1.1303 + } 1.1304 + uprv_uca_addAnElement(t, &el, status); 1.1305 + } 1.1306 + 1.1307 + } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1.1308 + ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); 1.1309 + } 1.1310 + conts+=maxUCAContractionLength; 1.1311 + } 1.1312 + ucol_closeElements(ucaEl); 1.1313 + } 1.1314 + 1.1315 + // Add completely ignorable elements 1.1316 + utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1.1317 + 1.1318 + // add tailoring characters related canonical closures 1.1319 + uprv_uca_canonicalClosure(t, src, NULL, status); 1.1320 + 1.1321 + /* still need to produce compatibility closure */ 1.1322 + 1.1323 + UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1.1324 + 1.1325 + uprv_uca_closeTempTable(t); 1.1326 + uprv_free(image); 1.1327 + 1.1328 + return myData; 1.1329 +} 1.1330 + 1.1331 +U_CDECL_BEGIN 1.1332 +static UBool U_CALLCONV 1.1333 +ucol_bld_cleanup(void) 1.1334 +{ 1.1335 + udata_close(invUCA_DATA_MEM); 1.1336 + invUCA_DATA_MEM = NULL; 1.1337 + _staticInvUCA = NULL; 1.1338 + gStaticInvUCAInitOnce.reset(); 1.1339 + return TRUE; 1.1340 +} 1.1341 +U_CDECL_END 1.1342 + 1.1343 +static void U_CALLCONV initInverseUCA(UErrorCode &status) { 1.1344 + U_ASSERT(invUCA_DATA_MEM == NULL); 1.1345 + U_ASSERT(_staticInvUCA == NULL); 1.1346 + ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1.1347 + InverseUCATableHeader *newInvUCA = NULL; 1.1348 + UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status); 1.1349 + 1.1350 + if(U_FAILURE(status)) { 1.1351 + if (result) { 1.1352 + udata_close(result); 1.1353 + } 1.1354 + // This is not needed, as we are talking about 1.1355 + // memory we got from UData 1.1356 + //uprv_free(newInvUCA); 1.1357 + return; 1.1358 + } 1.1359 + 1.1360 + if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1.1361 + newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1.1362 + UCollator *UCA = ucol_initUCA(&status); 1.1363 + // UCA versions of UCA and inverse UCA should match 1.1364 + if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1.1365 + status = U_INVALID_FORMAT_ERROR; 1.1366 + udata_close(result); 1.1367 + return; 1.1368 + } 1.1369 + 1.1370 + invUCA_DATA_MEM = result; 1.1371 + _staticInvUCA = newInvUCA; 1.1372 + } 1.1373 +} 1.1374 + 1.1375 + 1.1376 +U_CAPI const InverseUCATableHeader * U_EXPORT2 1.1377 +ucol_initInverseUCA(UErrorCode *status) 1.1378 +{ 1.1379 + umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); 1.1380 + return _staticInvUCA; 1.1381 +} 1.1382 + 1.1383 +/* This is the data that is used for non-script reordering codes. These _must_ be kept 1.1384 + * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 1.1385 + */ 1.1386 +static const char * const ReorderingTokenNames[] = { 1.1387 + "SPACE", 1.1388 + "PUNCT", 1.1389 + "SYMBOL", 1.1390 + "CURRENCY", 1.1391 + "DIGIT" 1.1392 +}; 1.1393 + 1.1394 +static void toUpper(const char* src, char* dst, uint32_t length) { 1.1395 + for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { 1.1396 + *dst = uprv_toupper(*src); 1.1397 + } 1.1398 + *dst = '\0'; 1.1399 +} 1.1400 + 1.1401 +U_INTERNAL int32_t U_EXPORT2 1.1402 +ucol_findReorderingEntry(const char* name) { 1.1403 + char buffer[32]; 1.1404 + toUpper(name, buffer, 32); 1.1405 + for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { 1.1406 + if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { 1.1407 + return entry + UCOL_REORDER_CODE_FIRST; 1.1408 + } 1.1409 + } 1.1410 + return USCRIPT_INVALID_CODE; 1.1411 +} 1.1412 + 1.1413 +#endif /* #if !UCONFIG_NO_COLLATION */