intl/icu/source/i18n/ucol_bld.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucol_bld.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created 02/22/2001
michael@0 14 * created by: Vladimir Weinstein
michael@0 15 *
michael@0 16 * This module builds a collator based on the rule set.
michael@0 17 *
michael@0 18 */
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21
michael@0 22 #if !UCONFIG_NO_COLLATION
michael@0 23
michael@0 24 #include "unicode/ucoleitr.h"
michael@0 25 #include "unicode/udata.h"
michael@0 26 #include "unicode/uchar.h"
michael@0 27 #include "unicode/uniset.h"
michael@0 28 #include "unicode/uscript.h"
michael@0 29 #include "unicode/ustring.h"
michael@0 30 #include "unicode/utf16.h"
michael@0 31 #include "normalizer2impl.h"
michael@0 32 #include "uassert.h"
michael@0 33 #include "ucol_bld.h"
michael@0 34 #include "ucol_elm.h"
michael@0 35 #include "ucol_cnt.h"
michael@0 36 #include "ucln_in.h"
michael@0 37 #include "umutex.h"
michael@0 38 #include "cmemory.h"
michael@0 39 #include "cstring.h"
michael@0 40
/* Number of elements in a true array (not a pointer), as int32_t. */
michael@0 41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 42
/*
 * File-scope state, all initially empty. Judging by the names these hold the
 * inverse UCA table header, the UDataMemory it was loaded from, and a
 * one-time-initialization guard; the code that sets them is outside this
 * part of the file -- TODO confirm.
 */
michael@0 43 static const InverseUCATableHeader* _staticInvUCA = NULL;
michael@0 44 static UDataMemory* invUCA_DATA_MEM = NULL;
michael@0 45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
michael@0 46
michael@0 47 U_CDECL_BEGIN
michael@0 48 static UBool U_CALLCONV
michael@0 49 isAcceptableInvUCA(void * /*context*/,
michael@0 50 const char * /*type*/, const char * /*name*/,
michael@0 51 const UDataInfo *pInfo)
michael@0 52 {
michael@0 53 /* context, type & name are intentionally not used */
michael@0 54 if( pInfo->size>=20 &&
michael@0 55 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
michael@0 56 pInfo->charsetFamily==U_CHARSET_FAMILY &&
michael@0 57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
michael@0 58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
michael@0 59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
michael@0 60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
michael@0 61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
michael@0 62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
michael@0 63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
michael@0 64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
michael@0 65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
michael@0 66 )
michael@0 67 {
michael@0 68 // TODO: Check that the invuca data version (pInfo->dataVersion)
michael@0 69 // matches the ucadata version.
michael@0 70 return TRUE;
michael@0 71 } else {
michael@0 72 return FALSE;
michael@0 73 }
michael@0 74 }
michael@0 75 U_CDECL_END
michael@0 76
michael@0 77 /*
michael@0 78 * Takes two CEs (lead and continuation) and
michael@0 79 * compares them as CEs should be compared:
michael@0 80 * primary vs. primary, secondary vs. secondary
michael@0 81 * tertiary vs. tertiary
michael@0 82 */
michael@0 83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
michael@0 84 uint32_t s1 = source0, s2, t1 = target0, t2;
michael@0 85 if(isContinuation(source1)) {
michael@0 86 s2 = source1;
michael@0 87 } else {
michael@0 88 s2 = 0;
michael@0 89 }
michael@0 90 if(isContinuation(target1)) {
michael@0 91 t2 = target1;
michael@0 92 } else {
michael@0 93 t2 = 0;
michael@0 94 }
michael@0 95
michael@0 96 uint32_t s = 0, t = 0;
michael@0 97 if(s1 == t1 && s2 == t2) {
michael@0 98 return 0;
michael@0 99 }
michael@0 100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
michael@0 101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
michael@0 102 if(s < t) {
michael@0 103 return -1;
michael@0 104 } else if(s > t) {
michael@0 105 return 1;
michael@0 106 } else {
michael@0 107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
michael@0 108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
michael@0 109 if(s < t) {
michael@0 110 return -1;
michael@0 111 } else if(s > t) {
michael@0 112 return 1;
michael@0 113 } else {
michael@0 114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
michael@0 115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
michael@0 116 if(s < t) {
michael@0 117 return -1;
michael@0 118 } else {
michael@0 119 return 1;
michael@0 120 }
michael@0 121 }
michael@0 122 }
michael@0 123 }
michael@0 124
michael@0 125 static
michael@0 126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
michael@0 127 uint32_t bottom = 0, top = src->invUCA->tableSize;
michael@0 128 uint32_t i = 0;
michael@0 129 uint32_t first = 0, second = 0;
michael@0 130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 131 int32_t res = 0;
michael@0 132
michael@0 133 while(bottom < top-1) {
michael@0 134 i = (top+bottom)/2;
michael@0 135 first = *(CETable+3*i);
michael@0 136 second = *(CETable+3*i+1);
michael@0 137 res = compareCEs(first, second, CE, SecondCE);
michael@0 138 if(res > 0) {
michael@0 139 top = i;
michael@0 140 } else if(res < 0) {
michael@0 141 bottom = i;
michael@0 142 } else {
michael@0 143 break;
michael@0 144 }
michael@0 145 }
michael@0 146
michael@0 147 /* weiv: */
michael@0 148 /* in searching for elements, I have removed the failure */
michael@0 149 /* The reason for this is that the builder does not rely */
michael@0 150 /* on search mechanism telling it that it didn't find an */
michael@0 151 /* element. However, indirect positioning relies on being */
michael@0 152 /* able to find the elements around any CE, even if it is */
michael@0 153 /* not defined in the UCA. */
michael@0 154 return i;
michael@0 155 /*
michael@0 156 if((first == CE && second == SecondCE)) {
michael@0 157 return i;
michael@0 158 } else {
michael@0 159 return -1;
michael@0 160 }
michael@0 161 */
michael@0 162 }
michael@0 163
/*
 * Bit masks applied to a 32-bit CE before comparing at a given strength.
 * The functions below index it with UCOL_PRIMARY, UCOL_SECONDARY and a third
 * strength value: the first entry keeps the upper 16 bits, the second the
 * upper 24 bits, the third all 32 bits.
 */
michael@0 164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
michael@0 165 0xFFFF0000,
michael@0 166 0xFFFFFF00,
michael@0 167 0xFFFFFFFF
michael@0 168 };
michael@0 169
michael@0 170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
michael@0 171 uint32_t CE, uint32_t contCE,
michael@0 172 uint32_t *nextCE, uint32_t *nextContCE,
michael@0 173 uint32_t strength)
michael@0 174 {
michael@0 175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 176 int32_t iCE;
michael@0 177
michael@0 178 iCE = ucol_inv_findCE(src, CE, contCE);
michael@0 179
michael@0 180 if(iCE<0) {
michael@0 181 *nextCE = UCOL_NOT_FOUND;
michael@0 182 return -1;
michael@0 183 }
michael@0 184
michael@0 185 CE &= strengthMask[strength];
michael@0 186 contCE &= strengthMask[strength];
michael@0 187
michael@0 188 *nextCE = CE;
michael@0 189 *nextContCE = contCE;
michael@0 190
michael@0 191 while((*nextCE & strengthMask[strength]) == CE
michael@0 192 && (*nextContCE & strengthMask[strength]) == contCE)
michael@0 193 {
michael@0 194 *nextCE = (*(CETable+3*(++iCE)));
michael@0 195 *nextContCE = (*(CETable+3*(iCE)+1));
michael@0 196 }
michael@0 197
michael@0 198 return iCE;
michael@0 199 }
michael@0 200
michael@0 201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
michael@0 202 uint32_t CE, uint32_t contCE,
michael@0 203 uint32_t *prevCE, uint32_t *prevContCE,
michael@0 204 uint32_t strength)
michael@0 205 {
michael@0 206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 207 int32_t iCE;
michael@0 208
michael@0 209 iCE = ucol_inv_findCE(src, CE, contCE);
michael@0 210
michael@0 211 if(iCE<0) {
michael@0 212 *prevCE = UCOL_NOT_FOUND;
michael@0 213 return -1;
michael@0 214 }
michael@0 215
michael@0 216 CE &= strengthMask[strength];
michael@0 217 contCE &= strengthMask[strength];
michael@0 218
michael@0 219 *prevCE = CE;
michael@0 220 *prevContCE = contCE;
michael@0 221
michael@0 222 while((*prevCE & strengthMask[strength]) == CE
michael@0 223 && (*prevContCE & strengthMask[strength])== contCE
michael@0 224 && iCE > 0) /* this condition should prevent falling off the edge of the world */
michael@0 225 {
michael@0 226 /* here, we end up in a singularity - zero */
michael@0 227 *prevCE = (*(CETable+3*(--iCE)));
michael@0 228 *prevContCE = (*(CETable+3*(iCE)+1));
michael@0 229 }
michael@0 230
michael@0 231 return iCE;
michael@0 232 }
michael@0 233
michael@0 234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
michael@0 235 uint32_t prevCE, uint32_t prevContCE)
michael@0 236 {
michael@0 237 if(prevCE == CE && prevContCE == contCE) {
michael@0 238 return UCOL_IDENTICAL;
michael@0 239 }
michael@0 240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
michael@0 241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
michael@0 242 {
michael@0 243 return UCOL_PRIMARY;
michael@0 244 }
michael@0 245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
michael@0 246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
michael@0 247 {
michael@0 248 return UCOL_SECONDARY;
michael@0 249 }
michael@0 250 return UCOL_TERTIARY;
michael@0 251 }
michael@0 252
michael@0 253
michael@0 254 /*static
michael@0 255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
michael@0 256
michael@0 257 uint32_t CE = lh->baseCE;
michael@0 258 uint32_t SecondCE = lh->baseContCE;
michael@0 259
michael@0 260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 261 uint32_t previousCE, previousContCE;
michael@0 262 int32_t iCE;
michael@0 263
michael@0 264 iCE = ucol_inv_findCE(src, CE, SecondCE);
michael@0 265
michael@0 266 if(iCE<0) {
michael@0 267 return -1;
michael@0 268 }
michael@0 269
michael@0 270 CE &= strengthMask[strength];
michael@0 271 SecondCE &= strengthMask[strength];
michael@0 272
michael@0 273 previousCE = CE;
michael@0 274 previousContCE = SecondCE;
michael@0 275
michael@0 276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
michael@0 277 previousCE = (*(CETable+3*(--iCE)));
michael@0 278 previousContCE = (*(CETable+3*(iCE)+1));
michael@0 279 }
michael@0 280 lh->previousCE = previousCE;
michael@0 281 lh->previousContCE = previousContCE;
michael@0 282
michael@0 283 return iCE;
michael@0 284 }*/
michael@0 285
michael@0 286 static
michael@0 287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
michael@0 288 uint32_t CE = lh->baseCE;
michael@0 289 uint32_t SecondCE = lh->baseContCE;
michael@0 290
michael@0 291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 292 uint32_t nextCE, nextContCE;
michael@0 293 int32_t iCE;
michael@0 294
michael@0 295 iCE = ucol_inv_findCE(src, CE, SecondCE);
michael@0 296
michael@0 297 if(iCE<0) {
michael@0 298 return -1;
michael@0 299 }
michael@0 300
michael@0 301 CE &= strengthMask[strength];
michael@0 302 SecondCE &= strengthMask[strength];
michael@0 303
michael@0 304 nextCE = CE;
michael@0 305 nextContCE = SecondCE;
michael@0 306
michael@0 307 while((nextCE & strengthMask[strength]) == CE
michael@0 308 && (nextContCE & strengthMask[strength]) == SecondCE)
michael@0 309 {
michael@0 310 nextCE = (*(CETable+3*(++iCE)));
michael@0 311 nextContCE = (*(CETable+3*(iCE)+1));
michael@0 312 }
michael@0 313
michael@0 314 lh->nextCE = nextCE;
michael@0 315 lh->nextContCE = nextContCE;
michael@0 316
michael@0 317 return iCE;
michael@0 318 }
michael@0 319
/*
 * Computes, for one token list, the low and high boundaries (lh->gapsLo /
 * lh->gapsHi) between which new weights can be generated.
 * Both arrays are used as 3x3 tables: row = strength slot, columns =
 * primary, secondary and tertiary parts, each part left-aligned in 32 bits.
 *
 * Three cases are handled:
 *  - the lead byte of lh->baseCE lies in the implicit primary range taken
 *    from the UCA constants: the gap runs from the base CE to the next
 *    implicit primary;
 *  - the list is an indirect reset with a non-zero lh->nextCE: the gap runs
 *    from the base CE to lh->nextCE / lh->nextContCE;
 *  - otherwise the token list is walked and, per strength, the next
 *    differing element is looked up in the inverse UCA table.
 *
 * Sets *status to U_INTERNAL_PROGRAM_ERROR when that lookup fails.
 */
michael@0 320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
michael@0 321 /* reset all the gaps */
michael@0 322 int32_t i = 0;
michael@0 323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 324 uint32_t st = 0;
michael@0 325 uint32_t t1, t2;
michael@0 326 int32_t pos;
michael@0 327
michael@0 328 UColToken *tok = lh->first;
michael@0 329 uint32_t tokStrength = tok->strength;
michael@0 330
michael@0 331 for(i = 0; i<3; i++) {
michael@0 332 lh->gapsHi[3*i] = 0;
michael@0 333 lh->gapsHi[3*i+1] = 0;
michael@0 334 lh->gapsHi[3*i+2] = 0;
michael@0 335 lh->gapsLo[3*i] = 0;
michael@0 336 lh->gapsLo[3*i+1] = 0;
michael@0 337 lh->gapsLo[3*i+2] = 0;
michael@0 338 lh->numStr[i] = 0;
michael@0 339 lh->fStrToken[i] = NULL;
michael@0 340 lh->lStrToken[i] = NULL;
michael@0 341 lh->pos[i] = -1;
michael@0 342 }
michael@0 343
/* The UCA constants block sits at offset UCAConsts inside the UCA image. */
michael@0 344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
michael@0 345
michael@0 346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
michael@0 347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
michael@0 348 lh->pos[0] = 0;
michael@0 349 t1 = lh->baseCE;
michael@0 350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
/* Low boundary: the base CE split into its three parts. */
michael@0 351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
/* High boundary: the implicit primary whose raw value is one greater. */
michael@0 354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
michael@0 355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
michael@0 356
/* Rebuild a lead/continuation pair; the lead gets 05 as secondary and tertiary byte. */
michael@0 357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
michael@0 358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
michael@0 359
michael@0 360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
michael@0 363 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
michael@0 364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
/* Indirect reset: both boundaries are already stored in the list header. */
michael@0 365 lh->pos[0] = 0;
michael@0 366 t1 = lh->baseCE;
michael@0 367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
michael@0 368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
michael@0 371 t1 = lh->nextCE;
michael@0 372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
michael@0 373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
michael@0 376 } else {
/* Walk the token list one run of tokens at a time. */
michael@0 377 for(;;) {
michael@0 378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
/* Record the inverse-table position for this strength and the first token of the run. */
michael@0 379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
michael@0 380 lh->fStrToken[tokStrength] = tok;
michael@0 381 } else { /* The CE must be implicit, since it's not in the table */
michael@0 382 /* Error */
michael@0 383 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 384 }
michael@0 385 }
michael@0 386
/* Skip every following token whose strength value is not smaller, remembering the last one. */
michael@0 387 while(tok != NULL && tok->strength >= tokStrength) {
michael@0 388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
michael@0 389 lh->lStrToken[tokStrength] = tok;
michael@0 390 }
michael@0 391 tok = tok->next;
michael@0 392 }
michael@0 393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
michael@0 394 /* check if previous interval is the same and merge the intervals if it is so */
michael@0 395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
michael@0 396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
michael@0 397 lh->fStrToken[tokStrength+1] = NULL;
michael@0 398 lh->lStrToken[tokStrength+1] = NULL;
michael@0 399 lh->pos[tokStrength+1] = -1;
michael@0 400 }
michael@0 401 }
michael@0 402 if(tok != NULL) {
michael@0 403 tokStrength = tok->strength;
michael@0 404 } else {
michael@0 405 break;
michael@0 406 }
michael@0 407 }
/* For every strength with a recorded position: high boundary from the table row, low boundary from the base CE. */
michael@0 408 for(st = 0; st < 3; st++) {
michael@0 409 if((pos = lh->pos[st]) >= 0) {
michael@0 410 t1 = *(CETable+3*(pos));
michael@0 411 t2 = *(CETable+3*(pos)+1);
michael@0 412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
/* Only the low 6 bits are kept for the tertiary part. */
michael@0 415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
michael@0 416 //pos--;
michael@0 417 //t1 = *(CETable+3*(pos));
michael@0 418 //t2 = *(CETable+3*(pos)+1);
michael@0 419 t1 = lh->baseCE;
michael@0 420 t2 = lh->baseContCE;
michael@0 421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
michael@0 422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
michael@0 423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
michael@0 424 }
michael@0 425 }
michael@0 426 }
michael@0 427 }
michael@0 428
michael@0 429
/*
 * Stores into noOfBytes how many bytes of a left-aligned 32-bit weight are
 * in use: the 1-based position, counted from the most significant byte, of
 * the lowest non-zero byte of value (0 when value is 0).
 * Expands to a braced block; value is evaluated once per mask step.
 */
#define ucol_countBytes(value, noOfBytes) \
{ \
    uint32_t mask; \
    (noOfBytes) = 0; \
    for(mask = 0xFFFFFFFF; mask != 0; mask >>= 8) { \
        if(((value) & mask) != 0) { \
            (noOfBytes)++; \
        } \
    } \
}
michael@0 441
michael@0 442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
michael@0 443 if(U_SUCCESS(*status)) {
michael@0 444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
michael@0 445 }
michael@0 446 return g->current;
michael@0 447 }
michael@0 448
michael@0 449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
michael@0 450 /* TODO: rename to enum names */
michael@0 451 uint32_t high, low, count=1;
michael@0 452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
michael@0 453
michael@0 454 if(strength == UCOL_SECONDARY) {
michael@0 455 low = UCOL_COMMON_TOP2<<24;
michael@0 456 high = 0xFFFFFFFF;
michael@0 457 count = 0xFF - UCOL_COMMON_TOP2;
michael@0 458 } else {
michael@0 459 low = UCOL_BYTE_COMMON << 24; //0x05000000;
michael@0 460 high = 0x40000000;
michael@0 461 count = 0x40 - UCOL_BYTE_COMMON;
michael@0 462 }
michael@0 463
michael@0 464 if(tok->next != NULL && tok->next->strength == strength) {
michael@0 465 count = tok->next->toInsert;
michael@0 466 }
michael@0 467
michael@0 468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
michael@0 469 g->current = UCOL_BYTE_COMMON<<24;
michael@0 470
michael@0 471 if(g->noOfRanges == 0) {
michael@0 472 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 473 }
michael@0 474 return g->current;
michael@0 475 }
michael@0 476
michael@0 477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
michael@0 478 uint32_t strength = tok->strength;
michael@0 479 uint32_t low = lows[fStrength*3+strength];
michael@0 480 uint32_t high = highs[fStrength*3+strength];
michael@0 481 uint32_t maxByte = 0;
michael@0 482 if(strength == UCOL_TERTIARY) {
michael@0 483 maxByte = 0x3F;
michael@0 484 } else if(strength == UCOL_PRIMARY) {
michael@0 485 maxByte = 0xFE;
michael@0 486 } else {
michael@0 487 maxByte = 0xFF;
michael@0 488 }
michael@0 489
michael@0 490 uint32_t count = tok->toInsert;
michael@0 491
michael@0 492 if(low >= high && strength > UCOL_PRIMARY) {
michael@0 493 int32_t s = strength;
michael@0 494 for(;;) {
michael@0 495 s--;
michael@0 496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
michael@0 497 if(strength == UCOL_SECONDARY) {
michael@0 498 if (low < UCOL_COMMON_TOP2<<24 ) {
michael@0 499 // Override if low range is less than UCOL_COMMON_TOP2.
michael@0 500 low = UCOL_COMMON_TOP2<<24;
michael@0 501 }
michael@0 502 high = 0xFFFFFFFF;
michael@0 503 } else {
michael@0 504 // Override if low range is less than UCOL_COMMON_BOT3.
michael@0 505 if ( low < UCOL_COMMON_BOT3<<24 ) {
michael@0 506 low = UCOL_COMMON_BOT3<<24;
michael@0 507 }
michael@0 508 high = 0x40000000;
michael@0 509 }
michael@0 510 break;
michael@0 511 }
michael@0 512 if(s<0) {
michael@0 513 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 514 return 0;
michael@0 515 }
michael@0 516 }
michael@0 517 }
michael@0 518
michael@0 519 if(low < 0x02000000) {
michael@0 520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
michael@0 521 // See http://site.icu-project.org/design/collation/bytes
michael@0 522 low = 0x02000000;
michael@0 523 }
michael@0 524
michael@0 525 if(strength == UCOL_SECONDARY) { /* similar as simple */
michael@0 526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
michael@0 527 low = UCOL_COMMON_TOP2<<24;
michael@0 528 }
michael@0 529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
michael@0 530 high = UCOL_COMMON_TOP2<<24;
michael@0 531 }
michael@0 532 if(low < (UCOL_COMMON_BOT2<<24)) {
michael@0 533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
michael@0 534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
michael@0 535 //g->current = UCOL_COMMON_BOT2<<24;
michael@0 536 return g->current;
michael@0 537 }
michael@0 538 }
michael@0 539
michael@0 540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
michael@0 541 if(g->noOfRanges == 0) {
michael@0 542 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 543 }
michael@0 544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
michael@0 545 return g->current;
michael@0 546 }
michael@0 547
michael@0 548 static
michael@0 549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
michael@0 550 uint32_t i = 0;
michael@0 551 UChar c;
michael@0 552
michael@0 553 if(U_FAILURE(*status)) {
michael@0 554 return 0;
michael@0 555 }
michael@0 556
michael@0 557 if(sourceLen > resLen) {
michael@0 558 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 559 return 0;
michael@0 560 }
michael@0 561
michael@0 562 for(i = 0; i < sourceLen; i++) {
michael@0 563 c = source[i];
michael@0 564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
michael@0 565 switch(c - 0x3000) {
michael@0 566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
michael@0 567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
michael@0 568 c++;
michael@0 569 break;
michael@0 570 case 0xF5:
michael@0 571 c = 0x30AB;
michael@0 572 break;
michael@0 573 case 0xF6:
michael@0 574 c = 0x30B1;
michael@0 575 break;
michael@0 576 }
michael@0 577 }
michael@0 578 resBuf[i] = c;
michael@0 579 }
michael@0 580 return sourceLen;
michael@0 581 }
michael@0 582
michael@0 583 static
michael@0 584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
michael@0 585 uint32_t i = 0;
michael@0 586 UChar c;
michael@0 587
michael@0 588 if(U_FAILURE(*status)) {
michael@0 589 return 0;
michael@0 590 }
michael@0 591
michael@0 592 if(sourceLen > resLen) {
michael@0 593 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 594 return 0;
michael@0 595 }
michael@0 596
michael@0 597 for(i = 0; i < sourceLen; i++) {
michael@0 598 c = source[i];
michael@0 599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
michael@0 600 switch(c - 0x3000) {
michael@0 601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
michael@0 602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
michael@0 603 c--;
michael@0 604 break;
michael@0 605 case 0xAB:
michael@0 606 c = 0x30F5;
michael@0 607 break;
michael@0 608 case 0xB1:
michael@0 609 c = 0x30F6;
michael@0 610 break;
michael@0 611 }
michael@0 612 }
michael@0 613 resBuf[i] = c;
michael@0 614 }
michael@0 615 return sourceLen;
michael@0 616 }
michael@0 617
michael@0 618 U_NAMESPACE_BEGIN
michael@0 619
michael@0 620 static
michael@0 621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
michael@0 622 uint32_t i = 0;
michael@0 623 UChar n[128];
michael@0 624 uint32_t nLen = 0;
michael@0 625 uint32_t uCount = 0, lCount = 0;
michael@0 626
michael@0 627 collIterate s;
michael@0 628 uint32_t order = 0;
michael@0 629
michael@0 630 if(U_FAILURE(*status)) {
michael@0 631 return UCOL_LOWER_CASE;
michael@0 632 }
michael@0 633
michael@0 634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
michael@0 635 if(U_SUCCESS(*status)) {
michael@0 636 for(i = 0; i < nLen; i++) {
michael@0 637 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
michael@0 638 order = ucol_getNextCE(UCA, &s, status);
michael@0 639 if(isContinuation(order)) {
michael@0 640 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 641 return UCOL_LOWER_CASE;
michael@0 642 }
michael@0 643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
michael@0 644 uCount++;
michael@0 645 } else {
michael@0 646 if(u_islower(n[i])) {
michael@0 647 lCount++;
michael@0 648 } else if(U_SUCCESS(*status)) {
michael@0 649 UChar sk[1], lk[1];
michael@0 650 u_toSmallKana(&n[i], 1, sk, 1, status);
michael@0 651 u_toLargeKana(&n[i], 1, lk, 1, status);
michael@0 652 if(sk[0] == n[i] && lk[0] != n[i]) {
michael@0 653 lCount++;
michael@0 654 }
michael@0 655 }
michael@0 656 }
michael@0 657 }
michael@0 658 }
michael@0 659
michael@0 660 if(uCount != 0 && lCount != 0) {
michael@0 661 return UCOL_MIXED_CASE;
michael@0 662 } else if(uCount != 0) {
michael@0 663 return UCOL_UPPER_CASE;
michael@0 664 } else {
michael@0 665 return UCOL_LOWER_CASE;
michael@0 666 }
michael@0 667 }
michael@0 668
michael@0 669
/*
 * Packs the three left-aligned weight parts in CEparts (primary, secondary,
 * tertiary) into the 32-bit CEs of tok and then sets the case bits of the
 * first CE.
 *
 * Each CE takes two primary bytes, one secondary byte and the low 6 bits of
 * one tertiary byte; every CE after the first carries the continuation
 * marker. When all parts are empty the token gets a single CE of 0.
 *
 * src supplies the rule source text and the UCA collator used for the case
 * bits. status is only passed on to the case-bit helpers.
 */
michael@0 670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
michael@0 671 /* this one makes the table and stuff */
michael@0 672 uint32_t noOfBytes[3];
michael@0 673 uint32_t i;
michael@0 674
/* Number of bytes in use in each of the three parts. */
michael@0 675 for(i = 0; i<3; i++) {
michael@0 676 ucol_countBytes(CEparts[i], noOfBytes[i]);
michael@0 677 }
michael@0 678
michael@0 679 /* Here we have to pack CEs from parts */
michael@0 680
michael@0 681 uint32_t CEi = 0;
michael@0 682 uint32_t value = 0;
michael@0 683
/* Keep emitting CEs while any part still has bytes left; primaries use two bytes per CE. */
michael@0 684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
michael@0 685 if(CEi > 0) {
michael@0 686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
michael@0 687 } else {
michael@0 688 value = 0;
michael@0 689 }
michael@0 690
michael@0 691 if(2*CEi<noOfBytes[0]) {
michael@0 692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
michael@0 693 }
michael@0 694 if(CEi<noOfBytes[1]) {
michael@0 695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
michael@0 696 }
michael@0 697 if(CEi<noOfBytes[2]) {
michael@0 698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
michael@0 699 }
michael@0 700 tok->CEs[CEi] = value;
michael@0 701 CEi++;
michael@0 702 }
michael@0 703 if(CEi == 0) { /* totally ignorable */
michael@0 704 tok->noOfCEs = 1;
michael@0 705 tok->CEs[0] = 0;
michael@0 706 } else { /* there is at least something */
michael@0 707 tok->noOfCEs = CEi;
michael@0 708 }
michael@0 709
michael@0 710
michael@0 711 // we want to set case bits here and now, not later.
michael@0 712 // Case bits handling
michael@0 713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
michael@0 714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
/* tok->source holds the length in its top 8 bits and the offset into src->source in the low 24 bits. */
michael@0 715 int32_t cSize = (tok->source & 0xFF000000) >> 24;
michael@0 716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
michael@0 717
michael@0 718 if(cSize > 1) {
michael@0 719 // Do it manually
michael@0 720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
michael@0 721 } else {
michael@0 722 // Copy it from the UCA
michael@0 723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
michael@0 724 tok->CEs[0] |= (caseCE & 0xC0);
michael@0 725 }
michael@0 726 }
michael@0 727
/* Debug-only dump of the parts and the resulting CEs. */
michael@0 728 #if UCOL_DEBUG==2
michael@0 729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
michael@0 730 for(i = 0; i<tok->noOfCEs; i++) {
michael@0 731 fprintf(stderr, "%08X ", tok->CEs[i]);
michael@0 732 }
michael@0 733 fprintf(stderr, "\n");
michael@0 734 #endif
michael@0 735 }
michael@0 736
michael@0 737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
michael@0 738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
michael@0 739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
michael@0 740
michael@0 741 UColToken *tok = lh->last;
michael@0 742 uint32_t t[UCOL_STRENGTH_LIMIT];
michael@0 743
michael@0 744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
michael@0 745
michael@0 746 /* must initialize ranges to avoid memory check warnings */
michael@0 747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
michael@0 748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
michael@0 749 }
michael@0 750
michael@0 751 tok->toInsert = 1;
michael@0 752 t[tok->strength] = 1;
michael@0 753
michael@0 754 while(tok->previous != NULL) {
michael@0 755 if(tok->previous->strength < tok->strength) { /* going up */
michael@0 756 t[tok->strength] = 0;
michael@0 757 t[tok->previous->strength]++;
michael@0 758 } else if(tok->previous->strength > tok->strength) { /* going down */
michael@0 759 t[tok->previous->strength] = 1;
michael@0 760 } else {
michael@0 761 t[tok->strength]++;
michael@0 762 }
michael@0 763 tok=tok->previous;
michael@0 764 tok->toInsert = t[tok->strength];
michael@0 765 }
michael@0 766
michael@0 767 tok->toInsert = t[tok->strength];
michael@0 768 ucol_inv_getGapPositions(src, lh, status);
michael@0 769
michael@0 770 #if UCOL_DEBUG
michael@0 771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
michael@0 772 int32_t j = 2;
michael@0 773 for(j = 2; j >= 0; j--) {
michael@0 774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
michael@0 775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
michael@0 776 }
michael@0 777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
michael@0 778
michael@0 779 do {
michael@0 780 fprintf(stderr,"%i", tok->strength);
michael@0 781 tok = tok->next;
michael@0 782 } while(tok != NULL);
michael@0 783 fprintf(stderr, "\n");
michael@0 784
michael@0 785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
michael@0 786
michael@0 787 do {
michael@0 788 fprintf(stderr,"%i", tok->toInsert);
michael@0 789 tok = tok->next;
michael@0 790 } while(tok != NULL);
michael@0 791 #endif
michael@0 792
michael@0 793 tok = lh->first;
michael@0 794 uint32_t fStrength = UCOL_IDENTICAL;
michael@0 795 uint32_t initStrength = UCOL_IDENTICAL;
michael@0 796
michael@0 797
michael@0 798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
michael@0 799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
michael@0 800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
michael@0 801
michael@0 802 while (tok != NULL && U_SUCCESS(*status)) {
michael@0 803 fStrength = tok->strength;
michael@0 804 if(fStrength < initStrength) {
michael@0 805 initStrength = fStrength;
michael@0 806 if(lh->pos[fStrength] == -1) {
michael@0 807 while(lh->pos[fStrength] == -1 && fStrength > 0) {
michael@0 808 fStrength--;
michael@0 809 }
michael@0 810 if(lh->pos[fStrength] == -1) {
michael@0 811 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 812 return;
michael@0 813 }
michael@0 814 }
michael@0 815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
michael@0 816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
michael@0 817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
michael@0 818 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
michael@0 819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
michael@0 820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
michael@0 821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
michael@0 822 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
michael@0 823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
michael@0 824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
michael@0 825 } else { /* primaries */
michael@0 826 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
michael@0 827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
michael@0 828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
michael@0 829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
michael@0 830 }
michael@0 831 } else {
michael@0 832 if(tok->strength == UCOL_TERTIARY) {
michael@0 833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
michael@0 834 } else if(tok->strength == UCOL_SECONDARY) {
michael@0 835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
michael@0 836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
michael@0 837 } else if(tok->strength == UCOL_PRIMARY) {
michael@0 838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
michael@0 839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
michael@0 840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
michael@0 841 }
michael@0 842 }
michael@0 843 ucol_doCE(src, CEparts, tok, status);
michael@0 844 tok = tok->next;
michael@0 845 }
michael@0 846 }
michael@0 847
michael@0 848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
michael@0 849 UCAElements el;
michael@0 850 UColToken *tok = lh->first;
michael@0 851 UColToken *expt = NULL;
michael@0 852 uint32_t i = 0, j = 0;
michael@0 853 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
michael@0 854
michael@0 855 while(tok != NULL && U_SUCCESS(*status)) {
michael@0 856 /* first, check if there are any expansions */
michael@0 857 /* if there are expansions, we need to do a little bit more processing */
michael@0 858 /* since parts of expansion can be tailored, while others are not */
michael@0 859 if(tok->expansion != 0) {
michael@0 860 uint32_t len = tok->expansion >> 24;
michael@0 861 uint32_t currentSequenceLen = len;
michael@0 862 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
michael@0 863 //uint32_t exp = currentSequenceLen | expOffset;
michael@0 864 UColToken exp;
michael@0 865 exp.source = currentSequenceLen | expOffset;
michael@0 866 exp.rulesToParseHdl = &(src->source);
michael@0 867
michael@0 868 while(len > 0) {
michael@0 869 currentSequenceLen = len;
michael@0 870 while(currentSequenceLen > 0) {
michael@0 871 exp.source = (currentSequenceLen << 24) | expOffset;
michael@0 872 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
michael@0 873 uint32_t noOfCEsToCopy = expt->noOfCEs;
michael@0 874 for(j = 0; j<noOfCEsToCopy; j++) {
michael@0 875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
michael@0 876 }
michael@0 877 tok->noOfExpCEs += noOfCEsToCopy;
michael@0 878 // Smart people never try to add codepoints and CEs.
michael@0 879 // For some odd reason, it won't work.
michael@0 880 expOffset += currentSequenceLen; //noOfCEsToCopy;
michael@0 881 len -= currentSequenceLen; //noOfCEsToCopy;
michael@0 882 break;
michael@0 883 } else {
michael@0 884 currentSequenceLen--;
michael@0 885 }
michael@0 886 }
michael@0 887 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
michael@0 888 /* will have to get one from UCA */
michael@0 889 /* first, get the UChars from the rules */
michael@0 890 /* then pick CEs out until there is no more and stuff them into expansion */
michael@0 891 collIterate s;
michael@0 892 uint32_t order = 0;
michael@0 893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
michael@0 894
michael@0 895 for(;;) {
michael@0 896 order = ucol_getNextCE(src->UCA, &s, status);
michael@0 897 if(order == UCOL_NO_MORE_CES) {
michael@0 898 break;
michael@0 899 }
michael@0 900 tok->expCEs[tok->noOfExpCEs++] = order;
michael@0 901 }
michael@0 902 expOffset++;
michael@0 903 len--;
michael@0 904 }
michael@0 905 }
michael@0 906 } else {
michael@0 907 tok->noOfExpCEs = 0;
michael@0 908 }
michael@0 909
michael@0 910 /* set the ucaelement with obtained values */
michael@0 911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
michael@0 912 /* copy CEs */
michael@0 913 for(i = 0; i<tok->noOfCEs; i++) {
michael@0 914 el.CEs[i] = tok->CEs[i];
michael@0 915 }
michael@0 916 for(i = 0; i<tok->noOfExpCEs; i++) {
michael@0 917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
michael@0 918 }
michael@0 919
michael@0 920 /* copy UChars */
michael@0 921 // We kept prefix and source kind of together, as it is a kind of a contraction.
michael@0 922 // However, now we have to slice the prefix off the main thing -
michael@0 923 el.prefix = el.prefixChars;
michael@0 924 el.cPoints = el.uchars;
michael@0 925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
michael@0 926 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
michael@0 927 // decomposed elements to the unsaf table.
michael@0 928 el.prefixSize = tok->prefix>>24;
michael@0 929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
michael@0 930
michael@0 931 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
michael@0 932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
michael@0 933 } else {
michael@0 934 el.prefixSize = 0;
michael@0 935 *el.prefix = 0;
michael@0 936
michael@0 937 el.cSize = (tok->source >> 24);
michael@0 938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
michael@0 939 }
michael@0 940 if(src->UCA != NULL) {
michael@0 941 for(i = 0; i<el.cSize; i++) {
michael@0 942 if(UCOL_ISJAMO(el.cPoints[i])) {
michael@0 943 t->image->jamoSpecial = TRUE;
michael@0 944 }
michael@0 945 }
michael@0 946 if (!src->buildCCTabFlag && el.cSize > 0) {
michael@0 947 // Check the trailing canonical combining class (tccc) of the last character.
michael@0 948 const UChar *s = el.cPoints + el.cSize;
michael@0 949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
michael@0 950 if ((fcd & 0xff) != 0) {
michael@0 951 src->buildCCTabFlag = TRUE;
michael@0 952 }
michael@0 953 }
michael@0 954 }
michael@0 955
michael@0 956 /* and then, add it */
michael@0 957 #if UCOL_DEBUG==2
michael@0 958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
michael@0 959 #endif
michael@0 960 uprv_uca_addAnElement(t, &el, status);
michael@0 961
michael@0 962 #if UCOL_DEBUG_DUPLICATES
michael@0 963 if(*status != U_ZERO_ERROR) {
michael@0 964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
michael@0 965 *status = U_ZERO_ERROR;
michael@0 966 }
michael@0 967 #endif
michael@0 968
michael@0 969 tok = tok->next;
michael@0 970 }
michael@0 971 }
michael@0 972
michael@0 973 U_CDECL_BEGIN
michael@0 974 static UBool U_CALLCONV
michael@0 975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
michael@0 976 UErrorCode status = U_ZERO_ERROR;
michael@0 977 tempUCATable *t = (tempUCATable *)context;
michael@0 978 if(value == 0) {
michael@0 979 while(start < limit) {
michael@0 980 uint32_t CE = utrie_get32(t->mapping, start, NULL);
michael@0 981 if(CE == UCOL_NOT_FOUND) {
michael@0 982 UCAElements el;
michael@0 983 el.isThai = FALSE;
michael@0 984 el.prefixSize = 0;
michael@0 985 el.prefixChars[0] = 0;
michael@0 986 el.prefix = el.prefixChars;
michael@0 987 el.cPoints = el.uchars;
michael@0 988
michael@0 989 el.cSize = 0;
michael@0 990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
michael@0 991
michael@0 992 el.noOfCEs = 1;
michael@0 993 el.CEs[0] = 0;
michael@0 994 uprv_uca_addAnElement(t, &el, &status);
michael@0 995
michael@0 996 }
michael@0 997 start++;
michael@0 998 }
michael@0 999 }
michael@0 1000 if(U_FAILURE(status)) {
michael@0 1001 return FALSE;
michael@0 1002 } else {
michael@0 1003 return TRUE;
michael@0 1004 }
michael@0 1005 }
michael@0 1006 U_CDECL_END
michael@0 1007
michael@0 1008 static void
michael@0 1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
michael@0 1010 UChar32 start, UChar32 end,
michael@0 1011 UErrorCode *status)
michael@0 1012 {
michael@0 1013 //UChar decomp[256];
michael@0 1014 uint32_t CE = UCOL_NOT_FOUND;
michael@0 1015 UChar32 u = 0;
michael@0 1016 UCAElements el;
michael@0 1017 el.isThai = FALSE;
michael@0 1018 el.prefixSize = 0;
michael@0 1019 el.prefixChars[0] = 0;
michael@0 1020 collIterate colIt;
michael@0 1021
michael@0 1022 if(U_SUCCESS(*status)) {
michael@0 1023 for(u = start; u<=end; u++) {
michael@0 1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
michael@0 1025 /* this test is for contractions that are missing the starting element. */
michael@0 1026 || ((isCntTableElement(CE)) &&
michael@0 1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
michael@0 1028 )
michael@0 1029 {
michael@0 1030 el.cSize = 0;
michael@0 1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
michael@0 1032 //decomp[0] = (UChar)u;
michael@0 1033 //el.uchars[0] = (UChar)u;
michael@0 1034 el.cPoints = el.uchars;
michael@0 1035 //el.cSize = 1;
michael@0 1036 el.noOfCEs = 0;
michael@0 1037 el.prefix = el.prefixChars;
michael@0 1038 el.prefixSize = 0;
michael@0 1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
michael@0 1040 // We actually want to check whether this element is a special
michael@0 1041 // If it is an implicit element (hangul, CJK - we want to copy the
michael@0 1042 // special, not the resolved CEs) - for hangul, copying resolved
michael@0 1043 // would just make things the same (there is an expansion and it
michael@0 1044 // takes approximately the same amount of time to resolve as
michael@0 1045 // falling back to the UCA).
michael@0 1046 /*
michael@0 1047 UTRIE_GET32(src->UCA->mapping, u, CE);
michael@0 1048 tag = getCETag(CE);
michael@0 1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
michael@0 1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
michael@0 1051 || tag == LEAD_SURROGATE_TAG) {
michael@0 1052 el.CEs[el.noOfCEs++] = CE;
michael@0 1053 } else {
michael@0 1054 */
michael@0 1055 // It turns out that it does not make sense to keep implicits
michael@0 1056 // unresolved. The cost of resolving them is big enough so that
michael@0 1057 // it doesn't make any difference whether we have to go to the UCA
michael@0 1058 // or not.
michael@0 1059 {
michael@0 1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
michael@0 1061 while(CE != UCOL_NO_MORE_CES) {
michael@0 1062 CE = ucol_getNextCE(src->UCA, &colIt, status);
michael@0 1063 if(CE != UCOL_NO_MORE_CES) {
michael@0 1064 el.CEs[el.noOfCEs++] = CE;
michael@0 1065 }
michael@0 1066 }
michael@0 1067 }
michael@0 1068 uprv_uca_addAnElement(t, &el, status);
michael@0 1069 }
michael@0 1070 }
michael@0 1071 }
michael@0 1072 }
michael@0 1073
michael@0 1074 U_NAMESPACE_END
michael@0 1075
michael@0 1076 U_CFUNC UCATableHeader *
michael@0 1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
michael@0 1078 U_NAMESPACE_USE
michael@0 1079
michael@0 1080 uint32_t i = 0;
michael@0 1081 if(U_FAILURE(*status)) {
michael@0 1082 return NULL;
michael@0 1083 }
michael@0 1084 /*
michael@0 1085 2. Eliminate the negative lists by doing the following for each non-null negative list:
michael@0 1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
michael@0 1087 create new ListHeader X
michael@0 1088 o reverse the list, add to the end of X's positive list. Reset the strength of the
michael@0 1089 first item you add, based on the stronger strength levels of the two lists.
michael@0 1090 */
michael@0 1091 /*
michael@0 1092 3. For each ListHeader with a non-null positive list:
michael@0 1093 */
michael@0 1094 /*
michael@0 1095 o Find all character strings with CEs between the baseCE and the
michael@0 1096 next/previous CE, at the strength of the first token. Add these to the
michael@0 1097 tailoring.
michael@0 1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
michael@0 1099 tailoring has & x < z...
michael@0 1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
michael@0 1101 */
michael@0 1102 /* It is possible that this part should be done even while constructing list */
michael@0 1103 /* The problem is that it is unknown what is going to be the strongest weight */
michael@0 1104 /* So we might as well do it here */
michael@0 1105
michael@0 1106 /*
michael@0 1107 o Allocate CEs for each token in the list, based on the total number N of the
michael@0 1108 largest level difference, and the gap G between baseCE and nextCE at that
michael@0 1109 level. The relation * between the last item and nextCE is the same as the
michael@0 1110 strongest strength.
michael@0 1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
michael@0 1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
michael@0 1113 Then fit b and c into the secondary gap between a and d, then fit q
michael@0 1114 into the tertiary gap between b and c.
michael@0 1115
michael@0 1116 o Example: baseCE << b <<< q << c * nextCE(X,2)
michael@0 1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
michael@0 1118 Then fit q into the tertiary gap between b and c.
michael@0 1119 o When incrementing primary values, we will not cross high byte
michael@0 1120 boundaries except where there is only a single-byte primary. That is to
michael@0 1121 ensure that the script reordering will continue to work.
michael@0 1122 */
michael@0 1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
michael@0 1124 /* test for NULL */
michael@0 1125 if (image == NULL) {
michael@0 1126 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1127 return NULL;
michael@0 1128 }
michael@0 1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
michael@0 1130
michael@0 1131 for(i = 0; i<src->resultLen; i++) {
michael@0 1132 /* now we need to generate the CEs */
michael@0 1133 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
michael@0 1134 /* According to strength */
michael@0 1135 if(U_SUCCESS(*status)) {
michael@0 1136 if(src->lh[i].first) { // if there are any elements
michael@0 1137 // due to the way parser works, subsequent tailorings
michael@0 1138 // may remove all the elements from a sequence, therefore
michael@0 1139 // leaving an empty tailoring sequence.
michael@0 1140 ucol_initBuffers(src, &src->lh[i], status);
michael@0 1141 }
michael@0 1142 }
michael@0 1143 if(U_FAILURE(*status)) {
michael@0 1144 uprv_free(image);
michael@0 1145 return NULL;
michael@0 1146 }
michael@0 1147 }
michael@0 1148
michael@0 1149 if(src->varTop != NULL) { /* stuff the variable top value */
michael@0 1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
michael@0 1151 /* remove it from the list */
michael@0 1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
michael@0 1153 src->varTop->listHeader->first = src->varTop->next;
michael@0 1154 }
michael@0 1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
michael@0 1156 src->varTop->listHeader->last = src->varTop->previous;
michael@0 1157 }
michael@0 1158 if(src->varTop->next != NULL) {
michael@0 1159 src->varTop->next->previous = src->varTop->previous;
michael@0 1160 }
michael@0 1161 if(src->varTop->previous != NULL) {
michael@0 1162 src->varTop->previous->next = src->varTop->next;
michael@0 1163 }
michael@0 1164 }
michael@0 1165
michael@0 1166
michael@0 1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
michael@0 1168 if(U_FAILURE(*status)) {
michael@0 1169 uprv_free(image);
michael@0 1170 return NULL;
michael@0 1171 }
michael@0 1172
michael@0 1173
michael@0 1174 /* After this, we have assigned CE values to all regular CEs */
michael@0 1175 /* now we will go through list once more and resolve expansions, */
michael@0 1176 /* make UCAElements structs and add them to table */
michael@0 1177 for(i = 0; i<src->resultLen; i++) {
michael@0 1178 /* now we need to generate the CEs */
michael@0 1179 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
michael@0 1180 /* According to strength */
michael@0 1181 if(U_SUCCESS(*status)) {
michael@0 1182 ucol_createElements(src, t, &src->lh[i], status);
michael@0 1183 }
michael@0 1184 }
michael@0 1185
michael@0 1186 UCAElements el;
michael@0 1187 el.isThai = FALSE;
michael@0 1188 el.prefixSize = 0;
michael@0 1189 el.prefixChars[0] = 0;
michael@0 1190
michael@0 1191 /* add latin-1 stuff */
michael@0 1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
michael@0 1193
michael@0 1194 /* add stuff for copying */
michael@0 1195 if(src->copySet != NULL) {
michael@0 1196 int32_t i = 0;
michael@0 1197 UnicodeSet *set = (UnicodeSet *)src->copySet;
michael@0 1198 for(i = 0; i < set->getRangeCount(); i++) {
michael@0 1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
michael@0 1200 }
michael@0 1201 }
michael@0 1202
michael@0 1203 if(U_SUCCESS(*status)) {
michael@0 1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
michael@0 1205
michael@0 1206 uint32_t tailoredCE = UCOL_NOT_FOUND;
michael@0 1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
michael@0 1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
michael@0 1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
michael@0 1210 // Check for null pointer
michael@0 1211 if (ucaEl == NULL) {
michael@0 1212 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1213 return NULL;
michael@0 1214 }
michael@0 1215 while(*conts != 0) {
michael@0 1216 // A continuation is NUL-terminated and NUL-padded
michael@0 1217 // except if it has the maximum length.
michael@0 1218 int32_t contractionLength = maxUCAContractionLength;
michael@0 1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
michael@0 1220 --contractionLength;
michael@0 1221 }
michael@0 1222 UChar32 first;
michael@0 1223 int32_t firstLength = 0;
michael@0 1224 U16_NEXT(conts, firstLength, contractionLength, first);
michael@0 1225 tailoredCE = utrie_get32(t->mapping, first, NULL);
michael@0 1226 if(tailoredCE != UCOL_NOT_FOUND) {
michael@0 1227 UBool needToAdd = TRUE;
michael@0 1228 if(isCntTableElement(tailoredCE)) {
michael@0 1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
michael@0 1230 needToAdd = FALSE;
michael@0 1231 }
michael@0 1232 }
michael@0 1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
michael@0 1234 UCAElements elm;
michael@0 1235 elm.cPoints = el.uchars;
michael@0 1236 elm.noOfCEs = 0;
michael@0 1237 elm.uchars[0] = *conts;
michael@0 1238 elm.uchars[1] = 0;
michael@0 1239 elm.cSize = 1;
michael@0 1240 elm.prefixChars[0] = *(conts+2);
michael@0 1241 elm.isThai = FALSE;
michael@0 1242 elm.prefix = elm.prefixChars;
michael@0 1243 elm.prefixSize = 1;
michael@0 1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
michael@0 1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
michael@0 1246 needToAdd = TRUE;
michael@0 1247 }
michael@0 1248 }
michael@0 1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
michael@0 1250 needToAdd = FALSE;
michael@0 1251 }
michael@0 1252
michael@0 1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
michael@0 1254 if (*(conts+1) != 0) { // contractions
michael@0 1255 el.prefix = el.prefixChars;
michael@0 1256 el.prefixSize = 0;
michael@0 1257 el.cPoints = el.uchars;
michael@0 1258 el.noOfCEs = 0;
michael@0 1259 u_memcpy(el.uchars, conts, contractionLength);
michael@0 1260 el.cSize = contractionLength;
michael@0 1261 ucol_setText(ucaEl, el.uchars, el.cSize, status);
michael@0 1262 }
michael@0 1263 else { // pre-context character
michael@0 1264 UChar str[4] = { 0 };
michael@0 1265 int32_t len=0;
michael@0 1266 int32_t preKeyLen=0;
michael@0 1267
michael@0 1268 el.cPoints = el.uchars;
michael@0 1269 el.noOfCEs = 0;
michael@0 1270 el.uchars[0] = *conts;
michael@0 1271 el.uchars[1] = 0;
michael@0 1272 el.cSize = 1;
michael@0 1273 el.prefixChars[0] = *(conts+2);
michael@0 1274 el.prefix = el.prefixChars;
michael@0 1275 el.prefixSize = 1;
michael@0 1276 if (el.prefixChars[0]!=0) {
michael@0 1277 // get CE of prefix character first
michael@0 1278 str[0]=el.prefixChars[0];
michael@0 1279 str[1]=0;
michael@0 1280 ucol_setText(ucaEl, str, 1, status);
michael@0 1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
michael@0 1282 != UCOL_NULLORDER) {
michael@0 1283 preKeyLen++; // count number of keys for prefix character
michael@0 1284 }
michael@0 1285 str[len++] = el.prefixChars[0];
michael@0 1286 }
michael@0 1287
michael@0 1288 str[len++] = el.uchars[0];
michael@0 1289 str[len]=0;
michael@0 1290 ucol_setText(ucaEl, str, len, status);
michael@0 1291 // Skip the keys for prefix character, then copy the rest to el.
michael@0 1292 while ((preKeyLen-->0) &&
michael@0 1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
michael@0 1294 continue;
michael@0 1295 }
michael@0 1296
michael@0 1297 }
michael@0 1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
michael@0 1299 el.noOfCEs++;
michael@0 1300 }
michael@0 1301 uprv_uca_addAnElement(t, &el, status);
michael@0 1302 }
michael@0 1303
michael@0 1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
michael@0 1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
michael@0 1306 }
michael@0 1307 conts+=maxUCAContractionLength;
michael@0 1308 }
michael@0 1309 ucol_closeElements(ucaEl);
michael@0 1310 }
michael@0 1311
michael@0 1312 // Add completely ignorable elements
michael@0 1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
michael@0 1314
michael@0 1315 // add tailoring characters related canonical closures
michael@0 1316 uprv_uca_canonicalClosure(t, src, NULL, status);
michael@0 1317
michael@0 1318 /* still need to produce compatibility closure */
michael@0 1319
michael@0 1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
michael@0 1321
michael@0 1322 uprv_uca_closeTempTable(t);
michael@0 1323 uprv_free(image);
michael@0 1324
michael@0 1325 return myData;
michael@0 1326 }
michael@0 1327
michael@0 1328 U_CDECL_BEGIN
michael@0 1329 static UBool U_CALLCONV
michael@0 1330 ucol_bld_cleanup(void)
michael@0 1331 {
michael@0 1332 udata_close(invUCA_DATA_MEM);
michael@0 1333 invUCA_DATA_MEM = NULL;
michael@0 1334 _staticInvUCA = NULL;
michael@0 1335 gStaticInvUCAInitOnce.reset();
michael@0 1336 return TRUE;
michael@0 1337 }
michael@0 1338 U_CDECL_END
michael@0 1339
michael@0 1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
michael@0 1341 U_ASSERT(invUCA_DATA_MEM == NULL);
michael@0 1342 U_ASSERT(_staticInvUCA == NULL);
michael@0 1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
michael@0 1344 InverseUCATableHeader *newInvUCA = NULL;
michael@0 1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
michael@0 1346
michael@0 1347 if(U_FAILURE(status)) {
michael@0 1348 if (result) {
michael@0 1349 udata_close(result);
michael@0 1350 }
michael@0 1351 // This is not needed, as we are talking about
michael@0 1352 // memory we got from UData
michael@0 1353 //uprv_free(newInvUCA);
michael@0 1354 return;
michael@0 1355 }
michael@0 1356
michael@0 1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
michael@0 1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
michael@0 1359 UCollator *UCA = ucol_initUCA(&status);
michael@0 1360 // UCA versions of UCA and inverse UCA should match
michael@0 1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
michael@0 1362 status = U_INVALID_FORMAT_ERROR;
michael@0 1363 udata_close(result);
michael@0 1364 return;
michael@0 1365 }
michael@0 1366
michael@0 1367 invUCA_DATA_MEM = result;
michael@0 1368 _staticInvUCA = newInvUCA;
michael@0 1369 }
michael@0 1370 }
michael@0 1371
michael@0 1372
/**
 * Returns the cached inverse UCA table, running initInverseUCA() through the
 * gStaticInvUCAInitOnce guard first.
 *
 * @param status in/out error code, passed on to the initialization
 * @return the value of _staticInvUCA, which is NULL when the table was not loaded
 */
U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode *status)
{
    umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
    return _staticInvUCA;
}
michael@0 1379
/* Names of the non-script reordering codes. These _must_ be kept in the order in
 * which they are to be applied as defaults, and in sync with the UColReorderCode
 * enum: ucol_findReorderingEntry() returns an entry's array index plus
 * UCOL_REORDER_CODE_FIRST.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT"
};
michael@0 1390
michael@0 1391 static void toUpper(const char* src, char* dst, uint32_t length) {
michael@0 1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
michael@0 1393 *dst = uprv_toupper(*src);
michael@0 1394 }
michael@0 1395 *dst = '\0';
michael@0 1396 }
michael@0 1397
michael@0 1398 U_INTERNAL int32_t U_EXPORT2
michael@0 1399 ucol_findReorderingEntry(const char* name) {
michael@0 1400 char buffer[32];
michael@0 1401 toUpper(name, buffer, 32);
michael@0 1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
michael@0 1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
michael@0 1404 return entry + UCOL_REORDER_CODE_FIRST;
michael@0 1405 }
michael@0 1406 }
michael@0 1407 return USCRIPT_INVALID_CODE;
michael@0 1408 }
michael@0 1409
michael@0 1410 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial