Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2001-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: ucol_bld.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created 02/22/2001 |
michael@0 | 14 | * created by: Vladimir Weinstein |
michael@0 | 15 | * |
michael@0 | 16 | * This module builds a collator based on the rule set. |
michael@0 | 17 | * |
michael@0 | 18 | */ |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/utypes.h" |
michael@0 | 21 | |
michael@0 | 22 | #if !UCONFIG_NO_COLLATION |
michael@0 | 23 | |
michael@0 | 24 | #include "unicode/ucoleitr.h" |
michael@0 | 25 | #include "unicode/udata.h" |
michael@0 | 26 | #include "unicode/uchar.h" |
michael@0 | 27 | #include "unicode/uniset.h" |
michael@0 | 28 | #include "unicode/uscript.h" |
michael@0 | 29 | #include "unicode/ustring.h" |
michael@0 | 30 | #include "unicode/utf16.h" |
michael@0 | 31 | #include "normalizer2impl.h" |
michael@0 | 32 | #include "uassert.h" |
michael@0 | 33 | #include "ucol_bld.h" |
michael@0 | 34 | #include "ucol_elm.h" |
michael@0 | 35 | #include "ucol_cnt.h" |
michael@0 | 36 | #include "ucln_in.h" |
michael@0 | 37 | #include "umutex.h" |
michael@0 | 38 | #include "cmemory.h" |
michael@0 | 39 | #include "cstring.h" |
michael@0 | 40 | |
/* Element count of a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(arr) (int32_t)(sizeof(arr) / sizeof((arr)[0]))
michael@0 | 42 | |
michael@0 | 43 | static const InverseUCATableHeader* _staticInvUCA = NULL; |
michael@0 | 44 | static UDataMemory* invUCA_DATA_MEM = NULL; |
michael@0 | 45 | static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; |
michael@0 | 46 | |
michael@0 | 47 | U_CDECL_BEGIN |
michael@0 | 48 | static UBool U_CALLCONV |
michael@0 | 49 | isAcceptableInvUCA(void * /*context*/, |
michael@0 | 50 | const char * /*type*/, const char * /*name*/, |
michael@0 | 51 | const UDataInfo *pInfo) |
michael@0 | 52 | { |
michael@0 | 53 | /* context, type & name are intentionally not used */ |
michael@0 | 54 | if( pInfo->size>=20 && |
michael@0 | 55 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
michael@0 | 56 | pInfo->charsetFamily==U_CHARSET_FAMILY && |
michael@0 | 57 | pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ |
michael@0 | 58 | pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && |
michael@0 | 59 | pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && |
michael@0 | 60 | pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && |
michael@0 | 61 | pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && |
michael@0 | 62 | pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& |
michael@0 | 63 | //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && |
michael@0 | 64 | //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && |
michael@0 | 65 | //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && |
michael@0 | 66 | ) |
michael@0 | 67 | { |
michael@0 | 68 | // TODO: Check that the invuca data version (pInfo->dataVersion) |
michael@0 | 69 | // matches the ucadata version. |
michael@0 | 70 | return TRUE; |
michael@0 | 71 | } else { |
michael@0 | 72 | return FALSE; |
michael@0 | 73 | } |
michael@0 | 74 | } |
michael@0 | 75 | U_CDECL_END |
michael@0 | 76 | |
michael@0 | 77 | /* |
michael@0 | 78 | * Takes two CEs (lead and continuation) and |
michael@0 | 79 | * compares them as CEs should be compared: |
michael@0 | 80 | * primary vs. primary, secondary vs. secondary |
michael@0 | 81 | * tertiary vs. tertiary |
michael@0 | 82 | */ |
michael@0 | 83 | static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { |
michael@0 | 84 | uint32_t s1 = source0, s2, t1 = target0, t2; |
michael@0 | 85 | if(isContinuation(source1)) { |
michael@0 | 86 | s2 = source1; |
michael@0 | 87 | } else { |
michael@0 | 88 | s2 = 0; |
michael@0 | 89 | } |
michael@0 | 90 | if(isContinuation(target1)) { |
michael@0 | 91 | t2 = target1; |
michael@0 | 92 | } else { |
michael@0 | 93 | t2 = 0; |
michael@0 | 94 | } |
michael@0 | 95 | |
michael@0 | 96 | uint32_t s = 0, t = 0; |
michael@0 | 97 | if(s1 == t1 && s2 == t2) { |
michael@0 | 98 | return 0; |
michael@0 | 99 | } |
michael@0 | 100 | s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); |
michael@0 | 101 | t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); |
michael@0 | 102 | if(s < t) { |
michael@0 | 103 | return -1; |
michael@0 | 104 | } else if(s > t) { |
michael@0 | 105 | return 1; |
michael@0 | 106 | } else { |
michael@0 | 107 | s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; |
michael@0 | 108 | t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; |
michael@0 | 109 | if(s < t) { |
michael@0 | 110 | return -1; |
michael@0 | 111 | } else if(s > t) { |
michael@0 | 112 | return 1; |
michael@0 | 113 | } else { |
michael@0 | 114 | s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); |
michael@0 | 115 | t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); |
michael@0 | 116 | if(s < t) { |
michael@0 | 117 | return -1; |
michael@0 | 118 | } else { |
michael@0 | 119 | return 1; |
michael@0 | 120 | } |
michael@0 | 121 | } |
michael@0 | 122 | } |
michael@0 | 123 | } |
michael@0 | 124 | |
michael@0 | 125 | static |
michael@0 | 126 | int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { |
michael@0 | 127 | uint32_t bottom = 0, top = src->invUCA->tableSize; |
michael@0 | 128 | uint32_t i = 0; |
michael@0 | 129 | uint32_t first = 0, second = 0; |
michael@0 | 130 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 131 | int32_t res = 0; |
michael@0 | 132 | |
michael@0 | 133 | while(bottom < top-1) { |
michael@0 | 134 | i = (top+bottom)/2; |
michael@0 | 135 | first = *(CETable+3*i); |
michael@0 | 136 | second = *(CETable+3*i+1); |
michael@0 | 137 | res = compareCEs(first, second, CE, SecondCE); |
michael@0 | 138 | if(res > 0) { |
michael@0 | 139 | top = i; |
michael@0 | 140 | } else if(res < 0) { |
michael@0 | 141 | bottom = i; |
michael@0 | 142 | } else { |
michael@0 | 143 | break; |
michael@0 | 144 | } |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | /* weiv: */ |
michael@0 | 148 | /* in searching for elements, I have removed the failure */ |
michael@0 | 149 | /* The reason for this is that the builder does not rely */ |
michael@0 | 150 | /* on search mechanism telling it that it didn't find an */ |
michael@0 | 151 | /* element. However, indirect positioning relies on being */ |
michael@0 | 152 | /* able to find the elements around any CE, even if it is */ |
michael@0 | 153 | /* not defined in the UCA. */ |
michael@0 | 154 | return i; |
michael@0 | 155 | /* |
michael@0 | 156 | if((first == CE && second == SecondCE)) { |
michael@0 | 157 | return i; |
michael@0 | 158 | } else { |
michael@0 | 159 | return -1; |
michael@0 | 160 | } |
michael@0 | 161 | */ |
michael@0 | 162 | } |
michael@0 | 163 | |
michael@0 | 164 | static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { |
michael@0 | 165 | 0xFFFF0000, |
michael@0 | 166 | 0xFFFFFF00, |
michael@0 | 167 | 0xFFFFFFFF |
michael@0 | 168 | }; |
michael@0 | 169 | |
michael@0 | 170 | U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, |
michael@0 | 171 | uint32_t CE, uint32_t contCE, |
michael@0 | 172 | uint32_t *nextCE, uint32_t *nextContCE, |
michael@0 | 173 | uint32_t strength) |
michael@0 | 174 | { |
michael@0 | 175 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 176 | int32_t iCE; |
michael@0 | 177 | |
michael@0 | 178 | iCE = ucol_inv_findCE(src, CE, contCE); |
michael@0 | 179 | |
michael@0 | 180 | if(iCE<0) { |
michael@0 | 181 | *nextCE = UCOL_NOT_FOUND; |
michael@0 | 182 | return -1; |
michael@0 | 183 | } |
michael@0 | 184 | |
michael@0 | 185 | CE &= strengthMask[strength]; |
michael@0 | 186 | contCE &= strengthMask[strength]; |
michael@0 | 187 | |
michael@0 | 188 | *nextCE = CE; |
michael@0 | 189 | *nextContCE = contCE; |
michael@0 | 190 | |
michael@0 | 191 | while((*nextCE & strengthMask[strength]) == CE |
michael@0 | 192 | && (*nextContCE & strengthMask[strength]) == contCE) |
michael@0 | 193 | { |
michael@0 | 194 | *nextCE = (*(CETable+3*(++iCE))); |
michael@0 | 195 | *nextContCE = (*(CETable+3*(iCE)+1)); |
michael@0 | 196 | } |
michael@0 | 197 | |
michael@0 | 198 | return iCE; |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, |
michael@0 | 202 | uint32_t CE, uint32_t contCE, |
michael@0 | 203 | uint32_t *prevCE, uint32_t *prevContCE, |
michael@0 | 204 | uint32_t strength) |
michael@0 | 205 | { |
michael@0 | 206 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 207 | int32_t iCE; |
michael@0 | 208 | |
michael@0 | 209 | iCE = ucol_inv_findCE(src, CE, contCE); |
michael@0 | 210 | |
michael@0 | 211 | if(iCE<0) { |
michael@0 | 212 | *prevCE = UCOL_NOT_FOUND; |
michael@0 | 213 | return -1; |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | CE &= strengthMask[strength]; |
michael@0 | 217 | contCE &= strengthMask[strength]; |
michael@0 | 218 | |
michael@0 | 219 | *prevCE = CE; |
michael@0 | 220 | *prevContCE = contCE; |
michael@0 | 221 | |
michael@0 | 222 | while((*prevCE & strengthMask[strength]) == CE |
michael@0 | 223 | && (*prevContCE & strengthMask[strength])== contCE |
michael@0 | 224 | && iCE > 0) /* this condition should prevent falling off the edge of the world */ |
michael@0 | 225 | { |
michael@0 | 226 | /* here, we end up in a singularity - zero */ |
michael@0 | 227 | *prevCE = (*(CETable+3*(--iCE))); |
michael@0 | 228 | *prevContCE = (*(CETable+3*(iCE)+1)); |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | return iCE; |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, |
michael@0 | 235 | uint32_t prevCE, uint32_t prevContCE) |
michael@0 | 236 | { |
michael@0 | 237 | if(prevCE == CE && prevContCE == contCE) { |
michael@0 | 238 | return UCOL_IDENTICAL; |
michael@0 | 239 | } |
michael@0 | 240 | if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) |
michael@0 | 241 | || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) |
michael@0 | 242 | { |
michael@0 | 243 | return UCOL_PRIMARY; |
michael@0 | 244 | } |
michael@0 | 245 | if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) |
michael@0 | 246 | || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) |
michael@0 | 247 | { |
michael@0 | 248 | return UCOL_SECONDARY; |
michael@0 | 249 | } |
michael@0 | 250 | return UCOL_TERTIARY; |
michael@0 | 251 | } |
michael@0 | 252 | |
michael@0 | 253 | |
michael@0 | 254 | /*static |
michael@0 | 255 | inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
michael@0 | 256 | |
michael@0 | 257 | uint32_t CE = lh->baseCE; |
michael@0 | 258 | uint32_t SecondCE = lh->baseContCE; |
michael@0 | 259 | |
michael@0 | 260 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 261 | uint32_t previousCE, previousContCE; |
michael@0 | 262 | int32_t iCE; |
michael@0 | 263 | |
michael@0 | 264 | iCE = ucol_inv_findCE(src, CE, SecondCE); |
michael@0 | 265 | |
michael@0 | 266 | if(iCE<0) { |
michael@0 | 267 | return -1; |
michael@0 | 268 | } |
michael@0 | 269 | |
michael@0 | 270 | CE &= strengthMask[strength]; |
michael@0 | 271 | SecondCE &= strengthMask[strength]; |
michael@0 | 272 | |
michael@0 | 273 | previousCE = CE; |
michael@0 | 274 | previousContCE = SecondCE; |
michael@0 | 275 | |
michael@0 | 276 | while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { |
michael@0 | 277 | previousCE = (*(CETable+3*(--iCE))); |
michael@0 | 278 | previousContCE = (*(CETable+3*(iCE)+1)); |
michael@0 | 279 | } |
michael@0 | 280 | lh->previousCE = previousCE; |
michael@0 | 281 | lh->previousContCE = previousContCE; |
michael@0 | 282 | |
michael@0 | 283 | return iCE; |
michael@0 | 284 | }*/ |
michael@0 | 285 | |
michael@0 | 286 | static |
michael@0 | 287 | inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
michael@0 | 288 | uint32_t CE = lh->baseCE; |
michael@0 | 289 | uint32_t SecondCE = lh->baseContCE; |
michael@0 | 290 | |
michael@0 | 291 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 292 | uint32_t nextCE, nextContCE; |
michael@0 | 293 | int32_t iCE; |
michael@0 | 294 | |
michael@0 | 295 | iCE = ucol_inv_findCE(src, CE, SecondCE); |
michael@0 | 296 | |
michael@0 | 297 | if(iCE<0) { |
michael@0 | 298 | return -1; |
michael@0 | 299 | } |
michael@0 | 300 | |
michael@0 | 301 | CE &= strengthMask[strength]; |
michael@0 | 302 | SecondCE &= strengthMask[strength]; |
michael@0 | 303 | |
michael@0 | 304 | nextCE = CE; |
michael@0 | 305 | nextContCE = SecondCE; |
michael@0 | 306 | |
michael@0 | 307 | while((nextCE & strengthMask[strength]) == CE |
michael@0 | 308 | && (nextContCE & strengthMask[strength]) == SecondCE) |
michael@0 | 309 | { |
michael@0 | 310 | nextCE = (*(CETable+3*(++iCE))); |
michael@0 | 311 | nextContCE = (*(CETable+3*(iCE)+1)); |
michael@0 | 312 | } |
michael@0 | 313 | |
michael@0 | 314 | lh->nextCE = nextCE; |
michael@0 | 315 | lh->nextContCE = nextContCE; |
michael@0 | 316 | |
michael@0 | 317 | return iCE; |
michael@0 | 318 | } |
michael@0 | 319 | |
michael@0 | 320 | static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { |
michael@0 | 321 | /* reset all the gaps */ |
michael@0 | 322 | int32_t i = 0; |
michael@0 | 323 | uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 324 | uint32_t st = 0; |
michael@0 | 325 | uint32_t t1, t2; |
michael@0 | 326 | int32_t pos; |
michael@0 | 327 | |
michael@0 | 328 | UColToken *tok = lh->first; |
michael@0 | 329 | uint32_t tokStrength = tok->strength; |
michael@0 | 330 | |
michael@0 | 331 | for(i = 0; i<3; i++) { |
michael@0 | 332 | lh->gapsHi[3*i] = 0; |
michael@0 | 333 | lh->gapsHi[3*i+1] = 0; |
michael@0 | 334 | lh->gapsHi[3*i+2] = 0; |
michael@0 | 335 | lh->gapsLo[3*i] = 0; |
michael@0 | 336 | lh->gapsLo[3*i+1] = 0; |
michael@0 | 337 | lh->gapsLo[3*i+2] = 0; |
michael@0 | 338 | lh->numStr[i] = 0; |
michael@0 | 339 | lh->fStrToken[i] = NULL; |
michael@0 | 340 | lh->lStrToken[i] = NULL; |
michael@0 | 341 | lh->pos[i] = -1; |
michael@0 | 342 | } |
michael@0 | 343 | |
michael@0 | 344 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
michael@0 | 345 | |
michael@0 | 346 | if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
michael@0 | 347 | //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ |
michael@0 | 348 | lh->pos[0] = 0; |
michael@0 | 349 | t1 = lh->baseCE; |
michael@0 | 350 | t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; |
michael@0 | 351 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 352 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 353 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
michael@0 | 354 | uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); |
michael@0 | 355 | primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); |
michael@0 | 356 | |
michael@0 | 357 | t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
michael@0 | 358 | t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; |
michael@0 | 359 | |
michael@0 | 360 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 361 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 362 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
michael@0 | 363 | } else if(lh->indirect == TRUE && lh->nextCE != 0) { |
michael@0 | 364 | //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { |
michael@0 | 365 | lh->pos[0] = 0; |
michael@0 | 366 | t1 = lh->baseCE; |
michael@0 | 367 | t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; |
michael@0 | 368 | lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 369 | lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 370 | lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
michael@0 | 371 | t1 = lh->nextCE; |
michael@0 | 372 | t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; |
michael@0 | 373 | lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 374 | lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 375 | lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
michael@0 | 376 | } else { |
michael@0 | 377 | for(;;) { |
michael@0 | 378 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
michael@0 | 379 | if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { |
michael@0 | 380 | lh->fStrToken[tokStrength] = tok; |
michael@0 | 381 | } else { /* The CE must be implicit, since it's not in the table */ |
michael@0 | 382 | /* Error */ |
michael@0 | 383 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 384 | } |
michael@0 | 385 | } |
michael@0 | 386 | |
michael@0 | 387 | while(tok != NULL && tok->strength >= tokStrength) { |
michael@0 | 388 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
michael@0 | 389 | lh->lStrToken[tokStrength] = tok; |
michael@0 | 390 | } |
michael@0 | 391 | tok = tok->next; |
michael@0 | 392 | } |
michael@0 | 393 | if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { |
michael@0 | 394 | /* check if previous interval is the same and merge the intervals if it is so */ |
michael@0 | 395 | if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { |
michael@0 | 396 | lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; |
michael@0 | 397 | lh->fStrToken[tokStrength+1] = NULL; |
michael@0 | 398 | lh->lStrToken[tokStrength+1] = NULL; |
michael@0 | 399 | lh->pos[tokStrength+1] = -1; |
michael@0 | 400 | } |
michael@0 | 401 | } |
michael@0 | 402 | if(tok != NULL) { |
michael@0 | 403 | tokStrength = tok->strength; |
michael@0 | 404 | } else { |
michael@0 | 405 | break; |
michael@0 | 406 | } |
michael@0 | 407 | } |
michael@0 | 408 | for(st = 0; st < 3; st++) { |
michael@0 | 409 | if((pos = lh->pos[st]) >= 0) { |
michael@0 | 410 | t1 = *(CETable+3*(pos)); |
michael@0 | 411 | t2 = *(CETable+3*(pos)+1); |
michael@0 | 412 | lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 413 | lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 414 | //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
michael@0 | 415 | lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
michael@0 | 416 | //pos--; |
michael@0 | 417 | //t1 = *(CETable+3*(pos)); |
michael@0 | 418 | //t2 = *(CETable+3*(pos)+1); |
michael@0 | 419 | t1 = lh->baseCE; |
michael@0 | 420 | t2 = lh->baseContCE; |
michael@0 | 421 | lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 422 | lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
michael@0 | 423 | lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
michael@0 | 424 | } |
michael@0 | 425 | } |
michael@0 | 426 | } |
michael@0 | 427 | } |
michael@0 | 428 | |
michael@0 | 429 | |
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of value. For a weight
 * whose bytes are packed from the top with no zero byte in between, this is
 * the number of bytes used. Expands to a braced block, so it must be used
 * as a statement; noOfBytes has to be an assignable lvalue.
 */
#define ucol_countBytes(value, noOfBytes) \
{ \
    uint32_t mask; \
    (noOfBytes) = 0; \
    for(mask = 0xFFFFFFFF; mask != 0; mask >>= 8) { \
        if(((value) & mask) != 0) { \
            (noOfBytes)++; \
        } \
    } \
}
michael@0 | 441 | |
michael@0 | 442 | static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { |
michael@0 | 443 | if(U_SUCCESS(*status)) { |
michael@0 | 444 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
michael@0 | 445 | } |
michael@0 | 446 | return g->current; |
michael@0 | 447 | } |
michael@0 | 448 | |
michael@0 | 449 | static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { |
michael@0 | 450 | /* TODO: rename to enum names */ |
michael@0 | 451 | uint32_t high, low, count=1; |
michael@0 | 452 | uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; |
michael@0 | 453 | |
michael@0 | 454 | if(strength == UCOL_SECONDARY) { |
michael@0 | 455 | low = UCOL_COMMON_TOP2<<24; |
michael@0 | 456 | high = 0xFFFFFFFF; |
michael@0 | 457 | count = 0xFF - UCOL_COMMON_TOP2; |
michael@0 | 458 | } else { |
michael@0 | 459 | low = UCOL_BYTE_COMMON << 24; //0x05000000; |
michael@0 | 460 | high = 0x40000000; |
michael@0 | 461 | count = 0x40 - UCOL_BYTE_COMMON; |
michael@0 | 462 | } |
michael@0 | 463 | |
michael@0 | 464 | if(tok->next != NULL && tok->next->strength == strength) { |
michael@0 | 465 | count = tok->next->toInsert; |
michael@0 | 466 | } |
michael@0 | 467 | |
michael@0 | 468 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
michael@0 | 469 | g->current = UCOL_BYTE_COMMON<<24; |
michael@0 | 470 | |
michael@0 | 471 | if(g->noOfRanges == 0) { |
michael@0 | 472 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 473 | } |
michael@0 | 474 | return g->current; |
michael@0 | 475 | } |
michael@0 | 476 | |
michael@0 | 477 | static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { |
michael@0 | 478 | uint32_t strength = tok->strength; |
michael@0 | 479 | uint32_t low = lows[fStrength*3+strength]; |
michael@0 | 480 | uint32_t high = highs[fStrength*3+strength]; |
michael@0 | 481 | uint32_t maxByte = 0; |
michael@0 | 482 | if(strength == UCOL_TERTIARY) { |
michael@0 | 483 | maxByte = 0x3F; |
michael@0 | 484 | } else if(strength == UCOL_PRIMARY) { |
michael@0 | 485 | maxByte = 0xFE; |
michael@0 | 486 | } else { |
michael@0 | 487 | maxByte = 0xFF; |
michael@0 | 488 | } |
michael@0 | 489 | |
michael@0 | 490 | uint32_t count = tok->toInsert; |
michael@0 | 491 | |
michael@0 | 492 | if(low >= high && strength > UCOL_PRIMARY) { |
michael@0 | 493 | int32_t s = strength; |
michael@0 | 494 | for(;;) { |
michael@0 | 495 | s--; |
michael@0 | 496 | if(lows[fStrength*3+s] != highs[fStrength*3+s]) { |
michael@0 | 497 | if(strength == UCOL_SECONDARY) { |
michael@0 | 498 | if (low < UCOL_COMMON_TOP2<<24 ) { |
michael@0 | 499 | // Override if low range is less than UCOL_COMMON_TOP2. |
michael@0 | 500 | low = UCOL_COMMON_TOP2<<24; |
michael@0 | 501 | } |
michael@0 | 502 | high = 0xFFFFFFFF; |
michael@0 | 503 | } else { |
michael@0 | 504 | // Override if low range is less than UCOL_COMMON_BOT3. |
michael@0 | 505 | if ( low < UCOL_COMMON_BOT3<<24 ) { |
michael@0 | 506 | low = UCOL_COMMON_BOT3<<24; |
michael@0 | 507 | } |
michael@0 | 508 | high = 0x40000000; |
michael@0 | 509 | } |
michael@0 | 510 | break; |
michael@0 | 511 | } |
michael@0 | 512 | if(s<0) { |
michael@0 | 513 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 514 | return 0; |
michael@0 | 515 | } |
michael@0 | 516 | } |
michael@0 | 517 | } |
michael@0 | 518 | |
michael@0 | 519 | if(low < 0x02000000) { |
michael@0 | 520 | // We must not use CE weight byte 02, so we set it as the minimum lower bound. |
michael@0 | 521 | // See http://site.icu-project.org/design/collation/bytes |
michael@0 | 522 | low = 0x02000000; |
michael@0 | 523 | } |
michael@0 | 524 | |
michael@0 | 525 | if(strength == UCOL_SECONDARY) { /* similar as simple */ |
michael@0 | 526 | if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { |
michael@0 | 527 | low = UCOL_COMMON_TOP2<<24; |
michael@0 | 528 | } |
michael@0 | 529 | if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { |
michael@0 | 530 | high = UCOL_COMMON_TOP2<<24; |
michael@0 | 531 | } |
michael@0 | 532 | if(low < (UCOL_COMMON_BOT2<<24)) { |
michael@0 | 533 | g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); |
michael@0 | 534 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
michael@0 | 535 | //g->current = UCOL_COMMON_BOT2<<24; |
michael@0 | 536 | return g->current; |
michael@0 | 537 | } |
michael@0 | 538 | } |
michael@0 | 539 | |
michael@0 | 540 | g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
michael@0 | 541 | if(g->noOfRanges == 0) { |
michael@0 | 542 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 543 | } |
michael@0 | 544 | g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
michael@0 | 545 | return g->current; |
michael@0 | 546 | } |
michael@0 | 547 | |
michael@0 | 548 | static |
michael@0 | 549 | uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { |
michael@0 | 550 | uint32_t i = 0; |
michael@0 | 551 | UChar c; |
michael@0 | 552 | |
michael@0 | 553 | if(U_FAILURE(*status)) { |
michael@0 | 554 | return 0; |
michael@0 | 555 | } |
michael@0 | 556 | |
michael@0 | 557 | if(sourceLen > resLen) { |
michael@0 | 558 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 559 | return 0; |
michael@0 | 560 | } |
michael@0 | 561 | |
michael@0 | 562 | for(i = 0; i < sourceLen; i++) { |
michael@0 | 563 | c = source[i]; |
michael@0 | 564 | if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
michael@0 | 565 | switch(c - 0x3000) { |
michael@0 | 566 | case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: |
michael@0 | 567 | case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: |
michael@0 | 568 | c++; |
michael@0 | 569 | break; |
michael@0 | 570 | case 0xF5: |
michael@0 | 571 | c = 0x30AB; |
michael@0 | 572 | break; |
michael@0 | 573 | case 0xF6: |
michael@0 | 574 | c = 0x30B1; |
michael@0 | 575 | break; |
michael@0 | 576 | } |
michael@0 | 577 | } |
michael@0 | 578 | resBuf[i] = c; |
michael@0 | 579 | } |
michael@0 | 580 | return sourceLen; |
michael@0 | 581 | } |
michael@0 | 582 | |
michael@0 | 583 | static |
michael@0 | 584 | uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { |
michael@0 | 585 | uint32_t i = 0; |
michael@0 | 586 | UChar c; |
michael@0 | 587 | |
michael@0 | 588 | if(U_FAILURE(*status)) { |
michael@0 | 589 | return 0; |
michael@0 | 590 | } |
michael@0 | 591 | |
michael@0 | 592 | if(sourceLen > resLen) { |
michael@0 | 593 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 594 | return 0; |
michael@0 | 595 | } |
michael@0 | 596 | |
michael@0 | 597 | for(i = 0; i < sourceLen; i++) { |
michael@0 | 598 | c = source[i]; |
michael@0 | 599 | if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
michael@0 | 600 | switch(c - 0x3000) { |
michael@0 | 601 | case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: |
michael@0 | 602 | case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: |
michael@0 | 603 | c--; |
michael@0 | 604 | break; |
michael@0 | 605 | case 0xAB: |
michael@0 | 606 | c = 0x30F5; |
michael@0 | 607 | break; |
michael@0 | 608 | case 0xB1: |
michael@0 | 609 | c = 0x30F6; |
michael@0 | 610 | break; |
michael@0 | 611 | } |
michael@0 | 612 | } |
michael@0 | 613 | resBuf[i] = c; |
michael@0 | 614 | } |
michael@0 | 615 | return sourceLen; |
michael@0 | 616 | } |
michael@0 | 617 | |
michael@0 | 618 | U_NAMESPACE_BEGIN |
michael@0 | 619 | |
michael@0 | 620 | static |
michael@0 | 621 | uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { |
michael@0 | 622 | uint32_t i = 0; |
michael@0 | 623 | UChar n[128]; |
michael@0 | 624 | uint32_t nLen = 0; |
michael@0 | 625 | uint32_t uCount = 0, lCount = 0; |
michael@0 | 626 | |
michael@0 | 627 | collIterate s; |
michael@0 | 628 | uint32_t order = 0; |
michael@0 | 629 | |
michael@0 | 630 | if(U_FAILURE(*status)) { |
michael@0 | 631 | return UCOL_LOWER_CASE; |
michael@0 | 632 | } |
michael@0 | 633 | |
michael@0 | 634 | nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); |
michael@0 | 635 | if(U_SUCCESS(*status)) { |
michael@0 | 636 | for(i = 0; i < nLen; i++) { |
michael@0 | 637 | uprv_init_collIterate(UCA, &n[i], 1, &s, status); |
michael@0 | 638 | order = ucol_getNextCE(UCA, &s, status); |
michael@0 | 639 | if(isContinuation(order)) { |
michael@0 | 640 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 641 | return UCOL_LOWER_CASE; |
michael@0 | 642 | } |
michael@0 | 643 | if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { |
michael@0 | 644 | uCount++; |
michael@0 | 645 | } else { |
michael@0 | 646 | if(u_islower(n[i])) { |
michael@0 | 647 | lCount++; |
michael@0 | 648 | } else if(U_SUCCESS(*status)) { |
michael@0 | 649 | UChar sk[1], lk[1]; |
michael@0 | 650 | u_toSmallKana(&n[i], 1, sk, 1, status); |
michael@0 | 651 | u_toLargeKana(&n[i], 1, lk, 1, status); |
michael@0 | 652 | if(sk[0] == n[i] && lk[0] != n[i]) { |
michael@0 | 653 | lCount++; |
michael@0 | 654 | } |
michael@0 | 655 | } |
michael@0 | 656 | } |
michael@0 | 657 | } |
michael@0 | 658 | } |
michael@0 | 659 | |
michael@0 | 660 | if(uCount != 0 && lCount != 0) { |
michael@0 | 661 | return UCOL_MIXED_CASE; |
michael@0 | 662 | } else if(uCount != 0) { |
michael@0 | 663 | return UCOL_UPPER_CASE; |
michael@0 | 664 | } else { |
michael@0 | 665 | return UCOL_LOWER_CASE; |
michael@0 | 666 | } |
michael@0 | 667 | } |
michael@0 | 668 | |
michael@0 | 669 | |
michael@0 | 670 | U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { |
michael@0 | 671 | /* this one makes the table and stuff */ |
michael@0 | 672 | uint32_t noOfBytes[3]; |
michael@0 | 673 | uint32_t i; |
michael@0 | 674 | |
michael@0 | 675 | for(i = 0; i<3; i++) { |
michael@0 | 676 | ucol_countBytes(CEparts[i], noOfBytes[i]); |
michael@0 | 677 | } |
michael@0 | 678 | |
michael@0 | 679 | /* Here we have to pack CEs from parts */ |
michael@0 | 680 | |
michael@0 | 681 | uint32_t CEi = 0; |
michael@0 | 682 | uint32_t value = 0; |
michael@0 | 683 | |
michael@0 | 684 | while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { |
michael@0 | 685 | if(CEi > 0) { |
michael@0 | 686 | value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ |
michael@0 | 687 | } else { |
michael@0 | 688 | value = 0; |
michael@0 | 689 | } |
michael@0 | 690 | |
michael@0 | 691 | if(2*CEi<noOfBytes[0]) { |
michael@0 | 692 | value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; |
michael@0 | 693 | } |
michael@0 | 694 | if(CEi<noOfBytes[1]) { |
michael@0 | 695 | value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; |
michael@0 | 696 | } |
michael@0 | 697 | if(CEi<noOfBytes[2]) { |
michael@0 | 698 | value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); |
michael@0 | 699 | } |
michael@0 | 700 | tok->CEs[CEi] = value; |
michael@0 | 701 | CEi++; |
michael@0 | 702 | } |
michael@0 | 703 | if(CEi == 0) { /* totally ignorable */ |
michael@0 | 704 | tok->noOfCEs = 1; |
michael@0 | 705 | tok->CEs[0] = 0; |
michael@0 | 706 | } else { /* there is at least something */ |
michael@0 | 707 | tok->noOfCEs = CEi; |
michael@0 | 708 | } |
michael@0 | 709 | |
michael@0 | 710 | |
michael@0 | 711 | // we want to set case bits here and now, not later. |
michael@0 | 712 | // Case bits handling |
michael@0 | 713 | if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables |
michael@0 | 714 | tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field |
michael@0 | 715 | int32_t cSize = (tok->source & 0xFF000000) >> 24; |
michael@0 | 716 | UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; |
michael@0 | 717 | |
michael@0 | 718 | if(cSize > 1) { |
michael@0 | 719 | // Do it manually |
michael@0 | 720 | tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); |
michael@0 | 721 | } else { |
michael@0 | 722 | // Copy it from the UCA |
michael@0 | 723 | uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); |
michael@0 | 724 | tok->CEs[0] |= (caseCE & 0xC0); |
michael@0 | 725 | } |
michael@0 | 726 | } |
michael@0 | 727 | |
michael@0 | 728 | #if UCOL_DEBUG==2 |
michael@0 | 729 | fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); |
michael@0 | 730 | for(i = 0; i<tok->noOfCEs; i++) { |
michael@0 | 731 | fprintf(stderr, "%08X ", tok->CEs[i]); |
michael@0 | 732 | } |
michael@0 | 733 | fprintf(stderr, "\n"); |
michael@0 | 734 | #endif |
michael@0 | 735 | } |
michael@0 | 736 | |
michael@0 | 737 | U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { |
michael@0 | 738 | ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; |
michael@0 | 739 | uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; |
michael@0 | 740 | |
michael@0 | 741 | UColToken *tok = lh->last; |
michael@0 | 742 | uint32_t t[UCOL_STRENGTH_LIMIT]; |
michael@0 | 743 | |
michael@0 | 744 | uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); |
michael@0 | 745 | |
michael@0 | 746 | /* must initialize ranges to avoid memory check warnings */ |
michael@0 | 747 | for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { |
michael@0 | 748 | uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); |
michael@0 | 749 | } |
michael@0 | 750 | |
michael@0 | 751 | tok->toInsert = 1; |
michael@0 | 752 | t[tok->strength] = 1; |
michael@0 | 753 | |
michael@0 | 754 | while(tok->previous != NULL) { |
michael@0 | 755 | if(tok->previous->strength < tok->strength) { /* going up */ |
michael@0 | 756 | t[tok->strength] = 0; |
michael@0 | 757 | t[tok->previous->strength]++; |
michael@0 | 758 | } else if(tok->previous->strength > tok->strength) { /* going down */ |
michael@0 | 759 | t[tok->previous->strength] = 1; |
michael@0 | 760 | } else { |
michael@0 | 761 | t[tok->strength]++; |
michael@0 | 762 | } |
michael@0 | 763 | tok=tok->previous; |
michael@0 | 764 | tok->toInsert = t[tok->strength]; |
michael@0 | 765 | } |
michael@0 | 766 | |
michael@0 | 767 | tok->toInsert = t[tok->strength]; |
michael@0 | 768 | ucol_inv_getGapPositions(src, lh, status); |
michael@0 | 769 | |
michael@0 | 770 | #if UCOL_DEBUG |
michael@0 | 771 | fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); |
michael@0 | 772 | int32_t j = 2; |
michael@0 | 773 | for(j = 2; j >= 0; j--) { |
michael@0 | 774 | fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); |
michael@0 | 775 | fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); |
michael@0 | 776 | } |
michael@0 | 777 | tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
michael@0 | 778 | |
michael@0 | 779 | do { |
michael@0 | 780 | fprintf(stderr,"%i", tok->strength); |
michael@0 | 781 | tok = tok->next; |
michael@0 | 782 | } while(tok != NULL); |
michael@0 | 783 | fprintf(stderr, "\n"); |
michael@0 | 784 | |
michael@0 | 785 | tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
michael@0 | 786 | |
michael@0 | 787 | do { |
michael@0 | 788 | fprintf(stderr,"%i", tok->toInsert); |
michael@0 | 789 | tok = tok->next; |
michael@0 | 790 | } while(tok != NULL); |
michael@0 | 791 | #endif |
michael@0 | 792 | |
michael@0 | 793 | tok = lh->first; |
michael@0 | 794 | uint32_t fStrength = UCOL_IDENTICAL; |
michael@0 | 795 | uint32_t initStrength = UCOL_IDENTICAL; |
michael@0 | 796 | |
michael@0 | 797 | |
michael@0 | 798 | CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; |
michael@0 | 799 | CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; |
michael@0 | 800 | CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; |
michael@0 | 801 | |
michael@0 | 802 | while (tok != NULL && U_SUCCESS(*status)) { |
michael@0 | 803 | fStrength = tok->strength; |
michael@0 | 804 | if(fStrength < initStrength) { |
michael@0 | 805 | initStrength = fStrength; |
michael@0 | 806 | if(lh->pos[fStrength] == -1) { |
michael@0 | 807 | while(lh->pos[fStrength] == -1 && fStrength > 0) { |
michael@0 | 808 | fStrength--; |
michael@0 | 809 | } |
michael@0 | 810 | if(lh->pos[fStrength] == -1) { |
michael@0 | 811 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 812 | return; |
michael@0 | 813 | } |
michael@0 | 814 | } |
michael@0 | 815 | if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ |
michael@0 | 816 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
michael@0 | 817 | CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; |
michael@0 | 818 | /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ |
michael@0 | 819 | CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
michael@0 | 820 | } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ |
michael@0 | 821 | CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
michael@0 | 822 | /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ |
michael@0 | 823 | CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
michael@0 | 824 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
michael@0 | 825 | } else { /* primaries */ |
michael@0 | 826 | /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ |
michael@0 | 827 | CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
michael@0 | 828 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); |
michael@0 | 829 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
michael@0 | 830 | } |
michael@0 | 831 | } else { |
michael@0 | 832 | if(tok->strength == UCOL_TERTIARY) { |
michael@0 | 833 | CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); |
michael@0 | 834 | } else if(tok->strength == UCOL_SECONDARY) { |
michael@0 | 835 | CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); |
michael@0 | 836 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
michael@0 | 837 | } else if(tok->strength == UCOL_PRIMARY) { |
michael@0 | 838 | CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); |
michael@0 | 839 | CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); |
michael@0 | 840 | CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
michael@0 | 841 | } |
michael@0 | 842 | } |
michael@0 | 843 | ucol_doCE(src, CEparts, tok, status); |
michael@0 | 844 | tok = tok->next; |
michael@0 | 845 | } |
michael@0 | 846 | } |
michael@0 | 847 | |
michael@0 | 848 | U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { |
michael@0 | 849 | UCAElements el; |
michael@0 | 850 | UColToken *tok = lh->first; |
michael@0 | 851 | UColToken *expt = NULL; |
michael@0 | 852 | uint32_t i = 0, j = 0; |
michael@0 | 853 | const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); |
michael@0 | 854 | |
michael@0 | 855 | while(tok != NULL && U_SUCCESS(*status)) { |
michael@0 | 856 | /* first, check if there are any expansions */ |
michael@0 | 857 | /* if there are expansions, we need to do a little bit more processing */ |
michael@0 | 858 | /* since parts of expansion can be tailored, while others are not */ |
michael@0 | 859 | if(tok->expansion != 0) { |
michael@0 | 860 | uint32_t len = tok->expansion >> 24; |
michael@0 | 861 | uint32_t currentSequenceLen = len; |
michael@0 | 862 | uint32_t expOffset = tok->expansion & 0x00FFFFFF; |
michael@0 | 863 | //uint32_t exp = currentSequenceLen | expOffset; |
michael@0 | 864 | UColToken exp; |
michael@0 | 865 | exp.source = currentSequenceLen | expOffset; |
michael@0 | 866 | exp.rulesToParseHdl = &(src->source); |
michael@0 | 867 | |
michael@0 | 868 | while(len > 0) { |
michael@0 | 869 | currentSequenceLen = len; |
michael@0 | 870 | while(currentSequenceLen > 0) { |
michael@0 | 871 | exp.source = (currentSequenceLen << 24) | expOffset; |
michael@0 | 872 | if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ |
michael@0 | 873 | uint32_t noOfCEsToCopy = expt->noOfCEs; |
michael@0 | 874 | for(j = 0; j<noOfCEsToCopy; j++) { |
michael@0 | 875 | tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; |
michael@0 | 876 | } |
michael@0 | 877 | tok->noOfExpCEs += noOfCEsToCopy; |
michael@0 | 878 | // Smart people never try to add codepoints and CEs. |
michael@0 | 879 | // For some odd reason, it won't work. |
michael@0 | 880 | expOffset += currentSequenceLen; //noOfCEsToCopy; |
michael@0 | 881 | len -= currentSequenceLen; //noOfCEsToCopy; |
michael@0 | 882 | break; |
michael@0 | 883 | } else { |
michael@0 | 884 | currentSequenceLen--; |
michael@0 | 885 | } |
michael@0 | 886 | } |
michael@0 | 887 | if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ |
michael@0 | 888 | /* will have to get one from UCA */ |
michael@0 | 889 | /* first, get the UChars from the rules */ |
michael@0 | 890 | /* then pick CEs out until there is no more and stuff them into expansion */ |
michael@0 | 891 | collIterate s; |
michael@0 | 892 | uint32_t order = 0; |
michael@0 | 893 | uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); |
michael@0 | 894 | |
michael@0 | 895 | for(;;) { |
michael@0 | 896 | order = ucol_getNextCE(src->UCA, &s, status); |
michael@0 | 897 | if(order == UCOL_NO_MORE_CES) { |
michael@0 | 898 | break; |
michael@0 | 899 | } |
michael@0 | 900 | tok->expCEs[tok->noOfExpCEs++] = order; |
michael@0 | 901 | } |
michael@0 | 902 | expOffset++; |
michael@0 | 903 | len--; |
michael@0 | 904 | } |
michael@0 | 905 | } |
michael@0 | 906 | } else { |
michael@0 | 907 | tok->noOfExpCEs = 0; |
michael@0 | 908 | } |
michael@0 | 909 | |
michael@0 | 910 | /* set the ucaelement with obtained values */ |
michael@0 | 911 | el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; |
michael@0 | 912 | /* copy CEs */ |
michael@0 | 913 | for(i = 0; i<tok->noOfCEs; i++) { |
michael@0 | 914 | el.CEs[i] = tok->CEs[i]; |
michael@0 | 915 | } |
michael@0 | 916 | for(i = 0; i<tok->noOfExpCEs; i++) { |
michael@0 | 917 | el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; |
michael@0 | 918 | } |
michael@0 | 919 | |
michael@0 | 920 | /* copy UChars */ |
michael@0 | 921 | // We kept prefix and source kind of together, as it is a kind of a contraction. |
michael@0 | 922 | // However, now we have to slice the prefix off the main thing - |
michael@0 | 923 | el.prefix = el.prefixChars; |
michael@0 | 924 | el.cPoints = el.uchars; |
michael@0 | 925 | if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the |
michael@0 | 926 | // addPrefix function in ucol_elm. The reason is that we need to add both composed AND |
michael@0 | 927 | // decomposed elements to the unsaf table. |
michael@0 | 928 | el.prefixSize = tok->prefix>>24; |
michael@0 | 929 | uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); |
michael@0 | 930 | |
michael@0 | 931 | el.cSize = (tok->source >> 24)-(tok->prefix>>24); |
michael@0 | 932 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); |
michael@0 | 933 | } else { |
michael@0 | 934 | el.prefixSize = 0; |
michael@0 | 935 | *el.prefix = 0; |
michael@0 | 936 | |
michael@0 | 937 | el.cSize = (tok->source >> 24); |
michael@0 | 938 | uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); |
michael@0 | 939 | } |
michael@0 | 940 | if(src->UCA != NULL) { |
michael@0 | 941 | for(i = 0; i<el.cSize; i++) { |
michael@0 | 942 | if(UCOL_ISJAMO(el.cPoints[i])) { |
michael@0 | 943 | t->image->jamoSpecial = TRUE; |
michael@0 | 944 | } |
michael@0 | 945 | } |
michael@0 | 946 | if (!src->buildCCTabFlag && el.cSize > 0) { |
michael@0 | 947 | // Check the trailing canonical combining class (tccc) of the last character. |
michael@0 | 948 | const UChar *s = el.cPoints + el.cSize; |
michael@0 | 949 | uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); |
michael@0 | 950 | if ((fcd & 0xff) != 0) { |
michael@0 | 951 | src->buildCCTabFlag = TRUE; |
michael@0 | 952 | } |
michael@0 | 953 | } |
michael@0 | 954 | } |
michael@0 | 955 | |
michael@0 | 956 | /* and then, add it */ |
michael@0 | 957 | #if UCOL_DEBUG==2 |
michael@0 | 958 | fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); |
michael@0 | 959 | #endif |
michael@0 | 960 | uprv_uca_addAnElement(t, &el, status); |
michael@0 | 961 | |
michael@0 | 962 | #if UCOL_DEBUG_DUPLICATES |
michael@0 | 963 | if(*status != U_ZERO_ERROR) { |
michael@0 | 964 | fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); |
michael@0 | 965 | *status = U_ZERO_ERROR; |
michael@0 | 966 | } |
michael@0 | 967 | #endif |
michael@0 | 968 | |
michael@0 | 969 | tok = tok->next; |
michael@0 | 970 | } |
michael@0 | 971 | } |
michael@0 | 972 | |
michael@0 | 973 | U_CDECL_BEGIN |
michael@0 | 974 | static UBool U_CALLCONV |
michael@0 | 975 | _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { |
michael@0 | 976 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 977 | tempUCATable *t = (tempUCATable *)context; |
michael@0 | 978 | if(value == 0) { |
michael@0 | 979 | while(start < limit) { |
michael@0 | 980 | uint32_t CE = utrie_get32(t->mapping, start, NULL); |
michael@0 | 981 | if(CE == UCOL_NOT_FOUND) { |
michael@0 | 982 | UCAElements el; |
michael@0 | 983 | el.isThai = FALSE; |
michael@0 | 984 | el.prefixSize = 0; |
michael@0 | 985 | el.prefixChars[0] = 0; |
michael@0 | 986 | el.prefix = el.prefixChars; |
michael@0 | 987 | el.cPoints = el.uchars; |
michael@0 | 988 | |
michael@0 | 989 | el.cSize = 0; |
michael@0 | 990 | U16_APPEND_UNSAFE(el.uchars, el.cSize, start); |
michael@0 | 991 | |
michael@0 | 992 | el.noOfCEs = 1; |
michael@0 | 993 | el.CEs[0] = 0; |
michael@0 | 994 | uprv_uca_addAnElement(t, &el, &status); |
michael@0 | 995 | |
michael@0 | 996 | } |
michael@0 | 997 | start++; |
michael@0 | 998 | } |
michael@0 | 999 | } |
michael@0 | 1000 | if(U_FAILURE(status)) { |
michael@0 | 1001 | return FALSE; |
michael@0 | 1002 | } else { |
michael@0 | 1003 | return TRUE; |
michael@0 | 1004 | } |
michael@0 | 1005 | } |
michael@0 | 1006 | U_CDECL_END |
michael@0 | 1007 | |
michael@0 | 1008 | static void |
michael@0 | 1009 | ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, |
michael@0 | 1010 | UChar32 start, UChar32 end, |
michael@0 | 1011 | UErrorCode *status) |
michael@0 | 1012 | { |
michael@0 | 1013 | //UChar decomp[256]; |
michael@0 | 1014 | uint32_t CE = UCOL_NOT_FOUND; |
michael@0 | 1015 | UChar32 u = 0; |
michael@0 | 1016 | UCAElements el; |
michael@0 | 1017 | el.isThai = FALSE; |
michael@0 | 1018 | el.prefixSize = 0; |
michael@0 | 1019 | el.prefixChars[0] = 0; |
michael@0 | 1020 | collIterate colIt; |
michael@0 | 1021 | |
michael@0 | 1022 | if(U_SUCCESS(*status)) { |
michael@0 | 1023 | for(u = start; u<=end; u++) { |
michael@0 | 1024 | if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND |
michael@0 | 1025 | /* this test is for contractions that are missing the starting element. */ |
michael@0 | 1026 | || ((isCntTableElement(CE)) && |
michael@0 | 1027 | (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) |
michael@0 | 1028 | ) |
michael@0 | 1029 | { |
michael@0 | 1030 | el.cSize = 0; |
michael@0 | 1031 | U16_APPEND_UNSAFE(el.uchars, el.cSize, u); |
michael@0 | 1032 | //decomp[0] = (UChar)u; |
michael@0 | 1033 | //el.uchars[0] = (UChar)u; |
michael@0 | 1034 | el.cPoints = el.uchars; |
michael@0 | 1035 | //el.cSize = 1; |
michael@0 | 1036 | el.noOfCEs = 0; |
michael@0 | 1037 | el.prefix = el.prefixChars; |
michael@0 | 1038 | el.prefixSize = 0; |
michael@0 | 1039 | //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); |
michael@0 | 1040 | // We actually want to check whether this element is a special |
michael@0 | 1041 | // If it is an implicit element (hangul, CJK - we want to copy the |
michael@0 | 1042 | // special, not the resolved CEs) - for hangul, copying resolved |
michael@0 | 1043 | // would just make things the same (there is an expansion and it |
michael@0 | 1044 | // takes approximately the same amount of time to resolve as |
michael@0 | 1045 | // falling back to the UCA). |
michael@0 | 1046 | /* |
michael@0 | 1047 | UTRIE_GET32(src->UCA->mapping, u, CE); |
michael@0 | 1048 | tag = getCETag(CE); |
michael@0 | 1049 | if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG |
michael@0 | 1050 | || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG |
michael@0 | 1051 | || tag == LEAD_SURROGATE_TAG) { |
michael@0 | 1052 | el.CEs[el.noOfCEs++] = CE; |
michael@0 | 1053 | } else { |
michael@0 | 1054 | */ |
michael@0 | 1055 | // It turns out that it does not make sense to keep implicits |
michael@0 | 1056 | // unresolved. The cost of resolving them is big enough so that |
michael@0 | 1057 | // it doesn't make any difference whether we have to go to the UCA |
michael@0 | 1058 | // or not. |
michael@0 | 1059 | { |
michael@0 | 1060 | uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); |
michael@0 | 1061 | while(CE != UCOL_NO_MORE_CES) { |
michael@0 | 1062 | CE = ucol_getNextCE(src->UCA, &colIt, status); |
michael@0 | 1063 | if(CE != UCOL_NO_MORE_CES) { |
michael@0 | 1064 | el.CEs[el.noOfCEs++] = CE; |
michael@0 | 1065 | } |
michael@0 | 1066 | } |
michael@0 | 1067 | } |
michael@0 | 1068 | uprv_uca_addAnElement(t, &el, status); |
michael@0 | 1069 | } |
michael@0 | 1070 | } |
michael@0 | 1071 | } |
michael@0 | 1072 | } |
michael@0 | 1073 | |
michael@0 | 1074 | U_NAMESPACE_END |
michael@0 | 1075 | |
michael@0 | 1076 | U_CFUNC UCATableHeader * |
michael@0 | 1077 | ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { |
michael@0 | 1078 | U_NAMESPACE_USE |
michael@0 | 1079 | |
michael@0 | 1080 | uint32_t i = 0; |
michael@0 | 1081 | if(U_FAILURE(*status)) { |
michael@0 | 1082 | return NULL; |
michael@0 | 1083 | } |
michael@0 | 1084 | /* |
michael@0 | 1085 | 2. Eliminate the negative lists by doing the following for each non-null negative list: |
michael@0 | 1086 | o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, |
michael@0 | 1087 | create new ListHeader X |
michael@0 | 1088 | o reverse the list, add to the end of X's positive list. Reset the strength of the |
michael@0 | 1089 | first item you add, based on the stronger strength levels of the two lists. |
michael@0 | 1090 | */ |
michael@0 | 1091 | /* |
michael@0 | 1092 | 3. For each ListHeader with a non-null positive list: |
michael@0 | 1093 | */ |
michael@0 | 1094 | /* |
michael@0 | 1095 | o Find all character strings with CEs between the baseCE and the |
michael@0 | 1096 | next/previous CE, at the strength of the first token. Add these to the |
michael@0 | 1097 | tailoring. |
michael@0 | 1098 | ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the |
michael@0 | 1099 | tailoring has & x < z... |
michael@0 | 1100 | ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... |
michael@0 | 1101 | */ |
michael@0 | 1102 | /* It is possible that this part should be done even while constructing list */ |
michael@0 | 1103 | /* The problem is that it is unknown what is going to be the strongest weight */ |
michael@0 | 1104 | /* So we might as well do it here */ |
michael@0 | 1105 | |
michael@0 | 1106 | /* |
michael@0 | 1107 | o Allocate CEs for each token in the list, based on the total number N of the |
michael@0 | 1108 | largest level difference, and the gap G between baseCE and nextCE at that |
michael@0 | 1109 | level. The relation * between the last item and nextCE is the same as the |
michael@0 | 1110 | strongest strength. |
michael@0 | 1111 | o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) |
michael@0 | 1112 | ? There are 3 primary items: a, d, e. Fit them into the primary gap. |
michael@0 | 1113 | Then fit b and c into the secondary gap between a and d, then fit q |
michael@0 | 1114 | into the tertiary gap between b and c. |
michael@0 | 1115 | |
michael@0 | 1116 | o Example: baseCE << b <<< q << c * nextCE(X,2) |
michael@0 | 1117 | ? There are 2 secondary items: b, c. Fit them into the secondary gap. |
michael@0 | 1118 | Then fit q into the tertiary gap between b and c. |
michael@0 | 1119 | o When incrementing primary values, we will not cross high byte |
michael@0 | 1120 | boundaries except where there is only a single-byte primary. That is to |
michael@0 | 1121 | ensure that the script reordering will continue to work. |
michael@0 | 1122 | */ |
michael@0 | 1123 | UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); |
michael@0 | 1124 | /* test for NULL */ |
michael@0 | 1125 | if (image == NULL) { |
michael@0 | 1126 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1127 | return NULL; |
michael@0 | 1128 | } |
michael@0 | 1129 | uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); |
michael@0 | 1130 | |
michael@0 | 1131 | for(i = 0; i<src->resultLen; i++) { |
michael@0 | 1132 | /* now we need to generate the CEs */ |
michael@0 | 1133 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ |
michael@0 | 1134 | /* According to strength */ |
michael@0 | 1135 | if(U_SUCCESS(*status)) { |
michael@0 | 1136 | if(src->lh[i].first) { // if there are any elements |
michael@0 | 1137 | // due to the way parser works, subsequent tailorings |
michael@0 | 1138 | // may remove all the elements from a sequence, therefore |
michael@0 | 1139 | // leaving an empty tailoring sequence. |
michael@0 | 1140 | ucol_initBuffers(src, &src->lh[i], status); |
michael@0 | 1141 | } |
michael@0 | 1142 | } |
michael@0 | 1143 | if(U_FAILURE(*status)) { |
michael@0 | 1144 | uprv_free(image); |
michael@0 | 1145 | return NULL; |
michael@0 | 1146 | } |
michael@0 | 1147 | } |
michael@0 | 1148 | |
michael@0 | 1149 | if(src->varTop != NULL) { /* stuff the variable top value */ |
michael@0 | 1150 | src->opts->variableTopValue = (*(src->varTop->CEs))>>16; |
michael@0 | 1151 | /* remove it from the list */ |
michael@0 | 1152 | if(src->varTop->listHeader->first == src->varTop) { /* first in list */ |
michael@0 | 1153 | src->varTop->listHeader->first = src->varTop->next; |
michael@0 | 1154 | } |
michael@0 | 1155 | if(src->varTop->listHeader->last == src->varTop) { /* first in list */ |
michael@0 | 1156 | src->varTop->listHeader->last = src->varTop->previous; |
michael@0 | 1157 | } |
michael@0 | 1158 | if(src->varTop->next != NULL) { |
michael@0 | 1159 | src->varTop->next->previous = src->varTop->previous; |
michael@0 | 1160 | } |
michael@0 | 1161 | if(src->varTop->previous != NULL) { |
michael@0 | 1162 | src->varTop->previous->next = src->varTop->next; |
michael@0 | 1163 | } |
michael@0 | 1164 | } |
michael@0 | 1165 | |
michael@0 | 1166 | |
michael@0 | 1167 | tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); |
michael@0 | 1168 | if(U_FAILURE(*status)) { |
michael@0 | 1169 | uprv_free(image); |
michael@0 | 1170 | return NULL; |
michael@0 | 1171 | } |
michael@0 | 1172 | |
michael@0 | 1173 | |
michael@0 | 1174 | /* After this, we have assigned CE values to all regular CEs */ |
michael@0 | 1175 | /* now we will go through list once more and resolve expansions, */ |
michael@0 | 1176 | /* make UCAElements structs and add them to table */ |
michael@0 | 1177 | for(i = 0; i<src->resultLen; i++) { |
michael@0 | 1178 | /* now we need to generate the CEs */ |
michael@0 | 1179 | /* We stuff the initial value in the buffers, and increase the appropriate buffer */ |
michael@0 | 1180 | /* According to strength */ |
michael@0 | 1181 | if(U_SUCCESS(*status)) { |
michael@0 | 1182 | ucol_createElements(src, t, &src->lh[i], status); |
michael@0 | 1183 | } |
michael@0 | 1184 | } |
michael@0 | 1185 | |
michael@0 | 1186 | UCAElements el; |
michael@0 | 1187 | el.isThai = FALSE; |
michael@0 | 1188 | el.prefixSize = 0; |
michael@0 | 1189 | el.prefixChars[0] = 0; |
michael@0 | 1190 | |
michael@0 | 1191 | /* add latin-1 stuff */ |
michael@0 | 1192 | ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); |
michael@0 | 1193 | |
michael@0 | 1194 | /* add stuff for copying */ |
michael@0 | 1195 | if(src->copySet != NULL) { |
michael@0 | 1196 | int32_t i = 0; |
michael@0 | 1197 | UnicodeSet *set = (UnicodeSet *)src->copySet; |
michael@0 | 1198 | for(i = 0; i < set->getRangeCount(); i++) { |
michael@0 | 1199 | ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); |
michael@0 | 1200 | } |
michael@0 | 1201 | } |
michael@0 | 1202 | |
michael@0 | 1203 | if(U_SUCCESS(*status)) { |
michael@0 | 1204 | /* copy contractions from the UCA - this is felt mostly for cyrillic*/ |
michael@0 | 1205 | |
michael@0 | 1206 | uint32_t tailoredCE = UCOL_NOT_FOUND; |
michael@0 | 1207 | UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); |
michael@0 | 1208 | int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; |
michael@0 | 1209 | UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); |
michael@0 | 1210 | // Check for null pointer |
michael@0 | 1211 | if (ucaEl == NULL) { |
michael@0 | 1212 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1213 | return NULL; |
michael@0 | 1214 | } |
michael@0 | 1215 | while(*conts != 0) { |
michael@0 | 1216 | // A continuation is NUL-terminated and NUL-padded |
michael@0 | 1217 | // except if it has the maximum length. |
michael@0 | 1218 | int32_t contractionLength = maxUCAContractionLength; |
michael@0 | 1219 | while(contractionLength > 0 && conts[contractionLength - 1] == 0) { |
michael@0 | 1220 | --contractionLength; |
michael@0 | 1221 | } |
michael@0 | 1222 | UChar32 first; |
michael@0 | 1223 | int32_t firstLength = 0; |
michael@0 | 1224 | U16_NEXT(conts, firstLength, contractionLength, first); |
michael@0 | 1225 | tailoredCE = utrie_get32(t->mapping, first, NULL); |
michael@0 | 1226 | if(tailoredCE != UCOL_NOT_FOUND) { |
michael@0 | 1227 | UBool needToAdd = TRUE; |
michael@0 | 1228 | if(isCntTableElement(tailoredCE)) { |
michael@0 | 1229 | if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { |
michael@0 | 1230 | needToAdd = FALSE; |
michael@0 | 1231 | } |
michael@0 | 1232 | } |
michael@0 | 1233 | if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { |
michael@0 | 1234 | UCAElements elm; |
michael@0 | 1235 | elm.cPoints = el.uchars; |
michael@0 | 1236 | elm.noOfCEs = 0; |
michael@0 | 1237 | elm.uchars[0] = *conts; |
michael@0 | 1238 | elm.uchars[1] = 0; |
michael@0 | 1239 | elm.cSize = 1; |
michael@0 | 1240 | elm.prefixChars[0] = *(conts+2); |
michael@0 | 1241 | elm.isThai = FALSE; |
michael@0 | 1242 | elm.prefix = elm.prefixChars; |
michael@0 | 1243 | elm.prefixSize = 1; |
michael@0 | 1244 | UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); |
michael@0 | 1245 | if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { |
michael@0 | 1246 | needToAdd = TRUE; |
michael@0 | 1247 | } |
michael@0 | 1248 | } |
michael@0 | 1249 | if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
michael@0 | 1250 | needToAdd = FALSE; |
michael@0 | 1251 | } |
michael@0 | 1252 | |
michael@0 | 1253 | if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. |
michael@0 | 1254 | if (*(conts+1) != 0) { // contractions |
michael@0 | 1255 | el.prefix = el.prefixChars; |
michael@0 | 1256 | el.prefixSize = 0; |
michael@0 | 1257 | el.cPoints = el.uchars; |
michael@0 | 1258 | el.noOfCEs = 0; |
michael@0 | 1259 | u_memcpy(el.uchars, conts, contractionLength); |
michael@0 | 1260 | el.cSize = contractionLength; |
michael@0 | 1261 | ucol_setText(ucaEl, el.uchars, el.cSize, status); |
michael@0 | 1262 | } |
michael@0 | 1263 | else { // pre-context character |
michael@0 | 1264 | UChar str[4] = { 0 }; |
michael@0 | 1265 | int32_t len=0; |
michael@0 | 1266 | int32_t preKeyLen=0; |
michael@0 | 1267 | |
michael@0 | 1268 | el.cPoints = el.uchars; |
michael@0 | 1269 | el.noOfCEs = 0; |
michael@0 | 1270 | el.uchars[0] = *conts; |
michael@0 | 1271 | el.uchars[1] = 0; |
michael@0 | 1272 | el.cSize = 1; |
michael@0 | 1273 | el.prefixChars[0] = *(conts+2); |
michael@0 | 1274 | el.prefix = el.prefixChars; |
michael@0 | 1275 | el.prefixSize = 1; |
michael@0 | 1276 | if (el.prefixChars[0]!=0) { |
michael@0 | 1277 | // get CE of prefix character first |
michael@0 | 1278 | str[0]=el.prefixChars[0]; |
michael@0 | 1279 | str[1]=0; |
michael@0 | 1280 | ucol_setText(ucaEl, str, 1, status); |
michael@0 | 1281 | while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) |
michael@0 | 1282 | != UCOL_NULLORDER) { |
michael@0 | 1283 | preKeyLen++; // count number of keys for prefix character |
michael@0 | 1284 | } |
michael@0 | 1285 | str[len++] = el.prefixChars[0]; |
michael@0 | 1286 | } |
michael@0 | 1287 | |
michael@0 | 1288 | str[len++] = el.uchars[0]; |
michael@0 | 1289 | str[len]=0; |
michael@0 | 1290 | ucol_setText(ucaEl, str, len, status); |
michael@0 | 1291 | // Skip the keys for prefix character, then copy the rest to el. |
michael@0 | 1292 | while ((preKeyLen-->0) && |
michael@0 | 1293 | (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { |
michael@0 | 1294 | continue; |
michael@0 | 1295 | } |
michael@0 | 1296 | |
michael@0 | 1297 | } |
michael@0 | 1298 | while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { |
michael@0 | 1299 | el.noOfCEs++; |
michael@0 | 1300 | } |
michael@0 | 1301 | uprv_uca_addAnElement(t, &el, status); |
michael@0 | 1302 | } |
michael@0 | 1303 | |
michael@0 | 1304 | } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
michael@0 | 1305 | ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); |
michael@0 | 1306 | } |
michael@0 | 1307 | conts+=maxUCAContractionLength; |
michael@0 | 1308 | } |
michael@0 | 1309 | ucol_closeElements(ucaEl); |
michael@0 | 1310 | } |
michael@0 | 1311 | |
michael@0 | 1312 | // Add completely ignorable elements |
michael@0 | 1313 | utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); |
michael@0 | 1314 | |
michael@0 | 1315 | // add tailoring characters related canonical closures |
michael@0 | 1316 | uprv_uca_canonicalClosure(t, src, NULL, status); |
michael@0 | 1317 | |
michael@0 | 1318 | /* still need to produce compatibility closure */ |
michael@0 | 1319 | |
michael@0 | 1320 | UCATableHeader *myData = uprv_uca_assembleTable(t, status); |
michael@0 | 1321 | |
michael@0 | 1322 | uprv_uca_closeTempTable(t); |
michael@0 | 1323 | uprv_free(image); |
michael@0 | 1324 | |
michael@0 | 1325 | return myData; |
michael@0 | 1326 | } |
michael@0 | 1327 | |
michael@0 | 1328 | U_CDECL_BEGIN |
michael@0 | 1329 | static UBool U_CALLCONV |
michael@0 | 1330 | ucol_bld_cleanup(void) |
michael@0 | 1331 | { |
michael@0 | 1332 | udata_close(invUCA_DATA_MEM); |
michael@0 | 1333 | invUCA_DATA_MEM = NULL; |
michael@0 | 1334 | _staticInvUCA = NULL; |
michael@0 | 1335 | gStaticInvUCAInitOnce.reset(); |
michael@0 | 1336 | return TRUE; |
michael@0 | 1337 | } |
michael@0 | 1338 | U_CDECL_END |
michael@0 | 1339 | |
michael@0 | 1340 | static void U_CALLCONV initInverseUCA(UErrorCode &status) { |
michael@0 | 1341 | U_ASSERT(invUCA_DATA_MEM == NULL); |
michael@0 | 1342 | U_ASSERT(_staticInvUCA == NULL); |
michael@0 | 1343 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); |
michael@0 | 1344 | InverseUCATableHeader *newInvUCA = NULL; |
michael@0 | 1345 | UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status); |
michael@0 | 1346 | |
michael@0 | 1347 | if(U_FAILURE(status)) { |
michael@0 | 1348 | if (result) { |
michael@0 | 1349 | udata_close(result); |
michael@0 | 1350 | } |
michael@0 | 1351 | // This is not needed, as we are talking about |
michael@0 | 1352 | // memory we got from UData |
michael@0 | 1353 | //uprv_free(newInvUCA); |
michael@0 | 1354 | return; |
michael@0 | 1355 | } |
michael@0 | 1356 | |
michael@0 | 1357 | if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ |
michael@0 | 1358 | newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); |
michael@0 | 1359 | UCollator *UCA = ucol_initUCA(&status); |
michael@0 | 1360 | // UCA versions of UCA and inverse UCA should match |
michael@0 | 1361 | if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { |
michael@0 | 1362 | status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1363 | udata_close(result); |
michael@0 | 1364 | return; |
michael@0 | 1365 | } |
michael@0 | 1366 | |
michael@0 | 1367 | invUCA_DATA_MEM = result; |
michael@0 | 1368 | _staticInvUCA = newInvUCA; |
michael@0 | 1369 | } |
michael@0 | 1370 | } |
michael@0 | 1371 | |
michael@0 | 1372 | |
michael@0 | 1373 | U_CAPI const InverseUCATableHeader * U_EXPORT2 |
michael@0 | 1374 | ucol_initInverseUCA(UErrorCode *status) |
michael@0 | 1375 | { |
michael@0 | 1376 | umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); |
michael@0 | 1377 | return _staticInvUCA; |
michael@0 | 1378 | } |
michael@0 | 1379 | |
/* Names of the non-script reordering codes. These _must_ be kept in the
 * order in which they are to be applied as defaults, and in sync with the
 * UColReorderCode enum: ucol_findReorderingEntry() returns
 * UCOL_REORDER_CODE_FIRST plus the index of the matching entry.
 * Entries are upper case because lookups upper-case the name first.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT"
};
michael@0 | 1390 | |
michael@0 | 1391 | static void toUpper(const char* src, char* dst, uint32_t length) { |
michael@0 | 1392 | for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { |
michael@0 | 1393 | *dst = uprv_toupper(*src); |
michael@0 | 1394 | } |
michael@0 | 1395 | *dst = '\0'; |
michael@0 | 1396 | } |
michael@0 | 1397 | |
michael@0 | 1398 | U_INTERNAL int32_t U_EXPORT2 |
michael@0 | 1399 | ucol_findReorderingEntry(const char* name) { |
michael@0 | 1400 | char buffer[32]; |
michael@0 | 1401 | toUpper(name, buffer, 32); |
michael@0 | 1402 | for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { |
michael@0 | 1403 | if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { |
michael@0 | 1404 | return entry + UCOL_REORDER_CODE_FIRST; |
michael@0 | 1405 | } |
michael@0 | 1406 | } |
michael@0 | 1407 | return USCRIPT_INVALID_CODE; |
michael@0 | 1408 | } |
michael@0 | 1409 | |
michael@0 | 1410 | #endif /* #if !UCONFIG_NO_COLLATION */ |