Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 1996-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | * file name: ucol.cpp |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * Modification history |
michael@0 | 12 | * Date Name Comments |
michael@0 | 13 | * 1996-1999 various members of ICU team maintained C API for collation framework |
michael@0 | 14 | * 02/16/2001 synwee Added internal method getPrevSpecialCE |
michael@0 | 15 | * 03/01/2001 synwee Added maxexpansion functionality. |
michael@0 | 16 | * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant |
michael@0 | 17 | */ |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/utypes.h" |
michael@0 | 20 | |
michael@0 | 21 | #if !UCONFIG_NO_COLLATION |
michael@0 | 22 | |
michael@0 | 23 | #include "unicode/bytestream.h" |
michael@0 | 24 | #include "unicode/coleitr.h" |
michael@0 | 25 | #include "unicode/unorm.h" |
michael@0 | 26 | #include "unicode/udata.h" |
michael@0 | 27 | #include "unicode/ustring.h" |
michael@0 | 28 | #include "unicode/utf8.h" |
michael@0 | 29 | |
michael@0 | 30 | #include "ucol_imp.h" |
michael@0 | 31 | #include "bocsu.h" |
michael@0 | 32 | |
michael@0 | 33 | #include "normalizer2impl.h" |
michael@0 | 34 | #include "unorm_it.h" |
michael@0 | 35 | #include "umutex.h" |
michael@0 | 36 | #include "cmemory.h" |
michael@0 | 37 | #include "ucln_in.h" |
michael@0 | 38 | #include "cstring.h" |
michael@0 | 39 | #include "utracimp.h" |
michael@0 | 40 | #include "putilimp.h" |
michael@0 | 41 | #include "uassert.h" |
michael@0 | 42 | #include "unicode/coll.h" |
michael@0 | 43 | |
michael@0 | 44 | #ifdef UCOL_DEBUG |
michael@0 | 45 | #include <stdio.h> |
michael@0 | 46 | #endif |
michael@0 | 47 | |
michael@0 | 48 | U_NAMESPACE_USE |
michael@0 | 49 | |
/* Element count of a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8

#define ZERO_CC_LIMIT_ 0xC0

// These are static pointers to the NFC/NFD implementation instances.
// Each of them is always the same between calls to u_cleanup
// and therefore writing to it is not synchronized.
// They are reset to NULL in ucol_cleanup.
static const Normalizer2 *g_nfd = NULL;
static const Normalizer2Impl *g_nfcImpl = NULL;

// These are values from the UCA required for
// implicit generation and suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without the UCA, it could be a problem.
static const int32_t maxRegularPrimary = 0x7A;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
michael@0 | 71 | |
michael@0 | 72 | U_CDECL_BEGIN |
michael@0 | 73 | static UBool U_CALLCONV |
michael@0 | 74 | ucol_cleanup(void) |
michael@0 | 75 | { |
michael@0 | 76 | g_nfd = NULL; |
michael@0 | 77 | g_nfcImpl = NULL; |
michael@0 | 78 | return TRUE; |
michael@0 | 79 | } |
michael@0 | 80 | |
michael@0 | 81 | static int32_t U_CALLCONV |
michael@0 | 82 | _getFoldingOffset(uint32_t data) { |
michael@0 | 83 | return (int32_t)(data&0xFFFFFF); |
michael@0 | 84 | } |
michael@0 | 85 | |
michael@0 | 86 | U_CDECL_END |
michael@0 | 87 | |
michael@0 | 88 | static inline |
michael@0 | 89 | UBool initializeNFD(UErrorCode *status) { |
michael@0 | 90 | if (g_nfd != NULL) { |
michael@0 | 91 | return TRUE; |
michael@0 | 92 | } else { |
michael@0 | 93 | // The result is constant, until the library is reloaded. |
michael@0 | 94 | g_nfd = Normalizer2Factory::getNFDInstance(*status); |
michael@0 | 95 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
michael@0 | 96 | return U_SUCCESS(*status); |
michael@0 | 97 | } |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | // init FCD data |
michael@0 | 101 | static inline |
michael@0 | 102 | UBool initializeFCD(UErrorCode *status) { |
michael@0 | 103 | if (g_nfcImpl != NULL) { |
michael@0 | 104 | return TRUE; |
michael@0 | 105 | } else { |
michael@0 | 106 | // The result is constant, until the library is reloaded. |
michael@0 | 107 | g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); |
michael@0 | 108 | // Note: Alternatively, we could also store this pointer in each collIterate struct, |
michael@0 | 109 | // same as Normalizer2Factory::getImpl(collIterate->nfd). |
michael@0 | 110 | ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
michael@0 | 111 | return U_SUCCESS(*status); |
michael@0 | 112 | } |
michael@0 | 113 | } |
michael@0 | 114 | |
michael@0 | 115 | static |
michael@0 | 116 | inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, |
michael@0 | 117 | int32_t sourceLen, collIterate *s, |
michael@0 | 118 | UErrorCode *status) |
michael@0 | 119 | { |
michael@0 | 120 | (s)->string = (s)->pos = sourceString; |
michael@0 | 121 | (s)->origFlags = 0; |
michael@0 | 122 | (s)->flags = 0; |
michael@0 | 123 | if (sourceLen >= 0) { |
michael@0 | 124 | s->flags |= UCOL_ITER_HASLEN; |
michael@0 | 125 | (s)->endp = (UChar *)sourceString+sourceLen; |
michael@0 | 126 | } |
michael@0 | 127 | else { |
michael@0 | 128 | /* change to enable easier checking for end of string for fcdpositon */ |
michael@0 | 129 | (s)->endp = NULL; |
michael@0 | 130 | } |
michael@0 | 131 | (s)->extendCEs = NULL; |
michael@0 | 132 | (s)->extendCEsSize = 0; |
michael@0 | 133 | (s)->CEpos = (s)->toReturn = (s)->CEs; |
michael@0 | 134 | (s)->offsetBuffer = NULL; |
michael@0 | 135 | (s)->offsetBufferSize = 0; |
michael@0 | 136 | (s)->offsetReturn = (s)->offsetStore = NULL; |
michael@0 | 137 | (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; |
michael@0 | 138 | (s)->coll = (collator); |
michael@0 | 139 | if (initializeNFD(status)) { |
michael@0 | 140 | (s)->nfd = g_nfd; |
michael@0 | 141 | } else { |
michael@0 | 142 | return; |
michael@0 | 143 | } |
michael@0 | 144 | (s)->fcdPosition = 0; |
michael@0 | 145 | if(collator->normalizationMode == UCOL_ON) { |
michael@0 | 146 | (s)->flags |= UCOL_ITER_NORM; |
michael@0 | 147 | } |
michael@0 | 148 | if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { |
michael@0 | 149 | (s)->flags |= UCOL_HIRAGANA_Q; |
michael@0 | 150 | } |
michael@0 | 151 | (s)->iterator = NULL; |
michael@0 | 152 | //(s)->iteratorIndex = 0; |
michael@0 | 153 | } |
michael@0 | 154 | |
michael@0 | 155 | U_CAPI void U_EXPORT2 |
michael@0 | 156 | uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, |
michael@0 | 157 | int32_t sourceLen, collIterate *s, |
michael@0 | 158 | UErrorCode *status) { |
michael@0 | 159 | /* Out-of-line version for use from other files. */ |
michael@0 | 160 | IInit_collIterate(collator, sourceString, sourceLen, s, status); |
michael@0 | 161 | } |
michael@0 | 162 | |
michael@0 | 163 | U_CAPI collIterate * U_EXPORT2 |
michael@0 | 164 | uprv_new_collIterate(UErrorCode *status) { |
michael@0 | 165 | if(U_FAILURE(*status)) { |
michael@0 | 166 | return NULL; |
michael@0 | 167 | } |
michael@0 | 168 | collIterate *s = new collIterate; |
michael@0 | 169 | if(s == NULL) { |
michael@0 | 170 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 171 | return NULL; |
michael@0 | 172 | } |
michael@0 | 173 | return s; |
michael@0 | 174 | } |
michael@0 | 175 | |
michael@0 | 176 | U_CAPI void U_EXPORT2 |
michael@0 | 177 | uprv_delete_collIterate(collIterate *s) { |
michael@0 | 178 | delete s; |
michael@0 | 179 | } |
michael@0 | 180 | |
michael@0 | 181 | U_CAPI UBool U_EXPORT2 |
michael@0 | 182 | uprv_collIterateAtEnd(collIterate *s) { |
michael@0 | 183 | return s == NULL || s->pos == s->endp; |
michael@0 | 184 | } |
michael@0 | 185 | |
michael@0 | 186 | /** |
michael@0 | 187 | * Backup the state of the collIterate struct data |
michael@0 | 188 | * @param data collIterate to backup |
michael@0 | 189 | * @param backup storage |
michael@0 | 190 | */ |
michael@0 | 191 | static |
michael@0 | 192 | inline void backupState(const collIterate *data, collIterateState *backup) |
michael@0 | 193 | { |
michael@0 | 194 | backup->fcdPosition = data->fcdPosition; |
michael@0 | 195 | backup->flags = data->flags; |
michael@0 | 196 | backup->origFlags = data->origFlags; |
michael@0 | 197 | backup->pos = data->pos; |
michael@0 | 198 | backup->bufferaddress = data->writableBuffer.getBuffer(); |
michael@0 | 199 | backup->buffersize = data->writableBuffer.length(); |
michael@0 | 200 | backup->iteratorMove = 0; |
michael@0 | 201 | backup->iteratorIndex = 0; |
michael@0 | 202 | if(data->iterator != NULL) { |
michael@0 | 203 | //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); |
michael@0 | 204 | backup->iteratorIndex = data->iterator->getState(data->iterator); |
michael@0 | 205 | // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE |
michael@0 | 206 | if(backup->iteratorIndex == UITER_NO_STATE) { |
michael@0 | 207 | while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { |
michael@0 | 208 | backup->iteratorMove++; |
michael@0 | 209 | data->iterator->move(data->iterator, -1, UITER_CURRENT); |
michael@0 | 210 | } |
michael@0 | 211 | data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
michael@0 | 212 | } |
michael@0 | 213 | } |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | /** |
michael@0 | 217 | * Loads the state into the collIterate struct data |
michael@0 | 218 | * @param data collIterate to backup |
michael@0 | 219 | * @param backup storage |
michael@0 | 220 | * @param forwards boolean to indicate if forwards iteration is used, |
michael@0 | 221 | * false indicates backwards iteration |
michael@0 | 222 | */ |
michael@0 | 223 | static |
michael@0 | 224 | inline void loadState(collIterate *data, const collIterateState *backup, |
michael@0 | 225 | UBool forwards) |
michael@0 | 226 | { |
michael@0 | 227 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 228 | data->flags = backup->flags; |
michael@0 | 229 | data->origFlags = backup->origFlags; |
michael@0 | 230 | if(data->iterator != NULL) { |
michael@0 | 231 | //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); |
michael@0 | 232 | data->iterator->setState(data->iterator, backup->iteratorIndex, &status); |
michael@0 | 233 | if(backup->iteratorMove != 0) { |
michael@0 | 234 | data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
michael@0 | 235 | } |
michael@0 | 236 | } |
michael@0 | 237 | data->pos = backup->pos; |
michael@0 | 238 | |
michael@0 | 239 | if ((data->flags & UCOL_ITER_INNORMBUF) && |
michael@0 | 240 | data->writableBuffer.getBuffer() != backup->bufferaddress) { |
michael@0 | 241 | /* |
michael@0 | 242 | this is when a new buffer has been reallocated and we'll have to |
michael@0 | 243 | calculate the new position. |
michael@0 | 244 | note the new buffer has to contain the contents of the old buffer. |
michael@0 | 245 | */ |
michael@0 | 246 | if (forwards) { |
michael@0 | 247 | data->pos = data->writableBuffer.getTerminatedBuffer() + |
michael@0 | 248 | (data->pos - backup->bufferaddress); |
michael@0 | 249 | } |
michael@0 | 250 | else { |
michael@0 | 251 | /* backwards direction */ |
michael@0 | 252 | int32_t temp = backup->buffersize - |
michael@0 | 253 | (int32_t)(data->pos - backup->bufferaddress); |
michael@0 | 254 | data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); |
michael@0 | 255 | } |
michael@0 | 256 | } |
michael@0 | 257 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
michael@0 | 258 | /* |
michael@0 | 259 | this is alittle tricky. |
michael@0 | 260 | if we are initially not in the normalization buffer, even if we |
michael@0 | 261 | normalize in the later stage, the data in the buffer will be |
michael@0 | 262 | ignored, since we skip back up to the data string. |
michael@0 | 263 | however if we are already in the normalization buffer, any |
michael@0 | 264 | further normalization will pull data into the normalization |
michael@0 | 265 | buffer and modify the fcdPosition. |
michael@0 | 266 | since we are keeping the data in the buffer for use, the |
michael@0 | 267 | fcdPosition can not be reverted back. |
michael@0 | 268 | arrgghh.... |
michael@0 | 269 | */ |
michael@0 | 270 | data->fcdPosition = backup->fcdPosition; |
michael@0 | 271 | } |
michael@0 | 272 | } |
michael@0 | 273 | |
michael@0 | 274 | static UBool |
michael@0 | 275 | reallocCEs(collIterate *data, int32_t newCapacity) { |
michael@0 | 276 | uint32_t *oldCEs = data->extendCEs; |
michael@0 | 277 | if(oldCEs == NULL) { |
michael@0 | 278 | oldCEs = data->CEs; |
michael@0 | 279 | } |
michael@0 | 280 | int32_t length = data->CEpos - oldCEs; |
michael@0 | 281 | uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); |
michael@0 | 282 | if(newCEs == NULL) { |
michael@0 | 283 | return FALSE; |
michael@0 | 284 | } |
michael@0 | 285 | uprv_memcpy(newCEs, oldCEs, length * 4); |
michael@0 | 286 | uprv_free(data->extendCEs); |
michael@0 | 287 | data->extendCEs = newCEs; |
michael@0 | 288 | data->extendCEsSize = newCapacity; |
michael@0 | 289 | data->CEpos = newCEs + length; |
michael@0 | 290 | return TRUE; |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | static UBool |
michael@0 | 294 | increaseCEsCapacity(collIterate *data) { |
michael@0 | 295 | int32_t oldCapacity; |
michael@0 | 296 | if(data->extendCEs != NULL) { |
michael@0 | 297 | oldCapacity = data->extendCEsSize; |
michael@0 | 298 | } else { |
michael@0 | 299 | oldCapacity = LENGTHOF(data->CEs); |
michael@0 | 300 | } |
michael@0 | 301 | return reallocCEs(data, 2 * oldCapacity); |
michael@0 | 302 | } |
michael@0 | 303 | |
michael@0 | 304 | static UBool |
michael@0 | 305 | ensureCEsCapacity(collIterate *data, int32_t minCapacity) { |
michael@0 | 306 | int32_t oldCapacity; |
michael@0 | 307 | if(data->extendCEs != NULL) { |
michael@0 | 308 | oldCapacity = data->extendCEsSize; |
michael@0 | 309 | } else { |
michael@0 | 310 | oldCapacity = LENGTHOF(data->CEs); |
michael@0 | 311 | } |
michael@0 | 312 | if(minCapacity <= oldCapacity) { |
michael@0 | 313 | return TRUE; |
michael@0 | 314 | } |
michael@0 | 315 | oldCapacity *= 2; |
michael@0 | 316 | return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); |
michael@0 | 317 | } |
michael@0 | 318 | |
michael@0 | 319 | void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { |
michael@0 | 320 | if(U_FAILURE(errorCode)) { |
michael@0 | 321 | return; |
michael@0 | 322 | } |
michael@0 | 323 | int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer); |
michael@0 | 324 | U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); |
michael@0 | 325 | if(length >= offsetBufferSize) { |
michael@0 | 326 | int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; |
michael@0 | 327 | int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)); |
michael@0 | 328 | if(newBuffer == NULL) { |
michael@0 | 329 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 330 | return; |
michael@0 | 331 | } |
michael@0 | 332 | if(length > 0) { |
michael@0 | 333 | uprv_memcpy(newBuffer, offsetBuffer, length * 4); |
michael@0 | 334 | } |
michael@0 | 335 | uprv_free(offsetBuffer); |
michael@0 | 336 | offsetBuffer = newBuffer; |
michael@0 | 337 | offsetStore = offsetBuffer + length; |
michael@0 | 338 | offsetBufferSize = newCapacity; |
michael@0 | 339 | } |
michael@0 | 340 | *offsetStore++ = offset; |
michael@0 | 341 | } |
michael@0 | 342 | |
michael@0 | 343 | /* |
michael@0 | 344 | * collIter_eos() |
michael@0 | 345 | * Checks for a collIterate being positioned at the end of |
michael@0 | 346 | * its source string. |
michael@0 | 347 | * |
michael@0 | 348 | */ |
michael@0 | 349 | static |
michael@0 | 350 | inline UBool collIter_eos(collIterate *s) { |
michael@0 | 351 | if(s->flags & UCOL_USE_ITERATOR) { |
michael@0 | 352 | return !(s->iterator->hasNext(s->iterator)); |
michael@0 | 353 | } |
michael@0 | 354 | if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { |
michael@0 | 355 | // Null terminated string, but not at null, so not at end. |
michael@0 | 356 | // Whether in main or normalization buffer doesn't matter. |
michael@0 | 357 | return FALSE; |
michael@0 | 358 | } |
michael@0 | 359 | |
michael@0 | 360 | // String with length. Can't be in normalization buffer, which is always |
michael@0 | 361 | // null termintated. |
michael@0 | 362 | if (s->flags & UCOL_ITER_HASLEN) { |
michael@0 | 363 | return (s->pos == s->endp); |
michael@0 | 364 | } |
michael@0 | 365 | |
michael@0 | 366 | // We are at a null termination, could be either normalization buffer or main string. |
michael@0 | 367 | if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { |
michael@0 | 368 | // At null at end of main string. |
michael@0 | 369 | return TRUE; |
michael@0 | 370 | } |
michael@0 | 371 | |
michael@0 | 372 | // At null at end of normalization buffer. Need to check whether there there are |
michael@0 | 373 | // any characters left in the main buffer. |
michael@0 | 374 | if(s->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 375 | return !(s->iterator->hasNext(s->iterator)); |
michael@0 | 376 | } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { |
michael@0 | 377 | // Null terminated main string. fcdPosition is the 'return' position into main buf. |
michael@0 | 378 | return (*s->fcdPosition == 0); |
michael@0 | 379 | } |
michael@0 | 380 | else { |
michael@0 | 381 | // Main string with an end pointer. |
michael@0 | 382 | return s->fcdPosition == s->endp; |
michael@0 | 383 | } |
michael@0 | 384 | } |
michael@0 | 385 | |
michael@0 | 386 | /* |
michael@0 | 387 | * collIter_bos() |
michael@0 | 388 | * Checks for a collIterate being positioned at the start of |
michael@0 | 389 | * its source string. |
michael@0 | 390 | * |
michael@0 | 391 | */ |
michael@0 | 392 | static |
michael@0 | 393 | inline UBool collIter_bos(collIterate *source) { |
michael@0 | 394 | // if we're going backwards, we need to know whether there is more in the |
michael@0 | 395 | // iterator, even if we are in the side buffer |
michael@0 | 396 | if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 397 | return !source->iterator->hasPrevious(source->iterator); |
michael@0 | 398 | } |
michael@0 | 399 | if (source->pos <= source->string || |
michael@0 | 400 | ((source->flags & UCOL_ITER_INNORMBUF) && |
michael@0 | 401 | *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { |
michael@0 | 402 | return TRUE; |
michael@0 | 403 | } |
michael@0 | 404 | return FALSE; |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | /*static |
michael@0 | 408 | inline UBool collIter_SimpleBos(collIterate *source) { |
michael@0 | 409 | // if we're going backwards, we need to know whether there is more in the |
michael@0 | 410 | // iterator, even if we are in the side buffer |
michael@0 | 411 | if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 412 | return !source->iterator->hasPrevious(source->iterator); |
michael@0 | 413 | } |
michael@0 | 414 | if (source->pos == source->string) { |
michael@0 | 415 | return TRUE; |
michael@0 | 416 | } |
michael@0 | 417 | return FALSE; |
michael@0 | 418 | }*/ |
michael@0 | 419 | //return (data->pos == data->string) || |
michael@0 | 420 | |
michael@0 | 421 | |
michael@0 | 422 | /****************************************************************************/ |
michael@0 | 423 | /* Following are the open/close functions */ |
michael@0 | 424 | /* */ |
michael@0 | 425 | /****************************************************************************/ |
michael@0 | 426 | |
michael@0 | 427 | static UCollator* |
michael@0 | 428 | ucol_initFromBinary(const uint8_t *bin, int32_t length, |
michael@0 | 429 | const UCollator *base, |
michael@0 | 430 | UCollator *fillIn, |
michael@0 | 431 | UErrorCode *status) |
michael@0 | 432 | { |
michael@0 | 433 | UCollator *result = fillIn; |
michael@0 | 434 | if(U_FAILURE(*status)) { |
michael@0 | 435 | return NULL; |
michael@0 | 436 | } |
michael@0 | 437 | /* |
michael@0 | 438 | if(base == NULL) { |
michael@0 | 439 | // we don't support null base yet |
michael@0 | 440 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 441 | return NULL; |
michael@0 | 442 | } |
michael@0 | 443 | */ |
michael@0 | 444 | // We need these and we could be running without UCA |
michael@0 | 445 | uprv_uca_initImplicitConstants(status); |
michael@0 | 446 | UCATableHeader *colData = (UCATableHeader *)bin; |
michael@0 | 447 | // do we want version check here? We're trying to figure out whether collators are compatible |
michael@0 | 448 | if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || |
michael@0 | 449 | uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || |
michael@0 | 450 | colData->version[0] != UCOL_BUILDER_VERSION) |
michael@0 | 451 | { |
michael@0 | 452 | *status = U_COLLATOR_VERSION_MISMATCH; |
michael@0 | 453 | return NULL; |
michael@0 | 454 | } |
michael@0 | 455 | else { |
michael@0 | 456 | if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { |
michael@0 | 457 | result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); |
michael@0 | 458 | if(U_FAILURE(*status)){ |
michael@0 | 459 | return NULL; |
michael@0 | 460 | } |
michael@0 | 461 | result->hasRealData = TRUE; |
michael@0 | 462 | } |
michael@0 | 463 | else { |
michael@0 | 464 | if(base) { |
michael@0 | 465 | result = ucol_initCollator(base->image, result, base, status); |
michael@0 | 466 | ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); |
michael@0 | 467 | if(U_FAILURE(*status)){ |
michael@0 | 468 | return NULL; |
michael@0 | 469 | } |
michael@0 | 470 | result->hasRealData = FALSE; |
michael@0 | 471 | } |
michael@0 | 472 | else { |
michael@0 | 473 | *status = U_USELESS_COLLATOR_ERROR; |
michael@0 | 474 | return NULL; |
michael@0 | 475 | } |
michael@0 | 476 | } |
michael@0 | 477 | result->freeImageOnClose = FALSE; |
michael@0 | 478 | } |
michael@0 | 479 | result->actualLocale = NULL; |
michael@0 | 480 | result->validLocale = NULL; |
michael@0 | 481 | result->requestedLocale = NULL; |
michael@0 | 482 | result->rules = NULL; |
michael@0 | 483 | result->rulesLength = 0; |
michael@0 | 484 | result->freeRulesOnClose = FALSE; |
michael@0 | 485 | result->ucaRules = NULL; |
michael@0 | 486 | return result; |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | U_CAPI UCollator* U_EXPORT2 |
michael@0 | 490 | ucol_openBinary(const uint8_t *bin, int32_t length, |
michael@0 | 491 | const UCollator *base, |
michael@0 | 492 | UErrorCode *status) |
michael@0 | 493 | { |
michael@0 | 494 | return ucol_initFromBinary(bin, length, base, NULL, status); |
michael@0 | 495 | } |
michael@0 | 496 | |
michael@0 | 497 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 498 | ucol_cloneBinary(const UCollator *coll, |
michael@0 | 499 | uint8_t *buffer, int32_t capacity, |
michael@0 | 500 | UErrorCode *status) |
michael@0 | 501 | { |
michael@0 | 502 | int32_t length = 0; |
michael@0 | 503 | if(U_FAILURE(*status)) { |
michael@0 | 504 | return length; |
michael@0 | 505 | } |
michael@0 | 506 | if(capacity < 0) { |
michael@0 | 507 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 508 | return length; |
michael@0 | 509 | } |
michael@0 | 510 | if(coll->hasRealData == TRUE) { |
michael@0 | 511 | length = coll->image->size; |
michael@0 | 512 | if(length <= capacity) { |
michael@0 | 513 | uprv_memcpy(buffer, coll->image, length); |
michael@0 | 514 | } else { |
michael@0 | 515 | *status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 516 | } |
michael@0 | 517 | } else { |
michael@0 | 518 | length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); |
michael@0 | 519 | if(length <= capacity) { |
michael@0 | 520 | /* build the UCATableHeader with minimal entries */ |
michael@0 | 521 | /* do not copy the header from the UCA file because its values are wrong! */ |
michael@0 | 522 | /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
michael@0 | 523 | |
michael@0 | 524 | /* reset everything */ |
michael@0 | 525 | uprv_memset(buffer, 0, length); |
michael@0 | 526 | |
michael@0 | 527 | /* set the tailoring-specific values */ |
michael@0 | 528 | UCATableHeader *myData = (UCATableHeader *)buffer; |
michael@0 | 529 | myData->size = length; |
michael@0 | 530 | |
michael@0 | 531 | /* offset for the options, the only part of the data that is present after the header */ |
michael@0 | 532 | myData->options = sizeof(UCATableHeader); |
michael@0 | 533 | |
michael@0 | 534 | /* need to always set the expansion value for an upper bound of the options */ |
michael@0 | 535 | myData->expansion = myData->options + sizeof(UColOptionSet); |
michael@0 | 536 | |
michael@0 | 537 | myData->magic = UCOL_HEADER_MAGIC; |
michael@0 | 538 | myData->isBigEndian = U_IS_BIG_ENDIAN; |
michael@0 | 539 | myData->charSetFamily = U_CHARSET_FAMILY; |
michael@0 | 540 | |
michael@0 | 541 | /* copy UCA's version; genrb will override all but the builder version with tailoring data */ |
michael@0 | 542 | uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); |
michael@0 | 543 | |
michael@0 | 544 | uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); |
michael@0 | 545 | uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); |
michael@0 | 546 | uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); |
michael@0 | 547 | myData->jamoSpecial = coll->image->jamoSpecial; |
michael@0 | 548 | |
michael@0 | 549 | /* copy the collator options */ |
michael@0 | 550 | uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); |
michael@0 | 551 | } else { |
michael@0 | 552 | *status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 553 | } |
michael@0 | 554 | } |
michael@0 | 555 | return length; |
michael@0 | 556 | } |
michael@0 | 557 | |
michael@0 | 558 | U_CAPI UCollator* U_EXPORT2 |
michael@0 | 559 | ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) |
michael@0 | 560 | { |
michael@0 | 561 | UCollator * localCollator; |
michael@0 | 562 | int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); |
michael@0 | 563 | int32_t imageSize = 0; |
michael@0 | 564 | int32_t rulesSize = 0; |
michael@0 | 565 | int32_t rulesPadding = 0; |
michael@0 | 566 | int32_t defaultReorderCodesSize = 0; |
michael@0 | 567 | int32_t reorderCodesSize = 0; |
michael@0 | 568 | uint8_t *image; |
michael@0 | 569 | UChar *rules; |
michael@0 | 570 | int32_t* defaultReorderCodes; |
michael@0 | 571 | int32_t* reorderCodes; |
michael@0 | 572 | uint8_t* leadBytePermutationTable; |
michael@0 | 573 | UBool imageAllocated = FALSE; |
michael@0 | 574 | |
michael@0 | 575 | if (status == NULL || U_FAILURE(*status)){ |
michael@0 | 576 | return NULL; |
michael@0 | 577 | } |
michael@0 | 578 | if (coll == NULL) { |
michael@0 | 579 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 580 | return NULL; |
michael@0 | 581 | } |
michael@0 | 582 | |
michael@0 | 583 | if (coll->rules && coll->freeRulesOnClose) { |
michael@0 | 584 | rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); |
michael@0 | 585 | rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); |
michael@0 | 586 | bufferSizeNeeded += rulesSize + rulesPadding; |
michael@0 | 587 | } |
michael@0 | 588 | // no padding for alignment needed from here since the next two are 4 byte quantities |
michael@0 | 589 | if (coll->defaultReorderCodes) { |
michael@0 | 590 | defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t); |
michael@0 | 591 | bufferSizeNeeded += defaultReorderCodesSize; |
michael@0 | 592 | } |
michael@0 | 593 | if (coll->reorderCodes) { |
michael@0 | 594 | reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); |
michael@0 | 595 | bufferSizeNeeded += reorderCodesSize; |
michael@0 | 596 | } |
michael@0 | 597 | if (coll->leadBytePermutationTable) { |
michael@0 | 598 | bufferSizeNeeded += 256 * sizeof(uint8_t); |
michael@0 | 599 | } |
michael@0 | 600 | |
michael@0 | 601 | if (pBufferSize != NULL) { |
michael@0 | 602 | int32_t inputSize = *pBufferSize; |
michael@0 | 603 | *pBufferSize = 1; |
michael@0 | 604 | if (inputSize == 0) { |
michael@0 | 605 | return NULL; // preflighting for deprecated functionality |
michael@0 | 606 | } |
michael@0 | 607 | } |
michael@0 | 608 | |
michael@0 | 609 | char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); |
michael@0 | 610 | // Null pointer check. |
michael@0 | 611 | if (stackBufferChars == NULL) { |
michael@0 | 612 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 613 | return NULL; |
michael@0 | 614 | } |
michael@0 | 615 | *status = U_SAFECLONE_ALLOCATED_WARNING; |
michael@0 | 616 | |
michael@0 | 617 | localCollator = (UCollator *)stackBufferChars; |
michael@0 | 618 | rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); |
michael@0 | 619 | defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); |
michael@0 | 620 | reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize); |
michael@0 | 621 | leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; |
michael@0 | 622 | |
michael@0 | 623 | { |
michael@0 | 624 | UErrorCode tempStatus = U_ZERO_ERROR; |
michael@0 | 625 | imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); |
michael@0 | 626 | } |
michael@0 | 627 | if (coll->freeImageOnClose) { |
michael@0 | 628 | image = (uint8_t *)uprv_malloc(imageSize); |
michael@0 | 629 | // Null pointer check |
michael@0 | 630 | if (image == NULL) { |
michael@0 | 631 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 632 | return NULL; |
michael@0 | 633 | } |
michael@0 | 634 | ucol_cloneBinary(coll, image, imageSize, status); |
michael@0 | 635 | imageAllocated = TRUE; |
michael@0 | 636 | } |
michael@0 | 637 | else { |
michael@0 | 638 | image = (uint8_t *)coll->image; |
michael@0 | 639 | } |
michael@0 | 640 | localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); |
michael@0 | 641 | if (U_FAILURE(*status)) { |
michael@0 | 642 | return NULL; |
michael@0 | 643 | } |
michael@0 | 644 | |
michael@0 | 645 | if (coll->rules) { |
michael@0 | 646 | if (coll->freeRulesOnClose) { |
michael@0 | 647 | localCollator->rules = u_strcpy(rules, coll->rules); |
michael@0 | 648 | //bufferEnd += rulesSize; |
michael@0 | 649 | } |
michael@0 | 650 | else { |
michael@0 | 651 | localCollator->rules = coll->rules; |
michael@0 | 652 | } |
michael@0 | 653 | localCollator->freeRulesOnClose = FALSE; |
michael@0 | 654 | localCollator->rulesLength = coll->rulesLength; |
michael@0 | 655 | } |
michael@0 | 656 | |
michael@0 | 657 | // collator reordering |
michael@0 | 658 | if (coll->defaultReorderCodes) { |
michael@0 | 659 | localCollator->defaultReorderCodes = |
michael@0 | 660 | (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); |
michael@0 | 661 | localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength; |
michael@0 | 662 | localCollator->freeDefaultReorderCodesOnClose = FALSE; |
michael@0 | 663 | } |
michael@0 | 664 | if (coll->reorderCodes) { |
michael@0 | 665 | localCollator->reorderCodes = |
michael@0 | 666 | (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); |
michael@0 | 667 | localCollator->reorderCodesLength = coll->reorderCodesLength; |
michael@0 | 668 | localCollator->freeReorderCodesOnClose = FALSE; |
michael@0 | 669 | } |
michael@0 | 670 | if (coll->leadBytePermutationTable) { |
michael@0 | 671 | localCollator->leadBytePermutationTable = |
michael@0 | 672 | (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256); |
michael@0 | 673 | localCollator->freeLeadBytePermutationTableOnClose = FALSE; |
michael@0 | 674 | } |
michael@0 | 675 | |
michael@0 | 676 | int32_t i; |
michael@0 | 677 | for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { |
michael@0 | 678 | ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); |
michael@0 | 679 | } |
michael@0 | 680 | // zero copies of pointers |
michael@0 | 681 | localCollator->actualLocale = NULL; |
michael@0 | 682 | localCollator->validLocale = NULL; |
michael@0 | 683 | localCollator->requestedLocale = NULL; |
michael@0 | 684 | localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. |
michael@0 | 685 | localCollator->freeOnClose = TRUE; |
michael@0 | 686 | localCollator->freeImageOnClose = imageAllocated; |
michael@0 | 687 | return localCollator; |
michael@0 | 688 | } |
michael@0 | 689 | |
michael@0 | 690 | U_CAPI void U_EXPORT2 |
michael@0 | 691 | ucol_close(UCollator *coll) |
michael@0 | 692 | { |
michael@0 | 693 | UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
michael@0 | 694 | UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
michael@0 | 695 | if(coll != NULL) { |
michael@0 | 696 | // these are always owned by each UCollator struct, |
michael@0 | 697 | // so we always free them |
michael@0 | 698 | if(coll->validLocale != NULL) { |
michael@0 | 699 | uprv_free(coll->validLocale); |
michael@0 | 700 | } |
michael@0 | 701 | if(coll->actualLocale != NULL) { |
michael@0 | 702 | uprv_free(coll->actualLocale); |
michael@0 | 703 | } |
michael@0 | 704 | if(coll->requestedLocale != NULL) { |
michael@0 | 705 | uprv_free(coll->requestedLocale); |
michael@0 | 706 | } |
michael@0 | 707 | if(coll->latinOneCEs != NULL) { |
michael@0 | 708 | uprv_free(coll->latinOneCEs); |
michael@0 | 709 | } |
michael@0 | 710 | if(coll->options != NULL && coll->freeOptionsOnClose) { |
michael@0 | 711 | uprv_free(coll->options); |
michael@0 | 712 | } |
michael@0 | 713 | if(coll->rules != NULL && coll->freeRulesOnClose) { |
michael@0 | 714 | uprv_free((UChar *)coll->rules); |
michael@0 | 715 | } |
michael@0 | 716 | if(coll->image != NULL && coll->freeImageOnClose) { |
michael@0 | 717 | uprv_free((UCATableHeader *)coll->image); |
michael@0 | 718 | } |
michael@0 | 719 | |
michael@0 | 720 | if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { |
michael@0 | 721 | uprv_free(coll->leadBytePermutationTable); |
michael@0 | 722 | } |
michael@0 | 723 | if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) { |
michael@0 | 724 | uprv_free(coll->defaultReorderCodes); |
michael@0 | 725 | } |
michael@0 | 726 | if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { |
michael@0 | 727 | uprv_free(coll->reorderCodes); |
michael@0 | 728 | } |
michael@0 | 729 | |
michael@0 | 730 | if(coll->delegate != NULL) { |
michael@0 | 731 | delete (Collator*)coll->delegate; |
michael@0 | 732 | } |
michael@0 | 733 | |
michael@0 | 734 | /* Here, it would be advisable to close: */ |
michael@0 | 735 | /* - UData for UCA (unless we stuff it in the root resb */ |
michael@0 | 736 | /* Again, do we need additional housekeeping... HMMM! */ |
michael@0 | 737 | UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); |
michael@0 | 738 | if(coll->freeOnClose){ |
michael@0 | 739 | /* for safeClone, if freeOnClose is FALSE, |
michael@0 | 740 | don't free the other instance data */ |
michael@0 | 741 | uprv_free(coll); |
michael@0 | 742 | } |
michael@0 | 743 | } |
michael@0 | 744 | UTRACE_EXIT(); |
michael@0 | 745 | } |
michael@0 | 746 | |
michael@0 | 747 | void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { |
michael@0 | 748 | if(U_FAILURE(*status)) { |
michael@0 | 749 | return; |
michael@0 | 750 | } |
michael@0 | 751 | result->caseFirst = (UColAttributeValue)opts->caseFirst; |
michael@0 | 752 | result->caseLevel = (UColAttributeValue)opts->caseLevel; |
michael@0 | 753 | result->frenchCollation = (UColAttributeValue)opts->frenchCollation; |
michael@0 | 754 | result->normalizationMode = (UColAttributeValue)opts->normalizationMode; |
michael@0 | 755 | if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { |
michael@0 | 756 | return; |
michael@0 | 757 | } |
michael@0 | 758 | result->strength = (UColAttributeValue)opts->strength; |
michael@0 | 759 | result->variableTopValue = opts->variableTopValue; |
michael@0 | 760 | result->alternateHandling = (UColAttributeValue)opts->alternateHandling; |
michael@0 | 761 | result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; |
michael@0 | 762 | result->numericCollation = (UColAttributeValue)opts->numericCollation; |
michael@0 | 763 | result->caseFirstisDefault = TRUE; |
michael@0 | 764 | result->caseLevelisDefault = TRUE; |
michael@0 | 765 | result->frenchCollationisDefault = TRUE; |
michael@0 | 766 | result->normalizationModeisDefault = TRUE; |
michael@0 | 767 | result->strengthisDefault = TRUE; |
michael@0 | 768 | result->variableTopValueisDefault = TRUE; |
michael@0 | 769 | result->alternateHandlingisDefault = TRUE; |
michael@0 | 770 | result->hiraganaQisDefault = TRUE; |
michael@0 | 771 | result->numericCollationisDefault = TRUE; |
michael@0 | 772 | |
michael@0 | 773 | ucol_updateInternalState(result, status); |
michael@0 | 774 | |
michael@0 | 775 | result->options = opts; |
michael@0 | 776 | } |
michael@0 | 777 | |
michael@0 | 778 | |
michael@0 | 779 | /** |
michael@0 | 780 | * Approximate determination if a character is at a contraction end. |
michael@0 | 781 | * Guaranteed to be TRUE if a character is at the end of a contraction, |
michael@0 | 782 | * otherwise it is not deterministic. |
michael@0 | 783 | * @param c character to be determined |
michael@0 | 784 | * @param coll collator |
michael@0 | 785 | */ |
michael@0 | 786 | static |
michael@0 | 787 | inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { |
michael@0 | 788 | if (c < coll->minContrEndCP) { |
michael@0 | 789 | return FALSE; |
michael@0 | 790 | } |
michael@0 | 791 | |
michael@0 | 792 | int32_t hash = c; |
michael@0 | 793 | uint8_t htbyte; |
michael@0 | 794 | if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { |
michael@0 | 795 | if (U16_IS_TRAIL(c)) { |
michael@0 | 796 | return TRUE; |
michael@0 | 797 | } |
michael@0 | 798 | hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; |
michael@0 | 799 | } |
michael@0 | 800 | htbyte = coll->contrEndCP[hash>>3]; |
michael@0 | 801 | return (((htbyte >> (hash & 7)) & 1) == 1); |
michael@0 | 802 | } |
michael@0 | 803 | |
michael@0 | 804 | |
michael@0 | 805 | |
michael@0 | 806 | /* |
michael@0 | 807 | * i_getCombiningClass() |
michael@0 | 808 | * A fast, at least partly inline version of u_getCombiningClass() |
michael@0 | 809 | * This is a candidate for further optimization. Used heavily |
michael@0 | 810 | * in contraction processing. |
michael@0 | 811 | */ |
michael@0 | 812 | static |
michael@0 | 813 | inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { |
michael@0 | 814 | uint8_t sCC = 0; |
michael@0 | 815 | if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { |
michael@0 | 816 | sCC = u_getCombiningClass(c); |
michael@0 | 817 | } |
michael@0 | 818 | return sCC; |
michael@0 | 819 | } |
michael@0 | 820 | |
michael@0 | 821 | UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { |
michael@0 | 822 | UChar c; |
michael@0 | 823 | UCollator *result = fillIn; |
michael@0 | 824 | if(U_FAILURE(*status) || image == NULL) { |
michael@0 | 825 | return NULL; |
michael@0 | 826 | } |
michael@0 | 827 | |
michael@0 | 828 | if(result == NULL) { |
michael@0 | 829 | result = (UCollator *)uprv_malloc(sizeof(UCollator)); |
michael@0 | 830 | if(result == NULL) { |
michael@0 | 831 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 832 | return result; |
michael@0 | 833 | } |
michael@0 | 834 | result->freeOnClose = TRUE; |
michael@0 | 835 | } else { |
michael@0 | 836 | result->freeOnClose = FALSE; |
michael@0 | 837 | } |
michael@0 | 838 | |
michael@0 | 839 | result->delegate = NULL; |
michael@0 | 840 | |
michael@0 | 841 | result->image = image; |
michael@0 | 842 | result->mapping.getFoldingOffset = _getFoldingOffset; |
michael@0 | 843 | const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; |
michael@0 | 844 | utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); |
michael@0 | 845 | if(U_FAILURE(*status)) { |
michael@0 | 846 | if(result->freeOnClose == TRUE) { |
michael@0 | 847 | uprv_free(result); |
michael@0 | 848 | result = NULL; |
michael@0 | 849 | } |
michael@0 | 850 | return result; |
michael@0 | 851 | } |
michael@0 | 852 | |
michael@0 | 853 | result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); |
michael@0 | 854 | result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); |
michael@0 | 855 | result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); |
michael@0 | 856 | result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); |
michael@0 | 857 | result->rules = NULL; |
michael@0 | 858 | result->rulesLength = 0; |
michael@0 | 859 | result->freeRulesOnClose = FALSE; |
michael@0 | 860 | result->defaultReorderCodes = NULL; |
michael@0 | 861 | result->defaultReorderCodesLength = 0; |
michael@0 | 862 | result->freeDefaultReorderCodesOnClose = FALSE; |
michael@0 | 863 | result->reorderCodes = NULL; |
michael@0 | 864 | result->reorderCodesLength = 0; |
michael@0 | 865 | result->freeReorderCodesOnClose = FALSE; |
michael@0 | 866 | result->leadBytePermutationTable = NULL; |
michael@0 | 867 | result->freeLeadBytePermutationTableOnClose = FALSE; |
michael@0 | 868 | |
michael@0 | 869 | /* get the version info from UCATableHeader and populate the Collator struct*/ |
michael@0 | 870 | result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ |
michael@0 | 871 | result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ |
michael@0 | 872 | result->dataVersion[2] = 0; |
michael@0 | 873 | result->dataVersion[3] = 0; |
michael@0 | 874 | |
michael@0 | 875 | result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; |
michael@0 | 876 | result->minUnsafeCP = 0; |
michael@0 | 877 | for (c=0; c<0x300; c++) { // Find the smallest unsafe char. |
michael@0 | 878 | if (ucol_unsafeCP(c, result)) break; |
michael@0 | 879 | } |
michael@0 | 880 | result->minUnsafeCP = c; |
michael@0 | 881 | |
michael@0 | 882 | result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; |
michael@0 | 883 | result->minContrEndCP = 0; |
michael@0 | 884 | for (c=0; c<0x300; c++) { // Find the Contraction-ending char. |
michael@0 | 885 | if (ucol_contractionEndCP(c, result)) break; |
michael@0 | 886 | } |
michael@0 | 887 | result->minContrEndCP = c; |
michael@0 | 888 | |
michael@0 | 889 | /* max expansion tables */ |
michael@0 | 890 | result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + |
michael@0 | 891 | result->image->endExpansionCE); |
michael@0 | 892 | result->lastEndExpansionCE = result->endExpansionCE + |
michael@0 | 893 | result->image->endExpansionCECount - 1; |
michael@0 | 894 | result->expansionCESize = (uint8_t*)result->image + |
michael@0 | 895 | result->image->expansionCESize; |
michael@0 | 896 | |
michael@0 | 897 | |
michael@0 | 898 | //result->errorCode = *status; |
michael@0 | 899 | |
michael@0 | 900 | result->latinOneCEs = NULL; |
michael@0 | 901 | |
michael@0 | 902 | result->latinOneRegenTable = FALSE; |
michael@0 | 903 | result->latinOneFailed = FALSE; |
michael@0 | 904 | result->UCA = UCA; |
michael@0 | 905 | |
michael@0 | 906 | /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ |
michael@0 | 907 | result->ucaRules = NULL; |
michael@0 | 908 | result->actualLocale = NULL; |
michael@0 | 909 | result->validLocale = NULL; |
michael@0 | 910 | result->requestedLocale = NULL; |
michael@0 | 911 | result->hasRealData = FALSE; // real data lives in .dat file... |
michael@0 | 912 | result->freeImageOnClose = FALSE; |
michael@0 | 913 | |
michael@0 | 914 | /* set attributes */ |
michael@0 | 915 | ucol_setOptionsFromHeader( |
michael@0 | 916 | result, |
michael@0 | 917 | (UColOptionSet*)((uint8_t*)result->image+result->image->options), |
michael@0 | 918 | status); |
michael@0 | 919 | result->freeOptionsOnClose = FALSE; |
michael@0 | 920 | |
michael@0 | 921 | return result; |
michael@0 | 922 | } |
michael@0 | 923 | |
michael@0 | 924 | /* new Mark's code */ |
michael@0 | 925 | |
michael@0 | 926 | /** |
michael@0 | 927 | * For generation of Implicit CEs |
michael@0 | 928 | * @author Davis |
michael@0 | 929 | * |
michael@0 | 930 | * Cleaned up so that changes can be made more easily. |
michael@0 | 931 | * Old values: |
michael@0 | 932 | # First Implicit: E26A792D |
michael@0 | 933 | # Last Implicit: E3DC70C0 |
michael@0 | 934 | # First CJK: E0030300 |
michael@0 | 935 | # Last CJK: E0A9DD00 |
michael@0 | 936 | # First CJK_A: E0A9DF00 |
michael@0 | 937 | # Last CJK_A: E0DE3100 |
michael@0 | 938 | */ |
michael@0 | 939 | /* Following is a port of Mark's code for new treatment of implicits. |
michael@0 | 940 | * It is positioned here, since ucol_initUCA need to initialize the |
michael@0 | 941 | * variables below according to the data in the fractional UCA. |
michael@0 | 942 | */ |
michael@0 | 943 | |
michael@0 | 944 | /** |
michael@0 | 945 | * Function used to: |
michael@0 | 946 | * a) collapse the 2 different Han ranges from UCA into one (in the right order), and |
michael@0 | 947 | * b) bump any non-CJK characters by 10FFFF. |
michael@0 | 948 | * The relevant blocks are: |
michael@0 | 949 | * A: 4E00..9FFF; CJK Unified Ideographs |
michael@0 | 950 | * F900..FAFF; CJK Compatibility Ideographs |
michael@0 | 951 | * B: 3400..4DBF; CJK Unified Ideographs Extension A |
michael@0 | 952 | * 20000..XX; CJK Unified Ideographs Extension B (and others later on) |
michael@0 | 953 | * As long as |
michael@0 | 954 | * no new B characters are allocated between 4E00 and FAFF, and |
michael@0 | 955 | * no new A characters are outside of this range, |
michael@0 | 956 | * (very high probability) this simple code will work. |
michael@0 | 957 | * The reordered blocks are: |
michael@0 | 958 | * Block1 is CJK |
michael@0 | 959 | * Block2 is CJK_COMPAT_USED |
michael@0 | 960 | * Block3 is CJK_A |
michael@0 | 961 | * (all contiguous) |
michael@0 | 962 | * Any other CJK gets its normal code point |
michael@0 | 963 | * Any non-CJK gets +10FFFF |
michael@0 | 964 | * When we reorder Block1, we make sure that it is at the very start, |
michael@0 | 965 | * so that it will use a 3-byte form. |
michael@0 | 966 | * Warning: we only pick up the compatibility characters that are |
michael@0 | 967 | * NOT decomposed, so that block is smaller! |
michael@0 | 968 | */ |
michael@0 | 969 | |
michael@0 | 970 | // CONSTANTS |
michael@0 | 971 | static const UChar32 |
michael@0 | 972 | NON_CJK_OFFSET = 0x110000, |
michael@0 | 973 | UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
michael@0 | 974 | |
michael@0 | 975 | /** |
michael@0 | 976 | * Precomputed by initImplicitConstants() |
michael@0 | 977 | */ |
michael@0 | 978 | static int32_t |
michael@0 | 979 | final3Multiplier = 0, |
michael@0 | 980 | final4Multiplier = 0, |
michael@0 | 981 | final3Count = 0, |
michael@0 | 982 | final4Count = 0, |
michael@0 | 983 | medialCount = 0, |
michael@0 | 984 | min3Primary = 0, |
michael@0 | 985 | min4Primary = 0, |
michael@0 | 986 | max4Primary = 0, |
michael@0 | 987 | minTrail = 0, |
michael@0 | 988 | maxTrail = 0, |
michael@0 | 989 | max3Trail = 0, |
michael@0 | 990 | max4Trail = 0, |
michael@0 | 991 | min4Boundary = 0; |
michael@0 | 992 | |
michael@0 | 993 | static const UChar32 |
michael@0 | 994 | // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 995 | // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) |
michael@0 | 996 | CJK_BASE = 0x4E00, |
michael@0 | 997 | CJK_LIMIT = 0x9FCC+1, |
michael@0 | 998 | // Unified CJK ideographs in the compatibility ideographs block. |
michael@0 | 999 | CJK_COMPAT_USED_BASE = 0xFA0E, |
michael@0 | 1000 | CJK_COMPAT_USED_LIMIT = 0xFA2F+1, |
michael@0 | 1001 | // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1002 | // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1003 | CJK_A_BASE = 0x3400, |
michael@0 | 1004 | CJK_A_LIMIT = 0x4DB5+1, |
michael@0 | 1005 | // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1006 | // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1007 | CJK_B_BASE = 0x20000, |
michael@0 | 1008 | CJK_B_LIMIT = 0x2A6D6+1, |
michael@0 | 1009 | // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1010 | // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1011 | CJK_C_BASE = 0x2A700, |
michael@0 | 1012 | CJK_C_LIMIT = 0x2B734+1, |
michael@0 | 1013 | // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1014 | // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1015 | CJK_D_BASE = 0x2B740, |
michael@0 | 1016 | CJK_D_LIMIT = 0x2B81D+1; |
michael@0 | 1017 | // when adding to this list, look for all occurrences (in project) |
michael@0 | 1018 | // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! |
michael@0 | 1019 | |
michael@0 | 1020 | static UChar32 swapCJK(UChar32 i) { |
michael@0 | 1021 | if (i < CJK_A_BASE) { |
michael@0 | 1022 | // non-CJK |
michael@0 | 1023 | } else if (i < CJK_A_LIMIT) { |
michael@0 | 1024 | // Extension A has lower code points than the original Unihan+compat |
michael@0 | 1025 | // but sorts higher. |
michael@0 | 1026 | return i - CJK_A_BASE |
michael@0 | 1027 | + (CJK_LIMIT - CJK_BASE) |
michael@0 | 1028 | + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
michael@0 | 1029 | } else if (i < CJK_BASE) { |
michael@0 | 1030 | // non-CJK |
michael@0 | 1031 | } else if (i < CJK_LIMIT) { |
michael@0 | 1032 | return i - CJK_BASE; |
michael@0 | 1033 | } else if (i < CJK_COMPAT_USED_BASE) { |
michael@0 | 1034 | // non-CJK |
michael@0 | 1035 | } else if (i < CJK_COMPAT_USED_LIMIT) { |
michael@0 | 1036 | return i - CJK_COMPAT_USED_BASE |
michael@0 | 1037 | + (CJK_LIMIT - CJK_BASE); |
michael@0 | 1038 | } else if (i < CJK_B_BASE) { |
michael@0 | 1039 | // non-CJK |
michael@0 | 1040 | } else if (i < CJK_B_LIMIT) { |
michael@0 | 1041 | return i; // non-BMP-CJK |
michael@0 | 1042 | } else if (i < CJK_C_BASE) { |
michael@0 | 1043 | // non-CJK |
michael@0 | 1044 | } else if (i < CJK_C_LIMIT) { |
michael@0 | 1045 | return i; // non-BMP-CJK |
michael@0 | 1046 | } else if (i < CJK_D_BASE) { |
michael@0 | 1047 | // non-CJK |
michael@0 | 1048 | } else if (i < CJK_D_LIMIT) { |
michael@0 | 1049 | return i; // non-BMP-CJK |
michael@0 | 1050 | } |
michael@0 | 1051 | return i + NON_CJK_OFFSET; // non-CJK |
michael@0 | 1052 | } |
michael@0 | 1053 | |
michael@0 | 1054 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 1055 | uprv_uca_getRawFromCodePoint(UChar32 i) { |
michael@0 | 1056 | return swapCJK(i)+1; |
michael@0 | 1057 | } |
michael@0 | 1058 | |
michael@0 | 1059 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 1060 | uprv_uca_getCodePointFromRaw(UChar32 i) { |
michael@0 | 1061 | i--; |
michael@0 | 1062 | UChar32 result = 0; |
michael@0 | 1063 | if(i >= NON_CJK_OFFSET) { |
michael@0 | 1064 | result = i - NON_CJK_OFFSET; |
michael@0 | 1065 | } else if(i >= CJK_B_BASE) { |
michael@0 | 1066 | result = i; |
michael@0 | 1067 | } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted |
michael@0 | 1068 | if(i < CJK_LIMIT - CJK_BASE) { |
michael@0 | 1069 | result = i + CJK_BASE; |
michael@0 | 1070 | } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { |
michael@0 | 1071 | result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); |
michael@0 | 1072 | } else { |
michael@0 | 1073 | result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
michael@0 | 1074 | } |
michael@0 | 1075 | } else { |
michael@0 | 1076 | result = -1; |
michael@0 | 1077 | } |
michael@0 | 1078 | return result; |
michael@0 | 1079 | } |
michael@0 | 1080 | |
michael@0 | 1081 | // GET IMPLICIT PRIMARY WEIGHTS |
michael@0 | 1082 | // Return value is left justified primary key |
michael@0 | 1083 | U_CAPI uint32_t U_EXPORT2 |
michael@0 | 1084 | uprv_uca_getImplicitFromRaw(UChar32 cp) { |
michael@0 | 1085 | /* |
michael@0 | 1086 | if (cp < 0 || cp > UCOL_MAX_INPUT) { |
michael@0 | 1087 | throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); |
michael@0 | 1088 | } |
michael@0 | 1089 | */ |
michael@0 | 1090 | int32_t last0 = cp - min4Boundary; |
michael@0 | 1091 | if (last0 < 0) { |
michael@0 | 1092 | int32_t last1 = cp / final3Count; |
michael@0 | 1093 | last0 = cp % final3Count; |
michael@0 | 1094 | |
michael@0 | 1095 | int32_t last2 = last1 / medialCount; |
michael@0 | 1096 | last1 %= medialCount; |
michael@0 | 1097 | |
michael@0 | 1098 | last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start |
michael@0 | 1099 | last1 = minTrail + last1; // offset |
michael@0 | 1100 | last2 = min3Primary + last2; // offset |
michael@0 | 1101 | /* |
michael@0 | 1102 | if (last2 >= min4Primary) { |
michael@0 | 1103 | throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); |
michael@0 | 1104 | } |
michael@0 | 1105 | */ |
michael@0 | 1106 | return (last2 << 24) + (last1 << 16) + (last0 << 8); |
michael@0 | 1107 | } else { |
michael@0 | 1108 | int32_t last1 = last0 / final4Count; |
michael@0 | 1109 | last0 %= final4Count; |
michael@0 | 1110 | |
michael@0 | 1111 | int32_t last2 = last1 / medialCount; |
michael@0 | 1112 | last1 %= medialCount; |
michael@0 | 1113 | |
michael@0 | 1114 | int32_t last3 = last2 / medialCount; |
michael@0 | 1115 | last2 %= medialCount; |
michael@0 | 1116 | |
michael@0 | 1117 | last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start |
michael@0 | 1118 | last1 = minTrail + last1; // offset |
michael@0 | 1119 | last2 = minTrail + last2; // offset |
michael@0 | 1120 | last3 = min4Primary + last3; // offset |
michael@0 | 1121 | /* |
michael@0 | 1122 | if (last3 > max4Primary) { |
michael@0 | 1123 | throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); |
michael@0 | 1124 | } |
michael@0 | 1125 | */ |
michael@0 | 1126 | return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
michael@0 | 1127 | } |
michael@0 | 1128 | } |
michael@0 | 1129 | |
michael@0 | 1130 | static uint32_t U_EXPORT2 |
michael@0 | 1131 | uprv_uca_getImplicitPrimary(UChar32 cp) { |
michael@0 | 1132 | //fprintf(stdout, "Incoming: %04x\n", cp); |
michael@0 | 1133 | //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); |
michael@0 | 1134 | |
michael@0 | 1135 | cp = swapCJK(cp); |
michael@0 | 1136 | cp++; |
michael@0 | 1137 | // we now have a range of numbers from 0 to 21FFFF. |
michael@0 | 1138 | |
michael@0 | 1139 | //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); |
michael@0 | 1140 | //fprintf(stdout, "CJK swapped: %04x\n", cp); |
michael@0 | 1141 | |
michael@0 | 1142 | return uprv_uca_getImplicitFromRaw(cp); |
michael@0 | 1143 | } |
michael@0 | 1144 | |
michael@0 | 1145 | /** |
michael@0 | 1146 | * Converts implicit CE into raw integer ("code point") |
michael@0 | 1147 | * @param implicit |
michael@0 | 1148 | * @return -1 if illegal format |
michael@0 | 1149 | */ |
michael@0 | 1150 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 1151 | uprv_uca_getRawFromImplicit(uint32_t implicit) { |
michael@0 | 1152 | UChar32 result; |
michael@0 | 1153 | UChar32 b3 = implicit & 0xFF; |
michael@0 | 1154 | UChar32 b2 = (implicit >> 8) & 0xFF; |
michael@0 | 1155 | UChar32 b1 = (implicit >> 16) & 0xFF; |
michael@0 | 1156 | UChar32 b0 = (implicit >> 24) & 0xFF; |
michael@0 | 1157 | |
michael@0 | 1158 | // simple parameter checks |
michael@0 | 1159 | if (b0 < min3Primary || b0 > max4Primary |
michael@0 | 1160 | || b1 < minTrail || b1 > maxTrail) |
michael@0 | 1161 | return -1; |
michael@0 | 1162 | // normal offsets |
michael@0 | 1163 | b1 -= minTrail; |
michael@0 | 1164 | |
michael@0 | 1165 | // take care of the final values, and compose |
michael@0 | 1166 | if (b0 < min4Primary) { |
michael@0 | 1167 | if (b2 < minTrail || b2 > max3Trail || b3 != 0) |
michael@0 | 1168 | return -1; |
michael@0 | 1169 | b2 -= minTrail; |
michael@0 | 1170 | UChar32 remainder = b2 % final3Multiplier; |
michael@0 | 1171 | if (remainder != 0) |
michael@0 | 1172 | return -1; |
michael@0 | 1173 | b0 -= min3Primary; |
michael@0 | 1174 | b2 /= final3Multiplier; |
michael@0 | 1175 | result = ((b0 * medialCount) + b1) * final3Count + b2; |
michael@0 | 1176 | } else { |
michael@0 | 1177 | if (b2 < minTrail || b2 > maxTrail |
michael@0 | 1178 | || b3 < minTrail || b3 > max4Trail) |
michael@0 | 1179 | return -1; |
michael@0 | 1180 | b2 -= minTrail; |
michael@0 | 1181 | b3 -= minTrail; |
michael@0 | 1182 | UChar32 remainder = b3 % final4Multiplier; |
michael@0 | 1183 | if (remainder != 0) |
michael@0 | 1184 | return -1; |
michael@0 | 1185 | b3 /= final4Multiplier; |
michael@0 | 1186 | b0 -= min4Primary; |
michael@0 | 1187 | result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; |
michael@0 | 1188 | } |
michael@0 | 1189 | // final check |
michael@0 | 1190 | if (result < 0 || result > UCOL_MAX_INPUT) |
michael@0 | 1191 | return -1; |
michael@0 | 1192 | return result; |
michael@0 | 1193 | } |
michael@0 | 1194 | |
michael@0 | 1195 | |
/**
 * Integer division that rounds up, computed as 1 + (a-1)/b.
 * For a >= 1 and b >= 1 this is ceil(a / b). Because integer division
 * truncates toward zero, a == 0 yields 1 rather than 0.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientOfPredecessor = (a - 1) / b;
    return quotientOfPredecessor + 1;
}
michael@0 | 1199 | |
michael@0 | 1200 | /* this function is either called from initUCA or from genUCA before |
michael@0 | 1201 | * doing canonical closure for the UCA. |
michael@0 | 1202 | */ |
michael@0 | 1203 | |
michael@0 | 1204 | /** |
michael@0 | 1205 | * Set up to generate implicits. |
michael@0 | 1206 | * Maintenance Note: this function may end up being called more than once, due |
michael@0 | 1207 | * to threading races during initialization. Make sure that |
michael@0 | 1208 | * none of the Constants is ever transiently assigned an |
michael@0 | 1209 | * incorrect value. |
michael@0 | 1210 | * @param minPrimary |
michael@0 | 1211 | * @param maxPrimary |
michael@0 | 1212 | * @param minTrail final byte |
michael@0 | 1213 | * @param maxTrail final byte |
michael@0 | 1214 | * @param gap3 the gap we leave for tailoring for 3-byte forms |
michael@0 | 1215 | * @param gap4 the gap we leave for tailoring for 4-byte forms |
michael@0 | 1216 | */ |
michael@0 | 1217 | static void initImplicitConstants(int minPrimary, int maxPrimary, |
michael@0 | 1218 | int minTrailIn, int maxTrailIn, |
michael@0 | 1219 | int gap3, int primaries3count, |
michael@0 | 1220 | UErrorCode *status) { |
michael@0 | 1221 | // some simple parameter checks |
michael@0 | 1222 | if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) |
michael@0 | 1223 | || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) |
michael@0 | 1224 | || (primaries3count < 1)) |
michael@0 | 1225 | { |
michael@0 | 1226 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1227 | return; |
michael@0 | 1228 | }; |
michael@0 | 1229 | |
michael@0 | 1230 | minTrail = minTrailIn; |
michael@0 | 1231 | maxTrail = maxTrailIn; |
michael@0 | 1232 | |
michael@0 | 1233 | min3Primary = minPrimary; |
michael@0 | 1234 | max4Primary = maxPrimary; |
michael@0 | 1235 | // compute constants for use later. |
michael@0 | 1236 | // number of values we can use in trailing bytes |
michael@0 | 1237 | // leave room for empty values between AND above, e.g. if gap = 2 |
michael@0 | 1238 | // range 3..7 => +3 -4 -5 -6 -7: so 1 value |
michael@0 | 1239 | // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values |
michael@0 | 1240 | // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values |
michael@0 | 1241 | final3Multiplier = gap3 + 1; |
michael@0 | 1242 | final3Count = (maxTrail - minTrail + 1) / final3Multiplier; |
michael@0 | 1243 | max3Trail = minTrail + (final3Count - 1) * final3Multiplier; |
michael@0 | 1244 | |
michael@0 | 1245 | // medials can use full range |
michael@0 | 1246 | medialCount = (maxTrail - minTrail + 1); |
michael@0 | 1247 | // find out how many values fit in each form |
michael@0 | 1248 | int32_t threeByteCount = medialCount * final3Count; |
michael@0 | 1249 | // now determine where the 3/4 boundary is. |
michael@0 | 1250 | // we use 3 bytes below the boundary, and 4 above |
michael@0 | 1251 | int32_t primariesAvailable = maxPrimary - minPrimary + 1; |
michael@0 | 1252 | int32_t primaries4count = primariesAvailable - primaries3count; |
michael@0 | 1253 | |
michael@0 | 1254 | |
michael@0 | 1255 | int32_t min3ByteCoverage = primaries3count * threeByteCount; |
michael@0 | 1256 | min4Primary = minPrimary + primaries3count; |
michael@0 | 1257 | min4Boundary = min3ByteCoverage; |
michael@0 | 1258 | // Now expand out the multiplier for the 4 bytes, and redo. |
michael@0 | 1259 | |
michael@0 | 1260 | int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; |
michael@0 | 1261 | int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); |
michael@0 | 1262 | int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); |
michael@0 | 1263 | int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; |
michael@0 | 1264 | if (gap4 < 1) { |
michael@0 | 1265 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1266 | return; |
michael@0 | 1267 | } |
michael@0 | 1268 | final4Multiplier = gap4 + 1; |
michael@0 | 1269 | final4Count = neededPerFinalByte; |
michael@0 | 1270 | max4Trail = minTrail + (final4Count - 1) * final4Multiplier; |
michael@0 | 1271 | } |
michael@0 | 1272 | |
michael@0 | 1273 | /** |
michael@0 | 1274 | * Supply parameters for generating implicit CEs |
michael@0 | 1275 | */ |
michael@0 | 1276 | U_CAPI void U_EXPORT2 |
michael@0 | 1277 | uprv_uca_initImplicitConstants(UErrorCode *status) { |
michael@0 | 1278 | // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. |
michael@0 | 1279 | //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); |
michael@0 | 1280 | initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); |
michael@0 | 1281 | } |
michael@0 | 1282 | |
michael@0 | 1283 | |
michael@0 | 1284 | /* collIterNormalize Incremental Normalization happens here. */ |
michael@0 | 1285 | /* pick up the range of chars identified by FCD, */ |
michael@0 | 1286 | /* normalize it into the collIterate's writable buffer, */ |
michael@0 | 1287 | /* switch the collIterate's state to use the writable buffer. */ |
michael@0 | 1288 | /* */ |
michael@0 | 1289 | static |
michael@0 | 1290 | void collIterNormalize(collIterate *collationSource) |
michael@0 | 1291 | { |
michael@0 | 1292 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1293 | const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ |
michael@0 | 1294 | const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ |
michael@0 | 1295 | |
michael@0 | 1296 | collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), |
michael@0 | 1297 | collationSource->writableBuffer, |
michael@0 | 1298 | status); |
michael@0 | 1299 | if (U_FAILURE(status)) { |
michael@0 | 1300 | #ifdef UCOL_DEBUG |
michael@0 | 1301 | fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); |
michael@0 | 1302 | #endif |
michael@0 | 1303 | return; /* pos and flags are left unchanged on failure; the error is not reported to the caller */ |
michael@0 | 1304 | } |
michael@0 | 1305 | |
michael@0 | 1306 | collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); |
michael@0 | 1307 | collationSource->origFlags = collationSource->flags; /* saved so the flags can be restored when the side buffer is exhausted */ |
michael@0 | 1308 | collationSource->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 1309 | collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
michael@0 | 1310 | } |
michael@0 | 1311 | |
michael@0 | 1312 | |
michael@0 | 1313 | // This function takes the iterator and extracts normalized stuff up to the next boundary |
michael@0 | 1314 | // It is similar in the end results to the collIterNormalize, but for the cases when we |
michael@0 | 1315 | // use an iterator |
michael@0 | 1316 | /*static |
michael@0 | 1317 | inline void normalizeIterator(collIterate *collationSource) { |
michael@0 | 1318 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1319 | UBool wasNormalized = FALSE; |
michael@0 | 1320 | //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); |
michael@0 | 1321 | uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); |
michael@0 | 1322 | int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
michael@0 | 1323 | (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
michael@0 | 1324 | if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { |
michael@0 | 1325 | // reallocate and terminate |
michael@0 | 1326 | if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, |
michael@0 | 1327 | &collationSource->writableBuffer, |
michael@0 | 1328 | (int32_t *)&collationSource->writableBufSize, normLen + 1, |
michael@0 | 1329 | 0) |
michael@0 | 1330 | ) { |
michael@0 | 1331 | #ifdef UCOL_DEBUG |
michael@0 | 1332 | fprintf(stderr, "normalizeIterator(), out of memory\n"); |
michael@0 | 1333 | #endif |
michael@0 | 1334 | return; |
michael@0 | 1335 | } |
michael@0 | 1336 | status = U_ZERO_ERROR; |
michael@0 | 1337 | //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); |
michael@0 | 1338 | collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); |
michael@0 | 1339 | normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
michael@0 | 1340 | (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
michael@0 | 1341 | } |
michael@0 | 1342 | // Terminate the buffer - we already checked that it is big enough |
michael@0 | 1343 | collationSource->writableBuffer[normLen] = 0; |
michael@0 | 1344 | if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
michael@0 | 1345 | collationSource->flags |= UCOL_ITER_ALLOCATED; |
michael@0 | 1346 | } |
michael@0 | 1347 | collationSource->pos = collationSource->writableBuffer; |
michael@0 | 1348 | collationSource->origFlags = collationSource->flags; |
michael@0 | 1349 | collationSource->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 1350 | collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
michael@0 | 1351 | }*/ |
michael@0 | 1352 | |
michael@0 | 1353 | |
michael@0 | 1354 | /* Incremental FCD check; returns TRUE when a leading cc lower than the preceding trailing cc was found, i.e. the scanned sequence needs normalizing. */ |
michael@0 | 1355 | /* Called from getNextCE when normalization state is suspect. */ |
michael@0 | 1356 | /* When entering, the state is known to be this: */ |
michael@0 | 1357 | /* o We are working in the main buffer of the collIterate, not the side */ |
michael@0 | 1358 | /* writable buffer. When in the side buffer, normalization mode is always off, */ |
michael@0 | 1359 | /* so we won't get here. */ |
michael@0 | 1360 | /* o The leading combining class from the current character is 0 or */ |
michael@0 | 1361 | /* the trailing combining class of the previous char was zero. */ |
michael@0 | 1362 | /* True because the previous call to this function will have always exited */ |
michael@0 | 1363 | /* that way, and we get called for every char where cc might be non-zero. */ |
michael@0 | 1364 | static |
michael@0 | 1365 | inline UBool collIterFCD(collIterate *collationSource) { |
michael@0 | 1366 | const UChar *srcP, *endP; |
michael@0 | 1367 | uint8_t leadingCC; |
michael@0 | 1368 | uint8_t prevTrailingCC = 0; |
michael@0 | 1369 | uint16_t fcd; |
michael@0 | 1370 | UBool needNormalize = FALSE; |
michael@0 | 1371 | |
michael@0 | 1372 | srcP = collationSource->pos-1; |
michael@0 | 1373 | |
michael@0 | 1374 | if (collationSource->flags & UCOL_ITER_HASLEN) { |
michael@0 | 1375 | endP = collationSource->endp; |
michael@0 | 1376 | } else { |
michael@0 | 1377 | endP = NULL; |
michael@0 | 1378 | } |
michael@0 | 1379 | |
michael@0 | 1380 | // Get the trailing combining class of the current character. If it's zero, we are OK. |
michael@0 | 1381 | fcd = g_nfcImpl->nextFCD16(srcP, endP); |
michael@0 | 1382 | if (fcd != 0) { |
michael@0 | 1383 | prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
michael@0 | 1384 | |
michael@0 | 1385 | if (prevTrailingCC != 0) { |
michael@0 | 1386 | // The current char has a non-zero trailing CC. Scan forward until we find |
michael@0 | 1387 | // a char with a leading cc of zero. |
michael@0 | 1388 | while (endP == NULL || srcP != endP) |
michael@0 | 1389 | { |
michael@0 | 1390 | const UChar *savedSrcP = srcP; |
michael@0 | 1391 | |
michael@0 | 1392 | fcd = g_nfcImpl->nextFCD16(srcP, endP); |
michael@0 | 1393 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
michael@0 | 1394 | if (leadingCC == 0) { |
michael@0 | 1395 | srcP = savedSrcP; // Hit char that is not part of combining sequence. |
michael@0 | 1396 | // back up over it. (Could be surrogate pair!) |
michael@0 | 1397 | break; |
michael@0 | 1398 | } |
michael@0 | 1399 | |
michael@0 | 1400 | if (leadingCC < prevTrailingCC) { |
michael@0 | 1401 | needNormalize = TRUE; |
michael@0 | 1402 | } |
michael@0 | 1403 | |
michael@0 | 1404 | prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
michael@0 | 1405 | } |
michael@0 | 1406 | } |
michael@0 | 1407 | } |
michael@0 | 1408 | |
michael@0 | 1409 | collationSource->fcdPosition = (UChar *)srcP; /* remember where the forward scan stopped */ |
michael@0 | 1410 | |
michael@0 | 1411 | return needNormalize; |
michael@0 | 1412 | } |
michael@0 | 1413 | |
michael@0 | 1414 | /****************************************************************************/ |
michael@0 | 1415 | /* Following are the CE retrieval functions */ |
michael@0 | 1416 | /* */ |
michael@0 | 1417 | /****************************************************************************/ |
michael@0 | 1418 | |
michael@0 | 1419 | static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); |
michael@0 | 1420 | static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); |
michael@0 | 1421 | |
michael@0 | 1422 | /* there should be a macro version of this function in the header file */ |
michael@0 | 1423 | /* This is the first function that tries to fetch a collation element */ |
michael@0 | 1424 | /* If it's not successful or it encounters a more difficult situation */ |
michael@0 | 1425 | /* some more sophisticated and slower functions are invoked */ |
michael@0 | 1426 | static |
michael@0 | 1427 | inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
michael@0 | 1428 | uint32_t order = 0; |
michael@0 | 1429 | if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ |
michael@0 | 1430 | order = *(collationSource->toReturn++); /* if so, return them */ |
michael@0 | 1431 | if(collationSource->CEpos == collationSource->toReturn) { |
michael@0 | 1432 | collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; |
michael@0 | 1433 | } |
michael@0 | 1434 | return order; |
michael@0 | 1435 | } |
michael@0 | 1436 | |
michael@0 | 1437 | UChar ch = 0; |
michael@0 | 1438 | collationSource->offsetReturn = NULL; |
michael@0 | 1439 | |
michael@0 | 1440 | do { |
michael@0 | 1441 | for (;;) /* Loop handles case when incremental normalize switches */ |
michael@0 | 1442 | { /* to or from the side buffer / original string, and we */ |
michael@0 | 1443 | /* need to start again to get the next character. */ |
michael@0 | 1444 | |
michael@0 | 1445 | if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) |
michael@0 | 1446 | { |
michael@0 | 1447 | // The source string is null terminated and we're not working from the side buffer, |
michael@0 | 1448 | // and we're not normalizing. This is the fast path. |
michael@0 | 1449 | // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) |
michael@0 | 1450 | ch = *collationSource->pos++; |
michael@0 | 1451 | if (ch != 0) { |
michael@0 | 1452 | break; |
michael@0 | 1453 | } |
michael@0 | 1454 | else { |
michael@0 | 1455 | return UCOL_NO_MORE_CES; |
michael@0 | 1456 | } |
michael@0 | 1457 | } |
michael@0 | 1458 | |
michael@0 | 1459 | if (collationSource->flags & UCOL_ITER_HASLEN) { |
michael@0 | 1460 | // Normal path for strings when length is specified. |
michael@0 | 1461 | // (We can't be in side buffer because it is always null terminated.) |
michael@0 | 1462 | if (collationSource->pos >= collationSource->endp) { |
michael@0 | 1463 | // Ran off of the end of the main source string. We're done. |
michael@0 | 1464 | return UCOL_NO_MORE_CES; |
michael@0 | 1465 | } |
michael@0 | 1466 | ch = *collationSource->pos++; |
michael@0 | 1467 | } |
michael@0 | 1468 | else if(collationSource->flags & UCOL_USE_ITERATOR) { |
michael@0 | 1469 | UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); |
michael@0 | 1470 | if(iterCh == U_SENTINEL) { |
michael@0 | 1471 | return UCOL_NO_MORE_CES; |
michael@0 | 1472 | } |
michael@0 | 1473 | ch = (UChar)iterCh; |
michael@0 | 1474 | } |
michael@0 | 1475 | else |
michael@0 | 1476 | { |
michael@0 | 1477 | // Null terminated string. |
michael@0 | 1478 | ch = *collationSource->pos++; |
michael@0 | 1479 | if (ch == 0) { |
michael@0 | 1480 | // Ran off end of buffer. |
michael@0 | 1481 | if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
michael@0 | 1482 | // Ran off end of main string. backing up one character. |
michael@0 | 1483 | collationSource->pos--; |
michael@0 | 1484 | return UCOL_NO_MORE_CES; |
michael@0 | 1485 | } |
michael@0 | 1486 | else |
michael@0 | 1487 | { |
michael@0 | 1488 | // Hit null in the normalize side buffer. |
michael@0 | 1489 | // Usually this means the end of the normalized data, |
michael@0 | 1490 | // except for one odd case: a null followed by combining chars, |
michael@0 | 1491 | // which is the case if we are at the start of the buffer. |
michael@0 | 1492 | if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) { |
michael@0 | 1493 | break; |
michael@0 | 1494 | } |
michael@0 | 1495 | |
michael@0 | 1496 | // Null marked end of side buffer. |
michael@0 | 1497 | // Revert to the main string and |
michael@0 | 1498 | // loop back to top to try again to get a character. |
michael@0 | 1499 | collationSource->pos = collationSource->fcdPosition; |
michael@0 | 1500 | collationSource->flags = collationSource->origFlags; |
michael@0 | 1501 | continue; |
michael@0 | 1502 | } |
michael@0 | 1503 | } |
michael@0 | 1504 | } |
michael@0 | 1505 | |
michael@0 | 1506 | if(collationSource->flags&UCOL_HIRAGANA_Q) { |
michael@0 | 1507 | /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag |
michael@0 | 1508 | * based on whether the previous codepoint was Hiragana or Katakana. |
michael@0 | 1509 | */ |
michael@0 | 1510 | if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || |
michael@0 | 1511 | ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { |
michael@0 | 1512 | collationSource->flags |= UCOL_WAS_HIRAGANA; |
michael@0 | 1513 | } else { |
michael@0 | 1514 | collationSource->flags &= ~UCOL_WAS_HIRAGANA; |
michael@0 | 1515 | } |
michael@0 | 1516 | } |
michael@0 | 1517 | |
michael@0 | 1518 | // We've got a character. See if there's any fcd and/or normalization stuff to do. |
michael@0 | 1519 | // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. |
michael@0 | 1520 | if ((collationSource->flags & UCOL_ITER_NORM) == 0) { |
michael@0 | 1521 | break; |
michael@0 | 1522 | } |
michael@0 | 1523 | |
michael@0 | 1524 | if (collationSource->fcdPosition >= collationSource->pos) { |
michael@0 | 1525 | // An earlier FCD check has already covered the current character. |
michael@0 | 1526 | // We can go ahead and process this char. |
michael@0 | 1527 | break; |
michael@0 | 1528 | } |
michael@0 | 1529 | |
michael@0 | 1530 | if (ch < ZERO_CC_LIMIT_ ) { |
michael@0 | 1531 | // Fast fcd safe path. Trailing combining class == 0. This char is OK. |
michael@0 | 1532 | break; |
michael@0 | 1533 | } |
michael@0 | 1534 | |
michael@0 | 1535 | if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
michael@0 | 1536 | // We need to peek at the next character in order to tell if we are FCD |
michael@0 | 1537 | if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { |
michael@0 | 1538 | // We are at the last char of source string. |
michael@0 | 1539 | // It is always OK for FCD check. |
michael@0 | 1540 | break; |
michael@0 | 1541 | } |
michael@0 | 1542 | |
michael@0 | 1543 | // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test |
michael@0 | 1544 | if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { |
michael@0 | 1545 | break; |
michael@0 | 1546 | } |
michael@0 | 1547 | } |
michael@0 | 1548 | |
michael@0 | 1549 | |
michael@0 | 1550 | // Need a more complete FCD check and possible normalization. |
michael@0 | 1551 | if (collIterFCD(collationSource)) { |
michael@0 | 1552 | collIterNormalize(collationSource); |
michael@0 | 1553 | } |
michael@0 | 1554 | if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
michael@0 | 1555 | // No normalization was needed. Go ahead and process the char we already had. |
michael@0 | 1556 | break; |
michael@0 | 1557 | } |
michael@0 | 1558 | |
michael@0 | 1559 | // Some normalization happened. Next loop iteration will pick up a char |
michael@0 | 1560 | // from the normalization buffer. |
michael@0 | 1561 | |
michael@0 | 1562 | } // end for (;;) |
michael@0 | 1563 | |
michael@0 | 1564 | |
michael@0 | 1565 | if (ch <= 0xFF) { |
michael@0 | 1566 | /* For latin-1 characters we never need to fall back to the UCA table */ |
michael@0 | 1567 | /* because all of the UCA data is replicated in the latinOneMapping array */ |
michael@0 | 1568 | order = coll->latinOneMapping[ch]; |
michael@0 | 1569 | if (order > UCOL_NOT_FOUND) { |
michael@0 | 1570 | order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); |
michael@0 | 1571 | } |
michael@0 | 1572 | } |
michael@0 | 1573 | else |
michael@0 | 1574 | { |
michael@0 | 1575 | // Always use UCA for Han, Hangul |
michael@0 | 1576 | // (Han extension A is before main Han block) |
michael@0 | 1577 | // **** Han compatibility chars ?? **** |
michael@0 | 1578 | if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && |
michael@0 | 1579 | (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { |
michael@0 | 1580 | if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { |
michael@0 | 1581 | // between the two target ranges; do normal lookup |
michael@0 | 1582 | // **** this range is YI, Modifier tone letters, **** |
michael@0 | 1583 | // **** Latin-D, Syloti Nagari, Phagas-pa. **** |
michael@0 | 1584 | // **** Latin-D might be tailored, so we need to **** |
michael@0 | 1585 | // **** do the normal lookup for these guys. **** |
michael@0 | 1586 | order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
michael@0 | 1587 | } else { |
michael@0 | 1588 | // in one of the target ranges; use UCA |
michael@0 | 1589 | order = UCOL_NOT_FOUND; |
michael@0 | 1590 | } |
michael@0 | 1591 | } else { |
michael@0 | 1592 | order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
michael@0 | 1593 | } |
michael@0 | 1594 | |
michael@0 | 1595 | if(order > UCOL_NOT_FOUND) { /* if a CE is special */ |
michael@0 | 1596 | order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ |
michael@0 | 1597 | } |
michael@0 | 1598 | |
michael@0 | 1599 | if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ |
michael@0 | 1600 | /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ |
michael@0 | 1601 | order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
michael@0 | 1602 | |
michael@0 | 1603 | if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ |
michael@0 | 1604 | order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); |
michael@0 | 1605 | } |
michael@0 | 1606 | } |
michael@0 | 1607 | } |
michael@0 | 1608 | } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL ); /* fetch the next character when a code unit in the Hangul range produced an ignorable CE */ |
michael@0 | 1609 | |
michael@0 | 1610 | if(order == UCOL_NOT_FOUND) { |
michael@0 | 1611 | order = getImplicit(ch, collationSource); |
michael@0 | 1612 | } |
michael@0 | 1613 | return order; /* return the CE */ |
michael@0 | 1614 | } |
michael@0 | 1615 | |
michael@0 | 1616 | /* ucol_getNextCE, out-of-line version for use from other files. Forwards its arguments unchanged to the inline ucol_IGetNextCE() and returns its result. */ |
michael@0 | 1617 | U_CAPI uint32_t U_EXPORT2 |
michael@0 | 1618 | ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
michael@0 | 1619 | return ucol_IGetNextCE(coll, collationSource, status); |
michael@0 | 1620 | } |
michael@0 | 1621 | |
michael@0 | 1622 | |
michael@0 | 1623 | /** |
michael@0 | 1624 | * Incremental previous normalization happens here. Pick up the range of chars |
michael@0 | 1625 | * identified by FCD, normalize it into the collIterate's writable buffer, |
michael@0 | 1626 | * switch the collIterate's state to use the writable buffer. |
michael@0 | 1627 | * @param data collation iterator data |
michael@0 | 1628 | */ |
michael@0 | 1629 | static |
michael@0 | 1630 | void collPrevIterNormalize(collIterate *data) |
michael@0 | 1631 | { |
michael@0 | 1632 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1633 | const UChar *pEnd = data->pos; /* End normalize + 1 */ |
michael@0 | 1634 | const UChar *pStart; |
michael@0 | 1635 | |
michael@0 | 1636 | /* Start normalize */ |
michael@0 | 1637 | if (data->fcdPosition == NULL) { |
michael@0 | 1638 | pStart = data->string; |
michael@0 | 1639 | } |
michael@0 | 1640 | else { |
michael@0 | 1641 | pStart = data->fcdPosition + 1; |
michael@0 | 1642 | } |
michael@0 | 1643 | |
michael@0 | 1644 | int32_t normLen = |
michael@0 | 1645 | data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), |
michael@0 | 1646 | data->writableBuffer, |
michael@0 | 1647 | status). |
michael@0 | 1648 | length(); |
michael@0 | 1649 | if(U_FAILURE(status)) { |
michael@0 | 1650 | return; |
michael@0 | 1651 | } |
michael@0 | 1652 | /* |
michael@0 | 1653 | this puts the null termination in front of the normalized string instead |
michael@0 | 1654 | of the end |
michael@0 | 1655 | */ |
michael@0 | 1656 | data->writableBuffer.insert(0, (UChar)0); |
michael@0 | 1657 | |
michael@0 | 1658 | /* |
michael@0 | 1659 | * The usual case at this point is that we've got a base |
michael@0 | 1660 | * character followed by marks that were normalized. If |
michael@0 | 1661 | * fcdPosition is NULL, that means that we backed up to |
michael@0 | 1662 | * the beginning of the string and there's no base character. |
michael@0 | 1663 | * |
michael@0 | 1664 | * Forward processing will usually normalize when it sees |
michael@0 | 1665 | * the first mark, so that mark will get its natural offset |
michael@0 | 1666 | * and the rest will get the offset of the character following |
michael@0 | 1667 | * the marks. The base character will also get its natural offset. |
michael@0 | 1668 | * |
michael@0 | 1669 | * We write the offset of the base character, if there is one, |
michael@0 | 1670 | * followed by the offset of the first mark and then the offsets |
michael@0 | 1671 | * of the rest of the marks. |
michael@0 | 1672 | */ |
michael@0 | 1673 | int32_t firstMarkOffset = 0; |
michael@0 | 1674 | int32_t trailOffset = (int32_t)(data->pos - data->string + 1); |
michael@0 | 1675 | int32_t trailCount = normLen - 1; |
michael@0 | 1676 | |
michael@0 | 1677 | if (data->fcdPosition != NULL) { |
michael@0 | 1678 | int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); |
michael@0 | 1679 | UChar baseChar = *data->fcdPosition; |
michael@0 | 1680 | |
michael@0 | 1681 | firstMarkOffset = baseOffset + 1; |
michael@0 | 1682 | |
michael@0 | 1683 | /* |
michael@0 | 1684 | * If the base character is the start of a contraction, forward processing |
michael@0 | 1685 | * will normalize the marks while checking for the contraction, which means |
michael@0 | 1686 | * that the offset of the first mark will be the same as the other marks. |
michael@0 | 1687 | * |
michael@0 | 1688 | * **** THIS IS PROBABLY NOT A COMPLETE TEST **** |
michael@0 | 1689 | */ |
michael@0 | 1690 | if (baseChar >= 0x100) { |
michael@0 | 1691 | uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); |
michael@0 | 1692 | |
michael@0 | 1693 | if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { |
michael@0 | 1694 | baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); |
michael@0 | 1695 | } |
michael@0 | 1696 | |
michael@0 | 1697 | if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { |
michael@0 | 1698 | firstMarkOffset = trailOffset; |
michael@0 | 1699 | } |
michael@0 | 1700 | } |
michael@0 | 1701 | |
michael@0 | 1702 | data->appendOffset(baseOffset, status); |
michael@0 | 1703 | } |
michael@0 | 1704 | |
michael@0 | 1705 | data->appendOffset(firstMarkOffset, status); |
michael@0 | 1706 | |
michael@0 | 1707 | for (int32_t i = 0; i < trailCount; i += 1) { |
michael@0 | 1708 | data->appendOffset(trailOffset, status); |
michael@0 | 1709 | } |
michael@0 | 1710 | |
michael@0 | 1711 | data->offsetRepeatValue = trailOffset; |
michael@0 | 1712 | |
michael@0 | 1713 | data->offsetReturn = data->offsetStore - 1; |
michael@0 | 1714 | if (data->offsetReturn == data->offsetBuffer) { |
michael@0 | 1715 | data->offsetStore = data->offsetBuffer; |
michael@0 | 1716 | } |
michael@0 | 1717 | |
michael@0 | 1718 | data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; |
michael@0 | 1719 | data->origFlags = data->flags; |
michael@0 | 1720 | data->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 1721 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
michael@0 | 1722 | } |
michael@0 | 1723 | |
michael@0 | 1724 | |
michael@0 | 1725 | /** |
michael@0 | 1726 | * Incremental FCD check for previous iteration and normalize. Called from |
michael@0 | 1727 | * getPrevCE when normalization state is suspect. |
michael@0 | 1728 | * When entering, the state is known to be this: |
michael@0 | 1729 | * o We are working in the main buffer of the collIterate, not the side |
michael@0 | 1730 | * writable buffer. When in the side buffer, normalization mode is always |
michael@0 | 1731 | * off, so we won't get here. |
michael@0 | 1732 | * o The leading combining class from the current character is 0 or the |
michael@0 | 1733 | * trailing combining class of the previous char was zero. |
michael@0 | 1734 | * True because the previous call to this function will have always exited |
michael@0 | 1735 | * that way, and we get called for every char where cc might be non-zero. |
michael@0 | 1736 | * @param data collation iterate struct |
michael@0 | 1737 | * @return normalization status, TRUE for normalization to be done, FALSE |
michael@0 | 1738 | * otherwise |
michael@0 | 1739 | */ |
michael@0 | 1740 | static |
michael@0 | 1741 | inline UBool collPrevIterFCD(collIterate *data) |
michael@0 | 1742 | { |
michael@0 | 1743 | const UChar *src, *start; |
michael@0 | 1744 | uint8_t leadingCC; |
michael@0 | 1745 | uint8_t trailingCC = 0; |
michael@0 | 1746 | uint16_t fcd; |
michael@0 | 1747 | UBool result = FALSE; |
michael@0 | 1748 | |
michael@0 | 1749 | start = data->string; |
michael@0 | 1750 | src = data->pos + 1; |
michael@0 | 1751 | |
michael@0 | 1752 | /* Get the FCD value of the current character; its leading combining class is extracted below. */ |
michael@0 | 1753 | fcd = g_nfcImpl->previousFCD16(start, src); |
michael@0 | 1754 | |
michael@0 | 1755 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
michael@0 | 1756 | |
michael@0 | 1757 | if (leadingCC != 0) { |
michael@0 | 1758 | /* |
michael@0 | 1759 | The current char has a non-zero leading combining class. |
michael@0 | 1760 | Scan backward until we find a char with a trailing cc of zero. |
michael@0 | 1761 | */ |
michael@0 | 1762 | for (;;) |
michael@0 | 1763 | { |
michael@0 | 1764 | if (start == src) { |
michael@0 | 1765 | data->fcdPosition = NULL; |
michael@0 | 1766 | return result; |
michael@0 | 1767 | } |
michael@0 | 1768 | |
michael@0 | 1769 | fcd = g_nfcImpl->previousFCD16(start, src); |
michael@0 | 1770 | |
michael@0 | 1771 | trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
michael@0 | 1772 | |
michael@0 | 1773 | if (trailingCC == 0) { |
michael@0 | 1774 | break; |
michael@0 | 1775 | } |
michael@0 | 1776 | |
michael@0 | 1777 | if (leadingCC < trailingCC) { |
michael@0 | 1778 | result = TRUE; |
michael@0 | 1779 | } |
michael@0 | 1780 | |
michael@0 | 1781 | leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
michael@0 | 1782 | } |
michael@0 | 1783 | } |
michael@0 | 1784 | |
michael@0 | 1785 | data->fcdPosition = (UChar *)src; |
michael@0 | 1786 | |
michael@0 | 1787 | return result; |
michael@0 | 1788 | } |
michael@0 | 1789 | |
michael@0 | 1790 | /** gets a code unit from the string at a given offset |
michael@0 | 1791 | * Handles both normal and iterative cases. |
michael@0 | 1792 | * No error checking - caller beware! Returns 0xfffd when neither pos nor iterator is set, or when the iterator yields a negative value. |
michael@0 | 1793 | */ |
michael@0 | 1794 | static inline |
michael@0 | 1795 | UChar peekCodeUnit(collIterate *source, int32_t offset) { |
michael@0 | 1796 | if(source->pos != NULL) { |
michael@0 | 1797 | return *(source->pos + offset); |
michael@0 | 1798 | } else if(source->iterator != NULL) { |
michael@0 | 1799 | UChar32 c; |
michael@0 | 1800 | if(offset != 0) { |
michael@0 | 1801 | source->iterator->move(source->iterator, offset, UITER_CURRENT); |
michael@0 | 1802 | c = source->iterator->next(source->iterator); |
michael@0 | 1803 | source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); /* undo both the move and the next() above */ |
michael@0 | 1804 | } else { |
michael@0 | 1805 | c = source->iterator->current(source->iterator); |
michael@0 | 1806 | } |
michael@0 | 1807 | return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. |
michael@0 | 1808 | } else { |
michael@0 | 1809 | return 0xfffd; |
michael@0 | 1810 | } |
michael@0 | 1811 | } |
michael@0 | 1812 | |
michael@0 | 1813 | // Code point version. Treats the offset as a _code point_ delta. Returns U_SENTINEL when neither pos nor iterator is set. |
michael@0 | 1814 | // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. |
michael@0 | 1815 | // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. |
michael@0 | 1816 | static inline |
michael@0 | 1817 | UChar32 peekCodePoint(collIterate *source, int32_t offset) { |
michael@0 | 1818 | UChar32 c; |
michael@0 | 1819 | if(source->pos != NULL) { |
michael@0 | 1820 | const UChar *p = source->pos; |
michael@0 | 1821 | if(offset >= 0) { |
michael@0 | 1822 | // Skip forward over offset code points. |
michael@0 | 1823 | while(--offset >= 0) { |
michael@0 | 1824 | if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { |
michael@0 | 1825 | ++p; |
michael@0 | 1826 | } |
michael@0 | 1827 | } |
michael@0 | 1828 | // Read the code point there. |
michael@0 | 1829 | c = *p++; |
michael@0 | 1830 | UChar trail; |
michael@0 | 1831 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { |
michael@0 | 1832 | c = U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1833 | } |
michael@0 | 1834 | } else /* offset<0 */ { |
michael@0 | 1835 | // Skip backward over (-offset-1) code points. |
michael@0 | 1836 | while(++offset < 0) { |
michael@0 | 1837 | if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { |
michael@0 | 1838 | --p; |
michael@0 | 1839 | } |
michael@0 | 1840 | } |
michael@0 | 1841 | // Read the code point before that. |
michael@0 | 1842 | c = *--p; |
michael@0 | 1843 | UChar lead; |
michael@0 | 1844 | if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { |
michael@0 | 1845 | c = U16_GET_SUPPLEMENTARY(lead, c); |
michael@0 | 1846 | } |
michael@0 | 1847 | } |
michael@0 | 1848 | } else if(source->iterator != NULL) { |
michael@0 | 1849 | if(offset >= 0) { |
michael@0 | 1850 | // Skip forward over offset code points. |
michael@0 | 1851 | int32_t fwd = offset; |
michael@0 | 1852 | while(fwd-- > 0) { |
michael@0 | 1853 | uiter_next32(source->iterator); |
michael@0 | 1854 | } |
michael@0 | 1855 | // Read the code point there. |
michael@0 | 1856 | c = uiter_current32(source->iterator); |
michael@0 | 1857 | // Return to the starting point, skipping backward over offset code points. |
michael@0 | 1858 | while(offset-- > 0) { |
michael@0 | 1859 | uiter_previous32(source->iterator); |
michael@0 | 1860 | } |
michael@0 | 1861 | } else /* offset<0 */ { |
michael@0 | 1862 | // Read backward, reading offset code points, remember only the last-read one. |
michael@0 | 1863 | int32_t back = offset; |
michael@0 | 1864 | do { |
michael@0 | 1865 | c = uiter_previous32(source->iterator); |
michael@0 | 1866 | } while(++back < 0); |
michael@0 | 1867 | // Return to the starting position, skipping forward over offset code points. |
michael@0 | 1868 | do { |
michael@0 | 1869 | uiter_next32(source->iterator); |
michael@0 | 1870 | } while(++offset < 0); |
michael@0 | 1871 | } |
michael@0 | 1872 | } else { |
michael@0 | 1873 | c = U_SENTINEL; |
michael@0 | 1874 | } |
michael@0 | 1875 | return c; |
michael@0 | 1876 | } |
michael@0 | 1877 | |
michael@0 | 1878 | /** |
michael@0 | 1879 | * Determines if we are at the start of the data string in the backwards |
michael@0 | 1880 | * collation iterator |
michael@0 | 1881 | * @param data collation iterator |
michael@0 | 1882 | * @return TRUE if we are at the start: with pos NULL and an iterator set, the iterator has no previous element; otherwise pos equals string, or we are in the normalization buffer with a 0 code unit just before pos and fcdPosition NULL |
michael@0 | 1883 | */ |
michael@0 | 1884 | static |
michael@0 | 1885 | inline UBool isAtStartPrevIterate(collIterate *data) { |
michael@0 | 1886 | if(data->pos == NULL && data->iterator != NULL) { |
michael@0 | 1887 | return !data->iterator->hasPrevious(data->iterator); |
michael@0 | 1888 | } |
michael@0 | 1889 | //return (collIter_bos(data)) || |
michael@0 | 1890 | return (data->pos == data->string) || |
michael@0 | 1891 | ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && |
michael@0 | 1892 | *(data->pos - 1) == 0 && data->fcdPosition == NULL); |
michael@0 | 1893 | } |
michael@0 | 1894 | |
michael@0 | 1895 | static |
michael@0 | 1896 | inline void goBackOne(collIterate *data) { /* Steps back by one: calls previous() on the iterator when UCOL_USE_ITERATOR is set, and decrements pos when it is non-NULL. */ |
michael@0 | 1897 | # if 0 |
michael@0 | 1898 | // somehow, it looks like we need to keep iterator synced up |
michael@0 | 1899 | // at all times, as above. |
michael@0 | 1900 | if(data->pos) { |
michael@0 | 1901 | data->pos--; |
michael@0 | 1902 | } |
michael@0 | 1903 | if(data->iterator) { |
michael@0 | 1904 | data->iterator->previous(data->iterator); |
michael@0 | 1905 | } |
michael@0 | 1906 | #endif |
michael@0 | 1907 | if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { |
michael@0 | 1908 | data->iterator->previous(data->iterator); |
michael@0 | 1909 | } |
michael@0 | 1910 | if(data->pos) { |
michael@0 | 1911 | data->pos --; |
michael@0 | 1912 | } |
michael@0 | 1913 | } |
michael@0 | 1914 | |
/**
 * Inline function that gets the previous collation element (CE) when
 * iterating backwards.
 * It first checks the expansion buffer. If the expansion buffer is not
 * empty, i.e. the return pointer is past the start of the CE buffer in use
 * (extendCEs when set, CEs otherwise), the return pointer is decremented and
 * the CE it then points to is returned.
 * Otherwise the previous code unit is fetched (from the length-specified
 * string, the character iterator, or the normalization side buffer), an
 * FCD check / normalization is done if needed, and the CE is looked up.
 * Special CEs are resolved through ucol_prv_getSpecialPrevCE.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status
 * @return the previous CE, or UCOL_NO_MORE_CES when the start of the input
 *         has been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back in parallel with the CE buffer. */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                /* Offset buffer exhausted: reset it. */
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* Buffered CEs are pending if toReturn is past the start of the active buffer. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            /* Buffer drained: rewind the write pointer as well. */
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* Reached the start of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            /* Nothing precedes the normalized segment in the string. */
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos   = data->fcdPosition + 1;
                        }

                        /* Re-read a code unit using the restored flags. */
                        continue;
                    }
                }

                /* Track whether the last code unit read was in U+3040..U+309F. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                    // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /*  No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Code units up to 0xFF use the direct latinOneMapping table. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are special CEs needing further resolution. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Fall back to the UCA mapping when one is attached. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        /* Keep reading while a code unit in the Hangul range produced an ignorable CE. */
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still unmapped: derive an implicit CE for the code unit. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}
michael@0 | 2122 | |
michael@0 | 2123 | |
michael@0 | 2124 | /* ucol_getPrevCE, out-of-line version for use from other files. */ |
michael@0 | 2125 | U_CFUNC uint32_t U_EXPORT2 |
michael@0 | 2126 | ucol_getPrevCE(const UCollator *coll, collIterate *data, |
michael@0 | 2127 | UErrorCode *status) { |
michael@0 | 2128 | return ucol_IGetPrevCE(coll, data, status); |
michael@0 | 2129 | } |
michael@0 | 2130 | |
michael@0 | 2131 | |
michael@0 | 2132 | /* this should be connected to special Jamo handling */ |
michael@0 | 2133 | U_CFUNC uint32_t U_EXPORT2 |
michael@0 | 2134 | ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { |
michael@0 | 2135 | collIterate colIt; |
michael@0 | 2136 | IInit_collIterate(coll, &u, 1, &colIt, status); |
michael@0 | 2137 | if(U_FAILURE(*status)) { |
michael@0 | 2138 | return 0; |
michael@0 | 2139 | } |
michael@0 | 2140 | return ucol_IGetNextCE(coll, &colIt, status); |
michael@0 | 2141 | } |
michael@0 | 2142 | |
michael@0 | 2143 | /** |
michael@0 | 2144 | * Inserts the argument character into the end of the buffer pushing back the |
michael@0 | 2145 | * null terminator. |
michael@0 | 2146 | * @param data collIterate struct data |
michael@0 | 2147 | * @param ch character to be appended |
michael@0 | 2148 | * @return the position of the new addition |
michael@0 | 2149 | */ |
michael@0 | 2150 | static |
michael@0 | 2151 | inline const UChar * insertBufferEnd(collIterate *data, UChar ch) |
michael@0 | 2152 | { |
michael@0 | 2153 | int32_t oldLength = data->writableBuffer.length(); |
michael@0 | 2154 | return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; |
michael@0 | 2155 | } |
michael@0 | 2156 | |
michael@0 | 2157 | /** |
michael@0 | 2158 | * Inserts the argument string into the end of the buffer pushing back the |
michael@0 | 2159 | * null terminator. |
michael@0 | 2160 | * @param data collIterate struct data |
michael@0 | 2161 | * @param string to be appended |
michael@0 | 2162 | * @param length of the string to be appended |
michael@0 | 2163 | * @return the position of the new addition |
michael@0 | 2164 | */ |
michael@0 | 2165 | static |
michael@0 | 2166 | inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) |
michael@0 | 2167 | { |
michael@0 | 2168 | int32_t oldLength = data->writableBuffer.length(); |
michael@0 | 2169 | return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; |
michael@0 | 2170 | } |
michael@0 | 2171 | |
michael@0 | 2172 | /** |
michael@0 | 2173 | * Special normalization function for contraction in the forwards iterator. |
michael@0 | 2174 | * This normalization sequence will place the current character at source->pos |
michael@0 | 2175 | * and its following normalized sequence into the buffer. |
michael@0 | 2176 | * The fcd position, pos will be changed. |
michael@0 | 2177 | * pos will now point to positions in the buffer. |
michael@0 | 2178 | * Flags will be changed accordingly. |
michael@0 | 2179 | * @param data collation iterator data |
michael@0 | 2180 | */ |
michael@0 | 2181 | static |
michael@0 | 2182 | inline void normalizeNextContraction(collIterate *data) |
michael@0 | 2183 | { |
michael@0 | 2184 | int32_t strsize; |
michael@0 | 2185 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 2186 | /* because the pointer points to the next character */ |
michael@0 | 2187 | const UChar *pStart = data->pos - 1; |
michael@0 | 2188 | const UChar *pEnd; |
michael@0 | 2189 | |
michael@0 | 2190 | if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
michael@0 | 2191 | data->writableBuffer.setTo(*(pStart - 1)); |
michael@0 | 2192 | strsize = 1; |
michael@0 | 2193 | } |
michael@0 | 2194 | else { |
michael@0 | 2195 | strsize = data->writableBuffer.length(); |
michael@0 | 2196 | } |
michael@0 | 2197 | |
michael@0 | 2198 | pEnd = data->fcdPosition; |
michael@0 | 2199 | |
michael@0 | 2200 | data->writableBuffer.append( |
michael@0 | 2201 | data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); |
michael@0 | 2202 | if(U_FAILURE(status)) { |
michael@0 | 2203 | return; |
michael@0 | 2204 | } |
michael@0 | 2205 | |
michael@0 | 2206 | data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; |
michael@0 | 2207 | data->origFlags = data->flags; |
michael@0 | 2208 | data->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 2209 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
michael@0 | 2210 | } |
michael@0 | 2211 | |
michael@0 | 2212 | /** |
michael@0 | 2213 | * Contraction character management function that returns the next character |
michael@0 | 2214 | * for the forwards iterator. |
michael@0 | 2215 | * Does nothing if the next character is in buffer and not the first character |
michael@0 | 2216 | * in it. |
michael@0 | 2217 | * Else it checks next character in data string to see if it is normalizable. |
michael@0 | 2218 | * If it is not, the character is simply copied into the buffer, else |
michael@0 | 2219 | * the whole normalized substring is copied into the buffer, including the |
michael@0 | 2220 | * current character. |
michael@0 | 2221 | * @param data collation element iterator data |
michael@0 | 2222 | * @return next character |
michael@0 | 2223 | */ |
michael@0 | 2224 | static |
michael@0 | 2225 | inline UChar getNextNormalizedChar(collIterate *data) |
michael@0 | 2226 | { |
michael@0 | 2227 | UChar nextch; |
michael@0 | 2228 | UChar ch; |
michael@0 | 2229 | // Here we need to add the iterator code. One problem is the way |
michael@0 | 2230 | // end of string is handled. If we just return next char, it could |
michael@0 | 2231 | // be the sentinel. Most of the cases already check for this, but we |
michael@0 | 2232 | // need to be sure. |
michael@0 | 2233 | if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { |
michael@0 | 2234 | /* if no normalization and not in buffer. */ |
michael@0 | 2235 | if(data->flags & UCOL_USE_ITERATOR) { |
michael@0 | 2236 | return (UChar)data->iterator->next(data->iterator); |
michael@0 | 2237 | } else { |
michael@0 | 2238 | return *(data->pos ++); |
michael@0 | 2239 | } |
michael@0 | 2240 | } |
michael@0 | 2241 | |
michael@0 | 2242 | //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { |
michael@0 | 2243 | //normalizeIterator(data); |
michael@0 | 2244 | //} |
michael@0 | 2245 | |
michael@0 | 2246 | UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
michael@0 | 2247 | if ((innormbuf && *data->pos != 0) || |
michael@0 | 2248 | (data->fcdPosition != NULL && !innormbuf && |
michael@0 | 2249 | data->pos < data->fcdPosition)) { |
michael@0 | 2250 | /* |
michael@0 | 2251 | if next character is in normalized buffer, no further normalization |
michael@0 | 2252 | is required |
michael@0 | 2253 | */ |
michael@0 | 2254 | return *(data->pos ++); |
michael@0 | 2255 | } |
michael@0 | 2256 | |
michael@0 | 2257 | if (data->flags & UCOL_ITER_HASLEN) { |
michael@0 | 2258 | /* in data string */ |
michael@0 | 2259 | if (data->pos + 1 == data->endp) { |
michael@0 | 2260 | return *(data->pos ++); |
michael@0 | 2261 | } |
michael@0 | 2262 | } |
michael@0 | 2263 | else { |
michael@0 | 2264 | if (innormbuf) { |
michael@0 | 2265 | // inside the normalization buffer, but at the end |
michael@0 | 2266 | // (since we encountered zero). This means, in the |
michael@0 | 2267 | // case we're using char iterator, that we need to |
michael@0 | 2268 | // do another round of normalization. |
michael@0 | 2269 | //if(data->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 2270 | // we need to restore original flags, |
michael@0 | 2271 | // otherwise, we'll lose them |
michael@0 | 2272 | //data->flags = data->origFlags; |
michael@0 | 2273 | //normalizeIterator(data); |
michael@0 | 2274 | //return *(data->pos++); |
michael@0 | 2275 | //} else { |
michael@0 | 2276 | /* |
michael@0 | 2277 | in writable buffer, at this point fcdPosition can not be |
michael@0 | 2278 | pointing to the end of the data string. see contracting tag. |
michael@0 | 2279 | */ |
michael@0 | 2280 | if(data->fcdPosition) { |
michael@0 | 2281 | if (*(data->fcdPosition + 1) == 0 || |
michael@0 | 2282 | data->fcdPosition + 1 == data->endp) { |
michael@0 | 2283 | /* at the end of the string, dump it into the normalizer */ |
michael@0 | 2284 | data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; |
michael@0 | 2285 | // Check if data->pos received a null pointer |
michael@0 | 2286 | if (data->pos == NULL) { |
michael@0 | 2287 | return (UChar)-1; // Return to indicate error. |
michael@0 | 2288 | } |
michael@0 | 2289 | return *(data->fcdPosition ++); |
michael@0 | 2290 | } |
michael@0 | 2291 | data->pos = data->fcdPosition; |
michael@0 | 2292 | } else if(data->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 2293 | // if we are here, we're using a normalizing iterator. |
michael@0 | 2294 | // we should just continue further. |
michael@0 | 2295 | data->flags = data->origFlags; |
michael@0 | 2296 | data->pos = NULL; |
michael@0 | 2297 | return (UChar)data->iterator->next(data->iterator); |
michael@0 | 2298 | } |
michael@0 | 2299 | //} |
michael@0 | 2300 | } |
michael@0 | 2301 | else { |
michael@0 | 2302 | if (*(data->pos + 1) == 0) { |
michael@0 | 2303 | return *(data->pos ++); |
michael@0 | 2304 | } |
michael@0 | 2305 | } |
michael@0 | 2306 | } |
michael@0 | 2307 | |
michael@0 | 2308 | ch = *data->pos ++; |
michael@0 | 2309 | nextch = *data->pos; |
michael@0 | 2310 | |
michael@0 | 2311 | /* |
michael@0 | 2312 | * if the current character is not fcd. |
michael@0 | 2313 | * Trailing combining class == 0. |
michael@0 | 2314 | */ |
michael@0 | 2315 | if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && |
michael@0 | 2316 | (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || |
michael@0 | 2317 | ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { |
michael@0 | 2318 | /* |
michael@0 | 2319 | Need a more complete FCD check and possible normalization. |
michael@0 | 2320 | normalize substring will be appended to buffer |
michael@0 | 2321 | */ |
michael@0 | 2322 | if (collIterFCD(data)) { |
michael@0 | 2323 | normalizeNextContraction(data); |
michael@0 | 2324 | return *(data->pos ++); |
michael@0 | 2325 | } |
michael@0 | 2326 | else if (innormbuf) { |
michael@0 | 2327 | /* fcdposition shifted even when there's no normalization, if we |
michael@0 | 2328 | don't input the rest into this, we'll get the wrong position when |
michael@0 | 2329 | we reach the end of the writableBuffer */ |
michael@0 | 2330 | int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); |
michael@0 | 2331 | data->pos = insertBufferEnd(data, data->pos - 1, length); |
michael@0 | 2332 | // Check if data->pos received a null pointer |
michael@0 | 2333 | if (data->pos == NULL) { |
michael@0 | 2334 | return (UChar)-1; // Return to indicate error. |
michael@0 | 2335 | } |
michael@0 | 2336 | return *(data->pos ++); |
michael@0 | 2337 | } |
michael@0 | 2338 | } |
michael@0 | 2339 | |
michael@0 | 2340 | if (innormbuf) { |
michael@0 | 2341 | /* |
michael@0 | 2342 | no normalization is to be done hence only one character will be |
michael@0 | 2343 | appended to the buffer. |
michael@0 | 2344 | */ |
michael@0 | 2345 | data->pos = insertBufferEnd(data, ch) + 1; |
michael@0 | 2346 | // Check if data->pos received a null pointer |
michael@0 | 2347 | if (data->pos == NULL) { |
michael@0 | 2348 | return (UChar)-1; // Return to indicate error. |
michael@0 | 2349 | } |
michael@0 | 2350 | } |
michael@0 | 2351 | |
michael@0 | 2352 | /* points back to the pos in string */ |
michael@0 | 2353 | return ch; |
michael@0 | 2354 | } |
michael@0 | 2355 | |
michael@0 | 2356 | |
michael@0 | 2357 | |
michael@0 | 2358 | /** |
michael@0 | 2359 | * Function to copy the buffer into writableBuffer and sets the fcd position to |
michael@0 | 2360 | * the correct position |
michael@0 | 2361 | * @param source data string source |
michael@0 | 2362 | * @param buffer character buffer |
michael@0 | 2363 | */ |
michael@0 | 2364 | static |
michael@0 | 2365 | inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) |
michael@0 | 2366 | { |
michael@0 | 2367 | /* okay confusing part here. to ensure that the skipped characters are |
michael@0 | 2368 | considered later, we need to place it in the appropriate position in the |
michael@0 | 2369 | normalization buffer and reassign the pos pointer. simple case if pos |
michael@0 | 2370 | reside in string, simply copy to normalization buffer and |
michael@0 | 2371 | fcdposition = pos, pos = start of normalization buffer. if pos in |
michael@0 | 2372 | normalization buffer, we'll insert the copy infront of pos and point pos |
michael@0 | 2373 | to the start of the normalization buffer. why am i doing these copies? |
michael@0 | 2374 | well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does |
michael@0 | 2375 | not require any changes, which be really painful. */ |
michael@0 | 2376 | if (source->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 2377 | int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); |
michael@0 | 2378 | source->writableBuffer.replace(0, replaceLength, buffer); |
michael@0 | 2379 | } |
michael@0 | 2380 | else { |
michael@0 | 2381 | source->fcdPosition = source->pos; |
michael@0 | 2382 | source->origFlags = source->flags; |
michael@0 | 2383 | source->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 2384 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
michael@0 | 2385 | source->writableBuffer = buffer; |
michael@0 | 2386 | } |
michael@0 | 2387 | |
michael@0 | 2388 | source->pos = source->writableBuffer.getTerminatedBuffer(); |
michael@0 | 2389 | } |
michael@0 | 2390 | |
/**
 * Function to get the discontiguous collation element within the source.
 * Starting from the contraction table entry at constart, it reads following
 * characters with getNextNormalizedChar and tries to match them against the
 * contraction table, collecting characters that do not take part in a match
 * into a side buffer. When a match yields a final CE, the collected
 * characters are pushed back via setDiscontiguosAttribute so they are
 * processed afterwards. If no match is found, the iterator state saved on
 * entry is restored.
 * Note this function will set the position to the appropriate places.
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguous collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    UnicodeString buffer;
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Snapshot the iterator so a failed match can be undone at the end. */
    backupState(source, &discState);

    /* Seed the skipped-characters buffer with the code point just before pos. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning at the end of a length-specified string, at a
           terminating zero code unit, or when the code point at pos has
           combining class 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
              source->fcdPosition == NULL ||
              source->fcdPosition == source->endp ||
              *(source->fcdPosition) == 0 ||
              u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
              /* end of string in null terminated string or stopped by a
              null character, note fcd does not always point to a base
              character after the discontiguous change */
              u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
              //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A partial multi-contraction match was recorded: reposition
                   to it, push the skipped characters back and return the CE
                   stored for that table entry. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table entries while they are smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* The table has an entry for schar, but it is skipped as well if
               its combining class equals that of the code point two
               positions back. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Final CE found: push the skipped characters back and return it. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
michael@0 | 2498 | |
michael@0 | 2499 | /* now uses Mark's getImplicitPrimary code */ |
michael@0 | 2500 | static |
michael@0 | 2501 | inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { |
michael@0 | 2502 | uint32_t r = uprv_uca_getImplicitPrimary(cp); |
michael@0 | 2503 | *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; |
michael@0 | 2504 | collationSource->offsetRepeatCount += 1; |
michael@0 | 2505 | return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' |
michael@0 | 2506 | } |
michael@0 | 2507 | |
michael@0 | 2508 | /** |
michael@0 | 2509 | * Inserts the argument character into the front of the buffer replacing the |
michael@0 | 2510 | * front null terminator. |
michael@0 | 2511 | * @param data collation element iterator data |
michael@0 | 2512 | * @param ch character to be appended |
michael@0 | 2513 | */ |
michael@0 | 2514 | static |
michael@0 | 2515 | inline void insertBufferFront(collIterate *data, UChar ch) |
michael@0 | 2516 | { |
michael@0 | 2517 | data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2; |
michael@0 | 2518 | } |
michael@0 | 2519 | |
michael@0 | 2520 | /** |
michael@0 | 2521 | * Special normalization function for contractions in the previous (backwards) iterator. |
michael@0 | 2522 | * Normalizes the source text from pStart through the character at data->pos, with data->writableBuffer as the destination; |
michael@0 | 2523 | * pStart is data->string when fcdPosition is NULL, otherwise fcdPosition + 1 (fcdPosition itself is only read here). |
michael@0 | 2524 | * A NUL is then inserted in front of the normalized text and the saved tail is appended; on success data->pos points into the buffer, just past the normalized text. |
michael@0 | 2525 | * The old flags are saved in origFlags; UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM / UCOL_ITER_HASLEN are cleared. |
michael@0 | 2526 | * @param data collation iterator data |
michael@0 | 2527 | * @param status error code; if it reports failure after the normalize call, the function returns without updating pos or flags |
michael@0 | 2528 | */ |
michael@0 | 2529 | static |
michael@0 | 2530 | inline void normalizePrevContraction(collIterate *data, UErrorCode *status) |
michael@0 | 2531 | { |
michael@0 | 2532 | const UChar *pEnd = data->pos + 1; /* exclusive end of the range to normalize: one past the character at data->pos */ |
michael@0 | 2533 | const UChar *pStart; |
michael@0 | 2534 | |
michael@0 | 2535 | UnicodeString endOfBuffer; /* text that has to follow the normalized run once the buffer is rebuilt */ |
michael@0 | 2536 | if (data->flags & UCOL_ITER_HASLEN) { |
michael@0 | 2537 | /* |
michael@0 | 2538 | normalization buffer not used yet, we'll pull down the next |
michael@0 | 2539 | character (the single code unit at pEnd) into the end of the buffer. |
michael@0 | 2539 | NOTE(review): this reads *pEnd, i.e. one past data->pos - confirm callers guarantee it is readable. |
michael@0 | 2540 | */ |
michael@0 | 2541 | endOfBuffer.setTo(*pEnd); |
michael@0 | 2542 | } |
michael@0 | 2543 | else { |
michael@0 | 2544 | endOfBuffer.setTo(data->writableBuffer, 1); // already in the buffer: keep its current contents after the leading NUL |
michael@0 | 2545 | } |
michael@0 | 2546 | |
michael@0 | 2547 | if (data->fcdPosition == NULL) { |
michael@0 | 2548 | pStart = data->string; /* no FCD position recorded: normalize from the start of the source string */ |
michael@0 | 2549 | } |
michael@0 | 2550 | else { |
michael@0 | 2551 | pStart = data->fcdPosition + 1; /* otherwise start right after the recorded FCD position */ |
michael@0 | 2552 | } |
michael@0 | 2553 | int32_t normLen = /* length of the normalized form of [pStart, pEnd); the result lands in writableBuffer */ |
michael@0 | 2554 | data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), |
michael@0 | 2555 | data->writableBuffer, |
michael@0 | 2556 | *status). |
michael@0 | 2557 | length(); |
michael@0 | 2558 | if(U_FAILURE(*status)) { |
michael@0 | 2559 | return; /* pos and flags are left as they were on entry */ |
michael@0 | 2560 | } |
michael@0 | 2561 | /* |
michael@0 | 2562 | this puts the null termination in front of the normalized string instead |
michael@0 | 2563 | of the end; pos = buffer + 1 (leading NUL) + normLen, i.e. where the appended endOfBuffer text begins |
michael@0 | 2564 | */ |
michael@0 | 2565 | data->pos = |
michael@0 | 2566 | data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + |
michael@0 | 2567 | 1 + normLen; |
michael@0 | 2568 | data->origFlags = data->flags; /* remember the pre-buffer flags before they are rewritten below */ |
michael@0 | 2569 | data->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 2570 | data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
michael@0 | 2571 | } |
michael@0 | 2572 | |
michael@0 | 2573 | /** |
michael@0 | 2574 | * Contraction character management function that returns the previous character |
michael@0 | 2575 | * for the backwards iterator. |
michael@0 | 2576 | * If neither UCOL_ITER_NORM nor UCOL_ITER_INNORMBUF is set, or pos is in the normalization buffer and the |
michael@0 | 2577 | * character before it is not the buffer's leading NUL, that character is returned directly; data->pos is not changed on this path. |
michael@0 | 2578 | * Otherwise the previous character in the data string is examined (quick range test, then collPrevIterFCD) |
michael@0 | 2579 | * to see whether it needs normalizing. |
michael@0 | 2580 | * If it does not, and pos is in the buffer, the character is simply inserted at the front of the buffer; if it does, |
michael@0 | 2581 | * normalizePrevContraction() places the whole normalized substring, including the current character, into the buffer. |
michael@0 | 2582 | * @param data collation element iterator data |
michael@0 | 2583 | * @param status error code, forwarded to normalizePrevContraction() |
michael@0 | 2584 | * @return previous character |
michael@0 | 2585 | */ |
michael@0 | 2586 | static |
michael@0 | 2587 | inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) |
michael@0 | 2588 | { |
michael@0 | 2589 | UChar prevch; |
michael@0 | 2590 | UChar ch; |
michael@0 | 2591 | const UChar *start; |
michael@0 | 2592 | UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); /* captured on entry; the flags may be rewritten further down */ |
michael@0 | 2593 | if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || |
michael@0 | 2594 | (innormbuf && *(data->pos - 1) != 0)) { |
michael@0 | 2595 | /* |
michael@0 | 2596 | Fast path: no normalization requested, or the previous character is |
michael@0 | 2597 | already in the normalized buffer (and is not its leading NUL), so no |
michael@0 | 2598 | further normalization is required. |
michael@0 | 2599 | */ |
michael@0 | 2600 | if(data->flags & UCOL_USE_ITERATOR) { |
michael@0 | 2601 | data->iterator->move(data->iterator, -1, UITER_CURRENT); /* step back one, then read it again with next() */ |
michael@0 | 2602 | return (UChar)data->iterator->next(data->iterator); |
michael@0 | 2603 | } else { |
michael@0 | 2604 | return *(data->pos - 1); |
michael@0 | 2605 | } |
michael@0 | 2606 | } |
michael@0 | 2607 | |
michael@0 | 2608 | start = data->pos; |
michael@0 | 2609 | if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { |
michael@0 | 2610 | /* in data string */ |
michael@0 | 2611 | if ((start - 1) == data->string) { |
michael@0 | 2612 | return *(start - 1); /* previous character is the first one of the string: return it without an FCD check */ |
michael@0 | 2613 | } |
michael@0 | 2614 | start --; |
michael@0 | 2615 | ch = *start; /* the character to return */ |
michael@0 | 2616 | prevch = *(start - 1); /* its predecessor, used only by the quick range test below */ |
michael@0 | 2617 | } |
michael@0 | 2618 | else { |
michael@0 | 2619 | /* |
michael@0 | 2620 | in writable buffer, at this point fcdPosition can not be NULL. |
michael@0 | 2621 | see contracting tag. |
michael@0 | 2622 | */ |
michael@0 | 2623 | if (data->fcdPosition == data->string) { |
michael@0 | 2624 | /* at the start of the string, just dump it into the normalization buffer and mark the source as exhausted (fcdPosition = NULL) */ |
michael@0 | 2625 | insertBufferFront(data, *(data->fcdPosition)); |
michael@0 | 2626 | data->fcdPosition = NULL; |
michael@0 | 2627 | return *(data->pos - 1); |
michael@0 | 2628 | } |
michael@0 | 2629 | start = data->fcdPosition; |
michael@0 | 2630 | ch = *start; |
michael@0 | 2631 | prevch = *(start - 1); |
michael@0 | 2632 | } |
michael@0 | 2633 | /* |
michael@0 | 2634 | * Quick test for "the current character may not be FCD": run the full check only when either |
michael@0 | 2635 | * code unit is >= NFC_ZERO_CC_BLOCK_LIMIT_. fcdPosition > start can only hold on the in-data-string path, since the buffer path sets start = fcdPosition. |
michael@0 | 2636 | */ |
michael@0 | 2637 | if (data->fcdPosition > start && |
michael@0 | 2638 | (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) |
michael@0 | 2639 | { |
michael@0 | 2640 | /* |
michael@0 | 2641 | Need a more complete FCD check and possible normalization. |
michael@0 | 2642 | The normalized substring will be placed into the buffer. pos is moved to start for the check and restored if no normalization is needed. |
michael@0 | 2643 | */ |
michael@0 | 2644 | const UChar *backuppos = data->pos; |
michael@0 | 2645 | data->pos = start; |
michael@0 | 2646 | if (collPrevIterFCD(data)) { |
michael@0 | 2647 | normalizePrevContraction(data, status); /* NOTE(review): status is not checked here; on failure pos is still `start` when it is dereferenced below */ |
michael@0 | 2648 | return *(data->pos - 1); |
michael@0 | 2649 | } |
michael@0 | 2650 | data->pos = backuppos; |
michael@0 | 2651 | data->fcdPosition ++; |
michael@0 | 2652 | } |
michael@0 | 2653 | |
michael@0 | 2654 | if (innormbuf) { |
michael@0 | 2655 | /* |
michael@0 | 2656 | no normalization is to be done hence only one character will be |
michael@0 | 2657 | inserted at the front of the buffer, and fcdPosition steps back by one. |
michael@0 | 2658 | */ |
michael@0 | 2659 | insertBufferFront(data, ch); |
michael@0 | 2660 | data->fcdPosition --; |
michael@0 | 2661 | } |
michael@0 | 2662 | |
michael@0 | 2663 | return ch; |
michael@0 | 2664 | } |
michael@0 | 2665 | |
michael@0 | 2666 | /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ |
michael@0 | 2667 | /* It is called by getNextCE */ |
michael@0 | 2668 | |
michael@0 | 2669 | /* The following should be even */ |
michael@0 | 2670 | #define UCOL_MAX_DIGITS_FOR_NUMBER 254 |
michael@0 | 2671 | |
michael@0 | 2672 | uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { |
michael@0 | 2673 | collIterateState entryState; |
michael@0 | 2674 | backupState(source, &entryState); |
michael@0 | 2675 | UChar32 cp = ch; |
michael@0 | 2676 | |
michael@0 | 2677 | for (;;) { |
michael@0 | 2678 | // This loop will repeat only in the case of contractions, and only when a contraction |
michael@0 | 2679 | // is found and the first CE resulting from that contraction is itself a special |
michael@0 | 2680 | // (an expansion, for example.) All other special CE types are fully handled the |
michael@0 | 2681 | // first time through, and the loop exits. |
michael@0 | 2682 | |
michael@0 | 2683 | const uint32_t *CEOffset = NULL; |
michael@0 | 2684 | switch(getCETag(CE)) { |
michael@0 | 2685 | case NOT_FOUND_TAG: |
michael@0 | 2686 | /* This one is not found, and we'll let somebody else bother about it... no more games */ |
michael@0 | 2687 | return CE; |
michael@0 | 2688 | case SPEC_PROC_TAG: |
michael@0 | 2689 | { |
michael@0 | 2690 | // Special processing is getting a CE that is preceded by a certain prefix |
michael@0 | 2691 | // Currently this is only needed for optimizing Japanese length and iteration marks. |
michael@0 | 2692 | // When we encounter a special processing tag, we go backwards and try to see if |
michael@0 | 2693 | // we have a match. |
michael@0 | 2694 | // Contraction tables are used - so the whole process is not unlike contraction. |
michael@0 | 2695 | // prefix data is stored backwards in the table. |
michael@0 | 2696 | const UChar *UCharOffset; |
michael@0 | 2697 | UChar schar, tchar; |
michael@0 | 2698 | collIterateState prefixState; |
michael@0 | 2699 | backupState(source, &prefixState); |
michael@0 | 2700 | loadState(source, &entryState, TRUE); |
michael@0 | 2701 | goBackOne(source); // We want to look at the point where we entered - actually one |
michael@0 | 2702 | // before that... |
michael@0 | 2703 | |
michael@0 | 2704 | for(;;) { |
michael@0 | 2705 | // This loop will run once per source string character, for as long as we |
michael@0 | 2706 | // are matching a potential contraction sequence |
michael@0 | 2707 | |
michael@0 | 2708 | // First we position ourselves at the beginning of contraction sequence |
michael@0 | 2709 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
michael@0 | 2710 | if (collIter_bos(source)) { |
michael@0 | 2711 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); |
michael@0 | 2712 | break; |
michael@0 | 2713 | } |
michael@0 | 2714 | schar = getPrevNormalizedChar(source, status); |
michael@0 | 2715 | goBackOne(source); |
michael@0 | 2716 | |
michael@0 | 2717 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
michael@0 | 2718 | UCharOffset++; |
michael@0 | 2719 | } |
michael@0 | 2720 | |
michael@0 | 2721 | if (schar == tchar) { |
michael@0 | 2722 | // Found the source string char in the table. |
michael@0 | 2723 | // Pick up the corresponding CE from the table. |
michael@0 | 2724 | CE = *(coll->contractionCEs + |
michael@0 | 2725 | (UCharOffset - coll->contractionIndex)); |
michael@0 | 2726 | } |
michael@0 | 2727 | else |
michael@0 | 2728 | { |
michael@0 | 2729 | // Source string char was not in the table. |
michael@0 | 2730 | // We have not found the prefix. |
michael@0 | 2731 | CE = *(coll->contractionCEs + |
michael@0 | 2732 | (ContractionStart - coll->contractionIndex)); |
michael@0 | 2733 | } |
michael@0 | 2734 | |
michael@0 | 2735 | if(!isPrefix(CE)) { |
michael@0 | 2736 | // The source string char was in the contraction table, and the corresponding |
michael@0 | 2737 | // CE is not a prefix CE. We found the prefix, break |
michael@0 | 2738 | // out of loop, this CE will end up being returned. This is the normal |
michael@0 | 2739 | // way out of prefix handling when the source actually contained |
michael@0 | 2740 | // the prefix. |
michael@0 | 2741 | break; |
michael@0 | 2742 | } |
michael@0 | 2743 | } |
michael@0 | 2744 | if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue |
michael@0 | 2745 | loadState(source, &prefixState, TRUE); |
michael@0 | 2746 | if(source->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 2747 | source->flags = source->origFlags; |
michael@0 | 2748 | } |
michael@0 | 2749 | } else { // prefix search was a failure, we have to backup all the way to the start |
michael@0 | 2750 | loadState(source, &entryState, TRUE); |
michael@0 | 2751 | } |
michael@0 | 2752 | break; |
michael@0 | 2753 | } |
michael@0 | 2754 | case CONTRACTION_TAG: |
michael@0 | 2755 | { |
michael@0 | 2756 | /* This should handle contractions */ |
michael@0 | 2757 | collIterateState state; |
michael@0 | 2758 | backupState(source, &state); |
michael@0 | 2759 | uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; |
michael@0 | 2760 | const UChar *UCharOffset; |
michael@0 | 2761 | UChar schar, tchar; |
michael@0 | 2762 | |
michael@0 | 2763 | for (;;) { |
michael@0 | 2764 | /* This loop will run once per source string character, for as long as we */ |
michael@0 | 2765 | /* are matching a potential contraction sequence */ |
michael@0 | 2766 | |
michael@0 | 2767 | /* First we position ourselves at the beginning of contraction sequence */ |
michael@0 | 2768 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
michael@0 | 2769 | |
michael@0 | 2770 | if (collIter_eos(source)) { |
michael@0 | 2771 | // Ran off the end of the source string. |
michael@0 | 2772 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); |
michael@0 | 2773 | // So we'll pick whatever we have at the point... |
michael@0 | 2774 | if (CE == UCOL_NOT_FOUND) { |
michael@0 | 2775 | // back up the source over all the chars we scanned going into this contraction. |
michael@0 | 2776 | CE = firstCE; |
michael@0 | 2777 | loadState(source, &state, TRUE); |
michael@0 | 2778 | if(source->origFlags & UCOL_USE_ITERATOR) { |
michael@0 | 2779 | source->flags = source->origFlags; |
michael@0 | 2780 | } |
michael@0 | 2781 | } |
michael@0 | 2782 | break; |
michael@0 | 2783 | } |
michael@0 | 2784 | |
michael@0 | 2785 | uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ |
michael@0 | 2786 | uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); |
michael@0 | 2787 | |
michael@0 | 2788 | schar = getNextNormalizedChar(source); |
michael@0 | 2789 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
michael@0 | 2790 | UCharOffset++; |
michael@0 | 2791 | } |
michael@0 | 2792 | |
michael@0 | 2793 | if (schar == tchar) { |
michael@0 | 2794 | // Found the source string char in the contraction table. |
michael@0 | 2795 | // Pick up the corresponding CE from the table. |
michael@0 | 2796 | CE = *(coll->contractionCEs + |
michael@0 | 2797 | (UCharOffset - coll->contractionIndex)); |
michael@0 | 2798 | } |
michael@0 | 2799 | else |
michael@0 | 2800 | { |
michael@0 | 2801 | // Source string char was not in contraction table. |
michael@0 | 2802 | // Unless we have a discontiguous contraction, we have finished |
michael@0 | 2803 | // with this contraction. |
michael@0 | 2804 | // in order to do the proper detection, we |
michael@0 | 2805 | // need to see if we're dealing with a supplementary |
michael@0 | 2806 | /* We test whether the next two char are surrogate pairs. |
michael@0 | 2807 | * This test is done if the iterator is not NULL. |
michael@0 | 2808 | * If there is no surrogate pair, the iterator |
michael@0 | 2809 | * goes back one if needed. */ |
michael@0 | 2810 | UChar32 miss = schar; |
michael@0 | 2811 | if (source->iterator) { |
michael@0 | 2812 | UChar32 surrNextChar; /* the next char in the iteration to test */ |
michael@0 | 2813 | int32_t prevPos; /* holds the previous position before move forward of the source iterator */ |
michael@0 | 2814 | if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { |
michael@0 | 2815 | prevPos = source->iterator->index; |
michael@0 | 2816 | surrNextChar = getNextNormalizedChar(source); |
michael@0 | 2817 | if (U16_IS_TRAIL(surrNextChar)) { |
michael@0 | 2818 | miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); |
michael@0 | 2819 | } else if (prevPos < source->iterator->index){ |
michael@0 | 2820 | goBackOne(source); |
michael@0 | 2821 | } |
michael@0 | 2822 | } |
michael@0 | 2823 | } else if (U16_IS_LEAD(schar)) { |
michael@0 | 2824 | miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); |
michael@0 | 2825 | } |
michael@0 | 2826 | |
michael@0 | 2827 | uint8_t sCC; |
michael@0 | 2828 | if (miss < 0x300 || |
michael@0 | 2829 | maxCC == 0 || |
michael@0 | 2830 | (sCC = i_getCombiningClass(miss, coll)) == 0 || |
michael@0 | 2831 | sCC>maxCC || |
michael@0 | 2832 | (allSame != 0 && sCC == maxCC) || |
michael@0 | 2833 | collIter_eos(source)) |
michael@0 | 2834 | { |
michael@0 | 2835 | // Contraction can not be discontiguous. |
michael@0 | 2836 | goBackOne(source); // back up the source string by one, |
michael@0 | 2837 | // because the character we just looked at was |
michael@0 | 2838 | // not part of the contraction. */ |
michael@0 | 2839 | if(U_IS_SUPPLEMENTARY(miss)) { |
michael@0 | 2840 | goBackOne(source); |
michael@0 | 2841 | } |
michael@0 | 2842 | CE = *(coll->contractionCEs + |
michael@0 | 2843 | (ContractionStart - coll->contractionIndex)); |
michael@0 | 2844 | } else { |
michael@0 | 2845 | // |
michael@0 | 2846 | // Contraction is possibly discontiguous. |
michael@0 | 2847 | // Scan more of source string looking for a match |
michael@0 | 2848 | // |
michael@0 | 2849 | UChar tempchar; |
michael@0 | 2850 | /* find the next character if schar is not a base character |
michael@0 | 2851 | and we are not yet at the end of the string */ |
michael@0 | 2852 | tempchar = getNextNormalizedChar(source); |
michael@0 | 2853 | // probably need another supplementary thingie here |
michael@0 | 2854 | goBackOne(source); |
michael@0 | 2855 | if (i_getCombiningClass(tempchar, coll) == 0) { |
michael@0 | 2856 | goBackOne(source); |
michael@0 | 2857 | if(U_IS_SUPPLEMENTARY(miss)) { |
michael@0 | 2858 | goBackOne(source); |
michael@0 | 2859 | } |
michael@0 | 2860 | /* Spit out the last char of the string, wasn't tasty enough */ |
michael@0 | 2861 | CE = *(coll->contractionCEs + |
michael@0 | 2862 | (ContractionStart - coll->contractionIndex)); |
michael@0 | 2863 | } else { |
michael@0 | 2864 | CE = getDiscontiguous(coll, source, ContractionStart); |
michael@0 | 2865 | } |
michael@0 | 2866 | } |
michael@0 | 2867 | } // else after if(schar == tchar) |
michael@0 | 2868 | |
michael@0 | 2869 | if(CE == UCOL_NOT_FOUND) { |
michael@0 | 2870 | /* The Source string did not match the contraction that we were checking. */ |
michael@0 | 2871 | /* Back up the source position to undo the effects of having partially */ |
michael@0 | 2872 | /* scanned through what ultimately proved to not be a contraction. */ |
michael@0 | 2873 | loadState(source, &state, TRUE); |
michael@0 | 2874 | CE = firstCE; |
michael@0 | 2875 | break; |
michael@0 | 2876 | } |
michael@0 | 2877 | |
michael@0 | 2878 | if(!isContraction(CE)) { |
michael@0 | 2879 | // The source string char was in the contraction table, and the corresponding |
michael@0 | 2880 | // CE is not a contraction CE. We completed the contraction, break |
michael@0 | 2881 | // out of loop, this CE will end up being returned. This is the normal |
michael@0 | 2882 | // way out of contraction handling when the source actually contained |
michael@0 | 2883 | // the contraction. |
michael@0 | 2884 | break; |
michael@0 | 2885 | } |
michael@0 | 2886 | |
michael@0 | 2887 | |
michael@0 | 2888 | // The source string char was in the contraction table, and the corresponding |
michael@0 | 2889 | // CE IS a contraction CE. We will continue looping to check the source |
michael@0 | 2890 | // string for the remaining chars in the contraction. |
michael@0 | 2891 | uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); |
michael@0 | 2892 | if(tempCE != UCOL_NOT_FOUND) { |
michael@0 | 2893 | // We have scanned a section of source string for which there is a |
michael@0 | 2894 | // CE from the contraction table. Remember the CE and scan position, so |
michael@0 | 2895 | // that we can return to this point if further scanning fails to |
michael@0 | 2896 | // match a longer contraction sequence. |
michael@0 | 2897 | firstCE = tempCE; |
michael@0 | 2898 | |
michael@0 | 2899 | goBackOne(source); |
michael@0 | 2900 | backupState(source, &state); |
michael@0 | 2901 | getNextNormalizedChar(source); |
michael@0 | 2902 | |
michael@0 | 2903 | // Another way to do this is: |
michael@0 | 2904 | //collIterateState tempState; |
michael@0 | 2905 | //backupState(source, &tempState); |
michael@0 | 2906 | //goBackOne(source); |
michael@0 | 2907 | //backupState(source, &state); |
michael@0 | 2908 | //loadState(source, &tempState, TRUE); |
michael@0 | 2909 | |
michael@0 | 2910 | // The problem is that for incomplete contractions we have to remember the previous |
michael@0 | 2911 | // position. Before, the only thing I needed to do was state.pos--; |
michael@0 | 2912 | // After iterator introduction and especially after introduction of normalizing |
michael@0 | 2913 | // iterators, it became much more difficult to decrease the saved state. |
michael@0 | 2914 | // I'm not yet sure which of the two methods above is faster. |
michael@0 | 2915 | } |
michael@0 | 2916 | } // for(;;) |
michael@0 | 2917 | break; |
michael@0 | 2918 | } // case CONTRACTION_TAG: |
michael@0 | 2919 | case LONG_PRIMARY_TAG: |
michael@0 | 2920 | { |
michael@0 | 2921 | *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; |
michael@0 | 2922 | CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; |
michael@0 | 2923 | source->offsetRepeatCount += 1; |
michael@0 | 2924 | return CE; |
michael@0 | 2925 | } |
michael@0 | 2926 | case EXPANSION_TAG: |
michael@0 | 2927 | { |
michael@0 | 2928 | /* This should handle expansion. */ |
michael@0 | 2929 | /* NOTE: we can encounter both continuations and expansions in an expansion! */ |
michael@0 | 2930 | /* I have to decide where continuations are going to be dealt with */ |
michael@0 | 2931 | uint32_t size; |
michael@0 | 2932 | uint32_t i; /* general counter */ |
michael@0 | 2933 | |
michael@0 | 2934 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ |
michael@0 | 2935 | size = getExpansionCount(CE); |
michael@0 | 2936 | CE = *CEOffset++; |
michael@0 | 2937 | //source->offsetRepeatCount = -1; |
michael@0 | 2938 | |
michael@0 | 2939 | if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ |
michael@0 | 2940 | for(i = 1; i<size; i++) { |
michael@0 | 2941 | *(source->CEpos++) = *CEOffset++; |
michael@0 | 2942 | source->offsetRepeatCount += 1; |
michael@0 | 2943 | } |
michael@0 | 2944 | } else { /* else, we do */ |
michael@0 | 2945 | while(*CEOffset != 0) { |
michael@0 | 2946 | *(source->CEpos++) = *CEOffset++; |
michael@0 | 2947 | source->offsetRepeatCount += 1; |
michael@0 | 2948 | } |
michael@0 | 2949 | } |
michael@0 | 2950 | |
michael@0 | 2951 | return CE; |
michael@0 | 2952 | } |
michael@0 | 2953 | case DIGIT_TAG: |
michael@0 | 2954 | { |
michael@0 | 2955 | /* |
michael@0 | 2956 | We do a check to see if we want to collate digits as numbers; if so we generate |
michael@0 | 2957 | a custom collation key. Otherwise we pull out the value stored in the expansion table. |
michael@0 | 2958 | */ |
michael@0 | 2959 | //uint32_t size; |
michael@0 | 2960 | uint32_t i; /* general counter */ |
michael@0 | 2961 | |
michael@0 | 2962 | if (source->coll->numericCollation == UCOL_ON){ |
michael@0 | 2963 | collIterateState digitState = {0,0,0,0,0,0,0,0,0}; |
michael@0 | 2964 | UChar32 char32 = 0; |
michael@0 | 2965 | int32_t digVal = 0; |
michael@0 | 2966 | |
michael@0 | 2967 | uint32_t digIndx = 0; |
michael@0 | 2968 | uint32_t endIndex = 0; |
michael@0 | 2969 | uint32_t trailingZeroIndex = 0; |
michael@0 | 2970 | |
michael@0 | 2971 | uint8_t collateVal = 0; |
michael@0 | 2972 | |
michael@0 | 2973 | UBool nonZeroValReached = FALSE; |
michael@0 | 2974 | |
michael@0 | 2975 | uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. |
michael@0 | 2976 | /* |
michael@0 | 2977 | We parse the source string until we hit a char that's NOT a digit. |
michael@0 | 2978 | Use this u_charDigitValue. This might be slow because we have to |
michael@0 | 2979 | handle surrogates... |
michael@0 | 2980 | */ |
michael@0 | 2981 | /* |
michael@0 | 2982 | if (U16_IS_LEAD(ch)){ |
michael@0 | 2983 | if (!collIter_eos(source)) { |
michael@0 | 2984 | backupState(source, &digitState); |
michael@0 | 2985 | UChar trail = getNextNormalizedChar(source); |
michael@0 | 2986 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 2987 | char32 = U16_GET_SUPPLEMENTARY(ch, trail); |
michael@0 | 2988 | } else { |
michael@0 | 2989 | loadState(source, &digitState, TRUE); |
michael@0 | 2990 | char32 = ch; |
michael@0 | 2991 | } |
michael@0 | 2992 | } else { |
michael@0 | 2993 | char32 = ch; |
michael@0 | 2994 | } |
michael@0 | 2995 | } else { |
michael@0 | 2996 | char32 = ch; |
michael@0 | 2997 | } |
michael@0 | 2998 | digVal = u_charDigitValue(char32); |
michael@0 | 2999 | */ |
michael@0 | 3000 | digVal = u_charDigitValue(cp); // if we have arrived here, we have |
michael@0 | 3001 | // already processed possible supplementaries that triggered the digit tag - |
michael@0 | 3002 | // all supplementaries are marked in the UCA. |
michael@0 | 3003 | /* |
michael@0 | 3004 | We pad a zero in front of the first element anyways. This takes |
michael@0 | 3005 | care of the (probably) most common case where people are sorting things followed |
michael@0 | 3006 | by a single digit |
michael@0 | 3007 | */ |
michael@0 | 3008 | digIndx++; |
michael@0 | 3009 | for(;;){ |
michael@0 | 3010 | // Make sure we have enough space. No longer needed; |
michael@0 | 3011 | // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER |
michael@0 | 3012 | // (it has been pre-incremented) so we just ensure that numTempBuf is big enough |
michael@0 | 3013 | // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). |
michael@0 | 3014 | |
michael@0 | 3015 | // Skipping over leading zeroes. |
michael@0 | 3016 | if (digVal != 0) { |
michael@0 | 3017 | nonZeroValReached = TRUE; |
michael@0 | 3018 | } |
michael@0 | 3019 | if (nonZeroValReached) { |
michael@0 | 3020 | /* |
michael@0 | 3021 | We parse the digit string into base 100 numbers (this fits into a byte). |
michael@0 | 3022 | We only add to the buffer in twos, thus if we are parsing an odd character, |
michael@0 | 3023 | that serves as the 'tens' digit while the if we are parsing an even one, that |
michael@0 | 3024 | is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into |
michael@0 | 3025 | a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid |
michael@0 | 3026 | overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less |
michael@0 | 3027 | than all the other bytes. |
michael@0 | 3028 | */ |
michael@0 | 3029 | |
michael@0 | 3030 | if (digIndx % 2 == 1){ |
michael@0 | 3031 | collateVal += (uint8_t)digVal; |
michael@0 | 3032 | |
michael@0 | 3033 | // We don't enter the low-order-digit case unless we've already seen |
michael@0 | 3034 | // the high order, or for the first digit, which is always non-zero. |
michael@0 | 3035 | if (collateVal != 0) |
michael@0 | 3036 | trailingZeroIndex = 0; |
michael@0 | 3037 | |
michael@0 | 3038 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
michael@0 | 3039 | collateVal = 0; |
michael@0 | 3040 | } |
michael@0 | 3041 | else{ |
michael@0 | 3042 | // We drop the collation value into the buffer so if we need to do |
michael@0 | 3043 | // a "front patch" we don't have to check to see if we're hitting the |
michael@0 | 3044 | // last element. |
michael@0 | 3045 | collateVal = (uint8_t)(digVal * 10); |
michael@0 | 3046 | |
michael@0 | 3047 | // Check for trailing zeroes. |
michael@0 | 3048 | if (collateVal == 0) |
michael@0 | 3049 | { |
michael@0 | 3050 | if (!trailingZeroIndex) |
michael@0 | 3051 | trailingZeroIndex = (digIndx/2) + 2; |
michael@0 | 3052 | } |
michael@0 | 3053 | else |
michael@0 | 3054 | trailingZeroIndex = 0; |
michael@0 | 3055 | |
michael@0 | 3056 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
michael@0 | 3057 | } |
michael@0 | 3058 | digIndx++; |
michael@0 | 3059 | } |
michael@0 | 3060 | |
michael@0 | 3061 | // Get next character. |
michael@0 | 3062 | if (!collIter_eos(source)){ |
michael@0 | 3063 | ch = getNextNormalizedChar(source); |
michael@0 | 3064 | if (U16_IS_LEAD(ch)){ |
michael@0 | 3065 | if (!collIter_eos(source)) { |
michael@0 | 3066 | backupState(source, &digitState); |
michael@0 | 3067 | UChar trail = getNextNormalizedChar(source); |
michael@0 | 3068 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 3069 | char32 = U16_GET_SUPPLEMENTARY(ch, trail); |
michael@0 | 3070 | } else { |
michael@0 | 3071 | loadState(source, &digitState, TRUE); |
michael@0 | 3072 | char32 = ch; |
michael@0 | 3073 | } |
michael@0 | 3074 | } |
michael@0 | 3075 | } else { |
michael@0 | 3076 | char32 = ch; |
michael@0 | 3077 | } |
michael@0 | 3078 | |
michael@0 | 3079 | if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ |
michael@0 | 3080 | // Resetting position to point to the next unprocessed char. We |
michael@0 | 3081 | // overshot it when doing our test/set for numbers. |
michael@0 | 3082 | if (char32 > 0xFFFF) { // For surrogates. |
michael@0 | 3083 | loadState(source, &digitState, TRUE); |
michael@0 | 3084 | //goBackOne(source); |
michael@0 | 3085 | } |
michael@0 | 3086 | goBackOne(source); |
michael@0 | 3087 | break; |
michael@0 | 3088 | } |
michael@0 | 3089 | } else { |
michael@0 | 3090 | break; |
michael@0 | 3091 | } |
michael@0 | 3092 | } |
michael@0 | 3093 | |
michael@0 | 3094 | if (nonZeroValReached == FALSE){ |
michael@0 | 3095 | digIndx = 2; |
michael@0 | 3096 | numTempBuf[2] = 6; |
michael@0 | 3097 | } |
michael@0 | 3098 | |
michael@0 | 3099 | endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; |
michael@0 | 3100 | if (digIndx % 2 != 0){ |
michael@0 | 3101 | /* |
michael@0 | 3102 | We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what |
michael@0 | 3103 | we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. |
michael@0 | 3104 | Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a |
michael@0 | 3105 | single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. |
michael@0 | 3106 | */ |
michael@0 | 3107 | |
michael@0 | 3108 | for(i = 2; i < endIndex; i++){ |
michael@0 | 3109 | numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + |
michael@0 | 3110 | (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; |
michael@0 | 3111 | } |
michael@0 | 3112 | --digIndx; |
michael@0 | 3113 | } |
michael@0 | 3114 | |
michael@0 | 3115 | // Subtract one off of the last byte. |
michael@0 | 3116 | numTempBuf[endIndex-1] -= 1; |
michael@0 | 3117 | |
michael@0 | 3118 | /* |
michael@0 | 3119 | We want to skip over the first two slots in the buffer. The first slot |
michael@0 | 3120 | is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the |
michael@0 | 3121 | sign/exponent byte: 0x80 + (decimalPos/2) & 7f. |
michael@0 | 3122 | */ |
michael@0 | 3123 | numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; |
michael@0 | 3124 | numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); |
michael@0 | 3125 | |
michael@0 | 3126 | // Now transfer the collation key to our collIterate struct. |
michael@0 | 3127 | // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. |
michael@0 | 3128 | //size = ((endIndex+1) & ~1)/2; |
michael@0 | 3129 | CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight |
michael@0 | 3130 | (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight |
michael@0 | 3131 | UCOL_BYTE_COMMON; // Tertiary weight. |
michael@0 | 3132 | i = 2; // Reset the index into the buffer. |
michael@0 | 3133 | while(i < endIndex) |
michael@0 | 3134 | { |
michael@0 | 3135 | uint32_t primWeight = numTempBuf[i++] << 8; |
michael@0 | 3136 | if ( i < endIndex) |
michael@0 | 3137 | primWeight |= numTempBuf[i++]; |
michael@0 | 3138 | *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; |
michael@0 | 3139 | } |
michael@0 | 3140 | |
michael@0 | 3141 | } else { |
michael@0 | 3142 | // no numeric mode, we'll just switch to whatever we stashed and continue |
michael@0 | 3143 | CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ |
michael@0 | 3144 | CE = *CEOffset++; |
michael@0 | 3145 | break; |
michael@0 | 3146 | } |
michael@0 | 3147 | return CE; |
michael@0 | 3148 | } |
michael@0 | 3149 | /* various implicits optimization */ |
michael@0 | 3150 | case IMPLICIT_TAG: /* everything that is not defined otherwise */ |
michael@0 | 3151 | /* UCA is filled with these. Tailorings are NOT_FOUND */ |
michael@0 | 3152 | return getImplicit(cp, source); |
michael@0 | 3153 | case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ |
michael@0 | 3154 | // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit |
michael@0 | 3155 | return getImplicit(cp, source); |
michael@0 | 3156 | case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ |
michael@0 | 3157 | { |
michael@0 | 3158 | static const uint32_t |
michael@0 | 3159 | SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; |
michael@0 | 3160 | //const uint32_t LCount = 19; |
michael@0 | 3161 | static const uint32_t VCount = 21; |
michael@0 | 3162 | static const uint32_t TCount = 28; |
michael@0 | 3163 | //const uint32_t NCount = VCount * TCount; // 588 |
michael@0 | 3164 | //const uint32_t SCount = LCount * NCount; // 11172 |
michael@0 | 3165 | uint32_t L = ch - SBase; |
michael@0 | 3166 | |
michael@0 | 3167 | // divide into pieces |
michael@0 | 3168 | |
michael@0 | 3169 | uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation |
michael@0 | 3170 | L /= TCount; |
michael@0 | 3171 | uint32_t V = L % VCount; |
michael@0 | 3172 | L /= VCount; |
michael@0 | 3173 | |
michael@0 | 3174 | // offset them |
michael@0 | 3175 | |
michael@0 | 3176 | L += LBase; |
michael@0 | 3177 | V += VBase; |
michael@0 | 3178 | T += TBase; |
michael@0 | 3179 | |
michael@0 | 3180 | // return the first CE, but first put the rest into the expansion buffer |
michael@0 | 3181 | if (!source->coll->image->jamoSpecial) { // FAST PATH |
michael@0 | 3182 | |
michael@0 | 3183 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); |
michael@0 | 3184 | if (T != TBase) { |
michael@0 | 3185 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); |
michael@0 | 3186 | } |
michael@0 | 3187 | |
michael@0 | 3188 | return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); |
michael@0 | 3189 | |
michael@0 | 3190 | } else { // Jamo is Special |
michael@0 | 3191 | // Since Hanguls pass the FCD check, it is |
michael@0 | 3192 | // guaranteed that we won't be in |
michael@0 | 3193 | // the normalization buffer if something like this happens |
michael@0 | 3194 | |
michael@0 | 3195 | // However, if we are using a uchar iterator and normalization |
michael@0 | 3196 | // is ON, the Hangul that led us here is going to be in that |
michael@0 | 3197 | // normalization buffer. Here we want to restore the uchar |
michael@0 | 3198 | // iterator state and pull out of the normalization buffer |
michael@0 | 3199 | if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 3200 | source->flags = source->origFlags; // restore the iterator |
michael@0 | 3201 | source->pos = NULL; |
michael@0 | 3202 | } |
michael@0 | 3203 | |
michael@0 | 3204 | // Move Jamos into normalization buffer |
michael@0 | 3205 | UChar *buffer = source->writableBuffer.getBuffer(4); |
michael@0 | 3206 | int32_t bufferLength; |
michael@0 | 3207 | buffer[0] = (UChar)L; |
michael@0 | 3208 | buffer[1] = (UChar)V; |
michael@0 | 3209 | if (T != TBase) { |
michael@0 | 3210 | buffer[2] = (UChar)T; |
michael@0 | 3211 | bufferLength = 3; |
michael@0 | 3212 | } else { |
michael@0 | 3213 | bufferLength = 2; |
michael@0 | 3214 | } |
michael@0 | 3215 | source->writableBuffer.releaseBuffer(bufferLength); |
michael@0 | 3216 | |
michael@0 | 3217 | // Indicate where to continue in main input string after exhausting the writableBuffer |
michael@0 | 3218 | source->fcdPosition = source->pos; |
michael@0 | 3219 | |
michael@0 | 3220 | source->pos = source->writableBuffer.getTerminatedBuffer(); |
michael@0 | 3221 | source->origFlags = source->flags; |
michael@0 | 3222 | source->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 3223 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
michael@0 | 3224 | |
michael@0 | 3225 | return(UCOL_IGNORABLE); |
michael@0 | 3226 | } |
michael@0 | 3227 | } |
michael@0 | 3228 | case SURROGATE_TAG: |
michael@0 | 3229 | /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ |
michael@0 | 3230 | /* two things can happen here: next code point can be a trailing surrogate - we will use it */ |
michael@0 | 3231 | /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ |
michael@0 | 3232 | /* we treat it like an unassigned code point. */ |
michael@0 | 3233 | { |
michael@0 | 3234 | UChar trail; |
michael@0 | 3235 | collIterateState state; |
michael@0 | 3236 | backupState(source, &state); |
michael@0 | 3237 | if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { |
michael@0 | 3238 | // we could have stepped one char forward and it might have turned out that it |
michael@0 | 3239 | // was not a trail surrogate. In that case, we have to backup. |
michael@0 | 3240 | loadState(source, &state, TRUE); |
michael@0 | 3241 | return UCOL_NOT_FOUND; |
michael@0 | 3242 | } else { |
michael@0 | 3243 | /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ |
michael@0 | 3244 | CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); |
michael@0 | 3245 | if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. |
michael@0 | 3246 | // We need to backup |
michael@0 | 3247 | loadState(source, &state, TRUE); |
michael@0 | 3248 | return CE; |
michael@0 | 3249 | } |
michael@0 | 3250 | // calculate the supplementary code point value, if surrogate was not tailored |
michael@0 | 3251 | cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); |
michael@0 | 3252 | } |
michael@0 | 3253 | } |
michael@0 | 3254 | break; |
michael@0 | 3255 | case LEAD_SURROGATE_TAG: /* D800-DBFF*/ |
michael@0 | 3256 | UChar nextChar; |
michael@0 | 3257 | if( source->flags & UCOL_USE_ITERATOR) { |
michael@0 | 3258 | if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { |
michael@0 | 3259 | cp = U16_GET_SUPPLEMENTARY(ch, nextChar); |
michael@0 | 3260 | source->iterator->next(source->iterator); |
michael@0 | 3261 | return getImplicit(cp, source); |
michael@0 | 3262 | } |
michael@0 | 3263 | } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && |
michael@0 | 3264 | U_IS_TRAIL((nextChar=*source->pos))) { |
michael@0 | 3265 | cp = U16_GET_SUPPLEMENTARY(ch, nextChar); |
michael@0 | 3266 | source->pos++; |
michael@0 | 3267 | return getImplicit(cp, source); |
michael@0 | 3268 | } |
michael@0 | 3269 | return UCOL_NOT_FOUND; |
michael@0 | 3270 | case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ |
michael@0 | 3271 | return UCOL_NOT_FOUND; /* broken surrogate sequence */ |
michael@0 | 3272 | case CHARSET_TAG: |
michael@0 | 3273 | /* not yet implemented */ |
michael@0 | 3274 | /* probably after 1.8 */ |
michael@0 | 3275 | return UCOL_NOT_FOUND; |
michael@0 | 3276 | default: |
michael@0 | 3277 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 3278 | CE=0; |
michael@0 | 3279 | break; |
michael@0 | 3280 | } |
michael@0 | 3281 | if (CE <= UCOL_NOT_FOUND) break; |
michael@0 | 3282 | } |
michael@0 | 3283 | return CE; |
michael@0 | 3284 | } |
michael@0 | 3285 | |
michael@0 | 3286 | |
michael@0 | 3287 | /* Implicit-CE helper for backward iteration (now uses Mark's getImplicitPrimary code): takes the value r from uprv_uca_getImplicitPrimary(cp), stores (r & UCOL_PRIMARYMASK) | 0x0505 in the CE buffer, updates the offset bookkeeping, and returns ((r & 0xFFFF) << 16) | 0xC0. NOTE(review): 0x0505 presumably packs the common secondary/tertiary bytes and 0xC0 presumably is the continuation marker - confirm against UCOL_BYTE_COMMON / UCOL_CONTINUATION_MARKER. */ |
michael@0 | 3288 | static |
michael@0 | 3289 | inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { |
michael@0 | 3290 | uint32_t r = uprv_uca_getImplicitPrimary(cp); |
michael@0 | 3291 | /* Store the first CE of the pair, then point toReturn at the slot just past it (CEpos has already been advanced). */ |
michael@0 | 3292 | *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; |
michael@0 | 3293 | collationSource->toReturn = collationSource->CEpos; |
michael@0 | 3294 | /* With UCOL_ITER_INNORMBUF set, only offsetRepeatCount is set to 1; otherwise two offsets (firstOffset and firstOffset + 1) are appended. */ |
michael@0 | 3295 | // **** the offset bookkeeping below doesn't work if using iterator (it is computed from pos - string) **** |
michael@0 | 3296 | if (collationSource->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 3297 | collationSource->offsetRepeatCount = 1; |
michael@0 | 3298 | } else { |
michael@0 | 3299 | int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); |
michael@0 | 3300 | /* The status passed to appendOffset is local and never inspected afterwards; this function has no status parameter to report it through. */ |
michael@0 | 3301 | UErrorCode errorCode = U_ZERO_ERROR; |
michael@0 | 3302 | collationSource->appendOffset(firstOffset, errorCode); |
michael@0 | 3303 | collationSource->appendOffset(firstOffset + 1, errorCode); |
michael@0 | 3304 | /* offsetReturn is set to offsetStore - 1 (presumably the last appended entry - confirm in appendOffset), and slot 0 of offsetBuffer is overwritten with firstOffset. */ |
michael@0 | 3305 | collationSource->offsetReturn = collationSource->offsetStore - 1; |
michael@0 | 3306 | *(collationSource->offsetBuffer) = firstOffset; |
michael@0 | 3307 | if (collationSource->offsetReturn == collationSource->offsetBuffer) { |
michael@0 | 3308 | collationSource->offsetStore = collationSource->offsetBuffer; |
michael@0 | 3309 | } |
michael@0 | 3310 | } |
michael@0 | 3311 | /* Return the second CE of the pair, built from the low 16 bits of r. */ |
michael@0 | 3312 | return ((r & 0x0000FFFF)<<16) | 0x000000C0; |
michael@0 | 3313 | } |
michael@0 | 3314 | |
michael@0 | 3315 | /** |
michael@0 | 3316 | * This function handles the special CEs like contractions, expansions, |
michael@0 | 3317 | * surrogates, Thai. |
michael@0 | 3318 | * It is called by getPrevCE. |
michael@0 | 3319 | */ |
michael@0 | 3320 | uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, |
michael@0 | 3321 | collIterate *source, |
michael@0 | 3322 | UErrorCode *status) |
michael@0 | 3323 | { |
michael@0 | 3324 | const uint32_t *CEOffset = NULL; |
michael@0 | 3325 | UChar *UCharOffset = NULL; |
michael@0 | 3326 | UChar schar; |
michael@0 | 3327 | const UChar *constart = NULL; |
michael@0 | 3328 | uint32_t size; |
michael@0 | 3329 | UChar buffer[UCOL_MAX_BUFFER]; |
michael@0 | 3330 | uint32_t *endCEBuffer; |
michael@0 | 3331 | UChar *strbuffer; |
michael@0 | 3332 | int32_t noChars = 0; |
michael@0 | 3333 | int32_t CECount = 0; |
michael@0 | 3334 | |
michael@0 | 3335 | for(;;) |
michael@0 | 3336 | { |
michael@0 | 3337 | /* the only CEs that loop are Thai and contractions */ |
michael@0 | 3338 | switch (getCETag(CE)) |
michael@0 | 3339 | { |
michael@0 | 3340 | case NOT_FOUND_TAG: /* this tag always returns */ |
michael@0 | 3341 | return CE; |
michael@0 | 3342 | |
michael@0 | 3343 | case SPEC_PROC_TAG: |
michael@0 | 3344 | { |
michael@0 | 3345 | // Special processing is getting a CE that is preceded by a certain prefix |
michael@0 | 3346 | // Currently this is only needed for optimizing Japanese length and iteration marks. |
michael@0 | 3347 | // When we encounter a special processing tag, we go backwards and try to see if |
michael@0 | 3348 | // we have a match. |
michael@0 | 3349 | // Contraction tables are used - so the whole process is not unlike contraction. |
michael@0 | 3350 | // prefix data is stored backwards in the table. |
michael@0 | 3351 | const UChar *UCharOffset; |
michael@0 | 3352 | UChar schar, tchar; |
michael@0 | 3353 | collIterateState prefixState; |
michael@0 | 3354 | backupState(source, &prefixState); |
michael@0 | 3355 | for(;;) { |
michael@0 | 3356 | // This loop will run once per source string character, for as long as we |
michael@0 | 3357 | // are matching a potential contraction sequence |
michael@0 | 3358 | |
michael@0 | 3359 | // First we position ourselves at the beginning of the contraction sequence |
michael@0 | 3360 | const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
michael@0 | 3361 | |
michael@0 | 3362 | if (collIter_bos(source)) { |
michael@0 | 3363 | CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); |
michael@0 | 3364 | break; |
michael@0 | 3365 | } |
michael@0 | 3366 | schar = getPrevNormalizedChar(source, status); |
michael@0 | 3367 | goBackOne(source); |
michael@0 | 3368 | |
michael@0 | 3369 | while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
michael@0 | 3370 | UCharOffset++; |
michael@0 | 3371 | } |
michael@0 | 3372 | |
michael@0 | 3373 | if (schar == tchar) { |
michael@0 | 3374 | // Found the source string char in the table. |
michael@0 | 3375 | // Pick up the corresponding CE from the table. |
michael@0 | 3376 | CE = *(coll->contractionCEs + |
michael@0 | 3377 | (UCharOffset - coll->contractionIndex)); |
michael@0 | 3378 | } |
michael@0 | 3379 | else |
michael@0 | 3380 | { |
michael@0 | 3381 | // if there is a completely ignorable code point in the middle of |
michael@0 | 3382 | // a prefix, we need to act as if it's not there |
michael@0 | 3383 | // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) |
michael@0 | 3384 | // lone surrogates cannot be set to zero as it would break other processing |
michael@0 | 3385 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); |
michael@0 | 3386 | // it's easy for BMP code points |
michael@0 | 3387 | if(isZeroCE == 0) { |
michael@0 | 3388 | continue; |
michael@0 | 3389 | } else if(U16_IS_SURROGATE(schar)) { |
michael@0 | 3390 | // for supplementary code points, we have to check the next one |
michael@0 | 3391 | // situations where we are going to ignore |
michael@0 | 3392 | // 1. beginning of the string: schar is a lone surrogate |
michael@0 | 3393 | // 2. schar is a lone surrogate |
michael@0 | 3394 | // 3. schar is a trail surrogate in a valid surrogate sequence |
michael@0 | 3395 | // that is explicitly set to zero. |
michael@0 | 3396 | if (!collIter_bos(source)) { |
michael@0 | 3397 | UChar lead; |
michael@0 | 3398 | if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { |
michael@0 | 3399 | isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); |
michael@0 | 3400 | if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { |
michael@0 | 3401 | uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); |
michael@0 | 3402 | if(finalCE == 0) { |
michael@0 | 3403 | // this is a real, assigned completely ignorable code point |
michael@0 | 3404 | goBackOne(source); |
michael@0 | 3405 | continue; |
michael@0 | 3406 | } |
michael@0 | 3407 | } |
michael@0 | 3408 | } else { |
michael@0 | 3409 | // lone surrogate, treat like unassigned |
michael@0 | 3410 | return UCOL_NOT_FOUND; |
michael@0 | 3411 | } |
michael@0 | 3412 | } else { |
michael@0 | 3413 | // lone surrogate at the beginning, treat like unassigned |
michael@0 | 3414 | return UCOL_NOT_FOUND; |
michael@0 | 3415 | } |
michael@0 | 3416 | } |
michael@0 | 3417 | // Source string char was not in the table. |
michael@0 | 3418 | // We have not found the prefix. |
michael@0 | 3419 | CE = *(coll->contractionCEs + |
michael@0 | 3420 | (ContractionStart - coll->contractionIndex)); |
michael@0 | 3421 | } |
michael@0 | 3422 | |
michael@0 | 3423 | if(!isPrefix(CE)) { |
michael@0 | 3424 | // The source string char was in the contraction table, and the corresponding |
michael@0 | 3425 | // CE is not a prefix CE. We found the prefix, break |
michael@0 | 3426 | // out of loop, this CE will end up being returned. This is the normal |
michael@0 | 3427 | // way out of prefix handling when the source actually contained |
michael@0 | 3428 | // the prefix. |
michael@0 | 3429 | break; |
michael@0 | 3430 | } |
michael@0 | 3431 | } |
michael@0 | 3432 | loadState(source, &prefixState, TRUE); |
michael@0 | 3433 | break; |
michael@0 | 3434 | } |
michael@0 | 3435 | |
michael@0 | 3436 | case CONTRACTION_TAG: { |
michael@0 | 3437 | /* to ensure that the backwards and forwards iteration matches, we |
michael@0 | 3438 | take the current region of most possible match and pass it through |
michael@0 | 3439 | the forward iteration. this will ensure that the obstinate problem of |
michael@0 | 3440 | overlapping contractions will not occur. |
michael@0 | 3441 | */ |
michael@0 | 3442 | schar = peekCodeUnit(source, 0); |
michael@0 | 3443 | constart = (UChar *)coll->image + getContractOffset(CE); |
michael@0 | 3444 | if (isAtStartPrevIterate(source) |
michael@0 | 3445 | /* commented away contraction end checks after adding the checks |
michael@0 | 3446 | in getPrevCE */) { |
michael@0 | 3447 | /* start of string or this is not the end of any contraction */ |
michael@0 | 3448 | CE = *(coll->contractionCEs + |
michael@0 | 3449 | (constart - coll->contractionIndex)); |
michael@0 | 3450 | break; |
michael@0 | 3451 | } |
michael@0 | 3452 | strbuffer = buffer; |
michael@0 | 3453 | UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); |
michael@0 | 3454 | *(UCharOffset --) = 0; |
michael@0 | 3455 | noChars = 0; |
michael@0 | 3456 | // have to swap thai characters |
michael@0 | 3457 | while (ucol_unsafeCP(schar, coll)) { |
michael@0 | 3458 | *(UCharOffset) = schar; |
michael@0 | 3459 | noChars++; |
michael@0 | 3460 | UCharOffset --; |
michael@0 | 3461 | schar = getPrevNormalizedChar(source, status); |
michael@0 | 3462 | goBackOne(source); |
michael@0 | 3463 | // TODO: when we exhaust the contraction buffer, |
michael@0 | 3464 | // it needs to get reallocated. The problem is |
michael@0 | 3465 | // that the size depends on the string which is |
michael@0 | 3466 | // not iterated over. However, since we're travelling |
michael@0 | 3467 | // backwards, we already had to set the iterator at |
michael@0 | 3468 | // the end - so we might as well know where we are? |
michael@0 | 3469 | if (UCharOffset + 1 == buffer) { |
michael@0 | 3470 | /* we have exhausted the buffer */ |
michael@0 | 3471 | int32_t newsize = 0; |
michael@0 | 3472 | if(source->pos) { // actually dealing with a position |
michael@0 | 3473 | newsize = (int32_t)(source->pos - source->string + 1); |
michael@0 | 3474 | } else { // iterator |
michael@0 | 3475 | newsize = 4 * UCOL_MAX_BUFFER; |
michael@0 | 3476 | } |
michael@0 | 3477 | strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * |
michael@0 | 3478 | (newsize + UCOL_MAX_BUFFER)); |
michael@0 | 3479 | /* test for NULL */ |
michael@0 | 3480 | if (strbuffer == NULL) { |
michael@0 | 3481 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 3482 | return UCOL_NO_MORE_CES; |
michael@0 | 3483 | } |
michael@0 | 3484 | UCharOffset = strbuffer + newsize; |
michael@0 | 3485 | uprv_memcpy(UCharOffset, buffer, |
michael@0 | 3486 | UCOL_MAX_BUFFER * sizeof(UChar)); |
michael@0 | 3487 | UCharOffset --; |
michael@0 | 3488 | } |
michael@0 | 3489 | if ((source->pos && (source->pos == source->string || |
michael@0 | 3490 | ((source->flags & UCOL_ITER_INNORMBUF) && |
michael@0 | 3491 | *(source->pos - 1) == 0 && source->fcdPosition == NULL))) |
michael@0 | 3492 | || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { |
michael@0 | 3493 | break; |
michael@0 | 3494 | } |
michael@0 | 3495 | } |
michael@0 | 3496 | /* adds the initial base character to the string */ |
michael@0 | 3497 | *(UCharOffset) = schar; |
michael@0 | 3498 | noChars++; |
michael@0 | 3499 | |
michael@0 | 3500 | int32_t offsetBias; |
michael@0 | 3501 | |
michael@0 | 3502 | // **** doesn't work if using iterator **** |
michael@0 | 3503 | if (source->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 3504 | offsetBias = -1; |
michael@0 | 3505 | } else { |
michael@0 | 3506 | offsetBias = (int32_t)(source->pos - source->string); |
michael@0 | 3507 | } |
michael@0 | 3508 | |
michael@0 | 3509 | /* a new collIterate is used to simplify things, since using the current |
michael@0 | 3510 | collIterate will mean that the forward and backwards iteration will |
michael@0 | 3511 | share and change the same buffers. we don't want to get into that. */ |
michael@0 | 3512 | collIterate temp; |
michael@0 | 3513 | int32_t rawOffset; |
michael@0 | 3514 | |
michael@0 | 3515 | IInit_collIterate(coll, UCharOffset, noChars, &temp, status); |
michael@0 | 3516 | if(U_FAILURE(*status)) { |
michael@0 | 3517 | return (uint32_t)UCOL_NULLORDER; |
michael@0 | 3518 | } |
michael@0 | 3519 | temp.flags &= ~UCOL_ITER_NORM; |
michael@0 | 3520 | temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; |
michael@0 | 3521 | |
michael@0 | 3522 | rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? |
michael@0 | 3523 | CE = ucol_IGetNextCE(coll, &temp, status); |
michael@0 | 3524 | |
michael@0 | 3525 | if (source->extendCEs) { |
michael@0 | 3526 | endCEBuffer = source->extendCEs + source->extendCEsSize; |
michael@0 | 3527 | CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); |
michael@0 | 3528 | } else { |
michael@0 | 3529 | endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; |
michael@0 | 3530 | CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); |
michael@0 | 3531 | } |
michael@0 | 3532 | |
michael@0 | 3533 | while (CE != UCOL_NO_MORE_CES) { |
michael@0 | 3534 | *(source->CEpos ++) = CE; |
michael@0 | 3535 | |
michael@0 | 3536 | if (offsetBias >= 0) { |
michael@0 | 3537 | source->appendOffset(rawOffset + offsetBias, *status); |
michael@0 | 3538 | } |
michael@0 | 3539 | |
michael@0 | 3540 | CECount++; |
michael@0 | 3541 | if (source->CEpos == endCEBuffer) { |
michael@0 | 3542 | /* ran out of CE space, reallocate to new buffer. |
michael@0 | 3543 | If reallocation fails, reset pointers and bail out, |
michael@0 | 3544 | there's no guarantee of the right character position after |
michael@0 | 3545 | this bail*/ |
michael@0 | 3546 | if (!increaseCEsCapacity(source)) { |
michael@0 | 3547 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 3548 | break; |
michael@0 | 3549 | } |
michael@0 | 3550 | |
michael@0 | 3551 | endCEBuffer = source->extendCEs + source->extendCEsSize; |
michael@0 | 3552 | } |
michael@0 | 3553 | |
michael@0 | 3554 | if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { |
michael@0 | 3555 | rawOffset = (int32_t)(temp.fcdPosition - temp.string); |
michael@0 | 3556 | } else { |
michael@0 | 3557 | rawOffset = (int32_t)(temp.pos - temp.string); |
michael@0 | 3558 | } |
michael@0 | 3559 | |
michael@0 | 3560 | CE = ucol_IGetNextCE(coll, &temp, status); |
michael@0 | 3561 | } |
michael@0 | 3562 | |
michael@0 | 3563 | if (strbuffer != buffer) { |
michael@0 | 3564 | uprv_free(strbuffer); |
michael@0 | 3565 | } |
michael@0 | 3566 | if (U_FAILURE(*status)) { |
michael@0 | 3567 | return (uint32_t)UCOL_NULLORDER; |
michael@0 | 3568 | } |
michael@0 | 3569 | |
michael@0 | 3570 | if (source->offsetRepeatValue != 0) { |
michael@0 | 3571 | if (CECount > noChars) { |
michael@0 | 3572 | source->offsetRepeatCount += temp.offsetRepeatCount; |
michael@0 | 3573 | } else { |
michael@0 | 3574 | // **** does this really skip the right offsets? **** |
michael@0 | 3575 | source->offsetReturn -= (noChars - CECount); |
michael@0 | 3576 | } |
michael@0 | 3577 | } |
michael@0 | 3578 | |
michael@0 | 3579 | if (offsetBias >= 0) { |
michael@0 | 3580 | source->offsetReturn = source->offsetStore - 1; |
michael@0 | 3581 | if (source->offsetReturn == source->offsetBuffer) { |
michael@0 | 3582 | source->offsetStore = source->offsetBuffer; |
michael@0 | 3583 | } |
michael@0 | 3584 | } |
michael@0 | 3585 | |
michael@0 | 3586 | source->toReturn = source->CEpos - 1; |
michael@0 | 3587 | if (source->toReturn == source->CEs) { |
michael@0 | 3588 | source->CEpos = source->CEs; |
michael@0 | 3589 | } |
michael@0 | 3590 | |
michael@0 | 3591 | return *(source->toReturn); |
michael@0 | 3592 | } |
michael@0 | 3593 | case LONG_PRIMARY_TAG: |
michael@0 | 3594 | { |
michael@0 | 3595 | *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; |
michael@0 | 3596 | *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; |
michael@0 | 3597 | source->toReturn = source->CEpos - 1; |
michael@0 | 3598 | |
michael@0 | 3599 | if (source->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 3600 | source->offsetRepeatCount = 1; |
michael@0 | 3601 | } else { |
michael@0 | 3602 | int32_t firstOffset = (int32_t)(source->pos - source->string); |
michael@0 | 3603 | |
michael@0 | 3604 | source->appendOffset(firstOffset, *status); |
michael@0 | 3605 | source->appendOffset(firstOffset + 1, *status); |
michael@0 | 3606 | |
michael@0 | 3607 | source->offsetReturn = source->offsetStore - 1; |
michael@0 | 3608 | *(source->offsetBuffer) = firstOffset; |
michael@0 | 3609 | if (source->offsetReturn == source->offsetBuffer) { |
michael@0 | 3610 | source->offsetStore = source->offsetBuffer; |
michael@0 | 3611 | } |
michael@0 | 3612 | } |
michael@0 | 3613 | |
michael@0 | 3614 | |
michael@0 | 3615 | return *(source->toReturn); |
michael@0 | 3616 | } |
michael@0 | 3617 | |
michael@0 | 3618 | case EXPANSION_TAG: /* this tag always returns */ |
michael@0 | 3619 | { |
michael@0 | 3620 | /* |
michael@0 | 3621 | This should handle expansion. |
michael@0 | 3622 | NOTE: we can encounter both continuations and expansions in an expansion! |
michael@0 | 3623 | I have to decide where continuations are going to be dealt with |
michael@0 | 3624 | */ |
michael@0 | 3625 | int32_t firstOffset = (int32_t)(source->pos - source->string); |
michael@0 | 3626 | |
michael@0 | 3627 | // **** doesn't work if using iterator **** |
michael@0 | 3628 | if (source->offsetReturn != NULL) { |
michael@0 | 3629 | if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { |
michael@0 | 3630 | source->offsetStore = source->offsetBuffer; |
michael@0 | 3631 | }else { |
michael@0 | 3632 | firstOffset = -1; |
michael@0 | 3633 | } |
michael@0 | 3634 | } |
michael@0 | 3635 | |
michael@0 | 3636 | /* find the offset to expansion table */ |
michael@0 | 3637 | CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); |
michael@0 | 3638 | size = getExpansionCount(CE); |
michael@0 | 3639 | if (size != 0) { |
michael@0 | 3640 | /* |
michael@0 | 3641 | if there are less than 16 elements in expansion, we don't terminate |
michael@0 | 3642 | */ |
michael@0 | 3643 | uint32_t count; |
michael@0 | 3644 | |
michael@0 | 3645 | for (count = 0; count < size; count++) { |
michael@0 | 3646 | *(source->CEpos ++) = *CEOffset++; |
michael@0 | 3647 | |
michael@0 | 3648 | if (firstOffset >= 0) { |
michael@0 | 3649 | source->appendOffset(firstOffset + 1, *status); |
michael@0 | 3650 | } |
michael@0 | 3651 | } |
michael@0 | 3652 | } else { |
michael@0 | 3653 | /* else, we do */ |
michael@0 | 3654 | while (*CEOffset != 0) { |
michael@0 | 3655 | *(source->CEpos ++) = *CEOffset ++; |
michael@0 | 3656 | |
michael@0 | 3657 | if (firstOffset >= 0) { |
michael@0 | 3658 | source->appendOffset(firstOffset + 1, *status); |
michael@0 | 3659 | } |
michael@0 | 3660 | } |
michael@0 | 3661 | } |
michael@0 | 3662 | |
michael@0 | 3663 | if (firstOffset >= 0) { |
michael@0 | 3664 | source->offsetReturn = source->offsetStore - 1; |
michael@0 | 3665 | *(source->offsetBuffer) = firstOffset; |
michael@0 | 3666 | if (source->offsetReturn == source->offsetBuffer) { |
michael@0 | 3667 | source->offsetStore = source->offsetBuffer; |
michael@0 | 3668 | } |
michael@0 | 3669 | } else { |
michael@0 | 3670 | source->offsetRepeatCount += size - 1; |
michael@0 | 3671 | } |
michael@0 | 3672 | |
michael@0 | 3673 | source->toReturn = source->CEpos - 1; |
michael@0 | 3674 | // in case of one element expansion, we |
michael@0 | 3675 | // want to immediately return CEpos |
michael@0 | 3676 | if(source->toReturn == source->CEs) { |
michael@0 | 3677 | source->CEpos = source->CEs; |
michael@0 | 3678 | } |
michael@0 | 3679 | |
michael@0 | 3680 | return *(source->toReturn); |
michael@0 | 3681 | } |
michael@0 | 3682 | |
michael@0 | 3683 | case DIGIT_TAG: |
michael@0 | 3684 | { |
michael@0 | 3685 | /* |
michael@0 | 3686 | We do a check to see if we want to collate digits as numbers; if so we generate |
michael@0 | 3687 | a custom collation key. Otherwise we pull out the value stored in the expansion table. |
michael@0 | 3688 | */ |
michael@0 | 3689 | uint32_t i; /* general counter */ |
michael@0 | 3690 | |
michael@0 | 3691 | if (source->coll->numericCollation == UCOL_ON){ |
michael@0 | 3692 | uint32_t digIndx = 0; |
michael@0 | 3693 | uint32_t endIndex = 0; |
michael@0 | 3694 | uint32_t leadingZeroIndex = 0; |
michael@0 | 3695 | uint32_t trailingZeroCount = 0; |
michael@0 | 3696 | |
michael@0 | 3697 | uint8_t collateVal = 0; |
michael@0 | 3698 | |
michael@0 | 3699 | UBool nonZeroValReached = FALSE; |
michael@0 | 3700 | |
michael@0 | 3701 | uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. |
michael@0 | 3702 | /* |
michael@0 | 3703 | We parse the source string until we hit a char that's NOT a digit. |
michael@0 | 3704 | Use this u_charDigitValue. This might be slow because we have to |
michael@0 | 3705 | handle surrogates... |
michael@0 | 3706 | */ |
michael@0 | 3707 | /* |
michael@0 | 3708 | We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, |
michael@0 | 3709 | with any chunks smaller than that being on the right end of the digit string - i.e. the first collation |
michael@0 | 3710 | element we process when going backward. To determine how long that chunk might be, we may need to make |
michael@0 | 3711 | two passes through the loop that collects digits - one to see how long the string is (and how much is |
michael@0 | 3712 | leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has |
michael@0 | 3713 | more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation |
michael@0 | 3714 | element chunk after resetting the state to the initialState at the right side of the digit string. |
michael@0 | 3715 | */ |
michael@0 | 3716 | uint32_t ceLimit = 0; |
michael@0 | 3717 | UChar initial_ch = ch; |
michael@0 | 3718 | collIterateState initialState = {0,0,0,0,0,0,0,0,0}; |
michael@0 | 3719 | backupState(source, &initialState); |
michael@0 | 3720 | |
michael@0 | 3721 | for(;;) { |
michael@0 | 3722 | collIterateState state = {0,0,0,0,0,0,0,0,0}; |
michael@0 | 3723 | UChar32 char32 = 0; |
michael@0 | 3724 | int32_t digVal = 0; |
michael@0 | 3725 | |
michael@0 | 3726 | if (U16_IS_TRAIL (ch)) { |
michael@0 | 3727 | if (!collIter_bos(source)){ |
michael@0 | 3728 | UChar lead = getPrevNormalizedChar(source, status); |
michael@0 | 3729 | if(U16_IS_LEAD(lead)) { |
michael@0 | 3730 | char32 = U16_GET_SUPPLEMENTARY(lead,ch); |
michael@0 | 3731 | goBackOne(source); |
michael@0 | 3732 | } else { |
michael@0 | 3733 | char32 = ch; |
michael@0 | 3734 | } |
michael@0 | 3735 | } else { |
michael@0 | 3736 | char32 = ch; |
michael@0 | 3737 | } |
michael@0 | 3738 | } else { |
michael@0 | 3739 | char32 = ch; |
michael@0 | 3740 | } |
michael@0 | 3741 | digVal = u_charDigitValue(char32); |
michael@0 | 3742 | |
michael@0 | 3743 | for(;;) { |
michael@0 | 3744 | // Make sure we have enough space. No longer needed; |
michael@0 | 3745 | // at this point the largest value of digIndx when we need to save data in numTempBuf |
michael@0 | 3746 | // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure |
michael@0 | 3747 | // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). |
michael@0 | 3748 | |
michael@0 | 3749 | // Skip over trailing zeroes, and keep a count of them. |
michael@0 | 3750 | if (digVal != 0) |
michael@0 | 3751 | nonZeroValReached = TRUE; |
michael@0 | 3752 | |
michael@0 | 3753 | if (nonZeroValReached) { |
michael@0 | 3754 | /* |
michael@0 | 3755 | We parse the digit string into base 100 numbers (this fits into a byte). |
michael@0 | 3756 | We only add to the buffer in twos, thus if we are parsing an odd character, |
michael@0 | 3757 | that serves as the 'tens' digit while the if we are parsing an even one, that |
michael@0 | 3758 | that serves as the 'tens' digit while if we are parsing an even one, that |
michael@0 | 3759 | a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid |
michael@0 | 3760 | overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less |
michael@0 | 3761 | than all the other bytes. |
michael@0 | 3762 | |
michael@0 | 3763 | Since we're doing this in reverse we want to put the first digit encountered into the |
michael@0 | 3764 | ones place and the second digit encountered into the tens place. |
michael@0 | 3765 | */ |
michael@0 | 3766 | |
michael@0 | 3767 | if ((digIndx + trailingZeroCount) % 2 == 1) { |
michael@0 | 3768 | // High-order digit case (tens place) |
michael@0 | 3769 | collateVal += (uint8_t)(digVal * 10); |
michael@0 | 3770 | |
michael@0 | 3771 | // We cannot set leadingZeroIndex unless it has been set for the |
michael@0 | 3772 | // low-order digit. Therefore, all we can do for the high-order |
michael@0 | 3773 | // digit is turn it off, never on. |
michael@0 | 3774 | // The only time we will have a high digit without a low is for |
michael@0 | 3775 | // the very first non-zero digit, so no zero check is necessary. |
michael@0 | 3776 | if (collateVal != 0) |
michael@0 | 3777 | leadingZeroIndex = 0; |
michael@0 | 3778 | |
michael@0 | 3779 | // The first pass through, digIndx may exceed the limit, but in that case |
michael@0 | 3780 | // we no longer care about numTempBuf contents since they will be discarded |
michael@0 | 3781 | if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { |
michael@0 | 3782 | numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
michael@0 | 3783 | } |
michael@0 | 3784 | collateVal = 0; |
michael@0 | 3785 | } else { |
michael@0 | 3786 | // Low-order digit case (ones place) |
michael@0 | 3787 | collateVal = (uint8_t)digVal; |
michael@0 | 3788 | |
michael@0 | 3789 | // Check for leading zeroes. |
michael@0 | 3790 | if (collateVal == 0) { |
michael@0 | 3791 | if (!leadingZeroIndex) |
michael@0 | 3792 | leadingZeroIndex = (digIndx/2) + 2; |
michael@0 | 3793 | } else |
michael@0 | 3794 | leadingZeroIndex = 0; |
michael@0 | 3795 | |
michael@0 | 3796 | // No need to write to buffer; the case of a last odd digit |
michael@0 | 3797 | // is handled below. |
michael@0 | 3798 | } |
michael@0 | 3799 | ++digIndx; |
michael@0 | 3800 | } else |
michael@0 | 3801 | ++trailingZeroCount; |
michael@0 | 3802 | |
michael@0 | 3803 | if (!collIter_bos(source)) { |
michael@0 | 3804 | ch = getPrevNormalizedChar(source, status); |
michael@0 | 3805 | //goBackOne(source); |
michael@0 | 3806 | if (U16_IS_TRAIL(ch)) { |
michael@0 | 3807 | backupState(source, &state); |
michael@0 | 3808 | if (!collIter_bos(source)) { |
michael@0 | 3809 | goBackOne(source); |
michael@0 | 3810 | UChar lead = getPrevNormalizedChar(source, status); |
michael@0 | 3811 | |
michael@0 | 3812 | if(U16_IS_LEAD(lead)) { |
michael@0 | 3813 | char32 = U16_GET_SUPPLEMENTARY(lead,ch); |
michael@0 | 3814 | } else { |
michael@0 | 3815 | loadState(source, &state, FALSE); |
michael@0 | 3816 | char32 = ch; |
michael@0 | 3817 | } |
michael@0 | 3818 | } |
michael@0 | 3819 | } else |
michael@0 | 3820 | char32 = ch; |
michael@0 | 3821 | |
michael@0 | 3822 | if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { |
michael@0 | 3823 | if (char32 > 0xFFFF) {// For surrogates. |
michael@0 | 3824 | loadState(source, &state, FALSE); |
michael@0 | 3825 | } |
michael@0 | 3826 | // Don't need to "reverse" the goBackOne call, |
michael@0 | 3827 | // as this points to the next position to process.. |
michael@0 | 3828 | //if (char32 > 0xFFFF) // For surrogates. |
michael@0 | 3829 | //getNextNormalizedChar(source); |
michael@0 | 3830 | break; |
michael@0 | 3831 | } |
michael@0 | 3832 | |
michael@0 | 3833 | goBackOne(source); |
michael@0 | 3834 | }else |
michael@0 | 3835 | break; |
michael@0 | 3836 | } |
michael@0 | 3837 | |
michael@0 | 3838 | if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { |
michael@0 | 3839 | // our collation element is not too big, go ahead and finish with it |
michael@0 | 3840 | break; |
michael@0 | 3841 | } |
michael@0 | 3842 | // our digit string is too long for a collation element; |
michael@0 | 3843 | // set the limit for it, reset the state and begin again |
michael@0 | 3844 | ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; |
michael@0 | 3845 | if ( ceLimit == 0 ) { |
michael@0 | 3846 | ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; |
michael@0 | 3847 | } |
michael@0 | 3848 | ch = initial_ch; |
michael@0 | 3849 | loadState(source, &initialState, FALSE); |
michael@0 | 3850 | digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; |
michael@0 | 3851 | collateVal = 0; |
michael@0 | 3852 | nonZeroValReached = FALSE; |
michael@0 | 3853 | } |
michael@0 | 3854 | |
michael@0 | 3855 | if (! nonZeroValReached) { |
michael@0 | 3856 | digIndx = 2; |
michael@0 | 3857 | trailingZeroCount = 0; |
michael@0 | 3858 | numTempBuf[2] = 6; |
michael@0 | 3859 | } |
michael@0 | 3860 | |
michael@0 | 3861 | if ((digIndx + trailingZeroCount) % 2 != 0) { |
michael@0 | 3862 | numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; |
michael@0 | 3863 | digIndx += 1; // The implicit leading zero |
michael@0 | 3864 | } |
michael@0 | 3865 | if (trailingZeroCount % 2 != 0) { |
michael@0 | 3866 | // We had to consume one trailing zero for the low digit |
michael@0 | 3867 | // of the least significant byte |
michael@0 | 3868 | digIndx += 1; // The trailing zero not in the exponent |
michael@0 | 3869 | trailingZeroCount -= 1; |
michael@0 | 3870 | } |
michael@0 | 3871 | |
michael@0 | 3872 | endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; |
michael@0 | 3873 | |
michael@0 | 3874 | // Subtract one off of the last byte. Really the first byte here, but it's reversed... |
michael@0 | 3875 | numTempBuf[2] -= 1; |
michael@0 | 3876 | |
michael@0 | 3877 | /* |
michael@0 | 3878 | We want to skip over the first two slots in the buffer. The first slot |
michael@0 | 3879 | is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the |
michael@0 | 3880 | sign/exponent byte: 0x80 + (decimalPos/2) & 7f. |
michael@0 | 3881 | The exponent must be adjusted by the number of leading zeroes, and the number of |
michael@0 | 3882 | trailing zeroes. |
michael@0 | 3883 | */ |
michael@0 | 3884 | numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; |
michael@0 | 3885 | uint32_t exponent = (digIndx+trailingZeroCount)/2; |
michael@0 | 3886 | if (leadingZeroIndex) |
michael@0 | 3887 | exponent -= ((digIndx/2) + 2 - leadingZeroIndex); |
michael@0 | 3888 | numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); |
michael@0 | 3889 | |
michael@0 | 3890 | // Now transfer the collation key to our collIterate struct. |
michael@0 | 3891 | // The total size for our collation key is half of endIndex, rounded up. |
michael@0 | 3892 | int32_t size = (endIndex+1)/2; |
michael@0 | 3893 | if(!ensureCEsCapacity(source, size)) { |
michael@0 | 3894 | return (uint32_t)UCOL_NULLORDER; |
michael@0 | 3895 | } |
michael@0 | 3896 | *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight |
michael@0 | 3897 | (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight |
michael@0 | 3898 | UCOL_BYTE_COMMON; // Tertiary weight. |
michael@0 | 3899 | i = endIndex - 1; // Reset the index into the buffer. |
michael@0 | 3900 | while(i >= 2) { |
michael@0 | 3901 | uint32_t primWeight = numTempBuf[i--] << 8; |
michael@0 | 3902 | if ( i >= 2) |
michael@0 | 3903 | primWeight |= numTempBuf[i--]; |
michael@0 | 3904 | *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; |
michael@0 | 3905 | } |
michael@0 | 3906 | |
michael@0 | 3907 | source->toReturn = source->CEpos -1; |
michael@0 | 3908 | return *(source->toReturn); |
michael@0 | 3909 | } else { |
michael@0 | 3910 | CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); |
michael@0 | 3911 | CE = *(CEOffset++); |
michael@0 | 3912 | break; |
michael@0 | 3913 | } |
michael@0 | 3914 | } |
michael@0 | 3915 | |
michael@0 | 3916 | case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ |
michael@0 | 3917 | { |
michael@0 | 3918 | static const uint32_t |
michael@0 | 3919 | SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; |
michael@0 | 3920 | //const uint32_t LCount = 19; |
michael@0 | 3921 | static const uint32_t VCount = 21; |
michael@0 | 3922 | static const uint32_t TCount = 28; |
michael@0 | 3923 | //const uint32_t NCount = VCount * TCount; /* 588 */ |
michael@0 | 3924 | //const uint32_t SCount = LCount * NCount; /* 11172 */ |
michael@0 | 3925 | |
michael@0 | 3926 | uint32_t L = ch - SBase; |
michael@0 | 3927 | /* |
michael@0 | 3928 | divide into pieces. |
michael@0 | 3929 | we do it in this order since some compilers can do % and / in one |
michael@0 | 3930 | operation |
michael@0 | 3931 | */ |
michael@0 | 3932 | uint32_t T = L % TCount; |
michael@0 | 3933 | L /= TCount; |
michael@0 | 3934 | uint32_t V = L % VCount; |
michael@0 | 3935 | L /= VCount; |
michael@0 | 3936 | |
michael@0 | 3937 | /* offset them */ |
michael@0 | 3938 | L += LBase; |
michael@0 | 3939 | V += VBase; |
michael@0 | 3940 | T += TBase; |
michael@0 | 3941 | |
michael@0 | 3942 | int32_t firstOffset = (int32_t)(source->pos - source->string); |
michael@0 | 3943 | source->appendOffset(firstOffset, *status); |
michael@0 | 3944 | |
michael@0 | 3945 | /* |
michael@0 | 3946 | * return the first CE, but first put the rest into the expansion buffer |
michael@0 | 3947 | */ |
michael@0 | 3948 | if (!source->coll->image->jamoSpecial) { |
michael@0 | 3949 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); |
michael@0 | 3950 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); |
michael@0 | 3951 | source->appendOffset(firstOffset + 1, *status); |
michael@0 | 3952 | |
michael@0 | 3953 | if (T != TBase) { |
michael@0 | 3954 | *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); |
michael@0 | 3955 | source->appendOffset(firstOffset + 1, *status); |
michael@0 | 3956 | } |
michael@0 | 3957 | |
michael@0 | 3958 | source->toReturn = source->CEpos - 1; |
michael@0 | 3959 | |
michael@0 | 3960 | source->offsetReturn = source->offsetStore - 1; |
michael@0 | 3961 | if (source->offsetReturn == source->offsetBuffer) { |
michael@0 | 3962 | source->offsetStore = source->offsetBuffer; |
michael@0 | 3963 | } |
michael@0 | 3964 | |
michael@0 | 3965 | return *(source->toReturn); |
michael@0 | 3966 | } else { |
michael@0 | 3967 | // Since Hanguls pass the FCD check, it is |
michael@0 | 3968 | // guaranteed that we won't be in |
michael@0 | 3969 | // the normalization buffer if something like this happens |
michael@0 | 3970 | |
michael@0 | 3971 | // Move Jamos into normalization buffer |
michael@0 | 3972 | UChar *tempbuffer = source->writableBuffer.getBuffer(5); |
michael@0 | 3973 | int32_t tempbufferLength, jamoOffset; |
michael@0 | 3974 | tempbuffer[0] = 0; |
michael@0 | 3975 | tempbuffer[1] = (UChar)L; |
michael@0 | 3976 | tempbuffer[2] = (UChar)V; |
michael@0 | 3977 | if (T != TBase) { |
michael@0 | 3978 | tempbuffer[3] = (UChar)T; |
michael@0 | 3979 | tempbufferLength = 4; |
michael@0 | 3980 | } else { |
michael@0 | 3981 | tempbufferLength = 3; |
michael@0 | 3982 | } |
michael@0 | 3983 | source->writableBuffer.releaseBuffer(tempbufferLength); |
michael@0 | 3984 | |
michael@0 | 3985 | // Indicate where to continue in main input string after exhausting the writableBuffer |
michael@0 | 3986 | if (source->pos == source->string) { |
michael@0 | 3987 | jamoOffset = 0; |
michael@0 | 3988 | source->fcdPosition = NULL; |
michael@0 | 3989 | } else { |
michael@0 | 3990 | jamoOffset = source->pos - source->string; |
michael@0 | 3991 | source->fcdPosition = source->pos-1; |
michael@0 | 3992 | } |
michael@0 | 3993 | |
michael@0 | 3994 | // Append offsets for the additional chars |
michael@0 | 3995 | // (not the 0, and not the L whose offsets match the original Hangul) |
michael@0 | 3996 | int32_t jamoRemaining = tempbufferLength - 2; |
michael@0 | 3997 | jamoOffset++; // appended offsets should match end of original Hangul |
michael@0 | 3998 | while (jamoRemaining-- > 0) { |
michael@0 | 3999 | source->appendOffset(jamoOffset, *status); |
michael@0 | 4000 | } |
michael@0 | 4001 | |
michael@0 | 4002 | source->offsetRepeatValue = jamoOffset; |
michael@0 | 4003 | |
michael@0 | 4004 | source->offsetReturn = source->offsetStore - 1; |
michael@0 | 4005 | if (source->offsetReturn == source->offsetBuffer) { |
michael@0 | 4006 | source->offsetStore = source->offsetBuffer; |
michael@0 | 4007 | } |
michael@0 | 4008 | |
michael@0 | 4009 | source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; |
michael@0 | 4010 | source->origFlags = source->flags; |
michael@0 | 4011 | source->flags |= UCOL_ITER_INNORMBUF; |
michael@0 | 4012 | source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
michael@0 | 4013 | |
michael@0 | 4014 | return(UCOL_IGNORABLE); |
michael@0 | 4015 | } |
michael@0 | 4016 | } |
michael@0 | 4017 | |
michael@0 | 4018 | case IMPLICIT_TAG: /* everything that is not defined otherwise */ |
michael@0 | 4019 | return getPrevImplicit(ch, source); |
michael@0 | 4020 | |
michael@0 | 4021 | // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function |
michael@0 | 4022 | case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ |
michael@0 | 4023 | return getPrevImplicit(ch, source); |
michael@0 | 4024 | |
michael@0 | 4025 | case SURROGATE_TAG: /* This is a surrogate pair */ |
michael@0 | 4026 | /* essentially an engaged lead surrogate. */ |
michael@0 | 4027 | /* if you have encountered it here, it means that a */ |
michael@0 | 4028 | /* broken sequence was encountered and this is an error */ |
michael@0 | 4029 | return UCOL_NOT_FOUND; |
michael@0 | 4030 | |
michael@0 | 4031 | case LEAD_SURROGATE_TAG: /* D800-DBFF*/ |
michael@0 | 4032 | return UCOL_NOT_FOUND; /* broken surrogate sequence */ |
michael@0 | 4033 | |
michael@0 | 4034 | case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ |
michael@0 | 4035 | { |
michael@0 | 4036 | UChar32 cp = 0; |
michael@0 | 4037 | UChar prevChar; |
michael@0 | 4038 | const UChar *prev; |
michael@0 | 4039 | if (isAtStartPrevIterate(source)) { |
michael@0 | 4040 | /* we are at the start of the string, wrong place to be at */ |
michael@0 | 4041 | return UCOL_NOT_FOUND; |
michael@0 | 4042 | } |
michael@0 | 4043 | if (source->pos != source->writableBuffer.getBuffer()) { |
michael@0 | 4044 | prev = source->pos - 1; |
michael@0 | 4045 | } else { |
michael@0 | 4046 | prev = source->fcdPosition; |
michael@0 | 4047 | } |
michael@0 | 4048 | prevChar = *prev; |
michael@0 | 4049 | |
michael@0 | 4050 | /* Handles Han and Supplementary characters here.*/ |
michael@0 | 4051 | if (U16_IS_LEAD(prevChar)) { |
michael@0 | 4052 | cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); |
michael@0 | 4053 | source->pos = prev; |
michael@0 | 4054 | } else { |
michael@0 | 4055 | return UCOL_NOT_FOUND; /* like unassigned */ |
michael@0 | 4056 | } |
michael@0 | 4057 | |
michael@0 | 4058 | return getPrevImplicit(cp, source); |
michael@0 | 4059 | } |
michael@0 | 4060 | |
michael@0 | 4061 | /* UCA is filled with these. Tailorings are NOT_FOUND */ |
michael@0 | 4062 | /* not yet implemented */ |
michael@0 | 4063 | case CHARSET_TAG: /* this tag always returns */ |
michael@0 | 4064 | /* probably after 1.8 */ |
michael@0 | 4065 | return UCOL_NOT_FOUND; |
michael@0 | 4066 | |
michael@0 | 4067 | default: /* this tag always returns */ |
michael@0 | 4068 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 4069 | CE=0; |
michael@0 | 4070 | break; |
michael@0 | 4071 | } |
michael@0 | 4072 | |
michael@0 | 4073 | if (CE <= UCOL_NOT_FOUND) { |
michael@0 | 4074 | break; |
michael@0 | 4075 | } |
michael@0 | 4076 | } |
michael@0 | 4077 | |
michael@0 | 4078 | return CE; |
michael@0 | 4079 | } |
michael@0 | 4080 | |
michael@0 | 4081 | /* This should really be a macro */ |
michael@0 | 4082 | /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */ |
michael@0 | 4083 | /* secondaries in French */ |
michael@0 | 4084 | /* |
michael@0 | 4085 | void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { |
michael@0 | 4086 | uint8_t temp; |
michael@0 | 4087 | while(start<end) { |
michael@0 | 4088 | temp = *start; |
michael@0 | 4089 | *start++ = *end; |
michael@0 | 4090 | *end-- = temp; |
michael@0 | 4091 | } |
michael@0 | 4092 | } |
michael@0 | 4093 | */ |
michael@0 | 4094 | |
/*
 * Reverses, in place, the elements lying between the two pointers `start`
 * and `end` (both inclusive). TYPE is the element type.
 *
 * Both pointer arguments must be modifiable lvalues: the macro advances
 * `start` and retreats `end` until they meet or cross, so neither points at
 * its original position afterwards.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    for(; (start)<(end); ++(start), --(end)) { \
        TYPE tempA = *(start); \
        *(start) = *(end); \
        *(end) = tempA; \
    } \
}
michael@0 | 4103 | |
michael@0 | 4104 | /****************************************************************************/ |
michael@0 | 4105 | /* Following are the sortkey generation functions */ |
michael@0 | 4106 | /* */ |
michael@0 | 4107 | /****************************************************************************/ |
michael@0 | 4108 | |
michael@0 | 4109 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 4110 | ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, |
michael@0 | 4111 | const uint8_t *src2, int32_t src2Length, |
michael@0 | 4112 | uint8_t *dest, int32_t destCapacity) { |
michael@0 | 4113 | /* check arguments */ |
michael@0 | 4114 | if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || |
michael@0 | 4115 | src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || |
michael@0 | 4116 | destCapacity<0 || (destCapacity>0 && dest==NULL) |
michael@0 | 4117 | ) { |
michael@0 | 4118 | /* error, attempt to write a zero byte and return 0 */ |
michael@0 | 4119 | if(dest!=NULL && destCapacity>0) { |
michael@0 | 4120 | *dest=0; |
michael@0 | 4121 | } |
michael@0 | 4122 | return 0; |
michael@0 | 4123 | } |
michael@0 | 4124 | |
michael@0 | 4125 | /* check lengths and capacity */ |
michael@0 | 4126 | if(src1Length<0) { |
michael@0 | 4127 | src1Length=(int32_t)uprv_strlen((const char *)src1)+1; |
michael@0 | 4128 | } |
michael@0 | 4129 | if(src2Length<0) { |
michael@0 | 4130 | src2Length=(int32_t)uprv_strlen((const char *)src2)+1; |
michael@0 | 4131 | } |
michael@0 | 4132 | |
michael@0 | 4133 | int32_t destLength=src1Length+src2Length; |
michael@0 | 4134 | if(destLength>destCapacity) { |
michael@0 | 4135 | /* the merged sort key does not fit into the destination */ |
michael@0 | 4136 | return destLength; |
michael@0 | 4137 | } |
michael@0 | 4138 | |
michael@0 | 4139 | /* merge the sort keys with the same number of levels */ |
michael@0 | 4140 | uint8_t *p=dest; |
michael@0 | 4141 | for(;;) { |
michael@0 | 4142 | /* copy level from src1 not including 00 or 01 */ |
michael@0 | 4143 | uint8_t b; |
michael@0 | 4144 | while((b=*src1)>=2) { |
michael@0 | 4145 | ++src1; |
michael@0 | 4146 | *p++=b; |
michael@0 | 4147 | } |
michael@0 | 4148 | |
michael@0 | 4149 | /* add a 02 merge separator */ |
michael@0 | 4150 | *p++=2; |
michael@0 | 4151 | |
michael@0 | 4152 | /* copy level from src2 not including 00 or 01 */ |
michael@0 | 4153 | while((b=*src2)>=2) { |
michael@0 | 4154 | ++src2; |
michael@0 | 4155 | *p++=b; |
michael@0 | 4156 | } |
michael@0 | 4157 | |
michael@0 | 4158 | /* if both sort keys have another level, then add a 01 level separator and continue */ |
michael@0 | 4159 | if(*src1==1 && *src2==1) { |
michael@0 | 4160 | ++src1; |
michael@0 | 4161 | ++src2; |
michael@0 | 4162 | *p++=1; |
michael@0 | 4163 | } else { |
michael@0 | 4164 | break; |
michael@0 | 4165 | } |
michael@0 | 4166 | } |
michael@0 | 4167 | |
michael@0 | 4168 | /* |
michael@0 | 4169 | * here, at least one sort key is finished now, but the other one |
michael@0 | 4170 | * might have some contents left from containing more levels; |
michael@0 | 4171 | * that contents is just appended to the result |
michael@0 | 4172 | */ |
michael@0 | 4173 | if(*src1!=0) { |
michael@0 | 4174 | /* src1 is not finished, therefore *src2==0, and src1 is appended */ |
michael@0 | 4175 | src2=src1; |
michael@0 | 4176 | } |
michael@0 | 4177 | /* append src2, "the other, unfinished sort key" */ |
michael@0 | 4178 | while((*p++=*src2++)!=0) {} |
michael@0 | 4179 | |
michael@0 | 4180 | /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ |
michael@0 | 4181 | return (int32_t)(p-dest); |
michael@0 | 4182 | } |
michael@0 | 4183 | |
michael@0 | 4184 | U_NAMESPACE_BEGIN |
michael@0 | 4185 | |
michael@0 | 4186 | class SortKeyByteSink : public ByteSink { |
michael@0 | 4187 | public: |
michael@0 | 4188 | SortKeyByteSink(char *dest, int32_t destCapacity) |
michael@0 | 4189 | : buffer_(dest), capacity_(destCapacity), |
michael@0 | 4190 | appended_(0) { |
michael@0 | 4191 | if (buffer_ == NULL) { |
michael@0 | 4192 | capacity_ = 0; |
michael@0 | 4193 | } else if(capacity_ < 0) { |
michael@0 | 4194 | buffer_ = NULL; |
michael@0 | 4195 | capacity_ = 0; |
michael@0 | 4196 | } |
michael@0 | 4197 | } |
michael@0 | 4198 | virtual ~SortKeyByteSink(); |
michael@0 | 4199 | |
michael@0 | 4200 | virtual void Append(const char *bytes, int32_t n); |
michael@0 | 4201 | void Append(uint32_t b) { |
michael@0 | 4202 | if (appended_ < capacity_ || Resize(1, appended_)) { |
michael@0 | 4203 | buffer_[appended_] = (char)b; |
michael@0 | 4204 | } |
michael@0 | 4205 | ++appended_; |
michael@0 | 4206 | } |
michael@0 | 4207 | void Append(uint32_t b1, uint32_t b2) { |
michael@0 | 4208 | int32_t a2 = appended_ + 2; |
michael@0 | 4209 | if (a2 <= capacity_ || Resize(2, appended_)) { |
michael@0 | 4210 | buffer_[appended_] = (char)b1; |
michael@0 | 4211 | buffer_[appended_ + 1] = (char)b2; |
michael@0 | 4212 | } else if(appended_ < capacity_) { |
michael@0 | 4213 | buffer_[appended_] = (char)b1; |
michael@0 | 4214 | } |
michael@0 | 4215 | appended_ = a2; |
michael@0 | 4216 | } |
michael@0 | 4217 | virtual char *GetAppendBuffer(int32_t min_capacity, |
michael@0 | 4218 | int32_t desired_capacity_hint, |
michael@0 | 4219 | char *scratch, int32_t scratch_capacity, |
michael@0 | 4220 | int32_t *result_capacity); |
michael@0 | 4221 | int32_t NumberOfBytesAppended() const { return appended_; } |
michael@0 | 4222 | /** @return FALSE if memory allocation failed */ |
michael@0 | 4223 | UBool IsOk() const { return buffer_ != NULL; } |
michael@0 | 4224 | |
michael@0 | 4225 | protected: |
michael@0 | 4226 | virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0; |
michael@0 | 4227 | virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; |
michael@0 | 4228 | |
michael@0 | 4229 | void SetNotOk() { |
michael@0 | 4230 | buffer_ = NULL; |
michael@0 | 4231 | capacity_ = 0; |
michael@0 | 4232 | } |
michael@0 | 4233 | |
michael@0 | 4234 | char *buffer_; |
michael@0 | 4235 | int32_t capacity_; |
michael@0 | 4236 | int32_t appended_; |
michael@0 | 4237 | |
michael@0 | 4238 | private: |
michael@0 | 4239 | SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented |
michael@0 | 4240 | SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented |
michael@0 | 4241 | }; |
michael@0 | 4242 | |
michael@0 | 4243 | SortKeyByteSink::~SortKeyByteSink() {} |
michael@0 | 4244 | |
michael@0 | 4245 | void |
michael@0 | 4246 | SortKeyByteSink::Append(const char *bytes, int32_t n) { |
michael@0 | 4247 | if (n <= 0 || bytes == NULL) { |
michael@0 | 4248 | return; |
michael@0 | 4249 | } |
michael@0 | 4250 | int32_t length = appended_; |
michael@0 | 4251 | appended_ += n; |
michael@0 | 4252 | if ((buffer_ + length) == bytes) { |
michael@0 | 4253 | return; // the caller used GetAppendBuffer() and wrote the bytes already |
michael@0 | 4254 | } |
michael@0 | 4255 | int32_t available = capacity_ - length; |
michael@0 | 4256 | if (n <= available) { |
michael@0 | 4257 | uprv_memcpy(buffer_ + length, bytes, n); |
michael@0 | 4258 | } else { |
michael@0 | 4259 | AppendBeyondCapacity(bytes, n, length); |
michael@0 | 4260 | } |
michael@0 | 4261 | } |
michael@0 | 4262 | |
michael@0 | 4263 | char * |
michael@0 | 4264 | SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, |
michael@0 | 4265 | int32_t desired_capacity_hint, |
michael@0 | 4266 | char *scratch, |
michael@0 | 4267 | int32_t scratch_capacity, |
michael@0 | 4268 | int32_t *result_capacity) { |
michael@0 | 4269 | if (min_capacity < 1 || scratch_capacity < min_capacity) { |
michael@0 | 4270 | *result_capacity = 0; |
michael@0 | 4271 | return NULL; |
michael@0 | 4272 | } |
michael@0 | 4273 | int32_t available = capacity_ - appended_; |
michael@0 | 4274 | if (available >= min_capacity) { |
michael@0 | 4275 | *result_capacity = available; |
michael@0 | 4276 | return buffer_ + appended_; |
michael@0 | 4277 | } else if (Resize(desired_capacity_hint, appended_)) { |
michael@0 | 4278 | *result_capacity = capacity_ - appended_; |
michael@0 | 4279 | return buffer_ + appended_; |
michael@0 | 4280 | } else { |
michael@0 | 4281 | *result_capacity = scratch_capacity; |
michael@0 | 4282 | return scratch; |
michael@0 | 4283 | } |
michael@0 | 4284 | } |
michael@0 | 4285 | |
michael@0 | 4286 | class FixedSortKeyByteSink : public SortKeyByteSink { |
michael@0 | 4287 | public: |
michael@0 | 4288 | FixedSortKeyByteSink(char *dest, int32_t destCapacity) |
michael@0 | 4289 | : SortKeyByteSink(dest, destCapacity) {} |
michael@0 | 4290 | virtual ~FixedSortKeyByteSink(); |
michael@0 | 4291 | |
michael@0 | 4292 | private: |
michael@0 | 4293 | virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); |
michael@0 | 4294 | virtual UBool Resize(int32_t appendCapacity, int32_t length); |
michael@0 | 4295 | }; |
michael@0 | 4296 | |
michael@0 | 4297 | FixedSortKeyByteSink::~FixedSortKeyByteSink() {} |
michael@0 | 4298 | |
michael@0 | 4299 | void |
michael@0 | 4300 | FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { |
michael@0 | 4301 | // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ |
michael@0 | 4302 | // Fill the buffer completely. |
michael@0 | 4303 | int32_t available = capacity_ - length; |
michael@0 | 4304 | if (available > 0) { |
michael@0 | 4305 | uprv_memcpy(buffer_ + length, bytes, available); |
michael@0 | 4306 | } |
michael@0 | 4307 | } |
michael@0 | 4308 | |
michael@0 | 4309 | UBool |
michael@0 | 4310 | FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { |
michael@0 | 4311 | return FALSE; |
michael@0 | 4312 | } |
michael@0 | 4313 | |
michael@0 | 4314 | class CollationKeyByteSink : public SortKeyByteSink { |
michael@0 | 4315 | public: |
michael@0 | 4316 | CollationKeyByteSink(CollationKey &key) |
michael@0 | 4317 | : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), |
michael@0 | 4318 | key_(key) {} |
michael@0 | 4319 | virtual ~CollationKeyByteSink(); |
michael@0 | 4320 | |
michael@0 | 4321 | private: |
michael@0 | 4322 | virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); |
michael@0 | 4323 | virtual UBool Resize(int32_t appendCapacity, int32_t length); |
michael@0 | 4324 | |
michael@0 | 4325 | CollationKey &key_; |
michael@0 | 4326 | }; |
michael@0 | 4327 | |
michael@0 | 4328 | CollationKeyByteSink::~CollationKeyByteSink() {} |
michael@0 | 4329 | |
michael@0 | 4330 | void |
michael@0 | 4331 | CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { |
michael@0 | 4332 | // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ |
michael@0 | 4333 | if (Resize(n, length)) { |
michael@0 | 4334 | uprv_memcpy(buffer_ + length, bytes, n); |
michael@0 | 4335 | } |
michael@0 | 4336 | } |
michael@0 | 4337 | |
michael@0 | 4338 | UBool |
michael@0 | 4339 | CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { |
michael@0 | 4340 | if (buffer_ == NULL) { |
michael@0 | 4341 | return FALSE; // allocation failed before already |
michael@0 | 4342 | } |
michael@0 | 4343 | int32_t newCapacity = 2 * capacity_; |
michael@0 | 4344 | int32_t altCapacity = length + 2 * appendCapacity; |
michael@0 | 4345 | if (newCapacity < altCapacity) { |
michael@0 | 4346 | newCapacity = altCapacity; |
michael@0 | 4347 | } |
michael@0 | 4348 | if (newCapacity < 200) { |
michael@0 | 4349 | newCapacity = 200; |
michael@0 | 4350 | } |
michael@0 | 4351 | uint8_t *newBuffer = key_.reallocate(newCapacity, length); |
michael@0 | 4352 | if (newBuffer == NULL) { |
michael@0 | 4353 | SetNotOk(); |
michael@0 | 4354 | return FALSE; |
michael@0 | 4355 | } |
michael@0 | 4356 | buffer_ = reinterpret_cast<char *>(newBuffer); |
michael@0 | 4357 | capacity_ = newCapacity; |
michael@0 | 4358 | return TRUE; |
michael@0 | 4359 | } |
michael@0 | 4360 | |
michael@0 | 4361 | /** |
michael@0 | 4362 | * uint8_t byte buffer, similar to CharString but simpler. |
michael@0 | 4363 | */ |
michael@0 | 4364 | class SortKeyLevel : public UMemory { |
michael@0 | 4365 | public: |
michael@0 | 4366 | SortKeyLevel() : len(0), ok(TRUE) {} |
michael@0 | 4367 | ~SortKeyLevel() {} |
michael@0 | 4368 | |
michael@0 | 4369 | /** @return FALSE if memory allocation failed */ |
michael@0 | 4370 | UBool isOk() const { return ok; } |
michael@0 | 4371 | UBool isEmpty() const { return len == 0; } |
michael@0 | 4372 | int32_t length() const { return len; } |
michael@0 | 4373 | const uint8_t *data() const { return buffer.getAlias(); } |
michael@0 | 4374 | uint8_t operator[](int32_t index) const { return buffer[index]; } |
michael@0 | 4375 | |
michael@0 | 4376 | void appendByte(uint32_t b); |
michael@0 | 4377 | |
michael@0 | 4378 | void appendTo(ByteSink &sink) const { |
michael@0 | 4379 | sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); |
michael@0 | 4380 | } |
michael@0 | 4381 | |
michael@0 | 4382 | uint8_t &lastByte() { |
michael@0 | 4383 | U_ASSERT(len > 0); |
michael@0 | 4384 | return buffer[len - 1]; |
michael@0 | 4385 | } |
michael@0 | 4386 | |
michael@0 | 4387 | uint8_t *getLastFewBytes(int32_t n) { |
michael@0 | 4388 | if (ok && len >= n) { |
michael@0 | 4389 | return buffer.getAlias() + len - n; |
michael@0 | 4390 | } else { |
michael@0 | 4391 | return NULL; |
michael@0 | 4392 | } |
michael@0 | 4393 | } |
michael@0 | 4394 | |
michael@0 | 4395 | private: |
michael@0 | 4396 | MaybeStackArray<uint8_t, 40> buffer; |
michael@0 | 4397 | int32_t len; |
michael@0 | 4398 | UBool ok; |
michael@0 | 4399 | |
michael@0 | 4400 | UBool ensureCapacity(int32_t appendCapacity); |
michael@0 | 4401 | |
michael@0 | 4402 | SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class |
michael@0 | 4403 | SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class |
michael@0 | 4404 | }; |
michael@0 | 4405 | |
michael@0 | 4406 | void SortKeyLevel::appendByte(uint32_t b) { |
michael@0 | 4407 | if(len < buffer.getCapacity() || ensureCapacity(1)) { |
michael@0 | 4408 | buffer[len++] = (uint8_t)b; |
michael@0 | 4409 | } |
michael@0 | 4410 | } |
michael@0 | 4411 | |
michael@0 | 4412 | UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { |
michael@0 | 4413 | if(!ok) { |
michael@0 | 4414 | return FALSE; |
michael@0 | 4415 | } |
michael@0 | 4416 | int32_t newCapacity = 2 * buffer.getCapacity(); |
michael@0 | 4417 | int32_t altCapacity = len + 2 * appendCapacity; |
michael@0 | 4418 | if (newCapacity < altCapacity) { |
michael@0 | 4419 | newCapacity = altCapacity; |
michael@0 | 4420 | } |
michael@0 | 4421 | if (newCapacity < 200) { |
michael@0 | 4422 | newCapacity = 200; |
michael@0 | 4423 | } |
michael@0 | 4424 | if(buffer.resize(newCapacity, len)==NULL) { |
michael@0 | 4425 | return ok = FALSE; |
michael@0 | 4426 | } |
michael@0 | 4427 | return TRUE; |
michael@0 | 4428 | } |
michael@0 | 4429 | |
michael@0 | 4430 | U_NAMESPACE_END |
michael@0 | 4431 | |
michael@0 | 4432 | /* sortkey API */ |
michael@0 | 4433 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 4434 | ucol_getSortKey(const UCollator *coll, |
michael@0 | 4435 | const UChar *source, |
michael@0 | 4436 | int32_t sourceLength, |
michael@0 | 4437 | uint8_t *result, |
michael@0 | 4438 | int32_t resultLength) |
michael@0 | 4439 | { |
michael@0 | 4440 | UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); |
michael@0 | 4441 | if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
michael@0 | 4442 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, |
michael@0 | 4443 | ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); |
michael@0 | 4444 | } |
michael@0 | 4445 | |
michael@0 | 4446 | if(coll->delegate != NULL) { |
michael@0 | 4447 | return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength); |
michael@0 | 4448 | } |
michael@0 | 4449 | |
michael@0 | 4450 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 4451 | int32_t keySize = 0; |
michael@0 | 4452 | |
michael@0 | 4453 | if(source != NULL) { |
michael@0 | 4454 | // source == NULL is actually an error situation, but we would need to |
michael@0 | 4455 | // have an error code to return it. Until we introduce a new |
michael@0 | 4456 | // API, it stays like this |
michael@0 | 4457 | |
michael@0 | 4458 | /* this uses the function pointer that is set in updateinternalstate */ |
michael@0 | 4459 | /* currently, there are two funcs: */ |
michael@0 | 4460 | /*ucol_calcSortKey(...);*/ |
michael@0 | 4461 | /*ucol_calcSortKeySimpleTertiary(...);*/ |
michael@0 | 4462 | |
michael@0 | 4463 | uint8_t noDest[1] = { 0 }; |
michael@0 | 4464 | if(result == NULL) { |
michael@0 | 4465 | // Distinguish pure preflighting from an allocation error. |
michael@0 | 4466 | result = noDest; |
michael@0 | 4467 | resultLength = 0; |
michael@0 | 4468 | } |
michael@0 | 4469 | FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength); |
michael@0 | 4470 | coll->sortKeyGen(coll, source, sourceLength, sink, &status); |
michael@0 | 4471 | if(U_SUCCESS(status)) { |
michael@0 | 4472 | keySize = sink.NumberOfBytesAppended(); |
michael@0 | 4473 | } |
michael@0 | 4474 | } |
michael@0 | 4475 | UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); |
michael@0 | 4476 | UTRACE_EXIT_STATUS(status); |
michael@0 | 4477 | return keySize; |
michael@0 | 4478 | } |
michael@0 | 4479 | |
michael@0 | 4480 | U_CFUNC int32_t |
michael@0 | 4481 | ucol_getCollationKey(const UCollator *coll, |
michael@0 | 4482 | const UChar *source, int32_t sourceLength, |
michael@0 | 4483 | CollationKey &key, |
michael@0 | 4484 | UErrorCode &errorCode) { |
michael@0 | 4485 | CollationKeyByteSink sink(key); |
michael@0 | 4486 | coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); |
michael@0 | 4487 | return sink.NumberOfBytesAppended(); |
michael@0 | 4488 | } |
michael@0 | 4489 | |
michael@0 | 4490 | // Is this primary weight compressible? |
michael@0 | 4491 | // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). |
michael@0 | 4492 | // TODO: This should use per-lead-byte flags from FractionalUCA.txt. |
michael@0 | 4493 | static inline UBool |
michael@0 | 4494 | isCompressible(const UCollator * /*coll*/, uint8_t primary1) { |
michael@0 | 4495 | return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; |
michael@0 | 4496 | } |
michael@0 | 4497 | |
michael@0 | 4498 | static |
michael@0 | 4499 | inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { |
michael@0 | 4500 | if (caseShift == 0) { |
michael@0 | 4501 | cases.appendByte(UCOL_CASE_BYTE_START); |
michael@0 | 4502 | caseShift = UCOL_CASE_SHIFT_START; |
michael@0 | 4503 | } |
michael@0 | 4504 | } |
michael@0 | 4505 | |
michael@0 | 4506 | // Packs the secondary buffer when processing French locale. |
michael@0 | 4507 | static void |
michael@0 | 4508 | packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) { |
michael@0 | 4509 | secondaries += secsize; // We read the secondary-level bytes back to front. |
michael@0 | 4510 | uint8_t secondary; |
michael@0 | 4511 | int32_t count2 = 0; |
michael@0 | 4512 | int32_t i = 0; |
michael@0 | 4513 | // we use i here since the key size already accounts for terminators, so we'll discard the increment |
michael@0 | 4514 | for(i = 0; i<secsize; i++) { |
michael@0 | 4515 | secondary = *(secondaries-i-1); |
michael@0 | 4516 | /* This is compression code. */ |
michael@0 | 4517 | if (secondary == UCOL_COMMON2) { |
michael@0 | 4518 | ++count2; |
michael@0 | 4519 | } else { |
michael@0 | 4520 | if (count2 > 0) { |
michael@0 | 4521 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. |
michael@0 | 4522 | while (count2 > UCOL_TOP_COUNT2) { |
michael@0 | 4523 | result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); |
michael@0 | 4524 | count2 -= (uint32_t)UCOL_TOP_COUNT2; |
michael@0 | 4525 | } |
michael@0 | 4526 | result.Append(UCOL_COMMON_TOP2 - (count2-1)); |
michael@0 | 4527 | } else { |
michael@0 | 4528 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 4529 | result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 4530 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 4531 | } |
michael@0 | 4532 | result.Append(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 4533 | } |
michael@0 | 4534 | count2 = 0; |
michael@0 | 4535 | } |
michael@0 | 4536 | result.Append(secondary); |
michael@0 | 4537 | } |
michael@0 | 4538 | } |
michael@0 | 4539 | if (count2 > 0) { |
michael@0 | 4540 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 4541 | result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 4542 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 4543 | } |
michael@0 | 4544 | result.Append(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 4545 | } |
michael@0 | 4546 | } |
michael@0 | 4547 | |
michael@0 | 4548 | #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 |
michael@0 | 4549 | |
michael@0 | 4550 | /* This is the sortkey work horse function.
 *
 * Walks the collation elements (CEs) of source and appends the resulting sort
 * key bytes to result. Primary bytes go straight to the sink. Secondary, case,
 * tertiary and quaternary bytes are collected in separate SortKeyLevel buffers
 * and appended afterwards in that order, each preceded by UCOL_LEVELTERMINATOR.
 * For identical strength the identical level is written last.
 *
 * coll          collator whose attributes (strength, caseLevel, frenchCollation,
 *               alternateHandling, hiraganaQ, normalizationMode, ...) shape the key
 * source        input text
 * sourceLength  length of source, or -1 if source is NUL-terminated
 * result        sink receiving the key bytes; on success a final 0 byte is appended
 * status        in/out error code. Nothing is done if it already indicates a
 *               failure on entry. It is set to U_MEMORY_ALLOCATION_ERROR if a
 *               level buffer or the sink reports a problem while no other
 *               error has been recorded.
 */ |
michael@0 | 4551 | U_CFUNC void U_CALLCONV |
michael@0 | 4552 | ucol_calcSortKey(const UCollator *coll, |
michael@0 | 4553 | const UChar *source, |
michael@0 | 4554 | int32_t sourceLength, |
michael@0 | 4555 | SortKeyByteSink &result, |
michael@0 | 4556 | UErrorCode *status) |
michael@0 | 4557 | { |
michael@0 | 4558 | if(U_FAILURE(*status)) { |
michael@0 | 4559 | return; |
michael@0 | 4560 | } |
michael@0 | 4561 | |
michael@0 | 4562 | SortKeyByteSink &primaries = result; /* primary bytes are written directly to the output sink */ |
michael@0 | 4563 | SortKeyLevel secondaries; /* the remaining levels are buffered and appended after the loop */ |
michael@0 | 4564 | SortKeyLevel tertiaries; |
michael@0 | 4565 | SortKeyLevel cases; |
michael@0 | 4566 | SortKeyLevel quads; |
michael@0 | 4567 | |
michael@0 | 4568 | UnicodeString normSource; |
michael@0 | 4569 | |
michael@0 | 4570 | int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); /* -1 means NUL-terminated */ |
michael@0 | 4571 | |
michael@0 | 4572 | UColAttributeValue strength = coll->strength; |
michael@0 | 4573 | |
michael@0 | 4574 | /* Each compareXxx is 0 when the strength enables that level and 0xFF otherwise. */
/* The weights are uint8_t, so a test of the form (weight > compareXxx) can never pass for a disabled level. */
uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); |
michael@0 | 4575 | uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); |
michael@0 | 4576 | uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); |
michael@0 | 4577 | UBool compareIdent = (strength == UCOL_IDENTICAL); |
michael@0 | 4578 | UBool doCase = (coll->caseLevel == UCOL_ON); |
michael@0 | 4579 | UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); |
michael@0 | 4580 | UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); |
michael@0 | 4581 | //UBool qShifted = shifted && (compareQuad == 0); |
michael@0 | 4582 | UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); |
michael@0 | 4583 | |
michael@0 | 4584 | uint32_t variableTopValue = coll->variableTopValue; |
michael@0 | 4585 | // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no |
michael@0 | 4586 | // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. |
michael@0 | 4587 | uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); /* local variable despite the macro-like name */ |
michael@0 | 4588 | uint8_t UCOL_HIRAGANA_QUAD = 0; |
michael@0 | 4589 | if(doHiragana) { |
michael@0 | 4590 | UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; |
michael@0 | 4591 | /* allocate one more space for hiragana, value for hiragana */ |
michael@0 | 4592 | } |
michael@0 | 4593 | uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); /* largest run of common quaternaries that one byte encodes */ |
michael@0 | 4594 | |
michael@0 | 4595 | /* support for special features like caselevel and funky secondaries */ |
michael@0 | 4596 | int32_t lastSecondaryLength = 0; |
michael@0 | 4597 | uint32_t caseShift = 0; |
michael@0 | 4598 | |
michael@0 | 4599 | /* If we need to normalize, we'll do it all at once at the beginning! */ |
michael@0 | 4600 | /* NFD for identical strength, FCD when normalization is on, otherwise no normalization at all. */
const Normalizer2 *norm2; |
michael@0 | 4601 | if(compareIdent) { |
michael@0 | 4602 | norm2 = Normalizer2Factory::getNFDInstance(*status); |
michael@0 | 4603 | } else if(coll->normalizationMode != UCOL_OFF) { |
michael@0 | 4604 | norm2 = Normalizer2Factory::getFCDInstance(*status); |
michael@0 | 4605 | } else { |
michael@0 | 4606 | norm2 = NULL; |
michael@0 | 4607 | } |
michael@0 | 4608 | if(norm2 != NULL) { |
michael@0 | 4609 | normSource.setTo(FALSE, source, len); |
michael@0 | 4610 | int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); |
michael@0 | 4611 | if(qcYesLength != len) { |
michael@0 | 4612 | /* Keep the prefix that passed the quick check and normalize only the rest onto it. */
UnicodeString unnormalized = normSource.tempSubString(qcYesLength); |
michael@0 | 4613 | normSource.truncate(qcYesLength); |
michael@0 | 4614 | norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); |
michael@0 | 4615 | source = normSource.getBuffer(); |
michael@0 | 4616 | len = normSource.length(); |
michael@0 | 4617 | } |
michael@0 | 4618 | } |
michael@0 | 4619 | collIterate s; |
michael@0 | 4620 | IInit_collIterate(coll, source, len, &s, status); |
michael@0 | 4621 | if(U_FAILURE(*status)) { |
michael@0 | 4622 | return; |
michael@0 | 4623 | } |
michael@0 | 4624 | s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. |
michael@0 | 4625 | |
michael@0 | 4626 | uint32_t order = 0; |
michael@0 | 4627 | |
michael@0 | 4628 | uint8_t primary1 = 0; |
michael@0 | 4629 | uint8_t primary2 = 0; |
michael@0 | 4630 | uint8_t secondary = 0; |
michael@0 | 4631 | uint8_t tertiary = 0; |
michael@0 | 4632 | uint8_t caseSwitch = coll->caseSwitch; |
michael@0 | 4633 | uint8_t tertiaryMask = coll->tertiaryMask; |
michael@0 | 4634 | int8_t tertiaryAddition = coll->tertiaryAddition; |
michael@0 | 4635 | uint8_t tertiaryTop = coll->tertiaryTop; |
michael@0 | 4636 | uint8_t tertiaryBottom = coll->tertiaryBottom; |
michael@0 | 4637 | uint8_t tertiaryCommon = coll->tertiaryCommon; |
michael@0 | 4638 | uint8_t caseBits = 0; |
michael@0 | 4639 | |
michael@0 | 4640 | UBool wasShifted = FALSE; |
michael@0 | 4641 | UBool notIsContinuation = FALSE; |
michael@0 | 4642 | |
michael@0 | 4643 | uint32_t count2 = 0, count3 = 0, count4 = 0; /* pending runs of common weights on levels 2, 3 and 4 */ |
michael@0 | 4644 | uint8_t leadPrimary = 0; /* lead byte of the current compressed primary run, 0 if there is none */ |
michael@0 | 4645 | |
michael@0 | 4646 | /* Main loop: one iteration per collation element until the iterator is exhausted. */
for(;;) { |
michael@0 | 4647 | order = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 4648 | if(order == UCOL_NO_MORE_CES) { |
michael@0 | 4649 | break; |
michael@0 | 4650 | } |
michael@0 | 4651 | |
michael@0 | 4652 | if(order == 0) { |
michael@0 | 4653 | continue; |
michael@0 | 4654 | } |
michael@0 | 4655 | |
michael@0 | 4656 | notIsContinuation = !isContinuation(order); |
michael@0 | 4657 | |
michael@0 | 4658 | if(notIsContinuation) { |
michael@0 | 4659 | tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); |
michael@0 | 4660 | } else { |
michael@0 | 4661 | tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); |
michael@0 | 4662 | } |
michael@0 | 4663 | |
michael@0 | 4664 | /* Split the rest of the CE into bytes. Note that order itself is shifted right by 16 bits in the process. */
secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 4665 | primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 4666 | primary1 = (uint8_t)(order >> 8); |
michael@0 | 4667 | |
michael@0 | 4668 | uint8_t originalPrimary1 = primary1; /* the unpermuted lead byte is what isCompressible() is asked about */ |
michael@0 | 4669 | if(notIsContinuation && coll->leadBytePermutationTable != NULL) { |
michael@0 | 4670 | primary1 = coll->leadBytePermutationTable[primary1]; |
michael@0 | 4671 | } |
michael@0 | 4672 | |
michael@0 | 4673 | if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) |
michael@0 | 4674 | || (!notIsContinuation && wasShifted))) |
michael@0 | 4675 | || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ |
michael@0 | 4676 | { |
michael@0 | 4677 | /* and other ignorables should be removed if following a shifted code point */ |
michael@0 | 4678 | if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ |
michael@0 | 4679 | /* we should just completely ignore it */ |
michael@0 | 4680 | continue; |
michael@0 | 4681 | } |
michael@0 | 4682 | if(compareQuad == 0) { |
michael@0 | 4683 | if(count4 > 0) { |
michael@0 | 4684 | while (count4 > UCOL_BOT_COUNT4) { |
michael@0 | 4685 | quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); |
michael@0 | 4686 | count4 -= UCOL_BOT_COUNT4; |
michael@0 | 4687 | } |
michael@0 | 4688 | quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); |
michael@0 | 4689 | count4 = 0; |
michael@0 | 4690 | } |
michael@0 | 4691 | /* We are dealing with a variable and we're treating them as shifted */ |
michael@0 | 4692 | /* This is a shifted ignorable */ |
michael@0 | 4693 | if(primary1 != 0) { /* we need to check this since we could be in continuation */ |
michael@0 | 4694 | quads.appendByte(primary1); |
michael@0 | 4695 | } |
michael@0 | 4696 | if(primary2 != 0) { |
michael@0 | 4697 | quads.appendByte(primary2); |
michael@0 | 4698 | } |
michael@0 | 4699 | } |
michael@0 | 4700 | wasShifted = TRUE; |
michael@0 | 4701 | } else { |
michael@0 | 4702 | wasShifted = FALSE; |
michael@0 | 4703 | /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ |
michael@0 | 4704 | /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will be zero with non zero primary1. */ |
michael@0 | 4705 | /* regular and simple sortkey calc */ |
michael@0 | 4706 | if(primary1 != UCOL_IGNORABLE) { |
michael@0 | 4707 | if(notIsContinuation) { |
michael@0 | 4708 | if(leadPrimary == primary1) { |
michael@0 | 4709 | primaries.Append(primary2); /* same lead byte as the running compressed sequence: only the second byte is written */ |
michael@0 | 4710 | } else { |
michael@0 | 4711 | if(leadPrimary != 0) { |
michael@0 | 4712 | /* close the running compressed sequence with a marker that depends on the direction of the change */
primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); |
michael@0 | 4713 | } |
michael@0 | 4714 | if(primary2 == UCOL_IGNORABLE) { |
michael@0 | 4715 | /* one byter, not compressed */ |
michael@0 | 4716 | primaries.Append(primary1); |
michael@0 | 4717 | leadPrimary = 0; |
michael@0 | 4718 | } else if(isCompressible(coll, originalPrimary1)) { |
michael@0 | 4719 | /* compress */ |
michael@0 | 4720 | primaries.Append(leadPrimary = primary1, primary2); |
michael@0 | 4721 | } else { |
michael@0 | 4722 | leadPrimary = 0; |
michael@0 | 4723 | primaries.Append(primary1, primary2); |
michael@0 | 4724 | } |
michael@0 | 4725 | } |
michael@0 | 4726 | } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ |
michael@0 | 4727 | if(primary2 == UCOL_IGNORABLE) { |
michael@0 | 4728 | primaries.Append(primary1); |
michael@0 | 4729 | } else { |
michael@0 | 4730 | primaries.Append(primary1, primary2); |
michael@0 | 4731 | } |
michael@0 | 4732 | } |
michael@0 | 4733 | } |
michael@0 | 4734 | |
michael@0 | 4735 | if(secondary > compareSec) { |
michael@0 | 4736 | if(!isFrenchSec) { |
michael@0 | 4737 | /* This is compression code. */ |
michael@0 | 4738 | if (secondary == UCOL_COMMON2 && notIsContinuation) { |
michael@0 | 4739 | ++count2; |
michael@0 | 4740 | } else { |
michael@0 | 4741 | if (count2 > 0) { |
michael@0 | 4742 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. |
michael@0 | 4743 | while (count2 > UCOL_TOP_COUNT2) { |
michael@0 | 4744 | secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); |
michael@0 | 4745 | count2 -= (uint32_t)UCOL_TOP_COUNT2; |
michael@0 | 4746 | } |
michael@0 | 4747 | secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); |
michael@0 | 4748 | } else { |
michael@0 | 4749 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 4750 | secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 4751 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 4752 | } |
michael@0 | 4753 | secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 4754 | } |
michael@0 | 4755 | count2 = 0; |
michael@0 | 4756 | } |
michael@0 | 4757 | secondaries.appendByte(secondary); |
michael@0 | 4758 | } |
michael@0 | 4759 | } else { |
michael@0 | 4760 | /* Do the special handling for French secondaries */ |
michael@0 | 4761 | /* We need to get continuation elements and do intermediate restore */ |
michael@0 | 4762 | /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ |
michael@0 | 4763 | /* No compression here: the raw secondaries are stored and packFrench() compresses them at the end. */
if(notIsContinuation) { |
michael@0 | 4764 | if (lastSecondaryLength > 1) { |
michael@0 | 4765 | uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); |
michael@0 | 4766 | if (frenchStartPtr != NULL) { |
michael@0 | 4767 | /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ |
michael@0 | 4768 | uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; |
michael@0 | 4769 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); |
michael@0 | 4770 | } |
michael@0 | 4771 | } |
michael@0 | 4772 | lastSecondaryLength = 1; |
michael@0 | 4773 | } else { |
michael@0 | 4774 | ++lastSecondaryLength; |
michael@0 | 4775 | } |
michael@0 | 4776 | secondaries.appendByte(secondary); |
michael@0 | 4777 | } |
michael@0 | 4778 | } |
michael@0 | 4779 | |
michael@0 | 4780 | if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { |
michael@0 | 4781 | // do the case level if we need to do it. We don't want to calculate |
michael@0 | 4782 | // case level for primary ignorables if we have only primary strength and case level |
michael@0 | 4783 | // otherwise we would break well formedness of CEs |
michael@0 | 4784 | doCaseShift(cases, caseShift); |
michael@0 | 4785 | if(notIsContinuation) { |
michael@0 | 4786 | caseBits = (uint8_t)(tertiary & 0xC0); /* the case information is in the top two bits of the tertiary byte */ |
michael@0 | 4787 | |
michael@0 | 4788 | if(tertiary != 0) { |
michael@0 | 4789 | if(coll->caseFirst == UCOL_UPPER_FIRST) { |
michael@0 | 4790 | if((caseBits & 0xC0) == 0) { |
michael@0 | 4791 | cases.lastByte() |= 1 << (--caseShift); |
michael@0 | 4792 | } else { |
michael@0 | 4793 | cases.lastByte() |= 0 << (--caseShift); |
michael@0 | 4794 | /* second bit */ |
michael@0 | 4795 | doCaseShift(cases, caseShift); |
michael@0 | 4796 | cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift); |
michael@0 | 4797 | } |
michael@0 | 4798 | } else { |
michael@0 | 4799 | if((caseBits & 0xC0) == 0) { |
michael@0 | 4800 | cases.lastByte() |= 0 << (--caseShift); |
michael@0 | 4801 | } else { |
michael@0 | 4802 | cases.lastByte() |= 1 << (--caseShift); |
michael@0 | 4803 | /* second bit */ |
michael@0 | 4804 | doCaseShift(cases, caseShift); |
michael@0 | 4805 | cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift); |
michael@0 | 4806 | } |
michael@0 | 4807 | } |
michael@0 | 4808 | } |
michael@0 | 4809 | } |
michael@0 | 4810 | } else { |
michael@0 | 4811 | if(notIsContinuation) { |
michael@0 | 4812 | tertiary ^= caseSwitch; |
michael@0 | 4813 | } |
michael@0 | 4814 | } |
michael@0 | 4815 | |
michael@0 | 4816 | tertiary &= tertiaryMask; |
michael@0 | 4817 | if(tertiary > compareTer) { |
michael@0 | 4818 | /* This is compression code. */ |
michael@0 | 4819 | /* sequence size check is included in the if clause */ |
michael@0 | 4820 | if (tertiary == tertiaryCommon && notIsContinuation) { |
michael@0 | 4821 | ++count3; |
michael@0 | 4822 | } else { |
michael@0 | 4823 | if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { |
michael@0 | 4824 | tertiary += tertiaryAddition; |
michael@0 | 4825 | } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { |
michael@0 | 4826 | tertiary -= tertiaryAddition; |
michael@0 | 4827 | } |
michael@0 | 4828 | if (count3 > 0) { |
michael@0 | 4829 | if ((tertiary > tertiaryCommon)) { |
michael@0 | 4830 | while (count3 > coll->tertiaryTopCount) { |
michael@0 | 4831 | tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); |
michael@0 | 4832 | count3 -= (uint32_t)coll->tertiaryTopCount; |
michael@0 | 4833 | } |
michael@0 | 4834 | tertiaries.appendByte(tertiaryTop - (count3-1)); |
michael@0 | 4835 | } else { |
michael@0 | 4836 | while (count3 > coll->tertiaryBottomCount) { |
michael@0 | 4837 | tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); |
michael@0 | 4838 | count3 -= (uint32_t)coll->tertiaryBottomCount; |
michael@0 | 4839 | } |
michael@0 | 4840 | tertiaries.appendByte(tertiaryBottom + (count3-1)); |
michael@0 | 4841 | } |
michael@0 | 4842 | count3 = 0; |
michael@0 | 4843 | } |
michael@0 | 4844 | tertiaries.appendByte(tertiary); |
michael@0 | 4845 | } |
michael@0 | 4846 | } |
michael@0 | 4847 | |
michael@0 | 4848 | if(/*qShifted*/(compareQuad==0) && notIsContinuation) { |
michael@0 | 4849 | if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it |
michael@0 | 4850 | if(count4>0) { // Close this part |
michael@0 | 4851 | while (count4 > UCOL_BOT_COUNT4) { |
michael@0 | 4852 | quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); |
michael@0 | 4853 | count4 -= UCOL_BOT_COUNT4; |
michael@0 | 4854 | } |
michael@0 | 4855 | quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); |
michael@0 | 4856 | count4 = 0; |
michael@0 | 4857 | } |
michael@0 | 4858 | quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana |
michael@0 | 4859 | } else { // This wasn't Hiragana, so we can continue adding stuff |
michael@0 | 4860 | count4++; |
michael@0 | 4861 | } |
michael@0 | 4862 | } |
michael@0 | 4863 | } |
michael@0 | 4864 | } |
michael@0 | 4865 | |
michael@0 | 4866 | /* Here, we are generally done with processing */ |
michael@0 | 4867 | /* bailing out would not be too productive */ |
michael@0 | 4868 | |
michael@0 | 4869 | UBool ok = TRUE; /* cleared when a level buffer or the sink reports a failure, see the end of the function */ |
michael@0 | 4870 | if(U_SUCCESS(*status)) { |
michael@0 | 4871 | /* we have done all the CE's, now let's put them together to form a key */ |
michael@0 | 4872 | if(compareSec == 0) { |
michael@0 | 4873 | if (count2 > 0) { |
michael@0 | 4874 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 4875 | secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 4876 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 4877 | } |
michael@0 | 4878 | secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 4879 | } |
michael@0 | 4880 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 4881 | if(!secondaries.isOk()) { |
michael@0 | 4882 | ok = FALSE; |
michael@0 | 4883 | } else if(!isFrenchSec) { |
michael@0 | 4884 | secondaries.appendTo(result); |
michael@0 | 4885 | } else { |
michael@0 | 4886 | // If there are any unresolved continuation secondaries, |
michael@0 | 4887 | // reverse them here so that we can reverse the whole secondary thing. |
michael@0 | 4888 | if (lastSecondaryLength > 1) { |
michael@0 | 4889 | uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); |
michael@0 | 4890 | if (frenchStartPtr != NULL) { |
michael@0 | 4891 | /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ |
michael@0 | 4892 | uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; |
michael@0 | 4893 | uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); |
michael@0 | 4894 | } |
michael@0 | 4895 | } |
michael@0 | 4896 | packFrench(secondaries.data(), secondaries.length(), result); |
michael@0 | 4897 | } |
michael@0 | 4898 | } |
michael@0 | 4899 | |
michael@0 | 4900 | if(doCase) { |
michael@0 | 4901 | ok &= cases.isOk(); |
michael@0 | 4902 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 4903 | cases.appendTo(result); |
michael@0 | 4904 | } |
michael@0 | 4905 | |
michael@0 | 4906 | /* The quaternary and identical levels below are nested in this block, so they are only written when the tertiary level is. */
if(compareTer == 0) { |
michael@0 | 4907 | if (count3 > 0) { |
michael@0 | 4908 | if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { |
michael@0 | 4909 | while (count3 >= coll->tertiaryTopCount) { |
michael@0 | 4910 | tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); |
michael@0 | 4911 | count3 -= (uint32_t)coll->tertiaryTopCount; |
michael@0 | 4912 | } |
michael@0 | 4913 | tertiaries.appendByte(tertiaryTop - count3); |
michael@0 | 4914 | } else { |
michael@0 | 4915 | while (count3 > coll->tertiaryBottomCount) { |
michael@0 | 4916 | tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); |
michael@0 | 4917 | count3 -= (uint32_t)coll->tertiaryBottomCount; |
michael@0 | 4918 | } |
michael@0 | 4919 | tertiaries.appendByte(tertiaryBottom + (count3-1)); |
michael@0 | 4920 | } |
michael@0 | 4921 | } |
michael@0 | 4922 | ok &= tertiaries.isOk(); |
michael@0 | 4923 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 4924 | tertiaries.appendTo(result); |
michael@0 | 4925 | |
michael@0 | 4926 | if(compareQuad == 0/*qShifted == TRUE*/) { |
michael@0 | 4927 | if(count4 > 0) { |
michael@0 | 4928 | while (count4 > UCOL_BOT_COUNT4) { |
michael@0 | 4929 | quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); |
michael@0 | 4930 | count4 -= UCOL_BOT_COUNT4; |
michael@0 | 4931 | } |
michael@0 | 4932 | quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); |
michael@0 | 4933 | } |
michael@0 | 4934 | ok &= quads.isOk(); |
michael@0 | 4935 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 4936 | quads.appendTo(result); |
michael@0 | 4937 | } |
michael@0 | 4938 | |
michael@0 | 4939 | if(compareIdent) { |
michael@0 | 4940 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 4941 | u_writeIdenticalLevelRun(s.string, len, result); |
michael@0 | 4942 | } |
michael@0 | 4943 | } |
michael@0 | 4944 | result.Append(0); /* final byte of the key, written only when status is still a success */ |
michael@0 | 4945 | } |
michael@0 | 4946 | |
michael@0 | 4947 | /* To avoid memory leak, free the offset buffer if necessary. */ |
michael@0 | 4948 | ucol_freeOffsetBuffer(&s); |
michael@0 | 4949 | |
michael@0 | 4950 | ok &= result.IsOk(); |
michael@0 | 4951 | if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } |
michael@0 | 4952 | } |
michael@0 | 4953 | |
michael@0 | 4954 | |
michael@0 | 4955 | U_CFUNC void U_CALLCONV |
michael@0 | 4956 | ucol_calcSortKeySimpleTertiary(const UCollator *coll, |
michael@0 | 4957 | const UChar *source, |
michael@0 | 4958 | int32_t sourceLength, |
michael@0 | 4959 | SortKeyByteSink &result, |
michael@0 | 4960 | UErrorCode *status) |
michael@0 | 4961 | { |
michael@0 | 4962 | U_ALIGN_CODE(16); |
michael@0 | 4963 | |
michael@0 | 4964 | if(U_FAILURE(*status)) { |
michael@0 | 4965 | return; |
michael@0 | 4966 | } |
michael@0 | 4967 | |
michael@0 | 4968 | SortKeyByteSink &primaries = result; |
michael@0 | 4969 | SortKeyLevel secondaries; |
michael@0 | 4970 | SortKeyLevel tertiaries; |
michael@0 | 4971 | |
michael@0 | 4972 | UnicodeString normSource; |
michael@0 | 4973 | |
michael@0 | 4974 | int32_t len = sourceLength; |
michael@0 | 4975 | |
michael@0 | 4976 | /* If we need to normalize, we'll do it all at once at the beginning! */ |
michael@0 | 4977 | if(coll->normalizationMode != UCOL_OFF) { |
michael@0 | 4978 | normSource.setTo(len < 0, source, len); |
michael@0 | 4979 | const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); |
michael@0 | 4980 | int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); |
michael@0 | 4981 | if(qcYesLength != normSource.length()) { |
michael@0 | 4982 | UnicodeString unnormalized = normSource.tempSubString(qcYesLength); |
michael@0 | 4983 | normSource.truncate(qcYesLength); |
michael@0 | 4984 | norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); |
michael@0 | 4985 | source = normSource.getBuffer(); |
michael@0 | 4986 | len = normSource.length(); |
michael@0 | 4987 | } |
michael@0 | 4988 | } |
michael@0 | 4989 | collIterate s; |
michael@0 | 4990 | IInit_collIterate(coll, (UChar *)source, len, &s, status); |
michael@0 | 4991 | if(U_FAILURE(*status)) { |
michael@0 | 4992 | return; |
michael@0 | 4993 | } |
michael@0 | 4994 | s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. |
michael@0 | 4995 | |
michael@0 | 4996 | uint32_t order = 0; |
michael@0 | 4997 | |
michael@0 | 4998 | uint8_t primary1 = 0; |
michael@0 | 4999 | uint8_t primary2 = 0; |
michael@0 | 5000 | uint8_t secondary = 0; |
michael@0 | 5001 | uint8_t tertiary = 0; |
michael@0 | 5002 | uint8_t caseSwitch = coll->caseSwitch; |
michael@0 | 5003 | uint8_t tertiaryMask = coll->tertiaryMask; |
michael@0 | 5004 | int8_t tertiaryAddition = coll->tertiaryAddition; |
michael@0 | 5005 | uint8_t tertiaryTop = coll->tertiaryTop; |
michael@0 | 5006 | uint8_t tertiaryBottom = coll->tertiaryBottom; |
michael@0 | 5007 | uint8_t tertiaryCommon = coll->tertiaryCommon; |
michael@0 | 5008 | |
michael@0 | 5009 | UBool notIsContinuation = FALSE; |
michael@0 | 5010 | |
michael@0 | 5011 | uint32_t count2 = 0, count3 = 0; |
michael@0 | 5012 | uint8_t leadPrimary = 0; |
michael@0 | 5013 | |
michael@0 | 5014 | for(;;) { |
michael@0 | 5015 | order = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5016 | |
michael@0 | 5017 | if(order == 0) { |
michael@0 | 5018 | continue; |
michael@0 | 5019 | } |
michael@0 | 5020 | |
michael@0 | 5021 | if(order == UCOL_NO_MORE_CES) { |
michael@0 | 5022 | break; |
michael@0 | 5023 | } |
michael@0 | 5024 | |
michael@0 | 5025 | notIsContinuation = !isContinuation(order); |
michael@0 | 5026 | |
michael@0 | 5027 | if(notIsContinuation) { |
michael@0 | 5028 | tertiary = (uint8_t)((order & tertiaryMask)); |
michael@0 | 5029 | } else { |
michael@0 | 5030 | tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); |
michael@0 | 5031 | } |
michael@0 | 5032 | |
michael@0 | 5033 | secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 5034 | primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 5035 | primary1 = (uint8_t)(order >> 8); |
michael@0 | 5036 | |
michael@0 | 5037 | uint8_t originalPrimary1 = primary1; |
michael@0 | 5038 | if (coll->leadBytePermutationTable != NULL && notIsContinuation) { |
michael@0 | 5039 | primary1 = coll->leadBytePermutationTable[primary1]; |
michael@0 | 5040 | } |
michael@0 | 5041 | |
michael@0 | 5042 | /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ |
michael@0 | 5043 | /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ |
michael@0 | 5044 | /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ |
michael@0 | 5045 | /* regular and simple sortkey calc */ |
michael@0 | 5046 | if(primary1 != UCOL_IGNORABLE) { |
michael@0 | 5047 | if(notIsContinuation) { |
michael@0 | 5048 | if(leadPrimary == primary1) { |
michael@0 | 5049 | primaries.Append(primary2); |
michael@0 | 5050 | } else { |
michael@0 | 5051 | if(leadPrimary != 0) { |
michael@0 | 5052 | primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); |
michael@0 | 5053 | } |
michael@0 | 5054 | if(primary2 == UCOL_IGNORABLE) { |
michael@0 | 5055 | /* one byter, not compressed */ |
michael@0 | 5056 | primaries.Append(primary1); |
michael@0 | 5057 | leadPrimary = 0; |
michael@0 | 5058 | } else if(isCompressible(coll, originalPrimary1)) { |
michael@0 | 5059 | /* compress */ |
michael@0 | 5060 | primaries.Append(leadPrimary = primary1, primary2); |
michael@0 | 5061 | } else { |
michael@0 | 5062 | leadPrimary = 0; |
michael@0 | 5063 | primaries.Append(primary1, primary2); |
michael@0 | 5064 | } |
michael@0 | 5065 | } |
michael@0 | 5066 | } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ |
michael@0 | 5067 | if(primary2 == UCOL_IGNORABLE) { |
michael@0 | 5068 | primaries.Append(primary1); |
michael@0 | 5069 | } else { |
michael@0 | 5070 | primaries.Append(primary1, primary2); |
michael@0 | 5071 | } |
michael@0 | 5072 | } |
michael@0 | 5073 | } |
michael@0 | 5074 | |
michael@0 | 5075 | if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ |
michael@0 | 5076 | /* This is compression code. */ |
michael@0 | 5077 | if (secondary == UCOL_COMMON2 && notIsContinuation) { |
michael@0 | 5078 | ++count2; |
michael@0 | 5079 | } else { |
michael@0 | 5080 | if (count2 > 0) { |
michael@0 | 5081 | if (secondary > UCOL_COMMON2) { // not necessary for 4th level. |
michael@0 | 5082 | while (count2 > UCOL_TOP_COUNT2) { |
michael@0 | 5083 | secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); |
michael@0 | 5084 | count2 -= (uint32_t)UCOL_TOP_COUNT2; |
michael@0 | 5085 | } |
michael@0 | 5086 | secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); |
michael@0 | 5087 | } else { |
michael@0 | 5088 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 5089 | secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 5090 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 5091 | } |
michael@0 | 5092 | secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 5093 | } |
michael@0 | 5094 | count2 = 0; |
michael@0 | 5095 | } |
michael@0 | 5096 | secondaries.appendByte(secondary); |
michael@0 | 5097 | } |
michael@0 | 5098 | } |
michael@0 | 5099 | |
michael@0 | 5100 | if(notIsContinuation) { |
michael@0 | 5101 | tertiary ^= caseSwitch; |
michael@0 | 5102 | } |
michael@0 | 5103 | |
michael@0 | 5104 | if(tertiary > 0) { |
michael@0 | 5105 | /* This is compression code. */ |
michael@0 | 5106 | /* sequence size check is included in the if clause */ |
michael@0 | 5107 | if (tertiary == tertiaryCommon && notIsContinuation) { |
michael@0 | 5108 | ++count3; |
michael@0 | 5109 | } else { |
michael@0 | 5110 | if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { |
michael@0 | 5111 | tertiary += tertiaryAddition; |
michael@0 | 5112 | } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { |
michael@0 | 5113 | tertiary -= tertiaryAddition; |
michael@0 | 5114 | } |
michael@0 | 5115 | if (count3 > 0) { |
michael@0 | 5116 | if ((tertiary > tertiaryCommon)) { |
michael@0 | 5117 | while (count3 > coll->tertiaryTopCount) { |
michael@0 | 5118 | tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); |
michael@0 | 5119 | count3 -= (uint32_t)coll->tertiaryTopCount; |
michael@0 | 5120 | } |
michael@0 | 5121 | tertiaries.appendByte(tertiaryTop - (count3-1)); |
michael@0 | 5122 | } else { |
michael@0 | 5123 | while (count3 > coll->tertiaryBottomCount) { |
michael@0 | 5124 | tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); |
michael@0 | 5125 | count3 -= (uint32_t)coll->tertiaryBottomCount; |
michael@0 | 5126 | } |
michael@0 | 5127 | tertiaries.appendByte(tertiaryBottom + (count3-1)); |
michael@0 | 5128 | } |
michael@0 | 5129 | count3 = 0; |
michael@0 | 5130 | } |
michael@0 | 5131 | tertiaries.appendByte(tertiary); |
michael@0 | 5132 | } |
michael@0 | 5133 | } |
michael@0 | 5134 | } |
michael@0 | 5135 | |
michael@0 | 5136 | UBool ok = TRUE; |
michael@0 | 5137 | if(U_SUCCESS(*status)) { |
michael@0 | 5138 | /* we have done all the CE's, now let's put them together to form a key */ |
michael@0 | 5139 | if (count2 > 0) { |
michael@0 | 5140 | while (count2 > UCOL_BOT_COUNT2) { |
michael@0 | 5141 | secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); |
michael@0 | 5142 | count2 -= (uint32_t)UCOL_BOT_COUNT2; |
michael@0 | 5143 | } |
michael@0 | 5144 | secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); |
michael@0 | 5145 | } |
michael@0 | 5146 | ok &= secondaries.isOk(); |
michael@0 | 5147 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 5148 | secondaries.appendTo(result); |
michael@0 | 5149 | |
michael@0 | 5150 | if (count3 > 0) { |
michael@0 | 5151 | if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { |
michael@0 | 5152 | while (count3 >= coll->tertiaryTopCount) { |
michael@0 | 5153 | tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); |
michael@0 | 5154 | count3 -= (uint32_t)coll->tertiaryTopCount; |
michael@0 | 5155 | } |
michael@0 | 5156 | tertiaries.appendByte(tertiaryTop - count3); |
michael@0 | 5157 | } else { |
michael@0 | 5158 | while (count3 > coll->tertiaryBottomCount) { |
michael@0 | 5159 | tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); |
michael@0 | 5160 | count3 -= (uint32_t)coll->tertiaryBottomCount; |
michael@0 | 5161 | } |
michael@0 | 5162 | tertiaries.appendByte(tertiaryBottom + (count3-1)); |
michael@0 | 5163 | } |
michael@0 | 5164 | } |
michael@0 | 5165 | ok &= tertiaries.isOk(); |
michael@0 | 5166 | result.Append(UCOL_LEVELTERMINATOR); |
michael@0 | 5167 | tertiaries.appendTo(result); |
michael@0 | 5168 | |
michael@0 | 5169 | result.Append(0); |
michael@0 | 5170 | } |
michael@0 | 5171 | |
michael@0 | 5172 | /* To avoid memory leak, free the offset buffer if necessary. */ |
michael@0 | 5173 | ucol_freeOffsetBuffer(&s); |
michael@0 | 5174 | |
michael@0 | 5175 | ok &= result.IsOk(); |
michael@0 | 5176 | if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } |
michael@0 | 5177 | } |
michael@0 | 5178 | |
michael@0 | 5179 | static inline |
michael@0 | 5180 | UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { |
michael@0 | 5181 | UBool notIsContinuation = !isContinuation(CE); |
michael@0 | 5182 | uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); |
michael@0 | 5183 | if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) |
michael@0 | 5184 | || (!notIsContinuation && *wasShifted))) |
michael@0 | 5185 | || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ |
michael@0 | 5186 | { |
michael@0 | 5187 | // The stuff below should probably be in the sortkey code... maybe not... |
michael@0 | 5188 | if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ |
michael@0 | 5189 | /* we should just completely ignore it */ |
michael@0 | 5190 | *wasShifted = TRUE; |
michael@0 | 5191 | //continue; |
michael@0 | 5192 | } |
michael@0 | 5193 | //*wasShifted = TRUE; |
michael@0 | 5194 | return TRUE; |
michael@0 | 5195 | } else { |
michael@0 | 5196 | *wasShifted = FALSE; |
michael@0 | 5197 | return FALSE; |
michael@0 | 5198 | } |
michael@0 | 5199 | } |
michael@0 | 5200 | static inline |
michael@0 | 5201 | void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { |
michael@0 | 5202 | if(level < maxLevel) { |
michael@0 | 5203 | dest[i++] = UCOL_LEVELTERMINATOR; |
michael@0 | 5204 | } else { |
michael@0 | 5205 | dest[i++] = 0; |
michael@0 | 5206 | } |
michael@0 | 5207 | } |
michael@0 | 5208 | |
/**
 * Level identifiers for partial sort key generation.
 * The identifiers 0..7 fit into three bits.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** one past the last level identifier */
    UCOL_PSK_LIMIT      = 8
};
michael@0 | 5221 | |
/**
 * Layout of the collation state word kept in state[1].
 * For every field, shift the state right by *_SHIFT and then AND the result
 * with *_MASK to extract the field value.
 */
enum {
    /* Level identifier (one of the UCOL_PSK_* level values): three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /* Number of bytes of a primary or quaternary already written. Can only be
     * 0 or 1, since at most two bytes come from a primary or quaternary.
     * The same bit also records that the French secondary level is finished. */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /* Was the last value shifted? Boolean, 0 or 1. */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /* How many French bytes have already been written (two-bit field).
     * For French, secondary values have to be reversed while continuations
     * keep their order: abc1c2c3de has to become edc1c2c3ba. */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /* Bytes already used from a bocsu sequence (two-bit field). */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /* Count of consumed CEs; the mask is 19 bits wide. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
michael@0 | 5247 | |
// Macro calculating the number of expansion CEs available in a collIterate:
// the difference between its CEpos and toReturn members.
// The whole expansion is parenthesized so that the macro acts as a single
// operand when it is embedded in a larger expression, for example
// !uprv_numAvailableExpCEs(s) or 2 * uprv_numAvailableExpCEs(s).
// Note: the argument is evaluated twice, so it must not have side effects.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
michael@0 | 5250 | |
michael@0 | 5251 | |
michael@0 | 5252 | /** main sortkey part procedure. On the first call, |
michael@0 | 5253 | * you should pass in a collator, an iterator, empty state |
michael@0 | 5254 | * state[0] == state[1] == 0, a buffer to hold results |
michael@0 | 5255 | * number of bytes you need and an error code pointer. |
michael@0 | 5256 | * Make sure your buffer is big enough to hold the wanted |
michael@0 | 5257 | * number of sortkey bytes. I don't check. |
michael@0 | 5258 | * The only meaningful status you can get back is |
michael@0 | 5259 | * U_BUFFER_OVERFLOW_ERROR, which basically means that you |
michael@0 | 5260 | * have been dealt a raw deal and that you probably won't |
michael@0 | 5261 | * be able to use partial sortkey generation for this |
michael@0 | 5262 | * particular combination of string and collator. This |
michael@0 | 5263 | * is highly unlikely, but you should still check the error code. |
michael@0 | 5264 | * Any other status means that you're not in a sane situation |
michael@0 | 5265 | * anymore. After the first call, preserve state values and |
michael@0 | 5266 | * use them on subsequent calls to obtain more bytes of a sortkey. |
michael@0 | 5267 | * Use until the number of bytes written is smaller than the requested |
michael@0 | 5268 | * number of bytes. Generated sortkey is not compatible with the |
michael@0 | 5269 | * one generated by ucol_getSortKey, as we don't do any compression. |
michael@0 | 5270 | * However, levels are still terminated by a 1 (one) and the sortkey |
michael@0 | 5271 | * is terminated by a 0 (zero). Identical level is the same as in the |
michael@0 | 5272 | * regular sortkey - internal bocu-1 implementation is used. |
michael@0 | 5273 | * For the curious, although you cannot do much about this, here is |
michael@0 | 5274 | * the structure of state words. |
michael@0 | 5275 | * state[0] - iterator state. Depends on the iterator implementation, |
michael@0 | 5276 | * but allows the iterator to continue where it stopped in |
michael@0 | 5277 | * the last iteration. |
michael@0 | 5278 | * state[1] - collation processing state. Here is the distribution |
michael@0 | 5279 | * of the bits: |
michael@0 | 5280 | * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary |
michael@0 | 5281 | * quaternary, quin (we don't use this one), identical and |
michael@0 | 5282 | * null (producing only zeroes - first one to terminate the |
michael@0 | 5283 | * sortkey and subsequent to fill the buffer). |
michael@0 | 5284 | * 3 - byte count. Number of bytes written on the primary level. |
michael@0 | 5285 | * 4 - was shifted. Whether the previous iteration finished in the |
michael@0 | 5286 | * shifted state. |
michael@0 | 5287 | * 5, 6 - French continuation bytes written. See the comment in the enum |
michael@0 | 5288 | * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on |
michael@0 | 5289 | * the identical level. |
michael@0 | 5290 | * 9..31 - CEs consumed. Number of getCE or next32 operations performed |
michael@0 | 5291 | * since the last successful update of the iterator state. |
michael@0 | 5292 | */ |
michael@0 | 5293 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 5294 | ucol_nextSortKeyPart(const UCollator *coll, |
michael@0 | 5295 | UCharIterator *iter, |
michael@0 | 5296 | uint32_t state[2], |
michael@0 | 5297 | uint8_t *dest, int32_t count, |
michael@0 | 5298 | UErrorCode *status) |
michael@0 | 5299 | { |
michael@0 | 5300 | /* error checking */ |
michael@0 | 5301 | if(status==NULL || U_FAILURE(*status)) { |
michael@0 | 5302 | return 0; |
michael@0 | 5303 | } |
michael@0 | 5304 | UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); |
michael@0 | 5305 | if( coll==NULL || iter==NULL || |
michael@0 | 5306 | state==NULL || |
michael@0 | 5307 | count<0 || (count>0 && dest==NULL) |
michael@0 | 5308 | ) { |
michael@0 | 5309 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 5310 | UTRACE_EXIT_STATUS(status); |
michael@0 | 5311 | return 0; |
michael@0 | 5312 | } |
michael@0 | 5313 | |
michael@0 | 5314 | UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", |
michael@0 | 5315 | coll, iter, state[0], state[1], dest, count); |
michael@0 | 5316 | |
michael@0 | 5317 | if(count==0) { |
michael@0 | 5318 | /* nothing to do */ |
michael@0 | 5319 | UTRACE_EXIT_VALUE(0); |
michael@0 | 5320 | return 0; |
michael@0 | 5321 | } |
michael@0 | 5322 | /** Setting up situation according to the state we got from the previous iteration */ |
michael@0 | 5323 | // The state of the iterator from the previous invocation |
michael@0 | 5324 | uint32_t iterState = state[0]; |
michael@0 | 5325 | // Has the last iteration ended in the shifted state |
michael@0 | 5326 | UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; |
michael@0 | 5327 | // What is the current level of the sortkey? |
michael@0 | 5328 | int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; |
michael@0 | 5329 | // Have we written only one byte from a two byte primary in the previous iteration? |
michael@0 | 5330 | // Also on secondary level - have we finished with the French secondary? |
michael@0 | 5331 | int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; |
michael@0 | 5332 | // number of bytes in the continuation buffer for French |
michael@0 | 5333 | int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; |
michael@0 | 5334 | // Number of bytes already written from a bocsu sequence. Since |
michael@0 | 5335 | // the longes bocsu sequence is 4 long, this can be up to 3. |
michael@0 | 5336 | int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; |
michael@0 | 5337 | // Number of elements that need to be consumed in this iteration because |
michael@0 | 5338 | // the iterator returned UITER_NO_STATE at the end of the last iteration, |
michael@0 | 5339 | // so we had to save the last valid state. |
michael@0 | 5340 | int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; |
michael@0 | 5341 | |
michael@0 | 5342 | /** values that depend on the collator attributes */ |
michael@0 | 5343 | // strength of the collator. |
michael@0 | 5344 | int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); |
michael@0 | 5345 | // maximal level of the partial sortkey. Need to take whether case level is done |
michael@0 | 5346 | int32_t maxLevel = 0; |
michael@0 | 5347 | if(strength < UCOL_TERTIARY) { |
michael@0 | 5348 | if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { |
michael@0 | 5349 | maxLevel = UCOL_PSK_CASE; |
michael@0 | 5350 | } else { |
michael@0 | 5351 | maxLevel = strength; |
michael@0 | 5352 | } |
michael@0 | 5353 | } else { |
michael@0 | 5354 | if(strength == UCOL_TERTIARY) { |
michael@0 | 5355 | maxLevel = UCOL_PSK_TERTIARY; |
michael@0 | 5356 | } else if(strength == UCOL_QUATERNARY) { |
michael@0 | 5357 | maxLevel = UCOL_PSK_QUATERNARY; |
michael@0 | 5358 | } else { // identical |
michael@0 | 5359 | maxLevel = UCOL_IDENTICAL; |
michael@0 | 5360 | } |
michael@0 | 5361 | } |
michael@0 | 5362 | // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation |
michael@0 | 5363 | uint8_t UCOL_HIRAGANA_QUAD = |
michael@0 | 5364 | (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; |
michael@0 | 5365 | // Boundary value that decides whether a CE is shifted or not |
michael@0 | 5366 | uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; |
michael@0 | 5367 | // Are we doing French collation? |
michael@0 | 5368 | UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); |
michael@0 | 5369 | |
michael@0 | 5370 | /** initializing the collation state */ |
michael@0 | 5371 | UBool notIsContinuation = FALSE; |
michael@0 | 5372 | uint32_t CE = UCOL_NO_MORE_CES; |
michael@0 | 5373 | |
michael@0 | 5374 | collIterate s; |
michael@0 | 5375 | IInit_collIterate(coll, NULL, -1, &s, status); |
michael@0 | 5376 | if(U_FAILURE(*status)) { |
michael@0 | 5377 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5378 | return 0; |
michael@0 | 5379 | } |
michael@0 | 5380 | s.iterator = iter; |
michael@0 | 5381 | s.flags |= UCOL_USE_ITERATOR; |
michael@0 | 5382 | // This variable tells us whether we have produced some other levels in this iteration |
michael@0 | 5383 | // before we moved to the identical level. In that case, we need to switch the |
michael@0 | 5384 | // type of the iterator. |
michael@0 | 5385 | UBool doingIdenticalFromStart = FALSE; |
michael@0 | 5386 | // Normalizing iterator |
michael@0 | 5387 | // The division for the array length may truncate the array size to |
michael@0 | 5388 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
michael@0 | 5389 | // for all platforms anyway. |
michael@0 | 5390 | UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 5391 | UNormIterator *normIter = NULL; |
michael@0 | 5392 | // If the normalization is turned on for the collator and we are below identical level |
michael@0 | 5393 | // we will use a FCD normalizing iterator |
michael@0 | 5394 | if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { |
michael@0 | 5395 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); |
michael@0 | 5396 | s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); |
michael@0 | 5397 | s.flags &= ~UCOL_ITER_NORM; |
michael@0 | 5398 | if(U_FAILURE(*status)) { |
michael@0 | 5399 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5400 | return 0; |
michael@0 | 5401 | } |
michael@0 | 5402 | } else if(level == UCOL_PSK_IDENTICAL) { |
michael@0 | 5403 | // for identical level, we need a NFD iterator. We need to instantiate it here, since we |
michael@0 | 5404 | // will be updating the state - and this cannot be done on an ordinary iterator. |
michael@0 | 5405 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); |
michael@0 | 5406 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
michael@0 | 5407 | s.flags &= ~UCOL_ITER_NORM; |
michael@0 | 5408 | if(U_FAILURE(*status)) { |
michael@0 | 5409 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5410 | return 0; |
michael@0 | 5411 | } |
michael@0 | 5412 | doingIdenticalFromStart = TRUE; |
michael@0 | 5413 | } |
michael@0 | 5414 | |
michael@0 | 5415 | // This is the tentative new state of the iterator. The problem |
michael@0 | 5416 | // is that the iterator might return an undefined state, in |
michael@0 | 5417 | // which case we should save the last valid state and increase |
michael@0 | 5418 | // the iterator skip value. |
michael@0 | 5419 | uint32_t newState = 0; |
michael@0 | 5420 | |
michael@0 | 5421 | // First, we set the iterator to the last valid position |
michael@0 | 5422 | // from the last iteration. This was saved in state[0]. |
michael@0 | 5423 | if(iterState == 0) { |
michael@0 | 5424 | /* initial state */ |
michael@0 | 5425 | if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { |
michael@0 | 5426 | s.iterator->move(s.iterator, 0, UITER_LIMIT); |
michael@0 | 5427 | } else { |
michael@0 | 5428 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5429 | } |
michael@0 | 5430 | } else { |
michael@0 | 5431 | /* reset to previous state */ |
michael@0 | 5432 | s.iterator->setState(s.iterator, iterState, status); |
michael@0 | 5433 | if(U_FAILURE(*status)) { |
michael@0 | 5434 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5435 | return 0; |
michael@0 | 5436 | } |
michael@0 | 5437 | } |
michael@0 | 5438 | |
michael@0 | 5439 | |
michael@0 | 5440 | |
michael@0 | 5441 | // This variable tells us whether we can attempt to update the state |
michael@0 | 5442 | // of iterator. Situations where we don't want to update iterator state |
michael@0 | 5443 | // are the existence of expansion CEs that are not yet processed, and |
michael@0 | 5444 | // finishing the case level without enough space in the buffer to insert |
michael@0 | 5445 | // a level terminator. |
michael@0 | 5446 | UBool canUpdateState = TRUE; |
michael@0 | 5447 | |
michael@0 | 5448 | // Consume all the CEs that were consumed at the end of the previous |
michael@0 | 5449 | // iteration without updating the iterator state. On identical level, |
michael@0 | 5450 | // consume the code points. |
michael@0 | 5451 | int32_t counter = cces; |
michael@0 | 5452 | if(level < UCOL_PSK_IDENTICAL) { |
michael@0 | 5453 | while(counter-->0) { |
michael@0 | 5454 | // If we're doing French and we are on the secondary level, |
michael@0 | 5455 | // we go backwards. |
michael@0 | 5456 | if(level == UCOL_PSK_SECONDARY && doingFrench) { |
michael@0 | 5457 | CE = ucol_IGetPrevCE(coll, &s, status); |
michael@0 | 5458 | } else { |
michael@0 | 5459 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5460 | } |
michael@0 | 5461 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5462 | /* should not happen */ |
michael@0 | 5463 | *status=U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 5464 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5465 | return 0; |
michael@0 | 5466 | } |
michael@0 | 5467 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5468 | canUpdateState = FALSE; |
michael@0 | 5469 | } |
michael@0 | 5470 | } |
michael@0 | 5471 | } else { |
michael@0 | 5472 | while(counter-->0) { |
michael@0 | 5473 | uiter_next32(s.iterator); |
michael@0 | 5474 | } |
michael@0 | 5475 | } |
michael@0 | 5476 | |
michael@0 | 5477 | // French secondary needs to know whether the iterator state of zero came from previous level OR |
michael@0 | 5478 | // from a new invocation... |
michael@0 | 5479 | UBool wasDoingPrimary = FALSE; |
michael@0 | 5480 | // destination buffer byte counter. When this guy |
michael@0 | 5481 | // gets to count, we're done with the iteration |
michael@0 | 5482 | int32_t i = 0; |
michael@0 | 5483 | // used to count the zero bytes written after we |
michael@0 | 5484 | // have finished with the sort key |
michael@0 | 5485 | int32_t j = 0; |
michael@0 | 5486 | |
michael@0 | 5487 | |
michael@0 | 5488 | // Hm.... I think we're ready to plunge in. Basic story is as following: |
michael@0 | 5489 | // we have a fall through case based on level. This is used for initial |
michael@0 | 5490 | // positioning on iteration start. Every level processor contains a |
michael@0 | 5491 | // for(;;) which will be broken when we exhaust all the CEs. Other |
michael@0 | 5492 | // way to exit is a goto saveState, which happens when we have filled |
michael@0 | 5493 | // out our buffer. |
michael@0 | 5494 | switch(level) { |
michael@0 | 5495 | case UCOL_PSK_PRIMARY: |
michael@0 | 5496 | wasDoingPrimary = TRUE; |
michael@0 | 5497 | for(;;) { |
michael@0 | 5498 | if(i==count) { |
michael@0 | 5499 | goto saveState; |
michael@0 | 5500 | } |
michael@0 | 5501 | // We should save the state only if we |
michael@0 | 5502 | // are sure that we are done with the |
michael@0 | 5503 | // previous iterator state |
michael@0 | 5504 | if(canUpdateState && byteCountOrFrenchDone == 0) { |
michael@0 | 5505 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5506 | if(newState != UITER_NO_STATE) { |
michael@0 | 5507 | iterState = newState; |
michael@0 | 5508 | cces = 0; |
michael@0 | 5509 | } |
michael@0 | 5510 | } |
michael@0 | 5511 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5512 | cces++; |
michael@0 | 5513 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5514 | // Add the level separator |
michael@0 | 5515 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5516 | byteCountOrFrenchDone=0; |
michael@0 | 5517 | // Restart the iteration an move to the |
michael@0 | 5518 | // second level |
michael@0 | 5519 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5520 | cces = 0; |
michael@0 | 5521 | level = UCOL_PSK_SECONDARY; |
michael@0 | 5522 | break; |
michael@0 | 5523 | } |
michael@0 | 5524 | if(!isContinuation(CE)){ |
michael@0 | 5525 | if(coll->leadBytePermutationTable != NULL){ |
michael@0 | 5526 | CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); |
michael@0 | 5527 | } |
michael@0 | 5528 | } |
michael@0 | 5529 | if(!isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5530 | CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ |
michael@0 | 5531 | if(CE != 0) { |
michael@0 | 5532 | if(byteCountOrFrenchDone == 0) { |
michael@0 | 5533 | // get the second byte of primary |
michael@0 | 5534 | dest[i++]=(uint8_t)(CE >> 8); |
michael@0 | 5535 | } else { |
michael@0 | 5536 | byteCountOrFrenchDone = 0; |
michael@0 | 5537 | } |
michael@0 | 5538 | if((CE &=0xff)!=0) { |
michael@0 | 5539 | if(i==count) { |
michael@0 | 5540 | /* overflow */ |
michael@0 | 5541 | byteCountOrFrenchDone = 1; |
michael@0 | 5542 | cces--; |
michael@0 | 5543 | goto saveState; |
michael@0 | 5544 | } |
michael@0 | 5545 | dest[i++]=(uint8_t)CE; |
michael@0 | 5546 | } |
michael@0 | 5547 | } |
michael@0 | 5548 | } |
michael@0 | 5549 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5550 | canUpdateState = FALSE; |
michael@0 | 5551 | } else { |
michael@0 | 5552 | canUpdateState = TRUE; |
michael@0 | 5553 | } |
michael@0 | 5554 | } |
michael@0 | 5555 | /* fall through to next level */ |
michael@0 | 5556 | case UCOL_PSK_SECONDARY: |
michael@0 | 5557 | if(strength >= UCOL_SECONDARY) { |
michael@0 | 5558 | if(!doingFrench) { |
michael@0 | 5559 | for(;;) { |
michael@0 | 5560 | if(i == count) { |
michael@0 | 5561 | goto saveState; |
michael@0 | 5562 | } |
michael@0 | 5563 | // We should save the state only if we |
michael@0 | 5564 | // are sure that we are done with the |
michael@0 | 5565 | // previous iterator state |
michael@0 | 5566 | if(canUpdateState) { |
michael@0 | 5567 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5568 | if(newState != UITER_NO_STATE) { |
michael@0 | 5569 | iterState = newState; |
michael@0 | 5570 | cces = 0; |
michael@0 | 5571 | } |
michael@0 | 5572 | } |
michael@0 | 5573 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5574 | cces++; |
michael@0 | 5575 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5576 | // Add the level separator |
michael@0 | 5577 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5578 | byteCountOrFrenchDone = 0; |
michael@0 | 5579 | // Restart the iteration an move to the |
michael@0 | 5580 | // second level |
michael@0 | 5581 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5582 | cces = 0; |
michael@0 | 5583 | level = UCOL_PSK_CASE; |
michael@0 | 5584 | break; |
michael@0 | 5585 | } |
michael@0 | 5586 | if(!isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5587 | CE >>= 8; /* get secondary */ |
michael@0 | 5588 | if(CE != 0) { |
michael@0 | 5589 | dest[i++]=(uint8_t)CE; |
michael@0 | 5590 | } |
michael@0 | 5591 | } |
michael@0 | 5592 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5593 | canUpdateState = FALSE; |
michael@0 | 5594 | } else { |
michael@0 | 5595 | canUpdateState = TRUE; |
michael@0 | 5596 | } |
michael@0 | 5597 | } |
michael@0 | 5598 | } else { // French secondary processing |
michael@0 | 5599 | uint8_t frenchBuff[UCOL_MAX_BUFFER]; |
michael@0 | 5600 | int32_t frenchIndex = 0; |
michael@0 | 5601 | // Here we are going backwards. |
michael@0 | 5602 | // If the iterator is at the beggining, it should be |
michael@0 | 5603 | // moved to end. |
michael@0 | 5604 | if(wasDoingPrimary) { |
michael@0 | 5605 | s.iterator->move(s.iterator, 0, UITER_LIMIT); |
michael@0 | 5606 | cces = 0; |
michael@0 | 5607 | } |
michael@0 | 5608 | for(;;) { |
michael@0 | 5609 | if(i == count) { |
michael@0 | 5610 | goto saveState; |
michael@0 | 5611 | } |
michael@0 | 5612 | if(canUpdateState) { |
michael@0 | 5613 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5614 | if(newState != UITER_NO_STATE) { |
michael@0 | 5615 | iterState = newState; |
michael@0 | 5616 | cces = 0; |
michael@0 | 5617 | } |
michael@0 | 5618 | } |
michael@0 | 5619 | CE = ucol_IGetPrevCE(coll, &s, status); |
michael@0 | 5620 | cces++; |
michael@0 | 5621 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5622 | // Add the level separator |
michael@0 | 5623 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5624 | byteCountOrFrenchDone = 0; |
michael@0 | 5625 | // Restart the iteration an move to the next level |
michael@0 | 5626 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5627 | level = UCOL_PSK_CASE; |
michael@0 | 5628 | break; |
michael@0 | 5629 | } |
michael@0 | 5630 | if(isContinuation(CE)) { // if it's a continuation, we want to save it and |
michael@0 | 5631 | // reverse when we get a first non-continuation CE. |
michael@0 | 5632 | CE >>= 8; |
michael@0 | 5633 | frenchBuff[frenchIndex++] = (uint8_t)CE; |
michael@0 | 5634 | } else if(!isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5635 | CE >>= 8; /* get secondary */ |
michael@0 | 5636 | if(!frenchIndex) { |
michael@0 | 5637 | if(CE != 0) { |
michael@0 | 5638 | dest[i++]=(uint8_t)CE; |
michael@0 | 5639 | } |
michael@0 | 5640 | } else { |
michael@0 | 5641 | frenchBuff[frenchIndex++] = (uint8_t)CE; |
michael@0 | 5642 | frenchIndex -= usedFrench; |
michael@0 | 5643 | usedFrench = 0; |
michael@0 | 5644 | while(i < count && frenchIndex) { |
michael@0 | 5645 | dest[i++] = frenchBuff[--frenchIndex]; |
michael@0 | 5646 | usedFrench++; |
michael@0 | 5647 | } |
michael@0 | 5648 | } |
michael@0 | 5649 | } |
michael@0 | 5650 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5651 | canUpdateState = FALSE; |
michael@0 | 5652 | } else { |
michael@0 | 5653 | canUpdateState = TRUE; |
michael@0 | 5654 | } |
michael@0 | 5655 | } |
michael@0 | 5656 | } |
michael@0 | 5657 | } else { |
michael@0 | 5658 | level = UCOL_PSK_CASE; |
michael@0 | 5659 | } |
michael@0 | 5660 | /* fall through to next level */ |
michael@0 | 5661 | case UCOL_PSK_CASE: |
michael@0 | 5662 | if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { |
michael@0 | 5663 | uint32_t caseShift = UCOL_CASE_SHIFT_START; |
michael@0 | 5664 | uint8_t caseByte = UCOL_CASE_BYTE_START; |
michael@0 | 5665 | uint8_t caseBits = 0; |
michael@0 | 5666 | |
michael@0 | 5667 | for(;;) { |
michael@0 | 5668 | U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); |
michael@0 | 5669 | if(i == count) { |
michael@0 | 5670 | goto saveState; |
michael@0 | 5671 | } |
michael@0 | 5672 | // We should save the state only if we |
michael@0 | 5673 | // are sure that we are done with the |
michael@0 | 5674 | // previous iterator state |
michael@0 | 5675 | if(canUpdateState) { |
michael@0 | 5676 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5677 | if(newState != UITER_NO_STATE) { |
michael@0 | 5678 | iterState = newState; |
michael@0 | 5679 | cces = 0; |
michael@0 | 5680 | } |
michael@0 | 5681 | } |
michael@0 | 5682 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5683 | cces++; |
michael@0 | 5684 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5685 | // On the case level we might have an unfinished |
michael@0 | 5686 | // case byte. Add one if it's started. |
michael@0 | 5687 | if(caseShift != UCOL_CASE_SHIFT_START) { |
michael@0 | 5688 | dest[i++] = caseByte; |
michael@0 | 5689 | } |
michael@0 | 5690 | cces = 0; |
michael@0 | 5691 | // We have finished processing CEs on this level. |
michael@0 | 5692 | // However, we don't know if we have enough space |
michael@0 | 5693 | // to add a case level terminator. |
michael@0 | 5694 | if(i < count) { |
michael@0 | 5695 | // Add the level separator |
michael@0 | 5696 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5697 | // Restart the iteration and move to the |
michael@0 | 5698 | // next level |
michael@0 | 5699 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5700 | level = UCOL_PSK_TERTIARY; |
michael@0 | 5701 | } else { |
michael@0 | 5702 | canUpdateState = FALSE; |
michael@0 | 5703 | } |
michael@0 | 5704 | break; |
michael@0 | 5705 | } |
michael@0 | 5706 | |
michael@0 | 5707 | if(!isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5708 | if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { |
michael@0 | 5709 | // do the case level if we need to do it. We don't want to calculate |
michael@0 | 5710 | // case level for primary ignorables if we have only primary strength and case level |
michael@0 | 5711 | // otherwise we would break well formedness of CEs |
michael@0 | 5712 | CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); |
michael@0 | 5713 | caseBits = (uint8_t)(CE & 0xC0); |
michael@0 | 5714 | // this copies the case level logic from the |
michael@0 | 5715 | // sort key generation code |
michael@0 | 5716 | if(CE != 0) { |
michael@0 | 5717 | if (caseShift == 0) { |
michael@0 | 5718 | dest[i++] = caseByte; |
michael@0 | 5719 | caseShift = UCOL_CASE_SHIFT_START; |
michael@0 | 5720 | caseByte = UCOL_CASE_BYTE_START; |
michael@0 | 5721 | } |
michael@0 | 5722 | if(coll->caseFirst == UCOL_UPPER_FIRST) { |
michael@0 | 5723 | if((caseBits & 0xC0) == 0) { |
michael@0 | 5724 | caseByte |= 1 << (--caseShift); |
michael@0 | 5725 | } else { |
michael@0 | 5726 | caseByte |= 0 << (--caseShift); |
michael@0 | 5727 | /* second bit */ |
michael@0 | 5728 | if(caseShift == 0) { |
michael@0 | 5729 | dest[i++] = caseByte; |
michael@0 | 5730 | caseShift = UCOL_CASE_SHIFT_START; |
michael@0 | 5731 | caseByte = UCOL_CASE_BYTE_START; |
michael@0 | 5732 | } |
michael@0 | 5733 | caseByte |= ((caseBits>>6)&1) << (--caseShift); |
michael@0 | 5734 | } |
michael@0 | 5735 | } else { |
michael@0 | 5736 | if((caseBits & 0xC0) == 0) { |
michael@0 | 5737 | caseByte |= 0 << (--caseShift); |
michael@0 | 5738 | } else { |
michael@0 | 5739 | caseByte |= 1 << (--caseShift); |
michael@0 | 5740 | /* second bit */ |
michael@0 | 5741 | if(caseShift == 0) { |
michael@0 | 5742 | dest[i++] = caseByte; |
michael@0 | 5743 | caseShift = UCOL_CASE_SHIFT_START; |
michael@0 | 5744 | caseByte = UCOL_CASE_BYTE_START; |
michael@0 | 5745 | } |
michael@0 | 5746 | caseByte |= ((caseBits>>7)&1) << (--caseShift); |
michael@0 | 5747 | } |
michael@0 | 5748 | } |
michael@0 | 5749 | } |
michael@0 | 5750 | |
michael@0 | 5751 | } |
michael@0 | 5752 | } |
michael@0 | 5753 | // Not sure this is correct for the case level - revisit |
michael@0 | 5754 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5755 | canUpdateState = FALSE; |
michael@0 | 5756 | } else { |
michael@0 | 5757 | canUpdateState = TRUE; |
michael@0 | 5758 | } |
michael@0 | 5759 | } |
michael@0 | 5760 | } else { |
michael@0 | 5761 | level = UCOL_PSK_TERTIARY; |
michael@0 | 5762 | } |
michael@0 | 5763 | /* fall through to next level */ |
michael@0 | 5764 | case UCOL_PSK_TERTIARY: |
michael@0 | 5765 | if(strength >= UCOL_TERTIARY) { |
michael@0 | 5766 | for(;;) { |
michael@0 | 5767 | if(i == count) { |
michael@0 | 5768 | goto saveState; |
michael@0 | 5769 | } |
michael@0 | 5770 | // We should save the state only if we |
michael@0 | 5771 | // are sure that we are done with the |
michael@0 | 5772 | // previous iterator state |
michael@0 | 5773 | if(canUpdateState) { |
michael@0 | 5774 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5775 | if(newState != UITER_NO_STATE) { |
michael@0 | 5776 | iterState = newState; |
michael@0 | 5777 | cces = 0; |
michael@0 | 5778 | } |
michael@0 | 5779 | } |
michael@0 | 5780 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5781 | cces++; |
michael@0 | 5782 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5783 | // Add the level separator |
michael@0 | 5784 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5785 | byteCountOrFrenchDone = 0; |
michael@0 | 5786 | // Restart the iteration an move to the |
michael@0 | 5787 | // second level |
michael@0 | 5788 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5789 | cces = 0; |
michael@0 | 5790 | level = UCOL_PSK_QUATERNARY; |
michael@0 | 5791 | break; |
michael@0 | 5792 | } |
michael@0 | 5793 | if(!isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5794 | notIsContinuation = !isContinuation(CE); |
michael@0 | 5795 | |
michael@0 | 5796 | if(notIsContinuation) { |
michael@0 | 5797 | CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); |
michael@0 | 5798 | CE ^= coll->caseSwitch; |
michael@0 | 5799 | CE &= coll->tertiaryMask; |
michael@0 | 5800 | } else { |
michael@0 | 5801 | CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); |
michael@0 | 5802 | } |
michael@0 | 5803 | |
michael@0 | 5804 | if(CE != 0) { |
michael@0 | 5805 | dest[i++]=(uint8_t)CE; |
michael@0 | 5806 | } |
michael@0 | 5807 | } |
michael@0 | 5808 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5809 | canUpdateState = FALSE; |
michael@0 | 5810 | } else { |
michael@0 | 5811 | canUpdateState = TRUE; |
michael@0 | 5812 | } |
michael@0 | 5813 | } |
michael@0 | 5814 | } else { |
michael@0 | 5815 | // if we're not doing tertiary |
michael@0 | 5816 | // skip to the end |
michael@0 | 5817 | level = UCOL_PSK_NULL; |
michael@0 | 5818 | } |
michael@0 | 5819 | /* fall through to next level */ |
michael@0 | 5820 | case UCOL_PSK_QUATERNARY: |
michael@0 | 5821 | if(strength >= UCOL_QUATERNARY) { |
michael@0 | 5822 | for(;;) { |
michael@0 | 5823 | if(i == count) { |
michael@0 | 5824 | goto saveState; |
michael@0 | 5825 | } |
michael@0 | 5826 | // We should save the state only if we |
michael@0 | 5827 | // are sure that we are done with the |
michael@0 | 5828 | // previous iterator state |
michael@0 | 5829 | if(canUpdateState) { |
michael@0 | 5830 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5831 | if(newState != UITER_NO_STATE) { |
michael@0 | 5832 | iterState = newState; |
michael@0 | 5833 | cces = 0; |
michael@0 | 5834 | } |
michael@0 | 5835 | } |
michael@0 | 5836 | CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 5837 | cces++; |
michael@0 | 5838 | if(CE==UCOL_NO_MORE_CES) { |
michael@0 | 5839 | // Add the level separator |
michael@0 | 5840 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5841 | //dest[i++] = UCOL_LEVELTERMINATOR; |
michael@0 | 5842 | byteCountOrFrenchDone = 0; |
michael@0 | 5843 | // Restart the iteration an move to the |
michael@0 | 5844 | // second level |
michael@0 | 5845 | s.iterator->move(s.iterator, 0, UITER_START); |
michael@0 | 5846 | cces = 0; |
michael@0 | 5847 | level = UCOL_PSK_QUIN; |
michael@0 | 5848 | break; |
michael@0 | 5849 | } |
michael@0 | 5850 | if(CE==0) |
michael@0 | 5851 | continue; |
michael@0 | 5852 | if(isShiftedCE(CE, LVT, &wasShifted)) { |
michael@0 | 5853 | CE >>= 16; /* get primary */ |
michael@0 | 5854 | if(CE != 0) { |
michael@0 | 5855 | if(byteCountOrFrenchDone == 0) { |
michael@0 | 5856 | dest[i++]=(uint8_t)(CE >> 8); |
michael@0 | 5857 | } else { |
michael@0 | 5858 | byteCountOrFrenchDone = 0; |
michael@0 | 5859 | } |
michael@0 | 5860 | if((CE &=0xff)!=0) { |
michael@0 | 5861 | if(i==count) { |
michael@0 | 5862 | /* overflow */ |
michael@0 | 5863 | byteCountOrFrenchDone = 1; |
michael@0 | 5864 | goto saveState; |
michael@0 | 5865 | } |
michael@0 | 5866 | dest[i++]=(uint8_t)CE; |
michael@0 | 5867 | } |
michael@0 | 5868 | } |
michael@0 | 5869 | } else { |
michael@0 | 5870 | notIsContinuation = !isContinuation(CE); |
michael@0 | 5871 | if(notIsContinuation) { |
michael@0 | 5872 | if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it |
michael@0 | 5873 | dest[i++] = UCOL_HIRAGANA_QUAD; |
michael@0 | 5874 | } else { |
michael@0 | 5875 | dest[i++] = 0xFF; |
michael@0 | 5876 | } |
michael@0 | 5877 | } |
michael@0 | 5878 | } |
michael@0 | 5879 | if(uprv_numAvailableExpCEs(s)) { |
michael@0 | 5880 | canUpdateState = FALSE; |
michael@0 | 5881 | } else { |
michael@0 | 5882 | canUpdateState = TRUE; |
michael@0 | 5883 | } |
michael@0 | 5884 | } |
michael@0 | 5885 | } else { |
michael@0 | 5886 | // if we're not doing quaternary |
michael@0 | 5887 | // skip to the end |
michael@0 | 5888 | level = UCOL_PSK_NULL; |
michael@0 | 5889 | } |
michael@0 | 5890 | /* fall through to next level */ |
michael@0 | 5891 | case UCOL_PSK_QUIN: |
michael@0 | 5892 | level = UCOL_PSK_IDENTICAL; |
michael@0 | 5893 | /* fall through to next level */ |
michael@0 | 5894 | case UCOL_PSK_IDENTICAL: |
michael@0 | 5895 | if(strength >= UCOL_IDENTICAL) { |
michael@0 | 5896 | UChar32 first, second; |
michael@0 | 5897 | int32_t bocsuBytesWritten = 0; |
michael@0 | 5898 | // We always need to do identical on |
michael@0 | 5899 | // the NFD form of the string. |
michael@0 | 5900 | if(normIter == NULL) { |
michael@0 | 5901 | // we arrived from the level below and |
michael@0 | 5902 | // normalization was not turned on. |
michael@0 | 5903 | // therefore, we need to make a fresh NFD iterator |
michael@0 | 5904 | normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); |
michael@0 | 5905 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
michael@0 | 5906 | } else if(!doingIdenticalFromStart) { |
michael@0 | 5907 | // there is an iterator, but we did some other levels. |
michael@0 | 5908 | // therefore, we have a FCD iterator - need to make |
michael@0 | 5909 | // a NFD one. |
michael@0 | 5910 | // normIter being at the beginning does not guarantee |
michael@0 | 5911 | // that the underlying iterator is at the beginning |
michael@0 | 5912 | iter->move(iter, 0, UITER_START); |
michael@0 | 5913 | s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); |
michael@0 | 5914 | } |
michael@0 | 5915 | // At this point we have a NFD iterator that is positioned |
michael@0 | 5916 | // in the right place |
michael@0 | 5917 | if(U_FAILURE(*status)) { |
michael@0 | 5918 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5919 | return 0; |
michael@0 | 5920 | } |
michael@0 | 5921 | first = uiter_previous32(s.iterator); |
michael@0 | 5922 | // maybe we're at the start of the string |
michael@0 | 5923 | if(first == U_SENTINEL) { |
michael@0 | 5924 | first = 0; |
michael@0 | 5925 | } else { |
michael@0 | 5926 | uiter_next32(s.iterator); |
michael@0 | 5927 | } |
michael@0 | 5928 | |
michael@0 | 5929 | j = 0; |
michael@0 | 5930 | for(;;) { |
michael@0 | 5931 | if(i == count) { |
michael@0 | 5932 | if(j+1 < bocsuBytesWritten) { |
michael@0 | 5933 | bocsuBytesUsed = j+1; |
michael@0 | 5934 | } |
michael@0 | 5935 | goto saveState; |
michael@0 | 5936 | } |
michael@0 | 5937 | |
michael@0 | 5938 | // On identical level, we will always save |
michael@0 | 5939 | // the state if we reach this point, since |
michael@0 | 5940 | // we don't depend on getNextCE for content |
michael@0 | 5941 | // all the content is in our buffer and we |
michael@0 | 5942 | // already either stored the full buffer OR |
michael@0 | 5943 | // otherwise we won't arrive here. |
michael@0 | 5944 | newState = s.iterator->getState(s.iterator); |
michael@0 | 5945 | if(newState != UITER_NO_STATE) { |
michael@0 | 5946 | iterState = newState; |
michael@0 | 5947 | cces = 0; |
michael@0 | 5948 | } |
michael@0 | 5949 | |
michael@0 | 5950 | uint8_t buff[4]; |
michael@0 | 5951 | second = uiter_next32(s.iterator); |
michael@0 | 5952 | cces++; |
michael@0 | 5953 | |
michael@0 | 5954 | // end condition for identical level |
michael@0 | 5955 | if(second == U_SENTINEL) { |
michael@0 | 5956 | terminatePSKLevel(level, maxLevel, i, dest); |
michael@0 | 5957 | level = UCOL_PSK_NULL; |
michael@0 | 5958 | break; |
michael@0 | 5959 | } |
michael@0 | 5960 | bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); |
michael@0 | 5961 | first = second; |
michael@0 | 5962 | |
michael@0 | 5963 | j = 0; |
michael@0 | 5964 | if(bocsuBytesUsed != 0) { |
michael@0 | 5965 | while(bocsuBytesUsed-->0) { |
michael@0 | 5966 | j++; |
michael@0 | 5967 | } |
michael@0 | 5968 | } |
michael@0 | 5969 | |
michael@0 | 5970 | while(i < count && j < bocsuBytesWritten) { |
michael@0 | 5971 | dest[i++] = buff[j++]; |
michael@0 | 5972 | } |
michael@0 | 5973 | } |
michael@0 | 5974 | |
michael@0 | 5975 | } else { |
michael@0 | 5976 | level = UCOL_PSK_NULL; |
michael@0 | 5977 | } |
michael@0 | 5978 | /* fall through to next level */ |
michael@0 | 5979 | case UCOL_PSK_NULL: |
michael@0 | 5980 | j = i; |
michael@0 | 5981 | while(j<count) { |
michael@0 | 5982 | dest[j++]=0; |
michael@0 | 5983 | } |
michael@0 | 5984 | break; |
michael@0 | 5985 | default: |
michael@0 | 5986 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 5987 | UTRACE_EXIT_STATUS(*status); |
michael@0 | 5988 | return 0; |
michael@0 | 5989 | } |
michael@0 | 5990 | |
michael@0 | 5991 | saveState: |
michael@0 | 5992 | // Now we need to return stuff. First we want to see whether we have |
michael@0 | 5993 | // done everything for the current state of iterator. |
michael@0 | 5994 | if(byteCountOrFrenchDone |
michael@0 | 5995 | || canUpdateState == FALSE |
michael@0 | 5996 | || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) |
michael@0 | 5997 | { |
michael@0 | 5998 | // Any of above mean that the previous transaction |
michael@0 | 5999 | // wasn't finished and that we should store the |
michael@0 | 6000 | // previous iterator state. |
michael@0 | 6001 | state[0] = iterState; |
michael@0 | 6002 | } else { |
michael@0 | 6003 | // The transaction is complete. We will continue in the next iteration. |
michael@0 | 6004 | state[0] = s.iterator->getState(s.iterator); |
michael@0 | 6005 | cces = 0; |
michael@0 | 6006 | } |
michael@0 | 6007 | // Store the number of bocsu bytes written. |
michael@0 | 6008 | if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { |
michael@0 | 6009 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 6010 | } |
michael@0 | 6011 | state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; |
michael@0 | 6012 | |
michael@0 | 6013 | // Next we put in the level of comparison |
michael@0 | 6014 | state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); |
michael@0 | 6015 | |
michael@0 | 6016 | // If we are doing French, we need to store whether we have just finished the French level |
michael@0 | 6017 | if(level == UCOL_PSK_SECONDARY && doingFrench) { |
michael@0 | 6018 | state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); |
michael@0 | 6019 | } else { |
michael@0 | 6020 | state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); |
michael@0 | 6021 | } |
michael@0 | 6022 | |
michael@0 | 6023 | // Was the latest CE shifted |
michael@0 | 6024 | if(wasShifted) { |
michael@0 | 6025 | state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; |
michael@0 | 6026 | } |
michael@0 | 6027 | // Check for cces overflow |
michael@0 | 6028 | if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { |
michael@0 | 6029 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 6030 | } |
michael@0 | 6031 | // Store cces |
michael@0 | 6032 | state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); |
michael@0 | 6033 | |
michael@0 | 6034 | // Check for French overflow |
michael@0 | 6035 | if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { |
michael@0 | 6036 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 6037 | } |
michael@0 | 6038 | // Store number of bytes written in the French secondary continuation sequence |
michael@0 | 6039 | state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); |
michael@0 | 6040 | |
michael@0 | 6041 | |
michael@0 | 6042 | // If we have used normalizing iterator, get rid of it |
michael@0 | 6043 | if(normIter != NULL) { |
michael@0 | 6044 | unorm_closeIter(normIter); |
michael@0 | 6045 | } |
michael@0 | 6046 | |
michael@0 | 6047 | /* To avoid memory leak, free the offset buffer if necessary. */ |
michael@0 | 6048 | ucol_freeOffsetBuffer(&s); |
michael@0 | 6049 | |
michael@0 | 6050 | // Return number of meaningful sortkey bytes. |
michael@0 | 6051 | UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", |
michael@0 | 6052 | dest,i, state[0], state[1]); |
michael@0 | 6053 | UTRACE_EXIT_VALUE(i); |
michael@0 | 6054 | return i; |
michael@0 | 6055 | } |
michael@0 | 6056 | |
michael@0 | 6057 | /** |
michael@0 | 6058 | * Produce a bound for a given sortkey and a number of levels. |
michael@0 | 6059 | */ |
michael@0 | 6060 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 6061 | ucol_getBound(const uint8_t *source, |
michael@0 | 6062 | int32_t sourceLength, |
michael@0 | 6063 | UColBoundMode boundType, |
michael@0 | 6064 | uint32_t noOfLevels, |
michael@0 | 6065 | uint8_t *result, |
michael@0 | 6066 | int32_t resultLength, |
michael@0 | 6067 | UErrorCode *status) |
michael@0 | 6068 | { |
michael@0 | 6069 | // consistency checks |
michael@0 | 6070 | if(status == NULL || U_FAILURE(*status)) { |
michael@0 | 6071 | return 0; |
michael@0 | 6072 | } |
michael@0 | 6073 | if(source == NULL) { |
michael@0 | 6074 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6075 | return 0; |
michael@0 | 6076 | } |
michael@0 | 6077 | |
michael@0 | 6078 | int32_t sourceIndex = 0; |
michael@0 | 6079 | // Scan the string until we skip enough of the key OR reach the end of the key |
michael@0 | 6080 | do { |
michael@0 | 6081 | sourceIndex++; |
michael@0 | 6082 | if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { |
michael@0 | 6083 | noOfLevels--; |
michael@0 | 6084 | } |
michael@0 | 6085 | } while (noOfLevels > 0 |
michael@0 | 6086 | && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); |
michael@0 | 6087 | |
michael@0 | 6088 | if((source[sourceIndex] == 0 || sourceIndex == sourceLength) |
michael@0 | 6089 | && noOfLevels > 0) { |
michael@0 | 6090 | *status = U_SORT_KEY_TOO_SHORT_WARNING; |
michael@0 | 6091 | } |
michael@0 | 6092 | |
michael@0 | 6093 | |
michael@0 | 6094 | // READ ME: this code assumes that the values for boundType |
michael@0 | 6095 | // enum will not changes. They are set so that the enum value |
michael@0 | 6096 | // corresponds to the number of extra bytes each bound type |
michael@0 | 6097 | // needs. |
michael@0 | 6098 | if(result != NULL && resultLength >= sourceIndex+boundType) { |
michael@0 | 6099 | uprv_memcpy(result, source, sourceIndex); |
michael@0 | 6100 | switch(boundType) { |
michael@0 | 6101 | // Lower bound just gets terminated. No extra bytes |
michael@0 | 6102 | case UCOL_BOUND_LOWER: // = 0 |
michael@0 | 6103 | break; |
michael@0 | 6104 | // Upper bound needs one extra byte |
michael@0 | 6105 | case UCOL_BOUND_UPPER: // = 1 |
michael@0 | 6106 | result[sourceIndex++] = 2; |
michael@0 | 6107 | break; |
michael@0 | 6108 | // Upper long bound needs two extra bytes |
michael@0 | 6109 | case UCOL_BOUND_UPPER_LONG: // = 2 |
michael@0 | 6110 | result[sourceIndex++] = 0xFF; |
michael@0 | 6111 | result[sourceIndex++] = 0xFF; |
michael@0 | 6112 | break; |
michael@0 | 6113 | default: |
michael@0 | 6114 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6115 | return 0; |
michael@0 | 6116 | } |
michael@0 | 6117 | result[sourceIndex++] = 0; |
michael@0 | 6118 | |
michael@0 | 6119 | return sourceIndex; |
michael@0 | 6120 | } else { |
michael@0 | 6121 | return sourceIndex+boundType+1; |
michael@0 | 6122 | } |
michael@0 | 6123 | } |
michael@0 | 6124 | |
michael@0 | 6125 | /****************************************************************************/ |
michael@0 | 6126 | /* Following are the functions that deal with the properties of a collator */ |
michael@0 | 6127 | /* there are new APIs and some compatibility APIs */ |
michael@0 | 6128 | /****************************************************************************/ |
michael@0 | 6129 | |
michael@0 | 6130 | static inline void |
michael@0 | 6131 | ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, |
michael@0 | 6132 | int32_t *primShift, int32_t *secShift, int32_t *terShift) |
michael@0 | 6133 | { |
michael@0 | 6134 | uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; |
michael@0 | 6135 | UBool reverseSecondary = FALSE; |
michael@0 | 6136 | UBool continuation = isContinuation(CE); |
michael@0 | 6137 | if(!continuation) { |
michael@0 | 6138 | tertiary = (uint8_t)((CE & coll->tertiaryMask)); |
michael@0 | 6139 | tertiary ^= coll->caseSwitch; |
michael@0 | 6140 | reverseSecondary = TRUE; |
michael@0 | 6141 | } else { |
michael@0 | 6142 | tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); |
michael@0 | 6143 | tertiary &= UCOL_REMOVE_CASE; |
michael@0 | 6144 | reverseSecondary = FALSE; |
michael@0 | 6145 | } |
michael@0 | 6146 | |
michael@0 | 6147 | secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 6148 | primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); |
michael@0 | 6149 | primary1 = (uint8_t)(CE >> 8); |
michael@0 | 6150 | |
michael@0 | 6151 | if(primary1 != 0) { |
michael@0 | 6152 | if (coll->leadBytePermutationTable != NULL && !continuation) { |
michael@0 | 6153 | primary1 = coll->leadBytePermutationTable[primary1]; |
michael@0 | 6154 | } |
michael@0 | 6155 | |
michael@0 | 6156 | coll->latinOneCEs[ch] |= (primary1 << *primShift); |
michael@0 | 6157 | *primShift -= 8; |
michael@0 | 6158 | } |
michael@0 | 6159 | if(primary2 != 0) { |
michael@0 | 6160 | if(*primShift < 0) { |
michael@0 | 6161 | coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6162 | coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6163 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6164 | return; |
michael@0 | 6165 | } |
michael@0 | 6166 | coll->latinOneCEs[ch] |= (primary2 << *primShift); |
michael@0 | 6167 | *primShift -= 8; |
michael@0 | 6168 | } |
michael@0 | 6169 | if(secondary != 0) { |
michael@0 | 6170 | if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary |
michael@0 | 6171 | coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary |
michael@0 | 6172 | coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); |
michael@0 | 6173 | } else { // normal case |
michael@0 | 6174 | coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); |
michael@0 | 6175 | } |
michael@0 | 6176 | *secShift -= 8; |
michael@0 | 6177 | } |
michael@0 | 6178 | if(tertiary != 0) { |
michael@0 | 6179 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); |
michael@0 | 6180 | *terShift -= 8; |
michael@0 | 6181 | } |
michael@0 | 6182 | } |
michael@0 | 6183 | |
michael@0 | 6184 | static inline UBool |
michael@0 | 6185 | ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { |
michael@0 | 6186 | uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); |
michael@0 | 6187 | if(newTable == NULL) { |
michael@0 | 6188 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 6189 | coll->latinOneFailed = TRUE; |
michael@0 | 6190 | return FALSE; |
michael@0 | 6191 | } |
michael@0 | 6192 | int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); |
michael@0 | 6193 | uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); |
michael@0 | 6194 | uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); |
michael@0 | 6195 | uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); |
michael@0 | 6196 | uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); |
michael@0 | 6197 | coll->latinOneTableLen = size; |
michael@0 | 6198 | uprv_free(coll->latinOneCEs); |
michael@0 | 6199 | coll->latinOneCEs = newTable; |
michael@0 | 6200 | return TRUE; |
michael@0 | 6201 | } |
michael@0 | 6202 | |
michael@0 | 6203 | static UBool |
michael@0 | 6204 | ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { |
michael@0 | 6205 | UBool result = TRUE; |
michael@0 | 6206 | if(coll->latinOneCEs == NULL) { |
michael@0 | 6207 | coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); |
michael@0 | 6208 | if(coll->latinOneCEs == NULL) { |
michael@0 | 6209 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 6210 | return FALSE; |
michael@0 | 6211 | } |
michael@0 | 6212 | coll->latinOneTableLen = UCOL_LATINONETABLELEN; |
michael@0 | 6213 | } |
michael@0 | 6214 | UChar ch = 0; |
michael@0 | 6215 | UCollationElements *it = ucol_openElements(coll, &ch, 1, status); |
michael@0 | 6216 | // Check for null pointer |
michael@0 | 6217 | if (U_FAILURE(*status)) { |
michael@0 | 6218 | ucol_closeElements(it); |
michael@0 | 6219 | return FALSE; |
michael@0 | 6220 | } |
michael@0 | 6221 | uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); |
michael@0 | 6222 | |
michael@0 | 6223 | int32_t primShift = 24, secShift = 24, terShift = 24; |
michael@0 | 6224 | uint32_t CE = 0; |
michael@0 | 6225 | int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; |
michael@0 | 6226 | |
michael@0 | 6227 | // TODO: make safe if you get more than you wanted... |
michael@0 | 6228 | for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { |
michael@0 | 6229 | primShift = 24; secShift = 24; terShift = 24; |
michael@0 | 6230 | if(ch < 0x100) { |
michael@0 | 6231 | CE = coll->latinOneMapping[ch]; |
michael@0 | 6232 | } else { |
michael@0 | 6233 | CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
michael@0 | 6234 | if(CE == UCOL_NOT_FOUND && coll->UCA) { |
michael@0 | 6235 | CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
michael@0 | 6236 | } |
michael@0 | 6237 | } |
michael@0 | 6238 | if(CE < UCOL_NOT_FOUND) { |
michael@0 | 6239 | ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); |
michael@0 | 6240 | } else { |
michael@0 | 6241 | switch (getCETag(CE)) { |
michael@0 | 6242 | case EXPANSION_TAG: |
michael@0 | 6243 | case DIGIT_TAG: |
michael@0 | 6244 | ucol_setText(it, &ch, 1, status); |
michael@0 | 6245 | while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { |
michael@0 | 6246 | if(primShift < 0 || secShift < 0 || terShift < 0) { |
michael@0 | 6247 | coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6248 | coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6249 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; |
michael@0 | 6250 | break; |
michael@0 | 6251 | } |
michael@0 | 6252 | ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); |
michael@0 | 6253 | } |
michael@0 | 6254 | break; |
michael@0 | 6255 | case CONTRACTION_TAG: |
michael@0 | 6256 | // here is the trick |
michael@0 | 6257 | // F2 is contraction. We do something very similar to contractions |
michael@0 | 6258 | // but have two indices, one in the real contraction table and the |
michael@0 | 6259 | // other to where we stuffed things. This hopes that we don't have |
michael@0 | 6260 | // many contractions (this should work for latin-1 tables). |
michael@0 | 6261 | { |
michael@0 | 6262 | if((CE & 0x00FFF000) != 0) { |
michael@0 | 6263 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 6264 | goto cleanup_after_failure; |
michael@0 | 6265 | } |
michael@0 | 6266 | |
michael@0 | 6267 | const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
michael@0 | 6268 | |
michael@0 | 6269 | CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table |
michael@0 | 6270 | |
michael@0 | 6271 | coll->latinOneCEs[ch] = CE; |
michael@0 | 6272 | coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; |
michael@0 | 6273 | coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; |
michael@0 | 6274 | |
michael@0 | 6275 | // We're going to jump into contraction table, pick the elements |
michael@0 | 6276 | // and use them |
michael@0 | 6277 | do { |
michael@0 | 6278 | CE = *(coll->contractionCEs + |
michael@0 | 6279 | (UCharOffset - coll->contractionIndex)); |
michael@0 | 6280 | if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { |
michael@0 | 6281 | uint32_t size; |
michael@0 | 6282 | uint32_t i; /* general counter */ |
michael@0 | 6283 | uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ |
michael@0 | 6284 | size = getExpansionCount(CE); |
michael@0 | 6285 | //CE = *CEOffset++; |
michael@0 | 6286 | if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ |
michael@0 | 6287 | for(i = 0; i<size; i++) { |
michael@0 | 6288 | if(primShift < 0 || secShift < 0 || terShift < 0) { |
michael@0 | 6289 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6290 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6291 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6292 | break; |
michael@0 | 6293 | } |
michael@0 | 6294 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); |
michael@0 | 6295 | } |
michael@0 | 6296 | } else { /* else, we do */ |
michael@0 | 6297 | while(*CEOffset != 0) { |
michael@0 | 6298 | if(primShift < 0 || secShift < 0 || terShift < 0) { |
michael@0 | 6299 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6300 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6301 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6302 | break; |
michael@0 | 6303 | } |
michael@0 | 6304 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); |
michael@0 | 6305 | } |
michael@0 | 6306 | } |
michael@0 | 6307 | contractionOffset++; |
michael@0 | 6308 | } else if(CE < UCOL_NOT_FOUND) { |
michael@0 | 6309 | ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); |
michael@0 | 6310 | } else { |
michael@0 | 6311 | coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6312 | coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6313 | coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; |
michael@0 | 6314 | contractionOffset++; |
michael@0 | 6315 | } |
michael@0 | 6316 | UCharOffset++; |
michael@0 | 6317 | primShift = 24; secShift = 24; terShift = 24; |
michael@0 | 6318 | if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate |
michael@0 | 6319 | if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { |
michael@0 | 6320 | goto cleanup_after_failure; |
michael@0 | 6321 | } |
michael@0 | 6322 | } |
michael@0 | 6323 | } while(*UCharOffset != 0xFFFF); |
michael@0 | 6324 | } |
michael@0 | 6325 | break;; |
michael@0 | 6326 | case SPEC_PROC_TAG: |
michael@0 | 6327 | { |
michael@0 | 6328 | // 0xB7 is a precontext character defined in UCA5.1, a special |
michael@0 | 6329 | // handle is implemeted in order to save LatinOne table for |
michael@0 | 6330 | // most locales. |
michael@0 | 6331 | if (ch==0xb7) { |
michael@0 | 6332 | ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); |
michael@0 | 6333 | } |
michael@0 | 6334 | else { |
michael@0 | 6335 | goto cleanup_after_failure; |
michael@0 | 6336 | } |
michael@0 | 6337 | } |
michael@0 | 6338 | break; |
michael@0 | 6339 | default: |
michael@0 | 6340 | goto cleanup_after_failure; |
michael@0 | 6341 | } |
michael@0 | 6342 | } |
michael@0 | 6343 | } |
michael@0 | 6344 | // compact table |
michael@0 | 6345 | if(contractionOffset < coll->latinOneTableLen) { |
michael@0 | 6346 | if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { |
michael@0 | 6347 | goto cleanup_after_failure; |
michael@0 | 6348 | } |
michael@0 | 6349 | } |
michael@0 | 6350 | ucol_closeElements(it); |
michael@0 | 6351 | return result; |
michael@0 | 6352 | |
michael@0 | 6353 | cleanup_after_failure: |
michael@0 | 6354 | // status should already be set before arriving here. |
michael@0 | 6355 | coll->latinOneFailed = TRUE; |
michael@0 | 6356 | ucol_closeElements(it); |
michael@0 | 6357 | return FALSE; |
michael@0 | 6358 | } |
michael@0 | 6359 | |
michael@0 | 6360 | void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { |
michael@0 | 6361 | if(U_SUCCESS(*status)) { |
michael@0 | 6362 | if(coll->caseFirst == UCOL_UPPER_FIRST) { |
michael@0 | 6363 | coll->caseSwitch = UCOL_CASE_SWITCH; |
michael@0 | 6364 | } else { |
michael@0 | 6365 | coll->caseSwitch = UCOL_NO_CASE_SWITCH; |
michael@0 | 6366 | } |
michael@0 | 6367 | |
michael@0 | 6368 | if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { |
michael@0 | 6369 | coll->tertiaryMask = UCOL_REMOVE_CASE; |
michael@0 | 6370 | coll->tertiaryCommon = UCOL_COMMON3_NORMAL; |
michael@0 | 6371 | coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ |
michael@0 | 6372 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; |
michael@0 | 6373 | coll->tertiaryBottom = UCOL_COMMON_BOT3; |
michael@0 | 6374 | } else { |
michael@0 | 6375 | coll->tertiaryMask = UCOL_KEEP_CASE; |
michael@0 | 6376 | coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; |
michael@0 | 6377 | if(coll->caseFirst == UCOL_UPPER_FIRST) { |
michael@0 | 6378 | coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; |
michael@0 | 6379 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; |
michael@0 | 6380 | coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; |
michael@0 | 6381 | } else { |
michael@0 | 6382 | coll->tertiaryCommon = UCOL_COMMON3_NORMAL; |
michael@0 | 6383 | coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; |
michael@0 | 6384 | coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; |
michael@0 | 6385 | } |
michael@0 | 6386 | } |
michael@0 | 6387 | |
michael@0 | 6388 | /* Set the compression values */ |
michael@0 | 6389 | uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1); |
michael@0 | 6390 | coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ |
michael@0 | 6391 | coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); |
michael@0 | 6392 | |
michael@0 | 6393 | if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY |
michael@0 | 6394 | && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) |
michael@0 | 6395 | { |
michael@0 | 6396 | coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; |
michael@0 | 6397 | } else { |
michael@0 | 6398 | coll->sortKeyGen = ucol_calcSortKey; |
michael@0 | 6399 | } |
michael@0 | 6400 | if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF |
michael@0 | 6401 | && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) |
michael@0 | 6402 | { |
michael@0 | 6403 | if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { |
michael@0 | 6404 | if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it |
michael@0 | 6405 | //fprintf(stderr, "F"); |
michael@0 | 6406 | coll->latinOneUse = TRUE; |
michael@0 | 6407 | } else { |
michael@0 | 6408 | coll->latinOneUse = FALSE; |
michael@0 | 6409 | } |
michael@0 | 6410 | if(*status == U_UNSUPPORTED_ERROR) { |
michael@0 | 6411 | *status = U_ZERO_ERROR; |
michael@0 | 6412 | } |
michael@0 | 6413 | } else { // latin1Table exists and it doesn't need to be regenerated, just use it |
michael@0 | 6414 | coll->latinOneUse = TRUE; |
michael@0 | 6415 | } |
michael@0 | 6416 | } else { |
michael@0 | 6417 | coll->latinOneUse = FALSE; |
michael@0 | 6418 | } |
michael@0 | 6419 | } |
michael@0 | 6420 | } |
michael@0 | 6421 | |
michael@0 | 6422 | U_CAPI uint32_t U_EXPORT2 |
michael@0 | 6423 | ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { |
michael@0 | 6424 | if(U_FAILURE(*status) || coll == NULL) { |
michael@0 | 6425 | return 0; |
michael@0 | 6426 | } |
michael@0 | 6427 | if(len == -1) { |
michael@0 | 6428 | len = u_strlen(varTop); |
michael@0 | 6429 | } |
michael@0 | 6430 | if(len == 0) { |
michael@0 | 6431 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6432 | return 0; |
michael@0 | 6433 | } |
michael@0 | 6434 | |
michael@0 | 6435 | if(coll->delegate!=NULL) { |
michael@0 | 6436 | return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); |
michael@0 | 6437 | } |
michael@0 | 6438 | |
michael@0 | 6439 | |
michael@0 | 6440 | collIterate s; |
michael@0 | 6441 | IInit_collIterate(coll, varTop, len, &s, status); |
michael@0 | 6442 | if(U_FAILURE(*status)) { |
michael@0 | 6443 | return 0; |
michael@0 | 6444 | } |
michael@0 | 6445 | |
michael@0 | 6446 | uint32_t CE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 6447 | |
michael@0 | 6448 | /* here we check if we have consumed all characters */ |
michael@0 | 6449 | /* you can put in either one character or a contraction */ |
michael@0 | 6450 | /* you shouldn't put more... */ |
michael@0 | 6451 | if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { |
michael@0 | 6452 | *status = U_CE_NOT_FOUND_ERROR; |
michael@0 | 6453 | return 0; |
michael@0 | 6454 | } |
michael@0 | 6455 | |
michael@0 | 6456 | uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); |
michael@0 | 6457 | |
michael@0 | 6458 | if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { |
michael@0 | 6459 | *status = U_PRIMARY_TOO_LONG_ERROR; |
michael@0 | 6460 | return 0; |
michael@0 | 6461 | } |
michael@0 | 6462 | if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { |
michael@0 | 6463 | coll->variableTopValueisDefault = FALSE; |
michael@0 | 6464 | coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; |
michael@0 | 6465 | } |
michael@0 | 6466 | |
michael@0 | 6467 | /* To avoid memory leak, free the offset buffer if necessary. */ |
michael@0 | 6468 | ucol_freeOffsetBuffer(&s); |
michael@0 | 6469 | |
michael@0 | 6470 | return CE & UCOL_PRIMARYMASK; |
michael@0 | 6471 | } |
michael@0 | 6472 | |
michael@0 | 6473 | U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { |
michael@0 | 6474 | if(U_FAILURE(*status) || coll == NULL) { |
michael@0 | 6475 | return 0; |
michael@0 | 6476 | } |
michael@0 | 6477 | if(coll->delegate!=NULL) { |
michael@0 | 6478 | return ((const Collator*)coll->delegate)->getVariableTop(*status); |
michael@0 | 6479 | } |
michael@0 | 6480 | return coll->variableTopValue<<16; |
michael@0 | 6481 | } |
michael@0 | 6482 | |
michael@0 | 6483 | U_CAPI void U_EXPORT2 |
michael@0 | 6484 | ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { |
michael@0 | 6485 | if(U_FAILURE(*status) || coll == NULL) { |
michael@0 | 6486 | return; |
michael@0 | 6487 | } |
michael@0 | 6488 | |
michael@0 | 6489 | if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { |
michael@0 | 6490 | coll->variableTopValueisDefault = FALSE; |
michael@0 | 6491 | coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; |
michael@0 | 6492 | } |
michael@0 | 6493 | } |
michael@0 | 6494 | /* Attribute setter API */ |
michael@0 | 6495 | U_CAPI void U_EXPORT2 |
michael@0 | 6496 | ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { |
michael@0 | 6497 | if(U_FAILURE(*status) || coll == NULL) { |
michael@0 | 6498 | return; |
michael@0 | 6499 | } |
michael@0 | 6500 | |
michael@0 | 6501 | if(coll->delegate != NULL) { |
michael@0 | 6502 | ((Collator*)coll->delegate)->setAttribute(attr,value,*status); |
michael@0 | 6503 | return; |
michael@0 | 6504 | } |
michael@0 | 6505 | |
michael@0 | 6506 | UColAttributeValue oldFrench = coll->frenchCollation; |
michael@0 | 6507 | UColAttributeValue oldCaseFirst = coll->caseFirst; |
michael@0 | 6508 | switch(attr) { |
michael@0 | 6509 | case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ |
michael@0 | 6510 | if(value == UCOL_ON) { |
michael@0 | 6511 | coll->numericCollation = UCOL_ON; |
michael@0 | 6512 | coll->numericCollationisDefault = FALSE; |
michael@0 | 6513 | } else if (value == UCOL_OFF) { |
michael@0 | 6514 | coll->numericCollation = UCOL_OFF; |
michael@0 | 6515 | coll->numericCollationisDefault = FALSE; |
michael@0 | 6516 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6517 | coll->numericCollationisDefault = TRUE; |
michael@0 | 6518 | coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; |
michael@0 | 6519 | } else { |
michael@0 | 6520 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6521 | } |
michael@0 | 6522 | break; |
michael@0 | 6523 | case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ |
michael@0 | 6524 | if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { |
michael@0 | 6525 | // This attribute is an implementation detail of the CLDR Japanese tailoring. |
michael@0 | 6526 | // The implementation might change to use a different mechanism |
michael@0 | 6527 | // to achieve the same Japanese sort order. |
michael@0 | 6528 | // Since ICU 50, this attribute is not settable any more via API functions. |
michael@0 | 6529 | } else { |
michael@0 | 6530 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6531 | } |
michael@0 | 6532 | break; |
michael@0 | 6533 | case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ |
michael@0 | 6534 | if(value == UCOL_ON) { |
michael@0 | 6535 | coll->frenchCollation = UCOL_ON; |
michael@0 | 6536 | coll->frenchCollationisDefault = FALSE; |
michael@0 | 6537 | } else if (value == UCOL_OFF) { |
michael@0 | 6538 | coll->frenchCollation = UCOL_OFF; |
michael@0 | 6539 | coll->frenchCollationisDefault = FALSE; |
michael@0 | 6540 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6541 | coll->frenchCollationisDefault = TRUE; |
michael@0 | 6542 | coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; |
michael@0 | 6543 | } else { |
michael@0 | 6544 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6545 | } |
michael@0 | 6546 | break; |
michael@0 | 6547 | case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ |
michael@0 | 6548 | if(value == UCOL_SHIFTED) { |
michael@0 | 6549 | coll->alternateHandling = UCOL_SHIFTED; |
michael@0 | 6550 | coll->alternateHandlingisDefault = FALSE; |
michael@0 | 6551 | } else if (value == UCOL_NON_IGNORABLE) { |
michael@0 | 6552 | coll->alternateHandling = UCOL_NON_IGNORABLE; |
michael@0 | 6553 | coll->alternateHandlingisDefault = FALSE; |
michael@0 | 6554 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6555 | coll->alternateHandlingisDefault = TRUE; |
michael@0 | 6556 | coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; |
michael@0 | 6557 | } else { |
michael@0 | 6558 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6559 | } |
michael@0 | 6560 | break; |
michael@0 | 6561 | case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ |
michael@0 | 6562 | if(value == UCOL_LOWER_FIRST) { |
michael@0 | 6563 | coll->caseFirst = UCOL_LOWER_FIRST; |
michael@0 | 6564 | coll->caseFirstisDefault = FALSE; |
michael@0 | 6565 | } else if (value == UCOL_UPPER_FIRST) { |
michael@0 | 6566 | coll->caseFirst = UCOL_UPPER_FIRST; |
michael@0 | 6567 | coll->caseFirstisDefault = FALSE; |
michael@0 | 6568 | } else if (value == UCOL_OFF) { |
michael@0 | 6569 | coll->caseFirst = UCOL_OFF; |
michael@0 | 6570 | coll->caseFirstisDefault = FALSE; |
michael@0 | 6571 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6572 | coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; |
michael@0 | 6573 | coll->caseFirstisDefault = TRUE; |
michael@0 | 6574 | } else { |
michael@0 | 6575 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6576 | } |
michael@0 | 6577 | break; |
michael@0 | 6578 | case UCOL_CASE_LEVEL: /* do we have an extra case level */ |
michael@0 | 6579 | if(value == UCOL_ON) { |
michael@0 | 6580 | coll->caseLevel = UCOL_ON; |
michael@0 | 6581 | coll->caseLevelisDefault = FALSE; |
michael@0 | 6582 | } else if (value == UCOL_OFF) { |
michael@0 | 6583 | coll->caseLevel = UCOL_OFF; |
michael@0 | 6584 | coll->caseLevelisDefault = FALSE; |
michael@0 | 6585 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6586 | coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; |
michael@0 | 6587 | coll->caseLevelisDefault = TRUE; |
michael@0 | 6588 | } else { |
michael@0 | 6589 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6590 | } |
michael@0 | 6591 | break; |
michael@0 | 6592 | case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ |
michael@0 | 6593 | if(value == UCOL_ON) { |
michael@0 | 6594 | coll->normalizationMode = UCOL_ON; |
michael@0 | 6595 | coll->normalizationModeisDefault = FALSE; |
michael@0 | 6596 | initializeFCD(status); |
michael@0 | 6597 | } else if (value == UCOL_OFF) { |
michael@0 | 6598 | coll->normalizationMode = UCOL_OFF; |
michael@0 | 6599 | coll->normalizationModeisDefault = FALSE; |
michael@0 | 6600 | } else if (value == UCOL_DEFAULT) { |
michael@0 | 6601 | coll->normalizationModeisDefault = TRUE; |
michael@0 | 6602 | coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; |
michael@0 | 6603 | if(coll->normalizationMode == UCOL_ON) { |
michael@0 | 6604 | initializeFCD(status); |
michael@0 | 6605 | } |
michael@0 | 6606 | } else { |
michael@0 | 6607 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6608 | } |
michael@0 | 6609 | break; |
michael@0 | 6610 | case UCOL_STRENGTH: /* attribute for strength */ |
michael@0 | 6611 | if (value == UCOL_DEFAULT) { |
michael@0 | 6612 | coll->strengthisDefault = TRUE; |
michael@0 | 6613 | coll->strength = (UColAttributeValue)coll->options->strength; |
michael@0 | 6614 | } else if (value <= UCOL_IDENTICAL) { |
michael@0 | 6615 | coll->strengthisDefault = FALSE; |
michael@0 | 6616 | coll->strength = value; |
michael@0 | 6617 | } else { |
michael@0 | 6618 | *status = U_ILLEGAL_ARGUMENT_ERROR ; |
michael@0 | 6619 | } |
michael@0 | 6620 | break; |
michael@0 | 6621 | case UCOL_ATTRIBUTE_COUNT: |
michael@0 | 6622 | default: |
michael@0 | 6623 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6624 | break; |
michael@0 | 6625 | } |
michael@0 | 6626 | if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { |
michael@0 | 6627 | coll->latinOneRegenTable = TRUE; |
michael@0 | 6628 | } else { |
michael@0 | 6629 | coll->latinOneRegenTable = FALSE; |
michael@0 | 6630 | } |
michael@0 | 6631 | ucol_updateInternalState(coll, status); |
michael@0 | 6632 | } |
michael@0 | 6633 | |
michael@0 | 6634 | U_CAPI UColAttributeValue U_EXPORT2 |
michael@0 | 6635 | ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { |
michael@0 | 6636 | if(U_FAILURE(*status) || coll == NULL) { |
michael@0 | 6637 | return UCOL_DEFAULT; |
michael@0 | 6638 | } |
michael@0 | 6639 | |
michael@0 | 6640 | if(coll->delegate != NULL) { |
michael@0 | 6641 | return ((Collator*)coll->delegate)->getAttribute(attr,*status); |
michael@0 | 6642 | } |
michael@0 | 6643 | |
michael@0 | 6644 | switch(attr) { |
michael@0 | 6645 | case UCOL_NUMERIC_COLLATION: |
michael@0 | 6646 | return coll->numericCollation; |
michael@0 | 6647 | case UCOL_HIRAGANA_QUATERNARY_MODE: |
michael@0 | 6648 | return coll->hiraganaQ; |
michael@0 | 6649 | case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ |
michael@0 | 6650 | return coll->frenchCollation; |
michael@0 | 6651 | case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ |
michael@0 | 6652 | return coll->alternateHandling; |
michael@0 | 6653 | case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ |
michael@0 | 6654 | return coll->caseFirst; |
michael@0 | 6655 | case UCOL_CASE_LEVEL: /* do we have an extra case level */ |
michael@0 | 6656 | return coll->caseLevel; |
michael@0 | 6657 | case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ |
michael@0 | 6658 | return coll->normalizationMode; |
michael@0 | 6659 | case UCOL_STRENGTH: /* attribute for strength */ |
michael@0 | 6660 | return coll->strength; |
michael@0 | 6661 | case UCOL_ATTRIBUTE_COUNT: |
michael@0 | 6662 | default: |
michael@0 | 6663 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6664 | break; |
michael@0 | 6665 | } |
michael@0 | 6666 | return UCOL_DEFAULT; |
michael@0 | 6667 | } |
michael@0 | 6668 | |
michael@0 | 6669 | U_CAPI void U_EXPORT2 |
michael@0 | 6670 | ucol_setStrength( UCollator *coll, |
michael@0 | 6671 | UCollationStrength strength) |
michael@0 | 6672 | { |
michael@0 | 6673 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 6674 | ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); |
michael@0 | 6675 | } |
michael@0 | 6676 | |
michael@0 | 6677 | U_CAPI UCollationStrength U_EXPORT2 |
michael@0 | 6678 | ucol_getStrength(const UCollator *coll) |
michael@0 | 6679 | { |
michael@0 | 6680 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 6681 | return ucol_getAttribute(coll, UCOL_STRENGTH, &status); |
michael@0 | 6682 | } |
michael@0 | 6683 | |
michael@0 | 6684 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 6685 | ucol_getReorderCodes(const UCollator *coll, |
michael@0 | 6686 | int32_t *dest, |
michael@0 | 6687 | int32_t destCapacity, |
michael@0 | 6688 | UErrorCode *status) { |
michael@0 | 6689 | if (U_FAILURE(*status)) { |
michael@0 | 6690 | return 0; |
michael@0 | 6691 | } |
michael@0 | 6692 | |
michael@0 | 6693 | if(coll->delegate!=NULL) { |
michael@0 | 6694 | return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status); |
michael@0 | 6695 | } |
michael@0 | 6696 | |
michael@0 | 6697 | if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { |
michael@0 | 6698 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6699 | return 0; |
michael@0 | 6700 | } |
michael@0 | 6701 | |
michael@0 | 6702 | #ifdef UCOL_DEBUG |
michael@0 | 6703 | printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); |
michael@0 | 6704 | printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength); |
michael@0 | 6705 | #endif |
michael@0 | 6706 | |
michael@0 | 6707 | if (coll->reorderCodesLength > destCapacity) { |
michael@0 | 6708 | *status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 6709 | return coll->reorderCodesLength; |
michael@0 | 6710 | } |
michael@0 | 6711 | for (int32_t i = 0; i < coll->reorderCodesLength; i++) { |
michael@0 | 6712 | dest[i] = coll->reorderCodes[i]; |
michael@0 | 6713 | } |
michael@0 | 6714 | return coll->reorderCodesLength; |
michael@0 | 6715 | } |
michael@0 | 6716 | |
michael@0 | 6717 | U_CAPI void U_EXPORT2 |
michael@0 | 6718 | ucol_setReorderCodes(UCollator* coll, |
michael@0 | 6719 | const int32_t* reorderCodes, |
michael@0 | 6720 | int32_t reorderCodesLength, |
michael@0 | 6721 | UErrorCode *status) { |
michael@0 | 6722 | if (U_FAILURE(*status)) { |
michael@0 | 6723 | return; |
michael@0 | 6724 | } |
michael@0 | 6725 | |
michael@0 | 6726 | if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { |
michael@0 | 6727 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6728 | return; |
michael@0 | 6729 | } |
michael@0 | 6730 | |
michael@0 | 6731 | if(coll->delegate!=NULL) { |
michael@0 | 6732 | ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status); |
michael@0 | 6733 | return; |
michael@0 | 6734 | } |
michael@0 | 6735 | |
michael@0 | 6736 | if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { |
michael@0 | 6737 | uprv_free(coll->reorderCodes); |
michael@0 | 6738 | } |
michael@0 | 6739 | coll->reorderCodes = NULL; |
michael@0 | 6740 | coll->freeReorderCodesOnClose = FALSE; |
michael@0 | 6741 | coll->reorderCodesLength = 0; |
michael@0 | 6742 | if (reorderCodesLength == 0) { |
michael@0 | 6743 | if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { |
michael@0 | 6744 | uprv_free(coll->leadBytePermutationTable); |
michael@0 | 6745 | } |
michael@0 | 6746 | coll->leadBytePermutationTable = NULL; |
michael@0 | 6747 | coll->freeLeadBytePermutationTableOnClose = FALSE; |
michael@0 | 6748 | return; |
michael@0 | 6749 | } |
michael@0 | 6750 | coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); |
michael@0 | 6751 | if (coll->reorderCodes == NULL) { |
michael@0 | 6752 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 6753 | return; |
michael@0 | 6754 | } |
michael@0 | 6755 | coll->freeReorderCodesOnClose = TRUE; |
michael@0 | 6756 | for (int32_t i = 0; i < reorderCodesLength; i++) { |
michael@0 | 6757 | coll->reorderCodes[i] = reorderCodes[i]; |
michael@0 | 6758 | } |
michael@0 | 6759 | coll->reorderCodesLength = reorderCodesLength; |
michael@0 | 6760 | ucol_buildPermutationTable(coll, status); |
michael@0 | 6761 | } |
michael@0 | 6762 | |
michael@0 | 6763 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 6764 | ucol_getEquivalentReorderCodes(int32_t reorderCode, |
michael@0 | 6765 | int32_t* dest, |
michael@0 | 6766 | int32_t destCapacity, |
michael@0 | 6767 | UErrorCode *pErrorCode) { |
michael@0 | 6768 | bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; |
michael@0 | 6769 | uint16_t leadBytes[256]; |
michael@0 | 6770 | int leadBytesCount; |
michael@0 | 6771 | int leadByteIndex; |
michael@0 | 6772 | int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; |
michael@0 | 6773 | int reorderCodesForLeadByteCount; |
michael@0 | 6774 | int reorderCodeIndex; |
michael@0 | 6775 | |
michael@0 | 6776 | int32_t equivalentCodesCount = 0; |
michael@0 | 6777 | int setIndex; |
michael@0 | 6778 | |
michael@0 | 6779 | if (U_FAILURE(*pErrorCode)) { |
michael@0 | 6780 | return 0; |
michael@0 | 6781 | } |
michael@0 | 6782 | |
michael@0 | 6783 | if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { |
michael@0 | 6784 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 6785 | return 0; |
michael@0 | 6786 | } |
michael@0 | 6787 | |
michael@0 | 6788 | uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); |
michael@0 | 6789 | |
michael@0 | 6790 | const UCollator* uca = ucol_initUCA(pErrorCode); |
michael@0 | 6791 | if (U_FAILURE(*pErrorCode)) { |
michael@0 | 6792 | return 0; |
michael@0 | 6793 | } |
michael@0 | 6794 | leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256); |
michael@0 | 6795 | for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { |
michael@0 | 6796 | reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( |
michael@0 | 6797 | uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT); |
michael@0 | 6798 | for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) { |
michael@0 | 6799 | equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true; |
michael@0 | 6800 | } |
michael@0 | 6801 | } |
michael@0 | 6802 | |
michael@0 | 6803 | for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { |
michael@0 | 6804 | if (equivalentCodesSet[setIndex] == true) { |
michael@0 | 6805 | equivalentCodesCount++; |
michael@0 | 6806 | } |
michael@0 | 6807 | } |
michael@0 | 6808 | |
michael@0 | 6809 | if (destCapacity == 0) { |
michael@0 | 6810 | return equivalentCodesCount; |
michael@0 | 6811 | } |
michael@0 | 6812 | |
michael@0 | 6813 | equivalentCodesCount = 0; |
michael@0 | 6814 | for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { |
michael@0 | 6815 | if (equivalentCodesSet[setIndex] == true) { |
michael@0 | 6816 | dest[equivalentCodesCount++] = setIndex; |
michael@0 | 6817 | if (equivalentCodesCount >= destCapacity) { |
michael@0 | 6818 | break; |
michael@0 | 6819 | } |
michael@0 | 6820 | } |
michael@0 | 6821 | } |
michael@0 | 6822 | return equivalentCodesCount; |
michael@0 | 6823 | } |
michael@0 | 6824 | |
michael@0 | 6825 | |
michael@0 | 6826 | /****************************************************************************/ |
michael@0 | 6827 | /* Following are misc functions */ |
michael@0 | 6828 | /* there are new APIs and some compatibility APIs */ |
michael@0 | 6829 | /****************************************************************************/ |
michael@0 | 6830 | |
michael@0 | 6831 | U_CAPI void U_EXPORT2 |
michael@0 | 6832 | ucol_getVersion(const UCollator* coll, |
michael@0 | 6833 | UVersionInfo versionInfo) |
michael@0 | 6834 | { |
michael@0 | 6835 | if(coll->delegate!=NULL) { |
michael@0 | 6836 | ((const Collator*)coll->delegate)->getVersion(versionInfo); |
michael@0 | 6837 | return; |
michael@0 | 6838 | } |
michael@0 | 6839 | /* RunTime version */ |
michael@0 | 6840 | uint8_t rtVersion = UCOL_RUNTIME_VERSION; |
michael@0 | 6841 | /* Builder version*/ |
michael@0 | 6842 | uint8_t bdVersion = coll->image->version[0]; |
michael@0 | 6843 | |
michael@0 | 6844 | /* Charset Version. Need to get the version from cnv files |
michael@0 | 6845 | * makeconv should populate cnv files with version and |
michael@0 | 6846 | * an api has to be provided in ucnv.h to obtain this version |
michael@0 | 6847 | */ |
michael@0 | 6848 | uint8_t csVersion = 0; |
michael@0 | 6849 | |
michael@0 | 6850 | /* combine the version info */ |
michael@0 | 6851 | uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); |
michael@0 | 6852 | |
michael@0 | 6853 | /* Tailoring rules */ |
michael@0 | 6854 | versionInfo[0] = (uint8_t)(cmbVersion>>8); |
michael@0 | 6855 | versionInfo[1] = (uint8_t)cmbVersion; |
michael@0 | 6856 | versionInfo[2] = coll->image->version[1]; |
michael@0 | 6857 | if(coll->UCA) { |
michael@0 | 6858 | /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ |
michael@0 | 6859 | versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); |
michael@0 | 6860 | } else { |
michael@0 | 6861 | versionInfo[3] = 0; |
michael@0 | 6862 | } |
michael@0 | 6863 | } |
michael@0 | 6864 | |
michael@0 | 6865 | |
michael@0 | 6866 | /* This internal API checks whether a character is tailored or not */ |
michael@0 | 6867 | U_CAPI UBool U_EXPORT2 |
michael@0 | 6868 | ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { |
michael@0 | 6869 | if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { |
michael@0 | 6870 | return FALSE; |
michael@0 | 6871 | } |
michael@0 | 6872 | |
michael@0 | 6873 | uint32_t CE = UCOL_NOT_FOUND; |
michael@0 | 6874 | const UChar *ContractionStart = NULL; |
michael@0 | 6875 | if(u < 0x100) { /* latin-1 */ |
michael@0 | 6876 | CE = coll->latinOneMapping[u]; |
michael@0 | 6877 | if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { |
michael@0 | 6878 | return FALSE; |
michael@0 | 6879 | } |
michael@0 | 6880 | } else { /* regular */ |
michael@0 | 6881 | CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); |
michael@0 | 6882 | } |
michael@0 | 6883 | |
michael@0 | 6884 | if(isContraction(CE)) { |
michael@0 | 6885 | ContractionStart = (UChar *)coll->image+getContractOffset(CE); |
michael@0 | 6886 | CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); |
michael@0 | 6887 | } |
michael@0 | 6888 | |
michael@0 | 6889 | return (UBool)(CE != UCOL_NOT_FOUND); |
michael@0 | 6890 | } |
michael@0 | 6891 | |
michael@0 | 6892 | |
michael@0 | 6893 | /****************************************************************************/ |
michael@0 | 6894 | /* Following are the string compare functions */ |
michael@0 | 6895 | /* */ |
michael@0 | 6896 | /****************************************************************************/ |
michael@0 | 6897 | |
michael@0 | 6898 | |
michael@0 | 6899 | /* ucol_checkIdent internal function. Does byte level string compare. */ |
michael@0 | 6900 | /* Used by strcoll if strength == identical and strings */ |
michael@0 | 6901 | /* are otherwise equal. */ |
michael@0 | 6902 | /* */ |
michael@0 | 6903 | /* Comparison must be done on NFD normalized strings. */ |
michael@0 | 6904 | /* FCD is not good enough. */ |
michael@0 | 6905 | |
michael@0 | 6906 | static |
michael@0 | 6907 | UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) |
michael@0 | 6908 | { |
michael@0 | 6909 | // When we arrive here, we can have normal strings or UCharIterators. Currently they are both |
michael@0 | 6910 | // of same type, but that doesn't really mean that it will stay that way. |
michael@0 | 6911 | int32_t comparison; |
michael@0 | 6912 | |
michael@0 | 6913 | if (sColl->flags & UCOL_USE_ITERATOR) { |
michael@0 | 6914 | // The division for the array length may truncate the array size to |
michael@0 | 6915 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
michael@0 | 6916 | // for all platforms anyway. |
michael@0 | 6917 | UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 6918 | UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 6919 | UNormIterator *sNIt = NULL, *tNIt = NULL; |
michael@0 | 6920 | sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); |
michael@0 | 6921 | tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); |
michael@0 | 6922 | sColl->iterator->move(sColl->iterator, 0, UITER_START); |
michael@0 | 6923 | tColl->iterator->move(tColl->iterator, 0, UITER_START); |
michael@0 | 6924 | UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); |
michael@0 | 6925 | UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); |
michael@0 | 6926 | comparison = u_strCompareIter(sIt, tIt, TRUE); |
michael@0 | 6927 | unorm_closeIter(sNIt); |
michael@0 | 6928 | unorm_closeIter(tNIt); |
michael@0 | 6929 | } else { |
michael@0 | 6930 | int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; |
michael@0 | 6931 | const UChar *sBuf = sColl->string; |
michael@0 | 6932 | int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1; |
michael@0 | 6933 | const UChar *tBuf = tColl->string; |
michael@0 | 6934 | |
michael@0 | 6935 | if (normalize) { |
michael@0 | 6936 | *status = U_ZERO_ERROR; |
michael@0 | 6937 | // Note: We could use Normalizer::compare() or similar, but for short strings |
michael@0 | 6938 | // which may not be in FCD it might be faster to just NFD them. |
michael@0 | 6939 | // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than |
michael@0 | 6940 | // NFD'ing immediately might be faster for long strings, |
michael@0 | 6941 | // but string comparison is usually done on relatively short strings. |
michael@0 | 6942 | sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), |
michael@0 | 6943 | sColl->writableBuffer, |
michael@0 | 6944 | *status); |
michael@0 | 6945 | tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), |
michael@0 | 6946 | tColl->writableBuffer, |
michael@0 | 6947 | *status); |
michael@0 | 6948 | if(U_FAILURE(*status)) { |
michael@0 | 6949 | return UCOL_LESS; |
michael@0 | 6950 | } |
michael@0 | 6951 | comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); |
michael@0 | 6952 | } else { |
michael@0 | 6953 | comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); |
michael@0 | 6954 | } |
michael@0 | 6955 | } |
michael@0 | 6956 | |
michael@0 | 6957 | if (comparison < 0) { |
michael@0 | 6958 | return UCOL_LESS; |
michael@0 | 6959 | } else if (comparison == 0) { |
michael@0 | 6960 | return UCOL_EQUAL; |
michael@0 | 6961 | } else /* comparison > 0 */ { |
michael@0 | 6962 | return UCOL_GREATER; |
michael@0 | 6963 | } |
michael@0 | 6964 | } |
michael@0 | 6965 | |
michael@0 | 6966 | /* CEBuf - A struct and some inline functions to handle the saving */ |
michael@0 | 6967 | /* of CEs in a buffer within ucol_strcoll */ |
michael@0 | 6968 | |
michael@0 | 6969 | #define UCOL_CEBUF_SIZE 512 |
michael@0 | 6970 | typedef struct ucol_CEBuf { |
michael@0 | 6971 | uint32_t *buf; |
michael@0 | 6972 | uint32_t *endp; |
michael@0 | 6973 | uint32_t *pos; |
michael@0 | 6974 | uint32_t localArray[UCOL_CEBUF_SIZE]; |
michael@0 | 6975 | } ucol_CEBuf; |
michael@0 | 6976 | |
michael@0 | 6977 | |
michael@0 | 6978 | static |
michael@0 | 6979 | inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { |
michael@0 | 6980 | (b)->buf = (b)->pos = (b)->localArray; |
michael@0 | 6981 | (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; |
michael@0 | 6982 | } |
michael@0 | 6983 | |
michael@0 | 6984 | static |
michael@0 | 6985 | void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { |
michael@0 | 6986 | uint32_t oldSize; |
michael@0 | 6987 | uint32_t newSize; |
michael@0 | 6988 | uint32_t *newBuf; |
michael@0 | 6989 | |
michael@0 | 6990 | ci->flags |= UCOL_ITER_ALLOCATED; |
michael@0 | 6991 | oldSize = (uint32_t)(b->pos - b->buf); |
michael@0 | 6992 | newSize = oldSize * 2; |
michael@0 | 6993 | newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); |
michael@0 | 6994 | if(newBuf == NULL) { |
michael@0 | 6995 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 6996 | } |
michael@0 | 6997 | else { |
michael@0 | 6998 | uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); |
michael@0 | 6999 | if (b->buf != b->localArray) { |
michael@0 | 7000 | uprv_free(b->buf); |
michael@0 | 7001 | } |
michael@0 | 7002 | b->buf = newBuf; |
michael@0 | 7003 | b->endp = b->buf + newSize; |
michael@0 | 7004 | b->pos = b->buf + oldSize; |
michael@0 | 7005 | } |
michael@0 | 7006 | } |
michael@0 | 7007 | |
michael@0 | 7008 | static |
michael@0 | 7009 | inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { |
michael@0 | 7010 | if (b->pos == b->endp) { |
michael@0 | 7011 | ucol_CEBuf_Expand(b, ci, status); |
michael@0 | 7012 | } |
michael@0 | 7013 | if (U_SUCCESS(*status)) { |
michael@0 | 7014 | *(b)->pos++ = ce; |
michael@0 | 7015 | } |
michael@0 | 7016 | } |
michael@0 | 7017 | |
michael@0 | 7018 | /* This is a trick string compare function that goes in and uses sortkeys to compare */ |
michael@0 | 7019 | /* It is used when compare gets in trouble and needs to bail out */ |
michael@0 | 7020 | static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, |
michael@0 | 7021 | collIterate *tColl, |
michael@0 | 7022 | UErrorCode *status) |
michael@0 | 7023 | { |
michael@0 | 7024 | uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; |
michael@0 | 7025 | uint8_t *sourceKeyP = sourceKey; |
michael@0 | 7026 | uint8_t *targetKeyP = targetKey; |
michael@0 | 7027 | int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; |
michael@0 | 7028 | const UCollator *coll = sColl->coll; |
michael@0 | 7029 | const UChar *source = NULL; |
michael@0 | 7030 | const UChar *target = NULL; |
michael@0 | 7031 | int32_t result = UCOL_EQUAL; |
michael@0 | 7032 | UnicodeString sourceString, targetString; |
michael@0 | 7033 | int32_t sourceLength; |
michael@0 | 7034 | int32_t targetLength; |
michael@0 | 7035 | |
michael@0 | 7036 | if(sColl->flags & UCOL_USE_ITERATOR) { |
michael@0 | 7037 | sColl->iterator->move(sColl->iterator, 0, UITER_START); |
michael@0 | 7038 | tColl->iterator->move(tColl->iterator, 0, UITER_START); |
michael@0 | 7039 | UChar32 c; |
michael@0 | 7040 | while((c=sColl->iterator->next(sColl->iterator))>=0) { |
michael@0 | 7041 | sourceString.append((UChar)c); |
michael@0 | 7042 | } |
michael@0 | 7043 | while((c=tColl->iterator->next(tColl->iterator))>=0) { |
michael@0 | 7044 | targetString.append((UChar)c); |
michael@0 | 7045 | } |
michael@0 | 7046 | source = sourceString.getBuffer(); |
michael@0 | 7047 | sourceLength = sourceString.length(); |
michael@0 | 7048 | target = targetString.getBuffer(); |
michael@0 | 7049 | targetLength = targetString.length(); |
michael@0 | 7050 | } else { // no iterators |
michael@0 | 7051 | sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; |
michael@0 | 7052 | targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; |
michael@0 | 7053 | source = sColl->string; |
michael@0 | 7054 | target = tColl->string; |
michael@0 | 7055 | } |
michael@0 | 7056 | |
michael@0 | 7057 | |
michael@0 | 7058 | |
michael@0 | 7059 | sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); |
michael@0 | 7060 | if(sourceKeyLen > UCOL_MAX_BUFFER) { |
michael@0 | 7061 | sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); |
michael@0 | 7062 | if(sourceKeyP == NULL) { |
michael@0 | 7063 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 7064 | goto cleanup_and_do_compare; |
michael@0 | 7065 | } |
michael@0 | 7066 | sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); |
michael@0 | 7067 | } |
michael@0 | 7068 | |
michael@0 | 7069 | targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); |
michael@0 | 7070 | if(targetKeyLen > UCOL_MAX_BUFFER) { |
michael@0 | 7071 | targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); |
michael@0 | 7072 | if(targetKeyP == NULL) { |
michael@0 | 7073 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 7074 | goto cleanup_and_do_compare; |
michael@0 | 7075 | } |
michael@0 | 7076 | targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); |
michael@0 | 7077 | } |
michael@0 | 7078 | |
michael@0 | 7079 | result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); |
michael@0 | 7080 | |
michael@0 | 7081 | cleanup_and_do_compare: |
michael@0 | 7082 | if(sourceKeyP != NULL && sourceKeyP != sourceKey) { |
michael@0 | 7083 | uprv_free(sourceKeyP); |
michael@0 | 7084 | } |
michael@0 | 7085 | |
michael@0 | 7086 | if(targetKeyP != NULL && targetKeyP != targetKey) { |
michael@0 | 7087 | uprv_free(targetKeyP); |
michael@0 | 7088 | } |
michael@0 | 7089 | |
michael@0 | 7090 | if(result<0) { |
michael@0 | 7091 | return UCOL_LESS; |
michael@0 | 7092 | } else if(result>0) { |
michael@0 | 7093 | return UCOL_GREATER; |
michael@0 | 7094 | } else { |
michael@0 | 7095 | return UCOL_EQUAL; |
michael@0 | 7096 | } |
michael@0 | 7097 | } |
michael@0 | 7098 | |
michael@0 | 7099 | |
michael@0 | 7100 | static UCollationResult |
michael@0 | 7101 | ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) |
michael@0 | 7102 | { |
michael@0 | 7103 | U_ALIGN_CODE(16); |
michael@0 | 7104 | |
michael@0 | 7105 | const UCollator *coll = sColl->coll; |
michael@0 | 7106 | |
michael@0 | 7107 | |
michael@0 | 7108 | // setting up the collator parameters |
michael@0 | 7109 | UColAttributeValue strength = coll->strength; |
michael@0 | 7110 | UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); |
michael@0 | 7111 | |
michael@0 | 7112 | UBool checkSecTer = initialCheckSecTer; |
michael@0 | 7113 | UBool checkTertiary = (strength >= UCOL_TERTIARY); |
michael@0 | 7114 | UBool checkQuad = (strength >= UCOL_QUATERNARY); |
michael@0 | 7115 | UBool checkIdent = (strength == UCOL_IDENTICAL); |
michael@0 | 7116 | UBool checkCase = (coll->caseLevel == UCOL_ON); |
michael@0 | 7117 | UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; |
michael@0 | 7118 | UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); |
michael@0 | 7119 | UBool qShifted = shifted && checkQuad; |
michael@0 | 7120 | UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; |
michael@0 | 7121 | |
michael@0 | 7122 | if(doHiragana && shifted) { |
michael@0 | 7123 | return (ucol_compareUsingSortKeys(sColl, tColl, status)); |
michael@0 | 7124 | } |
michael@0 | 7125 | uint8_t caseSwitch = coll->caseSwitch; |
michael@0 | 7126 | uint8_t tertiaryMask = coll->tertiaryMask; |
michael@0 | 7127 | |
michael@0 | 7128 | // This is the lowest primary value that will not be ignored if shifted |
michael@0 | 7129 | uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; |
michael@0 | 7130 | |
michael@0 | 7131 | UCollationResult result = UCOL_EQUAL; |
michael@0 | 7132 | UCollationResult hirResult = UCOL_EQUAL; |
michael@0 | 7133 | |
michael@0 | 7134 | // Preparing the CE buffers. They will be filled during the primary phase |
michael@0 | 7135 | ucol_CEBuf sCEs; |
michael@0 | 7136 | ucol_CEBuf tCEs; |
michael@0 | 7137 | UCOL_INIT_CEBUF(&sCEs); |
michael@0 | 7138 | UCOL_INIT_CEBUF(&tCEs); |
michael@0 | 7139 | |
michael@0 | 7140 | uint32_t secS = 0, secT = 0; |
michael@0 | 7141 | uint32_t sOrder=0, tOrder=0; |
michael@0 | 7142 | |
michael@0 | 7143 | // Non shifted primary processing is quite simple |
michael@0 | 7144 | if(!shifted) { |
michael@0 | 7145 | for(;;) { |
michael@0 | 7146 | // We fetch CEs until we hit a non ignorable primary or end. |
michael@0 | 7147 | uint32_t sPrimary; |
michael@0 | 7148 | do { |
michael@0 | 7149 | // We get the next CE |
michael@0 | 7150 | sOrder = ucol_IGetNextCE(coll, sColl, status); |
michael@0 | 7151 | // Stuff it in the buffer |
michael@0 | 7152 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7153 | // And keep just the primary part. |
michael@0 | 7154 | sPrimary = sOrder & UCOL_PRIMARYMASK; |
michael@0 | 7155 | } while(sPrimary == 0); |
michael@0 | 7156 | |
michael@0 | 7157 | // see the comments on the above block |
michael@0 | 7158 | uint32_t tPrimary; |
michael@0 | 7159 | do { |
michael@0 | 7160 | tOrder = ucol_IGetNextCE(coll, tColl, status); |
michael@0 | 7161 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7162 | tPrimary = tOrder & UCOL_PRIMARYMASK; |
michael@0 | 7163 | } while(tPrimary == 0); |
michael@0 | 7164 | |
michael@0 | 7165 | // if both primaries are the same |
michael@0 | 7166 | if(sPrimary == tPrimary) { |
michael@0 | 7167 | // and there are no more CEs, we advance to the next level |
michael@0 | 7168 | if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) { |
michael@0 | 7169 | break; |
michael@0 | 7170 | } |
michael@0 | 7171 | if(doHiragana && hirResult == UCOL_EQUAL) { |
michael@0 | 7172 | if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { |
michael@0 | 7173 | hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) |
michael@0 | 7174 | ? UCOL_LESS:UCOL_GREATER; |
michael@0 | 7175 | } |
michael@0 | 7176 | } |
michael@0 | 7177 | } else { |
michael@0 | 7178 | // only need to check one for continuation |
michael@0 | 7179 | // if one is then the other must be or the preceding CE would be a prefix of the other |
michael@0 | 7180 | if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) { |
michael@0 | 7181 | sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF); |
michael@0 | 7182 | tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF); |
michael@0 | 7183 | } |
michael@0 | 7184 | // if two primaries are different, we are done |
michael@0 | 7185 | result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER; |
michael@0 | 7186 | goto commonReturn; |
michael@0 | 7187 | } |
michael@0 | 7188 | } // no primary difference... do the rest from the buffers |
michael@0 | 7189 | } else { // shifted - do a slightly more complicated processing :) |
michael@0 | 7190 | for(;;) { |
michael@0 | 7191 | UBool sInShifted = FALSE; |
michael@0 | 7192 | UBool tInShifted = FALSE; |
michael@0 | 7193 | // This version of code can be refactored. However, it seems easier to understand this way. |
michael@0 | 7194 | // Source loop. Same as the target loop. |
michael@0 | 7195 | for(;;) { |
michael@0 | 7196 | sOrder = ucol_IGetNextCE(coll, sColl, status); |
michael@0 | 7197 | if(sOrder == UCOL_NO_MORE_CES) { |
michael@0 | 7198 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7199 | break; |
michael@0 | 7200 | } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { |
michael@0 | 7201 | /* UCA amendment - ignore ignorables that follow shifted code points */ |
michael@0 | 7202 | continue; |
michael@0 | 7203 | } else if(isContinuation(sOrder)) { |
michael@0 | 7204 | if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ |
michael@0 | 7205 | if(sInShifted) { |
michael@0 | 7206 | sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ |
michael@0 | 7207 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7208 | continue; |
michael@0 | 7209 | } else { |
michael@0 | 7210 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7211 | break; |
michael@0 | 7212 | } |
michael@0 | 7213 | } else { /* Just lower level values */ |
michael@0 | 7214 | if(sInShifted) { |
michael@0 | 7215 | continue; |
michael@0 | 7216 | } else { |
michael@0 | 7217 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7218 | continue; |
michael@0 | 7219 | } |
michael@0 | 7220 | } |
michael@0 | 7221 | } else { /* regular */ |
michael@0 | 7222 | if(coll->leadBytePermutationTable != NULL){ |
michael@0 | 7223 | sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); |
michael@0 | 7224 | } |
michael@0 | 7225 | if((sOrder & UCOL_PRIMARYMASK) > LVT) { |
michael@0 | 7226 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7227 | break; |
michael@0 | 7228 | } else { |
michael@0 | 7229 | if((sOrder & UCOL_PRIMARYMASK) > 0) { |
michael@0 | 7230 | sInShifted = TRUE; |
michael@0 | 7231 | sOrder &= UCOL_PRIMARYMASK; |
michael@0 | 7232 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7233 | continue; |
michael@0 | 7234 | } else { |
michael@0 | 7235 | UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); |
michael@0 | 7236 | sInShifted = FALSE; |
michael@0 | 7237 | continue; |
michael@0 | 7238 | } |
michael@0 | 7239 | } |
michael@0 | 7240 | } |
michael@0 | 7241 | } |
michael@0 | 7242 | sOrder &= UCOL_PRIMARYMASK; |
michael@0 | 7243 | sInShifted = FALSE; |
michael@0 | 7244 | |
michael@0 | 7245 | for(;;) { |
michael@0 | 7246 | tOrder = ucol_IGetNextCE(coll, tColl, status); |
michael@0 | 7247 | if(tOrder == UCOL_NO_MORE_CES) { |
michael@0 | 7248 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7249 | break; |
michael@0 | 7250 | } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { |
michael@0 | 7251 | /* UCA amendment - ignore ignorables that follow shifted code points */ |
michael@0 | 7252 | continue; |
michael@0 | 7253 | } else if(isContinuation(tOrder)) { |
michael@0 | 7254 | if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ |
michael@0 | 7255 | if(tInShifted) { |
michael@0 | 7256 | tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ |
michael@0 | 7257 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7258 | continue; |
michael@0 | 7259 | } else { |
michael@0 | 7260 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7261 | break; |
michael@0 | 7262 | } |
michael@0 | 7263 | } else { /* Just lower level values */ |
michael@0 | 7264 | if(tInShifted) { |
michael@0 | 7265 | continue; |
michael@0 | 7266 | } else { |
michael@0 | 7267 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7268 | continue; |
michael@0 | 7269 | } |
michael@0 | 7270 | } |
michael@0 | 7271 | } else { /* regular */ |
michael@0 | 7272 | if(coll->leadBytePermutationTable != NULL){ |
michael@0 | 7273 | tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); |
michael@0 | 7274 | } |
michael@0 | 7275 | if((tOrder & UCOL_PRIMARYMASK) > LVT) { |
michael@0 | 7276 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7277 | break; |
michael@0 | 7278 | } else { |
michael@0 | 7279 | if((tOrder & UCOL_PRIMARYMASK) > 0) { |
michael@0 | 7280 | tInShifted = TRUE; |
michael@0 | 7281 | tOrder &= UCOL_PRIMARYMASK; |
michael@0 | 7282 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7283 | continue; |
michael@0 | 7284 | } else { |
michael@0 | 7285 | UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); |
michael@0 | 7286 | tInShifted = FALSE; |
michael@0 | 7287 | continue; |
michael@0 | 7288 | } |
michael@0 | 7289 | } |
michael@0 | 7290 | } |
michael@0 | 7291 | } |
michael@0 | 7292 | tOrder &= UCOL_PRIMARYMASK; |
michael@0 | 7293 | tInShifted = FALSE; |
michael@0 | 7294 | |
michael@0 | 7295 | if(sOrder == tOrder) { |
michael@0 | 7296 | /* |
michael@0 | 7297 | if(doHiragana && hirResult == UCOL_EQUAL) { |
michael@0 | 7298 | if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { |
michael@0 | 7299 | hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) |
michael@0 | 7300 | ? UCOL_LESS:UCOL_GREATER; |
michael@0 | 7301 | } |
michael@0 | 7302 | } |
michael@0 | 7303 | */ |
michael@0 | 7304 | if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { |
michael@0 | 7305 | break; |
michael@0 | 7306 | } else { |
michael@0 | 7307 | sOrder = 0; |
michael@0 | 7308 | tOrder = 0; |
michael@0 | 7309 | continue; |
michael@0 | 7310 | } |
michael@0 | 7311 | } else { |
michael@0 | 7312 | result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; |
michael@0 | 7313 | goto commonReturn; |
michael@0 | 7314 | } |
michael@0 | 7315 | } /* no primary difference... do the rest from the buffers */ |
michael@0 | 7316 | } |
michael@0 | 7317 | |
michael@0 | 7318 | /* now, we're gonna reexamine collected CEs */ |
michael@0 | 7319 | uint32_t *sCE; |
michael@0 | 7320 | uint32_t *tCE; |
michael@0 | 7321 | |
michael@0 | 7322 | /* This is the secondary level of comparison */ |
michael@0 | 7323 | if(checkSecTer) { |
michael@0 | 7324 | if(!isFrenchSec) { /* normal */ |
michael@0 | 7325 | sCE = sCEs.buf; |
michael@0 | 7326 | tCE = tCEs.buf; |
michael@0 | 7327 | for(;;) { |
michael@0 | 7328 | while (secS == 0) { |
michael@0 | 7329 | secS = *(sCE++) & UCOL_SECONDARYMASK; |
michael@0 | 7330 | } |
michael@0 | 7331 | |
michael@0 | 7332 | while(secT == 0) { |
michael@0 | 7333 | secT = *(tCE++) & UCOL_SECONDARYMASK; |
michael@0 | 7334 | } |
michael@0 | 7335 | |
michael@0 | 7336 | if(secS == secT) { |
michael@0 | 7337 | if(secS == UCOL_NO_MORE_CES_SECONDARY) { |
michael@0 | 7338 | break; |
michael@0 | 7339 | } else { |
michael@0 | 7340 | secS = 0; secT = 0; |
michael@0 | 7341 | continue; |
michael@0 | 7342 | } |
michael@0 | 7343 | } else { |
michael@0 | 7344 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
michael@0 | 7345 | goto commonReturn; |
michael@0 | 7346 | } |
michael@0 | 7347 | } |
michael@0 | 7348 | } else { /* do the French */ |
michael@0 | 7349 | uint32_t *sCESave = NULL; |
michael@0 | 7350 | uint32_t *tCESave = NULL; |
michael@0 | 7351 | sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ |
michael@0 | 7352 | tCE = tCEs.pos-2; |
michael@0 | 7353 | for(;;) { |
michael@0 | 7354 | while (secS == 0 && sCE >= sCEs.buf) { |
michael@0 | 7355 | if(sCESave == NULL) { |
michael@0 | 7356 | secS = *(sCE--); |
michael@0 | 7357 | if(isContinuation(secS)) { |
michael@0 | 7358 | while(isContinuation(secS = *(sCE--))) |
michael@0 | 7359 | ; |
michael@0 | 7360 | /* after this, secS has the start of continuation, and sCEs points before that */ |
michael@0 | 7361 | sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ |
michael@0 | 7362 | sCE+=2; /* need to point to the first continuation CP */ |
michael@0 | 7363 | /* However, now you can just continue doing stuff */ |
michael@0 | 7364 | } |
michael@0 | 7365 | } else { |
michael@0 | 7366 | secS = *(sCE++); |
michael@0 | 7367 | if(!isContinuation(secS)) { /* This means we have finished with this cont */ |
michael@0 | 7368 | sCE = sCESave; /* reset the pointer to before continuation */ |
michael@0 | 7369 | sCESave = NULL; |
michael@0 | 7370 | secS = 0; /* Fetch a fresh CE before the continuation sequence. */ |
michael@0 | 7371 | continue; |
michael@0 | 7372 | } |
michael@0 | 7373 | } |
michael@0 | 7374 | secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ |
michael@0 | 7375 | } |
michael@0 | 7376 | |
michael@0 | 7377 | while(secT == 0 && tCE >= tCEs.buf) { |
michael@0 | 7378 | if(tCESave == NULL) { |
michael@0 | 7379 | secT = *(tCE--); |
michael@0 | 7380 | if(isContinuation(secT)) { |
michael@0 | 7381 | while(isContinuation(secT = *(tCE--))) |
michael@0 | 7382 | ; |
michael@0 | 7383 | /* after this, secS has the start of continuation, and sCEs points before that */ |
michael@0 | 7384 | tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ |
michael@0 | 7385 | tCE+=2; /* need to point to the first continuation CP */ |
michael@0 | 7386 | /* However, now you can just continue doing stuff */ |
michael@0 | 7387 | } |
michael@0 | 7388 | } else { |
michael@0 | 7389 | secT = *(tCE++); |
michael@0 | 7390 | if(!isContinuation(secT)) { /* This means we have finished with this cont */ |
michael@0 | 7391 | tCE = tCESave; /* reset the pointer to before continuation */ |
michael@0 | 7392 | tCESave = NULL; |
michael@0 | 7393 | secT = 0; /* Fetch a fresh CE before the continuation sequence. */ |
michael@0 | 7394 | continue; |
michael@0 | 7395 | } |
michael@0 | 7396 | } |
michael@0 | 7397 | secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ |
michael@0 | 7398 | } |
michael@0 | 7399 | |
michael@0 | 7400 | if(secS == secT) { |
michael@0 | 7401 | if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { |
michael@0 | 7402 | break; |
michael@0 | 7403 | } else { |
michael@0 | 7404 | secS = 0; secT = 0; |
michael@0 | 7405 | continue; |
michael@0 | 7406 | } |
michael@0 | 7407 | } else { |
michael@0 | 7408 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
michael@0 | 7409 | goto commonReturn; |
michael@0 | 7410 | } |
michael@0 | 7411 | } |
michael@0 | 7412 | } |
michael@0 | 7413 | } |
michael@0 | 7414 | |
michael@0 | 7415 | /* doing the case bit */ |
michael@0 | 7416 | if(checkCase) { |
michael@0 | 7417 | sCE = sCEs.buf; |
michael@0 | 7418 | tCE = tCEs.buf; |
michael@0 | 7419 | for(;;) { |
michael@0 | 7420 | while((secS & UCOL_REMOVE_CASE) == 0) { |
michael@0 | 7421 | if(!isContinuation(*sCE++)) { |
michael@0 | 7422 | secS =*(sCE-1); |
michael@0 | 7423 | if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { |
michael@0 | 7424 | // primary ignorables should not be considered on the case level when the strength is primary |
michael@0 | 7425 | // otherwise, the CEs stop being well-formed |
michael@0 | 7426 | secS &= UCOL_TERT_CASE_MASK; |
michael@0 | 7427 | secS ^= caseSwitch; |
michael@0 | 7428 | } else { |
michael@0 | 7429 | secS = 0; |
michael@0 | 7430 | } |
michael@0 | 7431 | } else { |
michael@0 | 7432 | secS = 0; |
michael@0 | 7433 | } |
michael@0 | 7434 | } |
michael@0 | 7435 | |
michael@0 | 7436 | while((secT & UCOL_REMOVE_CASE) == 0) { |
michael@0 | 7437 | if(!isContinuation(*tCE++)) { |
michael@0 | 7438 | secT = *(tCE-1); |
michael@0 | 7439 | if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { |
michael@0 | 7440 | // primary ignorables should not be considered on the case level when the strength is primary |
michael@0 | 7441 | // otherwise, the CEs stop being well-formed |
michael@0 | 7442 | secT &= UCOL_TERT_CASE_MASK; |
michael@0 | 7443 | secT ^= caseSwitch; |
michael@0 | 7444 | } else { |
michael@0 | 7445 | secT = 0; |
michael@0 | 7446 | } |
michael@0 | 7447 | } else { |
michael@0 | 7448 | secT = 0; |
michael@0 | 7449 | } |
michael@0 | 7450 | } |
michael@0 | 7451 | |
michael@0 | 7452 | if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { |
michael@0 | 7453 | result = UCOL_LESS; |
michael@0 | 7454 | goto commonReturn; |
michael@0 | 7455 | } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { |
michael@0 | 7456 | result = UCOL_GREATER; |
michael@0 | 7457 | goto commonReturn; |
michael@0 | 7458 | } |
michael@0 | 7459 | |
michael@0 | 7460 | if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { |
michael@0 | 7461 | break; |
michael@0 | 7462 | } else { |
michael@0 | 7463 | secS = 0; |
michael@0 | 7464 | secT = 0; |
michael@0 | 7465 | } |
michael@0 | 7466 | } |
michael@0 | 7467 | } |
michael@0 | 7468 | |
michael@0 | 7469 | /* Tertiary level */ |
michael@0 | 7470 | if(checkTertiary) { |
michael@0 | 7471 | secS = 0; |
michael@0 | 7472 | secT = 0; |
michael@0 | 7473 | sCE = sCEs.buf; |
michael@0 | 7474 | tCE = tCEs.buf; |
michael@0 | 7475 | for(;;) { |
michael@0 | 7476 | while((secS & UCOL_REMOVE_CASE) == 0) { |
michael@0 | 7477 | sOrder = *sCE++; |
michael@0 | 7478 | secS = sOrder & tertiaryMask; |
michael@0 | 7479 | if(!isContinuation(sOrder)) { |
michael@0 | 7480 | secS ^= caseSwitch; |
michael@0 | 7481 | } else { |
michael@0 | 7482 | secS &= UCOL_REMOVE_CASE; |
michael@0 | 7483 | } |
michael@0 | 7484 | } |
michael@0 | 7485 | |
michael@0 | 7486 | while((secT & UCOL_REMOVE_CASE) == 0) { |
michael@0 | 7487 | tOrder = *tCE++; |
michael@0 | 7488 | secT = tOrder & tertiaryMask; |
michael@0 | 7489 | if(!isContinuation(tOrder)) { |
michael@0 | 7490 | secT ^= caseSwitch; |
michael@0 | 7491 | } else { |
michael@0 | 7492 | secT &= UCOL_REMOVE_CASE; |
michael@0 | 7493 | } |
michael@0 | 7494 | } |
michael@0 | 7495 | |
michael@0 | 7496 | if(secS == secT) { |
michael@0 | 7497 | if((secS & UCOL_REMOVE_CASE) == 1) { |
michael@0 | 7498 | break; |
michael@0 | 7499 | } else { |
michael@0 | 7500 | secS = 0; secT = 0; |
michael@0 | 7501 | continue; |
michael@0 | 7502 | } |
michael@0 | 7503 | } else { |
michael@0 | 7504 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
michael@0 | 7505 | goto commonReturn; |
michael@0 | 7506 | } |
michael@0 | 7507 | } |
michael@0 | 7508 | } |
michael@0 | 7509 | |
michael@0 | 7510 | |
michael@0 | 7511 | if(qShifted /*checkQuad*/) { |
michael@0 | 7512 | UBool sInShifted = TRUE; |
michael@0 | 7513 | UBool tInShifted = TRUE; |
michael@0 | 7514 | secS = 0; |
michael@0 | 7515 | secT = 0; |
michael@0 | 7516 | sCE = sCEs.buf; |
michael@0 | 7517 | tCE = tCEs.buf; |
michael@0 | 7518 | for(;;) { |
michael@0 | 7519 | while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) { |
michael@0 | 7520 | secS = *(sCE++); |
michael@0 | 7521 | if(isContinuation(secS)) { |
michael@0 | 7522 | if(!sInShifted) { |
michael@0 | 7523 | continue; |
michael@0 | 7524 | } |
michael@0 | 7525 | } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ |
michael@0 | 7526 | secS = UCOL_PRIMARYMASK; |
michael@0 | 7527 | sInShifted = FALSE; |
michael@0 | 7528 | } else { |
michael@0 | 7529 | sInShifted = TRUE; |
michael@0 | 7530 | } |
michael@0 | 7531 | } |
michael@0 | 7532 | secS &= UCOL_PRIMARYMASK; |
michael@0 | 7533 | |
michael@0 | 7534 | |
michael@0 | 7535 | while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) { |
michael@0 | 7536 | secT = *(tCE++); |
michael@0 | 7537 | if(isContinuation(secT)) { |
michael@0 | 7538 | if(!tInShifted) { |
michael@0 | 7539 | continue; |
michael@0 | 7540 | } |
michael@0 | 7541 | } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { |
michael@0 | 7542 | secT = UCOL_PRIMARYMASK; |
michael@0 | 7543 | tInShifted = FALSE; |
michael@0 | 7544 | } else { |
michael@0 | 7545 | tInShifted = TRUE; |
michael@0 | 7546 | } |
michael@0 | 7547 | } |
michael@0 | 7548 | secT &= UCOL_PRIMARYMASK; |
michael@0 | 7549 | |
michael@0 | 7550 | if(secS == secT) { |
michael@0 | 7551 | if(secS == UCOL_NO_MORE_CES_PRIMARY) { |
michael@0 | 7552 | break; |
michael@0 | 7553 | } else { |
michael@0 | 7554 | secS = 0; secT = 0; |
michael@0 | 7555 | continue; |
michael@0 | 7556 | } |
michael@0 | 7557 | } else { |
michael@0 | 7558 | result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; |
michael@0 | 7559 | goto commonReturn; |
michael@0 | 7560 | } |
michael@0 | 7561 | } |
michael@0 | 7562 | } else if(doHiragana && hirResult != UCOL_EQUAL) { |
michael@0 | 7563 | // If we're fine on quaternaries, we might be different |
michael@0 | 7564 | // on Hiragana. This, however, might fail us in shifted. |
michael@0 | 7565 | result = hirResult; |
michael@0 | 7566 | goto commonReturn; |
michael@0 | 7567 | } |
michael@0 | 7568 | |
michael@0 | 7569 | /* For IDENTICAL comparisons, we use a bitwise character comparison */ |
michael@0 | 7570 | /* as a tiebreaker if all else is equal. */ |
michael@0 | 7571 | /* Getting here should be quite rare - strings are not identical - */ |
michael@0 | 7572 | /* that is checked first, but compared == through all other checks. */ |
michael@0 | 7573 | if(checkIdent) |
michael@0 | 7574 | { |
michael@0 | 7575 | //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); |
michael@0 | 7576 | result = ucol_checkIdent(sColl, tColl, TRUE, status); |
michael@0 | 7577 | } |
michael@0 | 7578 | |
michael@0 | 7579 | commonReturn: |
michael@0 | 7580 | if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { |
michael@0 | 7581 | if (sCEs.buf != sCEs.localArray ) { |
michael@0 | 7582 | uprv_free(sCEs.buf); |
michael@0 | 7583 | } |
michael@0 | 7584 | if (tCEs.buf != tCEs.localArray ) { |
michael@0 | 7585 | uprv_free(tCEs.buf); |
michael@0 | 7586 | } |
michael@0 | 7587 | } |
michael@0 | 7588 | |
michael@0 | 7589 | return result; |
michael@0 | 7590 | } |
michael@0 | 7591 | |
michael@0 | 7592 | static UCollationResult |
michael@0 | 7593 | ucol_strcollRegular(const UCollator *coll, |
michael@0 | 7594 | const UChar *source, int32_t sourceLength, |
michael@0 | 7595 | const UChar *target, int32_t targetLength, |
michael@0 | 7596 | UErrorCode *status) { |
michael@0 | 7597 | collIterate sColl, tColl; |
michael@0 | 7598 | // Preparing the context objects for iterating over strings |
michael@0 | 7599 | IInit_collIterate(coll, source, sourceLength, &sColl, status); |
michael@0 | 7600 | IInit_collIterate(coll, target, targetLength, &tColl, status); |
michael@0 | 7601 | if(U_FAILURE(*status)) { |
michael@0 | 7602 | return UCOL_LESS; |
michael@0 | 7603 | } |
michael@0 | 7604 | return ucol_strcollRegular(&sColl, &tColl, status); |
michael@0 | 7605 | } |
michael@0 | 7606 | |
michael@0 | 7607 | static inline uint32_t |
michael@0 | 7608 | ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, |
michael@0 | 7609 | uint32_t CE, const UChar *s, int32_t *index, int32_t len) |
michael@0 | 7610 | { |
michael@0 | 7611 | const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); |
michael@0 | 7612 | int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; |
michael@0 | 7613 | int32_t offset = 1; |
michael@0 | 7614 | UChar schar = 0, tchar = 0; |
michael@0 | 7615 | |
michael@0 | 7616 | for(;;) { |
michael@0 | 7617 | if(len == -1) { |
michael@0 | 7618 | if(s[*index] == 0) { // end of string |
michael@0 | 7619 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 7620 | } else { |
michael@0 | 7621 | schar = s[*index]; |
michael@0 | 7622 | } |
michael@0 | 7623 | } else { |
michael@0 | 7624 | if(*index == len) { |
michael@0 | 7625 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 7626 | } else { |
michael@0 | 7627 | schar = s[*index]; |
michael@0 | 7628 | } |
michael@0 | 7629 | } |
michael@0 | 7630 | |
michael@0 | 7631 | while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
michael@0 | 7632 | offset++; |
michael@0 | 7633 | } |
michael@0 | 7634 | |
michael@0 | 7635 | if (schar == tchar) { |
michael@0 | 7636 | (*index)++; |
michael@0 | 7637 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); |
michael@0 | 7638 | } |
michael@0 | 7639 | else |
michael@0 | 7640 | { |
michael@0 | 7641 | if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { |
michael@0 | 7642 | return UCOL_BAIL_OUT_CE; |
michael@0 | 7643 | } |
michael@0 | 7644 | // skip completely ignorables |
michael@0 | 7645 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); |
michael@0 | 7646 | if(isZeroCE == 0) { // we have to ignore completely ignorables |
michael@0 | 7647 | (*index)++; |
michael@0 | 7648 | continue; |
michael@0 | 7649 | } |
michael@0 | 7650 | |
michael@0 | 7651 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 7652 | } |
michael@0 | 7653 | } |
michael@0 | 7654 | } |
michael@0 | 7655 | |
michael@0 | 7656 | |
michael@0 | 7657 | /** |
michael@0 | 7658 | * This is a fast strcoll, geared towards text in Latin-1. |
michael@0 | 7659 | * It supports contractions of size two, French secondaries |
michael@0 | 7660 | * and case switching. You can use it with strengths primary |
michael@0 | 7661 | * to tertiary. It does not support shifted and case level. |
michael@0 | 7662 | * It relies on the table built by setupLatin1Table. If it |
michael@0 | 7663 | * doesn't understand something, it will go to the regular |
michael@0 | 7664 | * strcoll. |
michael@0 | 7665 | */ |
michael@0 | 7666 | static UCollationResult |
michael@0 | 7667 | ucol_strcollUseLatin1( const UCollator *coll, |
michael@0 | 7668 | const UChar *source, |
michael@0 | 7669 | int32_t sLen, |
michael@0 | 7670 | const UChar *target, |
michael@0 | 7671 | int32_t tLen, |
michael@0 | 7672 | UErrorCode *status) |
michael@0 | 7673 | { |
michael@0 | 7674 | U_ALIGN_CODE(16); |
michael@0 | 7675 | int32_t strength = coll->strength; |
michael@0 | 7676 | |
michael@0 | 7677 | int32_t sIndex = 0, tIndex = 0; |
michael@0 | 7678 | UChar sChar = 0, tChar = 0; |
michael@0 | 7679 | uint32_t sOrder=0, tOrder=0; |
michael@0 | 7680 | |
michael@0 | 7681 | UBool endOfSource = FALSE; |
michael@0 | 7682 | |
michael@0 | 7683 | uint32_t *elements = coll->latinOneCEs; |
michael@0 | 7684 | |
michael@0 | 7685 | UBool haveContractions = FALSE; // if we have contractions in our string |
michael@0 | 7686 | // we cannot do French secondary |
michael@0 | 7687 | |
michael@0 | 7688 | // Do the primary level |
michael@0 | 7689 | for(;;) { |
michael@0 | 7690 | while(sOrder==0) { // this loop skips primary ignorables |
michael@0 | 7691 | // sOrder=getNextlatinOneCE(source); |
michael@0 | 7692 | if(sLen==-1) { // handling zero terminated strings |
michael@0 | 7693 | sChar=source[sIndex++]; |
michael@0 | 7694 | if(sChar==0) { |
michael@0 | 7695 | endOfSource = TRUE; |
michael@0 | 7696 | break; |
michael@0 | 7697 | } |
michael@0 | 7698 | } else { // handling strings with known length |
michael@0 | 7699 | if(sIndex==sLen) { |
michael@0 | 7700 | endOfSource = TRUE; |
michael@0 | 7701 | break; |
michael@0 | 7702 | } |
michael@0 | 7703 | sChar=source[sIndex++]; |
michael@0 | 7704 | } |
michael@0 | 7705 | if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) |
michael@0 | 7706 | //fprintf(stderr, "R"); |
michael@0 | 7707 | return ucol_strcollRegular(coll, source, sLen, target, tLen, status); |
michael@0 | 7708 | } |
michael@0 | 7709 | sOrder = elements[sChar]; |
michael@0 | 7710 | if(sOrder >= UCOL_NOT_FOUND) { // if we got a special |
michael@0 | 7711 | // specials can basically be either contractions or bail-out signs. If we get anything |
michael@0 | 7712 | // else, we'll bail out anywasy |
michael@0 | 7713 | if(getCETag(sOrder) == CONTRACTION_TAG) { |
michael@0 | 7714 | sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); |
michael@0 | 7715 | haveContractions = TRUE; // if there are contractions, we cannot do French secondary |
michael@0 | 7716 | // However, if there are contractions in the table, but we always use just one char, |
michael@0 | 7717 | // we might be able to do French. This should be checked out. |
michael@0 | 7718 | } |
michael@0 | 7719 | if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
michael@0 | 7720 | //fprintf(stderr, "S"); |
michael@0 | 7721 | return ucol_strcollRegular(coll, source, sLen, target, tLen, status); |
michael@0 | 7722 | } |
michael@0 | 7723 | } |
michael@0 | 7724 | } |
michael@0 | 7725 | |
michael@0 | 7726 | while(tOrder==0) { // this loop skips primary ignorables |
michael@0 | 7727 | // tOrder=getNextlatinOneCE(target); |
michael@0 | 7728 | if(tLen==-1) { // handling zero terminated strings |
michael@0 | 7729 | tChar=target[tIndex++]; |
michael@0 | 7730 | if(tChar==0) { |
michael@0 | 7731 | if(endOfSource) { // this is different than source loop, |
michael@0 | 7732 | // as we already know that source loop is done here, |
michael@0 | 7733 | // so we can either finish the primary loop if both |
michael@0 | 7734 | // strings are done or anounce the result if only |
michael@0 | 7735 | // target is done. Same below. |
michael@0 | 7736 | goto endOfPrimLoop; |
michael@0 | 7737 | } else { |
michael@0 | 7738 | return UCOL_GREATER; |
michael@0 | 7739 | } |
michael@0 | 7740 | } |
michael@0 | 7741 | } else { // handling strings with known length |
michael@0 | 7742 | if(tIndex==tLen) { |
michael@0 | 7743 | if(endOfSource) { |
michael@0 | 7744 | goto endOfPrimLoop; |
michael@0 | 7745 | } else { |
michael@0 | 7746 | return UCOL_GREATER; |
michael@0 | 7747 | } |
michael@0 | 7748 | } |
michael@0 | 7749 | tChar=target[tIndex++]; |
michael@0 | 7750 | } |
michael@0 | 7751 | if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) |
michael@0 | 7752 | //fprintf(stderr, "R"); |
michael@0 | 7753 | return ucol_strcollRegular(coll, source, sLen, target, tLen, status); |
michael@0 | 7754 | } |
michael@0 | 7755 | tOrder = elements[tChar]; |
michael@0 | 7756 | if(tOrder >= UCOL_NOT_FOUND) { |
michael@0 | 7757 | // Handling specials, see the comments for source |
michael@0 | 7758 | if(getCETag(tOrder) == CONTRACTION_TAG) { |
michael@0 | 7759 | tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); |
michael@0 | 7760 | haveContractions = TRUE; |
michael@0 | 7761 | } |
michael@0 | 7762 | if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
michael@0 | 7763 | //fprintf(stderr, "S"); |
michael@0 | 7764 | return ucol_strcollRegular(coll, source, sLen, target, tLen, status); |
michael@0 | 7765 | } |
michael@0 | 7766 | } |
michael@0 | 7767 | } |
michael@0 | 7768 | if(endOfSource) { // source is finished, but target is not, say the result. |
michael@0 | 7769 | return UCOL_LESS; |
michael@0 | 7770 | } |
michael@0 | 7771 | |
michael@0 | 7772 | if(sOrder == tOrder) { // if we have same CEs, we continue the loop |
michael@0 | 7773 | sOrder = 0; tOrder = 0; |
michael@0 | 7774 | continue; |
michael@0 | 7775 | } else { |
michael@0 | 7776 | // compare current top bytes |
michael@0 | 7777 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 7778 | // top bytes differ, return difference |
michael@0 | 7779 | if(sOrder < tOrder) { |
michael@0 | 7780 | return UCOL_LESS; |
michael@0 | 7781 | } else if(sOrder > tOrder) { |
michael@0 | 7782 | return UCOL_GREATER; |
michael@0 | 7783 | } |
michael@0 | 7784 | // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); |
michael@0 | 7785 | // since we must return enum value |
michael@0 | 7786 | } |
michael@0 | 7787 | |
michael@0 | 7788 | // top bytes match, continue with following bytes |
michael@0 | 7789 | sOrder<<=8; |
michael@0 | 7790 | tOrder<<=8; |
michael@0 | 7791 | } |
michael@0 | 7792 | } |
michael@0 | 7793 | |
michael@0 | 7794 | endOfPrimLoop: |
michael@0 | 7795 | // after primary loop, we definitely know the sizes of strings, |
michael@0 | 7796 | // so we set it and use simpler loop for secondaries and tertiaries |
michael@0 | 7797 | sLen = sIndex; tLen = tIndex; |
michael@0 | 7798 | if(strength >= UCOL_SECONDARY) { |
michael@0 | 7799 | // adjust the table beggining |
michael@0 | 7800 | elements += coll->latinOneTableLen; |
michael@0 | 7801 | endOfSource = FALSE; |
michael@0 | 7802 | |
michael@0 | 7803 | if(coll->frenchCollation == UCOL_OFF) { // non French |
michael@0 | 7804 | // This loop is a simplified copy of primary loop |
michael@0 | 7805 | // at this point we know that whole strings are latin-1, so we don't |
michael@0 | 7806 | // check for that. We also know that we only have contractions as |
michael@0 | 7807 | // specials. |
michael@0 | 7808 | sIndex = 0; tIndex = 0; |
michael@0 | 7809 | for(;;) { |
michael@0 | 7810 | while(sOrder==0) { |
michael@0 | 7811 | if(sIndex==sLen) { |
michael@0 | 7812 | endOfSource = TRUE; |
michael@0 | 7813 | break; |
michael@0 | 7814 | } |
michael@0 | 7815 | sChar=source[sIndex++]; |
michael@0 | 7816 | sOrder = elements[sChar]; |
michael@0 | 7817 | if(sOrder > UCOL_NOT_FOUND) { |
michael@0 | 7818 | sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); |
michael@0 | 7819 | } |
michael@0 | 7820 | } |
michael@0 | 7821 | |
michael@0 | 7822 | while(tOrder==0) { |
michael@0 | 7823 | if(tIndex==tLen) { |
michael@0 | 7824 | if(endOfSource) { |
michael@0 | 7825 | goto endOfSecLoop; |
michael@0 | 7826 | } else { |
michael@0 | 7827 | return UCOL_GREATER; |
michael@0 | 7828 | } |
michael@0 | 7829 | } |
michael@0 | 7830 | tChar=target[tIndex++]; |
michael@0 | 7831 | tOrder = elements[tChar]; |
michael@0 | 7832 | if(tOrder > UCOL_NOT_FOUND) { |
michael@0 | 7833 | tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); |
michael@0 | 7834 | } |
michael@0 | 7835 | } |
michael@0 | 7836 | if(endOfSource) { |
michael@0 | 7837 | return UCOL_LESS; |
michael@0 | 7838 | } |
michael@0 | 7839 | |
michael@0 | 7840 | if(sOrder == tOrder) { |
michael@0 | 7841 | sOrder = 0; tOrder = 0; |
michael@0 | 7842 | continue; |
michael@0 | 7843 | } else { |
michael@0 | 7844 | // see primary loop for comments on this |
michael@0 | 7845 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 7846 | if(sOrder < tOrder) { |
michael@0 | 7847 | return UCOL_LESS; |
michael@0 | 7848 | } else if(sOrder > tOrder) { |
michael@0 | 7849 | return UCOL_GREATER; |
michael@0 | 7850 | } |
michael@0 | 7851 | } |
michael@0 | 7852 | sOrder<<=8; |
michael@0 | 7853 | tOrder<<=8; |
michael@0 | 7854 | } |
michael@0 | 7855 | } |
michael@0 | 7856 | } else { // French |
michael@0 | 7857 | if(haveContractions) { // if we have contractions, we have to bail out |
michael@0 | 7858 | // since we don't really know how to handle them here |
michael@0 | 7859 | return ucol_strcollRegular(coll, source, sLen, target, tLen, status); |
michael@0 | 7860 | } |
michael@0 | 7861 | // For French, we go backwards |
michael@0 | 7862 | sIndex = sLen; tIndex = tLen; |
michael@0 | 7863 | for(;;) { |
michael@0 | 7864 | while(sOrder==0) { |
michael@0 | 7865 | if(sIndex==0) { |
michael@0 | 7866 | endOfSource = TRUE; |
michael@0 | 7867 | break; |
michael@0 | 7868 | } |
michael@0 | 7869 | sChar=source[--sIndex]; |
michael@0 | 7870 | sOrder = elements[sChar]; |
michael@0 | 7871 | // don't even look for contractions |
michael@0 | 7872 | } |
michael@0 | 7873 | |
michael@0 | 7874 | while(tOrder==0) { |
michael@0 | 7875 | if(tIndex==0) { |
michael@0 | 7876 | if(endOfSource) { |
michael@0 | 7877 | goto endOfSecLoop; |
michael@0 | 7878 | } else { |
michael@0 | 7879 | return UCOL_GREATER; |
michael@0 | 7880 | } |
michael@0 | 7881 | } |
michael@0 | 7882 | tChar=target[--tIndex]; |
michael@0 | 7883 | tOrder = elements[tChar]; |
michael@0 | 7884 | // don't even look for contractions |
michael@0 | 7885 | } |
michael@0 | 7886 | if(endOfSource) { |
michael@0 | 7887 | return UCOL_LESS; |
michael@0 | 7888 | } |
michael@0 | 7889 | |
michael@0 | 7890 | if(sOrder == tOrder) { |
michael@0 | 7891 | sOrder = 0; tOrder = 0; |
michael@0 | 7892 | continue; |
michael@0 | 7893 | } else { |
michael@0 | 7894 | // see the primary loop for comments |
michael@0 | 7895 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 7896 | if(sOrder < tOrder) { |
michael@0 | 7897 | return UCOL_LESS; |
michael@0 | 7898 | } else if(sOrder > tOrder) { |
michael@0 | 7899 | return UCOL_GREATER; |
michael@0 | 7900 | } |
michael@0 | 7901 | } |
michael@0 | 7902 | sOrder<<=8; |
michael@0 | 7903 | tOrder<<=8; |
michael@0 | 7904 | } |
michael@0 | 7905 | } |
michael@0 | 7906 | } |
michael@0 | 7907 | } |
michael@0 | 7908 | |
michael@0 | 7909 | endOfSecLoop: |
michael@0 | 7910 | if(strength >= UCOL_TERTIARY) { |
michael@0 | 7911 | // tertiary loop is the same as secondary (except no French) |
michael@0 | 7912 | elements += coll->latinOneTableLen; |
michael@0 | 7913 | sIndex = 0; tIndex = 0; |
michael@0 | 7914 | endOfSource = FALSE; |
michael@0 | 7915 | for(;;) { |
michael@0 | 7916 | while(sOrder==0) { |
michael@0 | 7917 | if(sIndex==sLen) { |
michael@0 | 7918 | endOfSource = TRUE; |
michael@0 | 7919 | break; |
michael@0 | 7920 | } |
michael@0 | 7921 | sChar=source[sIndex++]; |
michael@0 | 7922 | sOrder = elements[sChar]; |
michael@0 | 7923 | if(sOrder > UCOL_NOT_FOUND) { |
michael@0 | 7924 | sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); |
michael@0 | 7925 | } |
michael@0 | 7926 | } |
michael@0 | 7927 | while(tOrder==0) { |
michael@0 | 7928 | if(tIndex==tLen) { |
michael@0 | 7929 | if(endOfSource) { |
michael@0 | 7930 | return UCOL_EQUAL; // if both strings are at the end, they are equal |
michael@0 | 7931 | } else { |
michael@0 | 7932 | return UCOL_GREATER; |
michael@0 | 7933 | } |
michael@0 | 7934 | } |
michael@0 | 7935 | tChar=target[tIndex++]; |
michael@0 | 7936 | tOrder = elements[tChar]; |
michael@0 | 7937 | if(tOrder > UCOL_NOT_FOUND) { |
michael@0 | 7938 | tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); |
michael@0 | 7939 | } |
michael@0 | 7940 | } |
michael@0 | 7941 | if(endOfSource) { |
michael@0 | 7942 | return UCOL_LESS; |
michael@0 | 7943 | } |
michael@0 | 7944 | if(sOrder == tOrder) { |
michael@0 | 7945 | sOrder = 0; tOrder = 0; |
michael@0 | 7946 | continue; |
michael@0 | 7947 | } else { |
michael@0 | 7948 | if(((sOrder^tOrder)&0xff000000)!=0) { |
michael@0 | 7949 | if(sOrder < tOrder) { |
michael@0 | 7950 | return UCOL_LESS; |
michael@0 | 7951 | } else if(sOrder > tOrder) { |
michael@0 | 7952 | return UCOL_GREATER; |
michael@0 | 7953 | } |
michael@0 | 7954 | } |
michael@0 | 7955 | sOrder<<=8; |
michael@0 | 7956 | tOrder<<=8; |
michael@0 | 7957 | } |
michael@0 | 7958 | } |
michael@0 | 7959 | } |
michael@0 | 7960 | return UCOL_EQUAL; |
michael@0 | 7961 | } |
michael@0 | 7962 | |
michael@0 | 7963 | /* Full (non-fast-path) comparison of two UTF-8 strings: each string is wrapped in a UCharIterator, |
michael@0 | 7964 | optionally behind an FCD normalizing iterator, and the pair is handed to the iterator-based ucol_strcollRegular. |
michael@0 | 7965 | Note: ucol_strcollUTF8 supports null terminated input. Calculating length of null terminated input string takes extra amount of CPU cycles, so the lengths are passed to uiter_setUTF8 exactly as received. |
michael@0 | 7966 | */ |
michael@0 | 7967 | static UCollationResult |
michael@0 | 7968 | ucol_strcollRegularUTF8( |
michael@0 | 7969 | const UCollator *coll, |
michael@0 | 7970 | const char *source, |
michael@0 | 7971 | int32_t sourceLength, |
michael@0 | 7972 | const char *target, |
michael@0 | 7973 | int32_t targetLength, |
michael@0 | 7974 | UErrorCode *status) |
michael@0 | 7975 | { |
michael@0 | 7976 | UCharIterator src; |
michael@0 | 7977 | UCharIterator tgt; |
michael@0 | 7978 | |
michael@0 | 7979 | uiter_setUTF8(&src, source, sourceLength); |
michael@0 | 7980 | uiter_setUTF8(&tgt, target, targetLength); |
michael@0 | 7981 | |
michael@0 | 7982 | // Preparing the context objects for iterating over strings |
michael@0 | 7983 | collIterate sColl, tColl; |
michael@0 | 7984 | IInit_collIterate(coll, NULL, -1, &sColl, status); |
michael@0 | 7985 | IInit_collIterate(coll, NULL, -1, &tColl, status); |
michael@0 | 7986 | if(U_FAILURE(*status)) { |
michael@0 | 7987 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
michael@0 | 7988 | return UCOL_EQUAL; |
michael@0 | 7989 | } |
michael@0 | 7990 | // The division for the array length may truncate the array size to |
michael@0 | 7991 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
michael@0 | 7992 | // for all platforms anyway. |
michael@0 | 7993 | UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 7994 | UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 7995 | UNormIterator *sNormIter = NULL, *tNormIter = NULL; |
michael@0 | 7996 | |
michael@0 | 7997 | sColl.iterator = &src; |
michael@0 | 7998 | sColl.flags |= UCOL_USE_ITERATOR; |
michael@0 | 7999 | tColl.flags |= UCOL_USE_ITERATOR; |
michael@0 | 8000 | tColl.iterator = &tgt; |
michael@0 | 8001 | if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { /* with normalization on, iterate through FCD-normalizing wrappers instead of the raw UTF-8 iterators */ |
michael@0 | 8002 | sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); |
michael@0 | 8003 | sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); |
michael@0 | 8004 | sColl.flags &= ~UCOL_ITER_NORM; |
michael@0 | 8005 | tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); |
michael@0 | 8006 | tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); |
michael@0 | 8007 | tColl.flags &= ~UCOL_ITER_NORM; |
michael@0 | 8008 | } |
michael@0 | 8009 | UCollationResult result = ucol_strcollRegular(&sColl, &tColl, status); |
michael@0 | 8010 | unorm_closeIter(sNormIter); /* every opened normalizing iterator must be closed: it may own heap buffers even though the object itself sits in stack memory (NULL is accepted when normalization is off) */ |
michael@0 | 8011 | unorm_closeIter(tNormIter); |
michael@0 | 8012 | return result; |
michael@0 | 8013 | } |
michael@0 | 8014 | /* Resolves a contraction for the UTF-8 Latin-1 fast path. CE is the contraction-tagged table entry of the character the caller has just consumed; s/len describe the UTF-8 text (negative len = NUL-terminated) and *index is the byte offset of the next code point, advanced past anything consumed here. Returns the latinOneCEs entry for the requested strength, or UCOL_BAIL_OUT_CE when the following code point is outside Latin-1 and is not part of the contraction. */ |
michael@0 | 8015 | static inline uint32_t |
michael@0 | 8016 | ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, |
michael@0 | 8017 | uint32_t CE, const char *s, int32_t *index, int32_t len) |
michael@0 | 8018 | { |
michael@0 | 8019 | const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); |
michael@0 | 8020 | int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; |
michael@0 | 8021 | int32_t offset = 1; |
michael@0 | 8022 | UChar32 schar = 0, tchar = 0; |
michael@0 | 8023 | |
michael@0 | 8024 | for(;;) { |
michael@0 | 8025 | if (*index == len) { /* end of a length-counted string: only the first character is available */ |
michael@0 | 8026 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 8027 | } |
michael@0 | 8028 | U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); /* peek at the next code point without advancing *index */ |
michael@0 | 8029 | if (len < 0 && schar == 0) { /* end of a NUL-terminated string */ |
michael@0 | 8030 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 8031 | } |
michael@0 | 8032 | if(schar > 0xFFFF) { return UCOL_BAIL_OUT_CE; } /* a supplementary code point is larger than every 16-bit table entry, so the scan below could never stop inside the table */ |
michael@0 | 8033 | while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
michael@0 | 8034 | offset++; |
michael@0 | 8035 | } |
michael@0 | 8036 | |
michael@0 | 8037 | if (schar == tchar) { |
michael@0 | 8038 | U8_FWD_1(s, *index, len); |
michael@0 | 8039 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); |
michael@0 | 8040 | } |
michael@0 | 8041 | else |
michael@0 | 8042 | { |
michael@0 | 8043 | if(schar & 0xFFFFFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { /* schar is a 32-bit code point, so test every bit above the low byte */ |
michael@0 | 8044 | return UCOL_BAIL_OUT_CE; |
michael@0 | 8045 | } |
michael@0 | 8046 | // skip completely ignorables |
michael@0 | 8047 | uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); |
michael@0 | 8048 | if(isZeroCE == 0) { // we have to ignore completely ignorables |
michael@0 | 8049 | U8_FWD_1(s, *index, len); |
michael@0 | 8050 | continue; |
michael@0 | 8051 | } |
michael@0 | 8052 | |
michael@0 | 8053 | return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); |
michael@0 | 8054 | } |
michael@0 | 8055 | } |
michael@0 | 8056 | } |
michael@0 | 8057 | /* UTF-8 flavour of the Latin-1 fast path: compares level by level using coll->latinOneCEs and hands the comparison to ucol_strcollRegularUTF8 when it decodes a code point above 0xFF, meets a special CE it cannot resolve, or has seen a contraction while French secondary ordering is on. A negative length means the string is NUL-terminated. */ |
michael@0 | 8058 | static inline UCollationResult |
michael@0 | 8059 | ucol_strcollUseLatin1UTF8( |
michael@0 | 8060 | const UCollator *coll, |
michael@0 | 8061 | const char *source, |
michael@0 | 8062 | int32_t sLen, |
michael@0 | 8063 | const char *target, |
michael@0 | 8064 | int32_t tLen, |
michael@0 | 8065 | UErrorCode *status) |
michael@0 | 8066 | { |
michael@0 | 8067 | U_ALIGN_CODE(16); |
michael@0 | 8068 | int32_t strength = coll->strength; |
michael@0 | 8069 | |
michael@0 | 8070 | int32_t sIndex = 0, tIndex = 0; |
michael@0 | 8071 | UChar32 sChar = 0, tChar = 0; |
michael@0 | 8072 | uint32_t sOrder=0, tOrder=0; |
michael@0 | 8073 | |
michael@0 | 8074 | UBool endOfSource = FALSE; |
michael@0 | 8075 | |
michael@0 | 8076 | uint32_t *elements = coll->latinOneCEs; /* table slice for the current level; advanced by latinOneTableLen before each further level */ |
michael@0 | 8077 | |
michael@0 | 8078 | UBool haveContractions = FALSE; // if we have contractions in our string |
michael@0 | 8079 | // we cannot do French secondary |
michael@0 | 8080 | |
michael@0 | 8081 | // Do the primary level |
michael@0 | 8082 | for(;;) { |
michael@0 | 8083 | while(sOrder==0) { // this loop skips primary ignorables |
michael@0 | 8084 | // sOrder=getNextlatinOneCE(source); |
michael@0 | 8085 | if (sIndex == sLen) { |
michael@0 | 8086 | endOfSource = TRUE; |
michael@0 | 8087 | break; |
michael@0 | 8088 | } |
michael@0 | 8089 | U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); |
michael@0 | 8090 | if (sLen < 0 && sChar == 0) { |
michael@0 | 8091 | endOfSource = TRUE; |
michael@0 | 8092 | sLen = sIndex; /* the length is known from here on; sIndex has already stepped past the terminator */ |
michael@0 | 8093 | break; |
michael@0 | 8094 | } |
michael@0 | 8095 | if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) |
michael@0 | 8096 | //fprintf(stderr, "R"); |
michael@0 | 8097 | return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); |
michael@0 | 8098 | } |
michael@0 | 8099 | sOrder = elements[sChar]; |
michael@0 | 8100 | if(sOrder >= UCOL_NOT_FOUND) { // if we got a special |
michael@0 | 8101 | // specials can basically be either contractions or bail-out signs. If we get anything |
michael@0 | 8102 | // else, we'll bail out anyway |
michael@0 | 8103 | if(getCETag(sOrder) == CONTRACTION_TAG) { |
michael@0 | 8104 | sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); |
michael@0 | 8105 | haveContractions = TRUE; // if there are contractions, we cannot do French secondary |
michael@0 | 8106 | // However, if there are contractions in the table, but we always use just one char, |
michael@0 | 8107 | // we might be able to do French. This should be checked out. |
michael@0 | 8108 | } |
michael@0 | 8109 | if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
michael@0 | 8110 | //fprintf(stderr, "S"); |
michael@0 | 8111 | return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); |
michael@0 | 8112 | } |
michael@0 | 8113 | } |
michael@0 | 8114 | } |
michael@0 | 8115 | |
michael@0 | 8116 | while(tOrder==0) { // this loop skips primary ignorables |
michael@0 | 8117 | // tOrder=getNextlatinOneCE(target); |
michael@0 | 8118 | if (tIndex == tLen) { |
michael@0 | 8119 | if(endOfSource) { |
michael@0 | 8120 | goto endOfPrimLoopU8; |
michael@0 | 8121 | } else { |
michael@0 | 8122 | return UCOL_GREATER; |
michael@0 | 8123 | } |
michael@0 | 8124 | } |
michael@0 | 8125 | U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); |
michael@0 | 8126 | if (tLen < 0 && tChar == 0) { |
michael@0 | 8127 | if(endOfSource) { |
michael@0 | 8128 | tLen = tIndex; |
michael@0 | 8129 | goto endOfPrimLoopU8; |
michael@0 | 8130 | } else { |
michael@0 | 8131 | return UCOL_GREATER; |
michael@0 | 8132 | } |
michael@0 | 8133 | } |
michael@0 | 8134 | if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32) |
michael@0 | 8135 | //fprintf(stderr, "R"); |
michael@0 | 8136 | return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); |
michael@0 | 8137 | } |
michael@0 | 8138 | tOrder = elements[tChar]; |
michael@0 | 8139 | if(tOrder >= UCOL_NOT_FOUND) { |
michael@0 | 8140 | // Handling specials, see the comments for source |
michael@0 | 8141 | if(getCETag(tOrder) == CONTRACTION_TAG) { |
michael@0 | 8142 | tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); |
michael@0 | 8143 | haveContractions = TRUE; |
michael@0 | 8144 | } |
michael@0 | 8145 | if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { |
michael@0 | 8146 | //fprintf(stderr, "S"); |
michael@0 | 8147 | return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); |
michael@0 | 8148 | } |
michael@0 | 8149 | } |
michael@0 | 8150 | } |
michael@0 | 8151 | if(endOfSource) { // source is finished, but target is not, say the result. |
michael@0 | 8152 | return UCOL_LESS; |
michael@0 | 8153 | } |
michael@0 | 8154 | |
michael@0 | 8155 | if(sOrder == tOrder) { // if we have same CEs, we continue the loop |
michael@0 | 8156 | sOrder = 0; tOrder = 0; |
michael@0 | 8157 | continue; |
michael@0 | 8158 | } else { |
michael@0 | 8159 | // compare current top bytes |
michael@0 | 8160 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 8161 | // top bytes differ, return difference |
michael@0 | 8162 | if(sOrder < tOrder) { |
michael@0 | 8163 | return UCOL_LESS; |
michael@0 | 8164 | } else if(sOrder > tOrder) { |
michael@0 | 8165 | return UCOL_GREATER; |
michael@0 | 8166 | } |
michael@0 | 8167 | // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); |
michael@0 | 8168 | // since we must return enum value |
michael@0 | 8169 | } |
michael@0 | 8170 | |
michael@0 | 8171 | // top bytes match, continue with following bytes; a side whose remainder becomes 0 fetches its next character on the next pass |
michael@0 | 8172 | sOrder<<=8; |
michael@0 | 8173 | tOrder<<=8; |
michael@0 | 8174 | } |
michael@0 | 8175 | } |
michael@0 | 8176 | |
michael@0 | 8177 | endOfPrimLoopU8: |
michael@0 | 8178 | // after primary loop, we definitely know the sizes of strings, |
michael@0 | 8179 | // so we set it and use simpler loop for secondaries and tertiaries |
michael@0 | 8180 | sLen = sIndex; tLen = tIndex; |
michael@0 | 8181 | if(strength >= UCOL_SECONDARY) { |
michael@0 | 8182 | // adjust the table beginning |
michael@0 | 8183 | elements += coll->latinOneTableLen; |
michael@0 | 8184 | endOfSource = FALSE; |
michael@0 | 8185 | |
michael@0 | 8186 | if(coll->frenchCollation == UCOL_OFF) { // non French |
michael@0 | 8187 | // This loop is a simplified copy of primary loop |
michael@0 | 8188 | // at this point we know that whole strings are latin-1, so we don't |
michael@0 | 8189 | // check for that. We also know that we only have contractions as |
michael@0 | 8190 | // specials. |
michael@0 | 8191 | sIndex = 0; tIndex = 0; |
michael@0 | 8192 | for(;;) { |
michael@0 | 8193 | while(sOrder==0) { |
michael@0 | 8194 | if(sIndex==sLen) { |
michael@0 | 8195 | endOfSource = TRUE; |
michael@0 | 8196 | break; |
michael@0 | 8197 | } |
michael@0 | 8198 | U_ASSERT(sLen >= 0); |
michael@0 | 8199 | U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); |
michael@0 | 8200 | U_ASSERT(sChar >= 0 && sChar <= 0xFF); |
michael@0 | 8201 | sOrder = elements[sChar]; |
michael@0 | 8202 | if(sOrder > UCOL_NOT_FOUND) { /* NOTE(review): the primary pass tests >= UCOL_NOT_FOUND, the later passes test > -- confirm the difference is intended */ |
michael@0 | 8203 | sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); |
michael@0 | 8204 | } |
michael@0 | 8205 | } |
michael@0 | 8206 | |
michael@0 | 8207 | while(tOrder==0) { |
michael@0 | 8208 | if(tIndex==tLen) { |
michael@0 | 8209 | if(endOfSource) { |
michael@0 | 8210 | goto endOfSecLoopU8; |
michael@0 | 8211 | } else { |
michael@0 | 8212 | return UCOL_GREATER; |
michael@0 | 8213 | } |
michael@0 | 8214 | } |
michael@0 | 8215 | U_ASSERT(tLen >= 0); |
michael@0 | 8216 | U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); |
michael@0 | 8217 | U_ASSERT(tChar >= 0 && tChar <= 0xFF); |
michael@0 | 8218 | tOrder = elements[tChar]; |
michael@0 | 8219 | if(tOrder > UCOL_NOT_FOUND) { |
michael@0 | 8220 | tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); |
michael@0 | 8221 | } |
michael@0 | 8222 | } |
michael@0 | 8223 | if(endOfSource) { |
michael@0 | 8224 | return UCOL_LESS; |
michael@0 | 8225 | } |
michael@0 | 8226 | |
michael@0 | 8227 | if(sOrder == tOrder) { |
michael@0 | 8228 | sOrder = 0; tOrder = 0; |
michael@0 | 8229 | continue; |
michael@0 | 8230 | } else { |
michael@0 | 8231 | // see primary loop for comments on this |
michael@0 | 8232 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 8233 | if(sOrder < tOrder) { |
michael@0 | 8234 | return UCOL_LESS; |
michael@0 | 8235 | } else if(sOrder > tOrder) { |
michael@0 | 8236 | return UCOL_GREATER; |
michael@0 | 8237 | } |
michael@0 | 8238 | } |
michael@0 | 8239 | sOrder<<=8; |
michael@0 | 8240 | tOrder<<=8; |
michael@0 | 8241 | } |
michael@0 | 8242 | } |
michael@0 | 8243 | } else { // French |
michael@0 | 8244 | if(haveContractions) { // if we have contractions, we have to bail out |
michael@0 | 8245 | // since we don't really know how to handle them here |
michael@0 | 8246 | return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); |
michael@0 | 8247 | } |
michael@0 | 8248 | // For French, we go backwards |
michael@0 | 8249 | sIndex = sLen; tIndex = tLen; |
michael@0 | 8250 | for(;;) { |
michael@0 | 8251 | while(sOrder==0) { |
michael@0 | 8252 | if(sIndex==0) { |
michael@0 | 8253 | endOfSource = TRUE; |
michael@0 | 8254 | break; |
michael@0 | 8255 | } |
michael@0 | 8256 | U8_PREV_OR_FFFD(source, 0, sIndex, sChar); |
michael@0 | 8257 | U_ASSERT(sChar >= 0 && sChar <= 0xFF); |
michael@0 | 8258 | sOrder = elements[sChar]; |
michael@0 | 8259 | // don't even look for contractions |
michael@0 | 8260 | } |
michael@0 | 8261 | |
michael@0 | 8262 | while(tOrder==0) { |
michael@0 | 8263 | if(tIndex==0) { |
michael@0 | 8264 | if(endOfSource) { |
michael@0 | 8265 | goto endOfSecLoopU8; |
michael@0 | 8266 | } else { |
michael@0 | 8267 | return UCOL_GREATER; |
michael@0 | 8268 | } |
michael@0 | 8269 | } |
michael@0 | 8270 | U8_PREV_OR_FFFD(target, 0, tIndex, tChar); |
michael@0 | 8271 | U_ASSERT(tChar >= 0 && tChar <= 0xFF); |
michael@0 | 8272 | tOrder = elements[tChar]; |
michael@0 | 8273 | // don't even look for contractions |
michael@0 | 8274 | } |
michael@0 | 8275 | if(endOfSource) { |
michael@0 | 8276 | return UCOL_LESS; |
michael@0 | 8277 | } |
michael@0 | 8278 | |
michael@0 | 8279 | if(sOrder == tOrder) { |
michael@0 | 8280 | sOrder = 0; tOrder = 0; |
michael@0 | 8281 | continue; |
michael@0 | 8282 | } else { |
michael@0 | 8283 | // see the primary loop for comments |
michael@0 | 8284 | if(((sOrder^tOrder)&0xFF000000)!=0) { |
michael@0 | 8285 | if(sOrder < tOrder) { |
michael@0 | 8286 | return UCOL_LESS; |
michael@0 | 8287 | } else if(sOrder > tOrder) { |
michael@0 | 8288 | return UCOL_GREATER; |
michael@0 | 8289 | } |
michael@0 | 8290 | } |
michael@0 | 8291 | sOrder<<=8; |
michael@0 | 8292 | tOrder<<=8; |
michael@0 | 8293 | } |
michael@0 | 8294 | } |
michael@0 | 8295 | } |
michael@0 | 8296 | } |
michael@0 | 8297 | |
michael@0 | 8298 | endOfSecLoopU8: |
michael@0 | 8299 | if(strength >= UCOL_TERTIARY) { |
michael@0 | 8300 | // tertiary loop is the same as secondary (except no French) |
michael@0 | 8301 | elements += coll->latinOneTableLen; |
michael@0 | 8302 | sIndex = 0; tIndex = 0; |
michael@0 | 8303 | endOfSource = FALSE; |
michael@0 | 8304 | for(;;) { |
michael@0 | 8305 | while(sOrder==0) { |
michael@0 | 8306 | if(sIndex==sLen) { |
michael@0 | 8307 | endOfSource = TRUE; |
michael@0 | 8308 | break; |
michael@0 | 8309 | } |
michael@0 | 8310 | U_ASSERT(sLen >= 0); |
michael@0 | 8311 | U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); |
michael@0 | 8312 | U_ASSERT(sChar >= 0 && sChar <= 0xFF); |
michael@0 | 8313 | sOrder = elements[sChar]; |
michael@0 | 8314 | if(sOrder > UCOL_NOT_FOUND) { |
michael@0 | 8315 | sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); |
michael@0 | 8316 | } |
michael@0 | 8317 | } |
michael@0 | 8318 | while(tOrder==0) { |
michael@0 | 8319 | if(tIndex==tLen) { |
michael@0 | 8320 | if(endOfSource) { |
michael@0 | 8321 | return UCOL_EQUAL; // if both strings are at the end, they are equal |
michael@0 | 8322 | } else { |
michael@0 | 8323 | return UCOL_GREATER; |
michael@0 | 8324 | } |
michael@0 | 8325 | } |
michael@0 | 8326 | U_ASSERT(tLen >= 0); |
michael@0 | 8327 | U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); |
michael@0 | 8328 | U_ASSERT(tChar >= 0 && tChar <= 0xFF); |
michael@0 | 8329 | tOrder = elements[tChar]; |
michael@0 | 8330 | if(tOrder > UCOL_NOT_FOUND) { |
michael@0 | 8331 | tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); |
michael@0 | 8332 | } |
michael@0 | 8333 | } |
michael@0 | 8334 | if(endOfSource) { |
michael@0 | 8335 | return UCOL_LESS; |
michael@0 | 8336 | } |
michael@0 | 8337 | if(sOrder == tOrder) { |
michael@0 | 8338 | sOrder = 0; tOrder = 0; |
michael@0 | 8339 | continue; |
michael@0 | 8340 | } else { |
michael@0 | 8341 | if(((sOrder^tOrder)&0xff000000)!=0) { |
michael@0 | 8342 | if(sOrder < tOrder) { |
michael@0 | 8343 | return UCOL_LESS; |
michael@0 | 8344 | } else if(sOrder > tOrder) { |
michael@0 | 8345 | return UCOL_GREATER; |
michael@0 | 8346 | } |
michael@0 | 8347 | } |
michael@0 | 8348 | sOrder<<=8; |
michael@0 | 8349 | tOrder<<=8; |
michael@0 | 8350 | } |
michael@0 | 8351 | } |
michael@0 | 8352 | } |
michael@0 | 8353 | return UCOL_EQUAL; |
michael@0 | 8354 | } |
michael@0 | 8355 | |
michael@0 | 8356 | U_CAPI UCollationResult U_EXPORT2 |
michael@0 | 8357 | ucol_strcollIter( const UCollator *coll, |
michael@0 | 8358 | UCharIterator *sIter, |
michael@0 | 8359 | UCharIterator *tIter, |
michael@0 | 8360 | UErrorCode *status) |
michael@0 | 8361 | { |
michael@0 | 8362 | if(!status || U_FAILURE(*status)) { |
michael@0 | 8363 | return UCOL_EQUAL; |
michael@0 | 8364 | } |
michael@0 | 8365 | |
michael@0 | 8366 | UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); |
michael@0 | 8367 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); |
michael@0 | 8368 | |
michael@0 | 8369 | if (sIter == tIter) { |
michael@0 | 8370 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
michael@0 | 8371 | return UCOL_EQUAL; |
michael@0 | 8372 | } |
michael@0 | 8373 | if(sIter == NULL || tIter == NULL || coll == NULL) { |
michael@0 | 8374 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 8375 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
michael@0 | 8376 | return UCOL_EQUAL; |
michael@0 | 8377 | } |
michael@0 | 8378 | |
michael@0 | 8379 | UCollationResult result = UCOL_EQUAL; |
michael@0 | 8380 | |
michael@0 | 8381 | // Preparing the context objects for iterating over strings |
michael@0 | 8382 | collIterate sColl, tColl; |
michael@0 | 8383 | IInit_collIterate(coll, NULL, -1, &sColl, status); |
michael@0 | 8384 | IInit_collIterate(coll, NULL, -1, &tColl, status); |
michael@0 | 8385 | if(U_FAILURE(*status)) { |
michael@0 | 8386 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) |
michael@0 | 8387 | return UCOL_EQUAL; |
michael@0 | 8388 | } |
michael@0 | 8389 | // The division for the array length may truncate the array size to |
michael@0 | 8390 | // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high |
michael@0 | 8391 | // for all platforms anyway. |
michael@0 | 8392 | UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 8393 | UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; |
michael@0 | 8394 | UNormIterator *sNormIter = NULL, *tNormIter = NULL; |
michael@0 | 8395 | |
michael@0 | 8396 | sColl.iterator = sIter; |
michael@0 | 8397 | sColl.flags |= UCOL_USE_ITERATOR; |
michael@0 | 8398 | tColl.flags |= UCOL_USE_ITERATOR; |
michael@0 | 8399 | tColl.iterator = tIter; |
michael@0 | 8400 | |
michael@0 | 8401 | if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { |
michael@0 | 8402 | sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); |
michael@0 | 8403 | sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); |
michael@0 | 8404 | sColl.flags &= ~UCOL_ITER_NORM; |
michael@0 | 8405 | |
michael@0 | 8406 | tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); |
michael@0 | 8407 | tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); |
michael@0 | 8408 | tColl.flags &= ~UCOL_ITER_NORM; |
michael@0 | 8409 | } |
michael@0 | 8410 | |
michael@0 | 8411 | UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; |
michael@0 | 8412 | |
michael@0 | 8413 | while((sChar = sColl.iterator->next(sColl.iterator)) == |
michael@0 | 8414 | (tChar = tColl.iterator->next(tColl.iterator))) { |
michael@0 | 8415 | if(sChar == U_SENTINEL) { |
michael@0 | 8416 | result = UCOL_EQUAL; |
michael@0 | 8417 | goto end_compare; |
michael@0 | 8418 | } |
michael@0 | 8419 | } |
michael@0 | 8420 | |
michael@0 | 8421 | if(sChar == U_SENTINEL) { |
michael@0 | 8422 | tChar = tColl.iterator->previous(tColl.iterator); |
michael@0 | 8423 | } |
michael@0 | 8424 | |
michael@0 | 8425 | if(tChar == U_SENTINEL) { |
michael@0 | 8426 | sChar = sColl.iterator->previous(sColl.iterator); |
michael@0 | 8427 | } |
michael@0 | 8428 | |
michael@0 | 8429 | sChar = sColl.iterator->previous(sColl.iterator); |
michael@0 | 8430 | tChar = tColl.iterator->previous(tColl.iterator); |
michael@0 | 8431 | |
michael@0 | 8432 | if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) |
michael@0 | 8433 | { |
michael@0 | 8434 | // We are stopped in the middle of a contraction. |
michael@0 | 8435 | // Scan backwards through the == part of the string looking for the start of the contraction. |
michael@0 | 8436 | // It doesn't matter which string we scan, since they are the same in this region. |
michael@0 | 8437 | do |
michael@0 | 8438 | { |
michael@0 | 8439 | sChar = sColl.iterator->previous(sColl.iterator); |
michael@0 | 8440 | tChar = tColl.iterator->previous(tColl.iterator); |
michael@0 | 8441 | } |
michael@0 | 8442 | while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); |
michael@0 | 8443 | } |
michael@0 | 8444 | |
michael@0 | 8445 | |
michael@0 | 8446 | if(U_SUCCESS(*status)) { |
michael@0 | 8447 | result = ucol_strcollRegular(&sColl, &tColl, status); |
michael@0 | 8448 | } |
michael@0 | 8449 | |
michael@0 | 8450 | end_compare: |
michael@0 | 8451 | if(sNormIter || tNormIter) { |
michael@0 | 8452 | unorm_closeIter(sNormIter); |
michael@0 | 8453 | unorm_closeIter(tNormIter); |
michael@0 | 8454 | } |
michael@0 | 8455 | |
michael@0 | 8456 | UTRACE_EXIT_VALUE_STATUS(result, *status) |
michael@0 | 8457 | return result; |
michael@0 | 8458 | } |
michael@0 | 8459 | |
michael@0 | 8460 | |
michael@0 | 8461 | /* */ |
michael@0 | 8462 | /* ucol_strcoll Main public API string comparison function */ |
michael@0 | 8463 | /* */ |
michael@0 | 8464 | U_CAPI UCollationResult U_EXPORT2 |
michael@0 | 8465 | ucol_strcoll( const UCollator *coll, |
michael@0 | 8466 | const UChar *source, |
michael@0 | 8467 | int32_t sourceLength, |
michael@0 | 8468 | const UChar *target, |
michael@0 | 8469 | int32_t targetLength) |
michael@0 | 8470 | { |
michael@0 | 8471 | U_ALIGN_CODE(16); |
michael@0 | 8472 | |
michael@0 | 8473 | UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); |
michael@0 | 8474 | if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
michael@0 | 8475 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); |
michael@0 | 8476 | UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); |
michael@0 | 8477 | UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); |
michael@0 | 8478 | } |
michael@0 | 8479 | |
michael@0 | 8480 | if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { |
michael@0 | 8481 | // do not crash, but return. Should have |
michael@0 | 8482 | // status argument to return error. |
michael@0 | 8483 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
michael@0 | 8484 | return UCOL_EQUAL; |
michael@0 | 8485 | } |
michael@0 | 8486 | |
michael@0 | 8487 | /* Quick check if source and target are same strings. */ |
michael@0 | 8488 | /* They should either both be NULL terminated or the explicit length should be set on both. */ |
michael@0 | 8489 | if (source==target && sourceLength==targetLength) { |
michael@0 | 8490 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
michael@0 | 8491 | return UCOL_EQUAL; |
michael@0 | 8492 | } |
michael@0 | 8493 | |
michael@0 | 8494 | if(coll->delegate != NULL) { |
michael@0 | 8495 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 8496 | return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status); |
michael@0 | 8497 | } |
michael@0 | 8498 | |
michael@0 | 8499 | /* Scan the strings. Find: */ |
michael@0 | 8500 | /* The length of any leading portion that is equal */ |
michael@0 | 8501 | /* Whether they are exactly equal. (in which case we just return) */ |
michael@0 | 8502 | const UChar *pSrc = source; |
michael@0 | 8503 | const UChar *pTarg = target; |
michael@0 | 8504 | int32_t equalLength; |
michael@0 | 8505 | |
michael@0 | 8506 | if (sourceLength == -1 && targetLength == -1) { |
michael@0 | 8507 | // Both strings are null terminated. |
michael@0 | 8508 | // Scan through any leading equal portion. |
michael@0 | 8509 | while (*pSrc == *pTarg && *pSrc != 0) { |
michael@0 | 8510 | pSrc++; |
michael@0 | 8511 | pTarg++; |
michael@0 | 8512 | } |
michael@0 | 8513 | if (*pSrc == 0 && *pTarg == 0) { |
michael@0 | 8514 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
michael@0 | 8515 | return UCOL_EQUAL; |
michael@0 | 8516 | } |
michael@0 | 8517 | equalLength = (int32_t)(pSrc - source); |
michael@0 | 8518 | } |
michael@0 | 8519 | else |
michael@0 | 8520 | { |
michael@0 | 8521 | // One or both strings has an explicit length. |
michael@0 | 8522 | const UChar *pSrcEnd = source + sourceLength; |
michael@0 | 8523 | const UChar *pTargEnd = target + targetLength; |
michael@0 | 8524 | |
michael@0 | 8525 | // Scan while the strings are bitwise ==, or until one is exhausted. |
michael@0 | 8526 | for (;;) { |
michael@0 | 8527 | if (pSrc == pSrcEnd || pTarg == pTargEnd) { |
michael@0 | 8528 | break; |
michael@0 | 8529 | } |
michael@0 | 8530 | if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { |
michael@0 | 8531 | break; |
michael@0 | 8532 | } |
michael@0 | 8533 | if (*pSrc != *pTarg) { |
michael@0 | 8534 | break; |
michael@0 | 8535 | } |
michael@0 | 8536 | pSrc++; |
michael@0 | 8537 | pTarg++; |
michael@0 | 8538 | } |
michael@0 | 8539 | equalLength = (int32_t)(pSrc - source); |
michael@0 | 8540 | |
michael@0 | 8541 | // If we made it all the way through both strings, we are done. They are == |
michael@0 | 8542 | if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */ |
michael@0 | 8543 | (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ |
michael@0 | 8544 | { |
michael@0 | 8545 | UTRACE_EXIT_VALUE(UCOL_EQUAL); |
michael@0 | 8546 | return UCOL_EQUAL; |
michael@0 | 8547 | } |
michael@0 | 8548 | } |
michael@0 | 8549 | if (equalLength > 0) { |
michael@0 | 8550 | /* There is an identical portion at the beginning of the two strings. */ |
michael@0 | 8551 | /* If the identical portion ends within a contraction or a comibining */ |
michael@0 | 8552 | /* character sequence, back up to the start of that sequence. */ |
michael@0 | 8553 | |
michael@0 | 8554 | // These values should already be set by the code above. |
michael@0 | 8555 | //pSrc = source + equalLength; /* point to the first differing chars */ |
michael@0 | 8556 | //pTarg = target + equalLength; |
michael@0 | 8557 | if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || |
michael@0 | 8558 | (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) |
michael@0 | 8559 | { |
michael@0 | 8560 | // We are stopped in the middle of a contraction. |
michael@0 | 8561 | // Scan backwards through the == part of the string looking for the start of the contraction. |
michael@0 | 8562 | // It doesn't matter which string we scan, since they are the same in this region. |
michael@0 | 8563 | do |
michael@0 | 8564 | { |
michael@0 | 8565 | equalLength--; |
michael@0 | 8566 | pSrc--; |
michael@0 | 8567 | } |
michael@0 | 8568 | while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); |
michael@0 | 8569 | } |
michael@0 | 8570 | |
michael@0 | 8571 | source += equalLength; |
michael@0 | 8572 | target += equalLength; |
michael@0 | 8573 | if (sourceLength > 0) { |
michael@0 | 8574 | sourceLength -= equalLength; |
michael@0 | 8575 | } |
michael@0 | 8576 | if (targetLength > 0) { |
michael@0 | 8577 | targetLength -= equalLength; |
michael@0 | 8578 | } |
michael@0 | 8579 | } |
michael@0 | 8580 | |
michael@0 | 8581 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 8582 | UCollationResult returnVal; |
michael@0 | 8583 | if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { |
michael@0 | 8584 | returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); |
michael@0 | 8585 | } else { |
michael@0 | 8586 | returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); |
michael@0 | 8587 | } |
michael@0 | 8588 | UTRACE_EXIT_VALUE(returnVal); |
michael@0 | 8589 | return returnVal; |
michael@0 | 8590 | } |
michael@0 | 8591 | |
michael@0 | 8592 | U_CAPI UCollationResult U_EXPORT2 |
michael@0 | 8593 | ucol_strcollUTF8( |
michael@0 | 8594 | const UCollator *coll, |
michael@0 | 8595 | const char *source, |
michael@0 | 8596 | int32_t sourceLength, |
michael@0 | 8597 | const char *target, |
michael@0 | 8598 | int32_t targetLength, |
michael@0 | 8599 | UErrorCode *status) |
michael@0 | 8600 | { |
michael@0 | 8601 | U_ALIGN_CODE(16); |
michael@0 | 8602 | |
michael@0 | 8603 | UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); |
michael@0 | 8604 | if (UTRACE_LEVEL(UTRACE_VERBOSE)) { |
michael@0 | 8605 | UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); |
michael@0 | 8606 | UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength); |
michael@0 | 8607 | UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength); |
michael@0 | 8608 | } |
michael@0 | 8609 | |
michael@0 | 8610 | if (U_FAILURE(*status)) { |
michael@0 | 8611 | /* do nothing */ |
michael@0 | 8612 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
michael@0 | 8613 | return UCOL_EQUAL; |
michael@0 | 8614 | } |
michael@0 | 8615 | |
michael@0 | 8616 | if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { |
michael@0 | 8617 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 8618 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
michael@0 | 8619 | return UCOL_EQUAL; |
michael@0 | 8620 | } |
michael@0 | 8621 | |
michael@0 | 8622 | /* Quick check if source and target are same strings. */ |
michael@0 | 8623 | /* They should either both be NULL terminated or the explicit length should be set on both. */ |
michael@0 | 8624 | if (source==target && sourceLength==targetLength) { |
michael@0 | 8625 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
michael@0 | 8626 | return UCOL_EQUAL; |
michael@0 | 8627 | } |
michael@0 | 8628 | |
michael@0 | 8629 | if(coll->delegate != NULL) { |
michael@0 | 8630 | return ((const Collator*)coll->delegate)->compareUTF8( |
michael@0 | 8631 | StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength), |
michael@0 | 8632 | StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength), |
michael@0 | 8633 | *status); |
michael@0 | 8634 | } |
michael@0 | 8635 | |
michael@0 | 8636 | /* Scan the strings. Find: */ |
michael@0 | 8637 | /* The length of any leading portion that is equal */ |
michael@0 | 8638 | /* Whether they are exactly equal. (in which case we just return) */ |
michael@0 | 8639 | const char *pSrc = source; |
michael@0 | 8640 | const char *pTarg = target; |
michael@0 | 8641 | UBool bSrcLimit = FALSE; |
michael@0 | 8642 | UBool bTargLimit = FALSE; |
michael@0 | 8643 | |
michael@0 | 8644 | if (sourceLength == -1 && targetLength == -1) { |
michael@0 | 8645 | // Both strings are null terminated. |
michael@0 | 8646 | // Scan through any leading equal portion. |
michael@0 | 8647 | while (*pSrc == *pTarg && *pSrc != 0) { |
michael@0 | 8648 | pSrc++; |
michael@0 | 8649 | pTarg++; |
michael@0 | 8650 | } |
michael@0 | 8651 | if (*pSrc == 0 && *pTarg == 0) { |
michael@0 | 8652 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
michael@0 | 8653 | return UCOL_EQUAL; |
michael@0 | 8654 | } |
michael@0 | 8655 | bSrcLimit = (*pSrc == 0); |
michael@0 | 8656 | bTargLimit = (*pTarg == 0); |
michael@0 | 8657 | } |
michael@0 | 8658 | else |
michael@0 | 8659 | { |
michael@0 | 8660 | // One or both strings has an explicit length. |
michael@0 | 8661 | const char *pSrcEnd = source + sourceLength; |
michael@0 | 8662 | const char *pTargEnd = target + targetLength; |
michael@0 | 8663 | |
michael@0 | 8664 | // Scan while the strings are bitwise ==, or until one is exhausted. |
michael@0 | 8665 | for (;;) { |
michael@0 | 8666 | if (pSrc == pSrcEnd || pTarg == pTargEnd) { |
michael@0 | 8667 | break; |
michael@0 | 8668 | } |
michael@0 | 8669 | if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { |
michael@0 | 8670 | break; |
michael@0 | 8671 | } |
michael@0 | 8672 | if (*pSrc != *pTarg) { |
michael@0 | 8673 | break; |
michael@0 | 8674 | } |
michael@0 | 8675 | pSrc++; |
michael@0 | 8676 | pTarg++; |
michael@0 | 8677 | } |
michael@0 | 8678 | bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); |
michael@0 | 8679 | bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); |
michael@0 | 8680 | |
michael@0 | 8681 | // If we made it all the way through both strings, we are done. They are == |
michael@0 | 8682 | if (bSrcLimit && /* At end of src string, however it was specified. */ |
michael@0 | 8683 | bTargLimit) /* and also at end of dest string */ |
michael@0 | 8684 | { |
michael@0 | 8685 | UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); |
michael@0 | 8686 | return UCOL_EQUAL; |
michael@0 | 8687 | } |
michael@0 | 8688 | } |
michael@0 | 8689 | |
michael@0 | 8690 | U_ASSERT(!(bSrcLimit && bTargLimit)); |
michael@0 | 8691 | |
michael@0 | 8692 | int32_t equalLength = pSrc - source; |
michael@0 | 8693 | UBool bSawNonLatin1 = FALSE; |
michael@0 | 8694 | |
michael@0 | 8695 | if (equalLength > 0) { |
michael@0 | 8696 | // Align position to the start of UTF-8 code point. |
michael@0 | 8697 | if (bTargLimit) { |
michael@0 | 8698 | U8_SET_CP_START((const uint8_t*)source, 0, equalLength); |
michael@0 | 8699 | } else { |
michael@0 | 8700 | U8_SET_CP_START((const uint8_t*)target, 0, equalLength); |
michael@0 | 8701 | } |
michael@0 | 8702 | pSrc = source + equalLength; |
michael@0 | 8703 | pTarg = target + equalLength; |
michael@0 | 8704 | } |
michael@0 | 8705 | |
michael@0 | 8706 | if (equalLength > 0) { |
michael@0 | 8707 | /* There is an identical portion at the beginning of the two strings. */ |
michael@0 | 8708 | /* If the identical portion ends within a contraction or a comibining */ |
michael@0 | 8709 | /* character sequence, back up to the start of that sequence. */ |
michael@0 | 8710 | UBool bUnsafeCP = FALSE; |
michael@0 | 8711 | UChar32 uc32 = -1; |
michael@0 | 8712 | |
michael@0 | 8713 | if (!bSrcLimit) { |
michael@0 | 8714 | U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32); |
michael@0 | 8715 | if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { |
michael@0 | 8716 | bUnsafeCP = TRUE; |
michael@0 | 8717 | } |
michael@0 | 8718 | bSawNonLatin1 |= (uc32 > 0xff); |
michael@0 | 8719 | } |
michael@0 | 8720 | if (!bTargLimit) { |
michael@0 | 8721 | U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32); |
michael@0 | 8722 | if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { |
michael@0 | 8723 | bUnsafeCP = TRUE; |
michael@0 | 8724 | } |
michael@0 | 8725 | bSawNonLatin1 |= (uc32 > 0xff); |
michael@0 | 8726 | } |
michael@0 | 8727 | |
michael@0 | 8728 | if (bUnsafeCP) { |
michael@0 | 8729 | while (equalLength > 0) { |
michael@0 | 8730 | // We are stopped in the middle of a contraction. |
michael@0 | 8731 | // Scan backwards through the == part of the string looking for the start of the contraction. |
michael@0 | 8732 | // It doesn't matter which string we scan, since they are the same in this region. |
michael@0 | 8733 | U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); |
michael@0 | 8734 | bSawNonLatin1 |= (uc32 > 0xff); |
michael@0 | 8735 | if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { |
michael@0 | 8736 | break; |
michael@0 | 8737 | } |
michael@0 | 8738 | } |
michael@0 | 8739 | } |
michael@0 | 8740 | source += equalLength; |
michael@0 | 8741 | target += equalLength; |
michael@0 | 8742 | if (sourceLength > 0) { |
michael@0 | 8743 | sourceLength -= equalLength; |
michael@0 | 8744 | } |
michael@0 | 8745 | if (targetLength > 0) { |
michael@0 | 8746 | targetLength -= equalLength; |
michael@0 | 8747 | } |
michael@0 | 8748 | } else { |
michael@0 | 8749 | // Lead byte of Latin 1 character is 0x00 - 0xC3 |
michael@0 | 8750 | bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3); |
michael@0 | 8751 | bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3); |
michael@0 | 8752 | } |
michael@0 | 8753 | |
michael@0 | 8754 | UCollationResult returnVal; |
michael@0 | 8755 | |
michael@0 | 8756 | if(!coll->latinOneUse || bSawNonLatin1) { |
michael@0 | 8757 | returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status); |
michael@0 | 8758 | } else { |
michael@0 | 8759 | returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status); |
michael@0 | 8760 | } |
michael@0 | 8761 | UTRACE_EXIT_VALUE_STATUS(returnVal, *status); |
michael@0 | 8762 | return returnVal; |
michael@0 | 8763 | } |
michael@0 | 8764 | |
michael@0 | 8765 | |
michael@0 | 8766 | /* convenience function for comparing strings */ |
michael@0 | 8767 | U_CAPI UBool U_EXPORT2 |
michael@0 | 8768 | ucol_greater( const UCollator *coll, |
michael@0 | 8769 | const UChar *source, |
michael@0 | 8770 | int32_t sourceLength, |
michael@0 | 8771 | const UChar *target, |
michael@0 | 8772 | int32_t targetLength) |
michael@0 | 8773 | { |
michael@0 | 8774 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
michael@0 | 8775 | == UCOL_GREATER); |
michael@0 | 8776 | } |
michael@0 | 8777 | |
michael@0 | 8778 | /* convenience function for comparing strings */ |
michael@0 | 8779 | U_CAPI UBool U_EXPORT2 |
michael@0 | 8780 | ucol_greaterOrEqual( const UCollator *coll, |
michael@0 | 8781 | const UChar *source, |
michael@0 | 8782 | int32_t sourceLength, |
michael@0 | 8783 | const UChar *target, |
michael@0 | 8784 | int32_t targetLength) |
michael@0 | 8785 | { |
michael@0 | 8786 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
michael@0 | 8787 | != UCOL_LESS); |
michael@0 | 8788 | } |
michael@0 | 8789 | |
michael@0 | 8790 | /* convenience function for comparing strings */ |
michael@0 | 8791 | U_CAPI UBool U_EXPORT2 |
michael@0 | 8792 | ucol_equal( const UCollator *coll, |
michael@0 | 8793 | const UChar *source, |
michael@0 | 8794 | int32_t sourceLength, |
michael@0 | 8795 | const UChar *target, |
michael@0 | 8796 | int32_t targetLength) |
michael@0 | 8797 | { |
michael@0 | 8798 | return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
michael@0 | 8799 | == UCOL_EQUAL); |
michael@0 | 8800 | } |
michael@0 | 8801 | |
michael@0 | 8802 | U_CAPI void U_EXPORT2 |
michael@0 | 8803 | ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { |
michael@0 | 8804 | if(coll && coll->UCA) { |
michael@0 | 8805 | uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); |
michael@0 | 8806 | } |
michael@0 | 8807 | } |
michael@0 | 8808 | |
michael@0 | 8809 | #endif /* #if !UCONFIG_NO_COLLATION */ |