1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,8809 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 1996-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: ucol.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* Modification history 1.15 +* Date Name Comments 1.16 +* 1996-1999 various members of ICU team maintained C API for collation framework 1.17 +* 02/16/2001 synwee Added internal method getPrevSpecialCE 1.18 +* 03/01/2001 synwee Added maxexpansion functionality. 1.19 +* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 1.20 +*/ 1.21 + 1.22 +#include "unicode/utypes.h" 1.23 + 1.24 +#if !UCONFIG_NO_COLLATION 1.25 + 1.26 +#include "unicode/bytestream.h" 1.27 +#include "unicode/coleitr.h" 1.28 +#include "unicode/unorm.h" 1.29 +#include "unicode/udata.h" 1.30 +#include "unicode/ustring.h" 1.31 +#include "unicode/utf8.h" 1.32 + 1.33 +#include "ucol_imp.h" 1.34 +#include "bocsu.h" 1.35 + 1.36 +#include "normalizer2impl.h" 1.37 +#include "unorm_it.h" 1.38 +#include "umutex.h" 1.39 +#include "cmemory.h" 1.40 +#include "ucln_in.h" 1.41 +#include "cstring.h" 1.42 +#include "utracimp.h" 1.43 +#include "putilimp.h" 1.44 +#include "uassert.h" 1.45 +#include "unicode/coll.h" 1.46 + 1.47 +#ifdef UCOL_DEBUG 1.48 +#include <stdio.h> 1.49 +#endif 1.50 + 1.51 +U_NAMESPACE_USE 1.52 + 1.53 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.54 + 1.55 +#define LAST_BYTE_MASK_ 0xFF 1.56 +#define SECOND_LAST_BYTE_SHIFT_ 8 1.57 + 1.58 +#define ZERO_CC_LIMIT_ 0xC0 1.59 + 1.60 +// These are static pointers to the NFC/NFD implementation instance. 
1.61 +// Each of them is always the same between calls to u_cleanup 1.62 +// and therefore writing to it is not synchronized. 1.63 +// They are cleaned in ucol_cleanup 1.64 +static const Normalizer2 *g_nfd = NULL; 1.65 +static const Normalizer2Impl *g_nfcImpl = NULL; 1.66 + 1.67 +// These are values from UCA required for 1.68 +// implicit generation and supressing sort key compression 1.69 +// they should regularly be in the UCA, but if one 1.70 +// is running without UCA, it could be a problem 1.71 +static const int32_t maxRegularPrimary = 0x7A; 1.72 +static const int32_t minImplicitPrimary = 0xE0; 1.73 +static const int32_t maxImplicitPrimary = 0xE4; 1.74 + 1.75 +U_CDECL_BEGIN 1.76 +static UBool U_CALLCONV 1.77 +ucol_cleanup(void) 1.78 +{ 1.79 + g_nfd = NULL; 1.80 + g_nfcImpl = NULL; 1.81 + return TRUE; 1.82 +} 1.83 + 1.84 +static int32_t U_CALLCONV 1.85 +_getFoldingOffset(uint32_t data) { 1.86 + return (int32_t)(data&0xFFFFFF); 1.87 +} 1.88 + 1.89 +U_CDECL_END 1.90 + 1.91 +static inline 1.92 +UBool initializeNFD(UErrorCode *status) { 1.93 + if (g_nfd != NULL) { 1.94 + return TRUE; 1.95 + } else { 1.96 + // The result is constant, until the library is reloaded. 1.97 + g_nfd = Normalizer2Factory::getNFDInstance(*status); 1.98 + ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 1.99 + return U_SUCCESS(*status); 1.100 + } 1.101 +} 1.102 + 1.103 +// init FCD data 1.104 +static inline 1.105 +UBool initializeFCD(UErrorCode *status) { 1.106 + if (g_nfcImpl != NULL) { 1.107 + return TRUE; 1.108 + } else { 1.109 + // The result is constant, until the library is reloaded. 1.110 + g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.111 + // Note: Alternatively, we could also store this pointer in each collIterate struct, 1.112 + // same as Normalizer2Factory::getImpl(collIterate->nfd). 
1.113 + ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 1.114 + return U_SUCCESS(*status); 1.115 + } 1.116 +} 1.117 + 1.118 +static 1.119 +inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 1.120 + int32_t sourceLen, collIterate *s, 1.121 + UErrorCode *status) 1.122 +{ 1.123 + (s)->string = (s)->pos = sourceString; 1.124 + (s)->origFlags = 0; 1.125 + (s)->flags = 0; 1.126 + if (sourceLen >= 0) { 1.127 + s->flags |= UCOL_ITER_HASLEN; 1.128 + (s)->endp = (UChar *)sourceString+sourceLen; 1.129 + } 1.130 + else { 1.131 + /* change to enable easier checking for end of string for fcdpositon */ 1.132 + (s)->endp = NULL; 1.133 + } 1.134 + (s)->extendCEs = NULL; 1.135 + (s)->extendCEsSize = 0; 1.136 + (s)->CEpos = (s)->toReturn = (s)->CEs; 1.137 + (s)->offsetBuffer = NULL; 1.138 + (s)->offsetBufferSize = 0; 1.139 + (s)->offsetReturn = (s)->offsetStore = NULL; 1.140 + (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 1.141 + (s)->coll = (collator); 1.142 + if (initializeNFD(status)) { 1.143 + (s)->nfd = g_nfd; 1.144 + } else { 1.145 + return; 1.146 + } 1.147 + (s)->fcdPosition = 0; 1.148 + if(collator->normalizationMode == UCOL_ON) { 1.149 + (s)->flags |= UCOL_ITER_NORM; 1.150 + } 1.151 + if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 1.152 + (s)->flags |= UCOL_HIRAGANA_Q; 1.153 + } 1.154 + (s)->iterator = NULL; 1.155 + //(s)->iteratorIndex = 0; 1.156 +} 1.157 + 1.158 +U_CAPI void U_EXPORT2 1.159 +uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 1.160 + int32_t sourceLen, collIterate *s, 1.161 + UErrorCode *status) { 1.162 + /* Out-of-line version for use from other files. 
*/ 1.163 + IInit_collIterate(collator, sourceString, sourceLen, s, status); 1.164 +} 1.165 + 1.166 +U_CAPI collIterate * U_EXPORT2 1.167 +uprv_new_collIterate(UErrorCode *status) { 1.168 + if(U_FAILURE(*status)) { 1.169 + return NULL; 1.170 + } 1.171 + collIterate *s = new collIterate; 1.172 + if(s == NULL) { 1.173 + *status = U_MEMORY_ALLOCATION_ERROR; 1.174 + return NULL; 1.175 + } 1.176 + return s; 1.177 +} 1.178 + 1.179 +U_CAPI void U_EXPORT2 1.180 +uprv_delete_collIterate(collIterate *s) { 1.181 + delete s; 1.182 +} 1.183 + 1.184 +U_CAPI UBool U_EXPORT2 1.185 +uprv_collIterateAtEnd(collIterate *s) { 1.186 + return s == NULL || s->pos == s->endp; 1.187 +} 1.188 + 1.189 +/** 1.190 +* Backup the state of the collIterate struct data 1.191 +* @param data collIterate to backup 1.192 +* @param backup storage 1.193 +*/ 1.194 +static 1.195 +inline void backupState(const collIterate *data, collIterateState *backup) 1.196 +{ 1.197 + backup->fcdPosition = data->fcdPosition; 1.198 + backup->flags = data->flags; 1.199 + backup->origFlags = data->origFlags; 1.200 + backup->pos = data->pos; 1.201 + backup->bufferaddress = data->writableBuffer.getBuffer(); 1.202 + backup->buffersize = data->writableBuffer.length(); 1.203 + backup->iteratorMove = 0; 1.204 + backup->iteratorIndex = 0; 1.205 + if(data->iterator != NULL) { 1.206 + //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 1.207 + backup->iteratorIndex = data->iterator->getState(data->iterator); 1.208 + // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 1.209 + if(backup->iteratorIndex == UITER_NO_STATE) { 1.210 + while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 1.211 + backup->iteratorMove++; 1.212 + data->iterator->move(data->iterator, -1, UITER_CURRENT); 1.213 + } 1.214 + data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 1.215 + } 1.216 + } 1.217 +} 1.218 + 1.219 +/** 1.220 +* Loads 
the state into the collIterate struct data 1.221 +* @param data collIterate to backup 1.222 +* @param backup storage 1.223 +* @param forwards boolean to indicate if forwards iteration is used, 1.224 +* false indicates backwards iteration 1.225 +*/ 1.226 +static 1.227 +inline void loadState(collIterate *data, const collIterateState *backup, 1.228 + UBool forwards) 1.229 +{ 1.230 + UErrorCode status = U_ZERO_ERROR; 1.231 + data->flags = backup->flags; 1.232 + data->origFlags = backup->origFlags; 1.233 + if(data->iterator != NULL) { 1.234 + //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 1.235 + data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 1.236 + if(backup->iteratorMove != 0) { 1.237 + data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 1.238 + } 1.239 + } 1.240 + data->pos = backup->pos; 1.241 + 1.242 + if ((data->flags & UCOL_ITER_INNORMBUF) && 1.243 + data->writableBuffer.getBuffer() != backup->bufferaddress) { 1.244 + /* 1.245 + this is when a new buffer has been reallocated and we'll have to 1.246 + calculate the new position. 1.247 + note the new buffer has to contain the contents of the old buffer. 1.248 + */ 1.249 + if (forwards) { 1.250 + data->pos = data->writableBuffer.getTerminatedBuffer() + 1.251 + (data->pos - backup->bufferaddress); 1.252 + } 1.253 + else { 1.254 + /* backwards direction */ 1.255 + int32_t temp = backup->buffersize - 1.256 + (int32_t)(data->pos - backup->bufferaddress); 1.257 + data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 1.258 + } 1.259 + } 1.260 + if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 1.261 + /* 1.262 + this is alittle tricky. 1.263 + if we are initially not in the normalization buffer, even if we 1.264 + normalize in the later stage, the data in the buffer will be 1.265 + ignored, since we skip back up to the data string. 
1.266 + however if we are already in the normalization buffer, any 1.267 + further normalization will pull data into the normalization 1.268 + buffer and modify the fcdPosition. 1.269 + since we are keeping the data in the buffer for use, the 1.270 + fcdPosition can not be reverted back. 1.271 + arrgghh.... 1.272 + */ 1.273 + data->fcdPosition = backup->fcdPosition; 1.274 + } 1.275 +} 1.276 + 1.277 +static UBool 1.278 +reallocCEs(collIterate *data, int32_t newCapacity) { 1.279 + uint32_t *oldCEs = data->extendCEs; 1.280 + if(oldCEs == NULL) { 1.281 + oldCEs = data->CEs; 1.282 + } 1.283 + int32_t length = data->CEpos - oldCEs; 1.284 + uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 1.285 + if(newCEs == NULL) { 1.286 + return FALSE; 1.287 + } 1.288 + uprv_memcpy(newCEs, oldCEs, length * 4); 1.289 + uprv_free(data->extendCEs); 1.290 + data->extendCEs = newCEs; 1.291 + data->extendCEsSize = newCapacity; 1.292 + data->CEpos = newCEs + length; 1.293 + return TRUE; 1.294 +} 1.295 + 1.296 +static UBool 1.297 +increaseCEsCapacity(collIterate *data) { 1.298 + int32_t oldCapacity; 1.299 + if(data->extendCEs != NULL) { 1.300 + oldCapacity = data->extendCEsSize; 1.301 + } else { 1.302 + oldCapacity = LENGTHOF(data->CEs); 1.303 + } 1.304 + return reallocCEs(data, 2 * oldCapacity); 1.305 +} 1.306 + 1.307 +static UBool 1.308 +ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 1.309 + int32_t oldCapacity; 1.310 + if(data->extendCEs != NULL) { 1.311 + oldCapacity = data->extendCEsSize; 1.312 + } else { 1.313 + oldCapacity = LENGTHOF(data->CEs); 1.314 + } 1.315 + if(minCapacity <= oldCapacity) { 1.316 + return TRUE; 1.317 + } 1.318 + oldCapacity *= 2; 1.319 + return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 1.320 +} 1.321 + 1.322 +void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 1.323 + if(U_FAILURE(errorCode)) { 1.324 + return; 1.325 + } 1.326 + int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 1.327 + U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); 1.328 + if(length >= offsetBufferSize) { 1.329 + int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 1.330 + int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 1.331 + if(newBuffer == NULL) { 1.332 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.333 + return; 1.334 + } 1.335 + if(length > 0) { 1.336 + uprv_memcpy(newBuffer, offsetBuffer, length * 4); 1.337 + } 1.338 + uprv_free(offsetBuffer); 1.339 + offsetBuffer = newBuffer; 1.340 + offsetStore = offsetBuffer + length; 1.341 + offsetBufferSize = newCapacity; 1.342 + } 1.343 + *offsetStore++ = offset; 1.344 +} 1.345 + 1.346 +/* 1.347 +* collIter_eos() 1.348 +* Checks for a collIterate being positioned at the end of 1.349 +* its source string. 1.350 +* 1.351 +*/ 1.352 +static 1.353 +inline UBool collIter_eos(collIterate *s) { 1.354 + if(s->flags & UCOL_USE_ITERATOR) { 1.355 + return !(s->iterator->hasNext(s->iterator)); 1.356 + } 1.357 + if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 1.358 + // Null terminated string, but not at null, so not at end. 1.359 + // Whether in main or normalization buffer doesn't matter. 1.360 + return FALSE; 1.361 + } 1.362 + 1.363 + // String with length. Can't be in normalization buffer, which is always 1.364 + // null termintated. 1.365 + if (s->flags & UCOL_ITER_HASLEN) { 1.366 + return (s->pos == s->endp); 1.367 + } 1.368 + 1.369 + // We are at a null termination, could be either normalization buffer or main string. 1.370 + if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 1.371 + // At null at end of main string. 1.372 + return TRUE; 1.373 + } 1.374 + 1.375 + // At null at end of normalization buffer. Need to check whether there there are 1.376 + // any characters left in the main buffer. 
1.377 + if(s->origFlags & UCOL_USE_ITERATOR) { 1.378 + return !(s->iterator->hasNext(s->iterator)); 1.379 + } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 1.380 + // Null terminated main string. fcdPosition is the 'return' position into main buf. 1.381 + return (*s->fcdPosition == 0); 1.382 + } 1.383 + else { 1.384 + // Main string with an end pointer. 1.385 + return s->fcdPosition == s->endp; 1.386 + } 1.387 +} 1.388 + 1.389 +/* 1.390 +* collIter_bos() 1.391 +* Checks for a collIterate being positioned at the start of 1.392 +* its source string. 1.393 +* 1.394 +*/ 1.395 +static 1.396 +inline UBool collIter_bos(collIterate *source) { 1.397 + // if we're going backwards, we need to know whether there is more in the 1.398 + // iterator, even if we are in the side buffer 1.399 + if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 1.400 + return !source->iterator->hasPrevious(source->iterator); 1.401 + } 1.402 + if (source->pos <= source->string || 1.403 + ((source->flags & UCOL_ITER_INNORMBUF) && 1.404 + *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 1.405 + return TRUE; 1.406 + } 1.407 + return FALSE; 1.408 +} 1.409 + 1.410 +/*static 1.411 +inline UBool collIter_SimpleBos(collIterate *source) { 1.412 + // if we're going backwards, we need to know whether there is more in the 1.413 + // iterator, even if we are in the side buffer 1.414 + if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 1.415 + return !source->iterator->hasPrevious(source->iterator); 1.416 + } 1.417 + if (source->pos == source->string) { 1.418 + return TRUE; 1.419 + } 1.420 + return FALSE; 1.421 +}*/ 1.422 + //return (data->pos == data->string) || 1.423 + 1.424 + 1.425 +/****************************************************************************/ 1.426 +/* Following are the open/close functions */ 1.427 +/* */ 1.428 +/****************************************************************************/ 1.429 + 1.430 +static 
UCollator* 1.431 +ucol_initFromBinary(const uint8_t *bin, int32_t length, 1.432 + const UCollator *base, 1.433 + UCollator *fillIn, 1.434 + UErrorCode *status) 1.435 +{ 1.436 + UCollator *result = fillIn; 1.437 + if(U_FAILURE(*status)) { 1.438 + return NULL; 1.439 + } 1.440 + /* 1.441 + if(base == NULL) { 1.442 + // we don't support null base yet 1.443 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.444 + return NULL; 1.445 + } 1.446 + */ 1.447 + // We need these and we could be running without UCA 1.448 + uprv_uca_initImplicitConstants(status); 1.449 + UCATableHeader *colData = (UCATableHeader *)bin; 1.450 + // do we want version check here? We're trying to figure out whether collators are compatible 1.451 + if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 1.452 + uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 1.453 + colData->version[0] != UCOL_BUILDER_VERSION) 1.454 + { 1.455 + *status = U_COLLATOR_VERSION_MISMATCH; 1.456 + return NULL; 1.457 + } 1.458 + else { 1.459 + if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 1.460 + result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 1.461 + if(U_FAILURE(*status)){ 1.462 + return NULL; 1.463 + } 1.464 + result->hasRealData = TRUE; 1.465 + } 1.466 + else { 1.467 + if(base) { 1.468 + result = ucol_initCollator(base->image, result, base, status); 1.469 + ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 1.470 + if(U_FAILURE(*status)){ 1.471 + return NULL; 1.472 + } 1.473 + result->hasRealData = FALSE; 1.474 + } 1.475 + else { 1.476 + *status = U_USELESS_COLLATOR_ERROR; 1.477 + return NULL; 1.478 + } 1.479 + } 1.480 + result->freeImageOnClose = FALSE; 1.481 + } 1.482 + result->actualLocale = NULL; 1.483 + result->validLocale = NULL; 1.484 + result->requestedLocale = NULL; 1.485 + result->rules = NULL; 1.486 + 
result->rulesLength = 0; 1.487 + result->freeRulesOnClose = FALSE; 1.488 + result->ucaRules = NULL; 1.489 + return result; 1.490 +} 1.491 + 1.492 +U_CAPI UCollator* U_EXPORT2 1.493 +ucol_openBinary(const uint8_t *bin, int32_t length, 1.494 + const UCollator *base, 1.495 + UErrorCode *status) 1.496 +{ 1.497 + return ucol_initFromBinary(bin, length, base, NULL, status); 1.498 +} 1.499 + 1.500 +U_CAPI int32_t U_EXPORT2 1.501 +ucol_cloneBinary(const UCollator *coll, 1.502 + uint8_t *buffer, int32_t capacity, 1.503 + UErrorCode *status) 1.504 +{ 1.505 + int32_t length = 0; 1.506 + if(U_FAILURE(*status)) { 1.507 + return length; 1.508 + } 1.509 + if(capacity < 0) { 1.510 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.511 + return length; 1.512 + } 1.513 + if(coll->hasRealData == TRUE) { 1.514 + length = coll->image->size; 1.515 + if(length <= capacity) { 1.516 + uprv_memcpy(buffer, coll->image, length); 1.517 + } else { 1.518 + *status = U_BUFFER_OVERFLOW_ERROR; 1.519 + } 1.520 + } else { 1.521 + length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 1.522 + if(length <= capacity) { 1.523 + /* build the UCATableHeader with minimal entries */ 1.524 + /* do not copy the header from the UCA file because its values are wrong! 
*/ 1.525 + /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 1.526 + 1.527 + /* reset everything */ 1.528 + uprv_memset(buffer, 0, length); 1.529 + 1.530 + /* set the tailoring-specific values */ 1.531 + UCATableHeader *myData = (UCATableHeader *)buffer; 1.532 + myData->size = length; 1.533 + 1.534 + /* offset for the options, the only part of the data that is present after the header */ 1.535 + myData->options = sizeof(UCATableHeader); 1.536 + 1.537 + /* need to always set the expansion value for an upper bound of the options */ 1.538 + myData->expansion = myData->options + sizeof(UColOptionSet); 1.539 + 1.540 + myData->magic = UCOL_HEADER_MAGIC; 1.541 + myData->isBigEndian = U_IS_BIG_ENDIAN; 1.542 + myData->charSetFamily = U_CHARSET_FAMILY; 1.543 + 1.544 + /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 1.545 + uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 1.546 + 1.547 + uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 1.548 + uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 1.549 + uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 1.550 + myData->jamoSpecial = coll->image->jamoSpecial; 1.551 + 1.552 + /* copy the collator options */ 1.553 + uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 1.554 + } else { 1.555 + *status = U_BUFFER_OVERFLOW_ERROR; 1.556 + } 1.557 + } 1.558 + return length; 1.559 +} 1.560 + 1.561 +U_CAPI UCollator* U_EXPORT2 1.562 +ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) 1.563 +{ 1.564 + UCollator * localCollator; 1.565 + int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); 1.566 + int32_t imageSize = 0; 1.567 + int32_t rulesSize = 0; 1.568 + int32_t rulesPadding = 0; 1.569 + int32_t defaultReorderCodesSize = 0; 1.570 + int32_t reorderCodesSize = 0; 
1.571 + uint8_t *image; 1.572 + UChar *rules; 1.573 + int32_t* defaultReorderCodes; 1.574 + int32_t* reorderCodes; 1.575 + uint8_t* leadBytePermutationTable; 1.576 + UBool imageAllocated = FALSE; 1.577 + 1.578 + if (status == NULL || U_FAILURE(*status)){ 1.579 + return NULL; 1.580 + } 1.581 + if (coll == NULL) { 1.582 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.583 + return NULL; 1.584 + } 1.585 + 1.586 + if (coll->rules && coll->freeRulesOnClose) { 1.587 + rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 1.588 + rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 1.589 + bufferSizeNeeded += rulesSize + rulesPadding; 1.590 + } 1.591 + // no padding for alignment needed from here since the next two are 4 byte quantities 1.592 + if (coll->defaultReorderCodes) { 1.593 + defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t); 1.594 + bufferSizeNeeded += defaultReorderCodesSize; 1.595 + } 1.596 + if (coll->reorderCodes) { 1.597 + reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); 1.598 + bufferSizeNeeded += reorderCodesSize; 1.599 + } 1.600 + if (coll->leadBytePermutationTable) { 1.601 + bufferSizeNeeded += 256 * sizeof(uint8_t); 1.602 + } 1.603 + 1.604 + if (pBufferSize != NULL) { 1.605 + int32_t inputSize = *pBufferSize; 1.606 + *pBufferSize = 1; 1.607 + if (inputSize == 0) { 1.608 + return NULL; // preflighting for deprecated functionality 1.609 + } 1.610 + } 1.611 + 1.612 + char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 1.613 + // Null pointer check. 
1.614 + if (stackBufferChars == NULL) { 1.615 + *status = U_MEMORY_ALLOCATION_ERROR; 1.616 + return NULL; 1.617 + } 1.618 + *status = U_SAFECLONE_ALLOCATED_WARNING; 1.619 + 1.620 + localCollator = (UCollator *)stackBufferChars; 1.621 + rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 1.622 + defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); 1.623 + reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize); 1.624 + leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; 1.625 + 1.626 + { 1.627 + UErrorCode tempStatus = U_ZERO_ERROR; 1.628 + imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 1.629 + } 1.630 + if (coll->freeImageOnClose) { 1.631 + image = (uint8_t *)uprv_malloc(imageSize); 1.632 + // Null pointer check 1.633 + if (image == NULL) { 1.634 + *status = U_MEMORY_ALLOCATION_ERROR; 1.635 + return NULL; 1.636 + } 1.637 + ucol_cloneBinary(coll, image, imageSize, status); 1.638 + imageAllocated = TRUE; 1.639 + } 1.640 + else { 1.641 + image = (uint8_t *)coll->image; 1.642 + } 1.643 + localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 1.644 + if (U_FAILURE(*status)) { 1.645 + return NULL; 1.646 + } 1.647 + 1.648 + if (coll->rules) { 1.649 + if (coll->freeRulesOnClose) { 1.650 + localCollator->rules = u_strcpy(rules, coll->rules); 1.651 + //bufferEnd += rulesSize; 1.652 + } 1.653 + else { 1.654 + localCollator->rules = coll->rules; 1.655 + } 1.656 + localCollator->freeRulesOnClose = FALSE; 1.657 + localCollator->rulesLength = coll->rulesLength; 1.658 + } 1.659 + 1.660 + // collator reordering 1.661 + if (coll->defaultReorderCodes) { 1.662 + localCollator->defaultReorderCodes = 1.663 + (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); 1.664 + localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength; 1.665 + localCollator->freeDefaultReorderCodesOnClose = 
FALSE; 1.666 + } 1.667 + if (coll->reorderCodes) { 1.668 + localCollator->reorderCodes = 1.669 + (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); 1.670 + localCollator->reorderCodesLength = coll->reorderCodesLength; 1.671 + localCollator->freeReorderCodesOnClose = FALSE; 1.672 + } 1.673 + if (coll->leadBytePermutationTable) { 1.674 + localCollator->leadBytePermutationTable = 1.675 + (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256); 1.676 + localCollator->freeLeadBytePermutationTableOnClose = FALSE; 1.677 + } 1.678 + 1.679 + int32_t i; 1.680 + for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 1.681 + ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 1.682 + } 1.683 + // zero copies of pointers 1.684 + localCollator->actualLocale = NULL; 1.685 + localCollator->validLocale = NULL; 1.686 + localCollator->requestedLocale = NULL; 1.687 + localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
1.688 + localCollator->freeOnClose = TRUE; 1.689 + localCollator->freeImageOnClose = imageAllocated; 1.690 + return localCollator; 1.691 +} 1.692 + 1.693 +U_CAPI void U_EXPORT2 1.694 +ucol_close(UCollator *coll) 1.695 +{ 1.696 + UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 1.697 + UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 1.698 + if(coll != NULL) { 1.699 + // these are always owned by each UCollator struct, 1.700 + // so we always free them 1.701 + if(coll->validLocale != NULL) { 1.702 + uprv_free(coll->validLocale); 1.703 + } 1.704 + if(coll->actualLocale != NULL) { 1.705 + uprv_free(coll->actualLocale); 1.706 + } 1.707 + if(coll->requestedLocale != NULL) { 1.708 + uprv_free(coll->requestedLocale); 1.709 + } 1.710 + if(coll->latinOneCEs != NULL) { 1.711 + uprv_free(coll->latinOneCEs); 1.712 + } 1.713 + if(coll->options != NULL && coll->freeOptionsOnClose) { 1.714 + uprv_free(coll->options); 1.715 + } 1.716 + if(coll->rules != NULL && coll->freeRulesOnClose) { 1.717 + uprv_free((UChar *)coll->rules); 1.718 + } 1.719 + if(coll->image != NULL && coll->freeImageOnClose) { 1.720 + uprv_free((UCATableHeader *)coll->image); 1.721 + } 1.722 + 1.723 + if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 1.724 + uprv_free(coll->leadBytePermutationTable); 1.725 + } 1.726 + if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) { 1.727 + uprv_free(coll->defaultReorderCodes); 1.728 + } 1.729 + if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 1.730 + uprv_free(coll->reorderCodes); 1.731 + } 1.732 + 1.733 + if(coll->delegate != NULL) { 1.734 + delete (Collator*)coll->delegate; 1.735 + } 1.736 + 1.737 + /* Here, it would be advisable to close: */ 1.738 + /* - UData for UCA (unless we stuff it in the root resb */ 1.739 + /* Again, do we need additional housekeeping... HMMM! 
*/ 1.740 + UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 1.741 + if(coll->freeOnClose){ 1.742 + /* for safeClone, if freeOnClose is FALSE, 1.743 + don't free the other instance data */ 1.744 + uprv_free(coll); 1.745 + } 1.746 + } 1.747 + UTRACE_EXIT(); 1.748 +} 1.749 + 1.750 +void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 1.751 + if(U_FAILURE(*status)) { 1.752 + return; 1.753 + } 1.754 + result->caseFirst = (UColAttributeValue)opts->caseFirst; 1.755 + result->caseLevel = (UColAttributeValue)opts->caseLevel; 1.756 + result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 1.757 + result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 1.758 + if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 1.759 + return; 1.760 + } 1.761 + result->strength = (UColAttributeValue)opts->strength; 1.762 + result->variableTopValue = opts->variableTopValue; 1.763 + result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 1.764 + result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 1.765 + result->numericCollation = (UColAttributeValue)opts->numericCollation; 1.766 + result->caseFirstisDefault = TRUE; 1.767 + result->caseLevelisDefault = TRUE; 1.768 + result->frenchCollationisDefault = TRUE; 1.769 + result->normalizationModeisDefault = TRUE; 1.770 + result->strengthisDefault = TRUE; 1.771 + result->variableTopValueisDefault = TRUE; 1.772 + result->alternateHandlingisDefault = TRUE; 1.773 + result->hiraganaQisDefault = TRUE; 1.774 + result->numericCollationisDefault = TRUE; 1.775 + 1.776 + ucol_updateInternalState(result, status); 1.777 + 1.778 + result->options = opts; 1.779 +} 1.780 + 1.781 + 1.782 +/** 1.783 +* Approximate determination if a character is at a contraction end. 1.784 +* Guaranteed to be TRUE if a character is at the end of a contraction, 1.785 +* otherwise it is not deterministic. 
1.786 +* @param c character to be determined 1.787 +* @param coll collator 1.788 +*/ 1.789 +static 1.790 +inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 1.791 + if (c < coll->minContrEndCP) { 1.792 + return FALSE; 1.793 + } 1.794 + 1.795 + int32_t hash = c; 1.796 + uint8_t htbyte; 1.797 + if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 1.798 + if (U16_IS_TRAIL(c)) { 1.799 + return TRUE; 1.800 + } 1.801 + hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 1.802 + } 1.803 + htbyte = coll->contrEndCP[hash>>3]; 1.804 + return (((htbyte >> (hash & 7)) & 1) == 1); 1.805 +} 1.806 + 1.807 + 1.808 + 1.809 +/* 1.810 +* i_getCombiningClass() 1.811 +* A fast, at least partly inline version of u_getCombiningClass() 1.812 +* This is a candidate for further optimization. Used heavily 1.813 +* in contraction processing. 1.814 +*/ 1.815 +static 1.816 +inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 1.817 + uint8_t sCC = 0; 1.818 + if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 1.819 + sCC = u_getCombiningClass(c); 1.820 + } 1.821 + return sCC; 1.822 +} 1.823 + 1.824 +UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 1.825 + UChar c; 1.826 + UCollator *result = fillIn; 1.827 + if(U_FAILURE(*status) || image == NULL) { 1.828 + return NULL; 1.829 + } 1.830 + 1.831 + if(result == NULL) { 1.832 + result = (UCollator *)uprv_malloc(sizeof(UCollator)); 1.833 + if(result == NULL) { 1.834 + *status = U_MEMORY_ALLOCATION_ERROR; 1.835 + return result; 1.836 + } 1.837 + result->freeOnClose = TRUE; 1.838 + } else { 1.839 + result->freeOnClose = FALSE; 1.840 + } 1.841 + 1.842 + result->delegate = NULL; 1.843 + 1.844 + result->image = image; 1.845 + result->mapping.getFoldingOffset = _getFoldingOffset; 1.846 + const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 1.847 + utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - 
result->image->mappingPosition, status); 1.848 + if(U_FAILURE(*status)) { 1.849 + if(result->freeOnClose == TRUE) { 1.850 + uprv_free(result); 1.851 + result = NULL; 1.852 + } 1.853 + return result; 1.854 + } 1.855 + 1.856 + result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 1.857 + result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 1.858 + result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 1.859 + result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 1.860 + result->rules = NULL; 1.861 + result->rulesLength = 0; 1.862 + result->freeRulesOnClose = FALSE; 1.863 + result->defaultReorderCodes = NULL; 1.864 + result->defaultReorderCodesLength = 0; 1.865 + result->freeDefaultReorderCodesOnClose = FALSE; 1.866 + result->reorderCodes = NULL; 1.867 + result->reorderCodesLength = 0; 1.868 + result->freeReorderCodesOnClose = FALSE; 1.869 + result->leadBytePermutationTable = NULL; 1.870 + result->freeLeadBytePermutationTableOnClose = FALSE; 1.871 + 1.872 + /* get the version info from UCATableHeader and populate the Collator struct*/ 1.873 + result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 1.874 + result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 1.875 + result->dataVersion[2] = 0; 1.876 + result->dataVersion[3] = 0; 1.877 + 1.878 + result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 1.879 + result->minUnsafeCP = 0; 1.880 + for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 1.881 + if (ucol_unsafeCP(c, result)) break; 1.882 + } 1.883 + result->minUnsafeCP = c; 1.884 + 1.885 + result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 1.886 + result->minContrEndCP = 0; 1.887 + for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 
1.888 + if (ucol_contractionEndCP(c, result)) break; 1.889 + } 1.890 + result->minContrEndCP = c; 1.891 + 1.892 + /* max expansion tables */ 1.893 + result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 1.894 + result->image->endExpansionCE); 1.895 + result->lastEndExpansionCE = result->endExpansionCE + 1.896 + result->image->endExpansionCECount - 1; 1.897 + result->expansionCESize = (uint8_t*)result->image + 1.898 + result->image->expansionCESize; 1.899 + 1.900 + 1.901 + //result->errorCode = *status; 1.902 + 1.903 + result->latinOneCEs = NULL; 1.904 + 1.905 + result->latinOneRegenTable = FALSE; 1.906 + result->latinOneFailed = FALSE; 1.907 + result->UCA = UCA; 1.908 + 1.909 + /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ 1.910 + result->ucaRules = NULL; 1.911 + result->actualLocale = NULL; 1.912 + result->validLocale = NULL; 1.913 + result->requestedLocale = NULL; 1.914 + result->hasRealData = FALSE; // real data lives in .dat file... 1.915 + result->freeImageOnClose = FALSE; 1.916 + 1.917 + /* set attributes */ 1.918 + ucol_setOptionsFromHeader( 1.919 + result, 1.920 + (UColOptionSet*)((uint8_t*)result->image+result->image->options), 1.921 + status); 1.922 + result->freeOptionsOnClose = FALSE; 1.923 + 1.924 + return result; 1.925 +} 1.926 + 1.927 +/* new Mark's code */ 1.928 + 1.929 +/** 1.930 + * For generation of Implicit CEs 1.931 + * @author Davis 1.932 + * 1.933 + * Cleaned up so that changes can be made more easily. 1.934 + * Old values: 1.935 +# First Implicit: E26A792D 1.936 +# Last Implicit: E3DC70C0 1.937 +# First CJK: E0030300 1.938 +# Last CJK: E0A9DD00 1.939 +# First CJK_A: E0A9DF00 1.940 +# Last CJK_A: E0DE3100 1.941 + */ 1.942 +/* Following is a port of Mark's code for new treatment of implicits. 1.943 + * It is positioned here, since ucol_initUCA need to initialize the 1.944 + * variables below according to the data in the fractional UCA. 
1.945 + */ 1.946 + 1.947 +/** 1.948 + * Function used to: 1.949 + * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 1.950 + * b) bump any non-CJK characters by 10FFFF. 1.951 + * The relevant blocks are: 1.952 + * A: 4E00..9FFF; CJK Unified Ideographs 1.953 + * F900..FAFF; CJK Compatibility Ideographs 1.954 + * B: 3400..4DBF; CJK Unified Ideographs Extension A 1.955 + * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 1.956 + * As long as 1.957 + * no new B characters are allocated between 4E00 and FAFF, and 1.958 + * no new A characters are outside of this range, 1.959 + * (very high probability) this simple code will work. 1.960 + * The reordered blocks are: 1.961 + * Block1 is CJK 1.962 + * Block2 is CJK_COMPAT_USED 1.963 + * Block3 is CJK_A 1.964 + * (all contiguous) 1.965 + * Any other CJK gets its normal code point 1.966 + * Any non-CJK gets +10FFFF 1.967 + * When we reorder Block1, we make sure that it is at the very start, 1.968 + * so that it will use a 3-byte form. 1.969 + * Warning: the we only pick up the compatibility characters that are 1.970 + * NOT decomposed, so that block is smaller! 
1.971 + */ 1.972 + 1.973 +// CONSTANTS 1.974 +static const UChar32 1.975 + NON_CJK_OFFSET = 0x110000, 1.976 + UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 1.977 + 1.978 +/** 1.979 + * Precomputed by initImplicitConstants() 1.980 + */ 1.981 +static int32_t 1.982 + final3Multiplier = 0, 1.983 + final4Multiplier = 0, 1.984 + final3Count = 0, 1.985 + final4Count = 0, 1.986 + medialCount = 0, 1.987 + min3Primary = 0, 1.988 + min4Primary = 0, 1.989 + max4Primary = 0, 1.990 + minTrail = 0, 1.991 + maxTrail = 0, 1.992 + max3Trail = 0, 1.993 + max4Trail = 0, 1.994 + min4Boundary = 0; 1.995 + 1.996 +static const UChar32 1.997 + // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 1.998 + // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) 1.999 + CJK_BASE = 0x4E00, 1.1000 + CJK_LIMIT = 0x9FCC+1, 1.1001 + // Unified CJK ideographs in the compatibility ideographs block. 1.1002 + CJK_COMPAT_USED_BASE = 0xFA0E, 1.1003 + CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1.1004 + // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1.1005 + // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1.1006 + CJK_A_BASE = 0x3400, 1.1007 + CJK_A_LIMIT = 0x4DB5+1, 1.1008 + // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1.1009 + // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1.1010 + CJK_B_BASE = 0x20000, 1.1011 + CJK_B_LIMIT = 0x2A6D6+1, 1.1012 + // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1.1013 + // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1.1014 + CJK_C_BASE = 0x2A700, 1.1015 + CJK_C_LIMIT = 0x2B734+1, 1.1016 + // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1.1017 + // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1.1018 + CJK_D_BASE = 0x2B740, 1.1019 + CJK_D_LIMIT = 0x2B81D+1; 1.1020 + // when adding to this list, look for all occurrences (in project) 1.1021 + // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 
1.1022 + 1.1023 +static UChar32 swapCJK(UChar32 i) { 1.1024 + if (i < CJK_A_BASE) { 1.1025 + // non-CJK 1.1026 + } else if (i < CJK_A_LIMIT) { 1.1027 + // Extension A has lower code points than the original Unihan+compat 1.1028 + // but sorts higher. 1.1029 + return i - CJK_A_BASE 1.1030 + + (CJK_LIMIT - CJK_BASE) 1.1031 + + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1.1032 + } else if (i < CJK_BASE) { 1.1033 + // non-CJK 1.1034 + } else if (i < CJK_LIMIT) { 1.1035 + return i - CJK_BASE; 1.1036 + } else if (i < CJK_COMPAT_USED_BASE) { 1.1037 + // non-CJK 1.1038 + } else if (i < CJK_COMPAT_USED_LIMIT) { 1.1039 + return i - CJK_COMPAT_USED_BASE 1.1040 + + (CJK_LIMIT - CJK_BASE); 1.1041 + } else if (i < CJK_B_BASE) { 1.1042 + // non-CJK 1.1043 + } else if (i < CJK_B_LIMIT) { 1.1044 + return i; // non-BMP-CJK 1.1045 + } else if (i < CJK_C_BASE) { 1.1046 + // non-CJK 1.1047 + } else if (i < CJK_C_LIMIT) { 1.1048 + return i; // non-BMP-CJK 1.1049 + } else if (i < CJK_D_BASE) { 1.1050 + // non-CJK 1.1051 + } else if (i < CJK_D_LIMIT) { 1.1052 + return i; // non-BMP-CJK 1.1053 + } 1.1054 + return i + NON_CJK_OFFSET; // non-CJK 1.1055 +} 1.1056 + 1.1057 +U_CAPI UChar32 U_EXPORT2 1.1058 +uprv_uca_getRawFromCodePoint(UChar32 i) { 1.1059 + return swapCJK(i)+1; 1.1060 +} 1.1061 + 1.1062 +U_CAPI UChar32 U_EXPORT2 1.1063 +uprv_uca_getCodePointFromRaw(UChar32 i) { 1.1064 + i--; 1.1065 + UChar32 result = 0; 1.1066 + if(i >= NON_CJK_OFFSET) { 1.1067 + result = i - NON_CJK_OFFSET; 1.1068 + } else if(i >= CJK_B_BASE) { 1.1069 + result = i; 1.1070 + } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1.1071 + if(i < CJK_LIMIT - CJK_BASE) { 1.1072 + result = i + CJK_BASE; 1.1073 + } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1.1074 + result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1.1075 + } else { 1.1076 + result = i + CJK_A_BASE - (CJK_LIMIT - 
CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1.1077 + } 1.1078 + } else { 1.1079 + result = -1; 1.1080 + } 1.1081 + return result; 1.1082 +} 1.1083 + 1.1084 +// GET IMPLICIT PRIMARY WEIGHTS 1.1085 +// Return value is left justified primary key 1.1086 +U_CAPI uint32_t U_EXPORT2 1.1087 +uprv_uca_getImplicitFromRaw(UChar32 cp) { 1.1088 + /* 1.1089 + if (cp < 0 || cp > UCOL_MAX_INPUT) { 1.1090 + throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 1.1091 + } 1.1092 + */ 1.1093 + int32_t last0 = cp - min4Boundary; 1.1094 + if (last0 < 0) { 1.1095 + int32_t last1 = cp / final3Count; 1.1096 + last0 = cp % final3Count; 1.1097 + 1.1098 + int32_t last2 = last1 / medialCount; 1.1099 + last1 %= medialCount; 1.1100 + 1.1101 + last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1.1102 + last1 = minTrail + last1; // offset 1.1103 + last2 = min3Primary + last2; // offset 1.1104 + /* 1.1105 + if (last2 >= min4Primary) { 1.1106 + throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1.1107 + } 1.1108 + */ 1.1109 + return (last2 << 24) + (last1 << 16) + (last0 << 8); 1.1110 + } else { 1.1111 + int32_t last1 = last0 / final4Count; 1.1112 + last0 %= final4Count; 1.1113 + 1.1114 + int32_t last2 = last1 / medialCount; 1.1115 + last1 %= medialCount; 1.1116 + 1.1117 + int32_t last3 = last2 / medialCount; 1.1118 + last2 %= medialCount; 1.1119 + 1.1120 + last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1.1121 + last1 = minTrail + last1; // offset 1.1122 + last2 = minTrail + last2; // offset 1.1123 + last3 = min4Primary + last3; // offset 1.1124 + /* 1.1125 + if (last3 > max4Primary) { 1.1126 + throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1.1127 + } 1.1128 + */ 1.1129 + return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1.1130 + } 1.1131 +} 1.1132 + 1.1133 +static 
uint32_t U_EXPORT2 1.1134 +uprv_uca_getImplicitPrimary(UChar32 cp) { 1.1135 + //fprintf(stdout, "Incoming: %04x\n", cp); 1.1136 + //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1.1137 + 1.1138 + cp = swapCJK(cp); 1.1139 + cp++; 1.1140 + // we now have a range of numbers from 0 to 21FFFF. 1.1141 + 1.1142 + //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1.1143 + //fprintf(stdout, "CJK swapped: %04x\n", cp); 1.1144 + 1.1145 + return uprv_uca_getImplicitFromRaw(cp); 1.1146 +} 1.1147 + 1.1148 +/** 1.1149 + * Converts implicit CE into raw integer ("code point") 1.1150 + * @param implicit 1.1151 + * @return -1 if illegal format 1.1152 + */ 1.1153 +U_CAPI UChar32 U_EXPORT2 1.1154 +uprv_uca_getRawFromImplicit(uint32_t implicit) { 1.1155 + UChar32 result; 1.1156 + UChar32 b3 = implicit & 0xFF; 1.1157 + UChar32 b2 = (implicit >> 8) & 0xFF; 1.1158 + UChar32 b1 = (implicit >> 16) & 0xFF; 1.1159 + UChar32 b0 = (implicit >> 24) & 0xFF; 1.1160 + 1.1161 + // simple parameter checks 1.1162 + if (b0 < min3Primary || b0 > max4Primary 1.1163 + || b1 < minTrail || b1 > maxTrail) 1.1164 + return -1; 1.1165 + // normal offsets 1.1166 + b1 -= minTrail; 1.1167 + 1.1168 + // take care of the final values, and compose 1.1169 + if (b0 < min4Primary) { 1.1170 + if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1.1171 + return -1; 1.1172 + b2 -= minTrail; 1.1173 + UChar32 remainder = b2 % final3Multiplier; 1.1174 + if (remainder != 0) 1.1175 + return -1; 1.1176 + b0 -= min3Primary; 1.1177 + b2 /= final3Multiplier; 1.1178 + result = ((b0 * medialCount) + b1) * final3Count + b2; 1.1179 + } else { 1.1180 + if (b2 < minTrail || b2 > maxTrail 1.1181 + || b3 < minTrail || b3 > max4Trail) 1.1182 + return -1; 1.1183 + b2 -= minTrail; 1.1184 + b3 -= minTrail; 1.1185 + UChar32 remainder = b3 % final4Multiplier; 1.1186 + if (remainder != 0) 1.1187 + return -1; 1.1188 + b3 /= final4Multiplier; 1.1189 + b0 -= min4Primary; 1.1190 + result = (((b0 * medialCount) + b1) * 
medialCount + b2) * final4Count + b3 + min4Boundary; 1.1191 + } 1.1192 + // final check 1.1193 + if (result < 0 || result > UCOL_MAX_INPUT) 1.1194 + return -1; 1.1195 + return result; 1.1196 +} 1.1197 + 1.1198 + 1.1199 +static inline int32_t divideAndRoundUp(int a, int b) { 1.1200 + return 1 + (a-1)/b; 1.1201 +} 1.1202 + 1.1203 +/* this function is either called from initUCA or from genUCA before 1.1204 + * doing canonical closure for the UCA. 1.1205 + */ 1.1206 + 1.1207 +/** 1.1208 + * Set up to generate implicits. 1.1209 + * Maintenance Note: this function may end up being called more than once, due 1.1210 + * to threading races during initialization. Make sure that 1.1211 + * none of the Constants is ever transiently assigned an 1.1212 + * incorrect value. 1.1213 + * @param minPrimary 1.1214 + * @param maxPrimary 1.1215 + * @param minTrail final byte 1.1216 + * @param maxTrail final byte 1.1217 + * @param gap3 the gap we leave for tailoring for 3-byte forms 1.1218 + * @param gap4 the gap we leave for tailoring for 4-byte forms 1.1219 + */ 1.1220 +static void initImplicitConstants(int minPrimary, int maxPrimary, 1.1221 + int minTrailIn, int maxTrailIn, 1.1222 + int gap3, int primaries3count, 1.1223 + UErrorCode *status) { 1.1224 + // some simple parameter checks 1.1225 + if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1.1226 + || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1.1227 + || (primaries3count < 1)) 1.1228 + { 1.1229 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.1230 + return; 1.1231 + }; 1.1232 + 1.1233 + minTrail = minTrailIn; 1.1234 + maxTrail = maxTrailIn; 1.1235 + 1.1236 + min3Primary = minPrimary; 1.1237 + max4Primary = maxPrimary; 1.1238 + // compute constants for use later. 1.1239 + // number of values we can use in trailing bytes 1.1240 + // leave room for empty values between AND above, e.g. 
if gap = 2 1.1241 + // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1.1242 + // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1.1243 + // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1.1244 + final3Multiplier = gap3 + 1; 1.1245 + final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1.1246 + max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1.1247 + 1.1248 + // medials can use full range 1.1249 + medialCount = (maxTrail - minTrail + 1); 1.1250 + // find out how many values fit in each form 1.1251 + int32_t threeByteCount = medialCount * final3Count; 1.1252 + // now determine where the 3/4 boundary is. 1.1253 + // we use 3 bytes below the boundary, and 4 above 1.1254 + int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1.1255 + int32_t primaries4count = primariesAvailable - primaries3count; 1.1256 + 1.1257 + 1.1258 + int32_t min3ByteCoverage = primaries3count * threeByteCount; 1.1259 + min4Primary = minPrimary + primaries3count; 1.1260 + min4Boundary = min3ByteCoverage; 1.1261 + // Now expand out the multiplier for the 4 bytes, and redo. 1.1262 + 1.1263 + int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1.1264 + int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1.1265 + int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1.1266 + int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1.1267 + if (gap4 < 1) { 1.1268 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.1269 + return; 1.1270 + } 1.1271 + final4Multiplier = gap4 + 1; 1.1272 + final4Count = neededPerFinalByte; 1.1273 + max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1.1274 +} 1.1275 + 1.1276 + /** 1.1277 + * Supply parameters for generating implicit CEs 1.1278 + */ 1.1279 +U_CAPI void U_EXPORT2 1.1280 +uprv_uca_initImplicitConstants(UErrorCode *status) { 1.1281 + // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 
1.1282 + //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1.1283 + initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1.1284 +} 1.1285 + 1.1286 + 1.1287 +/* collIterNormalize Incremental Normalization happens here. */ 1.1288 +/* pick up the range of chars identifed by FCD, */ 1.1289 +/* normalize it into the collIterate's writable buffer, */ 1.1290 +/* switch the collIterate's state to use the writable buffer. */ 1.1291 +/* */ 1.1292 +static 1.1293 +void collIterNormalize(collIterate *collationSource) 1.1294 +{ 1.1295 + UErrorCode status = U_ZERO_ERROR; 1.1296 + const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1.1297 + const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1.1298 + 1.1299 + collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1.1300 + collationSource->writableBuffer, 1.1301 + status); 1.1302 + if (U_FAILURE(status)) { 1.1303 +#ifdef UCOL_DEBUG 1.1304 + fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1.1305 +#endif 1.1306 + return; 1.1307 + } 1.1308 + 1.1309 + collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1.1310 + collationSource->origFlags = collationSource->flags; 1.1311 + collationSource->flags |= UCOL_ITER_INNORMBUF; 1.1312 + collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1.1313 +} 1.1314 + 1.1315 + 1.1316 +// This function takes the iterator and extracts normalized stuff up to the next boundary 1.1317 +// It is similar in the end results to the collIterNormalize, but for the cases when we 1.1318 +// use an iterator 1.1319 +/*static 1.1320 +inline void normalizeIterator(collIterate *collationSource) { 1.1321 + UErrorCode status = U_ZERO_ERROR; 1.1322 + UBool wasNormalized = FALSE; 1.1323 + //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, 
UITER_CURRENT); 1.1324 + uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1.1325 + int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1.1326 + (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1.1327 + if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1.1328 + // reallocate and terminate 1.1329 + if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1.1330 + &collationSource->writableBuffer, 1.1331 + (int32_t *)&collationSource->writableBufSize, normLen + 1, 1.1332 + 0) 1.1333 + ) { 1.1334 + #ifdef UCOL_DEBUG 1.1335 + fprintf(stderr, "normalizeIterator(), out of memory\n"); 1.1336 + #endif 1.1337 + return; 1.1338 + } 1.1339 + status = U_ZERO_ERROR; 1.1340 + //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1.1341 + collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1.1342 + normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1.1343 + (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1.1344 + } 1.1345 + // Terminate the buffer - we already checked that it is big enough 1.1346 + collationSource->writableBuffer[normLen] = 0; 1.1347 + if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1.1348 + collationSource->flags |= UCOL_ITER_ALLOCATED; 1.1349 + } 1.1350 + collationSource->pos = collationSource->writableBuffer; 1.1351 + collationSource->origFlags = collationSource->flags; 1.1352 + collationSource->flags |= UCOL_ITER_INNORMBUF; 1.1353 + collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1.1354 +}*/ 1.1355 + 1.1356 + 1.1357 +/* Incremental FCD check and normalize */ 1.1358 +/* Called from getNextCE when normalization state is suspect. 
*/ 1.1359 +/* When entering, the state is known to be this: */ 1.1360 +/* o We are working in the main buffer of the collIterate, not the side */ 1.1361 +/* writable buffer. When in the side buffer, normalization mode is always off, */ 1.1362 +/* so we won't get here. */ 1.1363 +/* o The leading combining class from the current character is 0 or */ 1.1364 +/* the trailing combining class of the previous char was zero. */ 1.1365 +/* True because the previous call to this function will have always exited */ 1.1366 +/* that way, and we get called for every char where cc might be non-zero. */ 1.1367 +static 1.1368 +inline UBool collIterFCD(collIterate *collationSource) { 1.1369 + const UChar *srcP, *endP; 1.1370 + uint8_t leadingCC; 1.1371 + uint8_t prevTrailingCC = 0; 1.1372 + uint16_t fcd; 1.1373 + UBool needNormalize = FALSE; 1.1374 + 1.1375 + srcP = collationSource->pos-1; 1.1376 + 1.1377 + if (collationSource->flags & UCOL_ITER_HASLEN) { 1.1378 + endP = collationSource->endp; 1.1379 + } else { 1.1380 + endP = NULL; 1.1381 + } 1.1382 + 1.1383 + // Get the trailing combining class of the current character. If it's zero, we are OK. 1.1384 + fcd = g_nfcImpl->nextFCD16(srcP, endP); 1.1385 + if (fcd != 0) { 1.1386 + prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1.1387 + 1.1388 + if (prevTrailingCC != 0) { 1.1389 + // The current char has a non-zero trailing CC. Scan forward until we find 1.1390 + // a char with a leading cc of zero. 1.1391 + while (endP == NULL || srcP != endP) 1.1392 + { 1.1393 + const UChar *savedSrcP = srcP; 1.1394 + 1.1395 + fcd = g_nfcImpl->nextFCD16(srcP, endP); 1.1396 + leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1.1397 + if (leadingCC == 0) { 1.1398 + srcP = savedSrcP; // Hit char that is not part of combining sequence. 1.1399 + // back up over it. (Could be surrogate pair!) 
1.1400 + break; 1.1401 + } 1.1402 + 1.1403 + if (leadingCC < prevTrailingCC) { 1.1404 + needNormalize = TRUE; 1.1405 + } 1.1406 + 1.1407 + prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1.1408 + } 1.1409 + } 1.1410 + } 1.1411 + 1.1412 + collationSource->fcdPosition = (UChar *)srcP; 1.1413 + 1.1414 + return needNormalize; 1.1415 +} 1.1416 + 1.1417 +/****************************************************************************/ 1.1418 +/* Following are the CE retrieval functions */ 1.1419 +/* */ 1.1420 +/****************************************************************************/ 1.1421 + 1.1422 +static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); 1.1423 +static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); 1.1424 + 1.1425 +/* there should be a macro version of this function in the header file */ 1.1426 +/* This is the first function that tries to fetch a collation element */ 1.1427 +/* If it's not succesfull or it encounters a more difficult situation */ 1.1428 +/* some more sofisticated and slower functions are invoked */ 1.1429 +static 1.1430 +inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1.1431 + uint32_t order = 0; 1.1432 + if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ 1.1433 + order = *(collationSource->toReturn++); /* if so, return them */ 1.1434 + if(collationSource->CEpos == collationSource->toReturn) { 1.1435 + collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; 1.1436 + } 1.1437 + return order; 1.1438 + } 1.1439 + 1.1440 + UChar ch = 0; 1.1441 + collationSource->offsetReturn = NULL; 1.1442 + 1.1443 + do { 1.1444 + for (;;) /* Loop handles case when incremental normalize switches */ 1.1445 + { /* to or from the side buffer / original string, and we */ 1.1446 + /* need to start again to get the next character. 
*/ 1.1447 + 1.1448 + if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) 1.1449 + { 1.1450 + // The source string is null terminated and we're not working from the side buffer, 1.1451 + // and we're not normalizing. This is the fast path. 1.1452 + // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) 1.1453 + ch = *collationSource->pos++; 1.1454 + if (ch != 0) { 1.1455 + break; 1.1456 + } 1.1457 + else { 1.1458 + return UCOL_NO_MORE_CES; 1.1459 + } 1.1460 + } 1.1461 + 1.1462 + if (collationSource->flags & UCOL_ITER_HASLEN) { 1.1463 + // Normal path for strings when length is specified. 1.1464 + // (We can't be in side buffer because it is always null terminated.) 1.1465 + if (collationSource->pos >= collationSource->endp) { 1.1466 + // Ran off of the end of the main source string. We're done. 1.1467 + return UCOL_NO_MORE_CES; 1.1468 + } 1.1469 + ch = *collationSource->pos++; 1.1470 + } 1.1471 + else if(collationSource->flags & UCOL_USE_ITERATOR) { 1.1472 + UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); 1.1473 + if(iterCh == U_SENTINEL) { 1.1474 + return UCOL_NO_MORE_CES; 1.1475 + } 1.1476 + ch = (UChar)iterCh; 1.1477 + } 1.1478 + else 1.1479 + { 1.1480 + // Null terminated string. 1.1481 + ch = *collationSource->pos++; 1.1482 + if (ch == 0) { 1.1483 + // Ran off end of buffer. 1.1484 + if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1.1485 + // Ran off end of main string. backing up one character. 1.1486 + collationSource->pos--; 1.1487 + return UCOL_NO_MORE_CES; 1.1488 + } 1.1489 + else 1.1490 + { 1.1491 + // Hit null in the normalize side buffer. 1.1492 + // Usually this means the end of the normalized data, 1.1493 + // except for one odd case: a null followed by combining chars, 1.1494 + // which is the case if we are at the start of the buffer. 
1.1495 + if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) { 1.1496 + break; 1.1497 + } 1.1498 + 1.1499 + // Null marked end of side buffer. 1.1500 + // Revert to the main string and 1.1501 + // loop back to top to try again to get a character. 1.1502 + collationSource->pos = collationSource->fcdPosition; 1.1503 + collationSource->flags = collationSource->origFlags; 1.1504 + continue; 1.1505 + } 1.1506 + } 1.1507 + } 1.1508 + 1.1509 + if(collationSource->flags&UCOL_HIRAGANA_Q) { 1.1510 + /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag 1.1511 + * based on whether the previous codepoint was Hiragana or Katakana. 1.1512 + */ 1.1513 + if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || 1.1514 + ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { 1.1515 + collationSource->flags |= UCOL_WAS_HIRAGANA; 1.1516 + } else { 1.1517 + collationSource->flags &= ~UCOL_WAS_HIRAGANA; 1.1518 + } 1.1519 + } 1.1520 + 1.1521 + // We've got a character. See if there's any fcd and/or normalization stuff to do. 1.1522 + // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. 1.1523 + if ((collationSource->flags & UCOL_ITER_NORM) == 0) { 1.1524 + break; 1.1525 + } 1.1526 + 1.1527 + if (collationSource->fcdPosition >= collationSource->pos) { 1.1528 + // An earlier FCD check has already covered the current character. 1.1529 + // We can go ahead and process this char. 1.1530 + break; 1.1531 + } 1.1532 + 1.1533 + if (ch < ZERO_CC_LIMIT_ ) { 1.1534 + // Fast fcd safe path. Trailing combining class == 0. This char is OK. 1.1535 + break; 1.1536 + } 1.1537 + 1.1538 + if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { 1.1539 + // We need to peek at the next character in order to tell if we are FCD 1.1540 + if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { 1.1541 + // We are at the last char of source string. 
1.1542 + // It is always OK for FCD check. 1.1543 + break; 1.1544 + } 1.1545 + 1.1546 + // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test 1.1547 + if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { 1.1548 + break; 1.1549 + } 1.1550 + } 1.1551 + 1.1552 + 1.1553 + // Need a more complete FCD check and possible normalization. 1.1554 + if (collIterFCD(collationSource)) { 1.1555 + collIterNormalize(collationSource); 1.1556 + } 1.1557 + if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1.1558 + // No normalization was needed. Go ahead and process the char we already had. 1.1559 + break; 1.1560 + } 1.1561 + 1.1562 + // Some normalization happened. Next loop iteration will pick up a char 1.1563 + // from the normalization buffer. 1.1564 + 1.1565 + } // end for (;;) 1.1566 + 1.1567 + 1.1568 + if (ch <= 0xFF) { 1.1569 + /* For latin-1 characters we never need to fall back to the UCA table */ 1.1570 + /* because all of the UCA data is replicated in the latinOneMapping array */ 1.1571 + order = coll->latinOneMapping[ch]; 1.1572 + if (order > UCOL_NOT_FOUND) { 1.1573 + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); 1.1574 + } 1.1575 + } 1.1576 + else 1.1577 + { 1.1578 + // Always use UCA for Han, Hangul 1.1579 + // (Han extension A is before main Han block) 1.1580 + // **** Han compatibility chars ?? **** 1.1581 + if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && 1.1582 + (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { 1.1583 + if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { 1.1584 + // between the two target ranges; do normal lookup 1.1585 + // **** this range is YI, Modifier tone letters, **** 1.1586 + // **** Latin-D, Syloti Nagari, Phagas-pa. **** 1.1587 + // **** Latin-D might be tailored, so we need to **** 1.1588 + // **** do the normal lookup for these guys. 
**** 1.1589 + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1.1590 + } else { 1.1591 + // in one of the target ranges; use UCA 1.1592 + order = UCOL_NOT_FOUND; 1.1593 + } 1.1594 + } else { 1.1595 + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1.1596 + } 1.1597 + 1.1598 + if(order > UCOL_NOT_FOUND) { /* if a CE is special */ 1.1599 + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ 1.1600 + } 1.1601 + 1.1602 + if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ 1.1603 + /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ 1.1604 + order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 1.1605 + 1.1606 + if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ 1.1607 + order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); 1.1608 + } 1.1609 + } 1.1610 + } 1.1611 + } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL ); 1.1612 + 1.1613 + if(order == UCOL_NOT_FOUND) { 1.1614 + order = getImplicit(ch, collationSource); 1.1615 + } 1.1616 + return order; /* return the CE */ 1.1617 +} 1.1618 + 1.1619 +/* ucol_getNextCE, out-of-line version for use from other files. */ 1.1620 +U_CAPI uint32_t U_EXPORT2 1.1621 +ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1.1622 + return ucol_IGetNextCE(coll, collationSource, status); 1.1623 +} 1.1624 + 1.1625 + 1.1626 +/** 1.1627 +* Incremental previous normalization happens here. Pick up the range of chars 1.1628 +* identifed by FCD, normalize it into the collIterate's writable buffer, 1.1629 +* switch the collIterate's state to use the writable buffer. 
1.1630 +* @param data collation iterator data 1.1631 +*/ 1.1632 +static 1.1633 +void collPrevIterNormalize(collIterate *data) 1.1634 +{ 1.1635 + UErrorCode status = U_ZERO_ERROR; 1.1636 + const UChar *pEnd = data->pos; /* End normalize + 1 */ 1.1637 + const UChar *pStart; 1.1638 + 1.1639 + /* Start normalize */ 1.1640 + if (data->fcdPosition == NULL) { 1.1641 + pStart = data->string; 1.1642 + } 1.1643 + else { 1.1644 + pStart = data->fcdPosition + 1; 1.1645 + } 1.1646 + 1.1647 + int32_t normLen = 1.1648 + data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), 1.1649 + data->writableBuffer, 1.1650 + status). 1.1651 + length(); 1.1652 + if(U_FAILURE(status)) { 1.1653 + return; 1.1654 + } 1.1655 + /* 1.1656 + this puts the null termination infront of the normalized string instead 1.1657 + of the end 1.1658 + */ 1.1659 + data->writableBuffer.insert(0, (UChar)0); 1.1660 + 1.1661 + /* 1.1662 + * The usual case at this point is that we've got a base 1.1663 + * character followed by marks that were normalized. If 1.1664 + * fcdPosition is NULL, that means that we backed up to 1.1665 + * the beginning of the string and there's no base character. 1.1666 + * 1.1667 + * Forward processing will usually normalize when it sees 1.1668 + * the first mark, so that mark will get it's natural offset 1.1669 + * and the rest will get the offset of the character following 1.1670 + * the marks. The base character will also get its natural offset. 1.1671 + * 1.1672 + * We write the offset of the base character, if there is one, 1.1673 + * followed by the offset of the first mark and then the offsets 1.1674 + * of the rest of the marks. 
1.1675 + */ 1.1676 + int32_t firstMarkOffset = 0; 1.1677 + int32_t trailOffset = (int32_t)(data->pos - data->string + 1); 1.1678 + int32_t trailCount = normLen - 1; 1.1679 + 1.1680 + if (data->fcdPosition != NULL) { 1.1681 + int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); 1.1682 + UChar baseChar = *data->fcdPosition; 1.1683 + 1.1684 + firstMarkOffset = baseOffset + 1; 1.1685 + 1.1686 + /* 1.1687 + * If the base character is the start of a contraction, forward processing 1.1688 + * will normalize the marks while checking for the contraction, which means 1.1689 + * that the offset of the first mark will the same as the other marks. 1.1690 + * 1.1691 + * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1.1692 + */ 1.1693 + if (baseChar >= 0x100) { 1.1694 + uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1.1695 + 1.1696 + if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1.1697 + baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1.1698 + } 1.1699 + 1.1700 + if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1.1701 + firstMarkOffset = trailOffset; 1.1702 + } 1.1703 + } 1.1704 + 1.1705 + data->appendOffset(baseOffset, status); 1.1706 + } 1.1707 + 1.1708 + data->appendOffset(firstMarkOffset, status); 1.1709 + 1.1710 + for (int32_t i = 0; i < trailCount; i += 1) { 1.1711 + data->appendOffset(trailOffset, status); 1.1712 + } 1.1713 + 1.1714 + data->offsetRepeatValue = trailOffset; 1.1715 + 1.1716 + data->offsetReturn = data->offsetStore - 1; 1.1717 + if (data->offsetReturn == data->offsetBuffer) { 1.1718 + data->offsetStore = data->offsetBuffer; 1.1719 + } 1.1720 + 1.1721 + data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; 1.1722 + data->origFlags = data->flags; 1.1723 + data->flags |= UCOL_ITER_INNORMBUF; 1.1724 + data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1.1725 +} 1.1726 + 1.1727 + 1.1728 +/** 1.1729 +* Incremental FCD check for previous iteration 
and normalize. Called from 1.1730 +* getPrevCE when normalization state is suspect. 1.1731 +* When entering, the state is known to be this: 1.1732 +* o We are working in the main buffer of the collIterate, not the side 1.1733 +* writable buffer. When in the side buffer, normalization mode is always 1.1734 +* off, so we won't get here. 1.1735 +* o The leading combining class from the current character is 0 or the 1.1736 +* trailing combining class of the previous char was zero. 1.1737 +* True because the previous call to this function will have always exited 1.1738 +* that way, and we get called for every char where cc might be non-zero. 1.1739 +* @param data collation iterate struct 1.1740 +* @return normalization status, TRUE for normalization to be done, FALSE 1.1741 +* otherwise 1.1742 +*/ 1.1743 +static 1.1744 +inline UBool collPrevIterFCD(collIterate *data) 1.1745 +{ 1.1746 + const UChar *src, *start; 1.1747 + uint8_t leadingCC; 1.1748 + uint8_t trailingCC = 0; 1.1749 + uint16_t fcd; 1.1750 + UBool result = FALSE; 1.1751 + 1.1752 + start = data->string; 1.1753 + src = data->pos + 1; 1.1754 + 1.1755 + /* Get the trailing combining class of the current character. */ 1.1756 + fcd = g_nfcImpl->previousFCD16(start, src); 1.1757 + 1.1758 + leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1.1759 + 1.1760 + if (leadingCC != 0) { 1.1761 + /* 1.1762 + The current char has a non-zero leading combining class. 1.1763 + Scan backward until we find a char with a trailing cc of zero. 
1.1764 + */ 1.1765 + for (;;) 1.1766 + { 1.1767 + if (start == src) { 1.1768 + data->fcdPosition = NULL; 1.1769 + return result; 1.1770 + } 1.1771 + 1.1772 + fcd = g_nfcImpl->previousFCD16(start, src); 1.1773 + 1.1774 + trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1.1775 + 1.1776 + if (trailingCC == 0) { 1.1777 + break; 1.1778 + } 1.1779 + 1.1780 + if (leadingCC < trailingCC) { 1.1781 + result = TRUE; 1.1782 + } 1.1783 + 1.1784 + leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1.1785 + } 1.1786 + } 1.1787 + 1.1788 + data->fcdPosition = (UChar *)src; 1.1789 + 1.1790 + return result; 1.1791 +} 1.1792 + 1.1793 +/** gets a code unit from the string at a given offset 1.1794 + * Handles both normal and iterative cases. 1.1795 + * No error checking - caller beware! 1.1796 + */ 1.1797 +static inline 1.1798 +UChar peekCodeUnit(collIterate *source, int32_t offset) { 1.1799 + if(source->pos != NULL) { 1.1800 + return *(source->pos + offset); 1.1801 + } else if(source->iterator != NULL) { 1.1802 + UChar32 c; 1.1803 + if(offset != 0) { 1.1804 + source->iterator->move(source->iterator, offset, UITER_CURRENT); 1.1805 + c = source->iterator->next(source->iterator); 1.1806 + source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1.1807 + } else { 1.1808 + c = source->iterator->current(source->iterator); 1.1809 + } 1.1810 + return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. 1.1811 + } else { 1.1812 + return 0xfffd; 1.1813 + } 1.1814 +} 1.1815 + 1.1816 +// Code point version. Treats the offset as a _code point_ delta. 1.1817 +// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. 1.1818 +// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. 
1.1819 +static inline 1.1820 +UChar32 peekCodePoint(collIterate *source, int32_t offset) { 1.1821 + UChar32 c; 1.1822 + if(source->pos != NULL) { 1.1823 + const UChar *p = source->pos; 1.1824 + if(offset >= 0) { 1.1825 + // Skip forward over (offset-1) code points. 1.1826 + while(--offset >= 0) { 1.1827 + if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { 1.1828 + ++p; 1.1829 + } 1.1830 + } 1.1831 + // Read the code point there. 1.1832 + c = *p++; 1.1833 + UChar trail; 1.1834 + if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { 1.1835 + c = U16_GET_SUPPLEMENTARY(c, trail); 1.1836 + } 1.1837 + } else /* offset<0 */ { 1.1838 + // Skip backward over (offset-1) code points. 1.1839 + while(++offset < 0) { 1.1840 + if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { 1.1841 + --p; 1.1842 + } 1.1843 + } 1.1844 + // Read the code point before that. 1.1845 + c = *--p; 1.1846 + UChar lead; 1.1847 + if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { 1.1848 + c = U16_GET_SUPPLEMENTARY(lead, c); 1.1849 + } 1.1850 + } 1.1851 + } else if(source->iterator != NULL) { 1.1852 + if(offset >= 0) { 1.1853 + // Skip forward over (offset-1) code points. 1.1854 + int32_t fwd = offset; 1.1855 + while(fwd-- > 0) { 1.1856 + uiter_next32(source->iterator); 1.1857 + } 1.1858 + // Read the code point there. 1.1859 + c = uiter_current32(source->iterator); 1.1860 + // Return to the starting point, skipping backward over (offset-1) code points. 1.1861 + while(offset-- > 0) { 1.1862 + uiter_previous32(source->iterator); 1.1863 + } 1.1864 + } else /* offset<0 */ { 1.1865 + // Read backward, reading offset code points, remember only the last-read one. 1.1866 + int32_t back = offset; 1.1867 + do { 1.1868 + c = uiter_previous32(source->iterator); 1.1869 + } while(++back < 0); 1.1870 + // Return to the starting position, skipping forward over offset code points. 
1.1871 + do { 1.1872 + uiter_next32(source->iterator); 1.1873 + } while(++offset < 0); 1.1874 + } 1.1875 + } else { 1.1876 + c = U_SENTINEL; 1.1877 + } 1.1878 + return c; 1.1879 +} 1.1880 + 1.1881 +/** 1.1882 +* Determines if we are at the start of the data string in the backwards 1.1883 +* collation iterator 1.1884 +* @param data collation iterator 1.1885 +* @return TRUE if we are at the start 1.1886 +*/ 1.1887 +static 1.1888 +inline UBool isAtStartPrevIterate(collIterate *data) { 1.1889 + if(data->pos == NULL && data->iterator != NULL) { 1.1890 + return !data->iterator->hasPrevious(data->iterator); 1.1891 + } 1.1892 + //return (collIter_bos(data)) || 1.1893 + return (data->pos == data->string) || 1.1894 + ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && 1.1895 + *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1.1896 +} 1.1897 + 1.1898 +static 1.1899 +inline void goBackOne(collIterate *data) { 1.1900 +# if 0 1.1901 + // somehow, it looks like we need to keep iterator synced up 1.1902 + // at all times, as above. 1.1903 + if(data->pos) { 1.1904 + data->pos--; 1.1905 + } 1.1906 + if(data->iterator) { 1.1907 + data->iterator->previous(data->iterator); 1.1908 + } 1.1909 +#endif 1.1910 + if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1.1911 + data->iterator->previous(data->iterator); 1.1912 + } 1.1913 + if(data->pos) { 1.1914 + data->pos --; 1.1915 + } 1.1916 +} 1.1917 + 1.1918 +/** 1.1919 +* Inline function that gets a simple CE. 1.1920 +* So what it does is that it will first check the expansion buffer. If the 1.1921 +* expansion buffer is not empty, ie the end pointer to the expansion buffer 1.1922 +* is different from the string pointer, we return the collation element at the 1.1923 +* return pointer and decrement it. 1.1924 +* For more complicated CEs it resorts to getComplicatedCE. 
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous collation element, or UCOL_NO_MORE_CES once the start
*         of the input has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /*
    Step the backward cursor over the offset buffer: a pending repeat is
    consumed first; otherwise offsetReturn moves back one entry, or is
    cleared (and offsetStore reset) when it already sits at the start of
    offsetBuffer.
    */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /*
    CEs are still pending in the active buffer (extendCEs when it is set,
    otherwise CEs): hand back the one just below toReturn.
    */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* That was the first entry of a buffer: bring CEpos back to it too. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos = data->fcdPosition + 1;
                        }

                        /* Re-read a character, this time from the string. */
                        continue;
                    }
                }

                /* Record whether the character just read lies in U+3040..U+309F. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                    // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /* No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Characters up to U+00FF are looked up in latinOneMapping. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa. ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys. ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are resolved by ucol_prv_getSpecialPrevCE. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Fall back to the UCA mapping when a UCA collator is attached. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
            /*
            Read another character when one in the
            UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL range produced UCOL_IGNORABLE.
            */
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still unmapped: getPrevImplicit supplies the result for ch. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/* ucol_getPrevCE, out-of-line version for use from other files.
*/ 1.2128 +U_CFUNC uint32_t U_EXPORT2 1.2129 +ucol_getPrevCE(const UCollator *coll, collIterate *data, 1.2130 + UErrorCode *status) { 1.2131 + return ucol_IGetPrevCE(coll, data, status); 1.2132 +} 1.2133 + 1.2134 + 1.2135 +/* this should be connected to special Jamo handling */ 1.2136 +U_CFUNC uint32_t U_EXPORT2 1.2137 +ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { 1.2138 + collIterate colIt; 1.2139 + IInit_collIterate(coll, &u, 1, &colIt, status); 1.2140 + if(U_FAILURE(*status)) { 1.2141 + return 0; 1.2142 + } 1.2143 + return ucol_IGetNextCE(coll, &colIt, status); 1.2144 +} 1.2145 + 1.2146 +/** 1.2147 +* Inserts the argument character into the end of the buffer pushing back the 1.2148 +* null terminator. 1.2149 +* @param data collIterate struct data 1.2150 +* @param ch character to be appended 1.2151 +* @return the position of the new addition 1.2152 +*/ 1.2153 +static 1.2154 +inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 1.2155 +{ 1.2156 + int32_t oldLength = data->writableBuffer.length(); 1.2157 + return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 1.2158 +} 1.2159 + 1.2160 +/** 1.2161 +* Inserts the argument string into the end of the buffer pushing back the 1.2162 +* null terminator. 1.2163 +* @param data collIterate struct data 1.2164 +* @param string to be appended 1.2165 +* @param length of the string to be appended 1.2166 +* @return the position of the new addition 1.2167 +*/ 1.2168 +static 1.2169 +inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 1.2170 +{ 1.2171 + int32_t oldLength = data->writableBuffer.length(); 1.2172 + return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 1.2173 +} 1.2174 + 1.2175 +/** 1.2176 +* Special normalization function for contraction in the forwards iterator. 
1.2177 +* This normalization sequence will place the current character at source->pos 1.2178 +* and its following normalized sequence into the buffer. 1.2179 +* The fcd position, pos will be changed. 1.2180 +* pos will now point to positions in the buffer. 1.2181 +* Flags will be changed accordingly. 1.2182 +* @param data collation iterator data 1.2183 +*/ 1.2184 +static 1.2185 +inline void normalizeNextContraction(collIterate *data) 1.2186 +{ 1.2187 + int32_t strsize; 1.2188 + UErrorCode status = U_ZERO_ERROR; 1.2189 + /* because the pointer points to the next character */ 1.2190 + const UChar *pStart = data->pos - 1; 1.2191 + const UChar *pEnd; 1.2192 + 1.2193 + if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 1.2194 + data->writableBuffer.setTo(*(pStart - 1)); 1.2195 + strsize = 1; 1.2196 + } 1.2197 + else { 1.2198 + strsize = data->writableBuffer.length(); 1.2199 + } 1.2200 + 1.2201 + pEnd = data->fcdPosition; 1.2202 + 1.2203 + data->writableBuffer.append( 1.2204 + data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); 1.2205 + if(U_FAILURE(status)) { 1.2206 + return; 1.2207 + } 1.2208 + 1.2209 + data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; 1.2210 + data->origFlags = data->flags; 1.2211 + data->flags |= UCOL_ITER_INNORMBUF; 1.2212 + data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1.2213 +} 1.2214 + 1.2215 +/** 1.2216 +* Contraction character management function that returns the next character 1.2217 +* for the forwards iterator. 1.2218 +* Does nothing if the next character is in buffer and not the first character 1.2219 +* in it. 1.2220 +* Else it checks next character in data string to see if it is normalizable. 1.2221 +* If it is not, the character is simply copied into the buffer, else 1.2222 +* the whole normalized substring is copied into the buffer, including the 1.2223 +* current character. 
* @param data collation element iterator data
* @return next character; (UChar)-1 is returned on the paths that detect a
*         NULL buffer position (see the review notes on those checks)
*/
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar nextch;
    UChar ch;
    // Here we need to add the iterator code. One problem is the way
    // end of string is handled. If we just return next char, it could
    // be the sentinel. Most of the cases already check for this, but we
    // need to be sure.
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
        /* if no normalization and not in buffer. */
        if(data->flags & UCOL_USE_ITERATOR) {
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos ++);
        }
    }

    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
    //normalizeIterator(data);
    //}

    UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
        data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        /* Last character of a length-delimited string: return it as is. */
        if (data->pos + 1 == data->endp) {
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
            // inside the normalization buffer, but at the end
            // (since we encountered zero). This means, in the
            // case we're using char iterator, that we need to
            // do another round of normalization.
            //if(data->origFlags & UCOL_USE_ITERATOR) {
            // we need to restore original flags,
            // otherwise, we'll lose them
            //data->flags = data->origFlags;
            //normalizeIterator(data);
            //return *(data->pos++);
            //} else {
            /*
            in writable buffer, at this point fcdPosition can not be
            pointing to the end of the data string. see contracting tag.
            */
            if(data->fcdPosition) {
                if (*(data->fcdPosition + 1) == 0 ||
                    data->fcdPosition + 1 == data->endp) {
                    /* at the end of the string, dump it into the normalizer */
                    data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
                    // Check if data->pos received a null pointer
                    // NOTE(review): pos was just assigned "<pointer> + 1", so this
                    // comparison looks unable to detect a NULL result - confirm.
                    if (data->pos == NULL) {
                        return (UChar)-1; // Return to indicate error.
                    }
                    return *(data->fcdPosition ++);
                }
                /* Resume reading from the string at fcdPosition. */
                data->pos = data->fcdPosition;
            } else if(data->origFlags & UCOL_USE_ITERATOR) {
                // if we are here, we're using a normalizing iterator.
                // we should just continue further.
                data->flags = data->origFlags;
                data->pos = NULL;
                return (UChar)data->iterator->next(data->iterator);
            }
            //}
        }
        else {
            /* Last character before the NUL terminator: return it as is. */
            if (*(data->pos + 1) == 0) {
                return *(data->pos ++);
            }
        }
    }

    /* ch is the character to return; nextch is the one following it. */
    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
        ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
            data->pos = insertBufferEnd(data, data->pos - 1, length);
            // Check if data->pos received a null pointer
            // NOTE(review): insertBufferEnd returns getTerminatedBuffer() plus the
            // old buffer length, so a NULL buffer may not show up as NULL here - confirm.
            if (data->pos == NULL) {
                return (UChar)-1; // Return to indicate error.
            }
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        data->pos = insertBufferEnd(data, ch) + 1;
        // Check if data->pos received a null pointer
        // NOTE(review): pos was just assigned "<pointer> + 1", so this
        // comparison looks unable to detect a NULL result - confirm.
        if (data->pos == NULL) {
            return (UChar)-1; // Return to indicate error.
        }
    }

    /* points back to the pos in string */
    return ch;
}



/**
* Function to copy the buffer into writableBuffer and sets the fcd position to
* the correct position.
* When already iterating in the normalization buffer, the part of the buffer
* in front of pos is replaced by the given text. Otherwise the current string
* position is remembered in fcdPosition, the flags are saved in origFlags and
* switched to normalization-buffer mode, and the buffer becomes a copy of the
* given text. In both cases pos ends up at the start of the terminated buffer.
* @param source data string source
* @param buffer character buffer
*/
static
inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
{
    /* okay confusing part here. to ensure that the skipped characters are
    considered later, we need to place it in the appropriate position in the
    normalization buffer and reassign the pos pointer. simple case if pos
    reside in string, simply copy to normalization buffer and
    fcdposition = pos, pos = start of normalization buffer. if pos in
    normalization buffer, we'll insert the copy in front of pos and point pos
    to the start of the normalization buffer. why am i doing these copies?
    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
    not require any changes, which would be really painful. */
    if (source->flags & UCOL_ITER_INNORMBUF) {
        /* Number of code units between the buffer start and pos. */
        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
        source->writableBuffer.replace(0, replaceLength, buffer);
    }
    else {
        source->fcdPosition = source->pos;
        source->origFlags = source->flags;
        source->flags |= UCOL_ITER_INNORMBUF;
        source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
        source->writableBuffer = buffer;
    }

    source->pos = source->writableBuffer.getTerminatedBuffer();
}

/**
* Function to get the discontiguos collation element within the source.
* Note this function will set the position to the appropriate places.
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return the CE found for a discontiguous match, otherwise the CE stored in
*         coll->contractionCEs for the contraction table entry at constart
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
    the start character */
    const UChar *temppos = source->pos;
    UnicodeString buffer;
    const UChar *tempconstart = constart;
    uint8_t tempflags = source->flags;
    UBool multicontraction = FALSE;
    collIterateState discState;

    /* Remember the entry state so a failed match can be undone below. */
    backupState(source, &discState);

    /* buffer collects the skipped characters, starting with the code point before pos. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar *UCharOffset;
        UChar schar,
              tchar;
        uint32_t result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0 &&
            //|| (*source->pos == 0 &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
            source->fcdPosition == NULL ||
            source->fcdPosition == source->endp ||
            *(source->fcdPosition) == 0 ||
            u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
            /* end of string in null terminated string or stopped by a
            null character, note fcd does not always point to a base
            character after the discontiguous change */
            u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
            //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A partial multi-contraction was matched earlier: use its CE. */
                source->pos = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table entries while they are smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* Same combining class as the code point two back: treat schar as skipped. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos = source->pos + 1;
            }
        } else {
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/*
 * Builds the CEs for a code point from uprv_uca_getImplicitPrimary(cp)
 * (now uses Mark's getImplicitPrimary code).
 * A second CE made from the low 16 bits of that value (shifted into the upper
 * half, with 0xC0 in the low byte) is stored at CEpos, which is advanced, and
 * offsetRepeatCount is incremented for it.
 * The return value is the UCOL_PRIMARYMASK part of the value or-ed with 0x505.
 */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
* Inserts the argument character into the front of the buffer replacing the
* front null terminator.
* The code unit at index 0 is overwritten with ch and a new NUL is inserted in
* front of it; data->pos is then set two code units past the start of the
* terminated buffer, i.e. just after ch.
* @param data collation element iterator data
* @param ch character to be inserted
*/
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
* Special normalization function for contraction in the previous iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
1.2530 +* @param data collation iterator data 1.2531 +*/ 1.2532 +static 1.2533 +inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 1.2534 +{ 1.2535 + const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 1.2536 + const UChar *pStart; 1.2537 + 1.2538 + UnicodeString endOfBuffer; 1.2539 + if (data->flags & UCOL_ITER_HASLEN) { 1.2540 + /* 1.2541 + normalization buffer not used yet, we'll pull down the next 1.2542 + character into the end of the buffer 1.2543 + */ 1.2544 + endOfBuffer.setTo(*pEnd); 1.2545 + } 1.2546 + else { 1.2547 + endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 1.2548 + } 1.2549 + 1.2550 + if (data->fcdPosition == NULL) { 1.2551 + pStart = data->string; 1.2552 + } 1.2553 + else { 1.2554 + pStart = data->fcdPosition + 1; 1.2555 + } 1.2556 + int32_t normLen = 1.2557 + data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 1.2558 + data->writableBuffer, 1.2559 + *status). 1.2560 + length(); 1.2561 + if(U_FAILURE(*status)) { 1.2562 + return; 1.2563 + } 1.2564 + /* 1.2565 + this puts the null termination infront of the normalized string instead 1.2566 + of the end 1.2567 + */ 1.2568 + data->pos = 1.2569 + data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 1.2570 + 1 + normLen; 1.2571 + data->origFlags = data->flags; 1.2572 + data->flags |= UCOL_ITER_INNORMBUF; 1.2573 + data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1.2574 +} 1.2575 + 1.2576 +/** 1.2577 +* Contraction character management function that returns the previous character 1.2578 +* for the backwards iterator. 1.2579 +* Does nothing if the previous character is in buffer and not the first 1.2580 +* character in it. 1.2581 +* Else it checks previous character in data string to see if it is 1.2582 +* normalizable. 
1.2583 +* If it is not, the character is simply copied into the buffer, else 1.2584 +* the whole normalized substring is copied into the buffer, including the 1.2585 +* current character. 1.2586 +* @param data collation element iterator data 1.2587 +* @return previous character 1.2588 +*/ 1.2589 +static 1.2590 +inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) 1.2591 +{ 1.2592 + UChar prevch; 1.2593 + UChar ch; 1.2594 + const UChar *start; 1.2595 + UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 1.2596 + if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || 1.2597 + (innormbuf && *(data->pos - 1) != 0)) { 1.2598 + /* 1.2599 + if no normalization. 1.2600 + if previous character is in normalized buffer, no further normalization 1.2601 + is required 1.2602 + */ 1.2603 + if(data->flags & UCOL_USE_ITERATOR) { 1.2604 + data->iterator->move(data->iterator, -1, UITER_CURRENT); 1.2605 + return (UChar)data->iterator->next(data->iterator); 1.2606 + } else { 1.2607 + return *(data->pos - 1); 1.2608 + } 1.2609 + } 1.2610 + 1.2611 + start = data->pos; 1.2612 + if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { 1.2613 + /* in data string */ 1.2614 + if ((start - 1) == data->string) { 1.2615 + return *(start - 1); 1.2616 + } 1.2617 + start --; 1.2618 + ch = *start; 1.2619 + prevch = *(start - 1); 1.2620 + } 1.2621 + else { 1.2622 + /* 1.2623 + in writable buffer, at this point fcdPosition can not be NULL. 1.2624 + see contracting tag. 1.2625 + */ 1.2626 + if (data->fcdPosition == data->string) { 1.2627 + /* at the start of the string, just dump it into the normalizer */ 1.2628 + insertBufferFront(data, *(data->fcdPosition)); 1.2629 + data->fcdPosition = NULL; 1.2630 + return *(data->pos - 1); 1.2631 + } 1.2632 + start = data->fcdPosition; 1.2633 + ch = *start; 1.2634 + prevch = *(start - 1); 1.2635 + } 1.2636 + /* 1.2637 + * if the current character is not fcd. 1.2638 + * Trailing combining class == 0. 
1.2639 + */ 1.2640 + if (data->fcdPosition > start && 1.2641 + (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) 1.2642 + { 1.2643 + /* 1.2644 + Need a more complete FCD check and possible normalization. 1.2645 + normalize substring will be appended to buffer 1.2646 + */ 1.2647 + const UChar *backuppos = data->pos; 1.2648 + data->pos = start; 1.2649 + if (collPrevIterFCD(data)) { 1.2650 + normalizePrevContraction(data, status); 1.2651 + return *(data->pos - 1); 1.2652 + } 1.2653 + data->pos = backuppos; 1.2654 + data->fcdPosition ++; 1.2655 + } 1.2656 + 1.2657 + if (innormbuf) { 1.2658 + /* 1.2659 + no normalization is to be done hence only one character will be 1.2660 + appended to the buffer. 1.2661 + */ 1.2662 + insertBufferFront(data, ch); 1.2663 + data->fcdPosition --; 1.2664 + } 1.2665 + 1.2666 + return ch; 1.2667 +} 1.2668 + 1.2669 +/* This function handles the special CEs like contractions, expansions, surrogates, Thai */ 1.2670 +/* It is called by getNextCE */ 1.2671 + 1.2672 +/* The following should be even */ 1.2673 +#define UCOL_MAX_DIGITS_FOR_NUMBER 254 1.2674 + 1.2675 +uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { 1.2676 + collIterateState entryState; 1.2677 + backupState(source, &entryState); 1.2678 + UChar32 cp = ch; 1.2679 + 1.2680 + for (;;) { 1.2681 + // This loop will repeat only in the case of contractions, and only when a contraction 1.2682 + // is found and the first CE resulting from that contraction is itself a special 1.2683 + // (an expansion, for example.) All other special CE types are fully handled the 1.2684 + // first time through, and the loop exits. 1.2685 + 1.2686 + const uint32_t *CEOffset = NULL; 1.2687 + switch(getCETag(CE)) { 1.2688 + case NOT_FOUND_TAG: 1.2689 + /* This one is not found, and we'll let somebody else bother about it... 
no more games */ 1.2690 + return CE; 1.2691 + case SPEC_PROC_TAG: 1.2692 + { 1.2693 + // Special processing is getting a CE that is preceded by a certain prefix 1.2694 + // Currently this is only needed for optimizing Japanese length and iteration marks. 1.2695 + // When we encouter a special processing tag, we go backwards and try to see if 1.2696 + // we have a match. 1.2697 + // Contraction tables are used - so the whole process is not unlike contraction. 1.2698 + // prefix data is stored backwards in the table. 1.2699 + const UChar *UCharOffset; 1.2700 + UChar schar, tchar; 1.2701 + collIterateState prefixState; 1.2702 + backupState(source, &prefixState); 1.2703 + loadState(source, &entryState, TRUE); 1.2704 + goBackOne(source); // We want to look at the point where we entered - actually one 1.2705 + // before that... 1.2706 + 1.2707 + for(;;) { 1.2708 + // This loop will run once per source string character, for as long as we 1.2709 + // are matching a potential contraction sequence 1.2710 + 1.2711 + // First we position ourselves at the begining of contraction sequence 1.2712 + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 1.2713 + if (collIter_bos(source)) { 1.2714 + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.2715 + break; 1.2716 + } 1.2717 + schar = getPrevNormalizedChar(source, status); 1.2718 + goBackOne(source); 1.2719 + 1.2720 + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 1.2721 + UCharOffset++; 1.2722 + } 1.2723 + 1.2724 + if (schar == tchar) { 1.2725 + // Found the source string char in the table. 1.2726 + // Pick up the corresponding CE from the table. 1.2727 + CE = *(coll->contractionCEs + 1.2728 + (UCharOffset - coll->contractionIndex)); 1.2729 + } 1.2730 + else 1.2731 + { 1.2732 + // Source string char was not in the table. 1.2733 + // We have not found the prefix. 
1.2734 + CE = *(coll->contractionCEs + 1.2735 + (ContractionStart - coll->contractionIndex)); 1.2736 + } 1.2737 + 1.2738 + if(!isPrefix(CE)) { 1.2739 + // The source string char was in the contraction table, and the corresponding 1.2740 + // CE is not a prefix CE. We found the prefix, break 1.2741 + // out of loop, this CE will end up being returned. This is the normal 1.2742 + // way out of prefix handling when the source actually contained 1.2743 + // the prefix. 1.2744 + break; 1.2745 + } 1.2746 + } 1.2747 + if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 1.2748 + loadState(source, &prefixState, TRUE); 1.2749 + if(source->origFlags & UCOL_USE_ITERATOR) { 1.2750 + source->flags = source->origFlags; 1.2751 + } 1.2752 + } else { // prefix search was a failure, we have to backup all the way to the start 1.2753 + loadState(source, &entryState, TRUE); 1.2754 + } 1.2755 + break; 1.2756 + } 1.2757 + case CONTRACTION_TAG: 1.2758 + { 1.2759 + /* This should handle contractions */ 1.2760 + collIterateState state; 1.2761 + backupState(source, &state); 1.2762 + uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 1.2763 + const UChar *UCharOffset; 1.2764 + UChar schar, tchar; 1.2765 + 1.2766 + for (;;) { 1.2767 + /* This loop will run once per source string character, for as long as we */ 1.2768 + /* are matching a potential contraction sequence */ 1.2769 + 1.2770 + /* First we position ourselves at the begining of contraction sequence */ 1.2771 + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 1.2772 + 1.2773 + if (collIter_eos(source)) { 1.2774 + // Ran off the end of the source string. 1.2775 + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.2776 + // So we'll pick whatever we have at the point... 
1.2777 + if (CE == UCOL_NOT_FOUND) { 1.2778 + // back up the source over all the chars we scanned going into this contraction. 1.2779 + CE = firstCE; 1.2780 + loadState(source, &state, TRUE); 1.2781 + if(source->origFlags & UCOL_USE_ITERATOR) { 1.2782 + source->flags = source->origFlags; 1.2783 + } 1.2784 + } 1.2785 + break; 1.2786 + } 1.2787 + 1.2788 + uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 1.2789 + uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 1.2790 + 1.2791 + schar = getNextNormalizedChar(source); 1.2792 + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 1.2793 + UCharOffset++; 1.2794 + } 1.2795 + 1.2796 + if (schar == tchar) { 1.2797 + // Found the source string char in the contraction table. 1.2798 + // Pick up the corresponding CE from the table. 1.2799 + CE = *(coll->contractionCEs + 1.2800 + (UCharOffset - coll->contractionIndex)); 1.2801 + } 1.2802 + else 1.2803 + { 1.2804 + // Source string char was not in contraction table. 1.2805 + // Unless we have a discontiguous contraction, we have finished 1.2806 + // with this contraction. 1.2807 + // in order to do the proper detection, we 1.2808 + // need to see if we're dealing with a supplementary 1.2809 + /* We test whether the next two char are surrogate pairs. 1.2810 + * This test is done if the iterator is not NULL. 1.2811 + * If there is no surrogate pair, the iterator 1.2812 + * goes back one if needed. 
*/ 1.2813 + UChar32 miss = schar; 1.2814 + if (source->iterator) { 1.2815 + UChar32 surrNextChar; /* the next char in the iteration to test */ 1.2816 + int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 1.2817 + if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 1.2818 + prevPos = source->iterator->index; 1.2819 + surrNextChar = getNextNormalizedChar(source); 1.2820 + if (U16_IS_TRAIL(surrNextChar)) { 1.2821 + miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 1.2822 + } else if (prevPos < source->iterator->index){ 1.2823 + goBackOne(source); 1.2824 + } 1.2825 + } 1.2826 + } else if (U16_IS_LEAD(schar)) { 1.2827 + miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 1.2828 + } 1.2829 + 1.2830 + uint8_t sCC; 1.2831 + if (miss < 0x300 || 1.2832 + maxCC == 0 || 1.2833 + (sCC = i_getCombiningClass(miss, coll)) == 0 || 1.2834 + sCC>maxCC || 1.2835 + (allSame != 0 && sCC == maxCC) || 1.2836 + collIter_eos(source)) 1.2837 + { 1.2838 + // Contraction can not be discontiguous. 1.2839 + goBackOne(source); // back up the source string by one, 1.2840 + // because the character we just looked at was 1.2841 + // not part of the contraction. */ 1.2842 + if(U_IS_SUPPLEMENTARY(miss)) { 1.2843 + goBackOne(source); 1.2844 + } 1.2845 + CE = *(coll->contractionCEs + 1.2846 + (ContractionStart - coll->contractionIndex)); 1.2847 + } else { 1.2848 + // 1.2849 + // Contraction is possibly discontiguous. 
1.2850 + // Scan more of source string looking for a match 1.2851 + // 1.2852 + UChar tempchar; 1.2853 + /* find the next character if schar is not a base character 1.2854 + and we are not yet at the end of the string */ 1.2855 + tempchar = getNextNormalizedChar(source); 1.2856 + // probably need another supplementary thingie here 1.2857 + goBackOne(source); 1.2858 + if (i_getCombiningClass(tempchar, coll) == 0) { 1.2859 + goBackOne(source); 1.2860 + if(U_IS_SUPPLEMENTARY(miss)) { 1.2861 + goBackOne(source); 1.2862 + } 1.2863 + /* Spit out the last char of the string, wasn't tasty enough */ 1.2864 + CE = *(coll->contractionCEs + 1.2865 + (ContractionStart - coll->contractionIndex)); 1.2866 + } else { 1.2867 + CE = getDiscontiguous(coll, source, ContractionStart); 1.2868 + } 1.2869 + } 1.2870 + } // else after if(schar == tchar) 1.2871 + 1.2872 + if(CE == UCOL_NOT_FOUND) { 1.2873 + /* The Source string did not match the contraction that we were checking. */ 1.2874 + /* Back up the source position to undo the effects of having partially */ 1.2875 + /* scanned through what ultimately proved to not be a contraction. */ 1.2876 + loadState(source, &state, TRUE); 1.2877 + CE = firstCE; 1.2878 + break; 1.2879 + } 1.2880 + 1.2881 + if(!isContraction(CE)) { 1.2882 + // The source string char was in the contraction table, and the corresponding 1.2883 + // CE is not a contraction CE. We completed the contraction, break 1.2884 + // out of loop, this CE will end up being returned. This is the normal 1.2885 + // way out of contraction handling when the source actually contained 1.2886 + // the contraction. 1.2887 + break; 1.2888 + } 1.2889 + 1.2890 + 1.2891 + // The source string char was in the contraction table, and the corresponding 1.2892 + // CE is IS a contraction CE. We will continue looping to check the source 1.2893 + // string for the remaining chars in the contraction. 
1.2894 + uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 1.2895 + if(tempCE != UCOL_NOT_FOUND) { 1.2896 + // We have scanned a a section of source string for which there is a 1.2897 + // CE from the contraction table. Remember the CE and scan position, so 1.2898 + // that we can return to this point if further scanning fails to 1.2899 + // match a longer contraction sequence. 1.2900 + firstCE = tempCE; 1.2901 + 1.2902 + goBackOne(source); 1.2903 + backupState(source, &state); 1.2904 + getNextNormalizedChar(source); 1.2905 + 1.2906 + // Another way to do this is: 1.2907 + //collIterateState tempState; 1.2908 + //backupState(source, &tempState); 1.2909 + //goBackOne(source); 1.2910 + //backupState(source, &state); 1.2911 + //loadState(source, &tempState, TRUE); 1.2912 + 1.2913 + // The problem is that for incomplete contractions we have to remember the previous 1.2914 + // position. Before, the only thing I needed to do was state.pos--; 1.2915 + // After iterator introduction and especially after introduction of normalizing 1.2916 + // iterators, it became much more difficult to decrease the saved state. 1.2917 + // I'm not yet sure which of the two methods above is faster. 1.2918 + } 1.2919 + } // for(;;) 1.2920 + break; 1.2921 + } // case CONTRACTION_TAG: 1.2922 + case LONG_PRIMARY_TAG: 1.2923 + { 1.2924 + *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 1.2925 + CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 1.2926 + source->offsetRepeatCount += 1; 1.2927 + return CE; 1.2928 + } 1.2929 + case EXPANSION_TAG: 1.2930 + { 1.2931 + /* This should handle expansion. */ 1.2932 + /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 1.2933 + /* I have to decide where continuations are going to be dealt with */ 1.2934 + uint32_t size; 1.2935 + uint32_t i; /* general counter */ 1.2936 + 1.2937 + CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 1.2938 + size = getExpansionCount(CE); 1.2939 + CE = *CEOffset++; 1.2940 + //source->offsetRepeatCount = -1; 1.2941 + 1.2942 + if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 1.2943 + for(i = 1; i<size; i++) { 1.2944 + *(source->CEpos++) = *CEOffset++; 1.2945 + source->offsetRepeatCount += 1; 1.2946 + } 1.2947 + } else { /* else, we do */ 1.2948 + while(*CEOffset != 0) { 1.2949 + *(source->CEpos++) = *CEOffset++; 1.2950 + source->offsetRepeatCount += 1; 1.2951 + } 1.2952 + } 1.2953 + 1.2954 + return CE; 1.2955 + } 1.2956 + case DIGIT_TAG: 1.2957 + { 1.2958 + /* 1.2959 + We do a check to see if we want to collate digits as numbers; if so we generate 1.2960 + a custom collation key. Otherwise we pull out the value stored in the expansion table. 1.2961 + */ 1.2962 + //uint32_t size; 1.2963 + uint32_t i; /* general counter */ 1.2964 + 1.2965 + if (source->coll->numericCollation == UCOL_ON){ 1.2966 + collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 1.2967 + UChar32 char32 = 0; 1.2968 + int32_t digVal = 0; 1.2969 + 1.2970 + uint32_t digIndx = 0; 1.2971 + uint32_t endIndex = 0; 1.2972 + uint32_t trailingZeroIndex = 0; 1.2973 + 1.2974 + uint8_t collateVal = 0; 1.2975 + 1.2976 + UBool nonZeroValReached = FALSE; 1.2977 + 1.2978 + uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 1.2979 + /* 1.2980 + We parse the source string until we hit a char that's NOT a digit. 1.2981 + Use this u_charDigitValue. This might be slow because we have to 1.2982 + handle surrogates... 
1.2983 + */ 1.2984 + /* 1.2985 + if (U16_IS_LEAD(ch)){ 1.2986 + if (!collIter_eos(source)) { 1.2987 + backupState(source, &digitState); 1.2988 + UChar trail = getNextNormalizedChar(source); 1.2989 + if(U16_IS_TRAIL(trail)) { 1.2990 + char32 = U16_GET_SUPPLEMENTARY(ch, trail); 1.2991 + } else { 1.2992 + loadState(source, &digitState, TRUE); 1.2993 + char32 = ch; 1.2994 + } 1.2995 + } else { 1.2996 + char32 = ch; 1.2997 + } 1.2998 + } else { 1.2999 + char32 = ch; 1.3000 + } 1.3001 + digVal = u_charDigitValue(char32); 1.3002 + */ 1.3003 + digVal = u_charDigitValue(cp); // if we have arrived here, we have 1.3004 + // already processed possible supplementaries that trigered the digit tag - 1.3005 + // all supplementaries are marked in the UCA. 1.3006 + /* 1.3007 + We pad a zero in front of the first element anyways. This takes 1.3008 + care of the (probably) most common case where people are sorting things followed 1.3009 + by a single digit 1.3010 + */ 1.3011 + digIndx++; 1.3012 + for(;;){ 1.3013 + // Make sure we have enough space. No longer needed; 1.3014 + // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 1.3015 + // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 1.3016 + // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 1.3017 + 1.3018 + // Skipping over leading zeroes. 1.3019 + if (digVal != 0) { 1.3020 + nonZeroValReached = TRUE; 1.3021 + } 1.3022 + if (nonZeroValReached) { 1.3023 + /* 1.3024 + We parse the digit string into base 100 numbers (this fits into a byte). 1.3025 + We only add to the buffer in twos, thus if we are parsing an odd character, 1.3026 + that serves as the 'tens' digit while the if we are parsing an even one, that 1.3027 + is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 1.3028 + a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 1.3029 + overlapping magic CE byte values). 
The last byte we subtract 1 to ensure it is less 1.3030 + than all the other bytes. 1.3031 + */ 1.3032 + 1.3033 + if (digIndx % 2 == 1){ 1.3034 + collateVal += (uint8_t)digVal; 1.3035 + 1.3036 + // We don't enter the low-order-digit case unless we've already seen 1.3037 + // the high order, or for the first digit, which is always non-zero. 1.3038 + if (collateVal != 0) 1.3039 + trailingZeroIndex = 0; 1.3040 + 1.3041 + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 1.3042 + collateVal = 0; 1.3043 + } 1.3044 + else{ 1.3045 + // We drop the collation value into the buffer so if we need to do 1.3046 + // a "front patch" we don't have to check to see if we're hitting the 1.3047 + // last element. 1.3048 + collateVal = (uint8_t)(digVal * 10); 1.3049 + 1.3050 + // Check for trailing zeroes. 1.3051 + if (collateVal == 0) 1.3052 + { 1.3053 + if (!trailingZeroIndex) 1.3054 + trailingZeroIndex = (digIndx/2) + 2; 1.3055 + } 1.3056 + else 1.3057 + trailingZeroIndex = 0; 1.3058 + 1.3059 + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 1.3060 + } 1.3061 + digIndx++; 1.3062 + } 1.3063 + 1.3064 + // Get next character. 1.3065 + if (!collIter_eos(source)){ 1.3066 + ch = getNextNormalizedChar(source); 1.3067 + if (U16_IS_LEAD(ch)){ 1.3068 + if (!collIter_eos(source)) { 1.3069 + backupState(source, &digitState); 1.3070 + UChar trail = getNextNormalizedChar(source); 1.3071 + if(U16_IS_TRAIL(trail)) { 1.3072 + char32 = U16_GET_SUPPLEMENTARY(ch, trail); 1.3073 + } else { 1.3074 + loadState(source, &digitState, TRUE); 1.3075 + char32 = ch; 1.3076 + } 1.3077 + } 1.3078 + } else { 1.3079 + char32 = ch; 1.3080 + } 1.3081 + 1.3082 + if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 1.3083 + // Resetting position to point to the next unprocessed char. We 1.3084 + // overshot it when doing our test/set for numbers. 1.3085 + if (char32 > 0xFFFF) { // For surrogates. 
1.3086 + loadState(source, &digitState, TRUE); 1.3087 + //goBackOne(source); 1.3088 + } 1.3089 + goBackOne(source); 1.3090 + break; 1.3091 + } 1.3092 + } else { 1.3093 + break; 1.3094 + } 1.3095 + } 1.3096 + 1.3097 + if (nonZeroValReached == FALSE){ 1.3098 + digIndx = 2; 1.3099 + numTempBuf[2] = 6; 1.3100 + } 1.3101 + 1.3102 + endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 1.3103 + if (digIndx % 2 != 0){ 1.3104 + /* 1.3105 + We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what 1.3106 + we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 1.3107 + Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 1.3108 + single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 1.3109 + */ 1.3110 + 1.3111 + for(i = 2; i < endIndex; i++){ 1.3112 + numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 1.3113 + (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 1.3114 + } 1.3115 + --digIndx; 1.3116 + } 1.3117 + 1.3118 + // Subtract one off of the last byte. 1.3119 + numTempBuf[endIndex-1] -= 1; 1.3120 + 1.3121 + /* 1.3122 + We want to skip over the first two slots in the buffer. The first slot 1.3123 + is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 1.3124 + sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 1.3125 + */ 1.3126 + numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 1.3127 + numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 1.3128 + 1.3129 + // Now transfer the collation key to our collIterate struct. 1.3130 + // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 
1.3131 + //size = ((endIndex+1) & ~1)/2; 1.3132 + CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 1.3133 + (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 1.3134 + UCOL_BYTE_COMMON; // Tertiary weight. 1.3135 + i = 2; // Reset the index into the buffer. 1.3136 + while(i < endIndex) 1.3137 + { 1.3138 + uint32_t primWeight = numTempBuf[i++] << 8; 1.3139 + if ( i < endIndex) 1.3140 + primWeight |= numTempBuf[i++]; 1.3141 + *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 1.3142 + } 1.3143 + 1.3144 + } else { 1.3145 + // no numeric mode, we'll just switch to whatever we stashed and continue 1.3146 + CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 1.3147 + CE = *CEOffset++; 1.3148 + break; 1.3149 + } 1.3150 + return CE; 1.3151 + } 1.3152 + /* various implicits optimization */ 1.3153 + case IMPLICIT_TAG: /* everything that is not defined otherwise */ 1.3154 + /* UCA is filled with these. 
Tailorings are NOT_FOUND */ 1.3155 + return getImplicit(cp, source); 1.3156 + case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 1.3157 + // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 1.3158 + return getImplicit(cp, source); 1.3159 + case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 1.3160 + { 1.3161 + static const uint32_t 1.3162 + SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 1.3163 + //const uint32_t LCount = 19; 1.3164 + static const uint32_t VCount = 21; 1.3165 + static const uint32_t TCount = 28; 1.3166 + //const uint32_t NCount = VCount * TCount; // 588 1.3167 + //const uint32_t SCount = LCount * NCount; // 11172 1.3168 + uint32_t L = ch - SBase; 1.3169 + 1.3170 + // divide into pieces 1.3171 + 1.3172 + uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 1.3173 + L /= TCount; 1.3174 + uint32_t V = L % VCount; 1.3175 + L /= VCount; 1.3176 + 1.3177 + // offset them 1.3178 + 1.3179 + L += LBase; 1.3180 + V += VBase; 1.3181 + T += TBase; 1.3182 + 1.3183 + // return the first CE, but first put the rest into the expansion buffer 1.3184 + if (!source->coll->image->jamoSpecial) { // FAST PATH 1.3185 + 1.3186 + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 1.3187 + if (T != TBase) { 1.3188 + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 1.3189 + } 1.3190 + 1.3191 + return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 1.3192 + 1.3193 + } else { // Jamo is Special 1.3194 + // Since Hanguls pass the FCD check, it is 1.3195 + // guaranteed that we won't be in 1.3196 + // the normalization buffer if something like this happens 1.3197 + 1.3198 + // However, if we are using a uchar iterator and normalization 1.3199 + // is ON, the Hangul that lead us here is going to be in that 1.3200 + // normalization buffer. 
Here we want to restore the uchar 1.3201 + // iterator state and pull out of the normalization buffer 1.3202 + if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 1.3203 + source->flags = source->origFlags; // restore the iterator 1.3204 + source->pos = NULL; 1.3205 + } 1.3206 + 1.3207 + // Move Jamos into normalization buffer 1.3208 + UChar *buffer = source->writableBuffer.getBuffer(4); 1.3209 + int32_t bufferLength; 1.3210 + buffer[0] = (UChar)L; 1.3211 + buffer[1] = (UChar)V; 1.3212 + if (T != TBase) { 1.3213 + buffer[2] = (UChar)T; 1.3214 + bufferLength = 3; 1.3215 + } else { 1.3216 + bufferLength = 2; 1.3217 + } 1.3218 + source->writableBuffer.releaseBuffer(bufferLength); 1.3219 + 1.3220 + // Indicate where to continue in main input string after exhausting the writableBuffer 1.3221 + source->fcdPosition = source->pos; 1.3222 + 1.3223 + source->pos = source->writableBuffer.getTerminatedBuffer(); 1.3224 + source->origFlags = source->flags; 1.3225 + source->flags |= UCOL_ITER_INNORMBUF; 1.3226 + source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1.3227 + 1.3228 + return(UCOL_IGNORABLE); 1.3229 + } 1.3230 + } 1.3231 + case SURROGATE_TAG: 1.3232 + /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 1.3233 + /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 1.3234 + /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 1.3235 + /* we treat it like an unassigned code point. */ 1.3236 + { 1.3237 + UChar trail; 1.3238 + collIterateState state; 1.3239 + backupState(source, &state); 1.3240 + if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 1.3241 + // we chould have stepped one char forward and it might have turned that it 1.3242 + // was not a trail surrogate. In that case, we have to backup. 
1.3243 + loadState(source, &state, TRUE); 1.3244 + return UCOL_NOT_FOUND; 1.3245 + } else { 1.3246 + /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 1.3247 + CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 1.3248 + if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 1.3249 + // We need to backup 1.3250 + loadState(source, &state, TRUE); 1.3251 + return CE; 1.3252 + } 1.3253 + // calculate the supplementary code point value, if surrogate was not tailored 1.3254 + cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 1.3255 + } 1.3256 + } 1.3257 + break; 1.3258 + case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 1.3259 + UChar nextChar; 1.3260 + if( source->flags & UCOL_USE_ITERATOR) { 1.3261 + if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 1.3262 + cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 1.3263 + source->iterator->next(source->iterator); 1.3264 + return getImplicit(cp, source); 1.3265 + } 1.3266 + } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 1.3267 + U_IS_TRAIL((nextChar=*source->pos))) { 1.3268 + cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 1.3269 + source->pos++; 1.3270 + return getImplicit(cp, source); 1.3271 + } 1.3272 + return UCOL_NOT_FOUND; 1.3273 + case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 1.3274 + return UCOL_NOT_FOUND; /* broken surrogate sequence */ 1.3275 + case CHARSET_TAG: 1.3276 + /* not yet implemented */ 1.3277 + /* probably after 1.8 */ 1.3278 + return UCOL_NOT_FOUND; 1.3279 + default: 1.3280 + *status = U_INTERNAL_PROGRAM_ERROR; 1.3281 + CE=0; 1.3282 + break; 1.3283 + } 1.3284 + if (CE <= UCOL_NOT_FOUND) break; 1.3285 + } 1.3286 + return CE; 1.3287 +} 1.3288 + 1.3289 + 1.3290 +/* now uses Mark's getImplicitPrimary code */ 1.3291 +static 1.3292 +inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 1.3293 + uint32_t r = 
uprv_uca_getImplicitPrimary(cp); 1.3294 + 1.3295 + *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 1.3296 + collationSource->toReturn = collationSource->CEpos; 1.3297 + 1.3298 + // **** doesn't work if using iterator **** 1.3299 + if (collationSource->flags & UCOL_ITER_INNORMBUF) { 1.3300 + collationSource->offsetRepeatCount = 1; 1.3301 + } else { 1.3302 + int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 1.3303 + 1.3304 + UErrorCode errorCode = U_ZERO_ERROR; 1.3305 + collationSource->appendOffset(firstOffset, errorCode); 1.3306 + collationSource->appendOffset(firstOffset + 1, errorCode); 1.3307 + 1.3308 + collationSource->offsetReturn = collationSource->offsetStore - 1; 1.3309 + *(collationSource->offsetBuffer) = firstOffset; 1.3310 + if (collationSource->offsetReturn == collationSource->offsetBuffer) { 1.3311 + collationSource->offsetStore = collationSource->offsetBuffer; 1.3312 + } 1.3313 + } 1.3314 + 1.3315 + return ((r & 0x0000FFFF)<<16) | 0x000000C0; 1.3316 +} 1.3317 + 1.3318 +/** 1.3319 + * This function handles the special CEs like contractions, expansions, 1.3320 + * surrogates, Thai. 
1.3321 + * It is called by both getPrevCE 1.3322 + */ 1.3323 +uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 1.3324 + collIterate *source, 1.3325 + UErrorCode *status) 1.3326 +{ 1.3327 + const uint32_t *CEOffset = NULL; 1.3328 + UChar *UCharOffset = NULL; 1.3329 + UChar schar; 1.3330 + const UChar *constart = NULL; 1.3331 + uint32_t size; 1.3332 + UChar buffer[UCOL_MAX_BUFFER]; 1.3333 + uint32_t *endCEBuffer; 1.3334 + UChar *strbuffer; 1.3335 + int32_t noChars = 0; 1.3336 + int32_t CECount = 0; 1.3337 + 1.3338 + for(;;) 1.3339 + { 1.3340 + /* the only ces that loops are thai and contractions */ 1.3341 + switch (getCETag(CE)) 1.3342 + { 1.3343 + case NOT_FOUND_TAG: /* this tag always returns */ 1.3344 + return CE; 1.3345 + 1.3346 + case SPEC_PROC_TAG: 1.3347 + { 1.3348 + // Special processing is getting a CE that is preceded by a certain prefix 1.3349 + // Currently this is only needed for optimizing Japanese length and iteration marks. 1.3350 + // When we encouter a special processing tag, we go backwards and try to see if 1.3351 + // we have a match. 1.3352 + // Contraction tables are used - so the whole process is not unlike contraction. 1.3353 + // prefix data is stored backwards in the table. 
1.3354 + const UChar *UCharOffset; 1.3355 + UChar schar, tchar; 1.3356 + collIterateState prefixState; 1.3357 + backupState(source, &prefixState); 1.3358 + for(;;) { 1.3359 + // This loop will run once per source string character, for as long as we 1.3360 + // are matching a potential contraction sequence 1.3361 + 1.3362 + // First we position ourselves at the begining of contraction sequence 1.3363 + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 1.3364 + 1.3365 + if (collIter_bos(source)) { 1.3366 + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.3367 + break; 1.3368 + } 1.3369 + schar = getPrevNormalizedChar(source, status); 1.3370 + goBackOne(source); 1.3371 + 1.3372 + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 1.3373 + UCharOffset++; 1.3374 + } 1.3375 + 1.3376 + if (schar == tchar) { 1.3377 + // Found the source string char in the table. 1.3378 + // Pick up the corresponding CE from the table. 1.3379 + CE = *(coll->contractionCEs + 1.3380 + (UCharOffset - coll->contractionIndex)); 1.3381 + } 1.3382 + else 1.3383 + { 1.3384 + // if there is a completely ignorable code point in the middle of 1.3385 + // a prefix, we need to act as if it's not there 1.3386 + // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 1.3387 + // lone surrogates cannot be set to zero as it would break other processing 1.3388 + uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 1.3389 + // it's easy for BMP code points 1.3390 + if(isZeroCE == 0) { 1.3391 + continue; 1.3392 + } else if(U16_IS_SURROGATE(schar)) { 1.3393 + // for supplementary code points, we have to check the next one 1.3394 + // situations where we are going to ignore 1.3395 + // 1. beginning of the string: schar is a lone surrogate 1.3396 + // 2. schar is a lone surrogate 1.3397 + // 3. 
schar is a trail surrogate in a valid surrogate sequence 1.3398 + // that is explicitly set to zero. 1.3399 + if (!collIter_bos(source)) { 1.3400 + UChar lead; 1.3401 + if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 1.3402 + isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 1.3403 + if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 1.3404 + uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 1.3405 + if(finalCE == 0) { 1.3406 + // this is a real, assigned completely ignorable code point 1.3407 + goBackOne(source); 1.3408 + continue; 1.3409 + } 1.3410 + } 1.3411 + } else { 1.3412 + // lone surrogate, treat like unassigned 1.3413 + return UCOL_NOT_FOUND; 1.3414 + } 1.3415 + } else { 1.3416 + // lone surrogate at the beggining, treat like unassigned 1.3417 + return UCOL_NOT_FOUND; 1.3418 + } 1.3419 + } 1.3420 + // Source string char was not in the table. 1.3421 + // We have not found the prefix. 1.3422 + CE = *(coll->contractionCEs + 1.3423 + (ContractionStart - coll->contractionIndex)); 1.3424 + } 1.3425 + 1.3426 + if(!isPrefix(CE)) { 1.3427 + // The source string char was in the contraction table, and the corresponding 1.3428 + // CE is not a prefix CE. We found the prefix, break 1.3429 + // out of loop, this CE will end up being returned. This is the normal 1.3430 + // way out of prefix handling when the source actually contained 1.3431 + // the prefix. 1.3432 + break; 1.3433 + } 1.3434 + } 1.3435 + loadState(source, &prefixState, TRUE); 1.3436 + break; 1.3437 + } 1.3438 + 1.3439 + case CONTRACTION_TAG: { 1.3440 + /* to ensure that the backwards and forwards iteration matches, we 1.3441 + take the current region of most possible match and pass it through 1.3442 + the forward iteration. this will ensure that the obstinate problem of 1.3443 + overlapping contractions will not occur. 
1.3444 + */ 1.3445 + schar = peekCodeUnit(source, 0); 1.3446 + constart = (UChar *)coll->image + getContractOffset(CE); 1.3447 + if (isAtStartPrevIterate(source) 1.3448 + /* commented away contraction end checks after adding the checks 1.3449 + in getPrevCE */) { 1.3450 + /* start of string or this is not the end of any contraction */ 1.3451 + CE = *(coll->contractionCEs + 1.3452 + (constart - coll->contractionIndex)); 1.3453 + break; 1.3454 + } 1.3455 + strbuffer = buffer; 1.3456 + UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 1.3457 + *(UCharOffset --) = 0; 1.3458 + noChars = 0; 1.3459 + // have to swap thai characters 1.3460 + while (ucol_unsafeCP(schar, coll)) { 1.3461 + *(UCharOffset) = schar; 1.3462 + noChars++; 1.3463 + UCharOffset --; 1.3464 + schar = getPrevNormalizedChar(source, status); 1.3465 + goBackOne(source); 1.3466 + // TODO: when we exhaust the contraction buffer, 1.3467 + // it needs to get reallocated. The problem is 1.3468 + // that the size depends on the string which is 1.3469 + // not iterated over. However, since we're travelling 1.3470 + // backwards, we already had to set the iterator at 1.3471 + // the end - so we might as well know where we are? 
1.3472 + if (UCharOffset + 1 == buffer) { 1.3473 + /* we have exhausted the buffer */ 1.3474 + int32_t newsize = 0; 1.3475 + if(source->pos) { // actually dealing with a position 1.3476 + newsize = (int32_t)(source->pos - source->string + 1); 1.3477 + } else { // iterator 1.3478 + newsize = 4 * UCOL_MAX_BUFFER; 1.3479 + } 1.3480 + strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 1.3481 + (newsize + UCOL_MAX_BUFFER)); 1.3482 + /* test for NULL */ 1.3483 + if (strbuffer == NULL) { 1.3484 + *status = U_MEMORY_ALLOCATION_ERROR; 1.3485 + return UCOL_NO_MORE_CES; 1.3486 + } 1.3487 + UCharOffset = strbuffer + newsize; 1.3488 + uprv_memcpy(UCharOffset, buffer, 1.3489 + UCOL_MAX_BUFFER * sizeof(UChar)); 1.3490 + UCharOffset --; 1.3491 + } 1.3492 + if ((source->pos && (source->pos == source->string || 1.3493 + ((source->flags & UCOL_ITER_INNORMBUF) && 1.3494 + *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 1.3495 + || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 1.3496 + break; 1.3497 + } 1.3498 + } 1.3499 + /* adds the initial base character to the string */ 1.3500 + *(UCharOffset) = schar; 1.3501 + noChars++; 1.3502 + 1.3503 + int32_t offsetBias; 1.3504 + 1.3505 + // **** doesn't work if using iterator **** 1.3506 + if (source->flags & UCOL_ITER_INNORMBUF) { 1.3507 + offsetBias = -1; 1.3508 + } else { 1.3509 + offsetBias = (int32_t)(source->pos - source->string); 1.3510 + } 1.3511 + 1.3512 + /* a new collIterate is used to simplify things, since using the current 1.3513 + collIterate will mean that the forward and backwards iteration will 1.3514 + share and change the same buffers. we don't want to get into that. 
*/ 1.3515 + collIterate temp; 1.3516 + int32_t rawOffset; 1.3517 + 1.3518 + IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 1.3519 + if(U_FAILURE(*status)) { 1.3520 + return (uint32_t)UCOL_NULLORDER; 1.3521 + } 1.3522 + temp.flags &= ~UCOL_ITER_NORM; 1.3523 + temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 1.3524 + 1.3525 + rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 1.3526 + CE = ucol_IGetNextCE(coll, &temp, status); 1.3527 + 1.3528 + if (source->extendCEs) { 1.3529 + endCEBuffer = source->extendCEs + source->extendCEsSize; 1.3530 + CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 1.3531 + } else { 1.3532 + endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 1.3533 + CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 1.3534 + } 1.3535 + 1.3536 + while (CE != UCOL_NO_MORE_CES) { 1.3537 + *(source->CEpos ++) = CE; 1.3538 + 1.3539 + if (offsetBias >= 0) { 1.3540 + source->appendOffset(rawOffset + offsetBias, *status); 1.3541 + } 1.3542 + 1.3543 + CECount++; 1.3544 + if (source->CEpos == endCEBuffer) { 1.3545 + /* ran out of CE space, reallocate to new buffer. 
1.3546 + If reallocation fails, reset pointers and bail out, 1.3547 + there's no guarantee of the right character position after 1.3548 + this bail*/ 1.3549 + if (!increaseCEsCapacity(source)) { 1.3550 + *status = U_MEMORY_ALLOCATION_ERROR; 1.3551 + break; 1.3552 + } 1.3553 + 1.3554 + endCEBuffer = source->extendCEs + source->extendCEsSize; 1.3555 + } 1.3556 + 1.3557 + if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 1.3558 + rawOffset = (int32_t)(temp.fcdPosition - temp.string); 1.3559 + } else { 1.3560 + rawOffset = (int32_t)(temp.pos - temp.string); 1.3561 + } 1.3562 + 1.3563 + CE = ucol_IGetNextCE(coll, &temp, status); 1.3564 + } 1.3565 + 1.3566 + if (strbuffer != buffer) { 1.3567 + uprv_free(strbuffer); 1.3568 + } 1.3569 + if (U_FAILURE(*status)) { 1.3570 + return (uint32_t)UCOL_NULLORDER; 1.3571 + } 1.3572 + 1.3573 + if (source->offsetRepeatValue != 0) { 1.3574 + if (CECount > noChars) { 1.3575 + source->offsetRepeatCount += temp.offsetRepeatCount; 1.3576 + } else { 1.3577 + // **** does this really skip the right offsets? 
**** 1.3578 + source->offsetReturn -= (noChars - CECount); 1.3579 + } 1.3580 + } 1.3581 + 1.3582 + if (offsetBias >= 0) { 1.3583 + source->offsetReturn = source->offsetStore - 1; 1.3584 + if (source->offsetReturn == source->offsetBuffer) { 1.3585 + source->offsetStore = source->offsetBuffer; 1.3586 + } 1.3587 + } 1.3588 + 1.3589 + source->toReturn = source->CEpos - 1; 1.3590 + if (source->toReturn == source->CEs) { 1.3591 + source->CEpos = source->CEs; 1.3592 + } 1.3593 + 1.3594 + return *(source->toReturn); 1.3595 + } 1.3596 + case LONG_PRIMARY_TAG: 1.3597 + { 1.3598 + *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 1.3599 + *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 1.3600 + source->toReturn = source->CEpos - 1; 1.3601 + 1.3602 + if (source->flags & UCOL_ITER_INNORMBUF) { 1.3603 + source->offsetRepeatCount = 1; 1.3604 + } else { 1.3605 + int32_t firstOffset = (int32_t)(source->pos - source->string); 1.3606 + 1.3607 + source->appendOffset(firstOffset, *status); 1.3608 + source->appendOffset(firstOffset + 1, *status); 1.3609 + 1.3610 + source->offsetReturn = source->offsetStore - 1; 1.3611 + *(source->offsetBuffer) = firstOffset; 1.3612 + if (source->offsetReturn == source->offsetBuffer) { 1.3613 + source->offsetStore = source->offsetBuffer; 1.3614 + } 1.3615 + } 1.3616 + 1.3617 + 1.3618 + return *(source->toReturn); 1.3619 + } 1.3620 + 1.3621 + case EXPANSION_TAG: /* this tag always returns */ 1.3622 + { 1.3623 + /* 1.3624 + This should handle expansion. 1.3625 + NOTE: we can encounter both continuations and expansions in an expansion! 1.3626 + I have to decide where continuations are going to be dealt with 1.3627 + */ 1.3628 + int32_t firstOffset = (int32_t)(source->pos - source->string); 1.3629 + 1.3630 + // **** doesn't work if using iterator **** 1.3631 + if (source->offsetReturn != NULL) { 1.3632 + if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 1.3633 + source->offsetStore = source->offsetBuffer; 1.3634 + }else { 1.3635 + firstOffset = -1; 1.3636 + } 1.3637 + } 1.3638 + 1.3639 + /* find the offset to expansion table */ 1.3640 + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 1.3641 + size = getExpansionCount(CE); 1.3642 + if (size != 0) { 1.3643 + /* 1.3644 + if there are less than 16 elements in expansion, we don't terminate 1.3645 + */ 1.3646 + uint32_t count; 1.3647 + 1.3648 + for (count = 0; count < size; count++) { 1.3649 + *(source->CEpos ++) = *CEOffset++; 1.3650 + 1.3651 + if (firstOffset >= 0) { 1.3652 + source->appendOffset(firstOffset + 1, *status); 1.3653 + } 1.3654 + } 1.3655 + } else { 1.3656 + /* else, we do */ 1.3657 + while (*CEOffset != 0) { 1.3658 + *(source->CEpos ++) = *CEOffset ++; 1.3659 + 1.3660 + if (firstOffset >= 0) { 1.3661 + source->appendOffset(firstOffset + 1, *status); 1.3662 + } 1.3663 + } 1.3664 + } 1.3665 + 1.3666 + if (firstOffset >= 0) { 1.3667 + source->offsetReturn = source->offsetStore - 1; 1.3668 + *(source->offsetBuffer) = firstOffset; 1.3669 + if (source->offsetReturn == source->offsetBuffer) { 1.3670 + source->offsetStore = source->offsetBuffer; 1.3671 + } 1.3672 + } else { 1.3673 + source->offsetRepeatCount += size - 1; 1.3674 + } 1.3675 + 1.3676 + source->toReturn = source->CEpos - 1; 1.3677 + // in case of one element expansion, we 1.3678 + // want to immediately return CEpos 1.3679 + if(source->toReturn == source->CEs) { 1.3680 + source->CEpos = source->CEs; 1.3681 + } 1.3682 + 1.3683 + return *(source->toReturn); 1.3684 + } 1.3685 + 1.3686 + case DIGIT_TAG: 1.3687 + { 1.3688 + /* 1.3689 + We do a check to see if we want to collate digits as numbers; if so we generate 1.3690 + a custom collation key. Otherwise we pull out the value stored in the expansion table. 
1.3691 + */ 1.3692 + uint32_t i; /* general counter */ 1.3693 + 1.3694 + if (source->coll->numericCollation == UCOL_ON){ 1.3695 + uint32_t digIndx = 0; 1.3696 + uint32_t endIndex = 0; 1.3697 + uint32_t leadingZeroIndex = 0; 1.3698 + uint32_t trailingZeroCount = 0; 1.3699 + 1.3700 + uint8_t collateVal = 0; 1.3701 + 1.3702 + UBool nonZeroValReached = FALSE; 1.3703 + 1.3704 + uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 1.3705 + /* 1.3706 + We parse the source string until we hit a char that's NOT a digit. 1.3707 + Use this u_charDigitValue. This might be slow because we have to 1.3708 + handle surrogates... 1.3709 + */ 1.3710 + /* 1.3711 + We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 1.3712 + with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 1.3713 + element we process when going backward. To determine how long that chunk might be, we may need to make 1.3714 + two passes through the loop that collects digits - one to see how long the string is (and how much is 1.3715 + leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 1.3716 + more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 1.3717 + element chunk after resetting the state to the initialState at the right side of the digit string. 
1.3718 + */ 1.3719 + uint32_t ceLimit = 0; 1.3720 + UChar initial_ch = ch; 1.3721 + collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 1.3722 + backupState(source, &initialState); 1.3723 + 1.3724 + for(;;) { 1.3725 + collIterateState state = {0,0,0,0,0,0,0,0,0}; 1.3726 + UChar32 char32 = 0; 1.3727 + int32_t digVal = 0; 1.3728 + 1.3729 + if (U16_IS_TRAIL (ch)) { 1.3730 + if (!collIter_bos(source)){ 1.3731 + UChar lead = getPrevNormalizedChar(source, status); 1.3732 + if(U16_IS_LEAD(lead)) { 1.3733 + char32 = U16_GET_SUPPLEMENTARY(lead,ch); 1.3734 + goBackOne(source); 1.3735 + } else { 1.3736 + char32 = ch; 1.3737 + } 1.3738 + } else { 1.3739 + char32 = ch; 1.3740 + } 1.3741 + } else { 1.3742 + char32 = ch; 1.3743 + } 1.3744 + digVal = u_charDigitValue(char32); 1.3745 + 1.3746 + for(;;) { 1.3747 + // Make sure we have enough space. No longer needed; 1.3748 + // at this point the largest value of digIndx when we need to save data in numTempBuf 1.3749 + // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 1.3750 + // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 1.3751 + 1.3752 + // Skip over trailing zeroes, and keep a count of them. 1.3753 + if (digVal != 0) 1.3754 + nonZeroValReached = TRUE; 1.3755 + 1.3756 + if (nonZeroValReached) { 1.3757 + /* 1.3758 + We parse the digit string into base 100 numbers (this fits into a byte). 1.3759 + We only add to the buffer in twos, thus if we are parsing an odd character, 1.3760 + that serves as the 'tens' digit while the if we are parsing an even one, that 1.3761 + is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 1.3762 + a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 1.3763 + overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 1.3764 + than all the other bytes. 
1.3765 + 1.3766 + Since we're doing in this reverse we want to put the first digit encountered into the 1.3767 + ones place and the second digit encountered into the tens place. 1.3768 + */ 1.3769 + 1.3770 + if ((digIndx + trailingZeroCount) % 2 == 1) { 1.3771 + // High-order digit case (tens place) 1.3772 + collateVal += (uint8_t)(digVal * 10); 1.3773 + 1.3774 + // We cannot set leadingZeroIndex unless it has been set for the 1.3775 + // low-order digit. Therefore, all we can do for the high-order 1.3776 + // digit is turn it off, never on. 1.3777 + // The only time we will have a high digit without a low is for 1.3778 + // the very first non-zero digit, so no zero check is necessary. 1.3779 + if (collateVal != 0) 1.3780 + leadingZeroIndex = 0; 1.3781 + 1.3782 + // The first pass through, digIndx may exceed the limit, but in that case 1.3783 + // we no longer care about numTempBuf contents since they will be discarded 1.3784 + if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 1.3785 + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 1.3786 + } 1.3787 + collateVal = 0; 1.3788 + } else { 1.3789 + // Low-order digit case (ones place) 1.3790 + collateVal = (uint8_t)digVal; 1.3791 + 1.3792 + // Check for leading zeroes. 1.3793 + if (collateVal == 0) { 1.3794 + if (!leadingZeroIndex) 1.3795 + leadingZeroIndex = (digIndx/2) + 2; 1.3796 + } else 1.3797 + leadingZeroIndex = 0; 1.3798 + 1.3799 + // No need to write to buffer; the case of a last odd digit 1.3800 + // is handled below. 
1.3801 + } 1.3802 + ++digIndx; 1.3803 + } else 1.3804 + ++trailingZeroCount; 1.3805 + 1.3806 + if (!collIter_bos(source)) { 1.3807 + ch = getPrevNormalizedChar(source, status); 1.3808 + //goBackOne(source); 1.3809 + if (U16_IS_TRAIL(ch)) { 1.3810 + backupState(source, &state); 1.3811 + if (!collIter_bos(source)) { 1.3812 + goBackOne(source); 1.3813 + UChar lead = getPrevNormalizedChar(source, status); 1.3814 + 1.3815 + if(U16_IS_LEAD(lead)) { 1.3816 + char32 = U16_GET_SUPPLEMENTARY(lead,ch); 1.3817 + } else { 1.3818 + loadState(source, &state, FALSE); 1.3819 + char32 = ch; 1.3820 + } 1.3821 + } 1.3822 + } else 1.3823 + char32 = ch; 1.3824 + 1.3825 + if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 1.3826 + if (char32 > 0xFFFF) {// For surrogates. 1.3827 + loadState(source, &state, FALSE); 1.3828 + } 1.3829 + // Don't need to "reverse" the goBackOne call, 1.3830 + // as this points to the next position to process.. 1.3831 + //if (char32 > 0xFFFF) // For surrogates. 1.3832 + //getNextNormalizedChar(source); 1.3833 + break; 1.3834 + } 1.3835 + 1.3836 + goBackOne(source); 1.3837 + }else 1.3838 + break; 1.3839 + } 1.3840 + 1.3841 + if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 1.3842 + // our collation element is not too big, go ahead and finish with it 1.3843 + break; 1.3844 + } 1.3845 + // our digit string is too long for a collation element; 1.3846 + // set the limit for it, reset the state and begin again 1.3847 + ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 1.3848 + if ( ceLimit == 0 ) { 1.3849 + ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 1.3850 + } 1.3851 + ch = initial_ch; 1.3852 + loadState(source, &initialState, FALSE); 1.3853 + digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 1.3854 + collateVal = 0; 1.3855 + nonZeroValReached = FALSE; 1.3856 + } 1.3857 + 1.3858 + if (! 
nonZeroValReached) { 1.3859 + digIndx = 2; 1.3860 + trailingZeroCount = 0; 1.3861 + numTempBuf[2] = 6; 1.3862 + } 1.3863 + 1.3864 + if ((digIndx + trailingZeroCount) % 2 != 0) { 1.3865 + numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 1.3866 + digIndx += 1; // The implicit leading zero 1.3867 + } 1.3868 + if (trailingZeroCount % 2 != 0) { 1.3869 + // We had to consume one trailing zero for the low digit 1.3870 + // of the least significant byte 1.3871 + digIndx += 1; // The trailing zero not in the exponent 1.3872 + trailingZeroCount -= 1; 1.3873 + } 1.3874 + 1.3875 + endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 1.3876 + 1.3877 + // Subtract one off of the last byte. Really the first byte here, but it's reversed... 1.3878 + numTempBuf[2] -= 1; 1.3879 + 1.3880 + /* 1.3881 + We want to skip over the first two slots in the buffer. The first slot 1.3882 + is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 1.3883 + sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 1.3884 + The exponent must be adjusted by the number of leading zeroes, and the number of 1.3885 + trailing zeroes. 1.3886 + */ 1.3887 + numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 1.3888 + uint32_t exponent = (digIndx+trailingZeroCount)/2; 1.3889 + if (leadingZeroIndex) 1.3890 + exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 1.3891 + numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 1.3892 + 1.3893 + // Now transfer the collation key to our collIterate struct. 1.3894 + // The total size for our collation key is half of endIndex, rounded up. 1.3895 + int32_t size = (endIndex+1)/2; 1.3896 + if(!ensureCEsCapacity(source, size)) { 1.3897 + return (uint32_t)UCOL_NULLORDER; 1.3898 + } 1.3899 + *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 1.3900 + (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 1.3901 + UCOL_BYTE_COMMON; // Tertiary weight. 
1.3902 + i = endIndex - 1; // Reset the index into the buffer. 1.3903 + while(i >= 2) { 1.3904 + uint32_t primWeight = numTempBuf[i--] << 8; 1.3905 + if ( i >= 2) 1.3906 + primWeight |= numTempBuf[i--]; 1.3907 + *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 1.3908 + } 1.3909 + 1.3910 + source->toReturn = source->CEpos -1; 1.3911 + return *(source->toReturn); 1.3912 + } else { 1.3913 + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 1.3914 + CE = *(CEOffset++); 1.3915 + break; 1.3916 + } 1.3917 + } 1.3918 + 1.3919 + case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 1.3920 + { 1.3921 + static const uint32_t 1.3922 + SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 1.3923 + //const uint32_t LCount = 19; 1.3924 + static const uint32_t VCount = 21; 1.3925 + static const uint32_t TCount = 28; 1.3926 + //const uint32_t NCount = VCount * TCount; /* 588 */ 1.3927 + //const uint32_t SCount = LCount * NCount; /* 11172 */ 1.3928 + 1.3929 + uint32_t L = ch - SBase; 1.3930 + /* 1.3931 + divide into pieces. 
1.3932 + we do it in this order since some compilers can do % and / in one 1.3933 + operation 1.3934 + */ 1.3935 + uint32_t T = L % TCount; 1.3936 + L /= TCount; 1.3937 + uint32_t V = L % VCount; 1.3938 + L /= VCount; 1.3939 + 1.3940 + /* offset them */ 1.3941 + L += LBase; 1.3942 + V += VBase; 1.3943 + T += TBase; 1.3944 + 1.3945 + int32_t firstOffset = (int32_t)(source->pos - source->string); 1.3946 + source->appendOffset(firstOffset, *status); 1.3947 + 1.3948 + /* 1.3949 + * return the first CE, but first put the rest into the expansion buffer 1.3950 + */ 1.3951 + if (!source->coll->image->jamoSpecial) { 1.3952 + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 1.3953 + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 1.3954 + source->appendOffset(firstOffset + 1, *status); 1.3955 + 1.3956 + if (T != TBase) { 1.3957 + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 1.3958 + source->appendOffset(firstOffset + 1, *status); 1.3959 + } 1.3960 + 1.3961 + source->toReturn = source->CEpos - 1; 1.3962 + 1.3963 + source->offsetReturn = source->offsetStore - 1; 1.3964 + if (source->offsetReturn == source->offsetBuffer) { 1.3965 + source->offsetStore = source->offsetBuffer; 1.3966 + } 1.3967 + 1.3968 + return *(source->toReturn); 1.3969 + } else { 1.3970 + // Since Hanguls pass the FCD check, it is 1.3971 + // guaranteed that we won't be in 1.3972 + // the normalization buffer if something like this happens 1.3973 + 1.3974 + // Move Jamos into normalization buffer 1.3975 + UChar *tempbuffer = source->writableBuffer.getBuffer(5); 1.3976 + int32_t tempbufferLength, jamoOffset; 1.3977 + tempbuffer[0] = 0; 1.3978 + tempbuffer[1] = (UChar)L; 1.3979 + tempbuffer[2] = (UChar)V; 1.3980 + if (T != TBase) { 1.3981 + tempbuffer[3] = (UChar)T; 1.3982 + tempbufferLength = 4; 1.3983 + } else { 1.3984 + tempbufferLength = 3; 1.3985 + } 1.3986 + source->writableBuffer.releaseBuffer(tempbufferLength); 1.3987 + 1.3988 + // Indicate where to 
continue in main input string after exhausting the writableBuffer 1.3989 + if (source->pos == source->string) { 1.3990 + jamoOffset = 0; 1.3991 + source->fcdPosition = NULL; 1.3992 + } else { 1.3993 + jamoOffset = source->pos - source->string; 1.3994 + source->fcdPosition = source->pos-1; 1.3995 + } 1.3996 + 1.3997 + // Append offsets for the additional chars 1.3998 + // (not the 0, and not the L whose offsets match the original Hangul) 1.3999 + int32_t jamoRemaining = tempbufferLength - 2; 1.4000 + jamoOffset++; // appended offsets should match end of original Hangul 1.4001 + while (jamoRemaining-- > 0) { 1.4002 + source->appendOffset(jamoOffset, *status); 1.4003 + } 1.4004 + 1.4005 + source->offsetRepeatValue = jamoOffset; 1.4006 + 1.4007 + source->offsetReturn = source->offsetStore - 1; 1.4008 + if (source->offsetReturn == source->offsetBuffer) { 1.4009 + source->offsetStore = source->offsetBuffer; 1.4010 + } 1.4011 + 1.4012 + source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 1.4013 + source->origFlags = source->flags; 1.4014 + source->flags |= UCOL_ITER_INNORMBUF; 1.4015 + source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1.4016 + 1.4017 + return(UCOL_IGNORABLE); 1.4018 + } 1.4019 + } 1.4020 + 1.4021 + case IMPLICIT_TAG: /* everything that is not defined otherwise */ 1.4022 + return getPrevImplicit(ch, source); 1.4023 + 1.4024 + // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 1.4025 + case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 1.4026 + return getPrevImplicit(ch, source); 1.4027 + 1.4028 + case SURROGATE_TAG: /* This is a surrogate pair */ 1.4029 + /* essentially an engaged lead surrogate. 
*/ 1.4030 + /* if you have encountered it here, it means that a */ 1.4031 + /* broken sequence was encountered and this is an error */ 1.4032 + return UCOL_NOT_FOUND; 1.4033 + 1.4034 + case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 1.4035 + return UCOL_NOT_FOUND; /* broken surrogate sequence */ 1.4036 + 1.4037 + case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 1.4038 + { 1.4039 + UChar32 cp = 0; 1.4040 + UChar prevChar; 1.4041 + const UChar *prev; 1.4042 + if (isAtStartPrevIterate(source)) { 1.4043 + /* we are at the start of the string, wrong place to be at */ 1.4044 + return UCOL_NOT_FOUND; 1.4045 + } 1.4046 + if (source->pos != source->writableBuffer.getBuffer()) { 1.4047 + prev = source->pos - 1; 1.4048 + } else { 1.4049 + prev = source->fcdPosition; 1.4050 + } 1.4051 + prevChar = *prev; 1.4052 + 1.4053 + /* Handles Han and Supplementary characters here.*/ 1.4054 + if (U16_IS_LEAD(prevChar)) { 1.4055 + cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 1.4056 + source->pos = prev; 1.4057 + } else { 1.4058 + return UCOL_NOT_FOUND; /* like unassigned */ 1.4059 + } 1.4060 + 1.4061 + return getPrevImplicit(cp, source); 1.4062 + } 1.4063 + 1.4064 + /* UCA is filled with these. Tailorings are NOT_FOUND */ 1.4065 + /* not yet implemented */ 1.4066 + case CHARSET_TAG: /* this tag always returns */ 1.4067 + /* probably after 1.8 */ 1.4068 + return UCOL_NOT_FOUND; 1.4069 + 1.4070 + default: /* this tag always returns */ 1.4071 + *status = U_INTERNAL_PROGRAM_ERROR; 1.4072 + CE=0; 1.4073 + break; 1.4074 + } 1.4075 + 1.4076 + if (CE <= UCOL_NOT_FOUND) { 1.4077 + break; 1.4078 + } 1.4079 + } 1.4080 + 1.4081 + return CE; 1.4082 +} 1.4083 + 1.4084 +/* This should really be a macro */ 1.4085 +/* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 1.4086 +/* secondaries in French */ 1.4087 +/* 1.4088 +void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 1.4089 + uint8_t temp; 1.4090 + while(start<end) { 1.4091 + temp = *start; 1.4092 + *start++ = *end; 1.4093 + *end-- = temp; 1.4094 + } 1.4095 +} 1.4096 +*/ 1.4097 + 1.4098 +#define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 1.4099 + TYPE tempA; \ 1.4100 +while((start)<(end)) { \ 1.4101 + tempA = *(start); \ 1.4102 + *(start)++ = *(end); \ 1.4103 + *(end)-- = tempA; \ 1.4104 +} \ 1.4105 +} 1.4106 + 1.4107 +/****************************************************************************/ 1.4108 +/* Following are the sortkey generation functions */ 1.4109 +/* */ 1.4110 +/****************************************************************************/ 1.4111 + 1.4112 +U_CAPI int32_t U_EXPORT2 1.4113 +ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 1.4114 + const uint8_t *src2, int32_t src2Length, 1.4115 + uint8_t *dest, int32_t destCapacity) { 1.4116 + /* check arguments */ 1.4117 + if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 1.4118 + src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 1.4119 + destCapacity<0 || (destCapacity>0 && dest==NULL) 1.4120 + ) { 1.4121 + /* error, attempt to write a zero byte and return 0 */ 1.4122 + if(dest!=NULL && destCapacity>0) { 1.4123 + *dest=0; 1.4124 + } 1.4125 + return 0; 1.4126 + } 1.4127 + 1.4128 + /* check lengths and capacity */ 1.4129 + if(src1Length<0) { 1.4130 + src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 1.4131 + } 1.4132 + if(src2Length<0) { 1.4133 + src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 1.4134 + } 1.4135 + 1.4136 + int32_t destLength=src1Length+src2Length; 1.4137 + if(destLength>destCapacity) { 1.4138 + /* the merged sort key does not fit into the destination */ 1.4139 + return destLength; 1.4140 + } 1.4141 + 1.4142 + /* merge the 
sort keys with the same number of levels */ 1.4143 + uint8_t *p=dest; 1.4144 + for(;;) { 1.4145 + /* copy level from src1 not including 00 or 01 */ 1.4146 + uint8_t b; 1.4147 + while((b=*src1)>=2) { 1.4148 + ++src1; 1.4149 + *p++=b; 1.4150 + } 1.4151 + 1.4152 + /* add a 02 merge separator */ 1.4153 + *p++=2; 1.4154 + 1.4155 + /* copy level from src2 not including 00 or 01 */ 1.4156 + while((b=*src2)>=2) { 1.4157 + ++src2; 1.4158 + *p++=b; 1.4159 + } 1.4160 + 1.4161 + /* if both sort keys have another level, then add a 01 level separator and continue */ 1.4162 + if(*src1==1 && *src2==1) { 1.4163 + ++src1; 1.4164 + ++src2; 1.4165 + *p++=1; 1.4166 + } else { 1.4167 + break; 1.4168 + } 1.4169 + } 1.4170 + 1.4171 + /* 1.4172 + * here, at least one sort key is finished now, but the other one 1.4173 + * might have some contents left from containing more levels; 1.4174 + * that contents is just appended to the result 1.4175 + */ 1.4176 + if(*src1!=0) { 1.4177 + /* src1 is not finished, therefore *src2==0, and src1 is appended */ 1.4178 + src2=src1; 1.4179 + } 1.4180 + /* append src2, "the other, unfinished sort key" */ 1.4181 + while((*p++=*src2++)!=0) {} 1.4182 + 1.4183 + /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 1.4184 + return (int32_t)(p-dest); 1.4185 +} 1.4186 + 1.4187 +U_NAMESPACE_BEGIN 1.4188 + 1.4189 +class SortKeyByteSink : public ByteSink { 1.4190 +public: 1.4191 + SortKeyByteSink(char *dest, int32_t destCapacity) 1.4192 + : buffer_(dest), capacity_(destCapacity), 1.4193 + appended_(0) { 1.4194 + if (buffer_ == NULL) { 1.4195 + capacity_ = 0; 1.4196 + } else if(capacity_ < 0) { 1.4197 + buffer_ = NULL; 1.4198 + capacity_ = 0; 1.4199 + } 1.4200 + } 1.4201 + virtual ~SortKeyByteSink(); 1.4202 + 1.4203 + virtual void Append(const char *bytes, int32_t n); 1.4204 + void Append(uint32_t b) { 1.4205 + if (appended_ < capacity_ || Resize(1, appended_)) { 1.4206 + buffer_[appended_] = (char)b; 1.4207 + 
} 1.4208 + ++appended_; 1.4209 + } 1.4210 + void Append(uint32_t b1, uint32_t b2) { 1.4211 + int32_t a2 = appended_ + 2; 1.4212 + if (a2 <= capacity_ || Resize(2, appended_)) { 1.4213 + buffer_[appended_] = (char)b1; 1.4214 + buffer_[appended_ + 1] = (char)b2; 1.4215 + } else if(appended_ < capacity_) { 1.4216 + buffer_[appended_] = (char)b1; 1.4217 + } 1.4218 + appended_ = a2; 1.4219 + } 1.4220 + virtual char *GetAppendBuffer(int32_t min_capacity, 1.4221 + int32_t desired_capacity_hint, 1.4222 + char *scratch, int32_t scratch_capacity, 1.4223 + int32_t *result_capacity); 1.4224 + int32_t NumberOfBytesAppended() const { return appended_; } 1.4225 + /** @return FALSE if memory allocation failed */ 1.4226 + UBool IsOk() const { return buffer_ != NULL; } 1.4227 + 1.4228 +protected: 1.4229 + virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0; 1.4230 + virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; 1.4231 + 1.4232 + void SetNotOk() { 1.4233 + buffer_ = NULL; 1.4234 + capacity_ = 0; 1.4235 + } 1.4236 + 1.4237 + char *buffer_; 1.4238 + int32_t capacity_; 1.4239 + int32_t appended_; 1.4240 + 1.4241 +private: 1.4242 + SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented 1.4243 + SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented 1.4244 +}; 1.4245 + 1.4246 +SortKeyByteSink::~SortKeyByteSink() {} 1.4247 + 1.4248 +void 1.4249 +SortKeyByteSink::Append(const char *bytes, int32_t n) { 1.4250 + if (n <= 0 || bytes == NULL) { 1.4251 + return; 1.4252 + } 1.4253 + int32_t length = appended_; 1.4254 + appended_ += n; 1.4255 + if ((buffer_ + length) == bytes) { 1.4256 + return; // the caller used GetAppendBuffer() and wrote the bytes already 1.4257 + } 1.4258 + int32_t available = capacity_ - length; 1.4259 + if (n <= available) { 1.4260 + uprv_memcpy(buffer_ + length, bytes, n); 1.4261 + } else { 1.4262 + AppendBeyondCapacity(bytes, n, length); 1.4263 + } 1.4264 +} 
1.4265 + 1.4266 +char * 1.4267 +SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, 1.4268 + int32_t desired_capacity_hint, 1.4269 + char *scratch, 1.4270 + int32_t scratch_capacity, 1.4271 + int32_t *result_capacity) { 1.4272 + if (min_capacity < 1 || scratch_capacity < min_capacity) { 1.4273 + *result_capacity = 0; 1.4274 + return NULL; 1.4275 + } 1.4276 + int32_t available = capacity_ - appended_; 1.4277 + if (available >= min_capacity) { 1.4278 + *result_capacity = available; 1.4279 + return buffer_ + appended_; 1.4280 + } else if (Resize(desired_capacity_hint, appended_)) { 1.4281 + *result_capacity = capacity_ - appended_; 1.4282 + return buffer_ + appended_; 1.4283 + } else { 1.4284 + *result_capacity = scratch_capacity; 1.4285 + return scratch; 1.4286 + } 1.4287 +} 1.4288 + 1.4289 +class FixedSortKeyByteSink : public SortKeyByteSink { 1.4290 +public: 1.4291 + FixedSortKeyByteSink(char *dest, int32_t destCapacity) 1.4292 + : SortKeyByteSink(dest, destCapacity) {} 1.4293 + virtual ~FixedSortKeyByteSink(); 1.4294 + 1.4295 +private: 1.4296 + virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 1.4297 + virtual UBool Resize(int32_t appendCapacity, int32_t length); 1.4298 +}; 1.4299 + 1.4300 +FixedSortKeyByteSink::~FixedSortKeyByteSink() {} 1.4301 + 1.4302 +void 1.4303 +FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { 1.4304 + // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 1.4305 + // Fill the buffer completely. 
1.4306 + int32_t available = capacity_ - length; 1.4307 + if (available > 0) { 1.4308 + uprv_memcpy(buffer_ + length, bytes, available); 1.4309 + } 1.4310 +} 1.4311 + 1.4312 +UBool 1.4313 +FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { 1.4314 + return FALSE; 1.4315 +} 1.4316 + 1.4317 +class CollationKeyByteSink : public SortKeyByteSink { 1.4318 +public: 1.4319 + CollationKeyByteSink(CollationKey &key) 1.4320 + : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), 1.4321 + key_(key) {} 1.4322 + virtual ~CollationKeyByteSink(); 1.4323 + 1.4324 +private: 1.4325 + virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 1.4326 + virtual UBool Resize(int32_t appendCapacity, int32_t length); 1.4327 + 1.4328 + CollationKey &key_; 1.4329 +}; 1.4330 + 1.4331 +CollationKeyByteSink::~CollationKeyByteSink() {} 1.4332 + 1.4333 +void 1.4334 +CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { 1.4335 + // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 1.4336 + if (Resize(n, length)) { 1.4337 + uprv_memcpy(buffer_ + length, bytes, n); 1.4338 + } 1.4339 +} 1.4340 + 1.4341 +UBool 1.4342 +CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 1.4343 + if (buffer_ == NULL) { 1.4344 + return FALSE; // allocation failed before already 1.4345 + } 1.4346 + int32_t newCapacity = 2 * capacity_; 1.4347 + int32_t altCapacity = length + 2 * appendCapacity; 1.4348 + if (newCapacity < altCapacity) { 1.4349 + newCapacity = altCapacity; 1.4350 + } 1.4351 + if (newCapacity < 200) { 1.4352 + newCapacity = 200; 1.4353 + } 1.4354 + uint8_t *newBuffer = key_.reallocate(newCapacity, length); 1.4355 + if (newBuffer == NULL) { 1.4356 + SetNotOk(); 1.4357 + return FALSE; 1.4358 + } 1.4359 + buffer_ = reinterpret_cast<char *>(newBuffer); 1.4360 + capacity_ = newCapacity; 1.4361 + return TRUE; 1.4362 +} 1.4363 + 1.4364 +/** 1.4365 + * uint8_t byte 
buffer, similar to CharString but simpler. 1.4366 + */ 1.4367 +class SortKeyLevel : public UMemory { 1.4368 +public: 1.4369 + SortKeyLevel() : len(0), ok(TRUE) {} 1.4370 + ~SortKeyLevel() {} 1.4371 + 1.4372 + /** @return FALSE if memory allocation failed */ 1.4373 + UBool isOk() const { return ok; } 1.4374 + UBool isEmpty() const { return len == 0; } 1.4375 + int32_t length() const { return len; } 1.4376 + const uint8_t *data() const { return buffer.getAlias(); } 1.4377 + uint8_t operator[](int32_t index) const { return buffer[index]; } 1.4378 + 1.4379 + void appendByte(uint32_t b); 1.4380 + 1.4381 + void appendTo(ByteSink &sink) const { 1.4382 + sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); 1.4383 + } 1.4384 + 1.4385 + uint8_t &lastByte() { 1.4386 + U_ASSERT(len > 0); 1.4387 + return buffer[len - 1]; 1.4388 + } 1.4389 + 1.4390 + uint8_t *getLastFewBytes(int32_t n) { 1.4391 + if (ok && len >= n) { 1.4392 + return buffer.getAlias() + len - n; 1.4393 + } else { 1.4394 + return NULL; 1.4395 + } 1.4396 + } 1.4397 + 1.4398 +private: 1.4399 + MaybeStackArray<uint8_t, 40> buffer; 1.4400 + int32_t len; 1.4401 + UBool ok; 1.4402 + 1.4403 + UBool ensureCapacity(int32_t appendCapacity); 1.4404 + 1.4405 + SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class 1.4406 + SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class 1.4407 +}; 1.4408 + 1.4409 +void SortKeyLevel::appendByte(uint32_t b) { 1.4410 + if(len < buffer.getCapacity() || ensureCapacity(1)) { 1.4411 + buffer[len++] = (uint8_t)b; 1.4412 + } 1.4413 +} 1.4414 + 1.4415 +UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { 1.4416 + if(!ok) { 1.4417 + return FALSE; 1.4418 + } 1.4419 + int32_t newCapacity = 2 * buffer.getCapacity(); 1.4420 + int32_t altCapacity = len + 2 * appendCapacity; 1.4421 + if (newCapacity < altCapacity) { 1.4422 + newCapacity = altCapacity; 1.4423 + } 1.4424 + if (newCapacity < 200) { 1.4425 + newCapacity = 200; 1.4426 + 
} 1.4427 + if(buffer.resize(newCapacity, len)==NULL) { 1.4428 + return ok = FALSE; 1.4429 + } 1.4430 + return TRUE; 1.4431 +} 1.4432 + 1.4433 +U_NAMESPACE_END 1.4434 + 1.4435 +/* sortkey API */ 1.4436 +U_CAPI int32_t U_EXPORT2 1.4437 +ucol_getSortKey(const UCollator *coll, 1.4438 + const UChar *source, 1.4439 + int32_t sourceLength, 1.4440 + uint8_t *result, 1.4441 + int32_t resultLength) 1.4442 +{ 1.4443 + UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 1.4444 + if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 1.4445 + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 1.4446 + ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 1.4447 + } 1.4448 + 1.4449 + if(coll->delegate != NULL) { 1.4450 + return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength); 1.4451 + } 1.4452 + 1.4453 + UErrorCode status = U_ZERO_ERROR; 1.4454 + int32_t keySize = 0; 1.4455 + 1.4456 + if(source != NULL) { 1.4457 + // source == NULL is actually an error situation, but we would need to 1.4458 + // have an error code to return it. Until we introduce a new 1.4459 + // API, it stays like this 1.4460 + 1.4461 + /* this uses the function pointer that is set in updateinternalstate */ 1.4462 + /* currently, there are two funcs: */ 1.4463 + /*ucol_calcSortKey(...);*/ 1.4464 + /*ucol_calcSortKeySimpleTertiary(...);*/ 1.4465 + 1.4466 + uint8_t noDest[1] = { 0 }; 1.4467 + if(result == NULL) { 1.4468 + // Distinguish pure preflighting from an allocation error. 
1.4469 + result = noDest; 1.4470 + resultLength = 0; 1.4471 + } 1.4472 + FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength); 1.4473 + coll->sortKeyGen(coll, source, sourceLength, sink, &status); 1.4474 + if(U_SUCCESS(status)) { 1.4475 + keySize = sink.NumberOfBytesAppended(); 1.4476 + } 1.4477 + } 1.4478 + UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 1.4479 + UTRACE_EXIT_STATUS(status); 1.4480 + return keySize; 1.4481 +} 1.4482 + 1.4483 +U_CFUNC int32_t 1.4484 +ucol_getCollationKey(const UCollator *coll, 1.4485 + const UChar *source, int32_t sourceLength, 1.4486 + CollationKey &key, 1.4487 + UErrorCode &errorCode) { 1.4488 + CollationKeyByteSink sink(key); 1.4489 + coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); 1.4490 + return sink.NumberOfBytesAppended(); 1.4491 +} 1.4492 + 1.4493 +// Is this primary weight compressible? 1.4494 +// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 1.4495 +// TODO: This should use per-lead-byte flags from FractionalUCA.txt. 1.4496 +static inline UBool 1.4497 +isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 1.4498 + return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 1.4499 +} 1.4500 + 1.4501 +static 1.4502 +inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { 1.4503 + if (caseShift == 0) { 1.4504 + cases.appendByte(UCOL_CASE_BYTE_START); 1.4505 + caseShift = UCOL_CASE_SHIFT_START; 1.4506 + } 1.4507 +} 1.4508 + 1.4509 +// Packs the secondary buffer when processing French locale. 1.4510 +static void 1.4511 +packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) { 1.4512 + secondaries += secsize; // We read the secondary-level bytes back to front. 
1.4513 + uint8_t secondary; 1.4514 + int32_t count2 = 0; 1.4515 + int32_t i = 0; 1.4516 + // we use i here since the key size already accounts for terminators, so we'll discard the increment 1.4517 + for(i = 0; i<secsize; i++) { 1.4518 + secondary = *(secondaries-i-1); 1.4519 + /* This is compression code. */ 1.4520 + if (secondary == UCOL_COMMON2) { 1.4521 + ++count2; 1.4522 + } else { 1.4523 + if (count2 > 0) { 1.4524 + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 1.4525 + while (count2 > UCOL_TOP_COUNT2) { 1.4526 + result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 1.4527 + count2 -= (uint32_t)UCOL_TOP_COUNT2; 1.4528 + } 1.4529 + result.Append(UCOL_COMMON_TOP2 - (count2-1)); 1.4530 + } else { 1.4531 + while (count2 > UCOL_BOT_COUNT2) { 1.4532 + result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.4533 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.4534 + } 1.4535 + result.Append(UCOL_COMMON_BOT2 + (count2-1)); 1.4536 + } 1.4537 + count2 = 0; 1.4538 + } 1.4539 + result.Append(secondary); 1.4540 + } 1.4541 + } 1.4542 + if (count2 > 0) { 1.4543 + while (count2 > UCOL_BOT_COUNT2) { 1.4544 + result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.4545 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.4546 + } 1.4547 + result.Append(UCOL_COMMON_BOT2 + (count2-1)); 1.4548 + } 1.4549 +} 1.4550 + 1.4551 +#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 1.4552 + 1.4553 +/* This is the sortkey work horse function */ 1.4554 +U_CFUNC void U_CALLCONV 1.4555 +ucol_calcSortKey(const UCollator *coll, 1.4556 + const UChar *source, 1.4557 + int32_t sourceLength, 1.4558 + SortKeyByteSink &result, 1.4559 + UErrorCode *status) 1.4560 +{ 1.4561 + if(U_FAILURE(*status)) { 1.4562 + return; 1.4563 + } 1.4564 + 1.4565 + SortKeyByteSink &primaries = result; 1.4566 + SortKeyLevel secondaries; 1.4567 + SortKeyLevel tertiaries; 1.4568 + SortKeyLevel cases; 1.4569 + SortKeyLevel quads; 1.4570 + 1.4571 + UnicodeString normSource; 1.4572 + 1.4573 + int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 1.4574 + 1.4575 + UColAttributeValue strength = coll->strength; 1.4576 + 1.4577 + uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 1.4578 + uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 1.4579 + uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 1.4580 + UBool compareIdent = (strength == UCOL_IDENTICAL); 1.4581 + UBool doCase = (coll->caseLevel == UCOL_ON); 1.4582 + UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 1.4583 + UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 1.4584 + //UBool qShifted = shifted && (compareQuad == 0); 1.4585 + UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 1.4586 + 1.4587 + uint32_t variableTopValue = coll->variableTopValue; 1.4588 + // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 1.4589 + // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 1.4590 + uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 1.4591 + uint8_t UCOL_HIRAGANA_QUAD = 0; 1.4592 + if(doHiragana) { 1.4593 + UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 1.4594 + /* allocate one more space for hiragana, value for hiragana */ 1.4595 + } 1.4596 + uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 1.4597 + 1.4598 + /* support for special features like caselevel and funky secondaries */ 1.4599 + int32_t lastSecondaryLength = 0; 1.4600 + uint32_t caseShift = 0; 1.4601 + 1.4602 + /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 1.4603 + const Normalizer2 *norm2; 1.4604 + if(compareIdent) { 1.4605 + norm2 = Normalizer2Factory::getNFDInstance(*status); 1.4606 + } else if(coll->normalizationMode != UCOL_OFF) { 1.4607 + norm2 = Normalizer2Factory::getFCDInstance(*status); 1.4608 + } else { 1.4609 + norm2 = NULL; 1.4610 + } 1.4611 + if(norm2 != NULL) { 1.4612 + normSource.setTo(FALSE, source, len); 1.4613 + int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 1.4614 + if(qcYesLength != len) { 1.4615 + UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 1.4616 + normSource.truncate(qcYesLength); 1.4617 + norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 1.4618 + source = normSource.getBuffer(); 1.4619 + len = normSource.length(); 1.4620 + } 1.4621 + } 1.4622 + collIterate s; 1.4623 + IInit_collIterate(coll, source, len, &s, status); 1.4624 + if(U_FAILURE(*status)) { 1.4625 + return; 1.4626 + } 1.4627 + s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
1.4628 + 1.4629 + uint32_t order = 0; 1.4630 + 1.4631 + uint8_t primary1 = 0; 1.4632 + uint8_t primary2 = 0; 1.4633 + uint8_t secondary = 0; 1.4634 + uint8_t tertiary = 0; 1.4635 + uint8_t caseSwitch = coll->caseSwitch; 1.4636 + uint8_t tertiaryMask = coll->tertiaryMask; 1.4637 + int8_t tertiaryAddition = coll->tertiaryAddition; 1.4638 + uint8_t tertiaryTop = coll->tertiaryTop; 1.4639 + uint8_t tertiaryBottom = coll->tertiaryBottom; 1.4640 + uint8_t tertiaryCommon = coll->tertiaryCommon; 1.4641 + uint8_t caseBits = 0; 1.4642 + 1.4643 + UBool wasShifted = FALSE; 1.4644 + UBool notIsContinuation = FALSE; 1.4645 + 1.4646 + uint32_t count2 = 0, count3 = 0, count4 = 0; 1.4647 + uint8_t leadPrimary = 0; 1.4648 + 1.4649 + for(;;) { 1.4650 + order = ucol_IGetNextCE(coll, &s, status); 1.4651 + if(order == UCOL_NO_MORE_CES) { 1.4652 + break; 1.4653 + } 1.4654 + 1.4655 + if(order == 0) { 1.4656 + continue; 1.4657 + } 1.4658 + 1.4659 + notIsContinuation = !isContinuation(order); 1.4660 + 1.4661 + if(notIsContinuation) { 1.4662 + tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 1.4663 + } else { 1.4664 + tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 1.4665 + } 1.4666 + 1.4667 + secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 1.4668 + primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 1.4669 + primary1 = (uint8_t)(order >> 8); 1.4670 + 1.4671 + uint8_t originalPrimary1 = primary1; 1.4672 + if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 1.4673 + primary1 = coll->leadBytePermutationTable[primary1]; 1.4674 + } 1.4675 + 1.4676 + if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 1.4677 + || (!notIsContinuation && wasShifted))) 1.4678 + || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 1.4679 + { 1.4680 + /* and other ignorables should be removed if following a shifted code point */ 1.4681 + if(primary1 == 0) { /* if we were shifted and we got an ignorable code 
point */ 1.4682 + /* we should just completely ignore it */ 1.4683 + continue; 1.4684 + } 1.4685 + if(compareQuad == 0) { 1.4686 + if(count4 > 0) { 1.4687 + while (count4 > UCOL_BOT_COUNT4) { 1.4688 + quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 1.4689 + count4 -= UCOL_BOT_COUNT4; 1.4690 + } 1.4691 + quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 1.4692 + count4 = 0; 1.4693 + } 1.4694 + /* We are dealing with a variable and we're treating them as shifted */ 1.4695 + /* This is a shifted ignorable */ 1.4696 + if(primary1 != 0) { /* we need to check this since we could be in continuation */ 1.4697 + quads.appendByte(primary1); 1.4698 + } 1.4699 + if(primary2 != 0) { 1.4700 + quads.appendByte(primary2); 1.4701 + } 1.4702 + } 1.4703 + wasShifted = TRUE; 1.4704 + } else { 1.4705 + wasShifted = FALSE; 1.4706 + /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 1.4707 + /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 1.4708 + /* regular and simple sortkey calc */ 1.4709 + if(primary1 != UCOL_IGNORABLE) { 1.4710 + if(notIsContinuation) { 1.4711 + if(leadPrimary == primary1) { 1.4712 + primaries.Append(primary2); 1.4713 + } else { 1.4714 + if(leadPrimary != 0) { 1.4715 + primaries.Append((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 1.4716 + } 1.4717 + if(primary2 == UCOL_IGNORABLE) { 1.4718 + /* one byter, not compressed */ 1.4719 + primaries.Append(primary1); 1.4720 + leadPrimary = 0; 1.4721 + } else if(isCompressible(coll, originalPrimary1)) { 1.4722 + /* compress */ 1.4723 + primaries.Append(leadPrimary = primary1, primary2); 1.4724 + } else { 1.4725 + leadPrimary = 0; 1.4726 + primaries.Append(primary1, primary2); 1.4727 + } 1.4728 + } 1.4729 + } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 1.4730 + if(primary2 == UCOL_IGNORABLE) { 1.4731 + primaries.Append(primary1); 1.4732 + } else { 1.4733 + primaries.Append(primary1, primary2); 1.4734 + } 1.4735 + } 1.4736 + } 1.4737 + 1.4738 + if(secondary > compareSec) { 1.4739 + if(!isFrenchSec) { 1.4740 + /* This is compression code. */ 1.4741 + if (secondary == UCOL_COMMON2 && notIsContinuation) { 1.4742 + ++count2; 1.4743 + } else { 1.4744 + if (count2 > 0) { 1.4745 + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
1.4746 + while (count2 > UCOL_TOP_COUNT2) { 1.4747 + secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 1.4748 + count2 -= (uint32_t)UCOL_TOP_COUNT2; 1.4749 + } 1.4750 + secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 1.4751 + } else { 1.4752 + while (count2 > UCOL_BOT_COUNT2) { 1.4753 + secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.4754 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.4755 + } 1.4756 + secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 1.4757 + } 1.4758 + count2 = 0; 1.4759 + } 1.4760 + secondaries.appendByte(secondary); 1.4761 + } 1.4762 + } else { 1.4763 + /* Do the special handling for French secondaries */ 1.4764 + /* We need to get continuation elements and do intermediate restore */ 1.4765 + /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 1.4766 + if(notIsContinuation) { 1.4767 + if (lastSecondaryLength > 1) { 1.4768 + uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 1.4769 + if (frenchStartPtr != NULL) { 1.4770 + /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 1.4771 + uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 1.4772 + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 1.4773 + } 1.4774 + } 1.4775 + lastSecondaryLength = 1; 1.4776 + } else { 1.4777 + ++lastSecondaryLength; 1.4778 + } 1.4779 + secondaries.appendByte(secondary); 1.4780 + } 1.4781 + } 1.4782 + 1.4783 + if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 1.4784 + // do the case level if we need to do it. 
We don't want to calculate 1.4785 + // case level for primary ignorables if we have only primary strength and case level 1.4786 + // otherwise we would break well formedness of CEs 1.4787 + doCaseShift(cases, caseShift); 1.4788 + if(notIsContinuation) { 1.4789 + caseBits = (uint8_t)(tertiary & 0xC0); 1.4790 + 1.4791 + if(tertiary != 0) { 1.4792 + if(coll->caseFirst == UCOL_UPPER_FIRST) { 1.4793 + if((caseBits & 0xC0) == 0) { 1.4794 + cases.lastByte() |= 1 << (--caseShift); 1.4795 + } else { 1.4796 + cases.lastByte() |= 0 << (--caseShift); 1.4797 + /* second bit */ 1.4798 + doCaseShift(cases, caseShift); 1.4799 + cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift); 1.4800 + } 1.4801 + } else { 1.4802 + if((caseBits & 0xC0) == 0) { 1.4803 + cases.lastByte() |= 0 << (--caseShift); 1.4804 + } else { 1.4805 + cases.lastByte() |= 1 << (--caseShift); 1.4806 + /* second bit */ 1.4807 + doCaseShift(cases, caseShift); 1.4808 + cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift); 1.4809 + } 1.4810 + } 1.4811 + } 1.4812 + } 1.4813 + } else { 1.4814 + if(notIsContinuation) { 1.4815 + tertiary ^= caseSwitch; 1.4816 + } 1.4817 + } 1.4818 + 1.4819 + tertiary &= tertiaryMask; 1.4820 + if(tertiary > compareTer) { 1.4821 + /* This is compression code. 
*/ 1.4822 + /* sequence size check is included in the if clause */ 1.4823 + if (tertiary == tertiaryCommon && notIsContinuation) { 1.4824 + ++count3; 1.4825 + } else { 1.4826 + if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 1.4827 + tertiary += tertiaryAddition; 1.4828 + } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 1.4829 + tertiary -= tertiaryAddition; 1.4830 + } 1.4831 + if (count3 > 0) { 1.4832 + if ((tertiary > tertiaryCommon)) { 1.4833 + while (count3 > coll->tertiaryTopCount) { 1.4834 + tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 1.4835 + count3 -= (uint32_t)coll->tertiaryTopCount; 1.4836 + } 1.4837 + tertiaries.appendByte(tertiaryTop - (count3-1)); 1.4838 + } else { 1.4839 + while (count3 > coll->tertiaryBottomCount) { 1.4840 + tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 1.4841 + count3 -= (uint32_t)coll->tertiaryBottomCount; 1.4842 + } 1.4843 + tertiaries.appendByte(tertiaryBottom + (count3-1)); 1.4844 + } 1.4845 + count3 = 0; 1.4846 + } 1.4847 + tertiaries.appendByte(tertiary); 1.4848 + } 1.4849 + } 1.4850 + 1.4851 + if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 1.4852 + if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 1.4853 + if(count4>0) { // Close this part 1.4854 + while (count4 > UCOL_BOT_COUNT4) { 1.4855 + quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 1.4856 + count4 -= UCOL_BOT_COUNT4; 1.4857 + } 1.4858 + quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 1.4859 + count4 = 0; 1.4860 + } 1.4861 + quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana 1.4862 + } else { // This wasn't Hiragana, so we can continue adding stuff 1.4863 + count4++; 1.4864 + } 1.4865 + } 1.4866 + } 1.4867 + } 1.4868 + 1.4869 + /* Here, we are generally done with processing */ 1.4870 + /* bailing out would not be too productive */ 1.4871 + 1.4872 + UBool ok = TRUE; 1.4873 + if(U_SUCCESS(*status)) { 1.4874 + /* we have done 
all the CE's, now let's put them together to form a key */ 1.4875 + if(compareSec == 0) { 1.4876 + if (count2 > 0) { 1.4877 + while (count2 > UCOL_BOT_COUNT2) { 1.4878 + secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.4879 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.4880 + } 1.4881 + secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 1.4882 + } 1.4883 + result.Append(UCOL_LEVELTERMINATOR); 1.4884 + if(!secondaries.isOk()) { 1.4885 + ok = FALSE; 1.4886 + } else if(!isFrenchSec) { 1.4887 + secondaries.appendTo(result); 1.4888 + } else { 1.4889 + // If there are any unresolved continuation secondaries, 1.4890 + // reverse them here so that we can reverse the whole secondary thing. 1.4891 + if (lastSecondaryLength > 1) { 1.4892 + uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 1.4893 + if (frenchStartPtr != NULL) { 1.4894 + /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 1.4895 + uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 1.4896 + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 1.4897 + } 1.4898 + } 1.4899 + packFrench(secondaries.data(), secondaries.length(), result); 1.4900 + } 1.4901 + } 1.4902 + 1.4903 + if(doCase) { 1.4904 + ok &= cases.isOk(); 1.4905 + result.Append(UCOL_LEVELTERMINATOR); 1.4906 + cases.appendTo(result); 1.4907 + } 1.4908 + 1.4909 + if(compareTer == 0) { 1.4910 + if (count3 > 0) { 1.4911 + if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 1.4912 + while (count3 >= coll->tertiaryTopCount) { 1.4913 + tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 1.4914 + count3 -= (uint32_t)coll->tertiaryTopCount; 1.4915 + } 1.4916 + tertiaries.appendByte(tertiaryTop - count3); 1.4917 + } else { 1.4918 + while (count3 > coll->tertiaryBottomCount) { 1.4919 + tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 1.4920 + count3 -= (uint32_t)coll->tertiaryBottomCount; 1.4921 + } 1.4922 + tertiaries.appendByte(tertiaryBottom + (count3-1)); 
1.4923 + } 1.4924 + } 1.4925 + ok &= tertiaries.isOk(); 1.4926 + result.Append(UCOL_LEVELTERMINATOR); 1.4927 + tertiaries.appendTo(result); 1.4928 + 1.4929 + if(compareQuad == 0/*qShifted == TRUE*/) { 1.4930 + if(count4 > 0) { 1.4931 + while (count4 > UCOL_BOT_COUNT4) { 1.4932 + quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 1.4933 + count4 -= UCOL_BOT_COUNT4; 1.4934 + } 1.4935 + quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 1.4936 + } 1.4937 + ok &= quads.isOk(); 1.4938 + result.Append(UCOL_LEVELTERMINATOR); 1.4939 + quads.appendTo(result); 1.4940 + } 1.4941 + 1.4942 + if(compareIdent) { 1.4943 + result.Append(UCOL_LEVELTERMINATOR); 1.4944 + u_writeIdenticalLevelRun(s.string, len, result); 1.4945 + } 1.4946 + } 1.4947 + result.Append(0); 1.4948 + } 1.4949 + 1.4950 + /* To avoid memory leak, free the offset buffer if necessary. */ 1.4951 + ucol_freeOffsetBuffer(&s); 1.4952 + 1.4953 + ok &= result.IsOk(); 1.4954 + if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 1.4955 +} 1.4956 + 1.4957 + 1.4958 +U_CFUNC void U_CALLCONV 1.4959 +ucol_calcSortKeySimpleTertiary(const UCollator *coll, 1.4960 + const UChar *source, 1.4961 + int32_t sourceLength, 1.4962 + SortKeyByteSink &result, 1.4963 + UErrorCode *status) 1.4964 +{ 1.4965 + U_ALIGN_CODE(16); 1.4966 + 1.4967 + if(U_FAILURE(*status)) { 1.4968 + return; 1.4969 + } 1.4970 + 1.4971 + SortKeyByteSink &primaries = result; 1.4972 + SortKeyLevel secondaries; 1.4973 + SortKeyLevel tertiaries; 1.4974 + 1.4975 + UnicodeString normSource; 1.4976 + 1.4977 + int32_t len = sourceLength; 1.4978 + 1.4979 + /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 1.4980 + if(coll->normalizationMode != UCOL_OFF) { 1.4981 + normSource.setTo(len < 0, source, len); 1.4982 + const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 1.4983 + int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 1.4984 + if(qcYesLength != normSource.length()) { 1.4985 + UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 1.4986 + normSource.truncate(qcYesLength); 1.4987 + norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 1.4988 + source = normSource.getBuffer(); 1.4989 + len = normSource.length(); 1.4990 + } 1.4991 + } 1.4992 + collIterate s; 1.4993 + IInit_collIterate(coll, (UChar *)source, len, &s, status); 1.4994 + if(U_FAILURE(*status)) { 1.4995 + return; 1.4996 + } 1.4997 + s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 1.4998 + 1.4999 + uint32_t order = 0; 1.5000 + 1.5001 + uint8_t primary1 = 0; 1.5002 + uint8_t primary2 = 0; 1.5003 + uint8_t secondary = 0; 1.5004 + uint8_t tertiary = 0; 1.5005 + uint8_t caseSwitch = coll->caseSwitch; 1.5006 + uint8_t tertiaryMask = coll->tertiaryMask; 1.5007 + int8_t tertiaryAddition = coll->tertiaryAddition; 1.5008 + uint8_t tertiaryTop = coll->tertiaryTop; 1.5009 + uint8_t tertiaryBottom = coll->tertiaryBottom; 1.5010 + uint8_t tertiaryCommon = coll->tertiaryCommon; 1.5011 + 1.5012 + UBool notIsContinuation = FALSE; 1.5013 + 1.5014 + uint32_t count2 = 0, count3 = 0; 1.5015 + uint8_t leadPrimary = 0; 1.5016 + 1.5017 + for(;;) { 1.5018 + order = ucol_IGetNextCE(coll, &s, status); 1.5019 + 1.5020 + if(order == 0) { 1.5021 + continue; 1.5022 + } 1.5023 + 1.5024 + if(order == UCOL_NO_MORE_CES) { 1.5025 + break; 1.5026 + } 1.5027 + 1.5028 + notIsContinuation = !isContinuation(order); 1.5029 + 1.5030 + if(notIsContinuation) { 1.5031 + tertiary = (uint8_t)((order & tertiaryMask)); 1.5032 + } else { 1.5033 + tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 1.5034 + } 1.5035 + 1.5036 + secondary = 
(uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 1.5037 + primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 1.5038 + primary1 = (uint8_t)(order >> 8); 1.5039 + 1.5040 + uint8_t originalPrimary1 = primary1; 1.5041 + if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 1.5042 + primary1 = coll->leadBytePermutationTable[primary1]; 1.5043 + } 1.5044 + 1.5045 + /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 1.5046 + /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 1.5047 + /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 1.5048 + /* regular and simple sortkey calc */ 1.5049 + if(primary1 != UCOL_IGNORABLE) { 1.5050 + if(notIsContinuation) { 1.5051 + if(leadPrimary == primary1) { 1.5052 + primaries.Append(primary2); 1.5053 + } else { 1.5054 + if(leadPrimary != 0) { 1.5055 + primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 1.5056 + } 1.5057 + if(primary2 == UCOL_IGNORABLE) { 1.5058 + /* one byter, not compressed */ 1.5059 + primaries.Append(primary1); 1.5060 + leadPrimary = 0; 1.5061 + } else if(isCompressible(coll, originalPrimary1)) { 1.5062 + /* compress */ 1.5063 + primaries.Append(leadPrimary = primary1, primary2); 1.5064 + } else { 1.5065 + leadPrimary = 0; 1.5066 + primaries.Append(primary1, primary2); 1.5067 + } 1.5068 + } 1.5069 + } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 1.5070 + if(primary2 == UCOL_IGNORABLE) { 1.5071 + primaries.Append(primary1); 1.5072 + } else { 1.5073 + primaries.Append(primary1, primary2); 1.5074 + } 1.5075 + } 1.5076 + } 1.5077 + 1.5078 + if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 1.5079 + /* This is compression code. 
*/ 1.5080 + if (secondary == UCOL_COMMON2 && notIsContinuation) { 1.5081 + ++count2; 1.5082 + } else { 1.5083 + if (count2 > 0) { 1.5084 + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 1.5085 + while (count2 > UCOL_TOP_COUNT2) { 1.5086 + secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 1.5087 + count2 -= (uint32_t)UCOL_TOP_COUNT2; 1.5088 + } 1.5089 + secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 1.5090 + } else { 1.5091 + while (count2 > UCOL_BOT_COUNT2) { 1.5092 + secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.5093 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.5094 + } 1.5095 + secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 1.5096 + } 1.5097 + count2 = 0; 1.5098 + } 1.5099 + secondaries.appendByte(secondary); 1.5100 + } 1.5101 + } 1.5102 + 1.5103 + if(notIsContinuation) { 1.5104 + tertiary ^= caseSwitch; 1.5105 + } 1.5106 + 1.5107 + if(tertiary > 0) { 1.5108 + /* This is compression code. */ 1.5109 + /* sequence size check is included in the if clause */ 1.5110 + if (tertiary == tertiaryCommon && notIsContinuation) { 1.5111 + ++count3; 1.5112 + } else { 1.5113 + if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 1.5114 + tertiary += tertiaryAddition; 1.5115 + } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 1.5116 + tertiary -= tertiaryAddition; 1.5117 + } 1.5118 + if (count3 > 0) { 1.5119 + if ((tertiary > tertiaryCommon)) { 1.5120 + while (count3 > coll->tertiaryTopCount) { 1.5121 + tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 1.5122 + count3 -= (uint32_t)coll->tertiaryTopCount; 1.5123 + } 1.5124 + tertiaries.appendByte(tertiaryTop - (count3-1)); 1.5125 + } else { 1.5126 + while (count3 > coll->tertiaryBottomCount) { 1.5127 + tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 1.5128 + count3 -= (uint32_t)coll->tertiaryBottomCount; 1.5129 + } 1.5130 + tertiaries.appendByte(tertiaryBottom + (count3-1)); 1.5131 + } 
1.5132 + count3 = 0; 1.5133 + } 1.5134 + tertiaries.appendByte(tertiary); 1.5135 + } 1.5136 + } 1.5137 + } 1.5138 + 1.5139 + UBool ok = TRUE; 1.5140 + if(U_SUCCESS(*status)) { 1.5141 + /* we have done all the CE's, now let's put them together to form a key */ 1.5142 + if (count2 > 0) { 1.5143 + while (count2 > UCOL_BOT_COUNT2) { 1.5144 + secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 1.5145 + count2 -= (uint32_t)UCOL_BOT_COUNT2; 1.5146 + } 1.5147 + secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 1.5148 + } 1.5149 + ok &= secondaries.isOk(); 1.5150 + result.Append(UCOL_LEVELTERMINATOR); 1.5151 + secondaries.appendTo(result); 1.5152 + 1.5153 + if (count3 > 0) { 1.5154 + if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 1.5155 + while (count3 >= coll->tertiaryTopCount) { 1.5156 + tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 1.5157 + count3 -= (uint32_t)coll->tertiaryTopCount; 1.5158 + } 1.5159 + tertiaries.appendByte(tertiaryTop - count3); 1.5160 + } else { 1.5161 + while (count3 > coll->tertiaryBottomCount) { 1.5162 + tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 1.5163 + count3 -= (uint32_t)coll->tertiaryBottomCount; 1.5164 + } 1.5165 + tertiaries.appendByte(tertiaryBottom + (count3-1)); 1.5166 + } 1.5167 + } 1.5168 + ok &= tertiaries.isOk(); 1.5169 + result.Append(UCOL_LEVELTERMINATOR); 1.5170 + tertiaries.appendTo(result); 1.5171 + 1.5172 + result.Append(0); 1.5173 + } 1.5174 + 1.5175 + /* To avoid memory leak, free the offset buffer if necessary. 
*/ 1.5176 + ucol_freeOffsetBuffer(&s); 1.5177 + 1.5178 + ok &= result.IsOk(); 1.5179 + if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 1.5180 +} 1.5181 + 1.5182 +static inline 1.5183 +UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 1.5184 + UBool notIsContinuation = !isContinuation(CE); 1.5185 + uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 1.5186 + if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 1.5187 + || (!notIsContinuation && *wasShifted))) 1.5188 + || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 1.5189 + { 1.5190 + // The stuff below should probably be in the sortkey code... maybe not... 1.5191 + if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 1.5192 + /* we should just completely ignore it */ 1.5193 + *wasShifted = TRUE; 1.5194 + //continue; 1.5195 + } 1.5196 + //*wasShifted = TRUE; 1.5197 + return TRUE; 1.5198 + } else { 1.5199 + *wasShifted = FALSE; 1.5200 + return FALSE; 1.5201 + } 1.5202 +} 1.5203 +static inline 1.5204 +void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 1.5205 + if(level < maxLevel) { 1.5206 + dest[i++] = UCOL_LEVELTERMINATOR; 1.5207 + } else { 1.5208 + dest[i++] = 0; 1.5209 + } 1.5210 +} 1.5211 + 1.5212 +/** enumeration of level identifiers for partial sort key generation */ 1.5213 +enum { 1.5214 + UCOL_PSK_PRIMARY = 0, 1.5215 + UCOL_PSK_SECONDARY = 1, 1.5216 + UCOL_PSK_CASE = 2, 1.5217 + UCOL_PSK_TERTIARY = 3, 1.5218 + UCOL_PSK_QUATERNARY = 4, 1.5219 + UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 1.5220 + UCOL_PSK_IDENTICAL = 6, 1.5221 + UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 1.5222 + UCOL_PSK_LIMIT 1.5223 +}; 1.5224 + 1.5225 +/** collation state enum. *_SHIFT value is how much to shift right 1.5226 + * to get the state piece to the right. 
*_MASK value should be 1.5227 + * ANDed with the shifted state. This data is stored in state[1] 1.5228 + * field. 1.5229 + */ 1.5230 +enum { 1.5231 + UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */ 1.5232 + UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 1.5233 + UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 1.5234 + UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 1.5235 + /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 1.5236 + * This field is also used to denote that the French secondary level is finished 1.5237 + */ 1.5238 + UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 1.5239 + UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 1.5240 + UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 1.5241 + UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 1.5242 + /** When we do French we need to reverse secondary values. However, continuations 1.5243 + * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 1.5244 + */ 1.5245 + UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 1.5246 + UCOL_PSK_BOCSU_BYTES_MASK = 3, 1.5247 + UCOL_PSK_CONSUMED_CES_SHIFT = 9, 1.5248 + UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 1.5249 +}; 1.5250 + 1.5251 +// macro calculating the number of expansion CEs available 1.5252 +#define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 1.5253 + 1.5254 + 1.5255 +/** main sortkey part procedure. On the first call, 1.5256 + * you should pass in a collator, an iterator, empty state 1.5257 + * state[0] == state[1] == 0, a buffer to hold results 1.5258 + * number of bytes you need and an error code pointer. 1.5259 + * Make sure your buffer is big enough to hold the wanted 1.5260 + * number of sortkey bytes. I don't check. 
1.5261 + * The only meaningful status you can get back is 1.5262 + * U_BUFFER_OVERFLOW_ERROR, which basically means that you 1.5263 + * have been dealt a raw deal and that you probably won't 1.5264 + * be able to use partial sortkey generation for this 1.5265 + * particular combination of string and collator. This 1.5266 + * is highly unlikely, but you should still check the error code. 1.5267 + * Any other status means that you're not in a sane situation 1.5268 + * anymore. After the first call, preserve state values and 1.5269 + * use them on subsequent calls to obtain more bytes of a sortkey. 1.5270 + * Use until the number of bytes written is smaller than the requested 1.5271 + * number of bytes. Generated sortkey is not compatible with the 1.5272 + * one generated by ucol_getSortKey, as we don't do any compression. 1.5273 + * However, levels are still terminated by a 1 (one) and the sortkey 1.5274 + * is terminated by a 0 (zero). Identical level is the same as in the 1.5275 + * regular sortkey - internal bocu-1 implementation is used. 1.5276 + * For curious, although you cannot do much about this, here is 1.5277 + * the structure of state words. 1.5278 + * state[0] - iterator state. Depends on the iterator implementation, 1.5279 + * but allows the iterator to continue where it stopped in 1.5280 + * the last iteration. 1.5281 + * state[1] - collation processing state. Here is the distribution 1.5282 + * of the bits: 1.5283 + * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary 1.5284 + * quaternary, quin (we don't use this one), identical and 1.5285 + * null (producing only zeroes - first one to terminate the 1.5286 + * sortkey and subsequent to fill the buffer). 1.5287 + * 3 - byte count. Number of bytes written on the primary level. 1.5288 + * 4 - was shifted. Whether the previous iteration finished in the 1.5289 + * shifted state. 1.5290 + * 5, 6 - French continuation bytes written. 
See the comment in the enum 1.5291 + * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on 1.5292 + * the identical level. 1.5293 + * 9..31 - CEs consumed. Number of getCE or next32 operations performed 1.5294 + * since thes last successful update of the iterator state. 1.5295 + */ 1.5296 +U_CAPI int32_t U_EXPORT2 1.5297 +ucol_nextSortKeyPart(const UCollator *coll, 1.5298 + UCharIterator *iter, 1.5299 + uint32_t state[2], 1.5300 + uint8_t *dest, int32_t count, 1.5301 + UErrorCode *status) 1.5302 +{ 1.5303 + /* error checking */ 1.5304 + if(status==NULL || U_FAILURE(*status)) { 1.5305 + return 0; 1.5306 + } 1.5307 + UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 1.5308 + if( coll==NULL || iter==NULL || 1.5309 + state==NULL || 1.5310 + count<0 || (count>0 && dest==NULL) 1.5311 + ) { 1.5312 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.5313 + UTRACE_EXIT_STATUS(status); 1.5314 + return 0; 1.5315 + } 1.5316 + 1.5317 + UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 1.5318 + coll, iter, state[0], state[1], dest, count); 1.5319 + 1.5320 + if(count==0) { 1.5321 + /* nothing to do */ 1.5322 + UTRACE_EXIT_VALUE(0); 1.5323 + return 0; 1.5324 + } 1.5325 + /** Setting up situation according to the state we got from the previous iteration */ 1.5326 + // The state of the iterator from the previous invocation 1.5327 + uint32_t iterState = state[0]; 1.5328 + // Has the last iteration ended in the shifted state 1.5329 + UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 1.5330 + // What is the current level of the sortkey? 1.5331 + int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 1.5332 + // Have we written only one byte from a two byte primary in the previous iteration? 1.5333 + // Also on secondary level - have we finished with the French secondary? 
1.5334 + int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 1.5335 + // number of bytes in the continuation buffer for French 1.5336 + int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 1.5337 + // Number of bytes already written from a bocsu sequence. Since 1.5338 + // the longes bocsu sequence is 4 long, this can be up to 3. 1.5339 + int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 1.5340 + // Number of elements that need to be consumed in this iteration because 1.5341 + // the iterator returned UITER_NO_STATE at the end of the last iteration, 1.5342 + // so we had to save the last valid state. 1.5343 + int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 1.5344 + 1.5345 + /** values that depend on the collator attributes */ 1.5346 + // strength of the collator. 1.5347 + int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 1.5348 + // maximal level of the partial sortkey. Need to take whether case level is done 1.5349 + int32_t maxLevel = 0; 1.5350 + if(strength < UCOL_TERTIARY) { 1.5351 + if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 1.5352 + maxLevel = UCOL_PSK_CASE; 1.5353 + } else { 1.5354 + maxLevel = strength; 1.5355 + } 1.5356 + } else { 1.5357 + if(strength == UCOL_TERTIARY) { 1.5358 + maxLevel = UCOL_PSK_TERTIARY; 1.5359 + } else if(strength == UCOL_QUATERNARY) { 1.5360 + maxLevel = UCOL_PSK_QUATERNARY; 1.5361 + } else { // identical 1.5362 + maxLevel = UCOL_IDENTICAL; 1.5363 + } 1.5364 + } 1.5365 + // value for the quaternary level if Hiragana is encountered. 
Used for JIS X 4061 collation 1.5366 + uint8_t UCOL_HIRAGANA_QUAD = 1.5367 + (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 1.5368 + // Boundary value that decides whether a CE is shifted or not 1.5369 + uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 1.5370 + // Are we doing French collation? 1.5371 + UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 1.5372 + 1.5373 + /** initializing the collation state */ 1.5374 + UBool notIsContinuation = FALSE; 1.5375 + uint32_t CE = UCOL_NO_MORE_CES; 1.5376 + 1.5377 + collIterate s; 1.5378 + IInit_collIterate(coll, NULL, -1, &s, status); 1.5379 + if(U_FAILURE(*status)) { 1.5380 + UTRACE_EXIT_STATUS(*status); 1.5381 + return 0; 1.5382 + } 1.5383 + s.iterator = iter; 1.5384 + s.flags |= UCOL_USE_ITERATOR; 1.5385 + // This variable tells us whether we have produced some other levels in this iteration 1.5386 + // before we moved to the identical level. In that case, we need to switch the 1.5387 + // type of the iterator. 1.5388 + UBool doingIdenticalFromStart = FALSE; 1.5389 + // Normalizing iterator 1.5390 + // The division for the array length may truncate the array size to 1.5391 + // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 1.5392 + // for all platforms anyway. 
1.5393 + UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.5394 + UNormIterator *normIter = NULL; 1.5395 + // If the normalization is turned on for the collator and we are below identical level 1.5396 + // we will use a FCD normalizing iterator 1.5397 + if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 1.5398 + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 1.5399 + s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 1.5400 + s.flags &= ~UCOL_ITER_NORM; 1.5401 + if(U_FAILURE(*status)) { 1.5402 + UTRACE_EXIT_STATUS(*status); 1.5403 + return 0; 1.5404 + } 1.5405 + } else if(level == UCOL_PSK_IDENTICAL) { 1.5406 + // for identical level, we need a NFD iterator. We need to instantiate it here, since we 1.5407 + // will be updating the state - and this cannot be done on an ordinary iterator. 1.5408 + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 1.5409 + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 1.5410 + s.flags &= ~UCOL_ITER_NORM; 1.5411 + if(U_FAILURE(*status)) { 1.5412 + UTRACE_EXIT_STATUS(*status); 1.5413 + return 0; 1.5414 + } 1.5415 + doingIdenticalFromStart = TRUE; 1.5416 + } 1.5417 + 1.5418 + // This is the tentative new state of the iterator. The problem 1.5419 + // is that the iterator might return an undefined state, in 1.5420 + // which case we should save the last valid state and increase 1.5421 + // the iterator skip value. 1.5422 + uint32_t newState = 0; 1.5423 + 1.5424 + // First, we set the iterator to the last valid position 1.5425 + // from the last iteration. This was saved in state[0]. 
1.5426 + if(iterState == 0) { 1.5427 + /* initial state */ 1.5428 + if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 1.5429 + s.iterator->move(s.iterator, 0, UITER_LIMIT); 1.5430 + } else { 1.5431 + s.iterator->move(s.iterator, 0, UITER_START); 1.5432 + } 1.5433 + } else { 1.5434 + /* reset to previous state */ 1.5435 + s.iterator->setState(s.iterator, iterState, status); 1.5436 + if(U_FAILURE(*status)) { 1.5437 + UTRACE_EXIT_STATUS(*status); 1.5438 + return 0; 1.5439 + } 1.5440 + } 1.5441 + 1.5442 + 1.5443 + 1.5444 + // This variable tells us whether we can attempt to update the state 1.5445 + // of iterator. Situations where we don't want to update iterator state 1.5446 + // are the existence of expansion CEs that are not yet processed, and 1.5447 + // finishing the case level without enough space in the buffer to insert 1.5448 + // a level terminator. 1.5449 + UBool canUpdateState = TRUE; 1.5450 + 1.5451 + // Consume all the CEs that were consumed at the end of the previous 1.5452 + // iteration without updating the iterator state. On identical level, 1.5453 + // consume the code points. 1.5454 + int32_t counter = cces; 1.5455 + if(level < UCOL_PSK_IDENTICAL) { 1.5456 + while(counter-->0) { 1.5457 + // If we're doing French and we are on the secondary level, 1.5458 + // we go backwards. 
1.5459 + if(level == UCOL_PSK_SECONDARY && doingFrench) { 1.5460 + CE = ucol_IGetPrevCE(coll, &s, status); 1.5461 + } else { 1.5462 + CE = ucol_IGetNextCE(coll, &s, status); 1.5463 + } 1.5464 + if(CE==UCOL_NO_MORE_CES) { 1.5465 + /* should not happen */ 1.5466 + *status=U_INTERNAL_PROGRAM_ERROR; 1.5467 + UTRACE_EXIT_STATUS(*status); 1.5468 + return 0; 1.5469 + } 1.5470 + if(uprv_numAvailableExpCEs(s)) { 1.5471 + canUpdateState = FALSE; 1.5472 + } 1.5473 + } 1.5474 + } else { 1.5475 + while(counter-->0) { 1.5476 + uiter_next32(s.iterator); 1.5477 + } 1.5478 + } 1.5479 + 1.5480 + // French secondary needs to know whether the iterator state of zero came from previous level OR 1.5481 + // from a new invocation... 1.5482 + UBool wasDoingPrimary = FALSE; 1.5483 + // destination buffer byte counter. When this guy 1.5484 + // gets to count, we're done with the iteration 1.5485 + int32_t i = 0; 1.5486 + // used to count the zero bytes written after we 1.5487 + // have finished with the sort key 1.5488 + int32_t j = 0; 1.5489 + 1.5490 + 1.5491 + // Hm.... I think we're ready to plunge in. Basic story is as following: 1.5492 + // we have a fall through case based on level. This is used for initial 1.5493 + // positioning on iteration start. Every level processor contains a 1.5494 + // for(;;) which will be broken when we exhaust all the CEs. Other 1.5495 + // way to exit is a goto saveState, which happens when we have filled 1.5496 + // out our buffer. 
1.5497 + switch(level) { 1.5498 + case UCOL_PSK_PRIMARY: 1.5499 + wasDoingPrimary = TRUE; 1.5500 + for(;;) { 1.5501 + if(i==count) { 1.5502 + goto saveState; 1.5503 + } 1.5504 + // We should save the state only if we 1.5505 + // are sure that we are done with the 1.5506 + // previous iterator state 1.5507 + if(canUpdateState && byteCountOrFrenchDone == 0) { 1.5508 + newState = s.iterator->getState(s.iterator); 1.5509 + if(newState != UITER_NO_STATE) { 1.5510 + iterState = newState; 1.5511 + cces = 0; 1.5512 + } 1.5513 + } 1.5514 + CE = ucol_IGetNextCE(coll, &s, status); 1.5515 + cces++; 1.5516 + if(CE==UCOL_NO_MORE_CES) { 1.5517 + // Add the level separator 1.5518 + terminatePSKLevel(level, maxLevel, i, dest); 1.5519 + byteCountOrFrenchDone=0; 1.5520 + // Restart the iteration an move to the 1.5521 + // second level 1.5522 + s.iterator->move(s.iterator, 0, UITER_START); 1.5523 + cces = 0; 1.5524 + level = UCOL_PSK_SECONDARY; 1.5525 + break; 1.5526 + } 1.5527 + if(!isContinuation(CE)){ 1.5528 + if(coll->leadBytePermutationTable != NULL){ 1.5529 + CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 1.5530 + } 1.5531 + } 1.5532 + if(!isShiftedCE(CE, LVT, &wasShifted)) { 1.5533 + CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 1.5534 + if(CE != 0) { 1.5535 + if(byteCountOrFrenchDone == 0) { 1.5536 + // get the second byte of primary 1.5537 + dest[i++]=(uint8_t)(CE >> 8); 1.5538 + } else { 1.5539 + byteCountOrFrenchDone = 0; 1.5540 + } 1.5541 + if((CE &=0xff)!=0) { 1.5542 + if(i==count) { 1.5543 + /* overflow */ 1.5544 + byteCountOrFrenchDone = 1; 1.5545 + cces--; 1.5546 + goto saveState; 1.5547 + } 1.5548 + dest[i++]=(uint8_t)CE; 1.5549 + } 1.5550 + } 1.5551 + } 1.5552 + if(uprv_numAvailableExpCEs(s)) { 1.5553 + canUpdateState = FALSE; 1.5554 + } else { 1.5555 + canUpdateState = TRUE; 1.5556 + } 1.5557 + } 1.5558 + /* fall through to next level */ 1.5559 + case UCOL_PSK_SECONDARY: 1.5560 + if(strength >= UCOL_SECONDARY) { 1.5561 + if(!doingFrench) 
{ 1.5562 + for(;;) { 1.5563 + if(i == count) { 1.5564 + goto saveState; 1.5565 + } 1.5566 + // We should save the state only if we 1.5567 + // are sure that we are done with the 1.5568 + // previous iterator state 1.5569 + if(canUpdateState) { 1.5570 + newState = s.iterator->getState(s.iterator); 1.5571 + if(newState != UITER_NO_STATE) { 1.5572 + iterState = newState; 1.5573 + cces = 0; 1.5574 + } 1.5575 + } 1.5576 + CE = ucol_IGetNextCE(coll, &s, status); 1.5577 + cces++; 1.5578 + if(CE==UCOL_NO_MORE_CES) { 1.5579 + // Add the level separator 1.5580 + terminatePSKLevel(level, maxLevel, i, dest); 1.5581 + byteCountOrFrenchDone = 0; 1.5582 + // Restart the iteration an move to the 1.5583 + // second level 1.5584 + s.iterator->move(s.iterator, 0, UITER_START); 1.5585 + cces = 0; 1.5586 + level = UCOL_PSK_CASE; 1.5587 + break; 1.5588 + } 1.5589 + if(!isShiftedCE(CE, LVT, &wasShifted)) { 1.5590 + CE >>= 8; /* get secondary */ 1.5591 + if(CE != 0) { 1.5592 + dest[i++]=(uint8_t)CE; 1.5593 + } 1.5594 + } 1.5595 + if(uprv_numAvailableExpCEs(s)) { 1.5596 + canUpdateState = FALSE; 1.5597 + } else { 1.5598 + canUpdateState = TRUE; 1.5599 + } 1.5600 + } 1.5601 + } else { // French secondary processing 1.5602 + uint8_t frenchBuff[UCOL_MAX_BUFFER]; 1.5603 + int32_t frenchIndex = 0; 1.5604 + // Here we are going backwards. 1.5605 + // If the iterator is at the beggining, it should be 1.5606 + // moved to end. 
1.5607 + if(wasDoingPrimary) { 1.5608 + s.iterator->move(s.iterator, 0, UITER_LIMIT); 1.5609 + cces = 0; 1.5610 + } 1.5611 + for(;;) { 1.5612 + if(i == count) { 1.5613 + goto saveState; 1.5614 + } 1.5615 + if(canUpdateState) { 1.5616 + newState = s.iterator->getState(s.iterator); 1.5617 + if(newState != UITER_NO_STATE) { 1.5618 + iterState = newState; 1.5619 + cces = 0; 1.5620 + } 1.5621 + } 1.5622 + CE = ucol_IGetPrevCE(coll, &s, status); 1.5623 + cces++; 1.5624 + if(CE==UCOL_NO_MORE_CES) { 1.5625 + // Add the level separator 1.5626 + terminatePSKLevel(level, maxLevel, i, dest); 1.5627 + byteCountOrFrenchDone = 0; 1.5628 + // Restart the iteration an move to the next level 1.5629 + s.iterator->move(s.iterator, 0, UITER_START); 1.5630 + level = UCOL_PSK_CASE; 1.5631 + break; 1.5632 + } 1.5633 + if(isContinuation(CE)) { // if it's a continuation, we want to save it and 1.5634 + // reverse when we get a first non-continuation CE. 1.5635 + CE >>= 8; 1.5636 + frenchBuff[frenchIndex++] = (uint8_t)CE; 1.5637 + } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 1.5638 + CE >>= 8; /* get secondary */ 1.5639 + if(!frenchIndex) { 1.5640 + if(CE != 0) { 1.5641 + dest[i++]=(uint8_t)CE; 1.5642 + } 1.5643 + } else { 1.5644 + frenchBuff[frenchIndex++] = (uint8_t)CE; 1.5645 + frenchIndex -= usedFrench; 1.5646 + usedFrench = 0; 1.5647 + while(i < count && frenchIndex) { 1.5648 + dest[i++] = frenchBuff[--frenchIndex]; 1.5649 + usedFrench++; 1.5650 + } 1.5651 + } 1.5652 + } 1.5653 + if(uprv_numAvailableExpCEs(s)) { 1.5654 + canUpdateState = FALSE; 1.5655 + } else { 1.5656 + canUpdateState = TRUE; 1.5657 + } 1.5658 + } 1.5659 + } 1.5660 + } else { 1.5661 + level = UCOL_PSK_CASE; 1.5662 + } 1.5663 + /* fall through to next level */ 1.5664 + case UCOL_PSK_CASE: 1.5665 + if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 1.5666 + uint32_t caseShift = UCOL_CASE_SHIFT_START; 1.5667 + uint8_t caseByte = UCOL_CASE_BYTE_START; 1.5668 + uint8_t caseBits = 0; 1.5669 + 1.5670 + 
for(;;) { 1.5671 + U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 1.5672 + if(i == count) { 1.5673 + goto saveState; 1.5674 + } 1.5675 + // We should save the state only if we 1.5676 + // are sure that we are done with the 1.5677 + // previous iterator state 1.5678 + if(canUpdateState) { 1.5679 + newState = s.iterator->getState(s.iterator); 1.5680 + if(newState != UITER_NO_STATE) { 1.5681 + iterState = newState; 1.5682 + cces = 0; 1.5683 + } 1.5684 + } 1.5685 + CE = ucol_IGetNextCE(coll, &s, status); 1.5686 + cces++; 1.5687 + if(CE==UCOL_NO_MORE_CES) { 1.5688 + // On the case level we might have an unfinished 1.5689 + // case byte. Add one if it's started. 1.5690 + if(caseShift != UCOL_CASE_SHIFT_START) { 1.5691 + dest[i++] = caseByte; 1.5692 + } 1.5693 + cces = 0; 1.5694 + // We have finished processing CEs on this level. 1.5695 + // However, we don't know if we have enough space 1.5696 + // to add a case level terminator. 1.5697 + if(i < count) { 1.5698 + // Add the level separator 1.5699 + terminatePSKLevel(level, maxLevel, i, dest); 1.5700 + // Restart the iteration and move to the 1.5701 + // next level 1.5702 + s.iterator->move(s.iterator, 0, UITER_START); 1.5703 + level = UCOL_PSK_TERTIARY; 1.5704 + } else { 1.5705 + canUpdateState = FALSE; 1.5706 + } 1.5707 + break; 1.5708 + } 1.5709 + 1.5710 + if(!isShiftedCE(CE, LVT, &wasShifted)) { 1.5711 + if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 1.5712 + // do the case level if we need to do it. 
We don't want to calculate 1.5713 + // case level for primary ignorables if we have only primary strength and case level 1.5714 + // otherwise we would break well formedness of CEs 1.5715 + CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 1.5716 + caseBits = (uint8_t)(CE & 0xC0); 1.5717 + // this copies the case level logic from the 1.5718 + // sort key generation code 1.5719 + if(CE != 0) { 1.5720 + if (caseShift == 0) { 1.5721 + dest[i++] = caseByte; 1.5722 + caseShift = UCOL_CASE_SHIFT_START; 1.5723 + caseByte = UCOL_CASE_BYTE_START; 1.5724 + } 1.5725 + if(coll->caseFirst == UCOL_UPPER_FIRST) { 1.5726 + if((caseBits & 0xC0) == 0) { 1.5727 + caseByte |= 1 << (--caseShift); 1.5728 + } else { 1.5729 + caseByte |= 0 << (--caseShift); 1.5730 + /* second bit */ 1.5731 + if(caseShift == 0) { 1.5732 + dest[i++] = caseByte; 1.5733 + caseShift = UCOL_CASE_SHIFT_START; 1.5734 + caseByte = UCOL_CASE_BYTE_START; 1.5735 + } 1.5736 + caseByte |= ((caseBits>>6)&1) << (--caseShift); 1.5737 + } 1.5738 + } else { 1.5739 + if((caseBits & 0xC0) == 0) { 1.5740 + caseByte |= 0 << (--caseShift); 1.5741 + } else { 1.5742 + caseByte |= 1 << (--caseShift); 1.5743 + /* second bit */ 1.5744 + if(caseShift == 0) { 1.5745 + dest[i++] = caseByte; 1.5746 + caseShift = UCOL_CASE_SHIFT_START; 1.5747 + caseByte = UCOL_CASE_BYTE_START; 1.5748 + } 1.5749 + caseByte |= ((caseBits>>7)&1) << (--caseShift); 1.5750 + } 1.5751 + } 1.5752 + } 1.5753 + 1.5754 + } 1.5755 + } 1.5756 + // Not sure this is correct for the case level - revisit 1.5757 + if(uprv_numAvailableExpCEs(s)) { 1.5758 + canUpdateState = FALSE; 1.5759 + } else { 1.5760 + canUpdateState = TRUE; 1.5761 + } 1.5762 + } 1.5763 + } else { 1.5764 + level = UCOL_PSK_TERTIARY; 1.5765 + } 1.5766 + /* fall through to next level */ 1.5767 + case UCOL_PSK_TERTIARY: 1.5768 + if(strength >= UCOL_TERTIARY) { 1.5769 + for(;;) { 1.5770 + if(i == count) { 1.5771 + goto saveState; 1.5772 + } 1.5773 + // We should save the state only if we 1.5774 + // are sure that 
we are done with the 1.5775 + // previous iterator state 1.5776 + if(canUpdateState) { 1.5777 + newState = s.iterator->getState(s.iterator); 1.5778 + if(newState != UITER_NO_STATE) { 1.5779 + iterState = newState; 1.5780 + cces = 0; 1.5781 + } 1.5782 + } 1.5783 + CE = ucol_IGetNextCE(coll, &s, status); 1.5784 + cces++; 1.5785 + if(CE==UCOL_NO_MORE_CES) { 1.5786 + // Add the level separator 1.5787 + terminatePSKLevel(level, maxLevel, i, dest); 1.5788 + byteCountOrFrenchDone = 0; 1.5789 + // Restart the iteration an move to the 1.5790 + // second level 1.5791 + s.iterator->move(s.iterator, 0, UITER_START); 1.5792 + cces = 0; 1.5793 + level = UCOL_PSK_QUATERNARY; 1.5794 + break; 1.5795 + } 1.5796 + if(!isShiftedCE(CE, LVT, &wasShifted)) { 1.5797 + notIsContinuation = !isContinuation(CE); 1.5798 + 1.5799 + if(notIsContinuation) { 1.5800 + CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 1.5801 + CE ^= coll->caseSwitch; 1.5802 + CE &= coll->tertiaryMask; 1.5803 + } else { 1.5804 + CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 1.5805 + } 1.5806 + 1.5807 + if(CE != 0) { 1.5808 + dest[i++]=(uint8_t)CE; 1.5809 + } 1.5810 + } 1.5811 + if(uprv_numAvailableExpCEs(s)) { 1.5812 + canUpdateState = FALSE; 1.5813 + } else { 1.5814 + canUpdateState = TRUE; 1.5815 + } 1.5816 + } 1.5817 + } else { 1.5818 + // if we're not doing tertiary 1.5819 + // skip to the end 1.5820 + level = UCOL_PSK_NULL; 1.5821 + } 1.5822 + /* fall through to next level */ 1.5823 + case UCOL_PSK_QUATERNARY: 1.5824 + if(strength >= UCOL_QUATERNARY) { 1.5825 + for(;;) { 1.5826 + if(i == count) { 1.5827 + goto saveState; 1.5828 + } 1.5829 + // We should save the state only if we 1.5830 + // are sure that we are done with the 1.5831 + // previous iterator state 1.5832 + if(canUpdateState) { 1.5833 + newState = s.iterator->getState(s.iterator); 1.5834 + if(newState != UITER_NO_STATE) { 1.5835 + iterState = newState; 1.5836 + cces = 0; 1.5837 + } 1.5838 + } 1.5839 + CE = ucol_IGetNextCE(coll, &s, status); 1.5840 + 
cces++; 1.5841 + if(CE==UCOL_NO_MORE_CES) { 1.5842 + // Add the level separator 1.5843 + terminatePSKLevel(level, maxLevel, i, dest); 1.5844 + //dest[i++] = UCOL_LEVELTERMINATOR; 1.5845 + byteCountOrFrenchDone = 0; 1.5846 + // Restart the iteration an move to the 1.5847 + // second level 1.5848 + s.iterator->move(s.iterator, 0, UITER_START); 1.5849 + cces = 0; 1.5850 + level = UCOL_PSK_QUIN; 1.5851 + break; 1.5852 + } 1.5853 + if(CE==0) 1.5854 + continue; 1.5855 + if(isShiftedCE(CE, LVT, &wasShifted)) { 1.5856 + CE >>= 16; /* get primary */ 1.5857 + if(CE != 0) { 1.5858 + if(byteCountOrFrenchDone == 0) { 1.5859 + dest[i++]=(uint8_t)(CE >> 8); 1.5860 + } else { 1.5861 + byteCountOrFrenchDone = 0; 1.5862 + } 1.5863 + if((CE &=0xff)!=0) { 1.5864 + if(i==count) { 1.5865 + /* overflow */ 1.5866 + byteCountOrFrenchDone = 1; 1.5867 + goto saveState; 1.5868 + } 1.5869 + dest[i++]=(uint8_t)CE; 1.5870 + } 1.5871 + } 1.5872 + } else { 1.5873 + notIsContinuation = !isContinuation(CE); 1.5874 + if(notIsContinuation) { 1.5875 + if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 1.5876 + dest[i++] = UCOL_HIRAGANA_QUAD; 1.5877 + } else { 1.5878 + dest[i++] = 0xFF; 1.5879 + } 1.5880 + } 1.5881 + } 1.5882 + if(uprv_numAvailableExpCEs(s)) { 1.5883 + canUpdateState = FALSE; 1.5884 + } else { 1.5885 + canUpdateState = TRUE; 1.5886 + } 1.5887 + } 1.5888 + } else { 1.5889 + // if we're not doing quaternary 1.5890 + // skip to the end 1.5891 + level = UCOL_PSK_NULL; 1.5892 + } 1.5893 + /* fall through to next level */ 1.5894 + case UCOL_PSK_QUIN: 1.5895 + level = UCOL_PSK_IDENTICAL; 1.5896 + /* fall through to next level */ 1.5897 + case UCOL_PSK_IDENTICAL: 1.5898 + if(strength >= UCOL_IDENTICAL) { 1.5899 + UChar32 first, second; 1.5900 + int32_t bocsuBytesWritten = 0; 1.5901 + // We always need to do identical on 1.5902 + // the NFD form of the string. 
1.5903 + if(normIter == NULL) { 1.5904 + // we arrived from the level below and 1.5905 + // normalization was not turned on. 1.5906 + // therefore, we need to make a fresh NFD iterator 1.5907 + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 1.5908 + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 1.5909 + } else if(!doingIdenticalFromStart) { 1.5910 + // there is an iterator, but we did some other levels. 1.5911 + // therefore, we have a FCD iterator - need to make 1.5912 + // a NFD one. 1.5913 + // normIter being at the beginning does not guarantee 1.5914 + // that the underlying iterator is at the beginning 1.5915 + iter->move(iter, 0, UITER_START); 1.5916 + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 1.5917 + } 1.5918 + // At this point we have a NFD iterator that is positioned 1.5919 + // in the right place 1.5920 + if(U_FAILURE(*status)) { 1.5921 + UTRACE_EXIT_STATUS(*status); 1.5922 + return 0; 1.5923 + } 1.5924 + first = uiter_previous32(s.iterator); 1.5925 + // maybe we're at the start of the string 1.5926 + if(first == U_SENTINEL) { 1.5927 + first = 0; 1.5928 + } else { 1.5929 + uiter_next32(s.iterator); 1.5930 + } 1.5931 + 1.5932 + j = 0; 1.5933 + for(;;) { 1.5934 + if(i == count) { 1.5935 + if(j+1 < bocsuBytesWritten) { 1.5936 + bocsuBytesUsed = j+1; 1.5937 + } 1.5938 + goto saveState; 1.5939 + } 1.5940 + 1.5941 + // On identical level, we will always save 1.5942 + // the state if we reach this point, since 1.5943 + // we don't depend on getNextCE for content 1.5944 + // all the content is in our buffer and we 1.5945 + // already either stored the full buffer OR 1.5946 + // otherwise we won't arrive here. 
1.5947 + newState = s.iterator->getState(s.iterator); 1.5948 + if(newState != UITER_NO_STATE) { 1.5949 + iterState = newState; 1.5950 + cces = 0; 1.5951 + } 1.5952 + 1.5953 + uint8_t buff[4]; 1.5954 + second = uiter_next32(s.iterator); 1.5955 + cces++; 1.5956 + 1.5957 + // end condition for identical level 1.5958 + if(second == U_SENTINEL) { 1.5959 + terminatePSKLevel(level, maxLevel, i, dest); 1.5960 + level = UCOL_PSK_NULL; 1.5961 + break; 1.5962 + } 1.5963 + bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 1.5964 + first = second; 1.5965 + 1.5966 + j = 0; 1.5967 + if(bocsuBytesUsed != 0) { 1.5968 + while(bocsuBytesUsed-->0) { 1.5969 + j++; 1.5970 + } 1.5971 + } 1.5972 + 1.5973 + while(i < count && j < bocsuBytesWritten) { 1.5974 + dest[i++] = buff[j++]; 1.5975 + } 1.5976 + } 1.5977 + 1.5978 + } else { 1.5979 + level = UCOL_PSK_NULL; 1.5980 + } 1.5981 + /* fall through to next level */ 1.5982 + case UCOL_PSK_NULL: 1.5983 + j = i; 1.5984 + while(j<count) { 1.5985 + dest[j++]=0; 1.5986 + } 1.5987 + break; 1.5988 + default: 1.5989 + *status = U_INTERNAL_PROGRAM_ERROR; 1.5990 + UTRACE_EXIT_STATUS(*status); 1.5991 + return 0; 1.5992 + } 1.5993 + 1.5994 +saveState: 1.5995 + // Now we need to return stuff. First we want to see whether we have 1.5996 + // done everything for the current state of iterator. 1.5997 + if(byteCountOrFrenchDone 1.5998 + || canUpdateState == FALSE 1.5999 + || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 1.6000 + { 1.6001 + // Any of above mean that the previous transaction 1.6002 + // wasn't finished and that we should store the 1.6003 + // previous iterator state. 1.6004 + state[0] = iterState; 1.6005 + } else { 1.6006 + // The transaction is complete. We will continue in the next iteration. 1.6007 + state[0] = s.iterator->getState(s.iterator); 1.6008 + cces = 0; 1.6009 + } 1.6010 + // Store the number of bocsu bytes written. 
1.6011 + if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 1.6012 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.6013 + } 1.6014 + state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 1.6015 + 1.6016 + // Next we put in the level of comparison 1.6017 + state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 1.6018 + 1.6019 + // If we are doing French, we need to store whether we have just finished the French level 1.6020 + if(level == UCOL_PSK_SECONDARY && doingFrench) { 1.6021 + state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 1.6022 + } else { 1.6023 + state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 1.6024 + } 1.6025 + 1.6026 + // Was the latest CE shifted 1.6027 + if(wasShifted) { 1.6028 + state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 1.6029 + } 1.6030 + // Check for cces overflow 1.6031 + if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 1.6032 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.6033 + } 1.6034 + // Store cces 1.6035 + state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 1.6036 + 1.6037 + // Check for French overflow 1.6038 + if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 1.6039 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.6040 + } 1.6041 + // Store number of bytes written in the French secondary continuation sequence 1.6042 + state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 1.6043 + 1.6044 + 1.6045 + // If we have used normalizing iterator, get rid of it 1.6046 + if(normIter != NULL) { 1.6047 + unorm_closeIter(normIter); 1.6048 + } 1.6049 + 1.6050 + /* To avoid memory leak, free the offset buffer if necessary. */ 1.6051 + ucol_freeOffsetBuffer(&s); 1.6052 + 1.6053 + // Return number of meaningful sortkey bytes. 
1.6054 + UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 1.6055 + dest,i, state[0], state[1]); 1.6056 + UTRACE_EXIT_VALUE(i); 1.6057 + return i; 1.6058 +} 1.6059 + 1.6060 +/** 1.6061 + * Produce a bound for a given sortkey and a number of levels. 1.6062 + */ 1.6063 +U_CAPI int32_t U_EXPORT2 1.6064 +ucol_getBound(const uint8_t *source, 1.6065 + int32_t sourceLength, 1.6066 + UColBoundMode boundType, 1.6067 + uint32_t noOfLevels, 1.6068 + uint8_t *result, 1.6069 + int32_t resultLength, 1.6070 + UErrorCode *status) 1.6071 +{ 1.6072 + // consistency checks 1.6073 + if(status == NULL || U_FAILURE(*status)) { 1.6074 + return 0; 1.6075 + } 1.6076 + if(source == NULL) { 1.6077 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6078 + return 0; 1.6079 + } 1.6080 + 1.6081 + int32_t sourceIndex = 0; 1.6082 + // Scan the string until we skip enough of the key OR reach the end of the key 1.6083 + do { 1.6084 + sourceIndex++; 1.6085 + if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 1.6086 + noOfLevels--; 1.6087 + } 1.6088 + } while (noOfLevels > 0 1.6089 + && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 1.6090 + 1.6091 + if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 1.6092 + && noOfLevels > 0) { 1.6093 + *status = U_SORT_KEY_TOO_SHORT_WARNING; 1.6094 + } 1.6095 + 1.6096 + 1.6097 + // READ ME: this code assumes that the values for boundType 1.6098 + // enum will not changes. They are set so that the enum value 1.6099 + // corresponds to the number of extra bytes each bound type 1.6100 + // needs. 1.6101 + if(result != NULL && resultLength >= sourceIndex+boundType) { 1.6102 + uprv_memcpy(result, source, sourceIndex); 1.6103 + switch(boundType) { 1.6104 + // Lower bound just gets terminated. 
No extra bytes 1.6105 + case UCOL_BOUND_LOWER: // = 0 1.6106 + break; 1.6107 + // Upper bound needs one extra byte 1.6108 + case UCOL_BOUND_UPPER: // = 1 1.6109 + result[sourceIndex++] = 2; 1.6110 + break; 1.6111 + // Upper long bound needs two extra bytes 1.6112 + case UCOL_BOUND_UPPER_LONG: // = 2 1.6113 + result[sourceIndex++] = 0xFF; 1.6114 + result[sourceIndex++] = 0xFF; 1.6115 + break; 1.6116 + default: 1.6117 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6118 + return 0; 1.6119 + } 1.6120 + result[sourceIndex++] = 0; 1.6121 + 1.6122 + return sourceIndex; 1.6123 + } else { 1.6124 + return sourceIndex+boundType+1; 1.6125 + } 1.6126 +} 1.6127 + 1.6128 +/****************************************************************************/ 1.6129 +/* Following are the functions that deal with the properties of a collator */ 1.6130 +/* there are new APIs and some compatibility APIs */ 1.6131 +/****************************************************************************/ 1.6132 + 1.6133 +static inline void 1.6134 +ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 1.6135 + int32_t *primShift, int32_t *secShift, int32_t *terShift) 1.6136 +{ 1.6137 + uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 1.6138 + UBool reverseSecondary = FALSE; 1.6139 + UBool continuation = isContinuation(CE); 1.6140 + if(!continuation) { 1.6141 + tertiary = (uint8_t)((CE & coll->tertiaryMask)); 1.6142 + tertiary ^= coll->caseSwitch; 1.6143 + reverseSecondary = TRUE; 1.6144 + } else { 1.6145 + tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 1.6146 + tertiary &= UCOL_REMOVE_CASE; 1.6147 + reverseSecondary = FALSE; 1.6148 + } 1.6149 + 1.6150 + secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 1.6151 + primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 1.6152 + primary1 = (uint8_t)(CE >> 8); 1.6153 + 1.6154 + if(primary1 != 0) { 1.6155 + if (coll->leadBytePermutationTable != NULL && !continuation) { 1.6156 + primary1 = coll->leadBytePermutationTable[primary1]; 
1.6157 + } 1.6158 + 1.6159 + coll->latinOneCEs[ch] |= (primary1 << *primShift); 1.6160 + *primShift -= 8; 1.6161 + } 1.6162 + if(primary2 != 0) { 1.6163 + if(*primShift < 0) { 1.6164 + coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 1.6165 + coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 1.6166 + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 1.6167 + return; 1.6168 + } 1.6169 + coll->latinOneCEs[ch] |= (primary2 << *primShift); 1.6170 + *primShift -= 8; 1.6171 + } 1.6172 + if(secondary != 0) { 1.6173 + if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 1.6174 + coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 1.6175 + coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 1.6176 + } else { // normal case 1.6177 + coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 1.6178 + } 1.6179 + *secShift -= 8; 1.6180 + } 1.6181 + if(tertiary != 0) { 1.6182 + coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 1.6183 + *terShift -= 8; 1.6184 + } 1.6185 +} 1.6186 + 1.6187 +static inline UBool 1.6188 +ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 1.6189 + uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 1.6190 + if(newTable == NULL) { 1.6191 + *status = U_MEMORY_ALLOCATION_ERROR; 1.6192 + coll->latinOneFailed = TRUE; 1.6193 + return FALSE; 1.6194 + } 1.6195 + int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 1.6196 + uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 1.6197 + uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 1.6198 + uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 1.6199 + uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 1.6200 + coll->latinOneTableLen = size; 1.6201 + uprv_free(coll->latinOneCEs); 1.6202 + coll->latinOneCEs = newTable; 
1.6203 + return TRUE; 1.6204 +} 1.6205 + 1.6206 +static UBool 1.6207 +ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 1.6208 + UBool result = TRUE; 1.6209 + if(coll->latinOneCEs == NULL) { 1.6210 + coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 1.6211 + if(coll->latinOneCEs == NULL) { 1.6212 + *status = U_MEMORY_ALLOCATION_ERROR; 1.6213 + return FALSE; 1.6214 + } 1.6215 + coll->latinOneTableLen = UCOL_LATINONETABLELEN; 1.6216 + } 1.6217 + UChar ch = 0; 1.6218 + UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 1.6219 + // Check for null pointer 1.6220 + if (U_FAILURE(*status)) { 1.6221 + ucol_closeElements(it); 1.6222 + return FALSE; 1.6223 + } 1.6224 + uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 1.6225 + 1.6226 + int32_t primShift = 24, secShift = 24, terShift = 24; 1.6227 + uint32_t CE = 0; 1.6228 + int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 1.6229 + 1.6230 + // TODO: make safe if you get more than you wanted... 
1.6231 + for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 1.6232 + primShift = 24; secShift = 24; terShift = 24; 1.6233 + if(ch < 0x100) { 1.6234 + CE = coll->latinOneMapping[ch]; 1.6235 + } else { 1.6236 + CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1.6237 + if(CE == UCOL_NOT_FOUND && coll->UCA) { 1.6238 + CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 1.6239 + } 1.6240 + } 1.6241 + if(CE < UCOL_NOT_FOUND) { 1.6242 + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 1.6243 + } else { 1.6244 + switch (getCETag(CE)) { 1.6245 + case EXPANSION_TAG: 1.6246 + case DIGIT_TAG: 1.6247 + ucol_setText(it, &ch, 1, status); 1.6248 + while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 1.6249 + if(primShift < 0 || secShift < 0 || terShift < 0) { 1.6250 + coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 1.6251 + coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 1.6252 + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 1.6253 + break; 1.6254 + } 1.6255 + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 1.6256 + } 1.6257 + break; 1.6258 + case CONTRACTION_TAG: 1.6259 + // here is the trick 1.6260 + // F2 is contraction. We do something very similar to contractions 1.6261 + // but have two indices, one in the real contraction table and the 1.6262 + // other to where we stuffed things. This hopes that we don't have 1.6263 + // many contractions (this should work for latin-1 tables). 
1.6264 + { 1.6265 + if((CE & 0x00FFF000) != 0) { 1.6266 + *status = U_UNSUPPORTED_ERROR; 1.6267 + goto cleanup_after_failure; 1.6268 + } 1.6269 + 1.6270 + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 1.6271 + 1.6272 + CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 1.6273 + 1.6274 + coll->latinOneCEs[ch] = CE; 1.6275 + coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 1.6276 + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 1.6277 + 1.6278 + // We're going to jump into contraction table, pick the elements 1.6279 + // and use them 1.6280 + do { 1.6281 + CE = *(coll->contractionCEs + 1.6282 + (UCharOffset - coll->contractionIndex)); 1.6283 + if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 1.6284 + uint32_t size; 1.6285 + uint32_t i; /* general counter */ 1.6286 + uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 1.6287 + size = getExpansionCount(CE); 1.6288 + //CE = *CEOffset++; 1.6289 + if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 1.6290 + for(i = 0; i<size; i++) { 1.6291 + if(primShift < 0 || secShift < 0 || terShift < 0) { 1.6292 + coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6293 + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6294 + coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6295 + break; 1.6296 + } 1.6297 + ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 1.6298 + } 1.6299 + } else { /* else, we do */ 1.6300 + while(*CEOffset != 0) { 1.6301 + if(primShift < 0 || secShift < 0 || terShift < 0) { 1.6302 + coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6303 + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6304 + 
coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6305 + break; 1.6306 + } 1.6307 + ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 1.6308 + } 1.6309 + } 1.6310 + contractionOffset++; 1.6311 + } else if(CE < UCOL_NOT_FOUND) { 1.6312 + ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 1.6313 + } else { 1.6314 + coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6315 + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6316 + coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 1.6317 + contractionOffset++; 1.6318 + } 1.6319 + UCharOffset++; 1.6320 + primShift = 24; secShift = 24; terShift = 24; 1.6321 + if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 1.6322 + if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 1.6323 + goto cleanup_after_failure; 1.6324 + } 1.6325 + } 1.6326 + } while(*UCharOffset != 0xFFFF); 1.6327 + } 1.6328 + break;; 1.6329 + case SPEC_PROC_TAG: 1.6330 + { 1.6331 + // 0xB7 is a precontext character defined in UCA5.1, a special 1.6332 + // handle is implemeted in order to save LatinOne table for 1.6333 + // most locales. 
1.6334 + if (ch==0xb7) { 1.6335 + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 1.6336 + } 1.6337 + else { 1.6338 + goto cleanup_after_failure; 1.6339 + } 1.6340 + } 1.6341 + break; 1.6342 + default: 1.6343 + goto cleanup_after_failure; 1.6344 + } 1.6345 + } 1.6346 + } 1.6347 + // compact table 1.6348 + if(contractionOffset < coll->latinOneTableLen) { 1.6349 + if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 1.6350 + goto cleanup_after_failure; 1.6351 + } 1.6352 + } 1.6353 + ucol_closeElements(it); 1.6354 + return result; 1.6355 + 1.6356 +cleanup_after_failure: 1.6357 + // status should already be set before arriving here. 1.6358 + coll->latinOneFailed = TRUE; 1.6359 + ucol_closeElements(it); 1.6360 + return FALSE; 1.6361 +} 1.6362 + 1.6363 +void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 1.6364 + if(U_SUCCESS(*status)) { 1.6365 + if(coll->caseFirst == UCOL_UPPER_FIRST) { 1.6366 + coll->caseSwitch = UCOL_CASE_SWITCH; 1.6367 + } else { 1.6368 + coll->caseSwitch = UCOL_NO_CASE_SWITCH; 1.6369 + } 1.6370 + 1.6371 + if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 1.6372 + coll->tertiaryMask = UCOL_REMOVE_CASE; 1.6373 + coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 1.6374 + coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 1.6375 + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 1.6376 + coll->tertiaryBottom = UCOL_COMMON_BOT3; 1.6377 + } else { 1.6378 + coll->tertiaryMask = UCOL_KEEP_CASE; 1.6379 + coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 1.6380 + if(coll->caseFirst == UCOL_UPPER_FIRST) { 1.6381 + coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 1.6382 + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 1.6383 + coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 1.6384 + } else { 1.6385 + coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 1.6386 + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 1.6387 + coll->tertiaryBottom = 
UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 1.6388 + } 1.6389 + } 1.6390 + 1.6391 + /* Set the compression values */ 1.6392 + uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1); 1.6393 + coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 1.6394 + coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 1.6395 + 1.6396 + if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 1.6397 + && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 1.6398 + { 1.6399 + coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 1.6400 + } else { 1.6401 + coll->sortKeyGen = ucol_calcSortKey; 1.6402 + } 1.6403 + if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 1.6404 + && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 1.6405 + { 1.6406 + if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 1.6407 + if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 1.6408 + //fprintf(stderr, "F"); 1.6409 + coll->latinOneUse = TRUE; 1.6410 + } else { 1.6411 + coll->latinOneUse = FALSE; 1.6412 + } 1.6413 + if(*status == U_UNSUPPORTED_ERROR) { 1.6414 + *status = U_ZERO_ERROR; 1.6415 + } 1.6416 + } else { // latin1Table exists and it doesn't need to be regenerated, just use it 1.6417 + coll->latinOneUse = TRUE; 1.6418 + } 1.6419 + } else { 1.6420 + coll->latinOneUse = FALSE; 1.6421 + } 1.6422 + } 1.6423 +} 1.6424 + 1.6425 +U_CAPI uint32_t U_EXPORT2 1.6426 +ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 1.6427 + if(U_FAILURE(*status) || coll == NULL) { 1.6428 + return 0; 1.6429 + } 1.6430 + if(len == -1) { 1.6431 + len = u_strlen(varTop); 1.6432 + } 1.6433 + if(len == 0) { 1.6434 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6435 + return 0; 1.6436 + } 1.6437 + 1.6438 + 
if(coll->delegate!=NULL) { 1.6439 + return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); 1.6440 + } 1.6441 + 1.6442 + 1.6443 + collIterate s; 1.6444 + IInit_collIterate(coll, varTop, len, &s, status); 1.6445 + if(U_FAILURE(*status)) { 1.6446 + return 0; 1.6447 + } 1.6448 + 1.6449 + uint32_t CE = ucol_IGetNextCE(coll, &s, status); 1.6450 + 1.6451 + /* here we check if we have consumed all characters */ 1.6452 + /* you can put in either one character or a contraction */ 1.6453 + /* you shouldn't put more... */ 1.6454 + if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 1.6455 + *status = U_CE_NOT_FOUND_ERROR; 1.6456 + return 0; 1.6457 + } 1.6458 + 1.6459 + uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 1.6460 + 1.6461 + if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 1.6462 + *status = U_PRIMARY_TOO_LONG_ERROR; 1.6463 + return 0; 1.6464 + } 1.6465 + if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 1.6466 + coll->variableTopValueisDefault = FALSE; 1.6467 + coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 1.6468 + } 1.6469 + 1.6470 + /* To avoid memory leak, free the offset buffer if necessary. 
*/ 1.6471 + ucol_freeOffsetBuffer(&s); 1.6472 + 1.6473 + return CE & UCOL_PRIMARYMASK; 1.6474 +} 1.6475 + 1.6476 +U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 1.6477 + if(U_FAILURE(*status) || coll == NULL) { 1.6478 + return 0; 1.6479 + } 1.6480 + if(coll->delegate!=NULL) { 1.6481 + return ((const Collator*)coll->delegate)->getVariableTop(*status); 1.6482 + } 1.6483 + return coll->variableTopValue<<16; 1.6484 +} 1.6485 + 1.6486 +U_CAPI void U_EXPORT2 1.6487 +ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 1.6488 + if(U_FAILURE(*status) || coll == NULL) { 1.6489 + return; 1.6490 + } 1.6491 + 1.6492 + if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 1.6493 + coll->variableTopValueisDefault = FALSE; 1.6494 + coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 1.6495 + } 1.6496 +} 1.6497 +/* Attribute setter API */ 1.6498 +U_CAPI void U_EXPORT2 1.6499 +ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 1.6500 + if(U_FAILURE(*status) || coll == NULL) { 1.6501 + return; 1.6502 + } 1.6503 + 1.6504 + if(coll->delegate != NULL) { 1.6505 + ((Collator*)coll->delegate)->setAttribute(attr,value,*status); 1.6506 + return; 1.6507 + } 1.6508 + 1.6509 + UColAttributeValue oldFrench = coll->frenchCollation; 1.6510 + UColAttributeValue oldCaseFirst = coll->caseFirst; 1.6511 + switch(attr) { 1.6512 + case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 1.6513 + if(value == UCOL_ON) { 1.6514 + coll->numericCollation = UCOL_ON; 1.6515 + coll->numericCollationisDefault = FALSE; 1.6516 + } else if (value == UCOL_OFF) { 1.6517 + coll->numericCollation = UCOL_OFF; 1.6518 + coll->numericCollationisDefault = FALSE; 1.6519 + } else if (value == UCOL_DEFAULT) { 1.6520 + coll->numericCollationisDefault = TRUE; 1.6521 + coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 1.6522 + } else { 1.6523 + 
*status = U_ILLEGAL_ARGUMENT_ERROR; 1.6524 + } 1.6525 + break; 1.6526 + case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 1.6527 + if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { 1.6528 + // This attribute is an implementation detail of the CLDR Japanese tailoring. 1.6529 + // The implementation might change to use a different mechanism 1.6530 + // to achieve the same Japanese sort order. 1.6531 + // Since ICU 50, this attribute is not settable any more via API functions. 1.6532 + } else { 1.6533 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6534 + } 1.6535 + break; 1.6536 + case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 1.6537 + if(value == UCOL_ON) { 1.6538 + coll->frenchCollation = UCOL_ON; 1.6539 + coll->frenchCollationisDefault = FALSE; 1.6540 + } else if (value == UCOL_OFF) { 1.6541 + coll->frenchCollation = UCOL_OFF; 1.6542 + coll->frenchCollationisDefault = FALSE; 1.6543 + } else if (value == UCOL_DEFAULT) { 1.6544 + coll->frenchCollationisDefault = TRUE; 1.6545 + coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 1.6546 + } else { 1.6547 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6548 + } 1.6549 + break; 1.6550 + case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 1.6551 + if(value == UCOL_SHIFTED) { 1.6552 + coll->alternateHandling = UCOL_SHIFTED; 1.6553 + coll->alternateHandlingisDefault = FALSE; 1.6554 + } else if (value == UCOL_NON_IGNORABLE) { 1.6555 + coll->alternateHandling = UCOL_NON_IGNORABLE; 1.6556 + coll->alternateHandlingisDefault = FALSE; 1.6557 + } else if (value == UCOL_DEFAULT) { 1.6558 + coll->alternateHandlingisDefault = TRUE; 1.6559 + coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; 1.6560 + } else { 1.6561 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6562 + } 1.6563 + break; 1.6564 + case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 1.6565 + if(value == 
UCOL_LOWER_FIRST) { 1.6566 + coll->caseFirst = UCOL_LOWER_FIRST; 1.6567 + coll->caseFirstisDefault = FALSE; 1.6568 + } else if (value == UCOL_UPPER_FIRST) { 1.6569 + coll->caseFirst = UCOL_UPPER_FIRST; 1.6570 + coll->caseFirstisDefault = FALSE; 1.6571 + } else if (value == UCOL_OFF) { 1.6572 + coll->caseFirst = UCOL_OFF; 1.6573 + coll->caseFirstisDefault = FALSE; 1.6574 + } else if (value == UCOL_DEFAULT) { 1.6575 + coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 1.6576 + coll->caseFirstisDefault = TRUE; 1.6577 + } else { 1.6578 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6579 + } 1.6580 + break; 1.6581 + case UCOL_CASE_LEVEL: /* do we have an extra case level */ 1.6582 + if(value == UCOL_ON) { 1.6583 + coll->caseLevel = UCOL_ON; 1.6584 + coll->caseLevelisDefault = FALSE; 1.6585 + } else if (value == UCOL_OFF) { 1.6586 + coll->caseLevel = UCOL_OFF; 1.6587 + coll->caseLevelisDefault = FALSE; 1.6588 + } else if (value == UCOL_DEFAULT) { 1.6589 + coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 1.6590 + coll->caseLevelisDefault = TRUE; 1.6591 + } else { 1.6592 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6593 + } 1.6594 + break; 1.6595 + case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 1.6596 + if(value == UCOL_ON) { 1.6597 + coll->normalizationMode = UCOL_ON; 1.6598 + coll->normalizationModeisDefault = FALSE; 1.6599 + initializeFCD(status); 1.6600 + } else if (value == UCOL_OFF) { 1.6601 + coll->normalizationMode = UCOL_OFF; 1.6602 + coll->normalizationModeisDefault = FALSE; 1.6603 + } else if (value == UCOL_DEFAULT) { 1.6604 + coll->normalizationModeisDefault = TRUE; 1.6605 + coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 1.6606 + if(coll->normalizationMode == UCOL_ON) { 1.6607 + initializeFCD(status); 1.6608 + } 1.6609 + } else { 1.6610 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6611 + } 1.6612 + break; 1.6613 + case UCOL_STRENGTH: /* attribute for strength */ 1.6614 + if (value == UCOL_DEFAULT) 
{ 1.6615 + coll->strengthisDefault = TRUE; 1.6616 + coll->strength = (UColAttributeValue)coll->options->strength; 1.6617 + } else if (value <= UCOL_IDENTICAL) { 1.6618 + coll->strengthisDefault = FALSE; 1.6619 + coll->strength = value; 1.6620 + } else { 1.6621 + *status = U_ILLEGAL_ARGUMENT_ERROR ; 1.6622 + } 1.6623 + break; 1.6624 + case UCOL_ATTRIBUTE_COUNT: 1.6625 + default: 1.6626 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6627 + break; 1.6628 + } 1.6629 + if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 1.6630 + coll->latinOneRegenTable = TRUE; 1.6631 + } else { 1.6632 + coll->latinOneRegenTable = FALSE; 1.6633 + } 1.6634 + ucol_updateInternalState(coll, status); 1.6635 +} 1.6636 + 1.6637 +U_CAPI UColAttributeValue U_EXPORT2 1.6638 +ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 1.6639 + if(U_FAILURE(*status) || coll == NULL) { 1.6640 + return UCOL_DEFAULT; 1.6641 + } 1.6642 + 1.6643 + if(coll->delegate != NULL) { 1.6644 + return ((Collator*)coll->delegate)->getAttribute(attr,*status); 1.6645 + } 1.6646 + 1.6647 + switch(attr) { 1.6648 + case UCOL_NUMERIC_COLLATION: 1.6649 + return coll->numericCollation; 1.6650 + case UCOL_HIRAGANA_QUATERNARY_MODE: 1.6651 + return coll->hiraganaQ; 1.6652 + case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 1.6653 + return coll->frenchCollation; 1.6654 + case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 1.6655 + return coll->alternateHandling; 1.6656 + case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 1.6657 + return coll->caseFirst; 1.6658 + case UCOL_CASE_LEVEL: /* do we have an extra case level */ 1.6659 + return coll->caseLevel; 1.6660 + case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 1.6661 + return coll->normalizationMode; 1.6662 + case UCOL_STRENGTH: /* attribute for strength */ 1.6663 + return coll->strength; 1.6664 + case UCOL_ATTRIBUTE_COUNT: 1.6665 + default: 1.6666 + 
*status = U_ILLEGAL_ARGUMENT_ERROR; 1.6667 + break; 1.6668 + } 1.6669 + return UCOL_DEFAULT; 1.6670 +} 1.6671 + 1.6672 +U_CAPI void U_EXPORT2 1.6673 +ucol_setStrength( UCollator *coll, 1.6674 + UCollationStrength strength) 1.6675 +{ 1.6676 + UErrorCode status = U_ZERO_ERROR; 1.6677 + ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 1.6678 +} 1.6679 + 1.6680 +U_CAPI UCollationStrength U_EXPORT2 1.6681 +ucol_getStrength(const UCollator *coll) 1.6682 +{ 1.6683 + UErrorCode status = U_ZERO_ERROR; 1.6684 + return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 1.6685 +} 1.6686 + 1.6687 +U_CAPI int32_t U_EXPORT2 1.6688 +ucol_getReorderCodes(const UCollator *coll, 1.6689 + int32_t *dest, 1.6690 + int32_t destCapacity, 1.6691 + UErrorCode *status) { 1.6692 + if (U_FAILURE(*status)) { 1.6693 + return 0; 1.6694 + } 1.6695 + 1.6696 + if(coll->delegate!=NULL) { 1.6697 + return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status); 1.6698 + } 1.6699 + 1.6700 + if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 1.6701 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6702 + return 0; 1.6703 + } 1.6704 + 1.6705 +#ifdef UCOL_DEBUG 1.6706 + printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); 1.6707 + printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength); 1.6708 +#endif 1.6709 + 1.6710 + if (coll->reorderCodesLength > destCapacity) { 1.6711 + *status = U_BUFFER_OVERFLOW_ERROR; 1.6712 + return coll->reorderCodesLength; 1.6713 + } 1.6714 + for (int32_t i = 0; i < coll->reorderCodesLength; i++) { 1.6715 + dest[i] = coll->reorderCodes[i]; 1.6716 + } 1.6717 + return coll->reorderCodesLength; 1.6718 +} 1.6719 + 1.6720 +U_CAPI void U_EXPORT2 1.6721 +ucol_setReorderCodes(UCollator* coll, 1.6722 + const int32_t* reorderCodes, 1.6723 + int32_t reorderCodesLength, 1.6724 + UErrorCode *status) { 1.6725 + if (U_FAILURE(*status)) { 1.6726 + return; 1.6727 + } 1.6728 + 1.6729 + if (reorderCodesLength < 0 || 
(reorderCodesLength > 0 && reorderCodes == NULL)) { 1.6730 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.6731 + return; 1.6732 + } 1.6733 + 1.6734 + if(coll->delegate!=NULL) { 1.6735 + ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status); 1.6736 + return; 1.6737 + } 1.6738 + 1.6739 + if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 1.6740 + uprv_free(coll->reorderCodes); 1.6741 + } 1.6742 + coll->reorderCodes = NULL; 1.6743 + coll->freeReorderCodesOnClose = FALSE; 1.6744 + coll->reorderCodesLength = 0; 1.6745 + if (reorderCodesLength == 0) { 1.6746 + if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 1.6747 + uprv_free(coll->leadBytePermutationTable); 1.6748 + } 1.6749 + coll->leadBytePermutationTable = NULL; 1.6750 + coll->freeLeadBytePermutationTableOnClose = FALSE; 1.6751 + return; 1.6752 + } 1.6753 + coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 1.6754 + if (coll->reorderCodes == NULL) { 1.6755 + *status = U_MEMORY_ALLOCATION_ERROR; 1.6756 + return; 1.6757 + } 1.6758 + coll->freeReorderCodesOnClose = TRUE; 1.6759 + for (int32_t i = 0; i < reorderCodesLength; i++) { 1.6760 + coll->reorderCodes[i] = reorderCodes[i]; 1.6761 + } 1.6762 + coll->reorderCodesLength = reorderCodesLength; 1.6763 + ucol_buildPermutationTable(coll, status); 1.6764 +} 1.6765 + 1.6766 +U_CAPI int32_t U_EXPORT2 1.6767 +ucol_getEquivalentReorderCodes(int32_t reorderCode, 1.6768 + int32_t* dest, 1.6769 + int32_t destCapacity, 1.6770 + UErrorCode *pErrorCode) { 1.6771 + bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; 1.6772 + uint16_t leadBytes[256]; 1.6773 + int leadBytesCount; 1.6774 + int leadByteIndex; 1.6775 + int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; 1.6776 + int reorderCodesForLeadByteCount; 1.6777 + int reorderCodeIndex; 1.6778 + 1.6779 + int32_t equivalentCodesCount = 0; 1.6780 + int setIndex; 1.6781 + 1.6782 + if (U_FAILURE(*pErrorCode)) { 
1.6783 + return 0; 1.6784 + } 1.6785 + 1.6786 + if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 1.6787 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.6788 + return 0; 1.6789 + } 1.6790 + 1.6791 + uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); 1.6792 + 1.6793 + const UCollator* uca = ucol_initUCA(pErrorCode); 1.6794 + if (U_FAILURE(*pErrorCode)) { 1.6795 + return 0; 1.6796 + } 1.6797 + leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256); 1.6798 + for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { 1.6799 + reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( 1.6800 + uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT); 1.6801 + for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) { 1.6802 + equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true; 1.6803 + } 1.6804 + } 1.6805 + 1.6806 + for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 1.6807 + if (equivalentCodesSet[setIndex] == true) { 1.6808 + equivalentCodesCount++; 1.6809 + } 1.6810 + } 1.6811 + 1.6812 + if (destCapacity == 0) { 1.6813 + return equivalentCodesCount; 1.6814 + } 1.6815 + 1.6816 + equivalentCodesCount = 0; 1.6817 + for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 1.6818 + if (equivalentCodesSet[setIndex] == true) { 1.6819 + dest[equivalentCodesCount++] = setIndex; 1.6820 + if (equivalentCodesCount >= destCapacity) { 1.6821 + break; 1.6822 + } 1.6823 + } 1.6824 + } 1.6825 + return equivalentCodesCount; 1.6826 +} 1.6827 + 1.6828 + 1.6829 +/****************************************************************************/ 1.6830 +/* Following are misc functions */ 1.6831 +/* there are new APIs and some compatibility APIs */ 1.6832 +/****************************************************************************/ 1.6833 + 1.6834 +U_CAPI void U_EXPORT2 1.6835 +ucol_getVersion(const UCollator* coll, 1.6836 
+ UVersionInfo versionInfo) 1.6837 +{ 1.6838 + if(coll->delegate!=NULL) { 1.6839 + ((const Collator*)coll->delegate)->getVersion(versionInfo); 1.6840 + return; 1.6841 + } 1.6842 + /* RunTime version */ 1.6843 + uint8_t rtVersion = UCOL_RUNTIME_VERSION; 1.6844 + /* Builder version*/ 1.6845 + uint8_t bdVersion = coll->image->version[0]; 1.6846 + 1.6847 + /* Charset Version. Need to get the version from cnv files 1.6848 + * makeconv should populate cnv files with version and 1.6849 + * an api has to be provided in ucnv.h to obtain this version 1.6850 + */ 1.6851 + uint8_t csVersion = 0; 1.6852 + 1.6853 + /* combine the version info */ 1.6854 + uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 1.6855 + 1.6856 + /* Tailoring rules */ 1.6857 + versionInfo[0] = (uint8_t)(cmbVersion>>8); 1.6858 + versionInfo[1] = (uint8_t)cmbVersion; 1.6859 + versionInfo[2] = coll->image->version[1]; 1.6860 + if(coll->UCA) { 1.6861 + /* Include the minor number when getting the UCA version. 
(major & 1f) << 3 | (minor & 7) */ 1.6862 + versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 1.6863 + } else { 1.6864 + versionInfo[3] = 0; 1.6865 + } 1.6866 +} 1.6867 + 1.6868 + 1.6869 +/* This internal API checks whether a character is tailored or not */ 1.6870 +U_CAPI UBool U_EXPORT2 1.6871 +ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 1.6872 + if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 1.6873 + return FALSE; 1.6874 + } 1.6875 + 1.6876 + uint32_t CE = UCOL_NOT_FOUND; 1.6877 + const UChar *ContractionStart = NULL; 1.6878 + if(u < 0x100) { /* latin-1 */ 1.6879 + CE = coll->latinOneMapping[u]; 1.6880 + if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 1.6881 + return FALSE; 1.6882 + } 1.6883 + } else { /* regular */ 1.6884 + CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 1.6885 + } 1.6886 + 1.6887 + if(isContraction(CE)) { 1.6888 + ContractionStart = (UChar *)coll->image+getContractOffset(CE); 1.6889 + CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 1.6890 + } 1.6891 + 1.6892 + return (UBool)(CE != UCOL_NOT_FOUND); 1.6893 +} 1.6894 + 1.6895 + 1.6896 +/****************************************************************************/ 1.6897 +/* Following are the string compare functions */ 1.6898 +/* */ 1.6899 +/****************************************************************************/ 1.6900 + 1.6901 + 1.6902 +/* ucol_checkIdent internal function. Does byte level string compare. */ 1.6903 +/* Used by strcoll if strength == identical and strings */ 1.6904 +/* are otherwise equal. */ 1.6905 +/* */ 1.6906 +/* Comparison must be done on NFD normalized strings. */ 1.6907 +/* FCD is not good enough. */ 1.6908 + 1.6909 +static 1.6910 +UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 1.6911 +{ 1.6912 + // When we arrive here, we can have normal strings or UCharIterators. 
Currently they are both 1.6913 + // of same type, but that doesn't really mean that it will stay that way. 1.6914 + int32_t comparison; 1.6915 + 1.6916 + if (sColl->flags & UCOL_USE_ITERATOR) { 1.6917 + // The division for the array length may truncate the array size to 1.6918 + // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 1.6919 + // for all platforms anyway. 1.6920 + UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.6921 + UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.6922 + UNormIterator *sNIt = NULL, *tNIt = NULL; 1.6923 + sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 1.6924 + tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 1.6925 + sColl->iterator->move(sColl->iterator, 0, UITER_START); 1.6926 + tColl->iterator->move(tColl->iterator, 0, UITER_START); 1.6927 + UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 1.6928 + UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 1.6929 + comparison = u_strCompareIter(sIt, tIt, TRUE); 1.6930 + unorm_closeIter(sNIt); 1.6931 + unorm_closeIter(tNIt); 1.6932 + } else { 1.6933 + int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 1.6934 + const UChar *sBuf = sColl->string; 1.6935 + int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1; 1.6936 + const UChar *tBuf = tColl->string; 1.6937 + 1.6938 + if (normalize) { 1.6939 + *status = U_ZERO_ERROR; 1.6940 + // Note: We could use Normalizer::compare() or similar, but for short strings 1.6941 + // which may not be in FCD it might be faster to just NFD them. 1.6942 + // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 1.6943 + // NFD'ing immediately might be faster for long strings, 1.6944 + // but string comparison is usually done on relatively short strings. 
1.6945 + sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 1.6946 + sColl->writableBuffer, 1.6947 + *status); 1.6948 + tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 1.6949 + tColl->writableBuffer, 1.6950 + *status); 1.6951 + if(U_FAILURE(*status)) { 1.6952 + return UCOL_LESS; 1.6953 + } 1.6954 + comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 1.6955 + } else { 1.6956 + comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 1.6957 + } 1.6958 + } 1.6959 + 1.6960 + if (comparison < 0) { 1.6961 + return UCOL_LESS; 1.6962 + } else if (comparison == 0) { 1.6963 + return UCOL_EQUAL; 1.6964 + } else /* comparison > 0 */ { 1.6965 + return UCOL_GREATER; 1.6966 + } 1.6967 +} 1.6968 + 1.6969 +/* CEBuf - A struct and some inline functions to handle the saving */ 1.6970 +/* of CEs in a buffer within ucol_strcoll */ 1.6971 + 1.6972 +#define UCOL_CEBUF_SIZE 512 1.6973 +typedef struct ucol_CEBuf { 1.6974 + uint32_t *buf; 1.6975 + uint32_t *endp; 1.6976 + uint32_t *pos; 1.6977 + uint32_t localArray[UCOL_CEBUF_SIZE]; 1.6978 +} ucol_CEBuf; 1.6979 + 1.6980 + 1.6981 +static 1.6982 +inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 1.6983 + (b)->buf = (b)->pos = (b)->localArray; 1.6984 + (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 1.6985 +} 1.6986 + 1.6987 +static 1.6988 +void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 1.6989 + uint32_t oldSize; 1.6990 + uint32_t newSize; 1.6991 + uint32_t *newBuf; 1.6992 + 1.6993 + ci->flags |= UCOL_ITER_ALLOCATED; 1.6994 + oldSize = (uint32_t)(b->pos - b->buf); 1.6995 + newSize = oldSize * 2; 1.6996 + newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 1.6997 + if(newBuf == NULL) { 1.6998 + *status = U_MEMORY_ALLOCATION_ERROR; 1.6999 + } 1.7000 + else { 1.7001 + uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 1.7002 + if (b->buf != b->localArray) { 1.7003 + uprv_free(b->buf); 1.7004 + } 1.7005 + 
b->buf = newBuf; 1.7006 + b->endp = b->buf + newSize; 1.7007 + b->pos = b->buf + oldSize; 1.7008 + } 1.7009 +} 1.7010 + 1.7011 +static 1.7012 +inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 1.7013 + if (b->pos == b->endp) { 1.7014 + ucol_CEBuf_Expand(b, ci, status); 1.7015 + } 1.7016 + if (U_SUCCESS(*status)) { 1.7017 + *(b)->pos++ = ce; 1.7018 + } 1.7019 +} 1.7020 + 1.7021 +/* This is a trick string compare function that goes in and uses sortkeys to compare */ 1.7022 +/* It is used when compare gets in trouble and needs to bail out */ 1.7023 +static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 1.7024 + collIterate *tColl, 1.7025 + UErrorCode *status) 1.7026 +{ 1.7027 + uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 1.7028 + uint8_t *sourceKeyP = sourceKey; 1.7029 + uint8_t *targetKeyP = targetKey; 1.7030 + int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 1.7031 + const UCollator *coll = sColl->coll; 1.7032 + const UChar *source = NULL; 1.7033 + const UChar *target = NULL; 1.7034 + int32_t result = UCOL_EQUAL; 1.7035 + UnicodeString sourceString, targetString; 1.7036 + int32_t sourceLength; 1.7037 + int32_t targetLength; 1.7038 + 1.7039 + if(sColl->flags & UCOL_USE_ITERATOR) { 1.7040 + sColl->iterator->move(sColl->iterator, 0, UITER_START); 1.7041 + tColl->iterator->move(tColl->iterator, 0, UITER_START); 1.7042 + UChar32 c; 1.7043 + while((c=sColl->iterator->next(sColl->iterator))>=0) { 1.7044 + sourceString.append((UChar)c); 1.7045 + } 1.7046 + while((c=tColl->iterator->next(tColl->iterator))>=0) { 1.7047 + targetString.append((UChar)c); 1.7048 + } 1.7049 + source = sourceString.getBuffer(); 1.7050 + sourceLength = sourceString.length(); 1.7051 + target = targetString.getBuffer(); 1.7052 + targetLength = targetString.length(); 1.7053 + } else { // no iterators 1.7054 + sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; 
1.7055 + targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; 1.7056 + source = sColl->string; 1.7057 + target = tColl->string; 1.7058 + } 1.7059 + 1.7060 + 1.7061 + 1.7062 + sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 1.7063 + if(sourceKeyLen > UCOL_MAX_BUFFER) { 1.7064 + sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 1.7065 + if(sourceKeyP == NULL) { 1.7066 + *status = U_MEMORY_ALLOCATION_ERROR; 1.7067 + goto cleanup_and_do_compare; 1.7068 + } 1.7069 + sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 1.7070 + } 1.7071 + 1.7072 + targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 1.7073 + if(targetKeyLen > UCOL_MAX_BUFFER) { 1.7074 + targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 1.7075 + if(targetKeyP == NULL) { 1.7076 + *status = U_MEMORY_ALLOCATION_ERROR; 1.7077 + goto cleanup_and_do_compare; 1.7078 + } 1.7079 + targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 1.7080 + } 1.7081 + 1.7082 + result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 1.7083 + 1.7084 +cleanup_and_do_compare: 1.7085 + if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 1.7086 + uprv_free(sourceKeyP); 1.7087 + } 1.7088 + 1.7089 + if(targetKeyP != NULL && targetKeyP != targetKey) { 1.7090 + uprv_free(targetKeyP); 1.7091 + } 1.7092 + 1.7093 + if(result<0) { 1.7094 + return UCOL_LESS; 1.7095 + } else if(result>0) { 1.7096 + return UCOL_GREATER; 1.7097 + } else { 1.7098 + return UCOL_EQUAL; 1.7099 + } 1.7100 +} 1.7101 + 1.7102 + 1.7103 +static UCollationResult 1.7104 +ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) 1.7105 +{ 1.7106 + U_ALIGN_CODE(16); 1.7107 + 1.7108 + const UCollator *coll = sColl->coll; 1.7109 + 1.7110 + 1.7111 + // setting up the collator parameters 1.7112 + UColAttributeValue strength = coll->strength; 
1.7113 + UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 1.7114 + 1.7115 + UBool checkSecTer = initialCheckSecTer; 1.7116 + UBool checkTertiary = (strength >= UCOL_TERTIARY); 1.7117 + UBool checkQuad = (strength >= UCOL_QUATERNARY); 1.7118 + UBool checkIdent = (strength == UCOL_IDENTICAL); 1.7119 + UBool checkCase = (coll->caseLevel == UCOL_ON); 1.7120 + UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 1.7121 + UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 1.7122 + UBool qShifted = shifted && checkQuad; 1.7123 + UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 1.7124 + 1.7125 + if(doHiragana && shifted) { 1.7126 + return (ucol_compareUsingSortKeys(sColl, tColl, status)); 1.7127 + } 1.7128 + uint8_t caseSwitch = coll->caseSwitch; 1.7129 + uint8_t tertiaryMask = coll->tertiaryMask; 1.7130 + 1.7131 + // This is the lowest primary value that will not be ignored if shifted 1.7132 + uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 1.7133 + 1.7134 + UCollationResult result = UCOL_EQUAL; 1.7135 + UCollationResult hirResult = UCOL_EQUAL; 1.7136 + 1.7137 + // Preparing the CE buffers. They will be filled during the primary phase 1.7138 + ucol_CEBuf sCEs; 1.7139 + ucol_CEBuf tCEs; 1.7140 + UCOL_INIT_CEBUF(&sCEs); 1.7141 + UCOL_INIT_CEBUF(&tCEs); 1.7142 + 1.7143 + uint32_t secS = 0, secT = 0; 1.7144 + uint32_t sOrder=0, tOrder=0; 1.7145 + 1.7146 + // Non shifted primary processing is quite simple 1.7147 + if(!shifted) { 1.7148 + for(;;) { 1.7149 + // We fetch CEs until we hit a non ignorable primary or end. 1.7150 + uint32_t sPrimary; 1.7151 + do { 1.7152 + // We get the next CE 1.7153 + sOrder = ucol_IGetNextCE(coll, sColl, status); 1.7154 + // Stuff it in the buffer 1.7155 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7156 + // And keep just the primary part. 
1.7157 + sPrimary = sOrder & UCOL_PRIMARYMASK; 1.7158 + } while(sPrimary == 0); 1.7159 + 1.7160 + // see the comments on the above block 1.7161 + uint32_t tPrimary; 1.7162 + do { 1.7163 + tOrder = ucol_IGetNextCE(coll, tColl, status); 1.7164 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7165 + tPrimary = tOrder & UCOL_PRIMARYMASK; 1.7166 + } while(tPrimary == 0); 1.7167 + 1.7168 + // if both primaries are the same 1.7169 + if(sPrimary == tPrimary) { 1.7170 + // and there are no more CEs, we advance to the next level 1.7171 + if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) { 1.7172 + break; 1.7173 + } 1.7174 + if(doHiragana && hirResult == UCOL_EQUAL) { 1.7175 + if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 1.7176 + hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 1.7177 + ? UCOL_LESS:UCOL_GREATER; 1.7178 + } 1.7179 + } 1.7180 + } else { 1.7181 + // only need to check one for continuation 1.7182 + // if one is then the other must be or the preceding CE would be a prefix of the other 1.7183 + if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) { 1.7184 + sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF); 1.7185 + tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF); 1.7186 + } 1.7187 + // if two primaries are different, we are done 1.7188 + result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER; 1.7189 + goto commonReturn; 1.7190 + } 1.7191 + } // no primary difference... do the rest from the buffers 1.7192 + } else { // shifted - do a slightly more complicated processing :) 1.7193 + for(;;) { 1.7194 + UBool sInShifted = FALSE; 1.7195 + UBool tInShifted = FALSE; 1.7196 + // This version of code can be refactored. However, it seems easier to understand this way. 1.7197 + // Source loop. Same as the target loop. 
1.7198 + for(;;) { 1.7199 + sOrder = ucol_IGetNextCE(coll, sColl, status); 1.7200 + if(sOrder == UCOL_NO_MORE_CES) { 1.7201 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7202 + break; 1.7203 + } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 1.7204 + /* UCA amendment - ignore ignorables that follow shifted code points */ 1.7205 + continue; 1.7206 + } else if(isContinuation(sOrder)) { 1.7207 + if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 1.7208 + if(sInShifted) { 1.7209 + sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 1.7210 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7211 + continue; 1.7212 + } else { 1.7213 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7214 + break; 1.7215 + } 1.7216 + } else { /* Just lower level values */ 1.7217 + if(sInShifted) { 1.7218 + continue; 1.7219 + } else { 1.7220 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7221 + continue; 1.7222 + } 1.7223 + } 1.7224 + } else { /* regular */ 1.7225 + if(coll->leadBytePermutationTable != NULL){ 1.7226 + sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 1.7227 + } 1.7228 + if((sOrder & UCOL_PRIMARYMASK) > LVT) { 1.7229 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7230 + break; 1.7231 + } else { 1.7232 + if((sOrder & UCOL_PRIMARYMASK) > 0) { 1.7233 + sInShifted = TRUE; 1.7234 + sOrder &= UCOL_PRIMARYMASK; 1.7235 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7236 + continue; 1.7237 + } else { 1.7238 + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 1.7239 + sInShifted = FALSE; 1.7240 + continue; 1.7241 + } 1.7242 + } 1.7243 + } 1.7244 + } 1.7245 + sOrder &= UCOL_PRIMARYMASK; 1.7246 + sInShifted = FALSE; 1.7247 + 1.7248 + for(;;) { 1.7249 + tOrder = ucol_IGetNextCE(coll, tColl, status); 1.7250 + if(tOrder == UCOL_NO_MORE_CES) { 1.7251 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7252 + break; 1.7253 + } else if(tOrder == 0 || (tInShifted && (tOrder & 
UCOL_PRIMARYMASK) == 0)) { 1.7254 + /* UCA amendment - ignore ignorables that follow shifted code points */ 1.7255 + continue; 1.7256 + } else if(isContinuation(tOrder)) { 1.7257 + if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 1.7258 + if(tInShifted) { 1.7259 + tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 1.7260 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7261 + continue; 1.7262 + } else { 1.7263 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7264 + break; 1.7265 + } 1.7266 + } else { /* Just lower level values */ 1.7267 + if(tInShifted) { 1.7268 + continue; 1.7269 + } else { 1.7270 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7271 + continue; 1.7272 + } 1.7273 + } 1.7274 + } else { /* regular */ 1.7275 + if(coll->leadBytePermutationTable != NULL){ 1.7276 + tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 1.7277 + } 1.7278 + if((tOrder & UCOL_PRIMARYMASK) > LVT) { 1.7279 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7280 + break; 1.7281 + } else { 1.7282 + if((tOrder & UCOL_PRIMARYMASK) > 0) { 1.7283 + tInShifted = TRUE; 1.7284 + tOrder &= UCOL_PRIMARYMASK; 1.7285 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7286 + continue; 1.7287 + } else { 1.7288 + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 1.7289 + tInShifted = FALSE; 1.7290 + continue; 1.7291 + } 1.7292 + } 1.7293 + } 1.7294 + } 1.7295 + tOrder &= UCOL_PRIMARYMASK; 1.7296 + tInShifted = FALSE; 1.7297 + 1.7298 + if(sOrder == tOrder) { 1.7299 + /* 1.7300 + if(doHiragana && hirResult == UCOL_EQUAL) { 1.7301 + if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 1.7302 + hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 1.7303 + ? 
UCOL_LESS:UCOL_GREATER; 1.7304 + } 1.7305 + } 1.7306 + */ 1.7307 + if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 1.7308 + break; 1.7309 + } else { 1.7310 + sOrder = 0; 1.7311 + tOrder = 0; 1.7312 + continue; 1.7313 + } 1.7314 + } else { 1.7315 + result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 1.7316 + goto commonReturn; 1.7317 + } 1.7318 + } /* no primary difference... do the rest from the buffers */ 1.7319 + } 1.7320 + 1.7321 + /* now, we're gonna reexamine collected CEs */ 1.7322 + uint32_t *sCE; 1.7323 + uint32_t *tCE; 1.7324 + 1.7325 + /* This is the secondary level of comparison */ 1.7326 + if(checkSecTer) { 1.7327 + if(!isFrenchSec) { /* normal */ 1.7328 + sCE = sCEs.buf; 1.7329 + tCE = tCEs.buf; 1.7330 + for(;;) { 1.7331 + while (secS == 0) { 1.7332 + secS = *(sCE++) & UCOL_SECONDARYMASK; 1.7333 + } 1.7334 + 1.7335 + while(secT == 0) { 1.7336 + secT = *(tCE++) & UCOL_SECONDARYMASK; 1.7337 + } 1.7338 + 1.7339 + if(secS == secT) { 1.7340 + if(secS == UCOL_NO_MORE_CES_SECONDARY) { 1.7341 + break; 1.7342 + } else { 1.7343 + secS = 0; secT = 0; 1.7344 + continue; 1.7345 + } 1.7346 + } else { 1.7347 + result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 1.7348 + goto commonReturn; 1.7349 + } 1.7350 + } 1.7351 + } else { /* do the French */ 1.7352 + uint32_t *sCESave = NULL; 1.7353 + uint32_t *tCESave = NULL; 1.7354 + sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 1.7355 + tCE = tCEs.pos-2; 1.7356 + for(;;) { 1.7357 + while (secS == 0 && sCE >= sCEs.buf) { 1.7358 + if(sCESave == NULL) { 1.7359 + secS = *(sCE--); 1.7360 + if(isContinuation(secS)) { 1.7361 + while(isContinuation(secS = *(sCE--))) 1.7362 + ; 1.7363 + /* after this, secS has the start of continuation, and sCEs points before that */ 1.7364 + sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 1.7365 + sCE+=2; /* need to point to the first continuation CP */ 1.7366 + /* However, now you can just continue doing stuff */ 1.7367 + } 1.7368 + } else { 1.7369 + secS = *(sCE++); 1.7370 + if(!isContinuation(secS)) { /* This means we have finished with this cont */ 1.7371 + sCE = sCESave; /* reset the pointer to before continuation */ 1.7372 + sCESave = NULL; 1.7373 + secS = 0; /* Fetch a fresh CE before the continuation sequence. 
*/ 1.7374 + continue; 1.7375 + } 1.7376 + } 1.7377 + secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 1.7378 + } 1.7379 + 1.7380 + while(secT == 0 && tCE >= tCEs.buf) { 1.7381 + if(tCESave == NULL) { 1.7382 + secT = *(tCE--); 1.7383 + if(isContinuation(secT)) { 1.7384 + while(isContinuation(secT = *(tCE--))) 1.7385 + ; 1.7386 + /* after this, secS has the start of continuation, and sCEs points before that */ 1.7387 + tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 1.7388 + tCE+=2; /* need to point to the first continuation CP */ 1.7389 + /* However, now you can just continue doing stuff */ 1.7390 + } 1.7391 + } else { 1.7392 + secT = *(tCE++); 1.7393 + if(!isContinuation(secT)) { /* This means we have finished with this cont */ 1.7394 + tCE = tCESave; /* reset the pointer to before continuation */ 1.7395 + tCESave = NULL; 1.7396 + secT = 0; /* Fetch a fresh CE before the continuation sequence. */ 1.7397 + continue; 1.7398 + } 1.7399 + } 1.7400 + secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 1.7401 + } 1.7402 + 1.7403 + if(secS == secT) { 1.7404 + if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 1.7405 + break; 1.7406 + } else { 1.7407 + secS = 0; secT = 0; 1.7408 + continue; 1.7409 + } 1.7410 + } else { 1.7411 + result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 1.7412 + goto commonReturn; 1.7413 + } 1.7414 + } 1.7415 + } 1.7416 + } 1.7417 + 1.7418 + /* doing the case bit */ 1.7419 + if(checkCase) { 1.7420 + sCE = sCEs.buf; 1.7421 + tCE = tCEs.buf; 1.7422 + for(;;) { 1.7423 + while((secS & UCOL_REMOVE_CASE) == 0) { 1.7424 + if(!isContinuation(*sCE++)) { 1.7425 + secS =*(sCE-1); 1.7426 + if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 1.7427 + // primary ignorables should not be considered on the case level when the strength is primary 1.7428 + // otherwise, the CEs stop being well-formed 1.7429 + secS &= UCOL_TERT_CASE_MASK; 1.7430 + secS ^= caseSwitch; 1.7431 + } else { 1.7432 + secS = 0; 1.7433 + } 1.7434 + } else { 1.7435 + secS = 0; 1.7436 + } 1.7437 + } 1.7438 + 1.7439 + while((secT & UCOL_REMOVE_CASE) == 0) { 1.7440 + if(!isContinuation(*tCE++)) { 1.7441 + secT = *(tCE-1); 1.7442 + if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 1.7443 + // primary ignorables should not be considered on the case level when the strength is primary 1.7444 + // otherwise, the CEs stop being well-formed 1.7445 + secT &= UCOL_TERT_CASE_MASK; 1.7446 + secT ^= caseSwitch; 1.7447 + } else { 1.7448 + secT = 0; 1.7449 + } 1.7450 + } else { 1.7451 + secT = 0; 1.7452 + } 1.7453 + } 1.7454 + 1.7455 + if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 1.7456 + result = UCOL_LESS; 1.7457 + goto commonReturn; 1.7458 + } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 1.7459 + result = UCOL_GREATER; 1.7460 + goto commonReturn; 1.7461 + } 1.7462 + 1.7463 + if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 1.7464 + break; 1.7465 + } else { 1.7466 + secS = 0; 1.7467 + secT = 0; 1.7468 + } 1.7469 + } 1.7470 + } 1.7471 + 1.7472 + /* Tertiary level */ 1.7473 + if(checkTertiary) { 1.7474 + secS = 0; 1.7475 + secT = 0; 1.7476 + sCE = sCEs.buf; 1.7477 + tCE = tCEs.buf; 1.7478 + for(;;) { 1.7479 + 
while((secS & UCOL_REMOVE_CASE) == 0) { 1.7480 + sOrder = *sCE++; 1.7481 + secS = sOrder & tertiaryMask; 1.7482 + if(!isContinuation(sOrder)) { 1.7483 + secS ^= caseSwitch; 1.7484 + } else { 1.7485 + secS &= UCOL_REMOVE_CASE; 1.7486 + } 1.7487 + } 1.7488 + 1.7489 + while((secT & UCOL_REMOVE_CASE) == 0) { 1.7490 + tOrder = *tCE++; 1.7491 + secT = tOrder & tertiaryMask; 1.7492 + if(!isContinuation(tOrder)) { 1.7493 + secT ^= caseSwitch; 1.7494 + } else { 1.7495 + secT &= UCOL_REMOVE_CASE; 1.7496 + } 1.7497 + } 1.7498 + 1.7499 + if(secS == secT) { 1.7500 + if((secS & UCOL_REMOVE_CASE) == 1) { 1.7501 + break; 1.7502 + } else { 1.7503 + secS = 0; secT = 0; 1.7504 + continue; 1.7505 + } 1.7506 + } else { 1.7507 + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 1.7508 + goto commonReturn; 1.7509 + } 1.7510 + } 1.7511 + } 1.7512 + 1.7513 + 1.7514 + if(qShifted /*checkQuad*/) { 1.7515 + UBool sInShifted = TRUE; 1.7516 + UBool tInShifted = TRUE; 1.7517 + secS = 0; 1.7518 + secT = 0; 1.7519 + sCE = sCEs.buf; 1.7520 + tCE = tCEs.buf; 1.7521 + for(;;) { 1.7522 + while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) { 1.7523 + secS = *(sCE++); 1.7524 + if(isContinuation(secS)) { 1.7525 + if(!sInShifted) { 1.7526 + continue; 1.7527 + } 1.7528 + } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 1.7529 + secS = UCOL_PRIMARYMASK; 1.7530 + sInShifted = FALSE; 1.7531 + } else { 1.7532 + sInShifted = TRUE; 1.7533 + } 1.7534 + } 1.7535 + secS &= UCOL_PRIMARYMASK; 1.7536 + 1.7537 + 1.7538 + while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) { 1.7539 + secT = *(tCE++); 1.7540 + if(isContinuation(secT)) { 1.7541 + if(!tInShifted) { 1.7542 + continue; 1.7543 + } 1.7544 + } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 1.7545 + secT = UCOL_PRIMARYMASK; 1.7546 + tInShifted = FALSE; 1.7547 + } else { 1.7548 + tInShifted = TRUE; 1.7549 + } 1.7550 + } 1.7551 + secT &= 
UCOL_PRIMARYMASK; 1.7552 + 1.7553 + if(secS == secT) { 1.7554 + if(secS == UCOL_NO_MORE_CES_PRIMARY) { 1.7555 + break; 1.7556 + } else { 1.7557 + secS = 0; secT = 0; 1.7558 + continue; 1.7559 + } 1.7560 + } else { 1.7561 + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 1.7562 + goto commonReturn; 1.7563 + } 1.7564 + } 1.7565 + } else if(doHiragana && hirResult != UCOL_EQUAL) { 1.7566 + // If we're fine on quaternaries, we might be different 1.7567 + // on Hiragana. This, however, might fail us in shifted. 1.7568 + result = hirResult; 1.7569 + goto commonReturn; 1.7570 + } 1.7571 + 1.7572 + /* For IDENTICAL comparisons, we use a bitwise character comparison */ 1.7573 + /* as a tiebreaker if all else is equal. */ 1.7574 + /* Getting here should be quite rare - strings are not identical - */ 1.7575 + /* that is checked first, but compared == through all other checks. */ 1.7576 + if(checkIdent) 1.7577 + { 1.7578 + //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 1.7579 + result = ucol_checkIdent(sColl, tColl, TRUE, status); 1.7580 + } 1.7581 + 1.7582 +commonReturn: 1.7583 + if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 1.7584 + if (sCEs.buf != sCEs.localArray ) { 1.7585 + uprv_free(sCEs.buf); 1.7586 + } 1.7587 + if (tCEs.buf != tCEs.localArray ) { 1.7588 + uprv_free(tCEs.buf); 1.7589 + } 1.7590 + } 1.7591 + 1.7592 + return result; 1.7593 +} 1.7594 + 1.7595 +static UCollationResult 1.7596 +ucol_strcollRegular(const UCollator *coll, 1.7597 + const UChar *source, int32_t sourceLength, 1.7598 + const UChar *target, int32_t targetLength, 1.7599 + UErrorCode *status) { 1.7600 + collIterate sColl, tColl; 1.7601 + // Preparing the context objects for iterating over strings 1.7602 + IInit_collIterate(coll, source, sourceLength, &sColl, status); 1.7603 + IInit_collIterate(coll, target, targetLength, &tColl, status); 1.7604 + if(U_FAILURE(*status)) { 1.7605 + return UCOL_LESS; 1.7606 + } 1.7607 + return ucol_strcollRegular(&sColl, 
&tColl, status); 1.7608 +} 1.7609 + 1.7610 +static inline uint32_t 1.7611 +ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 1.7612 + uint32_t CE, const UChar *s, int32_t *index, int32_t len) 1.7613 +{ 1.7614 + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 1.7615 + int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 1.7616 + int32_t offset = 1; 1.7617 + UChar schar = 0, tchar = 0; 1.7618 + 1.7619 + for(;;) { 1.7620 + if(len == -1) { 1.7621 + if(s[*index] == 0) { // end of string 1.7622 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.7623 + } else { 1.7624 + schar = s[*index]; 1.7625 + } 1.7626 + } else { 1.7627 + if(*index == len) { 1.7628 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.7629 + } else { 1.7630 + schar = s[*index]; 1.7631 + } 1.7632 + } 1.7633 + 1.7634 + while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 1.7635 + offset++; 1.7636 + } 1.7637 + 1.7638 + if (schar == tchar) { 1.7639 + (*index)++; 1.7640 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 1.7641 + } 1.7642 + else 1.7643 + { 1.7644 + if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 1.7645 + return UCOL_BAIL_OUT_CE; 1.7646 + } 1.7647 + // skip completely ignorables 1.7648 + uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 1.7649 + if(isZeroCE == 0) { // we have to ignore completely ignorables 1.7650 + (*index)++; 1.7651 + continue; 1.7652 + } 1.7653 + 1.7654 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.7655 + } 1.7656 + } 1.7657 +} 1.7658 + 1.7659 + 1.7660 +/** 1.7661 + * This is a fast strcoll, geared towards text in Latin-1. 1.7662 + * It supports contractions of size two, French secondaries 1.7663 + * and case switching. You can use it with strengths primary 1.7664 + * to tertiary. 
It does not support shifted and case level. 1.7665 + * It relies on the table build by setupLatin1Table. If it 1.7666 + * doesn't understand something, it will go to the regular 1.7667 + * strcoll. 1.7668 + */ 1.7669 +static UCollationResult 1.7670 +ucol_strcollUseLatin1( const UCollator *coll, 1.7671 + const UChar *source, 1.7672 + int32_t sLen, 1.7673 + const UChar *target, 1.7674 + int32_t tLen, 1.7675 + UErrorCode *status) 1.7676 +{ 1.7677 + U_ALIGN_CODE(16); 1.7678 + int32_t strength = coll->strength; 1.7679 + 1.7680 + int32_t sIndex = 0, tIndex = 0; 1.7681 + UChar sChar = 0, tChar = 0; 1.7682 + uint32_t sOrder=0, tOrder=0; 1.7683 + 1.7684 + UBool endOfSource = FALSE; 1.7685 + 1.7686 + uint32_t *elements = coll->latinOneCEs; 1.7687 + 1.7688 + UBool haveContractions = FALSE; // if we have contractions in our string 1.7689 + // we cannot do French secondary 1.7690 + 1.7691 + // Do the primary level 1.7692 + for(;;) { 1.7693 + while(sOrder==0) { // this loop skips primary ignorables 1.7694 + // sOrder=getNextlatinOneCE(source); 1.7695 + if(sLen==-1) { // handling zero terminated strings 1.7696 + sChar=source[sIndex++]; 1.7697 + if(sChar==0) { 1.7698 + endOfSource = TRUE; 1.7699 + break; 1.7700 + } 1.7701 + } else { // handling strings with known length 1.7702 + if(sIndex==sLen) { 1.7703 + endOfSource = TRUE; 1.7704 + break; 1.7705 + } 1.7706 + sChar=source[sIndex++]; 1.7707 + } 1.7708 + if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 1.7709 + //fprintf(stderr, "R"); 1.7710 + return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 1.7711 + } 1.7712 + sOrder = elements[sChar]; 1.7713 + if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 1.7714 + // specials can basically be either contractions or bail-out signs. 
If we get anything 1.7715 + // else, we'll bail out anywasy 1.7716 + if(getCETag(sOrder) == CONTRACTION_TAG) { 1.7717 + sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 1.7718 + haveContractions = TRUE; // if there are contractions, we cannot do French secondary 1.7719 + // However, if there are contractions in the table, but we always use just one char, 1.7720 + // we might be able to do French. This should be checked out. 1.7721 + } 1.7722 + if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 1.7723 + //fprintf(stderr, "S"); 1.7724 + return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 1.7725 + } 1.7726 + } 1.7727 + } 1.7728 + 1.7729 + while(tOrder==0) { // this loop skips primary ignorables 1.7730 + // tOrder=getNextlatinOneCE(target); 1.7731 + if(tLen==-1) { // handling zero terminated strings 1.7732 + tChar=target[tIndex++]; 1.7733 + if(tChar==0) { 1.7734 + if(endOfSource) { // this is different than source loop, 1.7735 + // as we already know that source loop is done here, 1.7736 + // so we can either finish the primary loop if both 1.7737 + // strings are done or anounce the result if only 1.7738 + // target is done. Same below. 
1.7739 + goto endOfPrimLoop; 1.7740 + } else { 1.7741 + return UCOL_GREATER; 1.7742 + } 1.7743 + } 1.7744 + } else { // handling strings with known length 1.7745 + if(tIndex==tLen) { 1.7746 + if(endOfSource) { 1.7747 + goto endOfPrimLoop; 1.7748 + } else { 1.7749 + return UCOL_GREATER; 1.7750 + } 1.7751 + } 1.7752 + tChar=target[tIndex++]; 1.7753 + } 1.7754 + if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 1.7755 + //fprintf(stderr, "R"); 1.7756 + return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 1.7757 + } 1.7758 + tOrder = elements[tChar]; 1.7759 + if(tOrder >= UCOL_NOT_FOUND) { 1.7760 + // Handling specials, see the comments for source 1.7761 + if(getCETag(tOrder) == CONTRACTION_TAG) { 1.7762 + tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 1.7763 + haveContractions = TRUE; 1.7764 + } 1.7765 + if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 1.7766 + //fprintf(stderr, "S"); 1.7767 + return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 1.7768 + } 1.7769 + } 1.7770 + } 1.7771 + if(endOfSource) { // source is finished, but target is not, say the result. 
1.7772 + return UCOL_LESS; 1.7773 + } 1.7774 + 1.7775 + if(sOrder == tOrder) { // if we have same CEs, we continue the loop 1.7776 + sOrder = 0; tOrder = 0; 1.7777 + continue; 1.7778 + } else { 1.7779 + // compare current top bytes 1.7780 + if(((sOrder^tOrder)&0xFF000000)!=0) { 1.7781 + // top bytes differ, return difference 1.7782 + if(sOrder < tOrder) { 1.7783 + return UCOL_LESS; 1.7784 + } else if(sOrder > tOrder) { 1.7785 + return UCOL_GREATER; 1.7786 + } 1.7787 + // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 1.7788 + // since we must return enum value 1.7789 + } 1.7790 + 1.7791 + // top bytes match, continue with following bytes 1.7792 + sOrder<<=8; 1.7793 + tOrder<<=8; 1.7794 + } 1.7795 + } 1.7796 + 1.7797 +endOfPrimLoop: 1.7798 + // after primary loop, we definitely know the sizes of strings, 1.7799 + // so we set it and use simpler loop for secondaries and tertiaries 1.7800 + sLen = sIndex; tLen = tIndex; 1.7801 + if(strength >= UCOL_SECONDARY) { 1.7802 + // adjust the table beggining 1.7803 + elements += coll->latinOneTableLen; 1.7804 + endOfSource = FALSE; 1.7805 + 1.7806 + if(coll->frenchCollation == UCOL_OFF) { // non French 1.7807 + // This loop is a simplified copy of primary loop 1.7808 + // at this point we know that whole strings are latin-1, so we don't 1.7809 + // check for that. We also know that we only have contractions as 1.7810 + // specials. 
1.7811 + sIndex = 0; tIndex = 0; 1.7812 + for(;;) { 1.7813 + while(sOrder==0) { 1.7814 + if(sIndex==sLen) { 1.7815 + endOfSource = TRUE; 1.7816 + break; 1.7817 + } 1.7818 + sChar=source[sIndex++]; 1.7819 + sOrder = elements[sChar]; 1.7820 + if(sOrder > UCOL_NOT_FOUND) { 1.7821 + sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 1.7822 + } 1.7823 + } 1.7824 + 1.7825 + while(tOrder==0) { 1.7826 + if(tIndex==tLen) { 1.7827 + if(endOfSource) { 1.7828 + goto endOfSecLoop; 1.7829 + } else { 1.7830 + return UCOL_GREATER; 1.7831 + } 1.7832 + } 1.7833 + tChar=target[tIndex++]; 1.7834 + tOrder = elements[tChar]; 1.7835 + if(tOrder > UCOL_NOT_FOUND) { 1.7836 + tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 1.7837 + } 1.7838 + } 1.7839 + if(endOfSource) { 1.7840 + return UCOL_LESS; 1.7841 + } 1.7842 + 1.7843 + if(sOrder == tOrder) { 1.7844 + sOrder = 0; tOrder = 0; 1.7845 + continue; 1.7846 + } else { 1.7847 + // see primary loop for comments on this 1.7848 + if(((sOrder^tOrder)&0xFF000000)!=0) { 1.7849 + if(sOrder < tOrder) { 1.7850 + return UCOL_LESS; 1.7851 + } else if(sOrder > tOrder) { 1.7852 + return UCOL_GREATER; 1.7853 + } 1.7854 + } 1.7855 + sOrder<<=8; 1.7856 + tOrder<<=8; 1.7857 + } 1.7858 + } 1.7859 + } else { // French 1.7860 + if(haveContractions) { // if we have contractions, we have to bail out 1.7861 + // since we don't really know how to handle them here 1.7862 + return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 1.7863 + } 1.7864 + // For French, we go backwards 1.7865 + sIndex = sLen; tIndex = tLen; 1.7866 + for(;;) { 1.7867 + while(sOrder==0) { 1.7868 + if(sIndex==0) { 1.7869 + endOfSource = TRUE; 1.7870 + break; 1.7871 + } 1.7872 + sChar=source[--sIndex]; 1.7873 + sOrder = elements[sChar]; 1.7874 + // don't even look for contractions 1.7875 + } 1.7876 + 1.7877 + while(tOrder==0) { 1.7878 + if(tIndex==0) { 1.7879 + if(endOfSource) { 1.7880 + goto 
endOfSecLoop; 1.7881 + } else { 1.7882 + return UCOL_GREATER; 1.7883 + } 1.7884 + } 1.7885 + tChar=target[--tIndex]; 1.7886 + tOrder = elements[tChar]; 1.7887 + // don't even look for contractions 1.7888 + } 1.7889 + if(endOfSource) { 1.7890 + return UCOL_LESS; 1.7891 + } 1.7892 + 1.7893 + if(sOrder == tOrder) { 1.7894 + sOrder = 0; tOrder = 0; 1.7895 + continue; 1.7896 + } else { 1.7897 + // see the primary loop for comments 1.7898 + if(((sOrder^tOrder)&0xFF000000)!=0) { 1.7899 + if(sOrder < tOrder) { 1.7900 + return UCOL_LESS; 1.7901 + } else if(sOrder > tOrder) { 1.7902 + return UCOL_GREATER; 1.7903 + } 1.7904 + } 1.7905 + sOrder<<=8; 1.7906 + tOrder<<=8; 1.7907 + } 1.7908 + } 1.7909 + } 1.7910 + } 1.7911 + 1.7912 +endOfSecLoop: 1.7913 + if(strength >= UCOL_TERTIARY) { 1.7914 + // tertiary loop is the same as secondary (except no French) 1.7915 + elements += coll->latinOneTableLen; 1.7916 + sIndex = 0; tIndex = 0; 1.7917 + endOfSource = FALSE; 1.7918 + for(;;) { 1.7919 + while(sOrder==0) { 1.7920 + if(sIndex==sLen) { 1.7921 + endOfSource = TRUE; 1.7922 + break; 1.7923 + } 1.7924 + sChar=source[sIndex++]; 1.7925 + sOrder = elements[sChar]; 1.7926 + if(sOrder > UCOL_NOT_FOUND) { 1.7927 + sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 1.7928 + } 1.7929 + } 1.7930 + while(tOrder==0) { 1.7931 + if(tIndex==tLen) { 1.7932 + if(endOfSource) { 1.7933 + return UCOL_EQUAL; // if both strings are at the end, they are equal 1.7934 + } else { 1.7935 + return UCOL_GREATER; 1.7936 + } 1.7937 + } 1.7938 + tChar=target[tIndex++]; 1.7939 + tOrder = elements[tChar]; 1.7940 + if(tOrder > UCOL_NOT_FOUND) { 1.7941 + tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 1.7942 + } 1.7943 + } 1.7944 + if(endOfSource) { 1.7945 + return UCOL_LESS; 1.7946 + } 1.7947 + if(sOrder == tOrder) { 1.7948 + sOrder = 0; tOrder = 0; 1.7949 + continue; 1.7950 + } else { 1.7951 + if(((sOrder^tOrder)&0xff000000)!=0) { 1.7952 
+ if(sOrder < tOrder) { 1.7953 + return UCOL_LESS; 1.7954 + } else if(sOrder > tOrder) { 1.7955 + return UCOL_GREATER; 1.7956 + } 1.7957 + } 1.7958 + sOrder<<=8; 1.7959 + tOrder<<=8; 1.7960 + } 1.7961 + } 1.7962 + } 1.7963 + return UCOL_EQUAL; 1.7964 +} 1.7965 + 1.7966 +/* 1.7967 + Note: ucol_strcollUTF8 supports null terminated input. Calculating length of 1.7968 + null terminated input string takes extra amount of CPU cycles. 1.7969 +*/ 1.7970 +static UCollationResult 1.7971 +ucol_strcollRegularUTF8( 1.7972 + const UCollator *coll, 1.7973 + const char *source, 1.7974 + int32_t sourceLength, 1.7975 + const char *target, 1.7976 + int32_t targetLength, 1.7977 + UErrorCode *status) 1.7978 +{ 1.7979 + UCharIterator src; 1.7980 + UCharIterator tgt; 1.7981 + 1.7982 + uiter_setUTF8(&src, source, sourceLength); 1.7983 + uiter_setUTF8(&tgt, target, targetLength); 1.7984 + 1.7985 + // Preparing the context objects for iterating over strings 1.7986 + collIterate sColl, tColl; 1.7987 + IInit_collIterate(coll, NULL, -1, &sColl, status); 1.7988 + IInit_collIterate(coll, NULL, -1, &tColl, status); 1.7989 + if(U_FAILURE(*status)) { 1.7990 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 1.7991 + return UCOL_EQUAL; 1.7992 + } 1.7993 + // The division for the array length may truncate the array size to 1.7994 + // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 1.7995 + // for all platforms anyway. 
1.7996 + UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.7997 + UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.7998 + UNormIterator *sNormIter = NULL, *tNormIter = NULL; 1.7999 + 1.8000 + sColl.iterator = &src; 1.8001 + sColl.flags |= UCOL_USE_ITERATOR; 1.8002 + tColl.flags |= UCOL_USE_ITERATOR; 1.8003 + tColl.iterator = &tgt; 1.8004 + 1.8005 + if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 1.8006 + sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 1.8007 + sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); 1.8008 + sColl.flags &= ~UCOL_ITER_NORM; 1.8009 + 1.8010 + tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 1.8011 + tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); 1.8012 + tColl.flags &= ~UCOL_ITER_NORM; 1.8013 + } 1.8014 + 1.8015 + return ucol_strcollRegular(&sColl, &tColl, status); 1.8016 +} 1.8017 + 1.8018 +static inline uint32_t 1.8019 +ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, 1.8020 + uint32_t CE, const char *s, int32_t *index, int32_t len) 1.8021 +{ 1.8022 + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 1.8023 + int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 1.8024 + int32_t offset = 1; 1.8025 + UChar32 schar = 0, tchar = 0; 1.8026 + 1.8027 + for(;;) { 1.8028 + if (*index == len) { 1.8029 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.8030 + } 1.8031 + U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); 1.8032 + if (len < 0 && schar == 0) { 1.8033 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.8034 + } 1.8035 + 1.8036 + while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 1.8037 + offset++; 1.8038 + } 1.8039 + 1.8040 + if (schar == tchar) { 1.8041 + U8_FWD_1(s, *index, len); 
1.8042 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 1.8043 + } 1.8044 + else 1.8045 + { 1.8046 + if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 1.8047 + return UCOL_BAIL_OUT_CE; 1.8048 + } 1.8049 + // skip completely ignorables 1.8050 + uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 1.8051 + if(isZeroCE == 0) { // we have to ignore completely ignorables 1.8052 + U8_FWD_1(s, *index, len); 1.8053 + continue; 1.8054 + } 1.8055 + 1.8056 + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 1.8057 + } 1.8058 + } 1.8059 +} 1.8060 + 1.8061 +static inline UCollationResult 1.8062 +ucol_strcollUseLatin1UTF8( 1.8063 + const UCollator *coll, 1.8064 + const char *source, 1.8065 + int32_t sLen, 1.8066 + const char *target, 1.8067 + int32_t tLen, 1.8068 + UErrorCode *status) 1.8069 +{ 1.8070 + U_ALIGN_CODE(16); 1.8071 + int32_t strength = coll->strength; 1.8072 + 1.8073 + int32_t sIndex = 0, tIndex = 0; 1.8074 + UChar32 sChar = 0, tChar = 0; 1.8075 + uint32_t sOrder=0, tOrder=0; 1.8076 + 1.8077 + UBool endOfSource = FALSE; 1.8078 + 1.8079 + uint32_t *elements = coll->latinOneCEs; 1.8080 + 1.8081 + UBool haveContractions = FALSE; // if we have contractions in our string 1.8082 + // we cannot do French secondary 1.8083 + 1.8084 + // Do the primary level 1.8085 + for(;;) { 1.8086 + while(sOrder==0) { // this loop skips primary ignorables 1.8087 + // sOrder=getNextlatinOneCE(source); 1.8088 + if (sIndex == sLen) { 1.8089 + endOfSource = TRUE; 1.8090 + break; 1.8091 + } 1.8092 + U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); 1.8093 + if (sLen < 0 && sChar == 0) { 1.8094 + endOfSource = TRUE; 1.8095 + sLen = sIndex; 1.8096 + break; 1.8097 + } 1.8098 + if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 1.8099 + //fprintf(stderr, "R"); 1.8100 + return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 1.8101 + } 1.8102 + sOrder = 
/*
 * Fast-path comparison of two UTF-8 strings using the collator's Latin-1 CE
 * table (coll->latinOneCEs).
 *
 * The table is consulted one level at a time: the primary slice starts at
 * latinOneCEs, and `elements` is advanced by coll->latinOneTableLen for the
 * secondary and again for the tertiary level. At every level the CEs of both
 * strings are compared top byte first; when the top bytes agree both CEs are
 * shifted left by 8 bits and the next byte is compared.
 *
 * The function hands over to ucol_strcollRegularUTF8() when
 *  - a code point with bits above 0xFF is read during the primary pass,
 *  - a looked-up CE is still >= UCOL_NOT_FOUND after contraction handling, or
 *  - French secondary ordering is on and a contraction was seen.
 *
 * @param coll    collator to use
 * @param source  source string (UTF-8)
 * @param sLen    byte length of source, or negative if NUL-terminated
 * @param target  target string (UTF-8)
 * @param tLen    byte length of target, or negative if NUL-terminated
 * @param status  error code, only passed on to ucol_strcollRegularUTF8()
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;   // pending CE bytes; 0 means "fetch the next CE"

    UBool endOfSource = FALSE;

    uint32_t *elements = coll->latinOneCEs;   // slice of the table for the current level

    UBool haveContractions = FALSE; // if we have contractions in our string
    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if (sIndex == sLen) {
                endOfSource = TRUE;
                break;
            }
            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
            if (sLen < 0 && sChar == 0) {
                // Reached the terminator of a NUL-terminated source: remember the length.
                // NOTE(review): sIndex appears to have been advanced past the NUL already,
                // so the recorded length seems to include the terminator - confirm intended.
                endOfSource = TRUE;
                sLen = sIndex;
                break;
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if (tIndex == tLen) {
                if(endOfSource) {
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
            if (tLen < 0 && tChar == 0) {
                // Terminator of a NUL-terminated target.
                if(endOfSource) {
                    tLen = tIndex;
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32)
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // note: strictly greater here, whereas the primary pass tests >=
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // strength below tertiary: no difference was found on the levels examined
    return UCOL_EQUAL;
}
} 1.8392 + // The division for the array length may truncate the array size to 1.8393 + // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 1.8394 + // for all platforms anyway. 1.8395 + UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.8396 + UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 1.8397 + UNormIterator *sNormIter = NULL, *tNormIter = NULL; 1.8398 + 1.8399 + sColl.iterator = sIter; 1.8400 + sColl.flags |= UCOL_USE_ITERATOR; 1.8401 + tColl.flags |= UCOL_USE_ITERATOR; 1.8402 + tColl.iterator = tIter; 1.8403 + 1.8404 + if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 1.8405 + sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 1.8406 + sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 1.8407 + sColl.flags &= ~UCOL_ITER_NORM; 1.8408 + 1.8409 + tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 1.8410 + tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 1.8411 + tColl.flags &= ~UCOL_ITER_NORM; 1.8412 + } 1.8413 + 1.8414 + UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 1.8415 + 1.8416 + while((sChar = sColl.iterator->next(sColl.iterator)) == 1.8417 + (tChar = tColl.iterator->next(tColl.iterator))) { 1.8418 + if(sChar == U_SENTINEL) { 1.8419 + result = UCOL_EQUAL; 1.8420 + goto end_compare; 1.8421 + } 1.8422 + } 1.8423 + 1.8424 + if(sChar == U_SENTINEL) { 1.8425 + tChar = tColl.iterator->previous(tColl.iterator); 1.8426 + } 1.8427 + 1.8428 + if(tChar == U_SENTINEL) { 1.8429 + sChar = sColl.iterator->previous(sColl.iterator); 1.8430 + } 1.8431 + 1.8432 + sChar = sColl.iterator->previous(sColl.iterator); 1.8433 + tChar = tColl.iterator->previous(tColl.iterator); 1.8434 + 1.8435 + if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 1.8436 + { 1.8437 + // We are stopped in the middle of a contraction. 
1.8438 + // Scan backwards through the == part of the string looking for the start of the contraction. 1.8439 + // It doesn't matter which string we scan, since they are the same in this region. 1.8440 + do 1.8441 + { 1.8442 + sChar = sColl.iterator->previous(sColl.iterator); 1.8443 + tChar = tColl.iterator->previous(tColl.iterator); 1.8444 + } 1.8445 + while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 1.8446 + } 1.8447 + 1.8448 + 1.8449 + if(U_SUCCESS(*status)) { 1.8450 + result = ucol_strcollRegular(&sColl, &tColl, status); 1.8451 + } 1.8452 + 1.8453 +end_compare: 1.8454 + if(sNormIter || tNormIter) { 1.8455 + unorm_closeIter(sNormIter); 1.8456 + unorm_closeIter(tNormIter); 1.8457 + } 1.8458 + 1.8459 + UTRACE_EXIT_VALUE_STATUS(result, *status) 1.8460 + return result; 1.8461 +} 1.8462 + 1.8463 + 1.8464 +/* */ 1.8465 +/* ucol_strcoll Main public API string comparison function */ 1.8466 +/* */ 1.8467 +U_CAPI UCollationResult U_EXPORT2 1.8468 +ucol_strcoll( const UCollator *coll, 1.8469 + const UChar *source, 1.8470 + int32_t sourceLength, 1.8471 + const UChar *target, 1.8472 + int32_t targetLength) 1.8473 +{ 1.8474 + U_ALIGN_CODE(16); 1.8475 + 1.8476 + UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 1.8477 + if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 1.8478 + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 1.8479 + UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 1.8480 + UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 1.8481 + } 1.8482 + 1.8483 + if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { 1.8484 + // do not crash, but return. Should have 1.8485 + // status argument to return error. 1.8486 + UTRACE_EXIT_VALUE(UCOL_EQUAL); 1.8487 + return UCOL_EQUAL; 1.8488 + } 1.8489 + 1.8490 + /* Quick check if source and target are same strings. */ 1.8491 + /* They should either both be NULL terminated or the explicit length should be set on both. 
*/ 1.8492 + if (source==target && sourceLength==targetLength) { 1.8493 + UTRACE_EXIT_VALUE(UCOL_EQUAL); 1.8494 + return UCOL_EQUAL; 1.8495 + } 1.8496 + 1.8497 + if(coll->delegate != NULL) { 1.8498 + UErrorCode status = U_ZERO_ERROR; 1.8499 + return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status); 1.8500 + } 1.8501 + 1.8502 + /* Scan the strings. Find: */ 1.8503 + /* The length of any leading portion that is equal */ 1.8504 + /* Whether they are exactly equal. (in which case we just return) */ 1.8505 + const UChar *pSrc = source; 1.8506 + const UChar *pTarg = target; 1.8507 + int32_t equalLength; 1.8508 + 1.8509 + if (sourceLength == -1 && targetLength == -1) { 1.8510 + // Both strings are null terminated. 1.8511 + // Scan through any leading equal portion. 1.8512 + while (*pSrc == *pTarg && *pSrc != 0) { 1.8513 + pSrc++; 1.8514 + pTarg++; 1.8515 + } 1.8516 + if (*pSrc == 0 && *pTarg == 0) { 1.8517 + UTRACE_EXIT_VALUE(UCOL_EQUAL); 1.8518 + return UCOL_EQUAL; 1.8519 + } 1.8520 + equalLength = (int32_t)(pSrc - source); 1.8521 + } 1.8522 + else 1.8523 + { 1.8524 + // One or both strings has an explicit length. 1.8525 + const UChar *pSrcEnd = source + sourceLength; 1.8526 + const UChar *pTargEnd = target + targetLength; 1.8527 + 1.8528 + // Scan while the strings are bitwise ==, or until one is exhausted. 1.8529 + for (;;) { 1.8530 + if (pSrc == pSrcEnd || pTarg == pTargEnd) { 1.8531 + break; 1.8532 + } 1.8533 + if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 1.8534 + break; 1.8535 + } 1.8536 + if (*pSrc != *pTarg) { 1.8537 + break; 1.8538 + } 1.8539 + pSrc++; 1.8540 + pTarg++; 1.8541 + } 1.8542 + equalLength = (int32_t)(pSrc - source); 1.8543 + 1.8544 + // If we made it all the way through both strings, we are done. They are == 1.8545 + if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 1.8546 + (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 1.8547 + { 1.8548 + UTRACE_EXIT_VALUE(UCOL_EQUAL); 1.8549 + return UCOL_EQUAL; 1.8550 + } 1.8551 + } 1.8552 + if (equalLength > 0) { 1.8553 + /* There is an identical portion at the beginning of the two strings. */ 1.8554 + /* If the identical portion ends within a contraction or a comibining */ 1.8555 + /* character sequence, back up to the start of that sequence. */ 1.8556 + 1.8557 + // These values should already be set by the code above. 1.8558 + //pSrc = source + equalLength; /* point to the first differing chars */ 1.8559 + //pTarg = target + equalLength; 1.8560 + if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 1.8561 + (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 1.8562 + { 1.8563 + // We are stopped in the middle of a contraction. 1.8564 + // Scan backwards through the == part of the string looking for the start of the contraction. 1.8565 + // It doesn't matter which string we scan, since they are the same in this region. 
1.8566 + do 1.8567 + { 1.8568 + equalLength--; 1.8569 + pSrc--; 1.8570 + } 1.8571 + while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 1.8572 + } 1.8573 + 1.8574 + source += equalLength; 1.8575 + target += equalLength; 1.8576 + if (sourceLength > 0) { 1.8577 + sourceLength -= equalLength; 1.8578 + } 1.8579 + if (targetLength > 0) { 1.8580 + targetLength -= equalLength; 1.8581 + } 1.8582 + } 1.8583 + 1.8584 + UErrorCode status = U_ZERO_ERROR; 1.8585 + UCollationResult returnVal; 1.8586 + if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 1.8587 + returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 1.8588 + } else { 1.8589 + returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 1.8590 + } 1.8591 + UTRACE_EXIT_VALUE(returnVal); 1.8592 + return returnVal; 1.8593 +} 1.8594 + 1.8595 +U_CAPI UCollationResult U_EXPORT2 1.8596 +ucol_strcollUTF8( 1.8597 + const UCollator *coll, 1.8598 + const char *source, 1.8599 + int32_t sourceLength, 1.8600 + const char *target, 1.8601 + int32_t targetLength, 1.8602 + UErrorCode *status) 1.8603 +{ 1.8604 + U_ALIGN_CODE(16); 1.8605 + 1.8606 + UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); 1.8607 + if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 1.8608 + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 1.8609 + UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength); 1.8610 + UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength); 1.8611 + } 1.8612 + 1.8613 + if (U_FAILURE(*status)) { 1.8614 + /* do nothing */ 1.8615 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 1.8616 + return UCOL_EQUAL; 1.8617 + } 1.8618 + 1.8619 + if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { 1.8620 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.8621 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 1.8622 + return UCOL_EQUAL; 1.8623 + } 1.8624 + 
1.8625 + /* Quick check if source and target are same strings. */ 1.8626 + /* They should either both be NULL terminated or the explicit length should be set on both. */ 1.8627 + if (source==target && sourceLength==targetLength) { 1.8628 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 1.8629 + return UCOL_EQUAL; 1.8630 + } 1.8631 + 1.8632 + if(coll->delegate != NULL) { 1.8633 + return ((const Collator*)coll->delegate)->compareUTF8( 1.8634 + StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength), 1.8635 + StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength), 1.8636 + *status); 1.8637 + } 1.8638 + 1.8639 + /* Scan the strings. Find: */ 1.8640 + /* The length of any leading portion that is equal */ 1.8641 + /* Whether they are exactly equal. (in which case we just return) */ 1.8642 + const char *pSrc = source; 1.8643 + const char *pTarg = target; 1.8644 + UBool bSrcLimit = FALSE; 1.8645 + UBool bTargLimit = FALSE; 1.8646 + 1.8647 + if (sourceLength == -1 && targetLength == -1) { 1.8648 + // Both strings are null terminated. 1.8649 + // Scan through any leading equal portion. 1.8650 + while (*pSrc == *pTarg && *pSrc != 0) { 1.8651 + pSrc++; 1.8652 + pTarg++; 1.8653 + } 1.8654 + if (*pSrc == 0 && *pTarg == 0) { 1.8655 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 1.8656 + return UCOL_EQUAL; 1.8657 + } 1.8658 + bSrcLimit = (*pSrc == 0); 1.8659 + bTargLimit = (*pTarg == 0); 1.8660 + } 1.8661 + else 1.8662 + { 1.8663 + // One or both strings has an explicit length. 1.8664 + const char *pSrcEnd = source + sourceLength; 1.8665 + const char *pTargEnd = target + targetLength; 1.8666 + 1.8667 + // Scan while the strings are bitwise ==, or until one is exhausted. 
1.8668 + for (;;) { 1.8669 + if (pSrc == pSrcEnd || pTarg == pTargEnd) { 1.8670 + break; 1.8671 + } 1.8672 + if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 1.8673 + break; 1.8674 + } 1.8675 + if (*pSrc != *pTarg) { 1.8676 + break; 1.8677 + } 1.8678 + pSrc++; 1.8679 + pTarg++; 1.8680 + } 1.8681 + bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); 1.8682 + bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); 1.8683 + 1.8684 + // If we made it all the way through both strings, we are done. They are == 1.8685 + if (bSrcLimit && /* At end of src string, however it was specified. */ 1.8686 + bTargLimit) /* and also at end of dest string */ 1.8687 + { 1.8688 + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 1.8689 + return UCOL_EQUAL; 1.8690 + } 1.8691 + } 1.8692 + 1.8693 + U_ASSERT(!(bSrcLimit && bTargLimit)); 1.8694 + 1.8695 + int32_t equalLength = pSrc - source; 1.8696 + UBool bSawNonLatin1 = FALSE; 1.8697 + 1.8698 + if (equalLength > 0) { 1.8699 + // Align position to the start of UTF-8 code point. 1.8700 + if (bTargLimit) { 1.8701 + U8_SET_CP_START((const uint8_t*)source, 0, equalLength); 1.8702 + } else { 1.8703 + U8_SET_CP_START((const uint8_t*)target, 0, equalLength); 1.8704 + } 1.8705 + pSrc = source + equalLength; 1.8706 + pTarg = target + equalLength; 1.8707 + } 1.8708 + 1.8709 + if (equalLength > 0) { 1.8710 + /* There is an identical portion at the beginning of the two strings. */ 1.8711 + /* If the identical portion ends within a contraction or a comibining */ 1.8712 + /* character sequence, back up to the start of that sequence. 
*/ 1.8713 + UBool bUnsafeCP = FALSE; 1.8714 + UChar32 uc32 = -1; 1.8715 + 1.8716 + if (!bSrcLimit) { 1.8717 + U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32); 1.8718 + if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 1.8719 + bUnsafeCP = TRUE; 1.8720 + } 1.8721 + bSawNonLatin1 |= (uc32 > 0xff); 1.8722 + } 1.8723 + if (!bTargLimit) { 1.8724 + U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32); 1.8725 + if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 1.8726 + bUnsafeCP = TRUE; 1.8727 + } 1.8728 + bSawNonLatin1 |= (uc32 > 0xff); 1.8729 + } 1.8730 + 1.8731 + if (bUnsafeCP) { 1.8732 + while (equalLength > 0) { 1.8733 + // We are stopped in the middle of a contraction. 1.8734 + // Scan backwards through the == part of the string looking for the start of the contraction. 1.8735 + // It doesn't matter which string we scan, since they are the same in this region. 1.8736 + U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); 1.8737 + bSawNonLatin1 |= (uc32 > 0xff); 1.8738 + if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { 1.8739 + break; 1.8740 + } 1.8741 + } 1.8742 + } 1.8743 + source += equalLength; 1.8744 + target += equalLength; 1.8745 + if (sourceLength > 0) { 1.8746 + sourceLength -= equalLength; 1.8747 + } 1.8748 + if (targetLength > 0) { 1.8749 + targetLength -= equalLength; 1.8750 + } 1.8751 + } else { 1.8752 + // Lead byte of Latin 1 character is 0x00 - 0xC3 1.8753 + bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3); 1.8754 + bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3); 1.8755 + } 1.8756 + 1.8757 + UCollationResult returnVal; 1.8758 + 1.8759 + if(!coll->latinOneUse || bSawNonLatin1) { 1.8760 + returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status); 1.8761 + } else { 1.8762 + returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status); 1.8763 + } 
1.8764 + UTRACE_EXIT_VALUE_STATUS(returnVal, *status); 1.8765 + return returnVal; 1.8766 +} 1.8767 + 1.8768 + 1.8769 +/* convenience function for comparing strings */ 1.8770 +U_CAPI UBool U_EXPORT2 1.8771 +ucol_greater( const UCollator *coll, 1.8772 + const UChar *source, 1.8773 + int32_t sourceLength, 1.8774 + const UChar *target, 1.8775 + int32_t targetLength) 1.8776 +{ 1.8777 + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 1.8778 + == UCOL_GREATER); 1.8779 +} 1.8780 + 1.8781 +/* convenience function for comparing strings */ 1.8782 +U_CAPI UBool U_EXPORT2 1.8783 +ucol_greaterOrEqual( const UCollator *coll, 1.8784 + const UChar *source, 1.8785 + int32_t sourceLength, 1.8786 + const UChar *target, 1.8787 + int32_t targetLength) 1.8788 +{ 1.8789 + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 1.8790 + != UCOL_LESS); 1.8791 +} 1.8792 + 1.8793 +/* convenience function for comparing strings */ 1.8794 +U_CAPI UBool U_EXPORT2 1.8795 +ucol_equal( const UCollator *coll, 1.8796 + const UChar *source, 1.8797 + int32_t sourceLength, 1.8798 + const UChar *target, 1.8799 + int32_t targetLength) 1.8800 +{ 1.8801 + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 1.8802 + == UCOL_EQUAL); 1.8803 +} 1.8804 + 1.8805 +U_CAPI void U_EXPORT2 1.8806 +ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 1.8807 + if(coll && coll->UCA) { 1.8808 + uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 1.8809 + } 1.8810 +} 1.8811 + 1.8812 +#endif /* #if !UCONFIG_NO_COLLATION */