intl/icu/source/i18n/ucol.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 1996-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: ucol.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * Modification history
michael@0 12 * Date Name Comments
michael@0 13 * 1996-1999 various members of ICU team maintained C API for collation framework
michael@0 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
michael@0 15 * 03/01/2001 synwee Added maxexpansion functionality.
michael@0 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
michael@0 17 */
michael@0 18
michael@0 19 #include "unicode/utypes.h"
michael@0 20
michael@0 21 #if !UCONFIG_NO_COLLATION
michael@0 22
michael@0 23 #include "unicode/bytestream.h"
michael@0 24 #include "unicode/coleitr.h"
michael@0 25 #include "unicode/unorm.h"
michael@0 26 #include "unicode/udata.h"
michael@0 27 #include "unicode/ustring.h"
michael@0 28 #include "unicode/utf8.h"
michael@0 29
michael@0 30 #include "ucol_imp.h"
michael@0 31 #include "bocsu.h"
michael@0 32
michael@0 33 #include "normalizer2impl.h"
michael@0 34 #include "unorm_it.h"
michael@0 35 #include "umutex.h"
michael@0 36 #include "cmemory.h"
michael@0 37 #include "ucln_in.h"
michael@0 38 #include "cstring.h"
michael@0 39 #include "utracimp.h"
michael@0 40 #include "putilimp.h"
michael@0 41 #include "uassert.h"
michael@0 42 #include "unicode/coll.h"
michael@0 43
michael@0 44 #ifdef UCOL_DEBUG
michael@0 45 #include <stdio.h>
michael@0 46 #endif
michael@0 47
michael@0 48 U_NAMESPACE_USE
michael@0 49
michael@0 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 51
michael@0 52 #define LAST_BYTE_MASK_ 0xFF
michael@0 53 #define SECOND_LAST_BYTE_SHIFT_ 8
michael@0 54
michael@0 55 #define ZERO_CC_LIMIT_ 0xC0
michael@0 56
michael@0 57 // These are static pointers to the NFC/NFD implementation instance.
michael@0 58 // Each of them is always the same between calls to u_cleanup
michael@0 59 // and therefore writing to it is not synchronized.
michael@0 60 // They are cleaned in ucol_cleanup
michael@0 61 static const Normalizer2 *g_nfd = NULL;
michael@0 62 static const Normalizer2Impl *g_nfcImpl = NULL;
michael@0 63
michael@0 64 // These are values from UCA required for
michael@0 65 // implicit generation and suppressing sort key compression
michael@0 66 // they should regularly be in the UCA, but if one
michael@0 67 // is running without UCA, it could be a problem
michael@0 68 static const int32_t maxRegularPrimary = 0x7A;
michael@0 69 static const int32_t minImplicitPrimary = 0xE0;
michael@0 70 static const int32_t maxImplicitPrimary = 0xE4;
michael@0 71
michael@0 72 U_CDECL_BEGIN
michael@0 73 static UBool U_CALLCONV
michael@0 74 ucol_cleanup(void)
michael@0 75 {
michael@0 76 g_nfd = NULL;
michael@0 77 g_nfcImpl = NULL;
michael@0 78 return TRUE;
michael@0 79 }
michael@0 80
michael@0 81 static int32_t U_CALLCONV
michael@0 82 _getFoldingOffset(uint32_t data) {
michael@0 83 return (int32_t)(data&0xFFFFFF);
michael@0 84 }
michael@0 85
michael@0 86 U_CDECL_END
michael@0 87
michael@0 88 static inline
michael@0 89 UBool initializeNFD(UErrorCode *status) {
michael@0 90 if (g_nfd != NULL) {
michael@0 91 return TRUE;
michael@0 92 } else {
michael@0 93 // The result is constant, until the library is reloaded.
michael@0 94 g_nfd = Normalizer2Factory::getNFDInstance(*status);
michael@0 95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
michael@0 96 return U_SUCCESS(*status);
michael@0 97 }
michael@0 98 }
michael@0 99
michael@0 100 // init FCD data
michael@0 101 static inline
michael@0 102 UBool initializeFCD(UErrorCode *status) {
michael@0 103 if (g_nfcImpl != NULL) {
michael@0 104 return TRUE;
michael@0 105 } else {
michael@0 106 // The result is constant, until the library is reloaded.
michael@0 107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
michael@0 108 // Note: Alternatively, we could also store this pointer in each collIterate struct,
michael@0 109 // same as Normalizer2Factory::getImpl(collIterate->nfd).
michael@0 110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
michael@0 111 return U_SUCCESS(*status);
michael@0 112 }
michael@0 113 }
michael@0 114
michael@0 115 static
michael@0 116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
michael@0 117 int32_t sourceLen, collIterate *s,
michael@0 118 UErrorCode *status)
michael@0 119 {
michael@0 120 (s)->string = (s)->pos = sourceString;
michael@0 121 (s)->origFlags = 0;
michael@0 122 (s)->flags = 0;
michael@0 123 if (sourceLen >= 0) {
michael@0 124 s->flags |= UCOL_ITER_HASLEN;
michael@0 125 (s)->endp = (UChar *)sourceString+sourceLen;
michael@0 126 }
michael@0 127 else {
michael@0 128 /* change to enable easier checking for end of string for fcdpositon */
michael@0 129 (s)->endp = NULL;
michael@0 130 }
michael@0 131 (s)->extendCEs = NULL;
michael@0 132 (s)->extendCEsSize = 0;
michael@0 133 (s)->CEpos = (s)->toReturn = (s)->CEs;
michael@0 134 (s)->offsetBuffer = NULL;
michael@0 135 (s)->offsetBufferSize = 0;
michael@0 136 (s)->offsetReturn = (s)->offsetStore = NULL;
michael@0 137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
michael@0 138 (s)->coll = (collator);
michael@0 139 if (initializeNFD(status)) {
michael@0 140 (s)->nfd = g_nfd;
michael@0 141 } else {
michael@0 142 return;
michael@0 143 }
michael@0 144 (s)->fcdPosition = 0;
michael@0 145 if(collator->normalizationMode == UCOL_ON) {
michael@0 146 (s)->flags |= UCOL_ITER_NORM;
michael@0 147 }
michael@0 148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
michael@0 149 (s)->flags |= UCOL_HIRAGANA_Q;
michael@0 150 }
michael@0 151 (s)->iterator = NULL;
michael@0 152 //(s)->iteratorIndex = 0;
michael@0 153 }
michael@0 154
michael@0 155 U_CAPI void U_EXPORT2
michael@0 156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
michael@0 157 int32_t sourceLen, collIterate *s,
michael@0 158 UErrorCode *status) {
michael@0 159 /* Out-of-line version for use from other files. */
michael@0 160 IInit_collIterate(collator, sourceString, sourceLen, s, status);
michael@0 161 }
michael@0 162
michael@0 163 U_CAPI collIterate * U_EXPORT2
michael@0 164 uprv_new_collIterate(UErrorCode *status) {
michael@0 165 if(U_FAILURE(*status)) {
michael@0 166 return NULL;
michael@0 167 }
michael@0 168 collIterate *s = new collIterate;
michael@0 169 if(s == NULL) {
michael@0 170 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 171 return NULL;
michael@0 172 }
michael@0 173 return s;
michael@0 174 }
michael@0 175
michael@0 176 U_CAPI void U_EXPORT2
michael@0 177 uprv_delete_collIterate(collIterate *s) {
michael@0 178 delete s;
michael@0 179 }
michael@0 180
michael@0 181 U_CAPI UBool U_EXPORT2
michael@0 182 uprv_collIterateAtEnd(collIterate *s) {
michael@0 183 return s == NULL || s->pos == s->endp;
michael@0 184 }
michael@0 185
michael@0 186 /**
michael@0 187 * Backup the state of the collIterate struct data
michael@0 188 * @param data collIterate to backup
michael@0 189 * @param backup storage
michael@0 190 */
michael@0 191 static
michael@0 192 inline void backupState(const collIterate *data, collIterateState *backup)
michael@0 193 {
michael@0 194 backup->fcdPosition = data->fcdPosition;
michael@0 195 backup->flags = data->flags;
michael@0 196 backup->origFlags = data->origFlags;
michael@0 197 backup->pos = data->pos;
michael@0 198 backup->bufferaddress = data->writableBuffer.getBuffer();
michael@0 199 backup->buffersize = data->writableBuffer.length();
michael@0 200 backup->iteratorMove = 0;
michael@0 201 backup->iteratorIndex = 0;
michael@0 202 if(data->iterator != NULL) {
michael@0 203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
michael@0 204 backup->iteratorIndex = data->iterator->getState(data->iterator);
michael@0 205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
michael@0 206 if(backup->iteratorIndex == UITER_NO_STATE) {
michael@0 207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
michael@0 208 backup->iteratorMove++;
michael@0 209 data->iterator->move(data->iterator, -1, UITER_CURRENT);
michael@0 210 }
michael@0 211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
michael@0 212 }
michael@0 213 }
michael@0 214 }
michael@0 215
michael@0 216 /**
michael@0 217 * Loads the state into the collIterate struct data
michael@0 218 * @param data collIterate to backup
michael@0 219 * @param backup storage
michael@0 220 * @param forwards boolean to indicate if forwards iteration is used,
michael@0 221 * false indicates backwards iteration
michael@0 222 */
michael@0 223 static
michael@0 224 inline void loadState(collIterate *data, const collIterateState *backup,
michael@0 225 UBool forwards)
michael@0 226 {
michael@0 227 UErrorCode status = U_ZERO_ERROR;
michael@0 228 data->flags = backup->flags;
michael@0 229 data->origFlags = backup->origFlags;
michael@0 230 if(data->iterator != NULL) {
michael@0 231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
michael@0 232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
michael@0 233 if(backup->iteratorMove != 0) {
michael@0 234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
michael@0 235 }
michael@0 236 }
michael@0 237 data->pos = backup->pos;
michael@0 238
michael@0 239 if ((data->flags & UCOL_ITER_INNORMBUF) &&
michael@0 240 data->writableBuffer.getBuffer() != backup->bufferaddress) {
michael@0 241 /*
michael@0 242 this is when a new buffer has been reallocated and we'll have to
michael@0 243 calculate the new position.
michael@0 244 note the new buffer has to contain the contents of the old buffer.
michael@0 245 */
michael@0 246 if (forwards) {
michael@0 247 data->pos = data->writableBuffer.getTerminatedBuffer() +
michael@0 248 (data->pos - backup->bufferaddress);
michael@0 249 }
michael@0 250 else {
michael@0 251 /* backwards direction */
michael@0 252 int32_t temp = backup->buffersize -
michael@0 253 (int32_t)(data->pos - backup->bufferaddress);
michael@0 254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
michael@0 255 }
michael@0 256 }
michael@0 257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 258 /*
michael@0 259 this is alittle tricky.
michael@0 260 if we are initially not in the normalization buffer, even if we
michael@0 261 normalize in the later stage, the data in the buffer will be
michael@0 262 ignored, since we skip back up to the data string.
michael@0 263 however if we are already in the normalization buffer, any
michael@0 264 further normalization will pull data into the normalization
michael@0 265 buffer and modify the fcdPosition.
michael@0 266 since we are keeping the data in the buffer for use, the
michael@0 267 fcdPosition can not be reverted back.
michael@0 268 arrgghh....
michael@0 269 */
michael@0 270 data->fcdPosition = backup->fcdPosition;
michael@0 271 }
michael@0 272 }
michael@0 273
michael@0 274 static UBool
michael@0 275 reallocCEs(collIterate *data, int32_t newCapacity) {
michael@0 276 uint32_t *oldCEs = data->extendCEs;
michael@0 277 if(oldCEs == NULL) {
michael@0 278 oldCEs = data->CEs;
michael@0 279 }
michael@0 280 int32_t length = data->CEpos - oldCEs;
michael@0 281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
michael@0 282 if(newCEs == NULL) {
michael@0 283 return FALSE;
michael@0 284 }
michael@0 285 uprv_memcpy(newCEs, oldCEs, length * 4);
michael@0 286 uprv_free(data->extendCEs);
michael@0 287 data->extendCEs = newCEs;
michael@0 288 data->extendCEsSize = newCapacity;
michael@0 289 data->CEpos = newCEs + length;
michael@0 290 return TRUE;
michael@0 291 }
michael@0 292
michael@0 293 static UBool
michael@0 294 increaseCEsCapacity(collIterate *data) {
michael@0 295 int32_t oldCapacity;
michael@0 296 if(data->extendCEs != NULL) {
michael@0 297 oldCapacity = data->extendCEsSize;
michael@0 298 } else {
michael@0 299 oldCapacity = LENGTHOF(data->CEs);
michael@0 300 }
michael@0 301 return reallocCEs(data, 2 * oldCapacity);
michael@0 302 }
michael@0 303
michael@0 304 static UBool
michael@0 305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
michael@0 306 int32_t oldCapacity;
michael@0 307 if(data->extendCEs != NULL) {
michael@0 308 oldCapacity = data->extendCEsSize;
michael@0 309 } else {
michael@0 310 oldCapacity = LENGTHOF(data->CEs);
michael@0 311 }
michael@0 312 if(minCapacity <= oldCapacity) {
michael@0 313 return TRUE;
michael@0 314 }
michael@0 315 oldCapacity *= 2;
michael@0 316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
michael@0 317 }
michael@0 318
michael@0 319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
michael@0 320 if(U_FAILURE(errorCode)) {
michael@0 321 return;
michael@0 322 }
michael@0 323 int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
michael@0 324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
michael@0 325 if(length >= offsetBufferSize) {
michael@0 326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
michael@0 327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
michael@0 328 if(newBuffer == NULL) {
michael@0 329 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 330 return;
michael@0 331 }
michael@0 332 if(length > 0) {
michael@0 333 uprv_memcpy(newBuffer, offsetBuffer, length * 4);
michael@0 334 }
michael@0 335 uprv_free(offsetBuffer);
michael@0 336 offsetBuffer = newBuffer;
michael@0 337 offsetStore = offsetBuffer + length;
michael@0 338 offsetBufferSize = newCapacity;
michael@0 339 }
michael@0 340 *offsetStore++ = offset;
michael@0 341 }
michael@0 342
michael@0 343 /*
michael@0 344 * collIter_eos()
michael@0 345 * Checks for a collIterate being positioned at the end of
michael@0 346 * its source string.
michael@0 347 *
michael@0 348 */
michael@0 349 static
michael@0 350 inline UBool collIter_eos(collIterate *s) {
michael@0 351 if(s->flags & UCOL_USE_ITERATOR) {
michael@0 352 return !(s->iterator->hasNext(s->iterator));
michael@0 353 }
michael@0 354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
michael@0 355 // Null terminated string, but not at null, so not at end.
michael@0 356 // Whether in main or normalization buffer doesn't matter.
michael@0 357 return FALSE;
michael@0 358 }
michael@0 359
michael@0 360 // String with length. Can't be in normalization buffer, which is always
michael@0 361 // null termintated.
michael@0 362 if (s->flags & UCOL_ITER_HASLEN) {
michael@0 363 return (s->pos == s->endp);
michael@0 364 }
michael@0 365
michael@0 366 // We are at a null termination, could be either normalization buffer or main string.
michael@0 367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 368 // At null at end of main string.
michael@0 369 return TRUE;
michael@0 370 }
michael@0 371
michael@0 372 // At null at end of normalization buffer. Need to check whether there there are
michael@0 373 // any characters left in the main buffer.
michael@0 374 if(s->origFlags & UCOL_USE_ITERATOR) {
michael@0 375 return !(s->iterator->hasNext(s->iterator));
michael@0 376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
michael@0 377 // Null terminated main string. fcdPosition is the 'return' position into main buf.
michael@0 378 return (*s->fcdPosition == 0);
michael@0 379 }
michael@0 380 else {
michael@0 381 // Main string with an end pointer.
michael@0 382 return s->fcdPosition == s->endp;
michael@0 383 }
michael@0 384 }
michael@0 385
michael@0 386 /*
michael@0 387 * collIter_bos()
michael@0 388 * Checks for a collIterate being positioned at the start of
michael@0 389 * its source string.
michael@0 390 *
michael@0 391 */
michael@0 392 static
michael@0 393 inline UBool collIter_bos(collIterate *source) {
michael@0 394 // if we're going backwards, we need to know whether there is more in the
michael@0 395 // iterator, even if we are in the side buffer
michael@0 396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
michael@0 397 return !source->iterator->hasPrevious(source->iterator);
michael@0 398 }
michael@0 399 if (source->pos <= source->string ||
michael@0 400 ((source->flags & UCOL_ITER_INNORMBUF) &&
michael@0 401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
michael@0 402 return TRUE;
michael@0 403 }
michael@0 404 return FALSE;
michael@0 405 }
michael@0 406
michael@0 407 /*static
michael@0 408 inline UBool collIter_SimpleBos(collIterate *source) {
michael@0 409 // if we're going backwards, we need to know whether there is more in the
michael@0 410 // iterator, even if we are in the side buffer
michael@0 411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
michael@0 412 return !source->iterator->hasPrevious(source->iterator);
michael@0 413 }
michael@0 414 if (source->pos == source->string) {
michael@0 415 return TRUE;
michael@0 416 }
michael@0 417 return FALSE;
michael@0 418 }*/
michael@0 419 //return (data->pos == data->string) ||
michael@0 420
michael@0 421
michael@0 422 /****************************************************************************/
michael@0 423 /* Following are the open/close functions */
michael@0 424 /* */
michael@0 425 /****************************************************************************/
michael@0 426
michael@0 427 static UCollator*
michael@0 428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
michael@0 429 const UCollator *base,
michael@0 430 UCollator *fillIn,
michael@0 431 UErrorCode *status)
michael@0 432 {
michael@0 433 UCollator *result = fillIn;
michael@0 434 if(U_FAILURE(*status)) {
michael@0 435 return NULL;
michael@0 436 }
michael@0 437 /*
michael@0 438 if(base == NULL) {
michael@0 439 // we don't support null base yet
michael@0 440 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 441 return NULL;
michael@0 442 }
michael@0 443 */
michael@0 444 // We need these and we could be running without UCA
michael@0 445 uprv_uca_initImplicitConstants(status);
michael@0 446 UCATableHeader *colData = (UCATableHeader *)bin;
michael@0 447 // do we want version check here? We're trying to figure out whether collators are compatible
michael@0 448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
michael@0 449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
michael@0 450 colData->version[0] != UCOL_BUILDER_VERSION)
michael@0 451 {
michael@0 452 *status = U_COLLATOR_VERSION_MISMATCH;
michael@0 453 return NULL;
michael@0 454 }
michael@0 455 else {
michael@0 456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
michael@0 457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
michael@0 458 if(U_FAILURE(*status)){
michael@0 459 return NULL;
michael@0 460 }
michael@0 461 result->hasRealData = TRUE;
michael@0 462 }
michael@0 463 else {
michael@0 464 if(base) {
michael@0 465 result = ucol_initCollator(base->image, result, base, status);
michael@0 466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
michael@0 467 if(U_FAILURE(*status)){
michael@0 468 return NULL;
michael@0 469 }
michael@0 470 result->hasRealData = FALSE;
michael@0 471 }
michael@0 472 else {
michael@0 473 *status = U_USELESS_COLLATOR_ERROR;
michael@0 474 return NULL;
michael@0 475 }
michael@0 476 }
michael@0 477 result->freeImageOnClose = FALSE;
michael@0 478 }
michael@0 479 result->actualLocale = NULL;
michael@0 480 result->validLocale = NULL;
michael@0 481 result->requestedLocale = NULL;
michael@0 482 result->rules = NULL;
michael@0 483 result->rulesLength = 0;
michael@0 484 result->freeRulesOnClose = FALSE;
michael@0 485 result->ucaRules = NULL;
michael@0 486 return result;
michael@0 487 }
michael@0 488
michael@0 489 U_CAPI UCollator* U_EXPORT2
michael@0 490 ucol_openBinary(const uint8_t *bin, int32_t length,
michael@0 491 const UCollator *base,
michael@0 492 UErrorCode *status)
michael@0 493 {
michael@0 494 return ucol_initFromBinary(bin, length, base, NULL, status);
michael@0 495 }
michael@0 496
michael@0 497 U_CAPI int32_t U_EXPORT2
michael@0 498 ucol_cloneBinary(const UCollator *coll,
michael@0 499 uint8_t *buffer, int32_t capacity,
michael@0 500 UErrorCode *status)
michael@0 501 {
michael@0 502 int32_t length = 0;
michael@0 503 if(U_FAILURE(*status)) {
michael@0 504 return length;
michael@0 505 }
michael@0 506 if(capacity < 0) {
michael@0 507 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 508 return length;
michael@0 509 }
michael@0 510 if(coll->hasRealData == TRUE) {
michael@0 511 length = coll->image->size;
michael@0 512 if(length <= capacity) {
michael@0 513 uprv_memcpy(buffer, coll->image, length);
michael@0 514 } else {
michael@0 515 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 516 }
michael@0 517 } else {
michael@0 518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
michael@0 519 if(length <= capacity) {
michael@0 520 /* build the UCATableHeader with minimal entries */
michael@0 521 /* do not copy the header from the UCA file because its values are wrong! */
michael@0 522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
michael@0 523
michael@0 524 /* reset everything */
michael@0 525 uprv_memset(buffer, 0, length);
michael@0 526
michael@0 527 /* set the tailoring-specific values */
michael@0 528 UCATableHeader *myData = (UCATableHeader *)buffer;
michael@0 529 myData->size = length;
michael@0 530
michael@0 531 /* offset for the options, the only part of the data that is present after the header */
michael@0 532 myData->options = sizeof(UCATableHeader);
michael@0 533
michael@0 534 /* need to always set the expansion value for an upper bound of the options */
michael@0 535 myData->expansion = myData->options + sizeof(UColOptionSet);
michael@0 536
michael@0 537 myData->magic = UCOL_HEADER_MAGIC;
michael@0 538 myData->isBigEndian = U_IS_BIG_ENDIAN;
michael@0 539 myData->charSetFamily = U_CHARSET_FAMILY;
michael@0 540
michael@0 541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
michael@0 542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
michael@0 543
michael@0 544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
michael@0 545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
michael@0 546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
michael@0 547 myData->jamoSpecial = coll->image->jamoSpecial;
michael@0 548
michael@0 549 /* copy the collator options */
michael@0 550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
michael@0 551 } else {
michael@0 552 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 553 }
michael@0 554 }
michael@0 555 return length;
michael@0 556 }
michael@0 557
michael@0 558 U_CAPI UCollator* U_EXPORT2
michael@0 559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
michael@0 560 {
michael@0 561 UCollator * localCollator;
michael@0 562 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
michael@0 563 int32_t imageSize = 0;
michael@0 564 int32_t rulesSize = 0;
michael@0 565 int32_t rulesPadding = 0;
michael@0 566 int32_t defaultReorderCodesSize = 0;
michael@0 567 int32_t reorderCodesSize = 0;
michael@0 568 uint8_t *image;
michael@0 569 UChar *rules;
michael@0 570 int32_t* defaultReorderCodes;
michael@0 571 int32_t* reorderCodes;
michael@0 572 uint8_t* leadBytePermutationTable;
michael@0 573 UBool imageAllocated = FALSE;
michael@0 574
michael@0 575 if (status == NULL || U_FAILURE(*status)){
michael@0 576 return NULL;
michael@0 577 }
michael@0 578 if (coll == NULL) {
michael@0 579 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 580 return NULL;
michael@0 581 }
michael@0 582
michael@0 583 if (coll->rules && coll->freeRulesOnClose) {
michael@0 584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
michael@0 585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
michael@0 586 bufferSizeNeeded += rulesSize + rulesPadding;
michael@0 587 }
michael@0 588 // no padding for alignment needed from here since the next two are 4 byte quantities
michael@0 589 if (coll->defaultReorderCodes) {
michael@0 590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
michael@0 591 bufferSizeNeeded += defaultReorderCodesSize;
michael@0 592 }
michael@0 593 if (coll->reorderCodes) {
michael@0 594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
michael@0 595 bufferSizeNeeded += reorderCodesSize;
michael@0 596 }
michael@0 597 if (coll->leadBytePermutationTable) {
michael@0 598 bufferSizeNeeded += 256 * sizeof(uint8_t);
michael@0 599 }
michael@0 600
michael@0 601 if (pBufferSize != NULL) {
michael@0 602 int32_t inputSize = *pBufferSize;
michael@0 603 *pBufferSize = 1;
michael@0 604 if (inputSize == 0) {
michael@0 605 return NULL; // preflighting for deprecated functionality
michael@0 606 }
michael@0 607 }
michael@0 608
michael@0 609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
michael@0 610 // Null pointer check.
michael@0 611 if (stackBufferChars == NULL) {
michael@0 612 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 613 return NULL;
michael@0 614 }
michael@0 615 *status = U_SAFECLONE_ALLOCATED_WARNING;
michael@0 616
michael@0 617 localCollator = (UCollator *)stackBufferChars;
michael@0 618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
michael@0 619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
michael@0 620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
michael@0 621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
michael@0 622
michael@0 623 {
michael@0 624 UErrorCode tempStatus = U_ZERO_ERROR;
michael@0 625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
michael@0 626 }
michael@0 627 if (coll->freeImageOnClose) {
michael@0 628 image = (uint8_t *)uprv_malloc(imageSize);
michael@0 629 // Null pointer check
michael@0 630 if (image == NULL) {
michael@0 631 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 632 return NULL;
michael@0 633 }
michael@0 634 ucol_cloneBinary(coll, image, imageSize, status);
michael@0 635 imageAllocated = TRUE;
michael@0 636 }
michael@0 637 else {
michael@0 638 image = (uint8_t *)coll->image;
michael@0 639 }
michael@0 640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
michael@0 641 if (U_FAILURE(*status)) {
michael@0 642 return NULL;
michael@0 643 }
michael@0 644
michael@0 645 if (coll->rules) {
michael@0 646 if (coll->freeRulesOnClose) {
michael@0 647 localCollator->rules = u_strcpy(rules, coll->rules);
michael@0 648 //bufferEnd += rulesSize;
michael@0 649 }
michael@0 650 else {
michael@0 651 localCollator->rules = coll->rules;
michael@0 652 }
michael@0 653 localCollator->freeRulesOnClose = FALSE;
michael@0 654 localCollator->rulesLength = coll->rulesLength;
michael@0 655 }
michael@0 656
michael@0 657 // collator reordering
michael@0 658 if (coll->defaultReorderCodes) {
michael@0 659 localCollator->defaultReorderCodes =
michael@0 660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
michael@0 661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
michael@0 662 localCollator->freeDefaultReorderCodesOnClose = FALSE;
michael@0 663 }
michael@0 664 if (coll->reorderCodes) {
michael@0 665 localCollator->reorderCodes =
michael@0 666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
michael@0 667 localCollator->reorderCodesLength = coll->reorderCodesLength;
michael@0 668 localCollator->freeReorderCodesOnClose = FALSE;
michael@0 669 }
michael@0 670 if (coll->leadBytePermutationTable) {
michael@0 671 localCollator->leadBytePermutationTable =
michael@0 672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
michael@0 673 localCollator->freeLeadBytePermutationTableOnClose = FALSE;
michael@0 674 }
michael@0 675
michael@0 676 int32_t i;
michael@0 677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
michael@0 678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
michael@0 679 }
michael@0 680 // zero copies of pointers
michael@0 681 localCollator->actualLocale = NULL;
michael@0 682 localCollator->validLocale = NULL;
michael@0 683 localCollator->requestedLocale = NULL;
michael@0 684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
michael@0 685 localCollator->freeOnClose = TRUE;
michael@0 686 localCollator->freeImageOnClose = imageAllocated;
michael@0 687 return localCollator;
michael@0 688 }
michael@0 689
michael@0 690 U_CAPI void U_EXPORT2
michael@0 691 ucol_close(UCollator *coll)
michael@0 692 {
michael@0 693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
michael@0 694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
michael@0 695 if(coll != NULL) {
michael@0 696 // these are always owned by each UCollator struct,
michael@0 697 // so we always free them
michael@0 698 if(coll->validLocale != NULL) {
michael@0 699 uprv_free(coll->validLocale);
michael@0 700 }
michael@0 701 if(coll->actualLocale != NULL) {
michael@0 702 uprv_free(coll->actualLocale);
michael@0 703 }
michael@0 704 if(coll->requestedLocale != NULL) {
michael@0 705 uprv_free(coll->requestedLocale);
michael@0 706 }
michael@0 707 if(coll->latinOneCEs != NULL) {
michael@0 708 uprv_free(coll->latinOneCEs);
michael@0 709 }
michael@0 710 if(coll->options != NULL && coll->freeOptionsOnClose) {
michael@0 711 uprv_free(coll->options);
michael@0 712 }
michael@0 713 if(coll->rules != NULL && coll->freeRulesOnClose) {
michael@0 714 uprv_free((UChar *)coll->rules);
michael@0 715 }
michael@0 716 if(coll->image != NULL && coll->freeImageOnClose) {
michael@0 717 uprv_free((UCATableHeader *)coll->image);
michael@0 718 }
michael@0 719
michael@0 720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
michael@0 721 uprv_free(coll->leadBytePermutationTable);
michael@0 722 }
michael@0 723 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
michael@0 724 uprv_free(coll->defaultReorderCodes);
michael@0 725 }
michael@0 726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
michael@0 727 uprv_free(coll->reorderCodes);
michael@0 728 }
michael@0 729
michael@0 730 if(coll->delegate != NULL) {
michael@0 731 delete (Collator*)coll->delegate;
michael@0 732 }
michael@0 733
michael@0 734 /* Here, it would be advisable to close: */
michael@0 735 /* - UData for UCA (unless we stuff it in the root resb */
michael@0 736 /* Again, do we need additional housekeeping... HMMM! */
michael@0 737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
michael@0 738 if(coll->freeOnClose){
michael@0 739 /* for safeClone, if freeOnClose is FALSE,
michael@0 740 don't free the other instance data */
michael@0 741 uprv_free(coll);
michael@0 742 }
michael@0 743 }
michael@0 744 UTRACE_EXIT();
michael@0 745 }
michael@0 746
michael@0 747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
michael@0 748 if(U_FAILURE(*status)) {
michael@0 749 return;
michael@0 750 }
michael@0 751 result->caseFirst = (UColAttributeValue)opts->caseFirst;
michael@0 752 result->caseLevel = (UColAttributeValue)opts->caseLevel;
michael@0 753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
michael@0 754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
michael@0 755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
michael@0 756 return;
michael@0 757 }
michael@0 758 result->strength = (UColAttributeValue)opts->strength;
michael@0 759 result->variableTopValue = opts->variableTopValue;
michael@0 760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
michael@0 761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
michael@0 762 result->numericCollation = (UColAttributeValue)opts->numericCollation;
michael@0 763 result->caseFirstisDefault = TRUE;
michael@0 764 result->caseLevelisDefault = TRUE;
michael@0 765 result->frenchCollationisDefault = TRUE;
michael@0 766 result->normalizationModeisDefault = TRUE;
michael@0 767 result->strengthisDefault = TRUE;
michael@0 768 result->variableTopValueisDefault = TRUE;
michael@0 769 result->alternateHandlingisDefault = TRUE;
michael@0 770 result->hiraganaQisDefault = TRUE;
michael@0 771 result->numericCollationisDefault = TRUE;
michael@0 772
michael@0 773 ucol_updateInternalState(result, status);
michael@0 774
michael@0 775 result->options = opts;
michael@0 776 }
michael@0 777
michael@0 778
michael@0 779 /**
michael@0 780 * Approximate determination if a character is at a contraction end.
michael@0 781 * Guaranteed to be TRUE if a character is at the end of a contraction,
michael@0 782 * otherwise it is not deterministic.
michael@0 783 * @param c character to be determined
michael@0 784 * @param coll collator
michael@0 785 */
michael@0 786 static
michael@0 787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
michael@0 788 if (c < coll->minContrEndCP) {
michael@0 789 return FALSE;
michael@0 790 }
michael@0 791
michael@0 792 int32_t hash = c;
michael@0 793 uint8_t htbyte;
michael@0 794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
michael@0 795 if (U16_IS_TRAIL(c)) {
michael@0 796 return TRUE;
michael@0 797 }
michael@0 798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
michael@0 799 }
michael@0 800 htbyte = coll->contrEndCP[hash>>3];
michael@0 801 return (((htbyte >> (hash & 7)) & 1) == 1);
michael@0 802 }
michael@0 803
michael@0 804
michael@0 805
michael@0 806 /*
michael@0 807 * i_getCombiningClass()
michael@0 808 * A fast, at least partly inline version of u_getCombiningClass()
michael@0 809 * This is a candidate for further optimization. Used heavily
michael@0 810 * in contraction processing.
michael@0 811 */
michael@0 812 static
michael@0 813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
michael@0 814 uint8_t sCC = 0;
michael@0 815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
michael@0 816 sCC = u_getCombiningClass(c);
michael@0 817 }
michael@0 818 return sCC;
michael@0 819 }
michael@0 820
michael@0 821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
michael@0 822 UChar c;
michael@0 823 UCollator *result = fillIn;
michael@0 824 if(U_FAILURE(*status) || image == NULL) {
michael@0 825 return NULL;
michael@0 826 }
michael@0 827
michael@0 828 if(result == NULL) {
michael@0 829 result = (UCollator *)uprv_malloc(sizeof(UCollator));
michael@0 830 if(result == NULL) {
michael@0 831 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 832 return result;
michael@0 833 }
michael@0 834 result->freeOnClose = TRUE;
michael@0 835 } else {
michael@0 836 result->freeOnClose = FALSE;
michael@0 837 }
michael@0 838
michael@0 839 result->delegate = NULL;
michael@0 840
michael@0 841 result->image = image;
michael@0 842 result->mapping.getFoldingOffset = _getFoldingOffset;
michael@0 843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
michael@0 844 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
michael@0 845 if(U_FAILURE(*status)) {
michael@0 846 if(result->freeOnClose == TRUE) {
michael@0 847 uprv_free(result);
michael@0 848 result = NULL;
michael@0 849 }
michael@0 850 return result;
michael@0 851 }
michael@0 852
michael@0 853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
michael@0 854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
michael@0 855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
michael@0 856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
michael@0 857 result->rules = NULL;
michael@0 858 result->rulesLength = 0;
michael@0 859 result->freeRulesOnClose = FALSE;
michael@0 860 result->defaultReorderCodes = NULL;
michael@0 861 result->defaultReorderCodesLength = 0;
michael@0 862 result->freeDefaultReorderCodesOnClose = FALSE;
michael@0 863 result->reorderCodes = NULL;
michael@0 864 result->reorderCodesLength = 0;
michael@0 865 result->freeReorderCodesOnClose = FALSE;
michael@0 866 result->leadBytePermutationTable = NULL;
michael@0 867 result->freeLeadBytePermutationTableOnClose = FALSE;
michael@0 868
michael@0 869 /* get the version info from UCATableHeader and populate the Collator struct*/
michael@0 870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
michael@0 871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
michael@0 872 result->dataVersion[2] = 0;
michael@0 873 result->dataVersion[3] = 0;
michael@0 874
michael@0 875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
michael@0 876 result->minUnsafeCP = 0;
michael@0 877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
michael@0 878 if (ucol_unsafeCP(c, result)) break;
michael@0 879 }
michael@0 880 result->minUnsafeCP = c;
michael@0 881
michael@0 882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
michael@0 883 result->minContrEndCP = 0;
michael@0 884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
michael@0 885 if (ucol_contractionEndCP(c, result)) break;
michael@0 886 }
michael@0 887 result->minContrEndCP = c;
michael@0 888
michael@0 889 /* max expansion tables */
michael@0 890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
michael@0 891 result->image->endExpansionCE);
michael@0 892 result->lastEndExpansionCE = result->endExpansionCE +
michael@0 893 result->image->endExpansionCECount - 1;
michael@0 894 result->expansionCESize = (uint8_t*)result->image +
michael@0 895 result->image->expansionCESize;
michael@0 896
michael@0 897
michael@0 898 //result->errorCode = *status;
michael@0 899
michael@0 900 result->latinOneCEs = NULL;
michael@0 901
michael@0 902 result->latinOneRegenTable = FALSE;
michael@0 903 result->latinOneFailed = FALSE;
michael@0 904 result->UCA = UCA;
michael@0 905
michael@0 906 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
michael@0 907 result->ucaRules = NULL;
michael@0 908 result->actualLocale = NULL;
michael@0 909 result->validLocale = NULL;
michael@0 910 result->requestedLocale = NULL;
michael@0 911 result->hasRealData = FALSE; // real data lives in .dat file...
michael@0 912 result->freeImageOnClose = FALSE;
michael@0 913
michael@0 914 /* set attributes */
michael@0 915 ucol_setOptionsFromHeader(
michael@0 916 result,
michael@0 917 (UColOptionSet*)((uint8_t*)result->image+result->image->options),
michael@0 918 status);
michael@0 919 result->freeOptionsOnClose = FALSE;
michael@0 920
michael@0 921 return result;
michael@0 922 }
michael@0 923
michael@0 924 /* new Mark's code */
michael@0 925
michael@0 926 /**
michael@0 927 * For generation of Implicit CEs
michael@0 928 * @author Davis
michael@0 929 *
michael@0 930 * Cleaned up so that changes can be made more easily.
michael@0 931 * Old values:
michael@0 932 # First Implicit: E26A792D
michael@0 933 # Last Implicit: E3DC70C0
michael@0 934 # First CJK: E0030300
michael@0 935 # Last CJK: E0A9DD00
michael@0 936 # First CJK_A: E0A9DF00
michael@0 937 # Last CJK_A: E0DE3100
michael@0 938 */
michael@0 939 /* Following is a port of Mark's code for new treatment of implicits.
michael@0 940 * It is positioned here, since ucol_initUCA needs to initialize the
michael@0 941 * variables below according to the data in the fractional UCA.
michael@0 942 */
michael@0 943
michael@0 944 /**
michael@0 945 * Function used to:
michael@0 946 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
michael@0 947 * b) bump any non-CJK characters by 10FFFF.
michael@0 948 * The relevant blocks are:
michael@0 949 * A: 4E00..9FFF; CJK Unified Ideographs
michael@0 950 * F900..FAFF; CJK Compatibility Ideographs
michael@0 951 * B: 3400..4DBF; CJK Unified Ideographs Extension A
michael@0 952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
michael@0 953 * As long as
michael@0 954 * no new B characters are allocated between 4E00 and FAFF, and
michael@0 955 * no new A characters are outside of this range,
michael@0 956 * (very high probability) this simple code will work.
michael@0 957 * The reordered blocks are:
michael@0 958 * Block1 is CJK
michael@0 959 * Block2 is CJK_COMPAT_USED
michael@0 960 * Block3 is CJK_A
michael@0 961 * (all contiguous)
michael@0 962 * Any other CJK gets its normal code point
michael@0 963 * Any non-CJK gets +10FFFF
michael@0 964 * When we reorder Block1, we make sure that it is at the very start,
michael@0 965 * so that it will use a 3-byte form.
michael@0 966 * Warning: we only pick up the compatibility characters that are
michael@0 967 * NOT decomposed, so that block is smaller!
michael@0 968 */
michael@0 969
michael@0 970 // CONSTANTS
michael@0 971 static const UChar32
michael@0 972 NON_CJK_OFFSET = 0x110000,
michael@0 973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
michael@0 974
/**
 * Precomputed by initImplicitConstants().
 * All of them are zero until that function has run.
 */
static int32_t final3Multiplier = 0;  // step between final bytes, 3-byte form
static int32_t final4Multiplier = 0;  // step between final bytes, 4-byte form
static int32_t final3Count = 0;       // number of final-byte values, 3-byte form
static int32_t final4Count = 0;       // number of final-byte values, 4-byte form
static int32_t medialCount = 0;       // number of values a middle byte can take
static int32_t min3Primary = 0;       // first lead byte (3-byte form)
static int32_t min4Primary = 0;       // first lead byte of the 4-byte form
static int32_t max4Primary = 0;       // last lead byte
static int32_t minTrail = 0;          // lowest trail byte value
static int32_t maxTrail = 0;          // highest trail byte value
static int32_t max3Trail = 0;         // highest final byte, 3-byte form
static int32_t max4Trail = 0;         // highest final byte, 4-byte form
static int32_t min4Boundary = 0;      // first raw value that uses the 4-byte form
michael@0 992
michael@0 993 static const UChar32
michael@0 994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
michael@0 995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
michael@0 996 CJK_BASE = 0x4E00,
michael@0 997 CJK_LIMIT = 0x9FCC+1,
michael@0 998 // Unified CJK ideographs in the compatibility ideographs block.
michael@0 999 CJK_COMPAT_USED_BASE = 0xFA0E,
michael@0 1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
michael@0 1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
michael@0 1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
michael@0 1003 CJK_A_BASE = 0x3400,
michael@0 1004 CJK_A_LIMIT = 0x4DB5+1,
michael@0 1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
michael@0 1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
michael@0 1007 CJK_B_BASE = 0x20000,
michael@0 1008 CJK_B_LIMIT = 0x2A6D6+1,
michael@0 1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
michael@0 1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
michael@0 1011 CJK_C_BASE = 0x2A700,
michael@0 1012 CJK_C_LIMIT = 0x2B734+1,
michael@0 1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
michael@0 1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
michael@0 1015 CJK_D_BASE = 0x2B740,
michael@0 1016 CJK_D_LIMIT = 0x2B81D+1;
michael@0 1017 // when adding to this list, look for all occurrences (in project)
michael@0 1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
michael@0 1019
michael@0 1020 static UChar32 swapCJK(UChar32 i) {
michael@0 1021 if (i < CJK_A_BASE) {
michael@0 1022 // non-CJK
michael@0 1023 } else if (i < CJK_A_LIMIT) {
michael@0 1024 // Extension A has lower code points than the original Unihan+compat
michael@0 1025 // but sorts higher.
michael@0 1026 return i - CJK_A_BASE
michael@0 1027 + (CJK_LIMIT - CJK_BASE)
michael@0 1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
michael@0 1029 } else if (i < CJK_BASE) {
michael@0 1030 // non-CJK
michael@0 1031 } else if (i < CJK_LIMIT) {
michael@0 1032 return i - CJK_BASE;
michael@0 1033 } else if (i < CJK_COMPAT_USED_BASE) {
michael@0 1034 // non-CJK
michael@0 1035 } else if (i < CJK_COMPAT_USED_LIMIT) {
michael@0 1036 return i - CJK_COMPAT_USED_BASE
michael@0 1037 + (CJK_LIMIT - CJK_BASE);
michael@0 1038 } else if (i < CJK_B_BASE) {
michael@0 1039 // non-CJK
michael@0 1040 } else if (i < CJK_B_LIMIT) {
michael@0 1041 return i; // non-BMP-CJK
michael@0 1042 } else if (i < CJK_C_BASE) {
michael@0 1043 // non-CJK
michael@0 1044 } else if (i < CJK_C_LIMIT) {
michael@0 1045 return i; // non-BMP-CJK
michael@0 1046 } else if (i < CJK_D_BASE) {
michael@0 1047 // non-CJK
michael@0 1048 } else if (i < CJK_D_LIMIT) {
michael@0 1049 return i; // non-BMP-CJK
michael@0 1050 }
michael@0 1051 return i + NON_CJK_OFFSET; // non-CJK
michael@0 1052 }
michael@0 1053
michael@0 1054 U_CAPI UChar32 U_EXPORT2
michael@0 1055 uprv_uca_getRawFromCodePoint(UChar32 i) {
michael@0 1056 return swapCJK(i)+1;
michael@0 1057 }
michael@0 1058
michael@0 1059 U_CAPI UChar32 U_EXPORT2
michael@0 1060 uprv_uca_getCodePointFromRaw(UChar32 i) {
michael@0 1061 i--;
michael@0 1062 UChar32 result = 0;
michael@0 1063 if(i >= NON_CJK_OFFSET) {
michael@0 1064 result = i - NON_CJK_OFFSET;
michael@0 1065 } else if(i >= CJK_B_BASE) {
michael@0 1066 result = i;
michael@0 1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
michael@0 1068 if(i < CJK_LIMIT - CJK_BASE) {
michael@0 1069 result = i + CJK_BASE;
michael@0 1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
michael@0 1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
michael@0 1072 } else {
michael@0 1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
michael@0 1074 }
michael@0 1075 } else {
michael@0 1076 result = -1;
michael@0 1077 }
michael@0 1078 return result;
michael@0 1079 }
michael@0 1080
michael@0 1081 // GET IMPLICIT PRIMARY WEIGHTS
michael@0 1082 // Return value is left justified primary key
michael@0 1083 U_CAPI uint32_t U_EXPORT2
michael@0 1084 uprv_uca_getImplicitFromRaw(UChar32 cp) {
michael@0 1085 /*
michael@0 1086 if (cp < 0 || cp > UCOL_MAX_INPUT) {
michael@0 1087 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
michael@0 1088 }
michael@0 1089 */
michael@0 1090 int32_t last0 = cp - min4Boundary;
michael@0 1091 if (last0 < 0) {
michael@0 1092 int32_t last1 = cp / final3Count;
michael@0 1093 last0 = cp % final3Count;
michael@0 1094
michael@0 1095 int32_t last2 = last1 / medialCount;
michael@0 1096 last1 %= medialCount;
michael@0 1097
michael@0 1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
michael@0 1099 last1 = minTrail + last1; // offset
michael@0 1100 last2 = min3Primary + last2; // offset
michael@0 1101 /*
michael@0 1102 if (last2 >= min4Primary) {
michael@0 1103 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
michael@0 1104 }
michael@0 1105 */
michael@0 1106 return (last2 << 24) + (last1 << 16) + (last0 << 8);
michael@0 1107 } else {
michael@0 1108 int32_t last1 = last0 / final4Count;
michael@0 1109 last0 %= final4Count;
michael@0 1110
michael@0 1111 int32_t last2 = last1 / medialCount;
michael@0 1112 last1 %= medialCount;
michael@0 1113
michael@0 1114 int32_t last3 = last2 / medialCount;
michael@0 1115 last2 %= medialCount;
michael@0 1116
michael@0 1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
michael@0 1118 last1 = minTrail + last1; // offset
michael@0 1119 last2 = minTrail + last2; // offset
michael@0 1120 last3 = min4Primary + last3; // offset
michael@0 1121 /*
michael@0 1122 if (last3 > max4Primary) {
michael@0 1123 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
michael@0 1124 }
michael@0 1125 */
michael@0 1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
michael@0 1127 }
michael@0 1128 }
michael@0 1129
michael@0 1130 static uint32_t U_EXPORT2
michael@0 1131 uprv_uca_getImplicitPrimary(UChar32 cp) {
michael@0 1132 //fprintf(stdout, "Incoming: %04x\n", cp);
michael@0 1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
michael@0 1134
michael@0 1135 cp = swapCJK(cp);
michael@0 1136 cp++;
michael@0 1137 // we now have a range of numbers from 0 to 21FFFF.
michael@0 1138
michael@0 1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
michael@0 1140 //fprintf(stdout, "CJK swapped: %04x\n", cp);
michael@0 1141
michael@0 1142 return uprv_uca_getImplicitFromRaw(cp);
michael@0 1143 }
michael@0 1144
michael@0 1145 /**
michael@0 1146 * Converts implicit CE into raw integer ("code point")
michael@0 1147 * @param implicit
michael@0 1148 * @return -1 if illegal format
michael@0 1149 */
michael@0 1150 U_CAPI UChar32 U_EXPORT2
michael@0 1151 uprv_uca_getRawFromImplicit(uint32_t implicit) {
michael@0 1152 UChar32 result;
michael@0 1153 UChar32 b3 = implicit & 0xFF;
michael@0 1154 UChar32 b2 = (implicit >> 8) & 0xFF;
michael@0 1155 UChar32 b1 = (implicit >> 16) & 0xFF;
michael@0 1156 UChar32 b0 = (implicit >> 24) & 0xFF;
michael@0 1157
michael@0 1158 // simple parameter checks
michael@0 1159 if (b0 < min3Primary || b0 > max4Primary
michael@0 1160 || b1 < minTrail || b1 > maxTrail)
michael@0 1161 return -1;
michael@0 1162 // normal offsets
michael@0 1163 b1 -= minTrail;
michael@0 1164
michael@0 1165 // take care of the final values, and compose
michael@0 1166 if (b0 < min4Primary) {
michael@0 1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
michael@0 1168 return -1;
michael@0 1169 b2 -= minTrail;
michael@0 1170 UChar32 remainder = b2 % final3Multiplier;
michael@0 1171 if (remainder != 0)
michael@0 1172 return -1;
michael@0 1173 b0 -= min3Primary;
michael@0 1174 b2 /= final3Multiplier;
michael@0 1175 result = ((b0 * medialCount) + b1) * final3Count + b2;
michael@0 1176 } else {
michael@0 1177 if (b2 < minTrail || b2 > maxTrail
michael@0 1178 || b3 < minTrail || b3 > max4Trail)
michael@0 1179 return -1;
michael@0 1180 b2 -= minTrail;
michael@0 1181 b3 -= minTrail;
michael@0 1182 UChar32 remainder = b3 % final4Multiplier;
michael@0 1183 if (remainder != 0)
michael@0 1184 return -1;
michael@0 1185 b3 /= final4Multiplier;
michael@0 1186 b0 -= min4Primary;
michael@0 1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
michael@0 1188 }
michael@0 1189 // final check
michael@0 1190 if (result < 0 || result > UCOL_MAX_INPUT)
michael@0 1191 return -1;
michael@0 1192 return result;
michael@0 1193 }
michael@0 1194
michael@0 1195
/*
 * Returns 1 + (a-1)/b using integer division, which is a/b rounded up
 * when a and b are both positive.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeSteps = (a - 1) / b;
    return wholeSteps + 1;
}
michael@0 1199
michael@0 1200 /* this function is either called from initUCA or from genUCA before
michael@0 1201 * doing canonical closure for the UCA.
michael@0 1202 */
michael@0 1203
michael@0 1204 /**
michael@0 1205 * Set up to generate implicits.
michael@0 1206 * Maintenance Note: this function may end up being called more than once, due
michael@0 1207 * to threading races during initialization. Make sure that
michael@0 1208 * none of the Constants is ever transiently assigned an
michael@0 1209 * incorrect value.
michael@0 1210 * @param minPrimary
michael@0 1211 * @param maxPrimary
michael@0 1212 * @param minTrail final byte
michael@0 1213 * @param maxTrail final byte
michael@0 1214 * @param gap3 the gap we leave for tailoring for 3-byte forms
michael@0 1215 * @param gap4 the gap we leave for tailoring for 4-byte forms
michael@0 1216 */
michael@0 1217 static void initImplicitConstants(int minPrimary, int maxPrimary,
michael@0 1218 int minTrailIn, int maxTrailIn,
michael@0 1219 int gap3, int primaries3count,
michael@0 1220 UErrorCode *status) {
michael@0 1221 // some simple parameter checks
michael@0 1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
michael@0 1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
michael@0 1224 || (primaries3count < 1))
michael@0 1225 {
michael@0 1226 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1227 return;
michael@0 1228 };
michael@0 1229
michael@0 1230 minTrail = minTrailIn;
michael@0 1231 maxTrail = maxTrailIn;
michael@0 1232
michael@0 1233 min3Primary = minPrimary;
michael@0 1234 max4Primary = maxPrimary;
michael@0 1235 // compute constants for use later.
michael@0 1236 // number of values we can use in trailing bytes
michael@0 1237 // leave room for empty values between AND above, e.g. if gap = 2
michael@0 1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
michael@0 1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
michael@0 1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
michael@0 1241 final3Multiplier = gap3 + 1;
michael@0 1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
michael@0 1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
michael@0 1244
michael@0 1245 // medials can use full range
michael@0 1246 medialCount = (maxTrail - minTrail + 1);
michael@0 1247 // find out how many values fit in each form
michael@0 1248 int32_t threeByteCount = medialCount * final3Count;
michael@0 1249 // now determine where the 3/4 boundary is.
michael@0 1250 // we use 3 bytes below the boundary, and 4 above
michael@0 1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
michael@0 1252 int32_t primaries4count = primariesAvailable - primaries3count;
michael@0 1253
michael@0 1254
michael@0 1255 int32_t min3ByteCoverage = primaries3count * threeByteCount;
michael@0 1256 min4Primary = minPrimary + primaries3count;
michael@0 1257 min4Boundary = min3ByteCoverage;
michael@0 1258 // Now expand out the multiplier for the 4 bytes, and redo.
michael@0 1259
michael@0 1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
michael@0 1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
michael@0 1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
michael@0 1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
michael@0 1264 if (gap4 < 1) {
michael@0 1265 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1266 return;
michael@0 1267 }
michael@0 1268 final4Multiplier = gap4 + 1;
michael@0 1269 final4Count = neededPerFinalByte;
michael@0 1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
michael@0 1271 }
michael@0 1272
michael@0 1273 /**
michael@0 1274 * Supply parameters for generating implicit CEs
michael@0 1275 */
michael@0 1276 U_CAPI void U_EXPORT2
michael@0 1277 uprv_uca_initImplicitConstants(UErrorCode *status) {
michael@0 1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
michael@0 1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
michael@0 1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
michael@0 1281 }
michael@0 1282
michael@0 1283
michael@0 1284 /* collIterNormalize Incremental Normalization happens here. */
michael@0 1285 /* pick up the range of chars identifed by FCD, */
michael@0 1286 /* normalize it into the collIterate's writable buffer, */
michael@0 1287 /* switch the collIterate's state to use the writable buffer. */
michael@0 1288 /* */
michael@0 1289 static
michael@0 1290 void collIterNormalize(collIterate *collationSource)
michael@0 1291 {
michael@0 1292 UErrorCode status = U_ZERO_ERROR;
michael@0 1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
michael@0 1294 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
michael@0 1295
michael@0 1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
michael@0 1297 collationSource->writableBuffer,
michael@0 1298 status);
michael@0 1299 if (U_FAILURE(status)) {
michael@0 1300 #ifdef UCOL_DEBUG
michael@0 1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
michael@0 1302 #endif
michael@0 1303 return;
michael@0 1304 }
michael@0 1305
michael@0 1306 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();
michael@0 1307 collationSource->origFlags = collationSource->flags;
michael@0 1308 collationSource->flags |= UCOL_ITER_INNORMBUF;
michael@0 1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
michael@0 1310 }
michael@0 1311
michael@0 1312
michael@0 1313 // This function takes the iterator and extracts normalized stuff up to the next boundary
michael@0 1314 // It is similar in the end results to the collIterNormalize, but for the cases when we
michael@0 1315 // use an iterator
michael@0 1316 /*static
michael@0 1317 inline void normalizeIterator(collIterate *collationSource) {
michael@0 1318 UErrorCode status = U_ZERO_ERROR;
michael@0 1319 UBool wasNormalized = FALSE;
michael@0 1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
michael@0 1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
michael@0 1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
michael@0 1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
michael@0 1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
michael@0 1325 // reallocate and terminate
michael@0 1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
michael@0 1327 &collationSource->writableBuffer,
michael@0 1328 (int32_t *)&collationSource->writableBufSize, normLen + 1,
michael@0 1329 0)
michael@0 1330 ) {
michael@0 1331 #ifdef UCOL_DEBUG
michael@0 1332 fprintf(stderr, "normalizeIterator(), out of memory\n");
michael@0 1333 #endif
michael@0 1334 return;
michael@0 1335 }
michael@0 1336 status = U_ZERO_ERROR;
michael@0 1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
michael@0 1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
michael@0 1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
michael@0 1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
michael@0 1341 }
michael@0 1342 // Terminate the buffer - we already checked that it is big enough
michael@0 1343 collationSource->writableBuffer[normLen] = 0;
michael@0 1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
michael@0 1345 collationSource->flags |= UCOL_ITER_ALLOCATED;
michael@0 1346 }
michael@0 1347 collationSource->pos = collationSource->writableBuffer;
michael@0 1348 collationSource->origFlags = collationSource->flags;
michael@0 1349 collationSource->flags |= UCOL_ITER_INNORMBUF;
michael@0 1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
michael@0 1351 }*/
michael@0 1352
michael@0 1353
michael@0 1354 /* Incremental FCD check and normalize */
michael@0 1355 /* Called from getNextCE when normalization state is suspect. */
michael@0 1356 /* When entering, the state is known to be this: */
michael@0 1357 /* o We are working in the main buffer of the collIterate, not the side */
michael@0 1358 /* writable buffer. When in the side buffer, normalization mode is always off, */
michael@0 1359 /* so we won't get here. */
michael@0 1360 /* o The leading combining class from the current character is 0 or */
michael@0 1361 /* the trailing combining class of the previous char was zero. */
michael@0 1362 /* True because the previous call to this function will have always exited */
michael@0 1363 /* that way, and we get called for every char where cc might be non-zero. */
michael@0 1364 static
michael@0 1365 inline UBool collIterFCD(collIterate *collationSource) {
michael@0 1366 const UChar *srcP, *endP;
michael@0 1367 uint8_t leadingCC;
michael@0 1368 uint8_t prevTrailingCC = 0;
michael@0 1369 uint16_t fcd;
michael@0 1370 UBool needNormalize = FALSE;
michael@0 1371
michael@0 1372 srcP = collationSource->pos-1;
michael@0 1373
michael@0 1374 if (collationSource->flags & UCOL_ITER_HASLEN) {
michael@0 1375 endP = collationSource->endp;
michael@0 1376 } else {
michael@0 1377 endP = NULL;
michael@0 1378 }
michael@0 1379
michael@0 1380 // Get the trailing combining class of the current character. If it's zero, we are OK.
michael@0 1381 fcd = g_nfcImpl->nextFCD16(srcP, endP);
michael@0 1382 if (fcd != 0) {
michael@0 1383 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
michael@0 1384
michael@0 1385 if (prevTrailingCC != 0) {
michael@0 1386 // The current char has a non-zero trailing CC. Scan forward until we find
michael@0 1387 // a char with a leading cc of zero.
michael@0 1388 while (endP == NULL || srcP != endP)
michael@0 1389 {
michael@0 1390 const UChar *savedSrcP = srcP;
michael@0 1391
michael@0 1392 fcd = g_nfcImpl->nextFCD16(srcP, endP);
michael@0 1393 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
michael@0 1394 if (leadingCC == 0) {
michael@0 1395 srcP = savedSrcP; // Hit char that is not part of combining sequence.
michael@0 1396 // back up over it. (Could be surrogate pair!)
michael@0 1397 break;
michael@0 1398 }
michael@0 1399
michael@0 1400 if (leadingCC < prevTrailingCC) {
michael@0 1401 needNormalize = TRUE;
michael@0 1402 }
michael@0 1403
michael@0 1404 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
michael@0 1405 }
michael@0 1406 }
michael@0 1407 }
michael@0 1408
michael@0 1409 collationSource->fcdPosition = (UChar *)srcP;
michael@0 1410
michael@0 1411 return needNormalize;
michael@0 1412 }
michael@0 1413
michael@0 1414 /****************************************************************************/
michael@0 1415 /* Following are the CE retrieval functions */
michael@0 1416 /* */
michael@0 1417 /****************************************************************************/
michael@0 1418
michael@0 1419 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
michael@0 1420 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
michael@0 1421
michael@0 1422 /* there should be a macro version of this function in the header file */
michael@0 1423 /* This is the first function that tries to fetch a collation element */
michael@0 1424 /* If it's not successful or it encounters a more difficult situation */
michael@0 1425 /* some more sophisticated and slower functions are invoked */
michael@0 1426 static
michael@0 1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
michael@0 1428 uint32_t order = 0;
michael@0 1429 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
michael@0 1430 order = *(collationSource->toReturn++); /* if so, return them */
michael@0 1431 if(collationSource->CEpos == collationSource->toReturn) {
michael@0 1432 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
michael@0 1433 }
michael@0 1434 return order;
michael@0 1435 }
michael@0 1436
michael@0 1437 UChar ch = 0;
michael@0 1438 collationSource->offsetReturn = NULL;
michael@0 1439
michael@0 1440 do {
michael@0 1441 for (;;) /* Loop handles case when incremental normalize switches */
michael@0 1442 { /* to or from the side buffer / original string, and we */
michael@0 1443 /* need to start again to get the next character. */
michael@0 1444
michael@0 1445 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
michael@0 1446 {
michael@0 1447 // The source string is null terminated and we're not working from the side buffer,
michael@0 1448 // and we're not normalizing. This is the fast path.
michael@0 1449 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
michael@0 1450 ch = *collationSource->pos++;
michael@0 1451 if (ch != 0) {
michael@0 1452 break;
michael@0 1453 }
michael@0 1454 else {
michael@0 1455 return UCOL_NO_MORE_CES;
michael@0 1456 }
michael@0 1457 }
michael@0 1458
michael@0 1459 if (collationSource->flags & UCOL_ITER_HASLEN) {
michael@0 1460 // Normal path for strings when length is specified.
michael@0 1461 // (We can't be in side buffer because it is always null terminated.)
michael@0 1462 if (collationSource->pos >= collationSource->endp) {
michael@0 1463 // Ran off of the end of the main source string. We're done.
michael@0 1464 return UCOL_NO_MORE_CES;
michael@0 1465 }
michael@0 1466 ch = *collationSource->pos++;
michael@0 1467 }
michael@0 1468 else if(collationSource->flags & UCOL_USE_ITERATOR) {
michael@0 1469 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
michael@0 1470 if(iterCh == U_SENTINEL) {
michael@0 1471 return UCOL_NO_MORE_CES;
michael@0 1472 }
michael@0 1473 ch = (UChar)iterCh;
michael@0 1474 }
michael@0 1475 else
michael@0 1476 {
michael@0 1477 // Null terminated string.
michael@0 1478 ch = *collationSource->pos++;
michael@0 1479 if (ch == 0) {
michael@0 1480 // Ran off end of buffer.
michael@0 1481 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 1482 // Ran off end of main string. backing up one character.
michael@0 1483 collationSource->pos--;
michael@0 1484 return UCOL_NO_MORE_CES;
michael@0 1485 }
michael@0 1486 else
michael@0 1487 {
michael@0 1488 // Hit null in the normalize side buffer.
michael@0 1489 // Usually this means the end of the normalized data,
michael@0 1490 // except for one odd case: a null followed by combining chars,
michael@0 1491 // which is the case if we are at the start of the buffer.
michael@0 1492 if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
michael@0 1493 break;
michael@0 1494 }
michael@0 1495
michael@0 1496 // Null marked end of side buffer.
michael@0 1497 // Revert to the main string and
michael@0 1498 // loop back to top to try again to get a character.
michael@0 1499 collationSource->pos = collationSource->fcdPosition;
michael@0 1500 collationSource->flags = collationSource->origFlags;
michael@0 1501 continue;
michael@0 1502 }
michael@0 1503 }
michael@0 1504 }
michael@0 1505
michael@0 1506 if(collationSource->flags&UCOL_HIRAGANA_Q) {
michael@0 1507 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
michael@0 1508 * based on whether the previous codepoint was Hiragana or Katakana.
michael@0 1509 */
michael@0 1510 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
michael@0 1511 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
michael@0 1512 collationSource->flags |= UCOL_WAS_HIRAGANA;
michael@0 1513 } else {
michael@0 1514 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
michael@0 1515 }
michael@0 1516 }
michael@0 1517
michael@0 1518 // We've got a character. See if there's any fcd and/or normalization stuff to do.
michael@0 1519 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
michael@0 1520 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
michael@0 1521 break;
michael@0 1522 }
michael@0 1523
michael@0 1524 if (collationSource->fcdPosition >= collationSource->pos) {
michael@0 1525 // An earlier FCD check has already covered the current character.
michael@0 1526 // We can go ahead and process this char.
michael@0 1527 break;
michael@0 1528 }
michael@0 1529
michael@0 1530 if (ch < ZERO_CC_LIMIT_ ) {
michael@0 1531 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
michael@0 1532 break;
michael@0 1533 }
michael@0 1534
michael@0 1535 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
michael@0 1536 // We need to peek at the next character in order to tell if we are FCD
michael@0 1537 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
michael@0 1538 // We are at the last char of source string.
michael@0 1539 // It is always OK for FCD check.
michael@0 1540 break;
michael@0 1541 }
michael@0 1542
michael@0 1543 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
michael@0 1544 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
michael@0 1545 break;
michael@0 1546 }
michael@0 1547 }
michael@0 1548
michael@0 1549
michael@0 1550 // Need a more complete FCD check and possible normalization.
michael@0 1551 if (collIterFCD(collationSource)) {
michael@0 1552 collIterNormalize(collationSource);
michael@0 1553 }
michael@0 1554 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 1555 // No normalization was needed. Go ahead and process the char we already had.
michael@0 1556 break;
michael@0 1557 }
michael@0 1558
michael@0 1559 // Some normalization happened. Next loop iteration will pick up a char
michael@0 1560 // from the normalization buffer.
michael@0 1561
michael@0 1562 } // end for (;;)
michael@0 1563
michael@0 1564
michael@0 1565 if (ch <= 0xFF) {
michael@0 1566 /* For latin-1 characters we never need to fall back to the UCA table */
michael@0 1567 /* because all of the UCA data is replicated in the latinOneMapping array */
michael@0 1568 order = coll->latinOneMapping[ch];
michael@0 1569 if (order > UCOL_NOT_FOUND) {
michael@0 1570 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
michael@0 1571 }
michael@0 1572 }
michael@0 1573 else
michael@0 1574 {
michael@0 1575 // Always use UCA for Han, Hangul
michael@0 1576 // (Han extension A is before main Han block)
michael@0 1577 // **** Han compatibility chars ?? ****
michael@0 1578 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
michael@0 1579 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
michael@0 1580 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
michael@0 1581 // between the two target ranges; do normal lookup
michael@0 1582 // **** this range is YI, Modifier tone letters, ****
michael@0 1583 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
michael@0 1584 // **** Latin-D might be tailored, so we need to ****
michael@0 1585 // **** do the normal lookup for these guys. ****
michael@0 1586 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
michael@0 1587 } else {
michael@0 1588 // in one of the target ranges; use UCA
michael@0 1589 order = UCOL_NOT_FOUND;
michael@0 1590 }
michael@0 1591 } else {
michael@0 1592 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
michael@0 1593 }
michael@0 1594
michael@0 1595 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
michael@0 1596 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
michael@0 1597 }
michael@0 1598
michael@0 1599 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
michael@0 1600 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
michael@0 1601 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
michael@0 1602
michael@0 1603 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
michael@0 1604 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
michael@0 1605 }
michael@0 1606 }
michael@0 1607 }
michael@0 1608 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
michael@0 1609
michael@0 1610 if(order == UCOL_NOT_FOUND) {
michael@0 1611 order = getImplicit(ch, collationSource);
michael@0 1612 }
michael@0 1613 return order; /* return the CE */
michael@0 1614 }
michael@0 1615
michael@0 1616 /* ucol_getNextCE, out-of-line version for use from other files. */
michael@0 1617 U_CAPI uint32_t U_EXPORT2
michael@0 1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
michael@0 1619 return ucol_IGetNextCE(coll, collationSource, status);
michael@0 1620 }
michael@0 1621
michael@0 1622
michael@0 1623 /**
michael@0 1624 * Incremental previous normalization happens here. Pick up the range of chars
michael@0 1625 * identified by FCD, normalize it into the collIterate's writable buffer,
michael@0 1626 * switch the collIterate's state to use the writable buffer.
 *
 * The normalized range starts at data->string (when fcdPosition is NULL) or at
 * fcdPosition + 1, and ends at data->pos inclusive. One source offset per normalized
 * code unit is also recorded through data->appendOffset().
 * If normalization fails, the function returns before pos or flags are changed.
michael@0 1627 * @param data collation iterator data
michael@0 1628 */
michael@0 1629 static
michael@0 1630 void collPrevIterNormalize(collIterate *data)
michael@0 1631 {
michael@0 1632 UErrorCode status = U_ZERO_ERROR;
michael@0 1633 const UChar *pEnd = data->pos; /* End normalize + 1 */
michael@0 1634 const UChar *pStart;
michael@0 1635
michael@0 1636 /* Start normalize */
michael@0 1637 if (data->fcdPosition == NULL) {
michael@0 1638 pStart = data->string;
michael@0 1639 }
michael@0 1640 else {
michael@0 1641 pStart = data->fcdPosition + 1;
michael@0 1642 }
michael@0 1643
/* The source range is wrapped without copying; its length (pEnd - pStart) + 1 includes */
/* the code unit at pEnd. normLen is the length of the normalized result.                */
michael@0 1644 int32_t normLen =
michael@0 1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
michael@0 1646 data->writableBuffer,
michael@0 1647 status).
michael@0 1648 length();
michael@0 1649 if(U_FAILURE(status)) {
michael@0 1650 return;
michael@0 1651 }
michael@0 1652 /*
michael@0 1653 this puts the null termination in front of the normalized string instead
michael@0 1654 of the end
michael@0 1655 */
michael@0 1656 data->writableBuffer.insert(0, (UChar)0);
michael@0 1657
michael@0 1658 /*
michael@0 1659 * The usual case at this point is that we've got a base
michael@0 1660 * character followed by marks that were normalized. If
michael@0 1661 * fcdPosition is NULL, that means that we backed up to
michael@0 1662 * the beginning of the string and there's no base character.
michael@0 1663 *
michael@0 1664 * Forward processing will usually normalize when it sees
michael@0 1665 * the first mark, so that mark will get its natural offset
michael@0 1666 * and the rest will get the offset of the character following
michael@0 1667 * the marks. The base character will also get its natural offset.
michael@0 1668 *
michael@0 1669 * We write the offset of the base character, if there is one,
michael@0 1670 * followed by the offset of the first mark and then the offsets
michael@0 1671 * of the rest of the marks.
michael@0 1672 */
michael@0 1673 int32_t firstMarkOffset = 0;
/* trailOffset is the index just past data->pos, i.e. of the character following the marks. */
michael@0 1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1);
michael@0 1675 int32_t trailCount = normLen - 1;
michael@0 1676
michael@0 1677 if (data->fcdPosition != NULL) {
michael@0 1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
michael@0 1679 UChar baseChar = *data->fcdPosition;
michael@0 1680
michael@0 1681 firstMarkOffset = baseOffset + 1;
michael@0 1682
michael@0 1683 /*
michael@0 1684 * If the base character is the start of a contraction, forward processing
michael@0 1685 * will normalize the marks while checking for the contraction, which means
michael@0 1686 * that the offset of the first mark will be the same as the other marks.
michael@0 1687 *
michael@0 1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
michael@0 1689 */
/* Only base characters >= 0x100 are tested; they are looked up in the tailoring trie */
/* first and in the UCA trie when the tailoring has no entry.                         */
michael@0 1690 if (baseChar >= 0x100) {
michael@0 1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
michael@0 1692
michael@0 1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
michael@0 1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
michael@0 1695 }
michael@0 1696
michael@0 1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
michael@0 1698 firstMarkOffset = trailOffset;
michael@0 1699 }
michael@0 1700 }
michael@0 1701
michael@0 1702 data->appendOffset(baseOffset, status);
michael@0 1703 }
michael@0 1704
michael@0 1705 data->appendOffset(firstMarkOffset, status);
michael@0 1706
michael@0 1707 for (int32_t i = 0; i < trailCount; i += 1) {
michael@0 1708 data->appendOffset(trailOffset, status);
michael@0 1709 }
michael@0 1710
michael@0 1711 data->offsetRepeatValue = trailOffset;
michael@0 1712
/* Point offsetReturn at the last stored offset. If that is the first slot of offsetBuffer, */
/* the store pointer is reset to the start of the buffer as well.                            */
michael@0 1713 data->offsetReturn = data->offsetStore - 1;
michael@0 1714 if (data->offsetReturn == data->offsetBuffer) {
michael@0 1715 data->offsetStore = data->offsetBuffer;
michael@0 1716 }
michael@0 1717
/* Continue backward iteration from the end of the normalized text: the buffer holds the  */
/* inserted leading null followed by normLen code units. The previous flags are saved in  */
/* origFlags, and normalization / explicit-length modes are switched off while reading    */
/* from the side buffer.                                                                   */
michael@0 1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
michael@0 1719 data->origFlags = data->flags;
michael@0 1720 data->flags |= UCOL_ITER_INNORMBUF;
michael@0 1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
michael@0 1722 }
michael@0 1723
michael@0 1724
michael@0 1725 /**
michael@0 1726 * Incremental FCD check for previous iteration and normalize. Called from
michael@0 1727 * getPrevCE when normalization state is suspect.
michael@0 1728 * When entering, the state is known to be this:
michael@0 1729 * o We are working in the main buffer of the collIterate, not the side
michael@0 1730 * writable buffer. When in the side buffer, normalization mode is always
michael@0 1731 * off, so we won't get here.
michael@0 1732 * o The leading combining class from the current character is 0 or the
michael@0 1733 * trailing combining class of the previous char was zero.
michael@0 1734 * True because the previous call to this function will have always exited
michael@0 1735 * that way, and we get called for every char where cc might be non-zero.
michael@0 1736 * @param data collation iterate struct
michael@0 1737 * @return normalization status, TRUE for normalization to be done, FALSE
michael@0 1738 * otherwise
michael@0 1739 */
michael@0 1740 static
michael@0 1741 inline UBool collPrevIterFCD(collIterate *data)
michael@0 1742 {
michael@0 1743 const UChar *src, *start;
michael@0 1744 uint8_t leadingCC;
michael@0 1745 uint8_t trailingCC = 0;
michael@0 1746 uint16_t fcd;
michael@0 1747 UBool result = FALSE;
michael@0 1748
michael@0 1749 start = data->string;
michael@0 1750 src = data->pos + 1;
michael@0 1751
michael@0 1752 /* Get the trailing combining class of the current character. */
michael@0 1753 fcd = g_nfcImpl->previousFCD16(start, src);
michael@0 1754
michael@0 1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
michael@0 1756
michael@0 1757 if (leadingCC != 0) {
michael@0 1758 /*
michael@0 1759 The current char has a non-zero leading combining class.
michael@0 1760 Scan backward until we find a char with a trailing cc of zero.
michael@0 1761 */
michael@0 1762 for (;;)
michael@0 1763 {
michael@0 1764 if (start == src) {
michael@0 1765 data->fcdPosition = NULL;
michael@0 1766 return result;
michael@0 1767 }
michael@0 1768
michael@0 1769 fcd = g_nfcImpl->previousFCD16(start, src);
michael@0 1770
michael@0 1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
michael@0 1772
michael@0 1773 if (trailingCC == 0) {
michael@0 1774 break;
michael@0 1775 }
michael@0 1776
michael@0 1777 if (leadingCC < trailingCC) {
michael@0 1778 result = TRUE;
michael@0 1779 }
michael@0 1780
michael@0 1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
michael@0 1782 }
michael@0 1783 }
michael@0 1784
michael@0 1785 data->fcdPosition = (UChar *)src;
michael@0 1786
michael@0 1787 return result;
michael@0 1788 }
michael@0 1789
michael@0 1790 /** gets a code unit from the string at a given offset
michael@0 1791 * Handles both normal and iterative cases.
michael@0 1792 * No error checking - caller beware!
michael@0 1793 */
michael@0 1794 static inline
michael@0 1795 UChar peekCodeUnit(collIterate *source, int32_t offset) {
michael@0 1796 if(source->pos != NULL) {
michael@0 1797 return *(source->pos + offset);
michael@0 1798 } else if(source->iterator != NULL) {
michael@0 1799 UChar32 c;
michael@0 1800 if(offset != 0) {
michael@0 1801 source->iterator->move(source->iterator, offset, UITER_CURRENT);
michael@0 1802 c = source->iterator->next(source->iterator);
michael@0 1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
michael@0 1804 } else {
michael@0 1805 c = source->iterator->current(source->iterator);
michael@0 1806 }
michael@0 1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
michael@0 1808 } else {
michael@0 1809 return 0xfffd;
michael@0 1810 }
michael@0 1811 }
michael@0 1812
michael@0 1813 // Code point version. Treats the offset as a _code point_ delta.
michael@0 1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
michael@0 1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
michael@0 1816 static inline
michael@0 1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
michael@0 1818 UChar32 c;
michael@0 1819 if(source->pos != NULL) {
michael@0 1820 const UChar *p = source->pos;
michael@0 1821 if(offset >= 0) {
michael@0 1822 // Skip forward over (offset-1) code points.
michael@0 1823 while(--offset >= 0) {
michael@0 1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
michael@0 1825 ++p;
michael@0 1826 }
michael@0 1827 }
michael@0 1828 // Read the code point there.
michael@0 1829 c = *p++;
michael@0 1830 UChar trail;
michael@0 1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
michael@0 1832 c = U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1833 }
michael@0 1834 } else /* offset<0 */ {
michael@0 1835 // Skip backward over (offset-1) code points.
michael@0 1836 while(++offset < 0) {
michael@0 1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
michael@0 1838 --p;
michael@0 1839 }
michael@0 1840 }
michael@0 1841 // Read the code point before that.
michael@0 1842 c = *--p;
michael@0 1843 UChar lead;
michael@0 1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
michael@0 1845 c = U16_GET_SUPPLEMENTARY(lead, c);
michael@0 1846 }
michael@0 1847 }
michael@0 1848 } else if(source->iterator != NULL) {
michael@0 1849 if(offset >= 0) {
michael@0 1850 // Skip forward over (offset-1) code points.
michael@0 1851 int32_t fwd = offset;
michael@0 1852 while(fwd-- > 0) {
michael@0 1853 uiter_next32(source->iterator);
michael@0 1854 }
michael@0 1855 // Read the code point there.
michael@0 1856 c = uiter_current32(source->iterator);
michael@0 1857 // Return to the starting point, skipping backward over (offset-1) code points.
michael@0 1858 while(offset-- > 0) {
michael@0 1859 uiter_previous32(source->iterator);
michael@0 1860 }
michael@0 1861 } else /* offset<0 */ {
michael@0 1862 // Read backward, reading offset code points, remember only the last-read one.
michael@0 1863 int32_t back = offset;
michael@0 1864 do {
michael@0 1865 c = uiter_previous32(source->iterator);
michael@0 1866 } while(++back < 0);
michael@0 1867 // Return to the starting position, skipping forward over offset code points.
michael@0 1868 do {
michael@0 1869 uiter_next32(source->iterator);
michael@0 1870 } while(++offset < 0);
michael@0 1871 }
michael@0 1872 } else {
michael@0 1873 c = U_SENTINEL;
michael@0 1874 }
michael@0 1875 return c;
michael@0 1876 }
michael@0 1877
michael@0 1878 /**
michael@0 1879 * Determines if we are at the start of the data string in the backwards
michael@0 1880 * collation iterator
michael@0 1881 * @param data collation iterator
michael@0 1882 * @return TRUE if we are at the start
michael@0 1883 */
michael@0 1884 static
michael@0 1885 inline UBool isAtStartPrevIterate(collIterate *data) {
michael@0 1886 if(data->pos == NULL && data->iterator != NULL) {
michael@0 1887 return !data->iterator->hasPrevious(data->iterator);
michael@0 1888 }
michael@0 1889 //return (collIter_bos(data)) ||
michael@0 1890 return (data->pos == data->string) ||
michael@0 1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
michael@0 1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
michael@0 1893 }
michael@0 1894
michael@0 1895 static
michael@0 1896 inline void goBackOne(collIterate *data) {
michael@0 1897 # if 0
michael@0 1898 // somehow, it looks like we need to keep iterator synced up
michael@0 1899 // at all times, as above.
michael@0 1900 if(data->pos) {
michael@0 1901 data->pos--;
michael@0 1902 }
michael@0 1903 if(data->iterator) {
michael@0 1904 data->iterator->previous(data->iterator);
michael@0 1905 }
michael@0 1906 #endif
michael@0 1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
michael@0 1908 data->iterator->previous(data->iterator);
michael@0 1909 }
michael@0 1910 if(data->pos) {
michael@0 1911 data->pos --;
michael@0 1912 }
michael@0 1913 }
michael@0 1914
michael@0 1915 /**
michael@0 1916 * Inline function that gets a simple CE.
michael@0 1917 * So what it does is that it will first check the expansion buffer. If the
michael@0 1918 * expansion buffer is not empty, ie the end pointer to the expansion buffer
michael@0 1919 * is different from the string pointer, we return the collation element at the
michael@0 1920 * return pointer and decrement it.
michael@0 1921 * For more complicated CEs it resorts to getComplicatedCE.
 *
 * When the expansion buffer is empty, one code unit is read backward (from the
 * main string, the UCharIterator or the normalization side buffer, as selected
 * by data->flags), an FCD check / normalization is done if needed, and the code
 * unit is looked up in the tailoring, then the UCA, then via getPrevImplicit().
michael@0 1922 * @param coll collator data
michael@0 1923 * @param data collation iterator struct
michael@0 1924 * @param status error status
 * @return the previous CE, or UCOL_NO_MORE_CES when the start of the input is reached
michael@0 1925 */
michael@0 1926 static
michael@0 1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
michael@0 1928 UErrorCode *status)
michael@0 1929 {
michael@0 1930 uint32_t result = (uint32_t)UCOL_NULLORDER;
michael@0 1931
/* Offset bookkeeping: use up a pending repeat first; otherwise step offsetReturn back  */
/* one slot, or clear it (and reset offsetStore) once it has reached the buffer start. */
michael@0 1932 if (data->offsetReturn != NULL) {
michael@0 1933 if (data->offsetRepeatCount > 0) {
michael@0 1934 data->offsetRepeatCount -= 1;
michael@0 1935 } else {
michael@0 1936 if (data->offsetReturn == data->offsetBuffer) {
michael@0 1937 data->offsetReturn = NULL;
michael@0 1938 data->offsetStore = data->offsetBuffer;
michael@0 1939 } else {
michael@0 1940 data->offsetReturn -= 1;
michael@0 1941 }
michael@0 1942 }
michael@0 1943 }
michael@0 1944
/* CEs are still buffered when toReturn lies above the base of the buffer in use */
/* (extendCEs when set, otherwise CEs); they are handed out in reverse order.     */
michael@0 1945 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
michael@0 1946 (!data->extendCEs && data->toReturn > data->CEs))
michael@0 1947 {
michael@0 1948 data->toReturn -= 1;
michael@0 1949 result = *(data->toReturn);
/* Back at the base of a buffer: bring CEpos down to the same position. */
michael@0 1950 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
michael@0 1951 data->CEpos = data->toReturn;
michael@0 1952 }
michael@0 1953 }
michael@0 1954 else {
michael@0 1955 UChar ch = 0;
michael@0 1956
/* The do/while repeats the whole fetch-and-lookup step while the result is UCOL_IGNORABLE */
/* for a code unit inside [UCOL_FIRST_HANGUL, UCOL_LAST_HANGUL] (see the loop condition).   */
michael@0 1957 do {
michael@0 1958 /*
michael@0 1959 Loop handles case when incremental normalize switches to or from the
michael@0 1960 side buffer / original string, and we need to start again to get the
michael@0 1961 next character.
michael@0 1962 */
michael@0 1963 for (;;) {
michael@0 1964 if (data->flags & UCOL_ITER_HASLEN) {
michael@0 1965 /*
michael@0 1966 Normal path for strings when length is specified.
michael@0 1967 Not in side buffer because it is always null terminated.
michael@0 1968 */
michael@0 1969 if (data->pos <= data->string) {
michael@0 1970 /* End of the main source string */
michael@0 1971 return UCOL_NO_MORE_CES;
michael@0 1972 }
michael@0 1973 data->pos --;
michael@0 1974 ch = *data->pos;
michael@0 1975 }
michael@0 1976 // we are using an iterator to go back. Pray for us!
michael@0 1977 else if (data->flags & UCOL_USE_ITERATOR) {
michael@0 1978 UChar32 iterCh = data->iterator->previous(data->iterator);
michael@0 1979 if(iterCh == U_SENTINEL) {
michael@0 1980 return UCOL_NO_MORE_CES;
michael@0 1981 } else {
michael@0 1982 ch = (UChar)iterCh;
michael@0 1983 }
michael@0 1984 }
michael@0 1985 else {
michael@0 1986 data->pos --;
michael@0 1987 ch = *data->pos;
michael@0 1988 /* we are in the side buffer. */
michael@0 1989 if (ch == 0) {
michael@0 1990 /*
michael@0 1991 At the start of the normalize side buffer.
michael@0 1992 Go back to string.
michael@0 1993 Because pointer points to the last accessed character,
michael@0 1994 hence we have to increment it by one here.
michael@0 1995 */
michael@0 1996 data->flags = data->origFlags;
michael@0 1997 data->offsetRepeatValue = 0;
michael@0 1998
/* fcdPosition == NULL means the normalized range started at the beginning of the string. */
michael@0 1999 if (data->fcdPosition == NULL) {
michael@0 2000 data->pos = data->string;
michael@0 2001 return UCOL_NO_MORE_CES;
michael@0 2002 }
michael@0 2003 else {
michael@0 2004 data->pos = data->fcdPosition + 1;
michael@0 2005 }
michael@0 2006
michael@0 2007 continue;
michael@0 2008 }
michael@0 2009 }
michael@0 2010
/* Unlike the forward iterator, the whole range 0x3040..0x309f sets UCOL_WAS_HIRAGANA here. */
michael@0 2011 if(data->flags&UCOL_HIRAGANA_Q) {
michael@0 2012 if(ch>=0x3040 && ch<=0x309f) {
michael@0 2013 data->flags |= UCOL_WAS_HIRAGANA;
michael@0 2014 } else {
michael@0 2015 data->flags &= ~UCOL_WAS_HIRAGANA;
michael@0 2016 }
michael@0 2017 }
michael@0 2018
michael@0 2019 /*
michael@0 2020 * We have a character; decide whether any FCD check and/or normalization
michael@0 2021 * is needed. None is needed (break) when any of these holds:
michael@0 2022 * - ch is below ZERO_CC_LIMIT_ (trailing combining class == 0),
michael@0 2023 * - normalization is off (it is always off when pos is in the writable buffer),
michael@0 2024 * - fcdPosition is set and is not beyond pos,
michael@0 2025 * - the current character is at the start of the string.
michael@0 2026 */
michael@0 2027 if (ch < ZERO_CC_LIMIT_ ||
michael@0 2028 // this should propel us out of the loop in the iterator case
michael@0 2029 (data->flags & UCOL_ITER_NORM) == 0 ||
michael@0 2030 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
michael@0 2031 || data->string == data->pos) {
michael@0 2032 break;
michael@0 2033 }
michael@0 2034
michael@0 2035 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
michael@0 2036 /* if next character is FCD */
michael@0 2037 if (data->pos == data->string) {
michael@0 2038 /* First char of string is always OK for FCD check */
michael@0 2039 break;
michael@0 2040 }
michael@0 2041
michael@0 2042 /* Not first char of string, do the FCD fast test */
michael@0 2043 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
michael@0 2044 break;
michael@0 2045 }
michael@0 2046 }
michael@0 2047
michael@0 2048 /* Need a more complete FCD check and possible normalization. */
michael@0 2049 if (collPrevIterFCD(data)) {
michael@0 2050 collPrevIterNormalize(data);
michael@0 2051 }
michael@0 2052
michael@0 2053 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 2054 /* No normalization. Go ahead and process the char. */
michael@0 2055 break;
michael@0 2056 }
michael@0 2057
michael@0 2058 /*
michael@0 2059 Some normalization happened.
michael@0 2060 Next loop picks up a char from the normalization buffer.
michael@0 2061 */
michael@0 2062 }
michael@0 2063
michael@0 2064 /* attempt to handle contractions, after removal of the backwards
michael@0 2065 contraction
michael@0 2066 */
/* A code unit that can end a contraction, and is not at the start of the text, */
/* is resolved as a contraction by ucol_prv_getSpecialPrevCE().                  */
michael@0 2067 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
michael@0 2068 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
michael@0 2069 } else {
michael@0 2070 if (ch <= 0xFF) {
michael@0 2071 result = coll->latinOneMapping[ch];
michael@0 2072 }
michael@0 2073 else {
michael@0 2074 // Always use UCA for [3400..9FFF], [AC00..D7AF]
michael@0 2075 // **** [FA0E..FA2F] ?? ****
michael@0 2076 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
michael@0 2077 (ch >= 0x3400 && ch <= 0xD7AF)) {
michael@0 2078 if (ch > 0x9FFF && ch < 0xAC00) {
michael@0 2079 // between the two target ranges; do normal lookup
michael@0 2080 // **** this range is YI, Modifier tone letters, ****
michael@0 2081 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
michael@0 2082 // **** Latin-D might be tailored, so we need to ****
michael@0 2083 // **** do the normal lookup for these guys. ****
michael@0 2084 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
michael@0 2085 } else {
// in one of the target ranges: skip the tailoring so the UCA lookup below is used
michael@0 2086 result = UCOL_NOT_FOUND;
michael@0 2087 }
michael@0 2088 } else {
michael@0 2089 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
michael@0 2090 }
michael@0 2091 }
/* A value above UCOL_NOT_FOUND is a special CE that still has to be resolved. */
michael@0 2092 if (result > UCOL_NOT_FOUND) {
michael@0 2093 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
michael@0 2094 }
michael@0 2095 if (result == UCOL_NOT_FOUND) { // Not found in master list
michael@0 2096 if (!isAtStartPrevIterate(data) &&
michael@0 2097 ucol_contractionEndCP(ch, data->coll))
michael@0 2098 {
michael@0 2099 result = UCOL_CONTRACTION;
michael@0 2100 } else {
michael@0 2101 if(coll->UCA) {
michael@0 2102 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
michael@0 2103 }
michael@0 2104 }
michael@0 2105
michael@0 2106 if (result > UCOL_NOT_FOUND) {
michael@0 2107 if(coll->UCA) {
michael@0 2108 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
michael@0 2109 }
michael@0 2110 }
michael@0 2111 }
michael@0 2112 }
michael@0 2113 } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
michael@0 2114
/* Neither the tailoring nor the UCA produced a CE: fall back to getPrevImplicit(). */
michael@0 2115 if(result == UCOL_NOT_FOUND) {
michael@0 2116 result = getPrevImplicit(ch, data);
michael@0 2117 }
michael@0 2118 }
michael@0 2119
michael@0 2120 return result;
michael@0 2121 }
michael@0 2122
michael@0 2123
michael@0 2124 /* ucol_getPrevCE, out-of-line version for use from other files. */
michael@0 2125 U_CFUNC uint32_t U_EXPORT2
michael@0 2126 ucol_getPrevCE(const UCollator *coll, collIterate *data,
michael@0 2127 UErrorCode *status) {
michael@0 2128 return ucol_IGetPrevCE(coll, data, status);
michael@0 2129 }
michael@0 2130
michael@0 2131
michael@0 2132 /* this should be connected to special Jamo handling */
michael@0 2133 U_CFUNC uint32_t U_EXPORT2
michael@0 2134 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
michael@0 2135 collIterate colIt;
michael@0 2136 IInit_collIterate(coll, &u, 1, &colIt, status);
michael@0 2137 if(U_FAILURE(*status)) {
michael@0 2138 return 0;
michael@0 2139 }
michael@0 2140 return ucol_IGetNextCE(coll, &colIt, status);
michael@0 2141 }
michael@0 2142
michael@0 2143 /**
michael@0 2144 * Inserts the argument character into the end of the buffer pushing back the
michael@0 2145 * null terminator.
michael@0 2146 * @param data collIterate struct data
michael@0 2147 * @param ch character to be appended
michael@0 2148 * @return the position of the new addition
michael@0 2149 */
michael@0 2150 static
michael@0 2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
michael@0 2152 {
michael@0 2153 int32_t oldLength = data->writableBuffer.length();
michael@0 2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
michael@0 2155 }
michael@0 2156
michael@0 2157 /**
michael@0 2158 * Inserts the argument string into the end of the buffer pushing back the
michael@0 2159 * null terminator.
michael@0 2160 * @param data collIterate struct data
michael@0 2161 * @param string to be appended
michael@0 2162 * @param length of the string to be appended
michael@0 2163 * @return the position of the new addition
michael@0 2164 */
michael@0 2165 static
michael@0 2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
michael@0 2167 {
michael@0 2168 int32_t oldLength = data->writableBuffer.length();
michael@0 2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
michael@0 2170 }
michael@0 2171
michael@0 2172 /**
michael@0 2173 * Special normalization function for contraction in the forwards iterator.
michael@0 2174 * This normalization sequence will place the current character at source->pos
michael@0 2175 * and its following normalized sequence into the buffer.
michael@0 2176 * The fcd position, pos will be changed.
michael@0 2177 * pos will now point to positions in the buffer.
michael@0 2178 * Flags will be changed accordingly.
michael@0 2179 * @param data collation iterator data
michael@0 2180 */
michael@0 2181 static
michael@0 2182 inline void normalizeNextContraction(collIterate *data)
michael@0 2183 {
michael@0 2184 int32_t strsize;
michael@0 2185 UErrorCode status = U_ZERO_ERROR;
michael@0 2186 /* because the pointer points to the next character */
michael@0 2187 const UChar *pStart = data->pos - 1;
michael@0 2188 const UChar *pEnd;
michael@0 2189
michael@0 2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
michael@0 2191 data->writableBuffer.setTo(*(pStart - 1));
michael@0 2192 strsize = 1;
michael@0 2193 }
michael@0 2194 else {
michael@0 2195 strsize = data->writableBuffer.length();
michael@0 2196 }
michael@0 2197
michael@0 2198 pEnd = data->fcdPosition;
michael@0 2199
michael@0 2200 data->writableBuffer.append(
michael@0 2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
michael@0 2202 if(U_FAILURE(status)) {
michael@0 2203 return;
michael@0 2204 }
michael@0 2205
michael@0 2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize;
michael@0 2207 data->origFlags = data->flags;
michael@0 2208 data->flags |= UCOL_ITER_INNORMBUF;
michael@0 2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
michael@0 2210 }
michael@0 2211
michael@0 2212 /**
michael@0 2213 * Contraction character management function that returns the next character
michael@0 2214 * for the forwards iterator.
michael@0 2215 * Does nothing if the next character is in buffer and not the first character
michael@0 2216 * in it.
michael@0 2217 * Else it checks next character in data string to see if it is normalizable.
michael@0 2218 * If it is not, the character is simply copied into the buffer, else
michael@0 2219 * the whole normalized substring is copied into the buffer, including the
michael@0 2220 * current character.
michael@0 2221 * @param data collation element iterator data
michael@0 2222 * @return next character
michael@0 2223 */
michael@0 2224 static
michael@0 2225 inline UChar getNextNormalizedChar(collIterate *data)
michael@0 2226 {
michael@0 2227 UChar nextch;
michael@0 2228 UChar ch;
michael@0 2229 // Here we need to add the iterator code. One problem is the way
michael@0 2230 // end of string is handled. If we just return next char, it could
michael@0 2231 // be the sentinel. Most of the cases already check for this, but we
michael@0 2232 // need to be sure.
michael@0 2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
michael@0 2234 /* if no normalization and not in buffer. */
michael@0 2235 if(data->flags & UCOL_USE_ITERATOR) {
michael@0 2236 return (UChar)data->iterator->next(data->iterator);
michael@0 2237 } else {
michael@0 2238 return *(data->pos ++);
michael@0 2239 }
michael@0 2240 }
michael@0 2241
michael@0 2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
michael@0 2243 //normalizeIterator(data);
michael@0 2244 //}
michael@0 2245
michael@0 2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
michael@0 2247 if ((innormbuf && *data->pos != 0) ||
michael@0 2248 (data->fcdPosition != NULL && !innormbuf &&
michael@0 2249 data->pos < data->fcdPosition)) {
michael@0 2250 /*
michael@0 2251 if next character is in normalized buffer, no further normalization
michael@0 2252 is required
michael@0 2253 */
michael@0 2254 return *(data->pos ++);
michael@0 2255 }
michael@0 2256
michael@0 2257 if (data->flags & UCOL_ITER_HASLEN) {
michael@0 2258 /* in data string */
michael@0 2259 if (data->pos + 1 == data->endp) {
michael@0 2260 return *(data->pos ++);
michael@0 2261 }
michael@0 2262 }
michael@0 2263 else {
michael@0 2264 if (innormbuf) {
michael@0 2265 // inside the normalization buffer, but at the end
michael@0 2266 // (since we encountered zero). This means, in the
michael@0 2267 // case we're using char iterator, that we need to
michael@0 2268 // do another round of normalization.
michael@0 2269 //if(data->origFlags & UCOL_USE_ITERATOR) {
michael@0 2270 // we need to restore original flags,
michael@0 2271 // otherwise, we'll lose them
michael@0 2272 //data->flags = data->origFlags;
michael@0 2273 //normalizeIterator(data);
michael@0 2274 //return *(data->pos++);
michael@0 2275 //} else {
michael@0 2276 /*
michael@0 2277 in writable buffer, at this point fcdPosition can not be
michael@0 2278 pointing to the end of the data string. see contracting tag.
michael@0 2279 */
michael@0 2280 if(data->fcdPosition) {
michael@0 2281 if (*(data->fcdPosition + 1) == 0 ||
michael@0 2282 data->fcdPosition + 1 == data->endp) {
michael@0 2283 /* at the end of the string, dump it into the normalizer */
michael@0 2284 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
michael@0 2285 // Check if data->pos received a null pointer
michael@0 2286 if (data->pos == NULL) {
michael@0 2287 return (UChar)-1; // Return to indicate error.
michael@0 2288 }
michael@0 2289 return *(data->fcdPosition ++);
michael@0 2290 }
michael@0 2291 data->pos = data->fcdPosition;
michael@0 2292 } else if(data->origFlags & UCOL_USE_ITERATOR) {
michael@0 2293 // if we are here, we're using a normalizing iterator.
michael@0 2294 // we should just continue further.
michael@0 2295 data->flags = data->origFlags;
michael@0 2296 data->pos = NULL;
michael@0 2297 return (UChar)data->iterator->next(data->iterator);
michael@0 2298 }
michael@0 2299 //}
michael@0 2300 }
michael@0 2301 else {
michael@0 2302 if (*(data->pos + 1) == 0) {
michael@0 2303 return *(data->pos ++);
michael@0 2304 }
michael@0 2305 }
michael@0 2306 }
michael@0 2307
michael@0 2308 ch = *data->pos ++;
michael@0 2309 nextch = *data->pos;
michael@0 2310
michael@0 2311 /*
michael@0 2312 * if the current character is not fcd.
michael@0 2313 * Trailing combining class == 0.
michael@0 2314 */
michael@0 2315 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
michael@0 2316 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
michael@0 2317 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
michael@0 2318 /*
michael@0 2319 Need a more complete FCD check and possible normalization.
michael@0 2320 normalize substring will be appended to buffer
michael@0 2321 */
michael@0 2322 if (collIterFCD(data)) {
michael@0 2323 normalizeNextContraction(data);
michael@0 2324 return *(data->pos ++);
michael@0 2325 }
michael@0 2326 else if (innormbuf) {
michael@0 2327 /* fcdposition shifted even when there's no normalization, if we
michael@0 2328 don't input the rest into this, we'll get the wrong position when
michael@0 2329 we reach the end of the writableBuffer */
michael@0 2330 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
michael@0 2331 data->pos = insertBufferEnd(data, data->pos - 1, length);
michael@0 2332 // Check if data->pos received a null pointer
michael@0 2333 if (data->pos == NULL) {
michael@0 2334 return (UChar)-1; // Return to indicate error.
michael@0 2335 }
michael@0 2336 return *(data->pos ++);
michael@0 2337 }
michael@0 2338 }
michael@0 2339
michael@0 2340 if (innormbuf) {
michael@0 2341 /*
michael@0 2342 no normalization is to be done hence only one character will be
michael@0 2343 appended to the buffer.
michael@0 2344 */
michael@0 2345 data->pos = insertBufferEnd(data, ch) + 1;
michael@0 2346 // Check if data->pos received a null pointer
michael@0 2347 if (data->pos == NULL) {
michael@0 2348 return (UChar)-1; // Return to indicate error.
michael@0 2349 }
michael@0 2350 }
michael@0 2351
michael@0 2352 /* points back to the pos in string */
michael@0 2353 return ch;
michael@0 2354 }
michael@0 2355
michael@0 2356
michael@0 2357
michael@0 2358 /**
michael@0 2359 * Function to copy the buffer into writableBuffer and sets the fcd position to
michael@0 2360 * the correct position
michael@0 2361 * @param source data string source
michael@0 2362 * @param buffer character buffer
michael@0 2363 */
michael@0 2364 static
michael@0 2365 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
michael@0 2366 {
michael@0 2367 /* okay confusing part here. to ensure that the skipped characters are
michael@0 2368 considered later, we need to place it in the appropriate position in the
michael@0 2369 normalization buffer and reassign the pos pointer. simple case if pos
michael@0 2370 reside in string, simply copy to normalization buffer and
michael@0 2371 fcdposition = pos, pos = start of normalization buffer. if pos in
michael@0 2372 normalization buffer, we'll insert the copy infront of pos and point pos
michael@0 2373 to the start of the normalization buffer. why am i doing these copies?
michael@0 2374 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
michael@0 2375 not require any changes, which be really painful. */
michael@0 2376 if (source->flags & UCOL_ITER_INNORMBUF) {
michael@0 2377 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
michael@0 2378 source->writableBuffer.replace(0, replaceLength, buffer);
michael@0 2379 }
michael@0 2380 else {
michael@0 2381 source->fcdPosition = source->pos;
michael@0 2382 source->origFlags = source->flags;
michael@0 2383 source->flags |= UCOL_ITER_INNORMBUF;
michael@0 2384 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
michael@0 2385 source->writableBuffer = buffer;
michael@0 2386 }
michael@0 2387
michael@0 2388 source->pos = source->writableBuffer.getTerminatedBuffer();
michael@0 2389 }
michael@0 2390
michael@0 2391 /**
michael@0 2392 * Function to get the discontiguos collation element within the source.
michael@0 2393 * Note this function will set the position to the appropriate places.
michael@0 2394 * @param coll current collator used
michael@0 2395 * @param source data string source
michael@0 2396 * @param constart index to the start character in the contraction table
michael@0 2397 * @return discontiguos collation element offset
michael@0 2398 */
michael@0 2399 static
michael@0 2400 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
michael@0 2401 const UChar *constart)
michael@0 2402 {
michael@0 2403 /* source->pos currently points to the second combining character after
michael@0 2404 the start character */
michael@0 2405 const UChar *temppos = source->pos;
michael@0 2406 UnicodeString buffer;
michael@0 2407 const UChar *tempconstart = constart;
michael@0 2408 uint8_t tempflags = source->flags;
michael@0 2409 UBool multicontraction = FALSE;
michael@0 2410 collIterateState discState;
michael@0 2411
michael@0 2412 backupState(source, &discState);
michael@0 2413
michael@0 2414 buffer.setTo(peekCodePoint(source, -1));
michael@0 2415 for (;;) {
michael@0 2416 UChar *UCharOffset;
michael@0 2417 UChar schar,
michael@0 2418 tchar;
michael@0 2419 uint32_t result;
michael@0 2420
michael@0 2421 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
michael@0 2422 || (peekCodeUnit(source, 0) == 0 &&
michael@0 2423 //|| (*source->pos == 0 &&
michael@0 2424 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
michael@0 2425 source->fcdPosition == NULL ||
michael@0 2426 source->fcdPosition == source->endp ||
michael@0 2427 *(source->fcdPosition) == 0 ||
michael@0 2428 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
michael@0 2429 /* end of string in null terminated string or stopped by a
michael@0 2430 null character, note fcd does not always point to a base
michael@0 2431 character after the discontiguos change */
michael@0 2432 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
michael@0 2433 //u_getCombiningClass(*(source->pos)) == 0) {
michael@0 2434 //constart = (UChar *)coll->image + getContractOffset(CE);
michael@0 2435 if (multicontraction) {
michael@0 2436 source->pos = temppos - 1;
michael@0 2437 setDiscontiguosAttribute(source, buffer);
michael@0 2438 return *(coll->contractionCEs +
michael@0 2439 (tempconstart - coll->contractionIndex));
michael@0 2440 }
michael@0 2441 constart = tempconstart;
michael@0 2442 break;
michael@0 2443 }
michael@0 2444
michael@0 2445 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
michael@0 2446 schar = getNextNormalizedChar(source);
michael@0 2447
michael@0 2448 while (schar > (tchar = *UCharOffset)) {
michael@0 2449 UCharOffset++;
michael@0 2450 }
michael@0 2451
michael@0 2452 if (schar != tchar) {
michael@0 2453 /* not the correct codepoint. we stuff the current codepoint into
michael@0 2454 the discontiguos buffer and try the next character */
michael@0 2455 buffer.append(schar);
michael@0 2456 continue;
michael@0 2457 }
michael@0 2458 else {
michael@0 2459 if (u_getCombiningClass(schar) ==
michael@0 2460 u_getCombiningClass(peekCodePoint(source, -2))) {
michael@0 2461 buffer.append(schar);
michael@0 2462 continue;
michael@0 2463 }
michael@0 2464 result = *(coll->contractionCEs +
michael@0 2465 (UCharOffset - coll->contractionIndex));
michael@0 2466 }
michael@0 2467
michael@0 2468 if (result == UCOL_NOT_FOUND) {
michael@0 2469 break;
michael@0 2470 } else if (isContraction(result)) {
michael@0 2471 /* this is a multi-contraction*/
michael@0 2472 tempconstart = (UChar *)coll->image + getContractOffset(result);
michael@0 2473 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
michael@0 2474 != UCOL_NOT_FOUND) {
michael@0 2475 multicontraction = TRUE;
michael@0 2476 temppos = source->pos + 1;
michael@0 2477 }
michael@0 2478 } else {
michael@0 2479 setDiscontiguosAttribute(source, buffer);
michael@0 2480 return result;
michael@0 2481 }
michael@0 2482 }
michael@0 2483
michael@0 2484 /* no problems simply reverting just like that,
michael@0 2485 if we are in string before getting into this function, points back to
michael@0 2486 string hence no problem.
michael@0 2487 if we are in normalization buffer before getting into this function,
michael@0 2488 since we'll never use another normalization within this function, we
michael@0 2489 know that fcdposition points to a base character. the normalization buffer
michael@0 2490 never change, hence this revert works. */
michael@0 2491 loadState(source, &discState, TRUE);
michael@0 2492 goBackOne(source);
michael@0 2493
michael@0 2494 //source->pos = temppos - 1;
michael@0 2495 source->flags = tempflags;
michael@0 2496 return *(coll->contractionCEs + (constart - coll->contractionIndex));
michael@0 2497 }
michael@0 2498
michael@0 2499 /* now uses Mark's getImplicitPrimary code */
michael@0 2500 static
michael@0 2501 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
michael@0 2502 uint32_t r = uprv_uca_getImplicitPrimary(cp);
michael@0 2503 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
michael@0 2504 collationSource->offsetRepeatCount += 1;
michael@0 2505 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
michael@0 2506 }
michael@0 2507
michael@0 2508 /**
michael@0 2509 * Inserts the argument character into the front of the buffer replacing the
michael@0 2510 * front null terminator.
michael@0 2511 * @param data collation element iterator data
michael@0 2512 * @param ch character to be appended
michael@0 2513 */
michael@0 2514 static
michael@0 2515 inline void insertBufferFront(collIterate *data, UChar ch)
michael@0 2516 {
michael@0 2517 data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
michael@0 2518 }
michael@0 2519
michael@0 2520 /**
michael@0 2521 * Special normalization function for contraction in the previous iterator.
michael@0 2522 * This normalization sequence will place the current character at source->pos
michael@0 2523 * and its following normalized sequence into the buffer.
michael@0 2524 * The fcd position, pos will be changed.
michael@0 2525 * pos will now point to positions in the buffer.
michael@0 2526 * Flags will be changed accordingly.
michael@0 2527 * @param data collation iterator data
michael@0 2528 */
michael@0 2529 static
michael@0 2530 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
michael@0 2531 {
michael@0 2532 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */
michael@0 2533 const UChar *pStart;
michael@0 2534
michael@0 2535 UnicodeString endOfBuffer;
michael@0 2536 if (data->flags & UCOL_ITER_HASLEN) {
michael@0 2537 /*
michael@0 2538 normalization buffer not used yet, we'll pull down the next
michael@0 2539 character into the end of the buffer
michael@0 2540 */
michael@0 2541 endOfBuffer.setTo(*pEnd);
michael@0 2542 }
michael@0 2543 else {
michael@0 2544 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL
michael@0 2545 }
michael@0 2546
michael@0 2547 if (data->fcdPosition == NULL) {
michael@0 2548 pStart = data->string;
michael@0 2549 }
michael@0 2550 else {
michael@0 2551 pStart = data->fcdPosition + 1;
michael@0 2552 }
michael@0 2553 int32_t normLen =
michael@0 2554 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
michael@0 2555 data->writableBuffer,
michael@0 2556 *status).
michael@0 2557 length();
michael@0 2558 if(U_FAILURE(*status)) {
michael@0 2559 return;
michael@0 2560 }
michael@0 2561 /*
michael@0 2562 this puts the null termination infront of the normalized string instead
michael@0 2563 of the end
michael@0 2564 */
michael@0 2565 data->pos =
michael@0 2566 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
michael@0 2567 1 + normLen;
michael@0 2568 data->origFlags = data->flags;
michael@0 2569 data->flags |= UCOL_ITER_INNORMBUF;
michael@0 2570 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
michael@0 2571 }
michael@0 2572
michael@0 2573 /**
michael@0 2574 * Contraction character management function that returns the previous character
michael@0 2575 * for the backwards iterator.
michael@0 2576 * Does nothing if the previous character is in buffer and not the first
michael@0 2577 * character in it.
michael@0 2578 * Else it checks previous character in data string to see if it is
michael@0 2579 * normalizable.
michael@0 2580 * If it is not, the character is simply copied into the buffer, else
michael@0 2581 * the whole normalized substring is copied into the buffer, including the
michael@0 2582 * current character.
michael@0 2583 * @param data collation element iterator data
michael@0 2584 * @return previous character
michael@0 2585 */
michael@0 2586 static
michael@0 2587 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
michael@0 2588 {
michael@0 2589 UChar prevch;
michael@0 2590 UChar ch;
michael@0 2591 const UChar *start;
michael@0 2592 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
michael@0 2593 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
michael@0 2594 (innormbuf && *(data->pos - 1) != 0)) {
michael@0 2595 /*
michael@0 2596 if no normalization.
michael@0 2597 if previous character is in normalized buffer, no further normalization
michael@0 2598 is required
michael@0 2599 */
michael@0 2600 if(data->flags & UCOL_USE_ITERATOR) {
michael@0 2601 data->iterator->move(data->iterator, -1, UITER_CURRENT);
michael@0 2602 return (UChar)data->iterator->next(data->iterator);
michael@0 2603 } else {
michael@0 2604 return *(data->pos - 1);
michael@0 2605 }
michael@0 2606 }
michael@0 2607
michael@0 2608 start = data->pos;
michael@0 2609 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
michael@0 2610 /* in data string */
michael@0 2611 if ((start - 1) == data->string) {
michael@0 2612 return *(start - 1);
michael@0 2613 }
michael@0 2614 start --;
michael@0 2615 ch = *start;
michael@0 2616 prevch = *(start - 1);
michael@0 2617 }
michael@0 2618 else {
michael@0 2619 /*
michael@0 2620 in writable buffer, at this point fcdPosition can not be NULL.
michael@0 2621 see contracting tag.
michael@0 2622 */
michael@0 2623 if (data->fcdPosition == data->string) {
michael@0 2624 /* at the start of the string, just dump it into the normalizer */
michael@0 2625 insertBufferFront(data, *(data->fcdPosition));
michael@0 2626 data->fcdPosition = NULL;
michael@0 2627 return *(data->pos - 1);
michael@0 2628 }
michael@0 2629 start = data->fcdPosition;
michael@0 2630 ch = *start;
michael@0 2631 prevch = *(start - 1);
michael@0 2632 }
michael@0 2633 /*
michael@0 2634 * if the current character is not fcd.
michael@0 2635 * Trailing combining class == 0.
michael@0 2636 */
michael@0 2637 if (data->fcdPosition > start &&
michael@0 2638 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
michael@0 2639 {
michael@0 2640 /*
michael@0 2641 Need a more complete FCD check and possible normalization.
michael@0 2642 normalize substring will be appended to buffer
michael@0 2643 */
michael@0 2644 const UChar *backuppos = data->pos;
michael@0 2645 data->pos = start;
michael@0 2646 if (collPrevIterFCD(data)) {
michael@0 2647 normalizePrevContraction(data, status);
michael@0 2648 return *(data->pos - 1);
michael@0 2649 }
michael@0 2650 data->pos = backuppos;
michael@0 2651 data->fcdPosition ++;
michael@0 2652 }
michael@0 2653
michael@0 2654 if (innormbuf) {
michael@0 2655 /*
michael@0 2656 no normalization is to be done hence only one character will be
michael@0 2657 appended to the buffer.
michael@0 2658 */
michael@0 2659 insertBufferFront(data, ch);
michael@0 2660 data->fcdPosition --;
michael@0 2661 }
michael@0 2662
michael@0 2663 return ch;
michael@0 2664 }
michael@0 2665
michael@0 2666 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
michael@0 2667 /* It is called by getNextCE */
michael@0 2668
michael@0 2669 /* The following should be even */
michael@0 2670 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
michael@0 2671
michael@0 2672 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
michael@0 2673 collIterateState entryState;
michael@0 2674 backupState(source, &entryState);
michael@0 2675 UChar32 cp = ch;
michael@0 2676
michael@0 2677 for (;;) {
michael@0 2678 // This loop will repeat only in the case of contractions, and only when a contraction
michael@0 2679 // is found and the first CE resulting from that contraction is itself a special
michael@0 2680 // (an expansion, for example.) All other special CE types are fully handled the
michael@0 2681 // first time through, and the loop exits.
michael@0 2682
michael@0 2683 const uint32_t *CEOffset = NULL;
michael@0 2684 switch(getCETag(CE)) {
michael@0 2685 case NOT_FOUND_TAG:
michael@0 2686 /* This one is not found, and we'll let somebody else bother about it... no more games */
michael@0 2687 return CE;
michael@0 2688 case SPEC_PROC_TAG:
michael@0 2689 {
michael@0 2690 // Special processing is getting a CE that is preceded by a certain prefix
michael@0 2691 // Currently this is only needed for optimizing Japanese length and iteration marks.
michael@0 2692 // When we encounter a special processing tag, we go backwards and try to see if
michael@0 2693 // we have a match.
michael@0 2694 // Contraction tables are used - so the whole process is not unlike contraction.
michael@0 2695 // prefix data is stored backwards in the table.
michael@0 2696 const UChar *UCharOffset;
michael@0 2697 UChar schar, tchar;
michael@0 2698 collIterateState prefixState;
michael@0 2699 backupState(source, &prefixState);
michael@0 2700 loadState(source, &entryState, TRUE);
michael@0 2701 goBackOne(source); // We want to look at the point where we entered - actually one
michael@0 2702 // before that...
michael@0 2703
michael@0 2704 for(;;) {
michael@0 2705 // This loop will run once per source string character, for as long as we
michael@0 2706 // are matching a potential contraction sequence
michael@0 2707
michael@0 2708 // First we position ourselves at the beginning of contraction sequence
michael@0 2709 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
michael@0 2710 if (collIter_bos(source)) {
michael@0 2711 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 2712 break;
michael@0 2713 }
michael@0 2714 schar = getPrevNormalizedChar(source, status);
michael@0 2715 goBackOne(source);
michael@0 2716
michael@0 2717 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
michael@0 2718 UCharOffset++;
michael@0 2719 }
michael@0 2720
michael@0 2721 if (schar == tchar) {
michael@0 2722 // Found the source string char in the table.
michael@0 2723 // Pick up the corresponding CE from the table.
michael@0 2724 CE = *(coll->contractionCEs +
michael@0 2725 (UCharOffset - coll->contractionIndex));
michael@0 2726 }
michael@0 2727 else
michael@0 2728 {
michael@0 2729 // Source string char was not in the table.
michael@0 2730 // We have not found the prefix.
michael@0 2731 CE = *(coll->contractionCEs +
michael@0 2732 (ContractionStart - coll->contractionIndex));
michael@0 2733 }
michael@0 2734
michael@0 2735 if(!isPrefix(CE)) {
michael@0 2736 // The source string char was in the contraction table, and the corresponding
michael@0 2737 // CE is not a prefix CE. We found the prefix, break
michael@0 2738 // out of loop, this CE will end up being returned. This is the normal
michael@0 2739 // way out of prefix handling when the source actually contained
michael@0 2740 // the prefix.
michael@0 2741 break;
michael@0 2742 }
michael@0 2743 }
michael@0 2744 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
michael@0 2745 loadState(source, &prefixState, TRUE);
michael@0 2746 if(source->origFlags & UCOL_USE_ITERATOR) {
michael@0 2747 source->flags = source->origFlags;
michael@0 2748 }
michael@0 2749 } else { // prefix search was a failure, we have to backup all the way to the start
michael@0 2750 loadState(source, &entryState, TRUE);
michael@0 2751 }
michael@0 2752 break;
michael@0 2753 }
michael@0 2754 case CONTRACTION_TAG:
michael@0 2755 {
michael@0 2756 /* This should handle contractions */
michael@0 2757 collIterateState state;
michael@0 2758 backupState(source, &state);
michael@0 2759 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
michael@0 2760 const UChar *UCharOffset;
michael@0 2761 UChar schar, tchar;
michael@0 2762
michael@0 2763 for (;;) {
michael@0 2764 /* This loop will run once per source string character, for as long as we */
michael@0 2765 /* are matching a potential contraction sequence */
michael@0 2766
michael@0 2767 /* First we position ourselves at the beginning of contraction sequence */
michael@0 2768 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
michael@0 2769
michael@0 2770 if (collIter_eos(source)) {
michael@0 2771 // Ran off the end of the source string.
michael@0 2772 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 2773 // So we'll pick whatever we have at the point...
michael@0 2774 if (CE == UCOL_NOT_FOUND) {
michael@0 2775 // back up the source over all the chars we scanned going into this contraction.
michael@0 2776 CE = firstCE;
michael@0 2777 loadState(source, &state, TRUE);
michael@0 2778 if(source->origFlags & UCOL_USE_ITERATOR) {
michael@0 2779 source->flags = source->origFlags;
michael@0 2780 }
michael@0 2781 }
michael@0 2782 break;
michael@0 2783 }
michael@0 2784
michael@0 2785 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
michael@0 2786 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
michael@0 2787
michael@0 2788 schar = getNextNormalizedChar(source);
michael@0 2789 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
michael@0 2790 UCharOffset++;
michael@0 2791 }
michael@0 2792
michael@0 2793 if (schar == tchar) {
michael@0 2794 // Found the source string char in the contraction table.
michael@0 2795 // Pick up the corresponding CE from the table.
michael@0 2796 CE = *(coll->contractionCEs +
michael@0 2797 (UCharOffset - coll->contractionIndex));
michael@0 2798 }
michael@0 2799 else
michael@0 2800 {
michael@0 2801 // Source string char was not in contraction table.
michael@0 2802 // Unless we have a discontiguous contraction, we have finished
michael@0 2803 // with this contraction.
michael@0 2804 // in order to do the proper detection, we
michael@0 2805 // need to see if we're dealing with a supplementary
michael@0 2806 /* We test whether the next two char are surrogate pairs.
michael@0 2807 * This test is done if the iterator is not NULL.
michael@0 2808 * If there is no surrogate pair, the iterator
michael@0 2809 * goes back one if needed. */
michael@0 2810 UChar32 miss = schar;
michael@0 2811 if (source->iterator) {
michael@0 2812 UChar32 surrNextChar; /* the next char in the iteration to test */
michael@0 2813 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
michael@0 2814 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
michael@0 2815 prevPos = source->iterator->index;
michael@0 2816 surrNextChar = getNextNormalizedChar(source);
michael@0 2817 if (U16_IS_TRAIL(surrNextChar)) {
michael@0 2818 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
michael@0 2819 } else if (prevPos < source->iterator->index){
michael@0 2820 goBackOne(source);
michael@0 2821 }
michael@0 2822 }
michael@0 2823 } else if (U16_IS_LEAD(schar)) {
michael@0 2824 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
michael@0 2825 }
michael@0 2826
michael@0 2827 uint8_t sCC;
michael@0 2828 if (miss < 0x300 ||
michael@0 2829 maxCC == 0 ||
michael@0 2830 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
michael@0 2831 sCC>maxCC ||
michael@0 2832 (allSame != 0 && sCC == maxCC) ||
michael@0 2833 collIter_eos(source))
michael@0 2834 {
michael@0 2835 // Contraction can not be discontiguous.
michael@0 2836 goBackOne(source); // back up the source string by one,
michael@0 2837 // because the character we just looked at was
michael@0 2838 // not part of the contraction. */
michael@0 2839 if(U_IS_SUPPLEMENTARY(miss)) {
michael@0 2840 goBackOne(source);
michael@0 2841 }
michael@0 2842 CE = *(coll->contractionCEs +
michael@0 2843 (ContractionStart - coll->contractionIndex));
michael@0 2844 } else {
michael@0 2845 //
michael@0 2846 // Contraction is possibly discontiguous.
michael@0 2847 // Scan more of source string looking for a match
michael@0 2848 //
michael@0 2849 UChar tempchar;
michael@0 2850 /* find the next character if schar is not a base character
michael@0 2851 and we are not yet at the end of the string */
michael@0 2852 tempchar = getNextNormalizedChar(source);
michael@0 2853 // probably need another supplementary thingie here
michael@0 2854 goBackOne(source);
michael@0 2855 if (i_getCombiningClass(tempchar, coll) == 0) {
michael@0 2856 goBackOne(source);
michael@0 2857 if(U_IS_SUPPLEMENTARY(miss)) {
michael@0 2858 goBackOne(source);
michael@0 2859 }
michael@0 2860 /* Spit out the last char of the string, wasn't tasty enough */
michael@0 2861 CE = *(coll->contractionCEs +
michael@0 2862 (ContractionStart - coll->contractionIndex));
michael@0 2863 } else {
michael@0 2864 CE = getDiscontiguous(coll, source, ContractionStart);
michael@0 2865 }
michael@0 2866 }
michael@0 2867 } // else after if(schar == tchar)
michael@0 2868
michael@0 2869 if(CE == UCOL_NOT_FOUND) {
michael@0 2870 /* The Source string did not match the contraction that we were checking. */
michael@0 2871 /* Back up the source position to undo the effects of having partially */
michael@0 2872 /* scanned through what ultimately proved to not be a contraction. */
michael@0 2873 loadState(source, &state, TRUE);
michael@0 2874 CE = firstCE;
michael@0 2875 break;
michael@0 2876 }
michael@0 2877
michael@0 2878 if(!isContraction(CE)) {
michael@0 2879 // The source string char was in the contraction table, and the corresponding
michael@0 2880 // CE is not a contraction CE. We completed the contraction, break
michael@0 2881 // out of loop, this CE will end up being returned. This is the normal
michael@0 2882 // way out of contraction handling when the source actually contained
michael@0 2883 // the contraction.
michael@0 2884 break;
michael@0 2885 }
michael@0 2886
michael@0 2887
michael@0 2888 // The source string char was in the contraction table, and the corresponding
michael@0 2889 // CE is IS a contraction CE. We will continue looping to check the source
michael@0 2890 // string for the remaining chars in the contraction.
michael@0 2891 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
michael@0 2892 if(tempCE != UCOL_NOT_FOUND) {
michael@0 2893 // We have scanned a a section of source string for which there is a
michael@0 2894 // CE from the contraction table. Remember the CE and scan position, so
michael@0 2895 // that we can return to this point if further scanning fails to
michael@0 2896 // match a longer contraction sequence.
michael@0 2897 firstCE = tempCE;
michael@0 2898
michael@0 2899 goBackOne(source);
michael@0 2900 backupState(source, &state);
michael@0 2901 getNextNormalizedChar(source);
michael@0 2902
michael@0 2903 // Another way to do this is:
michael@0 2904 //collIterateState tempState;
michael@0 2905 //backupState(source, &tempState);
michael@0 2906 //goBackOne(source);
michael@0 2907 //backupState(source, &state);
michael@0 2908 //loadState(source, &tempState, TRUE);
michael@0 2909
michael@0 2910 // The problem is that for incomplete contractions we have to remember the previous
michael@0 2911 // position. Before, the only thing I needed to do was state.pos--;
michael@0 2912 // After iterator introduction and especially after introduction of normalizing
michael@0 2913 // iterators, it became much more difficult to decrease the saved state.
michael@0 2914 // I'm not yet sure which of the two methods above is faster.
michael@0 2915 }
michael@0 2916 } // for(;;)
michael@0 2917 break;
michael@0 2918 } // case CONTRACTION_TAG:
michael@0 2919 case LONG_PRIMARY_TAG:
michael@0 2920 {
michael@0 2921 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
michael@0 2922 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
michael@0 2923 source->offsetRepeatCount += 1;
michael@0 2924 return CE;
michael@0 2925 }
michael@0 2926 case EXPANSION_TAG:
michael@0 2927 {
michael@0 2928 /* This should handle expansion. */
michael@0 2929 /* NOTE: we can encounter both continuations and expansions in an expansion! */
michael@0 2930 /* I have to decide where continuations are going to be dealt with */
michael@0 2931 uint32_t size;
michael@0 2932 uint32_t i; /* general counter */
michael@0 2933
michael@0 2934 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
michael@0 2935 size = getExpansionCount(CE);
michael@0 2936 CE = *CEOffset++;
michael@0 2937 //source->offsetRepeatCount = -1;
michael@0 2938
michael@0 2939 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
michael@0 2940 for(i = 1; i<size; i++) {
michael@0 2941 *(source->CEpos++) = *CEOffset++;
michael@0 2942 source->offsetRepeatCount += 1;
michael@0 2943 }
michael@0 2944 } else { /* else, we do */
michael@0 2945 while(*CEOffset != 0) {
michael@0 2946 *(source->CEpos++) = *CEOffset++;
michael@0 2947 source->offsetRepeatCount += 1;
michael@0 2948 }
michael@0 2949 }
michael@0 2950
michael@0 2951 return CE;
michael@0 2952 }
michael@0 2953 case DIGIT_TAG:
michael@0 2954 {
michael@0 2955 /*
michael@0 2956 We do a check to see if we want to collate digits as numbers; if so we generate
michael@0 2957 a custom collation key. Otherwise we pull out the value stored in the expansion table.
michael@0 2958 */
michael@0 2959 //uint32_t size;
michael@0 2960 uint32_t i; /* general counter */
michael@0 2961
michael@0 2962 if (source->coll->numericCollation == UCOL_ON){
michael@0 2963 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
michael@0 2964 UChar32 char32 = 0;
michael@0 2965 int32_t digVal = 0;
michael@0 2966
michael@0 2967 uint32_t digIndx = 0;
michael@0 2968 uint32_t endIndex = 0;
michael@0 2969 uint32_t trailingZeroIndex = 0;
michael@0 2970
michael@0 2971 uint8_t collateVal = 0;
michael@0 2972
michael@0 2973 UBool nonZeroValReached = FALSE;
michael@0 2974
michael@0 2975 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
michael@0 2976 /*
michael@0 2977 We parse the source string until we hit a char that's NOT a digit.
michael@0 2978 Use this u_charDigitValue. This might be slow because we have to
michael@0 2979 handle surrogates...
michael@0 2980 */
michael@0 2981 /*
michael@0 2982 if (U16_IS_LEAD(ch)){
michael@0 2983 if (!collIter_eos(source)) {
michael@0 2984 backupState(source, &digitState);
michael@0 2985 UChar trail = getNextNormalizedChar(source);
michael@0 2986 if(U16_IS_TRAIL(trail)) {
michael@0 2987 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
michael@0 2988 } else {
michael@0 2989 loadState(source, &digitState, TRUE);
michael@0 2990 char32 = ch;
michael@0 2991 }
michael@0 2992 } else {
michael@0 2993 char32 = ch;
michael@0 2994 }
michael@0 2995 } else {
michael@0 2996 char32 = ch;
michael@0 2997 }
michael@0 2998 digVal = u_charDigitValue(char32);
michael@0 2999 */
michael@0 3000 digVal = u_charDigitValue(cp); // if we have arrived here, we have
michael@0 3001 // already processed possible supplementaries that trigered the digit tag -
michael@0 3002 // all supplementaries are marked in the UCA.
michael@0 3003 /*
michael@0 3004 We pad a zero in front of the first element anyways. This takes
michael@0 3005 care of the (probably) most common case where people are sorting things followed
michael@0 3006 by a single digit
michael@0 3007 */
michael@0 3008 digIndx++;
michael@0 3009 for(;;){
michael@0 3010 // Make sure we have enough space. No longer needed;
michael@0 3011 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
michael@0 3012 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
michael@0 3013 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
michael@0 3014
michael@0 3015 // Skipping over leading zeroes.
michael@0 3016 if (digVal != 0) {
michael@0 3017 nonZeroValReached = TRUE;
michael@0 3018 }
michael@0 3019 if (nonZeroValReached) {
michael@0 3020 /*
michael@0 3021 We parse the digit string into base 100 numbers (this fits into a byte).
michael@0 3022 We only add to the buffer in twos, thus if we are parsing an odd character,
michael@0 3023 that serves as the 'tens' digit while the if we are parsing an even one, that
michael@0 3024 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
michael@0 3025 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
michael@0 3026 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
michael@0 3027 than all the other bytes.
michael@0 3028 */
michael@0 3029
michael@0 3030 if (digIndx % 2 == 1){
michael@0 3031 collateVal += (uint8_t)digVal;
michael@0 3032
michael@0 3033 // We don't enter the low-order-digit case unless we've already seen
michael@0 3034 // the high order, or for the first digit, which is always non-zero.
michael@0 3035 if (collateVal != 0)
michael@0 3036 trailingZeroIndex = 0;
michael@0 3037
michael@0 3038 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
michael@0 3039 collateVal = 0;
michael@0 3040 }
michael@0 3041 else{
michael@0 3042 // We drop the collation value into the buffer so if we need to do
michael@0 3043 // a "front patch" we don't have to check to see if we're hitting the
michael@0 3044 // last element.
michael@0 3045 collateVal = (uint8_t)(digVal * 10);
michael@0 3046
michael@0 3047 // Check for trailing zeroes.
michael@0 3048 if (collateVal == 0)
michael@0 3049 {
michael@0 3050 if (!trailingZeroIndex)
michael@0 3051 trailingZeroIndex = (digIndx/2) + 2;
michael@0 3052 }
michael@0 3053 else
michael@0 3054 trailingZeroIndex = 0;
michael@0 3055
michael@0 3056 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
michael@0 3057 }
michael@0 3058 digIndx++;
michael@0 3059 }
michael@0 3060
michael@0 3061 // Get next character.
michael@0 3062 if (!collIter_eos(source)){
michael@0 3063 ch = getNextNormalizedChar(source);
michael@0 3064 if (U16_IS_LEAD(ch)){
michael@0 3065 if (!collIter_eos(source)) {
michael@0 3066 backupState(source, &digitState);
michael@0 3067 UChar trail = getNextNormalizedChar(source);
michael@0 3068 if(U16_IS_TRAIL(trail)) {
michael@0 3069 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
michael@0 3070 } else {
michael@0 3071 loadState(source, &digitState, TRUE);
michael@0 3072 char32 = ch;
michael@0 3073 }
michael@0 3074 }
michael@0 3075 } else {
michael@0 3076 char32 = ch;
michael@0 3077 }
michael@0 3078
michael@0 3079 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
michael@0 3080 // Resetting position to point to the next unprocessed char. We
michael@0 3081 // overshot it when doing our test/set for numbers.
michael@0 3082 if (char32 > 0xFFFF) { // For surrogates.
michael@0 3083 loadState(source, &digitState, TRUE);
michael@0 3084 //goBackOne(source);
michael@0 3085 }
michael@0 3086 goBackOne(source);
michael@0 3087 break;
michael@0 3088 }
michael@0 3089 } else {
michael@0 3090 break;
michael@0 3091 }
michael@0 3092 }
michael@0 3093
michael@0 3094 if (nonZeroValReached == FALSE){
michael@0 3095 digIndx = 2;
michael@0 3096 numTempBuf[2] = 6;
michael@0 3097 }
michael@0 3098
michael@0 3099 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
michael@0 3100 if (digIndx % 2 != 0){
michael@0 3101 /*
michael@0 3102 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
michael@0 3103 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
michael@0 3104 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
michael@0 3105 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
michael@0 3106 */
michael@0 3107
michael@0 3108 for(i = 2; i < endIndex; i++){
michael@0 3109 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
michael@0 3110 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
michael@0 3111 }
michael@0 3112 --digIndx;
michael@0 3113 }
michael@0 3114
michael@0 3115 // Subtract one off of the last byte.
michael@0 3116 numTempBuf[endIndex-1] -= 1;
michael@0 3117
michael@0 3118 /*
michael@0 3119 We want to skip over the first two slots in the buffer. The first slot
michael@0 3120 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
michael@0 3121 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
michael@0 3122 */
michael@0 3123 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
michael@0 3124 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
michael@0 3125
michael@0 3126 // Now transfer the collation key to our collIterate struct.
michael@0 3127 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
michael@0 3128 //size = ((endIndex+1) & ~1)/2;
michael@0 3129 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
michael@0 3130 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
michael@0 3131 UCOL_BYTE_COMMON; // Tertiary weight.
michael@0 3132 i = 2; // Reset the index into the buffer.
michael@0 3133 while(i < endIndex)
michael@0 3134 {
michael@0 3135 uint32_t primWeight = numTempBuf[i++] << 8;
michael@0 3136 if ( i < endIndex)
michael@0 3137 primWeight |= numTempBuf[i++];
michael@0 3138 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
michael@0 3139 }
michael@0 3140
michael@0 3141 } else {
michael@0 3142 // no numeric mode, we'll just switch to whatever we stashed and continue
michael@0 3143 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
michael@0 3144 CE = *CEOffset++;
michael@0 3145 break;
michael@0 3146 }
michael@0 3147 return CE;
michael@0 3148 }
michael@0 3149 /* various implicits optimization */
michael@0 3150 case IMPLICIT_TAG: /* everything that is not defined otherwise */
michael@0 3151 /* UCA is filled with these. Tailorings are NOT_FOUND */
michael@0 3152 return getImplicit(cp, source);
michael@0 3153 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
michael@0 3154 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
michael@0 3155 return getImplicit(cp, source);
michael@0 3156 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
michael@0 3157 {
michael@0 3158 static const uint32_t
michael@0 3159 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
michael@0 3160 //const uint32_t LCount = 19;
michael@0 3161 static const uint32_t VCount = 21;
michael@0 3162 static const uint32_t TCount = 28;
michael@0 3163 //const uint32_t NCount = VCount * TCount; // 588
michael@0 3164 //const uint32_t SCount = LCount * NCount; // 11172
michael@0 3165 uint32_t L = ch - SBase;
michael@0 3166
michael@0 3167 // divide into pieces
michael@0 3168
michael@0 3169 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
michael@0 3170 L /= TCount;
michael@0 3171 uint32_t V = L % VCount;
michael@0 3172 L /= VCount;
michael@0 3173
michael@0 3174 // offset them
michael@0 3175
michael@0 3176 L += LBase;
michael@0 3177 V += VBase;
michael@0 3178 T += TBase;
michael@0 3179
michael@0 3180 // return the first CE, but first put the rest into the expansion buffer
michael@0 3181 if (!source->coll->image->jamoSpecial) { // FAST PATH
michael@0 3182
michael@0 3183 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
michael@0 3184 if (T != TBase) {
michael@0 3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
michael@0 3186 }
michael@0 3187
michael@0 3188 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
michael@0 3189
michael@0 3190 } else { // Jamo is Special
michael@0 3191 // Since Hanguls pass the FCD check, it is
michael@0 3192 // guaranteed that we won't be in
michael@0 3193 // the normalization buffer if something like this happens
michael@0 3194
michael@0 3195 // However, if we are using a uchar iterator and normalization
michael@0 3196 // is ON, the Hangul that lead us here is going to be in that
michael@0 3197 // normalization buffer. Here we want to restore the uchar
michael@0 3198 // iterator state and pull out of the normalization buffer
michael@0 3199 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
michael@0 3200 source->flags = source->origFlags; // restore the iterator
michael@0 3201 source->pos = NULL;
michael@0 3202 }
michael@0 3203
michael@0 3204 // Move Jamos into normalization buffer
michael@0 3205 UChar *buffer = source->writableBuffer.getBuffer(4);
michael@0 3206 int32_t bufferLength;
michael@0 3207 buffer[0] = (UChar)L;
michael@0 3208 buffer[1] = (UChar)V;
michael@0 3209 if (T != TBase) {
michael@0 3210 buffer[2] = (UChar)T;
michael@0 3211 bufferLength = 3;
michael@0 3212 } else {
michael@0 3213 bufferLength = 2;
michael@0 3214 }
michael@0 3215 source->writableBuffer.releaseBuffer(bufferLength);
michael@0 3216
michael@0 3217 // Indicate where to continue in main input string after exhausting the writableBuffer
michael@0 3218 source->fcdPosition = source->pos;
michael@0 3219
michael@0 3220 source->pos = source->writableBuffer.getTerminatedBuffer();
michael@0 3221 source->origFlags = source->flags;
michael@0 3222 source->flags |= UCOL_ITER_INNORMBUF;
michael@0 3223 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
michael@0 3224
michael@0 3225 return(UCOL_IGNORABLE);
michael@0 3226 }
michael@0 3227 }
michael@0 3228 case SURROGATE_TAG:
michael@0 3229 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
michael@0 3230 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
michael@0 3231 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
michael@0 3232 /* we treat it like an unassigned code point. */
michael@0 3233 {
michael@0 3234 UChar trail;
michael@0 3235 collIterateState state;
michael@0 3236 backupState(source, &state);
michael@0 3237 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
michael@0 3238 // we chould have stepped one char forward and it might have turned that it
michael@0 3239 // was not a trail surrogate. In that case, we have to backup.
michael@0 3240 loadState(source, &state, TRUE);
michael@0 3241 return UCOL_NOT_FOUND;
michael@0 3242 } else {
michael@0 3243 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
michael@0 3244 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
michael@0 3245 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
michael@0 3246 // We need to backup
michael@0 3247 loadState(source, &state, TRUE);
michael@0 3248 return CE;
michael@0 3249 }
michael@0 3250 // calculate the supplementary code point value, if surrogate was not tailored
michael@0 3251 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
michael@0 3252 }
michael@0 3253 }
michael@0 3254 break;
michael@0 3255 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
michael@0 3256 UChar nextChar;
michael@0 3257 if( source->flags & UCOL_USE_ITERATOR) {
michael@0 3258 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
michael@0 3259 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
michael@0 3260 source->iterator->next(source->iterator);
michael@0 3261 return getImplicit(cp, source);
michael@0 3262 }
michael@0 3263 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
michael@0 3264 U_IS_TRAIL((nextChar=*source->pos))) {
michael@0 3265 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
michael@0 3266 source->pos++;
michael@0 3267 return getImplicit(cp, source);
michael@0 3268 }
michael@0 3269 return UCOL_NOT_FOUND;
michael@0 3270 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
michael@0 3271 return UCOL_NOT_FOUND; /* broken surrogate sequence */
michael@0 3272 case CHARSET_TAG:
michael@0 3273 /* not yet implemented */
michael@0 3274 /* probably after 1.8 */
michael@0 3275 return UCOL_NOT_FOUND;
michael@0 3276 default:
michael@0 3277 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 3278 CE=0;
michael@0 3279 break;
michael@0 3280 }
michael@0 3281 if (CE <= UCOL_NOT_FOUND) break;
michael@0 3282 }
michael@0 3283 return CE;
michael@0 3284 }
michael@0 3285
michael@0 3286
michael@0 3287 /* now uses Mark's getImplicitPrimary code */
michael@0 3288 static
michael@0 3289 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
michael@0 3290 uint32_t r = uprv_uca_getImplicitPrimary(cp);
michael@0 3291
michael@0 3292 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
michael@0 3293 collationSource->toReturn = collationSource->CEpos;
michael@0 3294
michael@0 3295 // **** doesn't work if using iterator ****
michael@0 3296 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
michael@0 3297 collationSource->offsetRepeatCount = 1;
michael@0 3298 } else {
michael@0 3299 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
michael@0 3300
michael@0 3301 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 3302 collationSource->appendOffset(firstOffset, errorCode);
michael@0 3303 collationSource->appendOffset(firstOffset + 1, errorCode);
michael@0 3304
michael@0 3305 collationSource->offsetReturn = collationSource->offsetStore - 1;
michael@0 3306 *(collationSource->offsetBuffer) = firstOffset;
michael@0 3307 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
michael@0 3308 collationSource->offsetStore = collationSource->offsetBuffer;
michael@0 3309 }
michael@0 3310 }
michael@0 3311
michael@0 3312 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
michael@0 3313 }
michael@0 3314
michael@0 3315 /**
michael@0 3316 * This function handles the special CEs like contractions, expansions,
michael@0 3317 * surrogates, Thai.
michael@0 3318 * It is called by getPrevCE.
michael@0 3319 */
michael@0 3320 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
michael@0 3321 collIterate *source,
michael@0 3322 UErrorCode *status)
michael@0 3323 {
michael@0 3324 const uint32_t *CEOffset = NULL;
michael@0 3325 UChar *UCharOffset = NULL;
michael@0 3326 UChar schar;
michael@0 3327 const UChar *constart = NULL;
michael@0 3328 uint32_t size;
michael@0 3329 UChar buffer[UCOL_MAX_BUFFER];
michael@0 3330 uint32_t *endCEBuffer;
michael@0 3331 UChar *strbuffer;
michael@0 3332 int32_t noChars = 0;
michael@0 3333 int32_t CECount = 0;
michael@0 3334
michael@0 3335 for(;;)
michael@0 3336 {
michael@0 3337 /* the only ces that loops are thai and contractions */
michael@0 3338 switch (getCETag(CE))
michael@0 3339 {
michael@0 3340 case NOT_FOUND_TAG: /* this tag always returns */
michael@0 3341 return CE;
michael@0 3342
michael@0 3343 case SPEC_PROC_TAG:
michael@0 3344 {
michael@0 3345 // Special processing is getting a CE that is preceded by a certain prefix
michael@0 3346 // Currently this is only needed for optimizing Japanese length and iteration marks.
michael@0 3347 // When we encouter a special processing tag, we go backwards and try to see if
michael@0 3348 // we have a match.
michael@0 3349 // Contraction tables are used - so the whole process is not unlike contraction.
michael@0 3350 // prefix data is stored backwards in the table.
michael@0 3351 const UChar *UCharOffset;
michael@0 3352 UChar schar, tchar;
michael@0 3353 collIterateState prefixState;
michael@0 3354 backupState(source, &prefixState);
michael@0 3355 for(;;) {
michael@0 3356 // This loop will run once per source string character, for as long as we
michael@0 3357 // are matching a potential contraction sequence
michael@0 3358
michael@0 3359 // First we position ourselves at the begining of contraction sequence
michael@0 3360 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
michael@0 3361
michael@0 3362 if (collIter_bos(source)) {
michael@0 3363 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 3364 break;
michael@0 3365 }
michael@0 3366 schar = getPrevNormalizedChar(source, status);
michael@0 3367 goBackOne(source);
michael@0 3368
michael@0 3369 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
michael@0 3370 UCharOffset++;
michael@0 3371 }
michael@0 3372
michael@0 3373 if (schar == tchar) {
michael@0 3374 // Found the source string char in the table.
michael@0 3375 // Pick up the corresponding CE from the table.
michael@0 3376 CE = *(coll->contractionCEs +
michael@0 3377 (UCharOffset - coll->contractionIndex));
michael@0 3378 }
michael@0 3379 else
michael@0 3380 {
michael@0 3381 // if there is a completely ignorable code point in the middle of
michael@0 3382 // a prefix, we need to act as if it's not there
michael@0 3383 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
michael@0 3384 // lone surrogates cannot be set to zero as it would break other processing
michael@0 3385 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
michael@0 3386 // it's easy for BMP code points
michael@0 3387 if(isZeroCE == 0) {
michael@0 3388 continue;
michael@0 3389 } else if(U16_IS_SURROGATE(schar)) {
michael@0 3390 // for supplementary code points, we have to check the next one
michael@0 3391 // situations where we are going to ignore
michael@0 3392 // 1. beginning of the string: schar is a lone surrogate
michael@0 3393 // 2. schar is a lone surrogate
michael@0 3394 // 3. schar is a trail surrogate in a valid surrogate sequence
michael@0 3395 // that is explicitly set to zero.
michael@0 3396 if (!collIter_bos(source)) {
michael@0 3397 UChar lead;
michael@0 3398 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
michael@0 3399 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
michael@0 3400 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
michael@0 3401 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
michael@0 3402 if(finalCE == 0) {
michael@0 3403 // this is a real, assigned completely ignorable code point
michael@0 3404 goBackOne(source);
michael@0 3405 continue;
michael@0 3406 }
michael@0 3407 }
michael@0 3408 } else {
michael@0 3409 // lone surrogate, treat like unassigned
michael@0 3410 return UCOL_NOT_FOUND;
michael@0 3411 }
michael@0 3412 } else {
michael@0 3413 // lone surrogate at the beggining, treat like unassigned
michael@0 3414 return UCOL_NOT_FOUND;
michael@0 3415 }
michael@0 3416 }
michael@0 3417 // Source string char was not in the table.
michael@0 3418 // We have not found the prefix.
michael@0 3419 CE = *(coll->contractionCEs +
michael@0 3420 (ContractionStart - coll->contractionIndex));
michael@0 3421 }
michael@0 3422
michael@0 3423 if(!isPrefix(CE)) {
michael@0 3424 // The source string char was in the contraction table, and the corresponding
michael@0 3425 // CE is not a prefix CE. We found the prefix, break
michael@0 3426 // out of loop, this CE will end up being returned. This is the normal
michael@0 3427 // way out of prefix handling when the source actually contained
michael@0 3428 // the prefix.
michael@0 3429 break;
michael@0 3430 }
michael@0 3431 }
michael@0 3432 loadState(source, &prefixState, TRUE);
michael@0 3433 break;
michael@0 3434 }
michael@0 3435
michael@0 3436 case CONTRACTION_TAG: {
michael@0 3437 /* to ensure that the backwards and forwards iteration matches, we
michael@0 3438 take the current region of most possible match and pass it through
michael@0 3439 the forward iteration. this will ensure that the obstinate problem of
michael@0 3440 overlapping contractions will not occur.
michael@0 3441 */
michael@0 3442 schar = peekCodeUnit(source, 0);
michael@0 3443 constart = (UChar *)coll->image + getContractOffset(CE);
michael@0 3444 if (isAtStartPrevIterate(source)
michael@0 3445 /* commented away contraction end checks after adding the checks
michael@0 3446 in getPrevCE */) {
michael@0 3447 /* start of string or this is not the end of any contraction */
michael@0 3448 CE = *(coll->contractionCEs +
michael@0 3449 (constart - coll->contractionIndex));
michael@0 3450 break;
michael@0 3451 }
michael@0 3452 strbuffer = buffer;
michael@0 3453 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
michael@0 3454 *(UCharOffset --) = 0;
michael@0 3455 noChars = 0;
michael@0 3456 // have to swap thai characters
michael@0 3457 while (ucol_unsafeCP(schar, coll)) {
michael@0 3458 *(UCharOffset) = schar;
michael@0 3459 noChars++;
michael@0 3460 UCharOffset --;
michael@0 3461 schar = getPrevNormalizedChar(source, status);
michael@0 3462 goBackOne(source);
michael@0 3463 // TODO: when we exhaust the contraction buffer,
michael@0 3464 // it needs to get reallocated. The problem is
michael@0 3465 // that the size depends on the string which is
michael@0 3466 // not iterated over. However, since we're travelling
michael@0 3467 // backwards, we already had to set the iterator at
michael@0 3468 // the end - so we might as well know where we are?
michael@0 3469 if (UCharOffset + 1 == buffer) {
michael@0 3470 /* we have exhausted the buffer */
michael@0 3471 int32_t newsize = 0;
michael@0 3472 if(source->pos) { // actually dealing with a position
michael@0 3473 newsize = (int32_t)(source->pos - source->string + 1);
michael@0 3474 } else { // iterator
michael@0 3475 newsize = 4 * UCOL_MAX_BUFFER;
michael@0 3476 }
michael@0 3477 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
michael@0 3478 (newsize + UCOL_MAX_BUFFER));
michael@0 3479 /* test for NULL */
michael@0 3480 if (strbuffer == NULL) {
michael@0 3481 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 3482 return UCOL_NO_MORE_CES;
michael@0 3483 }
michael@0 3484 UCharOffset = strbuffer + newsize;
michael@0 3485 uprv_memcpy(UCharOffset, buffer,
michael@0 3486 UCOL_MAX_BUFFER * sizeof(UChar));
michael@0 3487 UCharOffset --;
michael@0 3488 }
michael@0 3489 if ((source->pos && (source->pos == source->string ||
michael@0 3490 ((source->flags & UCOL_ITER_INNORMBUF) &&
michael@0 3491 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
michael@0 3492 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
michael@0 3493 break;
michael@0 3494 }
michael@0 3495 }
michael@0 3496 /* adds the initial base character to the string */
michael@0 3497 *(UCharOffset) = schar;
michael@0 3498 noChars++;
michael@0 3499
michael@0 3500 int32_t offsetBias;
michael@0 3501
michael@0 3502 // **** doesn't work if using iterator ****
michael@0 3503 if (source->flags & UCOL_ITER_INNORMBUF) {
michael@0 3504 offsetBias = -1;
michael@0 3505 } else {
michael@0 3506 offsetBias = (int32_t)(source->pos - source->string);
michael@0 3507 }
michael@0 3508
michael@0 3509 /* a new collIterate is used to simplify things, since using the current
michael@0 3510 collIterate will mean that the forward and backwards iteration will
michael@0 3511 share and change the same buffers. we don't want to get into that. */
michael@0 3512 collIterate temp;
michael@0 3513 int32_t rawOffset;
michael@0 3514
michael@0 3515 IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
michael@0 3516 if(U_FAILURE(*status)) {
michael@0 3517 return (uint32_t)UCOL_NULLORDER;
michael@0 3518 }
michael@0 3519 temp.flags &= ~UCOL_ITER_NORM;
michael@0 3520 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
michael@0 3521
michael@0 3522 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
michael@0 3523 CE = ucol_IGetNextCE(coll, &temp, status);
michael@0 3524
michael@0 3525 if (source->extendCEs) {
michael@0 3526 endCEBuffer = source->extendCEs + source->extendCEsSize;
michael@0 3527 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
michael@0 3528 } else {
michael@0 3529 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
michael@0 3530 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
michael@0 3531 }
michael@0 3532
michael@0 3533 while (CE != UCOL_NO_MORE_CES) {
michael@0 3534 *(source->CEpos ++) = CE;
michael@0 3535
michael@0 3536 if (offsetBias >= 0) {
michael@0 3537 source->appendOffset(rawOffset + offsetBias, *status);
michael@0 3538 }
michael@0 3539
michael@0 3540 CECount++;
michael@0 3541 if (source->CEpos == endCEBuffer) {
michael@0 3542 /* ran out of CE space, reallocate to new buffer.
michael@0 3543 If reallocation fails, reset pointers and bail out,
michael@0 3544 there's no guarantee of the right character position after
michael@0 3545 this bail*/
michael@0 3546 if (!increaseCEsCapacity(source)) {
michael@0 3547 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 3548 break;
michael@0 3549 }
michael@0 3550
michael@0 3551 endCEBuffer = source->extendCEs + source->extendCEsSize;
michael@0 3552 }
michael@0 3553
michael@0 3554 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
michael@0 3555 rawOffset = (int32_t)(temp.fcdPosition - temp.string);
michael@0 3556 } else {
michael@0 3557 rawOffset = (int32_t)(temp.pos - temp.string);
michael@0 3558 }
michael@0 3559
michael@0 3560 CE = ucol_IGetNextCE(coll, &temp, status);
michael@0 3561 }
michael@0 3562
michael@0 3563 if (strbuffer != buffer) {
michael@0 3564 uprv_free(strbuffer);
michael@0 3565 }
michael@0 3566 if (U_FAILURE(*status)) {
michael@0 3567 return (uint32_t)UCOL_NULLORDER;
michael@0 3568 }
michael@0 3569
michael@0 3570 if (source->offsetRepeatValue != 0) {
michael@0 3571 if (CECount > noChars) {
michael@0 3572 source->offsetRepeatCount += temp.offsetRepeatCount;
michael@0 3573 } else {
michael@0 3574 // **** does this really skip the right offsets? ****
michael@0 3575 source->offsetReturn -= (noChars - CECount);
michael@0 3576 }
michael@0 3577 }
michael@0 3578
michael@0 3579 if (offsetBias >= 0) {
michael@0 3580 source->offsetReturn = source->offsetStore - 1;
michael@0 3581 if (source->offsetReturn == source->offsetBuffer) {
michael@0 3582 source->offsetStore = source->offsetBuffer;
michael@0 3583 }
michael@0 3584 }
michael@0 3585
michael@0 3586 source->toReturn = source->CEpos - 1;
michael@0 3587 if (source->toReturn == source->CEs) {
michael@0 3588 source->CEpos = source->CEs;
michael@0 3589 }
michael@0 3590
michael@0 3591 return *(source->toReturn);
michael@0 3592 }
michael@0 3593 case LONG_PRIMARY_TAG:
michael@0 3594 {
michael@0 3595 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
michael@0 3596 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
michael@0 3597 source->toReturn = source->CEpos - 1;
michael@0 3598
michael@0 3599 if (source->flags & UCOL_ITER_INNORMBUF) {
michael@0 3600 source->offsetRepeatCount = 1;
michael@0 3601 } else {
michael@0 3602 int32_t firstOffset = (int32_t)(source->pos - source->string);
michael@0 3603
michael@0 3604 source->appendOffset(firstOffset, *status);
michael@0 3605 source->appendOffset(firstOffset + 1, *status);
michael@0 3606
michael@0 3607 source->offsetReturn = source->offsetStore - 1;
michael@0 3608 *(source->offsetBuffer) = firstOffset;
michael@0 3609 if (source->offsetReturn == source->offsetBuffer) {
michael@0 3610 source->offsetStore = source->offsetBuffer;
michael@0 3611 }
michael@0 3612 }
michael@0 3613
michael@0 3614
michael@0 3615 return *(source->toReturn);
michael@0 3616 }
michael@0 3617
michael@0 3618 case EXPANSION_TAG: /* this tag always returns */
michael@0 3619 {
michael@0 3620 /*
michael@0 3621 This should handle expansion.
michael@0 3622 NOTE: we can encounter both continuations and expansions in an expansion!
michael@0 3623 I have to decide where continuations are going to be dealt with
michael@0 3624 */
michael@0 3625 int32_t firstOffset = (int32_t)(source->pos - source->string);
michael@0 3626
michael@0 3627 // **** doesn't work if using iterator ****
michael@0 3628 if (source->offsetReturn != NULL) {
michael@0 3629 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
michael@0 3630 source->offsetStore = source->offsetBuffer;
michael@0 3631 }else {
michael@0 3632 firstOffset = -1;
michael@0 3633 }
michael@0 3634 }
michael@0 3635
michael@0 3636 /* find the offset to expansion table */
michael@0 3637 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
michael@0 3638 size = getExpansionCount(CE);
michael@0 3639 if (size != 0) {
michael@0 3640 /*
michael@0 3641 if there are less than 16 elements in expansion, we don't terminate
michael@0 3642 */
michael@0 3643 uint32_t count;
michael@0 3644
michael@0 3645 for (count = 0; count < size; count++) {
michael@0 3646 *(source->CEpos ++) = *CEOffset++;
michael@0 3647
michael@0 3648 if (firstOffset >= 0) {
michael@0 3649 source->appendOffset(firstOffset + 1, *status);
michael@0 3650 }
michael@0 3651 }
michael@0 3652 } else {
michael@0 3653 /* else, we do */
michael@0 3654 while (*CEOffset != 0) {
michael@0 3655 *(source->CEpos ++) = *CEOffset ++;
michael@0 3656
michael@0 3657 if (firstOffset >= 0) {
michael@0 3658 source->appendOffset(firstOffset + 1, *status);
michael@0 3659 }
michael@0 3660 }
michael@0 3661 }
michael@0 3662
michael@0 3663 if (firstOffset >= 0) {
michael@0 3664 source->offsetReturn = source->offsetStore - 1;
michael@0 3665 *(source->offsetBuffer) = firstOffset;
michael@0 3666 if (source->offsetReturn == source->offsetBuffer) {
michael@0 3667 source->offsetStore = source->offsetBuffer;
michael@0 3668 }
michael@0 3669 } else {
michael@0 3670 source->offsetRepeatCount += size - 1;
michael@0 3671 }
michael@0 3672
michael@0 3673 source->toReturn = source->CEpos - 1;
michael@0 3674 // in case of one element expansion, we
michael@0 3675 // want to immediately return CEpos
michael@0 3676 if(source->toReturn == source->CEs) {
michael@0 3677 source->CEpos = source->CEs;
michael@0 3678 }
michael@0 3679
michael@0 3680 return *(source->toReturn);
michael@0 3681 }
michael@0 3682
michael@0 3683 case DIGIT_TAG:
michael@0 3684 {
michael@0 3685 /*
michael@0 3686 We do a check to see if we want to collate digits as numbers; if so we generate
michael@0 3687 a custom collation key. Otherwise we pull out the value stored in the expansion table.
michael@0 3688 */
michael@0 3689 uint32_t i; /* general counter */
michael@0 3690
michael@0 3691 if (source->coll->numericCollation == UCOL_ON){
michael@0 3692 uint32_t digIndx = 0;
michael@0 3693 uint32_t endIndex = 0;
michael@0 3694 uint32_t leadingZeroIndex = 0;
michael@0 3695 uint32_t trailingZeroCount = 0;
michael@0 3696
michael@0 3697 uint8_t collateVal = 0;
michael@0 3698
michael@0 3699 UBool nonZeroValReached = FALSE;
michael@0 3700
michael@0 3701 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
michael@0 3702 /*
michael@0 3703 We parse the source string until we hit a char that's NOT a digit.
michael@0 3704 Use this u_charDigitValue. This might be slow because we have to
michael@0 3705 handle surrogates...
michael@0 3706 */
michael@0 3707 /*
michael@0 3708 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
michael@0 3709 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
michael@0 3710 element we process when going backward. To determine how long that chunk might be, we may need to make
michael@0 3711 two passes through the loop that collects digits - one to see how long the string is (and how much is
michael@0 3712 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
michael@0 3713 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
michael@0 3714 element chunk after resetting the state to the initialState at the right side of the digit string.
michael@0 3715 */
michael@0 3716 uint32_t ceLimit = 0;
michael@0 3717 UChar initial_ch = ch;
michael@0 3718 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
michael@0 3719 backupState(source, &initialState);
michael@0 3720
michael@0 3721 for(;;) {
michael@0 3722 collIterateState state = {0,0,0,0,0,0,0,0,0};
michael@0 3723 UChar32 char32 = 0;
michael@0 3724 int32_t digVal = 0;
michael@0 3725
michael@0 3726 if (U16_IS_TRAIL (ch)) {
michael@0 3727 if (!collIter_bos(source)){
michael@0 3728 UChar lead = getPrevNormalizedChar(source, status);
michael@0 3729 if(U16_IS_LEAD(lead)) {
michael@0 3730 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
michael@0 3731 goBackOne(source);
michael@0 3732 } else {
michael@0 3733 char32 = ch;
michael@0 3734 }
michael@0 3735 } else {
michael@0 3736 char32 = ch;
michael@0 3737 }
michael@0 3738 } else {
michael@0 3739 char32 = ch;
michael@0 3740 }
michael@0 3741 digVal = u_charDigitValue(char32);
michael@0 3742
michael@0 3743 for(;;) {
michael@0 3744 // Make sure we have enough space. No longer needed;
michael@0 3745 // at this point the largest value of digIndx when we need to save data in numTempBuf
michael@0 3746 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
michael@0 3747 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
michael@0 3748
michael@0 3749 // Skip over trailing zeroes, and keep a count of them.
michael@0 3750 if (digVal != 0)
michael@0 3751 nonZeroValReached = TRUE;
michael@0 3752
michael@0 3753 if (nonZeroValReached) {
michael@0 3754 /*
michael@0 3755 We parse the digit string into base 100 numbers (this fits into a byte).
michael@0 3756 We only add to the buffer in twos, thus if we are parsing an odd character,
michael@0 3757 that serves as the 'tens' digit while the if we are parsing an even one, that
michael@0 3758 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
michael@0 3759 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
michael@0 3760 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
michael@0 3761 than all the other bytes.
michael@0 3762
michael@0 3763 Since we're doing in this reverse we want to put the first digit encountered into the
michael@0 3764 ones place and the second digit encountered into the tens place.
michael@0 3765 */
michael@0 3766
michael@0 3767 if ((digIndx + trailingZeroCount) % 2 == 1) {
michael@0 3768 // High-order digit case (tens place)
michael@0 3769 collateVal += (uint8_t)(digVal * 10);
michael@0 3770
michael@0 3771 // We cannot set leadingZeroIndex unless it has been set for the
michael@0 3772 // low-order digit. Therefore, all we can do for the high-order
michael@0 3773 // digit is turn it off, never on.
michael@0 3774 // The only time we will have a high digit without a low is for
michael@0 3775 // the very first non-zero digit, so no zero check is necessary.
michael@0 3776 if (collateVal != 0)
michael@0 3777 leadingZeroIndex = 0;
michael@0 3778
michael@0 3779 // The first pass through, digIndx may exceed the limit, but in that case
michael@0 3780 // we no longer care about numTempBuf contents since they will be discarded
michael@0 3781 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
michael@0 3782 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
michael@0 3783 }
michael@0 3784 collateVal = 0;
michael@0 3785 } else {
michael@0 3786 // Low-order digit case (ones place)
michael@0 3787 collateVal = (uint8_t)digVal;
michael@0 3788
michael@0 3789 // Check for leading zeroes.
michael@0 3790 if (collateVal == 0) {
michael@0 3791 if (!leadingZeroIndex)
michael@0 3792 leadingZeroIndex = (digIndx/2) + 2;
michael@0 3793 } else
michael@0 3794 leadingZeroIndex = 0;
michael@0 3795
michael@0 3796 // No need to write to buffer; the case of a last odd digit
michael@0 3797 // is handled below.
michael@0 3798 }
michael@0 3799 ++digIndx;
michael@0 3800 } else
michael@0 3801 ++trailingZeroCount;
michael@0 3802
michael@0 3803 if (!collIter_bos(source)) {
michael@0 3804 ch = getPrevNormalizedChar(source, status);
michael@0 3805 //goBackOne(source);
michael@0 3806 if (U16_IS_TRAIL(ch)) {
michael@0 3807 backupState(source, &state);
michael@0 3808 if (!collIter_bos(source)) {
michael@0 3809 goBackOne(source);
michael@0 3810 UChar lead = getPrevNormalizedChar(source, status);
michael@0 3811
michael@0 3812 if(U16_IS_LEAD(lead)) {
michael@0 3813 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
michael@0 3814 } else {
michael@0 3815 loadState(source, &state, FALSE);
michael@0 3816 char32 = ch;
michael@0 3817 }
michael@0 3818 }
michael@0 3819 } else
michael@0 3820 char32 = ch;
michael@0 3821
michael@0 3822 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
michael@0 3823 if (char32 > 0xFFFF) {// For surrogates.
michael@0 3824 loadState(source, &state, FALSE);
michael@0 3825 }
michael@0 3826 // Don't need to "reverse" the goBackOne call,
michael@0 3827 // as this points to the next position to process..
michael@0 3828 //if (char32 > 0xFFFF) // For surrogates.
michael@0 3829 //getNextNormalizedChar(source);
michael@0 3830 break;
michael@0 3831 }
michael@0 3832
michael@0 3833 goBackOne(source);
michael@0 3834 }else
michael@0 3835 break;
michael@0 3836 }
michael@0 3837
michael@0 3838 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
michael@0 3839 // our collation element is not too big, go ahead and finish with it
michael@0 3840 break;
michael@0 3841 }
michael@0 3842 // our digit string is too long for a collation element;
michael@0 3843 // set the limit for it, reset the state and begin again
michael@0 3844 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
michael@0 3845 if ( ceLimit == 0 ) {
michael@0 3846 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
michael@0 3847 }
michael@0 3848 ch = initial_ch;
michael@0 3849 loadState(source, &initialState, FALSE);
michael@0 3850 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
michael@0 3851 collateVal = 0;
michael@0 3852 nonZeroValReached = FALSE;
michael@0 3853 }
michael@0 3854
michael@0 3855 if (! nonZeroValReached) {
michael@0 3856 digIndx = 2;
michael@0 3857 trailingZeroCount = 0;
michael@0 3858 numTempBuf[2] = 6;
michael@0 3859 }
michael@0 3860
michael@0 3861 if ((digIndx + trailingZeroCount) % 2 != 0) {
michael@0 3862 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
michael@0 3863 digIndx += 1; // The implicit leading zero
michael@0 3864 }
michael@0 3865 if (trailingZeroCount % 2 != 0) {
michael@0 3866 // We had to consume one trailing zero for the low digit
michael@0 3867 // of the least significant byte
michael@0 3868 digIndx += 1; // The trailing zero not in the exponent
michael@0 3869 trailingZeroCount -= 1;
michael@0 3870 }
michael@0 3871
michael@0 3872 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
michael@0 3873
michael@0 3874 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
michael@0 3875 numTempBuf[2] -= 1;
michael@0 3876
michael@0 3877 /*
michael@0 3878 We want to skip over the first two slots in the buffer. The first slot
michael@0 3879 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
michael@0 3880 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
michael@0 3881 The exponent must be adjusted by the number of leading zeroes, and the number of
michael@0 3882 trailing zeroes.
michael@0 3883 */
michael@0 3884 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
michael@0 3885 uint32_t exponent = (digIndx+trailingZeroCount)/2;
michael@0 3886 if (leadingZeroIndex)
michael@0 3887 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
michael@0 3888 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
michael@0 3889
michael@0 3890 // Now transfer the collation key to our collIterate struct.
michael@0 3891 // The total size for our collation key is half of endIndex, rounded up.
michael@0 3892 int32_t size = (endIndex+1)/2;
michael@0 3893 if(!ensureCEsCapacity(source, size)) {
michael@0 3894 return (uint32_t)UCOL_NULLORDER;
michael@0 3895 }
michael@0 3896 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
michael@0 3897 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
michael@0 3898 UCOL_BYTE_COMMON; // Tertiary weight.
michael@0 3899 i = endIndex - 1; // Reset the index into the buffer.
michael@0 3900 while(i >= 2) {
michael@0 3901 uint32_t primWeight = numTempBuf[i--] << 8;
michael@0 3902 if ( i >= 2)
michael@0 3903 primWeight |= numTempBuf[i--];
michael@0 3904 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
michael@0 3905 }
michael@0 3906
michael@0 3907 source->toReturn = source->CEpos -1;
michael@0 3908 return *(source->toReturn);
michael@0 3909 } else {
michael@0 3910 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
michael@0 3911 CE = *(CEOffset++);
michael@0 3912 break;
michael@0 3913 }
michael@0 3914 }
michael@0 3915
michael@0 3916 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
michael@0 3917 {
michael@0 3918 static const uint32_t
michael@0 3919 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
michael@0 3920 //const uint32_t LCount = 19;
michael@0 3921 static const uint32_t VCount = 21;
michael@0 3922 static const uint32_t TCount = 28;
michael@0 3923 //const uint32_t NCount = VCount * TCount; /* 588 */
michael@0 3924 //const uint32_t SCount = LCount * NCount; /* 11172 */
michael@0 3925
michael@0 3926 uint32_t L = ch - SBase;
michael@0 3927 /*
michael@0 3928 divide into pieces.
michael@0 3929 we do it in this order since some compilers can do % and / in one
michael@0 3930 operation
michael@0 3931 */
michael@0 3932 uint32_t T = L % TCount;
michael@0 3933 L /= TCount;
michael@0 3934 uint32_t V = L % VCount;
michael@0 3935 L /= VCount;
michael@0 3936
michael@0 3937 /* offset them */
michael@0 3938 L += LBase;
michael@0 3939 V += VBase;
michael@0 3940 T += TBase;
michael@0 3941
michael@0 3942 int32_t firstOffset = (int32_t)(source->pos - source->string);
michael@0 3943 source->appendOffset(firstOffset, *status);
michael@0 3944
michael@0 3945 /*
michael@0 3946 * return the first CE, but first put the rest into the expansion buffer
michael@0 3947 */
michael@0 3948 if (!source->coll->image->jamoSpecial) {
michael@0 3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
michael@0 3950 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
michael@0 3951 source->appendOffset(firstOffset + 1, *status);
michael@0 3952
michael@0 3953 if (T != TBase) {
michael@0 3954 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
michael@0 3955 source->appendOffset(firstOffset + 1, *status);
michael@0 3956 }
michael@0 3957
michael@0 3958 source->toReturn = source->CEpos - 1;
michael@0 3959
michael@0 3960 source->offsetReturn = source->offsetStore - 1;
michael@0 3961 if (source->offsetReturn == source->offsetBuffer) {
michael@0 3962 source->offsetStore = source->offsetBuffer;
michael@0 3963 }
michael@0 3964
michael@0 3965 return *(source->toReturn);
michael@0 3966 } else {
michael@0 3967 // Since Hanguls pass the FCD check, it is
michael@0 3968 // guaranteed that we won't be in
michael@0 3969 // the normalization buffer if something like this happens
michael@0 3970
michael@0 3971 // Move Jamos into normalization buffer
michael@0 3972 UChar *tempbuffer = source->writableBuffer.getBuffer(5);
michael@0 3973 int32_t tempbufferLength, jamoOffset;
michael@0 3974 tempbuffer[0] = 0;
michael@0 3975 tempbuffer[1] = (UChar)L;
michael@0 3976 tempbuffer[2] = (UChar)V;
michael@0 3977 if (T != TBase) {
michael@0 3978 tempbuffer[3] = (UChar)T;
michael@0 3979 tempbufferLength = 4;
michael@0 3980 } else {
michael@0 3981 tempbufferLength = 3;
michael@0 3982 }
michael@0 3983 source->writableBuffer.releaseBuffer(tempbufferLength);
michael@0 3984
michael@0 3985 // Indicate where to continue in main input string after exhausting the writableBuffer
michael@0 3986 if (source->pos == source->string) {
michael@0 3987 jamoOffset = 0;
michael@0 3988 source->fcdPosition = NULL;
michael@0 3989 } else {
michael@0 3990 jamoOffset = source->pos - source->string;
michael@0 3991 source->fcdPosition = source->pos-1;
michael@0 3992 }
michael@0 3993
michael@0 3994 // Append offsets for the additional chars
michael@0 3995 // (not the 0, and not the L whose offsets match the original Hangul)
michael@0 3996 int32_t jamoRemaining = tempbufferLength - 2;
michael@0 3997 jamoOffset++; // appended offsets should match end of original Hangul
michael@0 3998 while (jamoRemaining-- > 0) {
michael@0 3999 source->appendOffset(jamoOffset, *status);
michael@0 4000 }
michael@0 4001
michael@0 4002 source->offsetRepeatValue = jamoOffset;
michael@0 4003
michael@0 4004 source->offsetReturn = source->offsetStore - 1;
michael@0 4005 if (source->offsetReturn == source->offsetBuffer) {
michael@0 4006 source->offsetStore = source->offsetBuffer;
michael@0 4007 }
michael@0 4008
michael@0 4009 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
michael@0 4010 source->origFlags = source->flags;
michael@0 4011 source->flags |= UCOL_ITER_INNORMBUF;
michael@0 4012 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
michael@0 4013
michael@0 4014 return(UCOL_IGNORABLE);
michael@0 4015 }
michael@0 4016 }
michael@0 4017
michael@0 4018 case IMPLICIT_TAG: /* everything that is not defined otherwise */
michael@0 4019 return getPrevImplicit(ch, source);
michael@0 4020
michael@0 4021 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
michael@0 4022 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
michael@0 4023 return getPrevImplicit(ch, source);
michael@0 4024
michael@0 4025 case SURROGATE_TAG: /* This is a surrogate pair */
michael@0 4026 /* essentially an engaged lead surrogate. */
michael@0 4027 /* if you have encountered it here, it means that a */
michael@0 4028 /* broken sequence was encountered and this is an error */
michael@0 4029 return UCOL_NOT_FOUND;
michael@0 4030
michael@0 4031 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
michael@0 4032 return UCOL_NOT_FOUND; /* broken surrogate sequence */
michael@0 4033
michael@0 4034 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
michael@0 4035 {
michael@0 4036 UChar32 cp = 0;
michael@0 4037 UChar prevChar;
michael@0 4038 const UChar *prev;
michael@0 4039 if (isAtStartPrevIterate(source)) {
michael@0 4040 /* we are at the start of the string, wrong place to be at */
michael@0 4041 return UCOL_NOT_FOUND;
michael@0 4042 }
michael@0 4043 if (source->pos != source->writableBuffer.getBuffer()) {
michael@0 4044 prev = source->pos - 1;
michael@0 4045 } else {
michael@0 4046 prev = source->fcdPosition;
michael@0 4047 }
michael@0 4048 prevChar = *prev;
michael@0 4049
michael@0 4050 /* Handles Han and Supplementary characters here.*/
michael@0 4051 if (U16_IS_LEAD(prevChar)) {
michael@0 4052 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
michael@0 4053 source->pos = prev;
michael@0 4054 } else {
michael@0 4055 return UCOL_NOT_FOUND; /* like unassigned */
michael@0 4056 }
michael@0 4057
michael@0 4058 return getPrevImplicit(cp, source);
michael@0 4059 }
michael@0 4060
michael@0 4061 /* UCA is filled with these. Tailorings are NOT_FOUND */
michael@0 4062 /* not yet implemented */
michael@0 4063 case CHARSET_TAG: /* this tag always returns */
michael@0 4064 /* probably after 1.8 */
michael@0 4065 return UCOL_NOT_FOUND;
michael@0 4066
michael@0 4067 default: /* this tag always returns */
michael@0 4068 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 4069 CE=0;
michael@0 4070 break;
michael@0 4071 }
michael@0 4072
michael@0 4073 if (CE <= UCOL_NOT_FOUND) {
michael@0 4074 break;
michael@0 4075 }
michael@0 4076 }
michael@0 4077
michael@0 4078 return CE;
michael@0 4079 }
michael@0 4080
michael@0 4081 /* This should really be a macro */
michael@0 4082 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
michael@0 4083 /* secondaries in French */
michael@0 4084 /*
michael@0 4085 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
michael@0 4086 uint8_t temp;
michael@0 4087 while(start<end) {
michael@0 4088 temp = *start;
michael@0 4089 *start++ = *end;
michael@0 4090 *end-- = temp;
michael@0 4091 }
michael@0 4092 }
michael@0 4093 */
michael@0 4094
/*
 * Reverses, in place, the elements between two pointers (both ends inclusive).
 * TYPE is the element type, used for the swap temporary.
 * The expansion modifies its arguments: start is advanced and end is moved
 * back until they meet or cross, so both must be modifiable pointer lvalues
 * and both are evaluated more than once.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE swapValue = *(start); \
        *(start)++ = *(end); \
        *(end)-- = swapValue; \
    } \
}
michael@0 4103
michael@0 4104 /****************************************************************************/
michael@0 4105 /* Following are the sortkey generation functions */
michael@0 4106 /* */
michael@0 4107 /****************************************************************************/
michael@0 4108
michael@0 4109 U_CAPI int32_t U_EXPORT2
michael@0 4110 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
michael@0 4111 const uint8_t *src2, int32_t src2Length,
michael@0 4112 uint8_t *dest, int32_t destCapacity) {
michael@0 4113 /* check arguments */
michael@0 4114 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
michael@0 4115 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
michael@0 4116 destCapacity<0 || (destCapacity>0 && dest==NULL)
michael@0 4117 ) {
michael@0 4118 /* error, attempt to write a zero byte and return 0 */
michael@0 4119 if(dest!=NULL && destCapacity>0) {
michael@0 4120 *dest=0;
michael@0 4121 }
michael@0 4122 return 0;
michael@0 4123 }
michael@0 4124
michael@0 4125 /* check lengths and capacity */
michael@0 4126 if(src1Length<0) {
michael@0 4127 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
michael@0 4128 }
michael@0 4129 if(src2Length<0) {
michael@0 4130 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
michael@0 4131 }
michael@0 4132
michael@0 4133 int32_t destLength=src1Length+src2Length;
michael@0 4134 if(destLength>destCapacity) {
michael@0 4135 /* the merged sort key does not fit into the destination */
michael@0 4136 return destLength;
michael@0 4137 }
michael@0 4138
michael@0 4139 /* merge the sort keys with the same number of levels */
michael@0 4140 uint8_t *p=dest;
michael@0 4141 for(;;) {
michael@0 4142 /* copy level from src1 not including 00 or 01 */
michael@0 4143 uint8_t b;
michael@0 4144 while((b=*src1)>=2) {
michael@0 4145 ++src1;
michael@0 4146 *p++=b;
michael@0 4147 }
michael@0 4148
michael@0 4149 /* add a 02 merge separator */
michael@0 4150 *p++=2;
michael@0 4151
michael@0 4152 /* copy level from src2 not including 00 or 01 */
michael@0 4153 while((b=*src2)>=2) {
michael@0 4154 ++src2;
michael@0 4155 *p++=b;
michael@0 4156 }
michael@0 4157
michael@0 4158 /* if both sort keys have another level, then add a 01 level separator and continue */
michael@0 4159 if(*src1==1 && *src2==1) {
michael@0 4160 ++src1;
michael@0 4161 ++src2;
michael@0 4162 *p++=1;
michael@0 4163 } else {
michael@0 4164 break;
michael@0 4165 }
michael@0 4166 }
michael@0 4167
michael@0 4168 /*
michael@0 4169 * here, at least one sort key is finished now, but the other one
michael@0 4170 * might have some contents left from containing more levels;
michael@0 4171 * that contents is just appended to the result
michael@0 4172 */
michael@0 4173 if(*src1!=0) {
michael@0 4174 /* src1 is not finished, therefore *src2==0, and src1 is appended */
michael@0 4175 src2=src1;
michael@0 4176 }
michael@0 4177 /* append src2, "the other, unfinished sort key" */
michael@0 4178 while((*p++=*src2++)!=0) {}
michael@0 4179
michael@0 4180 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
michael@0 4181 return (int32_t)(p-dest);
michael@0 4182 }
michael@0 4183
michael@0 4184 U_NAMESPACE_BEGIN
michael@0 4185
michael@0 4186 class SortKeyByteSink : public ByteSink {
michael@0 4187 public:
michael@0 4188 SortKeyByteSink(char *dest, int32_t destCapacity)
michael@0 4189 : buffer_(dest), capacity_(destCapacity),
michael@0 4190 appended_(0) {
michael@0 4191 if (buffer_ == NULL) {
michael@0 4192 capacity_ = 0;
michael@0 4193 } else if(capacity_ < 0) {
michael@0 4194 buffer_ = NULL;
michael@0 4195 capacity_ = 0;
michael@0 4196 }
michael@0 4197 }
michael@0 4198 virtual ~SortKeyByteSink();
michael@0 4199
michael@0 4200 virtual void Append(const char *bytes, int32_t n);
michael@0 4201 void Append(uint32_t b) {
michael@0 4202 if (appended_ < capacity_ || Resize(1, appended_)) {
michael@0 4203 buffer_[appended_] = (char)b;
michael@0 4204 }
michael@0 4205 ++appended_;
michael@0 4206 }
michael@0 4207 void Append(uint32_t b1, uint32_t b2) {
michael@0 4208 int32_t a2 = appended_ + 2;
michael@0 4209 if (a2 <= capacity_ || Resize(2, appended_)) {
michael@0 4210 buffer_[appended_] = (char)b1;
michael@0 4211 buffer_[appended_ + 1] = (char)b2;
michael@0 4212 } else if(appended_ < capacity_) {
michael@0 4213 buffer_[appended_] = (char)b1;
michael@0 4214 }
michael@0 4215 appended_ = a2;
michael@0 4216 }
michael@0 4217 virtual char *GetAppendBuffer(int32_t min_capacity,
michael@0 4218 int32_t desired_capacity_hint,
michael@0 4219 char *scratch, int32_t scratch_capacity,
michael@0 4220 int32_t *result_capacity);
michael@0 4221 int32_t NumberOfBytesAppended() const { return appended_; }
michael@0 4222 /** @return FALSE if memory allocation failed */
michael@0 4223 UBool IsOk() const { return buffer_ != NULL; }
michael@0 4224
michael@0 4225 protected:
michael@0 4226 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
michael@0 4227 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
michael@0 4228
michael@0 4229 void SetNotOk() {
michael@0 4230 buffer_ = NULL;
michael@0 4231 capacity_ = 0;
michael@0 4232 }
michael@0 4233
michael@0 4234 char *buffer_;
michael@0 4235 int32_t capacity_;
michael@0 4236 int32_t appended_;
michael@0 4237
michael@0 4238 private:
michael@0 4239 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
michael@0 4240 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
michael@0 4241 };
michael@0 4242
michael@0 4243 SortKeyByteSink::~SortKeyByteSink() {}
michael@0 4244
michael@0 4245 void
michael@0 4246 SortKeyByteSink::Append(const char *bytes, int32_t n) {
michael@0 4247 if (n <= 0 || bytes == NULL) {
michael@0 4248 return;
michael@0 4249 }
michael@0 4250 int32_t length = appended_;
michael@0 4251 appended_ += n;
michael@0 4252 if ((buffer_ + length) == bytes) {
michael@0 4253 return; // the caller used GetAppendBuffer() and wrote the bytes already
michael@0 4254 }
michael@0 4255 int32_t available = capacity_ - length;
michael@0 4256 if (n <= available) {
michael@0 4257 uprv_memcpy(buffer_ + length, bytes, n);
michael@0 4258 } else {
michael@0 4259 AppendBeyondCapacity(bytes, n, length);
michael@0 4260 }
michael@0 4261 }
michael@0 4262
michael@0 4263 char *
michael@0 4264 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
michael@0 4265 int32_t desired_capacity_hint,
michael@0 4266 char *scratch,
michael@0 4267 int32_t scratch_capacity,
michael@0 4268 int32_t *result_capacity) {
michael@0 4269 if (min_capacity < 1 || scratch_capacity < min_capacity) {
michael@0 4270 *result_capacity = 0;
michael@0 4271 return NULL;
michael@0 4272 }
michael@0 4273 int32_t available = capacity_ - appended_;
michael@0 4274 if (available >= min_capacity) {
michael@0 4275 *result_capacity = available;
michael@0 4276 return buffer_ + appended_;
michael@0 4277 } else if (Resize(desired_capacity_hint, appended_)) {
michael@0 4278 *result_capacity = capacity_ - appended_;
michael@0 4279 return buffer_ + appended_;
michael@0 4280 } else {
michael@0 4281 *result_capacity = scratch_capacity;
michael@0 4282 return scratch;
michael@0 4283 }
michael@0 4284 }
michael@0 4285
michael@0 4286 class FixedSortKeyByteSink : public SortKeyByteSink {
michael@0 4287 public:
michael@0 4288 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
michael@0 4289 : SortKeyByteSink(dest, destCapacity) {}
michael@0 4290 virtual ~FixedSortKeyByteSink();
michael@0 4291
michael@0 4292 private:
michael@0 4293 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
michael@0 4294 virtual UBool Resize(int32_t appendCapacity, int32_t length);
michael@0 4295 };
michael@0 4296
michael@0 4297 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
michael@0 4298
michael@0 4299 void
michael@0 4300 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
michael@0 4301 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
michael@0 4302 // Fill the buffer completely.
michael@0 4303 int32_t available = capacity_ - length;
michael@0 4304 if (available > 0) {
michael@0 4305 uprv_memcpy(buffer_ + length, bytes, available);
michael@0 4306 }
michael@0 4307 }
michael@0 4308
michael@0 4309 UBool
michael@0 4310 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
michael@0 4311 return FALSE;
michael@0 4312 }
michael@0 4313
michael@0 4314 class CollationKeyByteSink : public SortKeyByteSink {
michael@0 4315 public:
michael@0 4316 CollationKeyByteSink(CollationKey &key)
michael@0 4317 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
michael@0 4318 key_(key) {}
michael@0 4319 virtual ~CollationKeyByteSink();
michael@0 4320
michael@0 4321 private:
michael@0 4322 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
michael@0 4323 virtual UBool Resize(int32_t appendCapacity, int32_t length);
michael@0 4324
michael@0 4325 CollationKey &key_;
michael@0 4326 };
michael@0 4327
michael@0 4328 CollationKeyByteSink::~CollationKeyByteSink() {}
michael@0 4329
michael@0 4330 void
michael@0 4331 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
michael@0 4332 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
michael@0 4333 if (Resize(n, length)) {
michael@0 4334 uprv_memcpy(buffer_ + length, bytes, n);
michael@0 4335 }
michael@0 4336 }
michael@0 4337
michael@0 4338 UBool
michael@0 4339 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
michael@0 4340 if (buffer_ == NULL) {
michael@0 4341 return FALSE; // allocation failed before already
michael@0 4342 }
michael@0 4343 int32_t newCapacity = 2 * capacity_;
michael@0 4344 int32_t altCapacity = length + 2 * appendCapacity;
michael@0 4345 if (newCapacity < altCapacity) {
michael@0 4346 newCapacity = altCapacity;
michael@0 4347 }
michael@0 4348 if (newCapacity < 200) {
michael@0 4349 newCapacity = 200;
michael@0 4350 }
michael@0 4351 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
michael@0 4352 if (newBuffer == NULL) {
michael@0 4353 SetNotOk();
michael@0 4354 return FALSE;
michael@0 4355 }
michael@0 4356 buffer_ = reinterpret_cast<char *>(newBuffer);
michael@0 4357 capacity_ = newCapacity;
michael@0 4358 return TRUE;
michael@0 4359 }
michael@0 4360
michael@0 4361 /**
michael@0 4362 * uint8_t byte buffer, similar to CharString but simpler.
michael@0 4363 */
michael@0 4364 class SortKeyLevel : public UMemory {
michael@0 4365 public:
michael@0 4366 SortKeyLevel() : len(0), ok(TRUE) {}
michael@0 4367 ~SortKeyLevel() {}
michael@0 4368
michael@0 4369 /** @return FALSE if memory allocation failed */
michael@0 4370 UBool isOk() const { return ok; }
michael@0 4371 UBool isEmpty() const { return len == 0; }
michael@0 4372 int32_t length() const { return len; }
michael@0 4373 const uint8_t *data() const { return buffer.getAlias(); }
michael@0 4374 uint8_t operator[](int32_t index) const { return buffer[index]; }
michael@0 4375
michael@0 4376 void appendByte(uint32_t b);
michael@0 4377
michael@0 4378 void appendTo(ByteSink &sink) const {
michael@0 4379 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
michael@0 4380 }
michael@0 4381
michael@0 4382 uint8_t &lastByte() {
michael@0 4383 U_ASSERT(len > 0);
michael@0 4384 return buffer[len - 1];
michael@0 4385 }
michael@0 4386
michael@0 4387 uint8_t *getLastFewBytes(int32_t n) {
michael@0 4388 if (ok && len >= n) {
michael@0 4389 return buffer.getAlias() + len - n;
michael@0 4390 } else {
michael@0 4391 return NULL;
michael@0 4392 }
michael@0 4393 }
michael@0 4394
michael@0 4395 private:
michael@0 4396 MaybeStackArray<uint8_t, 40> buffer;
michael@0 4397 int32_t len;
michael@0 4398 UBool ok;
michael@0 4399
michael@0 4400 UBool ensureCapacity(int32_t appendCapacity);
michael@0 4401
michael@0 4402 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
michael@0 4403 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
michael@0 4404 };
michael@0 4405
michael@0 4406 void SortKeyLevel::appendByte(uint32_t b) {
michael@0 4407 if(len < buffer.getCapacity() || ensureCapacity(1)) {
michael@0 4408 buffer[len++] = (uint8_t)b;
michael@0 4409 }
michael@0 4410 }
michael@0 4411
michael@0 4412 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
michael@0 4413 if(!ok) {
michael@0 4414 return FALSE;
michael@0 4415 }
michael@0 4416 int32_t newCapacity = 2 * buffer.getCapacity();
michael@0 4417 int32_t altCapacity = len + 2 * appendCapacity;
michael@0 4418 if (newCapacity < altCapacity) {
michael@0 4419 newCapacity = altCapacity;
michael@0 4420 }
michael@0 4421 if (newCapacity < 200) {
michael@0 4422 newCapacity = 200;
michael@0 4423 }
michael@0 4424 if(buffer.resize(newCapacity, len)==NULL) {
michael@0 4425 return ok = FALSE;
michael@0 4426 }
michael@0 4427 return TRUE;
michael@0 4428 }
michael@0 4429
michael@0 4430 U_NAMESPACE_END
michael@0 4431
michael@0 4432 /* sortkey API */
michael@0 4433 U_CAPI int32_t U_EXPORT2
michael@0 4434 ucol_getSortKey(const UCollator *coll,
michael@0 4435 const UChar *source,
michael@0 4436 int32_t sourceLength,
michael@0 4437 uint8_t *result,
michael@0 4438 int32_t resultLength)
michael@0 4439 {
michael@0 4440 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
michael@0 4441 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
michael@0 4442 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
michael@0 4443 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
michael@0 4444 }
michael@0 4445
michael@0 4446 if(coll->delegate != NULL) {
michael@0 4447 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
michael@0 4448 }
michael@0 4449
michael@0 4450 UErrorCode status = U_ZERO_ERROR;
michael@0 4451 int32_t keySize = 0;
michael@0 4452
michael@0 4453 if(source != NULL) {
michael@0 4454 // source == NULL is actually an error situation, but we would need to
michael@0 4455 // have an error code to return it. Until we introduce a new
michael@0 4456 // API, it stays like this
michael@0 4457
michael@0 4458 /* this uses the function pointer that is set in updateinternalstate */
michael@0 4459 /* currently, there are two funcs: */
michael@0 4460 /*ucol_calcSortKey(...);*/
michael@0 4461 /*ucol_calcSortKeySimpleTertiary(...);*/
michael@0 4462
michael@0 4463 uint8_t noDest[1] = { 0 };
michael@0 4464 if(result == NULL) {
michael@0 4465 // Distinguish pure preflighting from an allocation error.
michael@0 4466 result = noDest;
michael@0 4467 resultLength = 0;
michael@0 4468 }
michael@0 4469 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
michael@0 4470 coll->sortKeyGen(coll, source, sourceLength, sink, &status);
michael@0 4471 if(U_SUCCESS(status)) {
michael@0 4472 keySize = sink.NumberOfBytesAppended();
michael@0 4473 }
michael@0 4474 }
michael@0 4475 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
michael@0 4476 UTRACE_EXIT_STATUS(status);
michael@0 4477 return keySize;
michael@0 4478 }
michael@0 4479
michael@0 4480 U_CFUNC int32_t
michael@0 4481 ucol_getCollationKey(const UCollator *coll,
michael@0 4482 const UChar *source, int32_t sourceLength,
michael@0 4483 CollationKey &key,
michael@0 4484 UErrorCode &errorCode) {
michael@0 4485 CollationKeyByteSink sink(key);
michael@0 4486 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
michael@0 4487 return sink.NumberOfBytesAppended();
michael@0 4488 }
michael@0 4489
michael@0 4490 // Is this primary weight compressible?
michael@0 4491 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
michael@0 4492 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
michael@0 4493 static inline UBool
michael@0 4494 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
michael@0 4495 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
michael@0 4496 }
michael@0 4497
michael@0 4498 static
michael@0 4499 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
michael@0 4500 if (caseShift == 0) {
michael@0 4501 cases.appendByte(UCOL_CASE_BYTE_START);
michael@0 4502 caseShift = UCOL_CASE_SHIFT_START;
michael@0 4503 }
michael@0 4504 }
michael@0 4505
michael@0 4506 // Packs the secondary buffer when processing French locale.
michael@0 4507 static void
michael@0 4508 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
michael@0 4509 secondaries += secsize; // We read the secondary-level bytes back to front.
michael@0 4510 uint8_t secondary;
michael@0 4511 int32_t count2 = 0;
michael@0 4512 int32_t i = 0;
michael@0 4513 // we use i here since the key size already accounts for terminators, so we'll discard the increment
michael@0 4514 for(i = 0; i<secsize; i++) {
michael@0 4515 secondary = *(secondaries-i-1);
michael@0 4516 /* This is compression code. */
michael@0 4517 if (secondary == UCOL_COMMON2) {
michael@0 4518 ++count2;
michael@0 4519 } else {
michael@0 4520 if (count2 > 0) {
michael@0 4521 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
michael@0 4522 while (count2 > UCOL_TOP_COUNT2) {
michael@0 4523 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
michael@0 4524 count2 -= (uint32_t)UCOL_TOP_COUNT2;
michael@0 4525 }
michael@0 4526 result.Append(UCOL_COMMON_TOP2 - (count2-1));
michael@0 4527 } else {
michael@0 4528 while (count2 > UCOL_BOT_COUNT2) {
michael@0 4529 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 4530 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 4531 }
michael@0 4532 result.Append(UCOL_COMMON_BOT2 + (count2-1));
michael@0 4533 }
michael@0 4534 count2 = 0;
michael@0 4535 }
michael@0 4536 result.Append(secondary);
michael@0 4537 }
michael@0 4538 }
michael@0 4539 if (count2 > 0) {
michael@0 4540 while (count2 > UCOL_BOT_COUNT2) {
michael@0 4541 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 4542 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 4543 }
michael@0 4544 result.Append(UCOL_COMMON_BOT2 + (count2-1));
michael@0 4545 }
michael@0 4546 }
michael@0 4547
// NOTE(review): this macro is not referenced in the part of the file visible here — confirm it is still needed.
michael@0 4548 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
michael@0 4549
michael@0 4550 /* This is the sortkey work horse function */
/**
 * Builds the sort key for `source` by iterating over its collation elements (CEs).
 *
 * Primary bytes are appended to `result` as they are produced. Secondary,
 * tertiary, case and quaternary bytes are collected in separate SortKeyLevel
 * buffers and appended to `result` after the loop, each level preceded by
 * UCOL_LEVELTERMINATOR:
 *   - secondary level when strength >= UCOL_SECONDARY (packed back to front
 *     via packFrench() when French collation is on),
 *   - case level when coll->caseLevel is UCOL_ON,
 *   - tertiary level when strength >= UCOL_TERTIARY, and nested inside it the
 *     quaternary level (strength >= UCOL_QUATERNARY) and the identical level
 *     (strength == UCOL_IDENTICAL).
 * A final 0 byte terminates the key.
 *
 * @param coll          collator supplying strength, attributes and tertiary settings
 * @param source        source text
 * @param sourceLength  length of source, or -1 to have it measured with u_strlen()
 * @param result        sink that receives the key bytes
 * @param status        in/out error code; nothing is done if it already indicates
 *                      failure. Set to U_MEMORY_ALLOCATION_ERROR when a level
 *                      buffer or the sink reports not-ok and no other error is set.
 */
michael@0 4551 U_CFUNC void U_CALLCONV
michael@0 4552 ucol_calcSortKey(const UCollator *coll,
michael@0 4553 const UChar *source,
michael@0 4554 int32_t sourceLength,
michael@0 4555 SortKeyByteSink &result,
michael@0 4556 UErrorCode *status)
michael@0 4557 {
michael@0 4558 if(U_FAILURE(*status)) {
michael@0 4559 return;
michael@0 4560 }
michael@0 4561
// Primaries go straight into the output sink; the other levels are buffered.
michael@0 4562 SortKeyByteSink &primaries = result;
michael@0 4563 SortKeyLevel secondaries;
michael@0 4564 SortKeyLevel tertiaries;
michael@0 4565 SortKeyLevel cases;
michael@0 4566 SortKeyLevel quads;
michael@0 4567
michael@0 4568 UnicodeString normSource;
michael@0 4569
michael@0 4570 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
michael@0 4571
michael@0 4572 UColAttributeValue strength = coll->strength;
michael@0 4573
// compareSec/Ter/Quad are 0 when the level takes part in the key and 0xFF
// otherwise, so a test like `secondary > compareSec` can never be true for a
// uint8_t weight when the level is switched off.
michael@0 4574 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
michael@0 4575 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
michael@0 4576 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
michael@0 4577 UBool compareIdent = (strength == UCOL_IDENTICAL);
michael@0 4578 UBool doCase = (coll->caseLevel == UCOL_ON);
michael@0 4579 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
michael@0 4580 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
michael@0 4581 //UBool qShifted = shifted && (compareQuad == 0);
michael@0 4582 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
michael@0 4583
michael@0 4584 uint32_t variableTopValue = coll->variableTopValue;
michael@0 4585 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
michael@0 4586 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
// Despite the upper-case names, UCOL_COMMON_BOT4, UCOL_HIRAGANA_QUAD and
// UCOL_BOT_COUNT4 are local variables computed per call, not macros.
michael@0 4587 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
michael@0 4588 uint8_t UCOL_HIRAGANA_QUAD = 0;
michael@0 4589 if(doHiragana) {
michael@0 4590 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
michael@0 4591 /* allocate one more space for hiragana, value for hiragana */
michael@0 4592 }
michael@0 4593 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
michael@0 4594
michael@0 4595 /* support for special features like caselevel and funky secondaries */
michael@0 4596 int32_t lastSecondaryLength = 0;
michael@0 4597 uint32_t caseShift = 0;
michael@0 4598
michael@0 4599 /* If we need to normalize, we'll do it all at once at the beginning! */
// NFD is used for identical strength, FCD when normalization is on, and no
// normalizer at all otherwise.
michael@0 4600 const Normalizer2 *norm2;
michael@0 4601 if(compareIdent) {
michael@0 4602 norm2 = Normalizer2Factory::getNFDInstance(*status);
michael@0 4603 } else if(coll->normalizationMode != UCOL_OFF) {
michael@0 4604 norm2 = Normalizer2Factory::getFCDInstance(*status);
michael@0 4605 } else {
michael@0 4606 norm2 = NULL;
michael@0 4607 }
michael@0 4608 if(norm2 != NULL) {
michael@0 4609 normSource.setTo(FALSE, source, len);
michael@0 4610 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
// Only when the quick check stops before the end is the remaining tail
// normalized and `source`/`len` switched over to the normalized copy.
michael@0 4611 if(qcYesLength != len) {
michael@0 4612 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
michael@0 4613 normSource.truncate(qcYesLength);
michael@0 4614 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
michael@0 4615 source = normSource.getBuffer();
michael@0 4616 len = normSource.length();
michael@0 4617 }
michael@0 4618 }
michael@0 4619 collIterate s;
michael@0 4620 IInit_collIterate(coll, source, len, &s, status);
michael@0 4621 if(U_FAILURE(*status)) {
michael@0 4622 return;
michael@0 4623 }
michael@0 4624 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
michael@0 4625
michael@0 4626 uint32_t order = 0;
michael@0 4627
michael@0 4628 uint8_t primary1 = 0;
michael@0 4629 uint8_t primary2 = 0;
michael@0 4630 uint8_t secondary = 0;
michael@0 4631 uint8_t tertiary = 0;
michael@0 4632 uint8_t caseSwitch = coll->caseSwitch;
michael@0 4633 uint8_t tertiaryMask = coll->tertiaryMask;
michael@0 4634 int8_t tertiaryAddition = coll->tertiaryAddition;
michael@0 4635 uint8_t tertiaryTop = coll->tertiaryTop;
michael@0 4636 uint8_t tertiaryBottom = coll->tertiaryBottom;
michael@0 4637 uint8_t tertiaryCommon = coll->tertiaryCommon;
michael@0 4638 uint8_t caseBits = 0;
michael@0 4639
michael@0 4640 UBool wasShifted = FALSE;
michael@0 4641 UBool notIsContinuation = FALSE;
michael@0 4642
// count2/count3/count4 hold the length of the pending run of "common" weights
// on the secondary/tertiary/quaternary level; leadPrimary remembers the lead
// byte of the current compressed primary run (0 = no run open).
michael@0 4643 uint32_t count2 = 0, count3 = 0, count4 = 0;
michael@0 4644 uint8_t leadPrimary = 0;
michael@0 4645
// Main loop: one iteration per collation element, until UCOL_NO_MORE_CES.
michael@0 4646 for(;;) {
michael@0 4647 order = ucol_IGetNextCE(coll, &s, status);
michael@0 4648 if(order == UCOL_NO_MORE_CES) {
michael@0 4649 break;
michael@0 4650 }
michael@0 4651
michael@0 4652 if(order == 0) {
michael@0 4653 continue;
michael@0 4654 }
michael@0 4655
michael@0 4656 notIsContinuation = !isContinuation(order);
michael@0 4657
michael@0 4658 if(notIsContinuation) {
michael@0 4659 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
michael@0 4660 } else {
michael@0 4661 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
michael@0 4662 }
michael@0 4663
// `order` is shifted right in place twice below, so from here on it holds the
// upper 16 bits of the CE; that is the value compared with variableTopValue.
michael@0 4664 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 4665 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 4666 primary1 = (uint8_t)(order >> 8);
michael@0 4667
// The unpermuted lead byte is kept for the isCompressible() test further down.
michael@0 4668 uint8_t originalPrimary1 = primary1;
michael@0 4669 if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
michael@0 4670 primary1 = coll->leadBytePermutationTable[primary1];
michael@0 4671 }
michael@0 4672
michael@0 4673 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
michael@0 4674 || (!notIsContinuation && wasShifted)))
michael@0 4675 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
michael@0 4676 {
michael@0 4677 /* and other ignorables should be removed if following a shifted code point */
michael@0 4678 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
michael@0 4679 /* we should just completely ignore it */
michael@0 4680 continue;
michael@0 4681 }
michael@0 4682 if(compareQuad == 0) {
// Flush the pending quaternary run before writing the shifted primary bytes.
michael@0 4683 if(count4 > 0) {
michael@0 4684 while (count4 > UCOL_BOT_COUNT4) {
michael@0 4685 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
michael@0 4686 count4 -= UCOL_BOT_COUNT4;
michael@0 4687 }
michael@0 4688 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
michael@0 4689 count4 = 0;
michael@0 4690 }
michael@0 4691 /* We are dealing with a variable and we're treating them as shifted */
michael@0 4692 /* This is a shifted ignorable */
michael@0 4693 if(primary1 != 0) { /* we need to check this since we could be in continuation */
michael@0 4694 quads.appendByte(primary1);
michael@0 4695 }
michael@0 4696 if(primary2 != 0) {
michael@0 4697 quads.appendByte(primary2);
michael@0 4698 }
michael@0 4699 }
michael@0 4700 wasShifted = TRUE;
michael@0 4701 } else {
michael@0 4702 wasShifted = FALSE;
michael@0 4703 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
michael@0 4704 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
/* be zero with non zero primary1. */
michael@0 4705 /* regular and simple sortkey calc */
michael@0 4706 if(primary1 != UCOL_IGNORABLE) {
michael@0 4707 if(notIsContinuation) {
// Same lead byte as the open compressed run: only the second byte is written.
michael@0 4708 if(leadPrimary == primary1) {
michael@0 4709 primaries.Append(primary2);
michael@0 4710 } else {
// Close the open run with a marker byte that depends on whether the new
// lead byte is greater or smaller than the old one.
michael@0 4711 if(leadPrimary != 0) {
michael@0 4712 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
michael@0 4713 }
michael@0 4714 if(primary2 == UCOL_IGNORABLE) {
michael@0 4715 /* one byter, not compressed */
michael@0 4716 primaries.Append(primary1);
michael@0 4717 leadPrimary = 0;
michael@0 4718 } else if(isCompressible(coll, originalPrimary1)) {
michael@0 4719 /* compress */
michael@0 4720 primaries.Append(leadPrimary = primary1, primary2);
michael@0 4721 } else {
michael@0 4722 leadPrimary = 0;
michael@0 4723 primaries.Append(primary1, primary2);
michael@0 4724 }
michael@0 4725 }
michael@0 4726 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
michael@0 4727 if(primary2 == UCOL_IGNORABLE) {
michael@0 4728 primaries.Append(primary1);
michael@0 4729 } else {
michael@0 4730 primaries.Append(primary1, primary2);
michael@0 4731 }
michael@0 4732 }
michael@0 4733 }
michael@0 4734
michael@0 4735 if(secondary > compareSec) {
michael@0 4736 if(!isFrenchSec) {
michael@0 4737 /* This is compression code. */
michael@0 4738 if (secondary == UCOL_COMMON2 && notIsContinuation) {
michael@0 4739 ++count2;
michael@0 4740 } else {
michael@0 4741 if (count2 > 0) {
michael@0 4742 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
michael@0 4743 while (count2 > UCOL_TOP_COUNT2) {
michael@0 4744 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
michael@0 4745 count2 -= (uint32_t)UCOL_TOP_COUNT2;
michael@0 4746 }
michael@0 4747 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
michael@0 4748 } else {
michael@0 4749 while (count2 > UCOL_BOT_COUNT2) {
michael@0 4750 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 4751 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 4752 }
michael@0 4753 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
michael@0 4754 }
michael@0 4755 count2 = 0;
michael@0 4756 }
michael@0 4757 secondaries.appendByte(secondary);
michael@0 4758 }
michael@0 4759 } else {
michael@0 4760 /* Do the special handling for French secondaries */
michael@0 4761 /* We need to get continuation elements and do intermediate restore */
michael@0 4762 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
// French secondaries are stored uncompressed here; compression happens later
// in packFrench(). lastSecondaryLength counts the bytes of the current
// CE-plus-continuations group so the group can be reversed in place.
michael@0 4763 if(notIsContinuation) {
michael@0 4764 if (lastSecondaryLength > 1) {
michael@0 4765 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
michael@0 4766 if (frenchStartPtr != NULL) {
michael@0 4767 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
michael@0 4768 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
michael@0 4769 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
michael@0 4770 }
michael@0 4771 }
michael@0 4772 lastSecondaryLength = 1;
michael@0 4773 } else {
michael@0 4774 ++lastSecondaryLength;
michael@0 4775 }
michael@0 4776 secondaries.appendByte(secondary);
michael@0 4777 }
michael@0 4778 }
michael@0 4779
michael@0 4780 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
michael@0 4781 // do the case level if we need to do it. We don't want to calculate
michael@0 4782 // case level for primary ignorables if we have only primary strength and case level
michael@0 4783 // otherwise we would break well formedness of CEs
// Case bits are packed into the last byte of `cases`; caseShift is the number
// of bit positions still free in that byte and doCaseShift() starts a new
// byte when it reaches 0.
michael@0 4784 doCaseShift(cases, caseShift);
michael@0 4785 if(notIsContinuation) {
michael@0 4786 caseBits = (uint8_t)(tertiary & 0xC0);
michael@0 4787
michael@0 4788 if(tertiary != 0) {
michael@0 4789 if(coll->caseFirst == UCOL_UPPER_FIRST) {
michael@0 4790 if((caseBits & 0xC0) == 0) {
michael@0 4791 cases.lastByte() |= 1 << (--caseShift);
michael@0 4792 } else {
// OR-ing with 0 changes nothing; the statement is there for --caseShift.
michael@0 4793 cases.lastByte() |= 0 << (--caseShift);
michael@0 4794 /* second bit */
michael@0 4795 doCaseShift(cases, caseShift);
michael@0 4796 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
michael@0 4797 }
michael@0 4798 } else {
michael@0 4799 if((caseBits & 0xC0) == 0) {
// OR-ing with 0 changes nothing; the statement is there for --caseShift.
michael@0 4800 cases.lastByte() |= 0 << (--caseShift);
michael@0 4801 } else {
michael@0 4802 cases.lastByte() |= 1 << (--caseShift);
michael@0 4803 /* second bit */
michael@0 4804 doCaseShift(cases, caseShift);
michael@0 4805 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
michael@0 4806 }
michael@0 4807 }
michael@0 4808 }
michael@0 4809 }
michael@0 4810 } else {
michael@0 4811 if(notIsContinuation) {
michael@0 4812 tertiary ^= caseSwitch;
michael@0 4813 }
michael@0 4814 }
michael@0 4815
michael@0 4816 tertiary &= tertiaryMask;
michael@0 4817 if(tertiary > compareTer) {
michael@0 4818 /* This is compression code. */
michael@0 4819 /* sequence size check is included in the if clause */
michael@0 4820 if (tertiary == tertiaryCommon && notIsContinuation) {
michael@0 4821 ++count3;
michael@0 4822 } else {
michael@0 4823 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
michael@0 4824 tertiary += tertiaryAddition;
michael@0 4825 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
michael@0 4826 tertiary -= tertiaryAddition;
michael@0 4827 }
michael@0 4828 if (count3 > 0) {
michael@0 4829 if ((tertiary > tertiaryCommon)) {
michael@0 4830 while (count3 > coll->tertiaryTopCount) {
michael@0 4831 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
michael@0 4832 count3 -= (uint32_t)coll->tertiaryTopCount;
michael@0 4833 }
michael@0 4834 tertiaries.appendByte(tertiaryTop - (count3-1));
michael@0 4835 } else {
michael@0 4836 while (count3 > coll->tertiaryBottomCount) {
michael@0 4837 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
michael@0 4838 count3 -= (uint32_t)coll->tertiaryBottomCount;
michael@0 4839 }
michael@0 4840 tertiaries.appendByte(tertiaryBottom + (count3-1));
michael@0 4841 }
michael@0 4842 count3 = 0;
michael@0 4843 }
michael@0 4844 tertiaries.appendByte(tertiary);
michael@0 4845 }
michael@0 4846 }
michael@0 4847
michael@0 4848 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
michael@0 4849 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
michael@0 4850 if(count4>0) { // Close this part
michael@0 4851 while (count4 > UCOL_BOT_COUNT4) {
michael@0 4852 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
michael@0 4853 count4 -= UCOL_BOT_COUNT4;
michael@0 4854 }
michael@0 4855 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
michael@0 4856 count4 = 0;
michael@0 4857 }
michael@0 4858 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
michael@0 4859 } else { // This wasn't Hiragana, so we can continue adding stuff
michael@0 4860 count4++;
michael@0 4861 }
michael@0 4862 }
michael@0 4863 }
michael@0 4864 }
michael@0 4865
michael@0 4866 /* Here, we are generally done with processing */
michael@0 4867 /* bailing out would not be too productive */
michael@0 4868
// `ok` collects the allocation state of the level buffers as they are emitted.
michael@0 4869 UBool ok = TRUE;
michael@0 4870 if(U_SUCCESS(*status)) {
michael@0 4871 /* we have done all the CE's, now let's put them together to form a key */
michael@0 4872 if(compareSec == 0) {
// Flush the trailing run of common secondaries (bottom-style).
michael@0 4873 if (count2 > 0) {
michael@0 4874 while (count2 > UCOL_BOT_COUNT2) {
michael@0 4875 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 4876 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 4877 }
michael@0 4878 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
michael@0 4879 }
michael@0 4880 result.Append(UCOL_LEVELTERMINATOR);
michael@0 4881 if(!secondaries.isOk()) {
michael@0 4882 ok = FALSE;
michael@0 4883 } else if(!isFrenchSec) {
michael@0 4884 secondaries.appendTo(result);
michael@0 4885 } else {
michael@0 4886 // If there are any unresolved continuation secondaries,
michael@0 4887 // reverse them here so that we can reverse the whole secondary thing.
michael@0 4888 if (lastSecondaryLength > 1) {
michael@0 4889 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
michael@0 4890 if (frenchStartPtr != NULL) {
michael@0 4891 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
michael@0 4892 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
michael@0 4893 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
michael@0 4894 }
michael@0 4895 }
michael@0 4896 packFrench(secondaries.data(), secondaries.length(), result);
michael@0 4897 }
michael@0 4898 }
michael@0 4899
// The case level is emitted whenever it is switched on, independently of
// whether the secondary level was written.
michael@0 4900 if(doCase) {
michael@0 4901 ok &= cases.isOk();
michael@0 4902 result.Append(UCOL_LEVELTERMINATOR);
michael@0 4903 cases.appendTo(result);
michael@0 4904 }
michael@0 4905
michael@0 4906 if(compareTer == 0) {
// NOTE(review): unlike the in-loop flush (which uses `>` and `count3-1`), the
// top-style branch of this trailing flush uses `>=` and `count3`. The
// difference looks deliberate but is not explained here — confirm before changing.
michael@0 4907 if (count3 > 0) {
michael@0 4908 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
michael@0 4909 while (count3 >= coll->tertiaryTopCount) {
michael@0 4910 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
michael@0 4911 count3 -= (uint32_t)coll->tertiaryTopCount;
michael@0 4912 }
michael@0 4913 tertiaries.appendByte(tertiaryTop - count3);
michael@0 4914 } else {
michael@0 4915 while (count3 > coll->tertiaryBottomCount) {
michael@0 4916 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
michael@0 4917 count3 -= (uint32_t)coll->tertiaryBottomCount;
michael@0 4918 }
michael@0 4919 tertiaries.appendByte(tertiaryBottom + (count3-1));
michael@0 4920 }
michael@0 4921 }
michael@0 4922 ok &= tertiaries.isOk();
michael@0 4923 result.Append(UCOL_LEVELTERMINATOR);
michael@0 4924 tertiaries.appendTo(result);
michael@0 4925
michael@0 4926 if(compareQuad == 0/*qShifted == TRUE*/) {
michael@0 4927 if(count4 > 0) {
michael@0 4928 while (count4 > UCOL_BOT_COUNT4) {
michael@0 4929 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
michael@0 4930 count4 -= UCOL_BOT_COUNT4;
michael@0 4931 }
michael@0 4932 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
michael@0 4933 }
michael@0 4934 ok &= quads.isOk();
michael@0 4935 result.Append(UCOL_LEVELTERMINATOR);
michael@0 4936 quads.appendTo(result);
michael@0 4937 }
michael@0 4938
michael@0 4939 if(compareIdent) {
michael@0 4940 result.Append(UCOL_LEVELTERMINATOR);
michael@0 4941 u_writeIdenticalLevelRun(s.string, len, result);
michael@0 4942 }
michael@0 4943 }
// Key terminator; only written when no error occurred during iteration.
michael@0 4944 result.Append(0);
michael@0 4945 }
michael@0 4946
michael@0 4947 /* To avoid memory leak, free the offset buffer if necessary. */
michael@0 4948 ucol_freeOffsetBuffer(&s);
michael@0 4949
// A failed level buffer or a not-ok sink is reported as an allocation error,
// unless another error is already recorded in *status.
michael@0 4950 ok &= result.IsOk();
michael@0 4951 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
michael@0 4952 }
michael@0 4953
michael@0 4954
michael@0 4955 U_CFUNC void U_CALLCONV
michael@0 4956 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
michael@0 4957 const UChar *source,
michael@0 4958 int32_t sourceLength,
michael@0 4959 SortKeyByteSink &result,
michael@0 4960 UErrorCode *status)
michael@0 4961 {
michael@0 4962 U_ALIGN_CODE(16);
michael@0 4963
michael@0 4964 if(U_FAILURE(*status)) {
michael@0 4965 return;
michael@0 4966 }
michael@0 4967
michael@0 4968 SortKeyByteSink &primaries = result;
michael@0 4969 SortKeyLevel secondaries;
michael@0 4970 SortKeyLevel tertiaries;
michael@0 4971
michael@0 4972 UnicodeString normSource;
michael@0 4973
michael@0 4974 int32_t len = sourceLength;
michael@0 4975
michael@0 4976 /* If we need to normalize, we'll do it all at once at the beginning! */
michael@0 4977 if(coll->normalizationMode != UCOL_OFF) {
michael@0 4978 normSource.setTo(len < 0, source, len);
michael@0 4979 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
michael@0 4980 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
michael@0 4981 if(qcYesLength != normSource.length()) {
michael@0 4982 UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
michael@0 4983 normSource.truncate(qcYesLength);
michael@0 4984 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
michael@0 4985 source = normSource.getBuffer();
michael@0 4986 len = normSource.length();
michael@0 4987 }
michael@0 4988 }
michael@0 4989 collIterate s;
michael@0 4990 IInit_collIterate(coll, (UChar *)source, len, &s, status);
michael@0 4991 if(U_FAILURE(*status)) {
michael@0 4992 return;
michael@0 4993 }
michael@0 4994 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
michael@0 4995
michael@0 4996 uint32_t order = 0;
michael@0 4997
michael@0 4998 uint8_t primary1 = 0;
michael@0 4999 uint8_t primary2 = 0;
michael@0 5000 uint8_t secondary = 0;
michael@0 5001 uint8_t tertiary = 0;
michael@0 5002 uint8_t caseSwitch = coll->caseSwitch;
michael@0 5003 uint8_t tertiaryMask = coll->tertiaryMask;
michael@0 5004 int8_t tertiaryAddition = coll->tertiaryAddition;
michael@0 5005 uint8_t tertiaryTop = coll->tertiaryTop;
michael@0 5006 uint8_t tertiaryBottom = coll->tertiaryBottom;
michael@0 5007 uint8_t tertiaryCommon = coll->tertiaryCommon;
michael@0 5008
michael@0 5009 UBool notIsContinuation = FALSE;
michael@0 5010
michael@0 5011 uint32_t count2 = 0, count3 = 0;
michael@0 5012 uint8_t leadPrimary = 0;
michael@0 5013
michael@0 5014 for(;;) {
michael@0 5015 order = ucol_IGetNextCE(coll, &s, status);
michael@0 5016
michael@0 5017 if(order == 0) {
michael@0 5018 continue;
michael@0 5019 }
michael@0 5020
michael@0 5021 if(order == UCOL_NO_MORE_CES) {
michael@0 5022 break;
michael@0 5023 }
michael@0 5024
michael@0 5025 notIsContinuation = !isContinuation(order);
michael@0 5026
michael@0 5027 if(notIsContinuation) {
michael@0 5028 tertiary = (uint8_t)((order & tertiaryMask));
michael@0 5029 } else {
michael@0 5030 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
michael@0 5031 }
michael@0 5032
michael@0 5033 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 5034 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 5035 primary1 = (uint8_t)(order >> 8);
michael@0 5036
michael@0 5037 uint8_t originalPrimary1 = primary1;
michael@0 5038 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
michael@0 5039 primary1 = coll->leadBytePermutationTable[primary1];
michael@0 5040 }
michael@0 5041
michael@0 5042 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
michael@0 5043 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
michael@0 5044 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
michael@0 5045 /* regular and simple sortkey calc */
michael@0 5046 if(primary1 != UCOL_IGNORABLE) {
michael@0 5047 if(notIsContinuation) {
michael@0 5048 if(leadPrimary == primary1) {
michael@0 5049 primaries.Append(primary2);
michael@0 5050 } else {
michael@0 5051 if(leadPrimary != 0) {
michael@0 5052 primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
michael@0 5053 }
michael@0 5054 if(primary2 == UCOL_IGNORABLE) {
michael@0 5055 /* one byter, not compressed */
michael@0 5056 primaries.Append(primary1);
michael@0 5057 leadPrimary = 0;
michael@0 5058 } else if(isCompressible(coll, originalPrimary1)) {
michael@0 5059 /* compress */
michael@0 5060 primaries.Append(leadPrimary = primary1, primary2);
michael@0 5061 } else {
michael@0 5062 leadPrimary = 0;
michael@0 5063 primaries.Append(primary1, primary2);
michael@0 5064 }
michael@0 5065 }
michael@0 5066 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
michael@0 5067 if(primary2 == UCOL_IGNORABLE) {
michael@0 5068 primaries.Append(primary1);
michael@0 5069 } else {
michael@0 5070 primaries.Append(primary1, primary2);
michael@0 5071 }
michael@0 5072 }
michael@0 5073 }
michael@0 5074
michael@0 5075 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
michael@0 5076 /* This is compression code. */
michael@0 5077 if (secondary == UCOL_COMMON2 && notIsContinuation) {
michael@0 5078 ++count2;
michael@0 5079 } else {
michael@0 5080 if (count2 > 0) {
michael@0 5081 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
michael@0 5082 while (count2 > UCOL_TOP_COUNT2) {
michael@0 5083 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
michael@0 5084 count2 -= (uint32_t)UCOL_TOP_COUNT2;
michael@0 5085 }
michael@0 5086 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
michael@0 5087 } else {
michael@0 5088 while (count2 > UCOL_BOT_COUNT2) {
michael@0 5089 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 5090 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 5091 }
michael@0 5092 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
michael@0 5093 }
michael@0 5094 count2 = 0;
michael@0 5095 }
michael@0 5096 secondaries.appendByte(secondary);
michael@0 5097 }
michael@0 5098 }
michael@0 5099
michael@0 5100 if(notIsContinuation) {
michael@0 5101 tertiary ^= caseSwitch;
michael@0 5102 }
michael@0 5103
michael@0 5104 if(tertiary > 0) {
michael@0 5105 /* This is compression code. */
michael@0 5106 /* sequence size check is included in the if clause */
michael@0 5107 if (tertiary == tertiaryCommon && notIsContinuation) {
michael@0 5108 ++count3;
michael@0 5109 } else {
michael@0 5110 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
michael@0 5111 tertiary += tertiaryAddition;
michael@0 5112 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
michael@0 5113 tertiary -= tertiaryAddition;
michael@0 5114 }
michael@0 5115 if (count3 > 0) {
michael@0 5116 if ((tertiary > tertiaryCommon)) {
michael@0 5117 while (count3 > coll->tertiaryTopCount) {
michael@0 5118 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
michael@0 5119 count3 -= (uint32_t)coll->tertiaryTopCount;
michael@0 5120 }
michael@0 5121 tertiaries.appendByte(tertiaryTop - (count3-1));
michael@0 5122 } else {
michael@0 5123 while (count3 > coll->tertiaryBottomCount) {
michael@0 5124 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
michael@0 5125 count3 -= (uint32_t)coll->tertiaryBottomCount;
michael@0 5126 }
michael@0 5127 tertiaries.appendByte(tertiaryBottom + (count3-1));
michael@0 5128 }
michael@0 5129 count3 = 0;
michael@0 5130 }
michael@0 5131 tertiaries.appendByte(tertiary);
michael@0 5132 }
michael@0 5133 }
michael@0 5134 }
michael@0 5135
michael@0 5136 UBool ok = TRUE;
michael@0 5137 if(U_SUCCESS(*status)) {
michael@0 5138 /* we have done all the CE's, now let's put them together to form a key */
michael@0 5139 if (count2 > 0) {
michael@0 5140 while (count2 > UCOL_BOT_COUNT2) {
michael@0 5141 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
michael@0 5142 count2 -= (uint32_t)UCOL_BOT_COUNT2;
michael@0 5143 }
michael@0 5144 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
michael@0 5145 }
michael@0 5146 ok &= secondaries.isOk();
michael@0 5147 result.Append(UCOL_LEVELTERMINATOR);
michael@0 5148 secondaries.appendTo(result);
michael@0 5149
michael@0 5150 if (count3 > 0) {
michael@0 5151 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
michael@0 5152 while (count3 >= coll->tertiaryTopCount) {
michael@0 5153 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
michael@0 5154 count3 -= (uint32_t)coll->tertiaryTopCount;
michael@0 5155 }
michael@0 5156 tertiaries.appendByte(tertiaryTop - count3);
michael@0 5157 } else {
michael@0 5158 while (count3 > coll->tertiaryBottomCount) {
michael@0 5159 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
michael@0 5160 count3 -= (uint32_t)coll->tertiaryBottomCount;
michael@0 5161 }
michael@0 5162 tertiaries.appendByte(tertiaryBottom + (count3-1));
michael@0 5163 }
michael@0 5164 }
michael@0 5165 ok &= tertiaries.isOk();
michael@0 5166 result.Append(UCOL_LEVELTERMINATOR);
michael@0 5167 tertiaries.appendTo(result);
michael@0 5168
michael@0 5169 result.Append(0);
michael@0 5170 }
michael@0 5171
michael@0 5172 /* To avoid memory leak, free the offset buffer if necessary. */
michael@0 5173 ucol_freeOffsetBuffer(&s);
michael@0 5174
michael@0 5175 ok &= result.IsOk();
michael@0 5176 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
michael@0 5177 }
michael@0 5178
michael@0 5179 static inline
michael@0 5180 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
michael@0 5181 UBool notIsContinuation = !isContinuation(CE);
michael@0 5182 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
michael@0 5183 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
michael@0 5184 || (!notIsContinuation && *wasShifted)))
michael@0 5185 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
michael@0 5186 {
michael@0 5187 // The stuff below should probably be in the sortkey code... maybe not...
michael@0 5188 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
michael@0 5189 /* we should just completely ignore it */
michael@0 5190 *wasShifted = TRUE;
michael@0 5191 //continue;
michael@0 5192 }
michael@0 5193 //*wasShifted = TRUE;
michael@0 5194 return TRUE;
michael@0 5195 } else {
michael@0 5196 *wasShifted = FALSE;
michael@0 5197 return FALSE;
michael@0 5198 }
michael@0 5199 }
michael@0 5200 static inline
michael@0 5201 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
michael@0 5202 if(level < maxLevel) {
michael@0 5203 dest[i++] = UCOL_LEVELTERMINATOR;
michael@0 5204 } else {
michael@0 5205 dest[i++] = 0;
michael@0 5206 }
michael@0 5207 }
michael@0 5208
/**
 * Level identifiers for partial sort key generation.
 * UCOL_PSK_LEVEL_MASK is 7, so a stored level is one of the values 0..7.
 */
enum {
    /** primary level */
    UCOL_PSK_PRIMARY = 0,
    /** secondary level */
    UCOL_PSK_SECONDARY = 1,
    /** case level */
    UCOL_PSK_CASE = 2,
    /** tertiary level */
    UCOL_PSK_TERTIARY = 3,
    /** quaternary level */
    UCOL_PSK_QUATERNARY = 4,
    /** extra level, not used - but there are three bits to spend */
    UCOL_PSK_QUIN = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL = 6,
    /** level for the end of the sort key; will just produce zeros */
    UCOL_PSK_NULL = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT
};
michael@0 5221
/**
 * Layout of the collation processing state kept in state[1].
 * To read a field, shift the state word right by its *_SHIFT value and
 * AND the result with the matching *_MASK value.
 */
enum {
    /* bits 0..2: level identifier, one of the UCOL_PSK_* level values */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /* bit 3: number of bytes of a primary or quaternary already written.
     * Can be only 0 or 1, since we get up to two bytes from primary or
     * quaternary. The same bit also records that the French secondary
     * level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /* bit 4: was the last value shifted (Boolean, 0 or 1) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /* bits 5..6: how many French bytes have already been written.
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /* bits 7..8: bocsu bytes field (two bits) */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /* bits 9..27: consumed CEs field (19 bits) */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
michael@0 5247
// Macro calculating the number of expansion CEs available in the iterator
// state `s`: the difference (s).CEpos - (s).toReturn.
// The whole expansion is parenthesized so that the macro behaves as a single
// value next to any operator (e.g. !uprv_numAvailableExpCEs(s) or
// 2 * uprv_numAvailableExpCEs(s)) instead of binding only to (s).CEpos.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
michael@0 5250
michael@0 5251
michael@0 5252 /** main sortkey part procedure. On the first call,
michael@0 5253 * you should pass in a collator, an iterator, empty state
michael@0 5254 * state[0] == state[1] == 0, a buffer to hold results
michael@0 5255 * number of bytes you need and an error code pointer.
michael@0 5256 * Make sure your buffer is big enough to hold the wanted
michael@0 5257 * number of sortkey bytes. I don't check.
michael@0 5258 * The only meaningful status you can get back is
michael@0 5259 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
michael@0 5260 * have been dealt a raw deal and that you probably won't
michael@0 5261 * be able to use partial sortkey generation for this
michael@0 5262 * particular combination of string and collator. This
michael@0 5263 * is highly unlikely, but you should still check the error code.
michael@0 5264 * Any other status means that you're not in a sane situation
michael@0 5265 * anymore. After the first call, preserve state values and
michael@0 5266 * use them on subsequent calls to obtain more bytes of a sortkey.
michael@0 5267 * Use until the number of bytes written is smaller than the requested
michael@0 5268 * number of bytes. Generated sortkey is not compatible with the
michael@0 5269 * one generated by ucol_getSortKey, as we don't do any compression.
michael@0 5270 * However, levels are still terminated by a 1 (one) and the sortkey
michael@0 5271 * is terminated by a 0 (zero). Identical level is the same as in the
michael@0 5272 * regular sortkey - internal bocu-1 implementation is used.
michael@0 5273 * For the curious, although you cannot do much about this, here is
michael@0 5274 * the structure of state words.
michael@0 5275 * state[0] - iterator state. Depends on the iterator implementation,
michael@0 5276 * but allows the iterator to continue where it stopped in
michael@0 5277 * the last iteration.
michael@0 5278 * state[1] - collation processing state. Here is the distribution
michael@0 5279 * of the bits:
michael@0 5280 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
michael@0 5281 * quaternary, quin (we don't use this one), identical and
michael@0 5282 * null (producing only zeroes - first one to terminate the
michael@0 5283 * sortkey and subsequent to fill the buffer).
michael@0 5284 * 3 - byte count. Number of bytes written on the primary level.
michael@0 5285 * 4 - was shifted. Whether the previous iteration finished in the
michael@0 5286 * shifted state.
michael@0 5287 * 5, 6 - French continuation bytes written. See the comment in the enum
michael@0 5288 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
michael@0 5289 * the identical level.
michael@0 5290 * 9..27 - CEs consumed (19 bits, see UCOL_PSK_CONSUMED_CES_MASK). Number of getCE or next32 operations performed
michael@0 5291 * since the last successful update of the iterator state.
michael@0 5292 */
michael@0 5293 U_CAPI int32_t U_EXPORT2
michael@0 5294 ucol_nextSortKeyPart(const UCollator *coll,
michael@0 5295 UCharIterator *iter,
michael@0 5296 uint32_t state[2],
michael@0 5297 uint8_t *dest, int32_t count,
michael@0 5298 UErrorCode *status)
michael@0 5299 {
michael@0 5300 /* error checking */
michael@0 5301 if(status==NULL || U_FAILURE(*status)) {
michael@0 5302 return 0;
michael@0 5303 }
michael@0 5304 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
michael@0 5305 if( coll==NULL || iter==NULL ||
michael@0 5306 state==NULL ||
michael@0 5307 count<0 || (count>0 && dest==NULL)
michael@0 5308 ) {
michael@0 5309 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 5310 UTRACE_EXIT_STATUS(status);
michael@0 5311 return 0;
michael@0 5312 }
michael@0 5313
michael@0 5314 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
michael@0 5315 coll, iter, state[0], state[1], dest, count);
michael@0 5316
michael@0 5317 if(count==0) {
michael@0 5318 /* nothing to do */
michael@0 5319 UTRACE_EXIT_VALUE(0);
michael@0 5320 return 0;
michael@0 5321 }
michael@0 5322 /** Setting up situation according to the state we got from the previous iteration */
michael@0 5323 // The state of the iterator from the previous invocation
michael@0 5324 uint32_t iterState = state[0];
michael@0 5325 // Has the last iteration ended in the shifted state
michael@0 5326 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
michael@0 5327 // What is the current level of the sortkey?
michael@0 5328 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
michael@0 5329 // Have we written only one byte from a two byte primary in the previous iteration?
michael@0 5330 // Also on secondary level - have we finished with the French secondary?
michael@0 5331 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
michael@0 5332 // number of bytes in the continuation buffer for French
michael@0 5333 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
michael@0 5334 // Number of bytes already written from a bocsu sequence. Since
michael@0 5335 // the longes bocsu sequence is 4 long, this can be up to 3.
michael@0 5336 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
michael@0 5337 // Number of elements that need to be consumed in this iteration because
michael@0 5338 // the iterator returned UITER_NO_STATE at the end of the last iteration,
michael@0 5339 // so we had to save the last valid state.
michael@0 5340 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
michael@0 5341
michael@0 5342 /** values that depend on the collator attributes */
michael@0 5343 // strength of the collator.
michael@0 5344 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
michael@0 5345 // maximal level of the partial sortkey. Need to take whether case level is done
michael@0 5346 int32_t maxLevel = 0;
michael@0 5347 if(strength < UCOL_TERTIARY) {
michael@0 5348 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
michael@0 5349 maxLevel = UCOL_PSK_CASE;
michael@0 5350 } else {
michael@0 5351 maxLevel = strength;
michael@0 5352 }
michael@0 5353 } else {
michael@0 5354 if(strength == UCOL_TERTIARY) {
michael@0 5355 maxLevel = UCOL_PSK_TERTIARY;
michael@0 5356 } else if(strength == UCOL_QUATERNARY) {
michael@0 5357 maxLevel = UCOL_PSK_QUATERNARY;
michael@0 5358 } else { // identical
michael@0 5359 maxLevel = UCOL_IDENTICAL;
michael@0 5360 }
michael@0 5361 }
michael@0 5362 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
michael@0 5363 uint8_t UCOL_HIRAGANA_QUAD =
michael@0 5364 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
michael@0 5365 // Boundary value that decides whether a CE is shifted or not
michael@0 5366 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
michael@0 5367 // Are we doing French collation?
michael@0 5368 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
michael@0 5369
michael@0 5370 /** initializing the collation state */
michael@0 5371 UBool notIsContinuation = FALSE;
michael@0 5372 uint32_t CE = UCOL_NO_MORE_CES;
michael@0 5373
michael@0 5374 collIterate s;
michael@0 5375 IInit_collIterate(coll, NULL, -1, &s, status);
michael@0 5376 if(U_FAILURE(*status)) {
michael@0 5377 UTRACE_EXIT_STATUS(*status);
michael@0 5378 return 0;
michael@0 5379 }
michael@0 5380 s.iterator = iter;
michael@0 5381 s.flags |= UCOL_USE_ITERATOR;
michael@0 5382 // This variable tells us whether we have produced some other levels in this iteration
michael@0 5383 // before we moved to the identical level. In that case, we need to switch the
michael@0 5384 // type of the iterator.
michael@0 5385 UBool doingIdenticalFromStart = FALSE;
michael@0 5386 // Normalizing iterator
michael@0 5387 // The division for the array length may truncate the array size to
michael@0 5388 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
michael@0 5389 // for all platforms anyway.
michael@0 5390 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 5391 UNormIterator *normIter = NULL;
michael@0 5392 // If the normalization is turned on for the collator and we are below identical level
michael@0 5393 // we will use a FCD normalizing iterator
michael@0 5394 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
michael@0 5395 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
michael@0 5396 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
michael@0 5397 s.flags &= ~UCOL_ITER_NORM;
michael@0 5398 if(U_FAILURE(*status)) {
michael@0 5399 UTRACE_EXIT_STATUS(*status);
michael@0 5400 return 0;
michael@0 5401 }
michael@0 5402 } else if(level == UCOL_PSK_IDENTICAL) {
michael@0 5403 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
michael@0 5404 // will be updating the state - and this cannot be done on an ordinary iterator.
michael@0 5405 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
michael@0 5406 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
michael@0 5407 s.flags &= ~UCOL_ITER_NORM;
michael@0 5408 if(U_FAILURE(*status)) {
michael@0 5409 UTRACE_EXIT_STATUS(*status);
michael@0 5410 return 0;
michael@0 5411 }
michael@0 5412 doingIdenticalFromStart = TRUE;
michael@0 5413 }
michael@0 5414
michael@0 5415 // This is the tentative new state of the iterator. The problem
michael@0 5416 // is that the iterator might return an undefined state, in
michael@0 5417 // which case we should save the last valid state and increase
michael@0 5418 // the iterator skip value.
michael@0 5419 uint32_t newState = 0;
michael@0 5420
michael@0 5421 // First, we set the iterator to the last valid position
michael@0 5422 // from the last iteration. This was saved in state[0].
michael@0 5423 if(iterState == 0) {
michael@0 5424 /* initial state */
michael@0 5425 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
michael@0 5426 s.iterator->move(s.iterator, 0, UITER_LIMIT);
michael@0 5427 } else {
michael@0 5428 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5429 }
michael@0 5430 } else {
michael@0 5431 /* reset to previous state */
michael@0 5432 s.iterator->setState(s.iterator, iterState, status);
michael@0 5433 if(U_FAILURE(*status)) {
michael@0 5434 UTRACE_EXIT_STATUS(*status);
michael@0 5435 return 0;
michael@0 5436 }
michael@0 5437 }
michael@0 5438
michael@0 5439
michael@0 5440
michael@0 5441 // This variable tells us whether we can attempt to update the state
michael@0 5442 // of iterator. Situations where we don't want to update iterator state
michael@0 5443 // are the existence of expansion CEs that are not yet processed, and
michael@0 5444 // finishing the case level without enough space in the buffer to insert
michael@0 5445 // a level terminator.
michael@0 5446 UBool canUpdateState = TRUE;
michael@0 5447
michael@0 5448 // Consume all the CEs that were consumed at the end of the previous
michael@0 5449 // iteration without updating the iterator state. On identical level,
michael@0 5450 // consume the code points.
michael@0 5451 int32_t counter = cces;
michael@0 5452 if(level < UCOL_PSK_IDENTICAL) {
michael@0 5453 while(counter-->0) {
michael@0 5454 // If we're doing French and we are on the secondary level,
michael@0 5455 // we go backwards.
michael@0 5456 if(level == UCOL_PSK_SECONDARY && doingFrench) {
michael@0 5457 CE = ucol_IGetPrevCE(coll, &s, status);
michael@0 5458 } else {
michael@0 5459 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5460 }
michael@0 5461 if(CE==UCOL_NO_MORE_CES) {
michael@0 5462 /* should not happen */
michael@0 5463 *status=U_INTERNAL_PROGRAM_ERROR;
michael@0 5464 UTRACE_EXIT_STATUS(*status);
michael@0 5465 return 0;
michael@0 5466 }
michael@0 5467 if(uprv_numAvailableExpCEs(s)) {
michael@0 5468 canUpdateState = FALSE;
michael@0 5469 }
michael@0 5470 }
michael@0 5471 } else {
michael@0 5472 while(counter-->0) {
michael@0 5473 uiter_next32(s.iterator);
michael@0 5474 }
michael@0 5475 }
michael@0 5476
michael@0 5477 // French secondary needs to know whether the iterator state of zero came from previous level OR
michael@0 5478 // from a new invocation...
michael@0 5479 UBool wasDoingPrimary = FALSE;
michael@0 5480 // destination buffer byte counter. When this guy
michael@0 5481 // gets to count, we're done with the iteration
michael@0 5482 int32_t i = 0;
michael@0 5483 // used to count the zero bytes written after we
michael@0 5484 // have finished with the sort key
michael@0 5485 int32_t j = 0;
michael@0 5486
michael@0 5487
michael@0 5488 // Hm.... I think we're ready to plunge in. Basic story is as following:
michael@0 5489 // we have a fall through case based on level. This is used for initial
michael@0 5490 // positioning on iteration start. Every level processor contains a
michael@0 5491 // for(;;) which will be broken when we exhaust all the CEs. Other
michael@0 5492 // way to exit is a goto saveState, which happens when we have filled
michael@0 5493 // out our buffer.
michael@0 5494 switch(level) {
michael@0 5495 case UCOL_PSK_PRIMARY:
michael@0 5496 wasDoingPrimary = TRUE;
michael@0 5497 for(;;) {
michael@0 5498 if(i==count) {
michael@0 5499 goto saveState;
michael@0 5500 }
michael@0 5501 // We should save the state only if we
michael@0 5502 // are sure that we are done with the
michael@0 5503 // previous iterator state
michael@0 5504 if(canUpdateState && byteCountOrFrenchDone == 0) {
michael@0 5505 newState = s.iterator->getState(s.iterator);
michael@0 5506 if(newState != UITER_NO_STATE) {
michael@0 5507 iterState = newState;
michael@0 5508 cces = 0;
michael@0 5509 }
michael@0 5510 }
michael@0 5511 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5512 cces++;
michael@0 5513 if(CE==UCOL_NO_MORE_CES) {
michael@0 5514 // Add the level separator
michael@0 5515 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5516 byteCountOrFrenchDone=0;
michael@0 5517 // Restart the iteration an move to the
michael@0 5518 // second level
michael@0 5519 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5520 cces = 0;
michael@0 5521 level = UCOL_PSK_SECONDARY;
michael@0 5522 break;
michael@0 5523 }
michael@0 5524 if(!isContinuation(CE)){
michael@0 5525 if(coll->leadBytePermutationTable != NULL){
michael@0 5526 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
michael@0 5527 }
michael@0 5528 }
michael@0 5529 if(!isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5530 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
michael@0 5531 if(CE != 0) {
michael@0 5532 if(byteCountOrFrenchDone == 0) {
michael@0 5533 // get the second byte of primary
michael@0 5534 dest[i++]=(uint8_t)(CE >> 8);
michael@0 5535 } else {
michael@0 5536 byteCountOrFrenchDone = 0;
michael@0 5537 }
michael@0 5538 if((CE &=0xff)!=0) {
michael@0 5539 if(i==count) {
michael@0 5540 /* overflow */
michael@0 5541 byteCountOrFrenchDone = 1;
michael@0 5542 cces--;
michael@0 5543 goto saveState;
michael@0 5544 }
michael@0 5545 dest[i++]=(uint8_t)CE;
michael@0 5546 }
michael@0 5547 }
michael@0 5548 }
michael@0 5549 if(uprv_numAvailableExpCEs(s)) {
michael@0 5550 canUpdateState = FALSE;
michael@0 5551 } else {
michael@0 5552 canUpdateState = TRUE;
michael@0 5553 }
michael@0 5554 }
michael@0 5555 /* fall through to next level */
michael@0 5556 case UCOL_PSK_SECONDARY:
michael@0 5557 if(strength >= UCOL_SECONDARY) {
michael@0 5558 if(!doingFrench) {
michael@0 5559 for(;;) {
michael@0 5560 if(i == count) {
michael@0 5561 goto saveState;
michael@0 5562 }
michael@0 5563 // We should save the state only if we
michael@0 5564 // are sure that we are done with the
michael@0 5565 // previous iterator state
michael@0 5566 if(canUpdateState) {
michael@0 5567 newState = s.iterator->getState(s.iterator);
michael@0 5568 if(newState != UITER_NO_STATE) {
michael@0 5569 iterState = newState;
michael@0 5570 cces = 0;
michael@0 5571 }
michael@0 5572 }
michael@0 5573 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5574 cces++;
michael@0 5575 if(CE==UCOL_NO_MORE_CES) {
michael@0 5576 // Add the level separator
michael@0 5577 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5578 byteCountOrFrenchDone = 0;
michael@0 5579 // Restart the iteration an move to the
michael@0 5580 // second level
michael@0 5581 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5582 cces = 0;
michael@0 5583 level = UCOL_PSK_CASE;
michael@0 5584 break;
michael@0 5585 }
michael@0 5586 if(!isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5587 CE >>= 8; /* get secondary */
michael@0 5588 if(CE != 0) {
michael@0 5589 dest[i++]=(uint8_t)CE;
michael@0 5590 }
michael@0 5591 }
michael@0 5592 if(uprv_numAvailableExpCEs(s)) {
michael@0 5593 canUpdateState = FALSE;
michael@0 5594 } else {
michael@0 5595 canUpdateState = TRUE;
michael@0 5596 }
michael@0 5597 }
michael@0 5598 } else { // French secondary processing
michael@0 5599 uint8_t frenchBuff[UCOL_MAX_BUFFER];
michael@0 5600 int32_t frenchIndex = 0;
michael@0 5601 // Here we are going backwards.
michael@0 5602 // If the iterator is at the beggining, it should be
michael@0 5603 // moved to end.
michael@0 5604 if(wasDoingPrimary) {
michael@0 5605 s.iterator->move(s.iterator, 0, UITER_LIMIT);
michael@0 5606 cces = 0;
michael@0 5607 }
michael@0 5608 for(;;) {
michael@0 5609 if(i == count) {
michael@0 5610 goto saveState;
michael@0 5611 }
michael@0 5612 if(canUpdateState) {
michael@0 5613 newState = s.iterator->getState(s.iterator);
michael@0 5614 if(newState != UITER_NO_STATE) {
michael@0 5615 iterState = newState;
michael@0 5616 cces = 0;
michael@0 5617 }
michael@0 5618 }
michael@0 5619 CE = ucol_IGetPrevCE(coll, &s, status);
michael@0 5620 cces++;
michael@0 5621 if(CE==UCOL_NO_MORE_CES) {
michael@0 5622 // Add the level separator
michael@0 5623 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5624 byteCountOrFrenchDone = 0;
michael@0 5625 // Restart the iteration an move to the next level
michael@0 5626 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5627 level = UCOL_PSK_CASE;
michael@0 5628 break;
michael@0 5629 }
michael@0 5630 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
michael@0 5631 // reverse when we get a first non-continuation CE.
michael@0 5632 CE >>= 8;
michael@0 5633 frenchBuff[frenchIndex++] = (uint8_t)CE;
michael@0 5634 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5635 CE >>= 8; /* get secondary */
michael@0 5636 if(!frenchIndex) {
michael@0 5637 if(CE != 0) {
michael@0 5638 dest[i++]=(uint8_t)CE;
michael@0 5639 }
michael@0 5640 } else {
michael@0 5641 frenchBuff[frenchIndex++] = (uint8_t)CE;
michael@0 5642 frenchIndex -= usedFrench;
michael@0 5643 usedFrench = 0;
michael@0 5644 while(i < count && frenchIndex) {
michael@0 5645 dest[i++] = frenchBuff[--frenchIndex];
michael@0 5646 usedFrench++;
michael@0 5647 }
michael@0 5648 }
michael@0 5649 }
michael@0 5650 if(uprv_numAvailableExpCEs(s)) {
michael@0 5651 canUpdateState = FALSE;
michael@0 5652 } else {
michael@0 5653 canUpdateState = TRUE;
michael@0 5654 }
michael@0 5655 }
michael@0 5656 }
michael@0 5657 } else {
michael@0 5658 level = UCOL_PSK_CASE;
michael@0 5659 }
michael@0 5660 /* fall through to next level */
michael@0 5661 case UCOL_PSK_CASE:
michael@0 5662 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
michael@0 5663 uint32_t caseShift = UCOL_CASE_SHIFT_START;
michael@0 5664 uint8_t caseByte = UCOL_CASE_BYTE_START;
michael@0 5665 uint8_t caseBits = 0;
michael@0 5666
michael@0 5667 for(;;) {
michael@0 5668 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
michael@0 5669 if(i == count) {
michael@0 5670 goto saveState;
michael@0 5671 }
michael@0 5672 // We should save the state only if we
michael@0 5673 // are sure that we are done with the
michael@0 5674 // previous iterator state
michael@0 5675 if(canUpdateState) {
michael@0 5676 newState = s.iterator->getState(s.iterator);
michael@0 5677 if(newState != UITER_NO_STATE) {
michael@0 5678 iterState = newState;
michael@0 5679 cces = 0;
michael@0 5680 }
michael@0 5681 }
michael@0 5682 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5683 cces++;
michael@0 5684 if(CE==UCOL_NO_MORE_CES) {
michael@0 5685 // On the case level we might have an unfinished
michael@0 5686 // case byte. Add one if it's started.
michael@0 5687 if(caseShift != UCOL_CASE_SHIFT_START) {
michael@0 5688 dest[i++] = caseByte;
michael@0 5689 }
michael@0 5690 cces = 0;
michael@0 5691 // We have finished processing CEs on this level.
michael@0 5692 // However, we don't know if we have enough space
michael@0 5693 // to add a case level terminator.
michael@0 5694 if(i < count) {
michael@0 5695 // Add the level separator
michael@0 5696 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5697 // Restart the iteration and move to the
michael@0 5698 // next level
michael@0 5699 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5700 level = UCOL_PSK_TERTIARY;
michael@0 5701 } else {
michael@0 5702 canUpdateState = FALSE;
michael@0 5703 }
michael@0 5704 break;
michael@0 5705 }
michael@0 5706
michael@0 5707 if(!isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5708 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
michael@0 5709 // do the case level if we need to do it. We don't want to calculate
michael@0 5710 // case level for primary ignorables if we have only primary strength and case level
michael@0 5711 // otherwise we would break well formedness of CEs
michael@0 5712 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
michael@0 5713 caseBits = (uint8_t)(CE & 0xC0);
michael@0 5714 // this copies the case level logic from the
michael@0 5715 // sort key generation code
michael@0 5716 if(CE != 0) {
michael@0 5717 if (caseShift == 0) {
michael@0 5718 dest[i++] = caseByte;
michael@0 5719 caseShift = UCOL_CASE_SHIFT_START;
michael@0 5720 caseByte = UCOL_CASE_BYTE_START;
michael@0 5721 }
michael@0 5722 if(coll->caseFirst == UCOL_UPPER_FIRST) {
michael@0 5723 if((caseBits & 0xC0) == 0) {
michael@0 5724 caseByte |= 1 << (--caseShift);
michael@0 5725 } else {
michael@0 5726 caseByte |= 0 << (--caseShift);
michael@0 5727 /* second bit */
michael@0 5728 if(caseShift == 0) {
michael@0 5729 dest[i++] = caseByte;
michael@0 5730 caseShift = UCOL_CASE_SHIFT_START;
michael@0 5731 caseByte = UCOL_CASE_BYTE_START;
michael@0 5732 }
michael@0 5733 caseByte |= ((caseBits>>6)&1) << (--caseShift);
michael@0 5734 }
michael@0 5735 } else {
michael@0 5736 if((caseBits & 0xC0) == 0) {
michael@0 5737 caseByte |= 0 << (--caseShift);
michael@0 5738 } else {
michael@0 5739 caseByte |= 1 << (--caseShift);
michael@0 5740 /* second bit */
michael@0 5741 if(caseShift == 0) {
michael@0 5742 dest[i++] = caseByte;
michael@0 5743 caseShift = UCOL_CASE_SHIFT_START;
michael@0 5744 caseByte = UCOL_CASE_BYTE_START;
michael@0 5745 }
michael@0 5746 caseByte |= ((caseBits>>7)&1) << (--caseShift);
michael@0 5747 }
michael@0 5748 }
michael@0 5749 }
michael@0 5750
michael@0 5751 }
michael@0 5752 }
michael@0 5753 // Not sure this is correct for the case level - revisit
michael@0 5754 if(uprv_numAvailableExpCEs(s)) {
michael@0 5755 canUpdateState = FALSE;
michael@0 5756 } else {
michael@0 5757 canUpdateState = TRUE;
michael@0 5758 }
michael@0 5759 }
michael@0 5760 } else {
michael@0 5761 level = UCOL_PSK_TERTIARY;
michael@0 5762 }
michael@0 5763 /* fall through to next level */
michael@0 5764 case UCOL_PSK_TERTIARY:
michael@0 5765 if(strength >= UCOL_TERTIARY) {
michael@0 5766 for(;;) {
michael@0 5767 if(i == count) {
michael@0 5768 goto saveState;
michael@0 5769 }
michael@0 5770 // We should save the state only if we
michael@0 5771 // are sure that we are done with the
michael@0 5772 // previous iterator state
michael@0 5773 if(canUpdateState) {
michael@0 5774 newState = s.iterator->getState(s.iterator);
michael@0 5775 if(newState != UITER_NO_STATE) {
michael@0 5776 iterState = newState;
michael@0 5777 cces = 0;
michael@0 5778 }
michael@0 5779 }
michael@0 5780 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5781 cces++;
michael@0 5782 if(CE==UCOL_NO_MORE_CES) {
michael@0 5783 // Add the level separator
michael@0 5784 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5785 byteCountOrFrenchDone = 0;
michael@0 5786 // Restart the iteration an move to the
michael@0 5787 // second level
michael@0 5788 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5789 cces = 0;
michael@0 5790 level = UCOL_PSK_QUATERNARY;
michael@0 5791 break;
michael@0 5792 }
michael@0 5793 if(!isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5794 notIsContinuation = !isContinuation(CE);
michael@0 5795
michael@0 5796 if(notIsContinuation) {
michael@0 5797 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
michael@0 5798 CE ^= coll->caseSwitch;
michael@0 5799 CE &= coll->tertiaryMask;
michael@0 5800 } else {
michael@0 5801 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
michael@0 5802 }
michael@0 5803
michael@0 5804 if(CE != 0) {
michael@0 5805 dest[i++]=(uint8_t)CE;
michael@0 5806 }
michael@0 5807 }
michael@0 5808 if(uprv_numAvailableExpCEs(s)) {
michael@0 5809 canUpdateState = FALSE;
michael@0 5810 } else {
michael@0 5811 canUpdateState = TRUE;
michael@0 5812 }
michael@0 5813 }
michael@0 5814 } else {
michael@0 5815 // if we're not doing tertiary
michael@0 5816 // skip to the end
michael@0 5817 level = UCOL_PSK_NULL;
michael@0 5818 }
michael@0 5819 /* fall through to next level */
michael@0 5820 case UCOL_PSK_QUATERNARY:
michael@0 5821 if(strength >= UCOL_QUATERNARY) {
michael@0 5822 for(;;) {
michael@0 5823 if(i == count) {
michael@0 5824 goto saveState;
michael@0 5825 }
michael@0 5826 // We should save the state only if we
michael@0 5827 // are sure that we are done with the
michael@0 5828 // previous iterator state
michael@0 5829 if(canUpdateState) {
michael@0 5830 newState = s.iterator->getState(s.iterator);
michael@0 5831 if(newState != UITER_NO_STATE) {
michael@0 5832 iterState = newState;
michael@0 5833 cces = 0;
michael@0 5834 }
michael@0 5835 }
michael@0 5836 CE = ucol_IGetNextCE(coll, &s, status);
michael@0 5837 cces++;
michael@0 5838 if(CE==UCOL_NO_MORE_CES) {
michael@0 5839 // Add the level separator
michael@0 5840 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5841 //dest[i++] = UCOL_LEVELTERMINATOR;
michael@0 5842 byteCountOrFrenchDone = 0;
michael@0 5843 // Restart the iteration an move to the
michael@0 5844 // second level
michael@0 5845 s.iterator->move(s.iterator, 0, UITER_START);
michael@0 5846 cces = 0;
michael@0 5847 level = UCOL_PSK_QUIN;
michael@0 5848 break;
michael@0 5849 }
michael@0 5850 if(CE==0)
michael@0 5851 continue;
michael@0 5852 if(isShiftedCE(CE, LVT, &wasShifted)) {
michael@0 5853 CE >>= 16; /* get primary */
michael@0 5854 if(CE != 0) {
michael@0 5855 if(byteCountOrFrenchDone == 0) {
michael@0 5856 dest[i++]=(uint8_t)(CE >> 8);
michael@0 5857 } else {
michael@0 5858 byteCountOrFrenchDone = 0;
michael@0 5859 }
michael@0 5860 if((CE &=0xff)!=0) {
michael@0 5861 if(i==count) {
michael@0 5862 /* overflow */
michael@0 5863 byteCountOrFrenchDone = 1;
michael@0 5864 goto saveState;
michael@0 5865 }
michael@0 5866 dest[i++]=(uint8_t)CE;
michael@0 5867 }
michael@0 5868 }
michael@0 5869 } else {
michael@0 5870 notIsContinuation = !isContinuation(CE);
michael@0 5871 if(notIsContinuation) {
michael@0 5872 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
michael@0 5873 dest[i++] = UCOL_HIRAGANA_QUAD;
michael@0 5874 } else {
michael@0 5875 dest[i++] = 0xFF;
michael@0 5876 }
michael@0 5877 }
michael@0 5878 }
michael@0 5879 if(uprv_numAvailableExpCEs(s)) {
michael@0 5880 canUpdateState = FALSE;
michael@0 5881 } else {
michael@0 5882 canUpdateState = TRUE;
michael@0 5883 }
michael@0 5884 }
michael@0 5885 } else {
michael@0 5886 // if we're not doing quaternary
michael@0 5887 // skip to the end
michael@0 5888 level = UCOL_PSK_NULL;
michael@0 5889 }
michael@0 5890 /* fall through to next level */
michael@0 5891 case UCOL_PSK_QUIN:
michael@0 5892 level = UCOL_PSK_IDENTICAL;
michael@0 5893 /* fall through to next level */
michael@0 5894 case UCOL_PSK_IDENTICAL:
michael@0 5895 if(strength >= UCOL_IDENTICAL) {
michael@0 5896 UChar32 first, second;
michael@0 5897 int32_t bocsuBytesWritten = 0;
michael@0 5898 // We always need to do identical on
michael@0 5899 // the NFD form of the string.
michael@0 5900 if(normIter == NULL) {
michael@0 5901 // we arrived from the level below and
michael@0 5902 // normalization was not turned on.
michael@0 5903 // therefore, we need to make a fresh NFD iterator
michael@0 5904 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
michael@0 5905 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
michael@0 5906 } else if(!doingIdenticalFromStart) {
michael@0 5907 // there is an iterator, but we did some other levels.
michael@0 5908 // therefore, we have a FCD iterator - need to make
michael@0 5909 // a NFD one.
michael@0 5910 // normIter being at the beginning does not guarantee
michael@0 5911 // that the underlying iterator is at the beginning
michael@0 5912 iter->move(iter, 0, UITER_START);
michael@0 5913 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
michael@0 5914 }
michael@0 5915 // At this point we have a NFD iterator that is positioned
michael@0 5916 // in the right place
michael@0 5917 if(U_FAILURE(*status)) {
michael@0 5918 UTRACE_EXIT_STATUS(*status);
michael@0 5919 return 0;
michael@0 5920 }
michael@0 5921 first = uiter_previous32(s.iterator);
michael@0 5922 // maybe we're at the start of the string
michael@0 5923 if(first == U_SENTINEL) {
michael@0 5924 first = 0;
michael@0 5925 } else {
michael@0 5926 uiter_next32(s.iterator);
michael@0 5927 }
michael@0 5928
michael@0 5929 j = 0;
michael@0 5930 for(;;) {
michael@0 5931 if(i == count) {
michael@0 5932 if(j+1 < bocsuBytesWritten) {
michael@0 5933 bocsuBytesUsed = j+1;
michael@0 5934 }
michael@0 5935 goto saveState;
michael@0 5936 }
michael@0 5937
michael@0 5938 // On identical level, we will always save
michael@0 5939 // the state if we reach this point, since
michael@0 5940 // we don't depend on getNextCE for content
michael@0 5941 // all the content is in our buffer and we
michael@0 5942 // already either stored the full buffer OR
michael@0 5943 // otherwise we won't arrive here.
michael@0 5944 newState = s.iterator->getState(s.iterator);
michael@0 5945 if(newState != UITER_NO_STATE) {
michael@0 5946 iterState = newState;
michael@0 5947 cces = 0;
michael@0 5948 }
michael@0 5949
michael@0 5950 uint8_t buff[4];
michael@0 5951 second = uiter_next32(s.iterator);
michael@0 5952 cces++;
michael@0 5953
michael@0 5954 // end condition for identical level
michael@0 5955 if(second == U_SENTINEL) {
michael@0 5956 terminatePSKLevel(level, maxLevel, i, dest);
michael@0 5957 level = UCOL_PSK_NULL;
michael@0 5958 break;
michael@0 5959 }
michael@0 5960 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
michael@0 5961 first = second;
michael@0 5962
michael@0 5963 j = 0;
michael@0 5964 if(bocsuBytesUsed != 0) {
michael@0 5965 while(bocsuBytesUsed-->0) {
michael@0 5966 j++;
michael@0 5967 }
michael@0 5968 }
michael@0 5969
michael@0 5970 while(i < count && j < bocsuBytesWritten) {
michael@0 5971 dest[i++] = buff[j++];
michael@0 5972 }
michael@0 5973 }
michael@0 5974
michael@0 5975 } else {
michael@0 5976 level = UCOL_PSK_NULL;
michael@0 5977 }
michael@0 5978 /* fall through to next level */
michael@0 5979 case UCOL_PSK_NULL:
michael@0 5980 j = i;
michael@0 5981 while(j<count) {
michael@0 5982 dest[j++]=0;
michael@0 5983 }
michael@0 5984 break;
michael@0 5985 default:
michael@0 5986 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 5987 UTRACE_EXIT_STATUS(*status);
michael@0 5988 return 0;
michael@0 5989 }
michael@0 5990
michael@0 5991 saveState:
michael@0 5992 // Now we need to return stuff. First we want to see whether we have
michael@0 5993 // done everything for the current state of iterator.
michael@0 5994 if(byteCountOrFrenchDone
michael@0 5995 || canUpdateState == FALSE
michael@0 5996 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
michael@0 5997 {
michael@0 5998 // Any of above mean that the previous transaction
michael@0 5999 // wasn't finished and that we should store the
michael@0 6000 // previous iterator state.
michael@0 6001 state[0] = iterState;
michael@0 6002 } else {
michael@0 6003 // The transaction is complete. We will continue in the next iteration.
michael@0 6004 state[0] = s.iterator->getState(s.iterator);
michael@0 6005 cces = 0;
michael@0 6006 }
michael@0 6007 // Store the number of bocsu bytes written.
michael@0 6008 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
michael@0 6009 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 6010 }
michael@0 6011 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
michael@0 6012
michael@0 6013 // Next we put in the level of comparison
michael@0 6014 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
michael@0 6015
michael@0 6016 // If we are doing French, we need to store whether we have just finished the French level
michael@0 6017 if(level == UCOL_PSK_SECONDARY && doingFrench) {
michael@0 6018 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
michael@0 6019 } else {
michael@0 6020 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
michael@0 6021 }
michael@0 6022
michael@0 6023 // Was the latest CE shifted
michael@0 6024 if(wasShifted) {
michael@0 6025 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
michael@0 6026 }
michael@0 6027 // Check for cces overflow
michael@0 6028 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
michael@0 6029 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 6030 }
michael@0 6031 // Store cces
michael@0 6032 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
michael@0 6033
michael@0 6034 // Check for French overflow
michael@0 6035 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
michael@0 6036 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 6037 }
michael@0 6038 // Store number of bytes written in the French secondary continuation sequence
michael@0 6039 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
michael@0 6040
michael@0 6041
michael@0 6042 // If we have used normalizing iterator, get rid of it
michael@0 6043 if(normIter != NULL) {
michael@0 6044 unorm_closeIter(normIter);
michael@0 6045 }
michael@0 6046
michael@0 6047 /* To avoid memory leak, free the offset buffer if necessary. */
michael@0 6048 ucol_freeOffsetBuffer(&s);
michael@0 6049
michael@0 6050 // Return number of meaningful sortkey bytes.
michael@0 6051 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
michael@0 6052 dest,i, state[0], state[1]);
michael@0 6053 UTRACE_EXIT_VALUE(i);
michael@0 6054 return i;
michael@0 6055 }
michael@0 6056
michael@0 6057 /**
michael@0 6058 * Produce a bound for a given sortkey and a number of levels.
michael@0 6059 */
michael@0 6060 U_CAPI int32_t U_EXPORT2
michael@0 6061 ucol_getBound(const uint8_t *source,
michael@0 6062 int32_t sourceLength,
michael@0 6063 UColBoundMode boundType,
michael@0 6064 uint32_t noOfLevels,
michael@0 6065 uint8_t *result,
michael@0 6066 int32_t resultLength,
michael@0 6067 UErrorCode *status)
michael@0 6068 {
michael@0 6069 // consistency checks
michael@0 6070 if(status == NULL || U_FAILURE(*status)) {
michael@0 6071 return 0;
michael@0 6072 }
michael@0 6073 if(source == NULL) {
michael@0 6074 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6075 return 0;
michael@0 6076 }
michael@0 6077
michael@0 6078 int32_t sourceIndex = 0;
michael@0 6079 // Scan the string until we skip enough of the key OR reach the end of the key
michael@0 6080 do {
michael@0 6081 sourceIndex++;
michael@0 6082 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
michael@0 6083 noOfLevels--;
michael@0 6084 }
michael@0 6085 } while (noOfLevels > 0
michael@0 6086 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
michael@0 6087
michael@0 6088 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
michael@0 6089 && noOfLevels > 0) {
michael@0 6090 *status = U_SORT_KEY_TOO_SHORT_WARNING;
michael@0 6091 }
michael@0 6092
michael@0 6093
michael@0 6094 // READ ME: this code assumes that the values for boundType
michael@0 6095 // enum will not changes. They are set so that the enum value
michael@0 6096 // corresponds to the number of extra bytes each bound type
michael@0 6097 // needs.
michael@0 6098 if(result != NULL && resultLength >= sourceIndex+boundType) {
michael@0 6099 uprv_memcpy(result, source, sourceIndex);
michael@0 6100 switch(boundType) {
michael@0 6101 // Lower bound just gets terminated. No extra bytes
michael@0 6102 case UCOL_BOUND_LOWER: // = 0
michael@0 6103 break;
michael@0 6104 // Upper bound needs one extra byte
michael@0 6105 case UCOL_BOUND_UPPER: // = 1
michael@0 6106 result[sourceIndex++] = 2;
michael@0 6107 break;
michael@0 6108 // Upper long bound needs two extra bytes
michael@0 6109 case UCOL_BOUND_UPPER_LONG: // = 2
michael@0 6110 result[sourceIndex++] = 0xFF;
michael@0 6111 result[sourceIndex++] = 0xFF;
michael@0 6112 break;
michael@0 6113 default:
michael@0 6114 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6115 return 0;
michael@0 6116 }
michael@0 6117 result[sourceIndex++] = 0;
michael@0 6118
michael@0 6119 return sourceIndex;
michael@0 6120 } else {
michael@0 6121 return sourceIndex+boundType+1;
michael@0 6122 }
michael@0 6123 }
michael@0 6124
michael@0 6125 /****************************************************************************/
michael@0 6126 /* Following are the functions that deal with the properties of a collator */
michael@0 6127 /* there are new APIs and some compatibility APIs */
michael@0 6128 /****************************************************************************/
michael@0 6129
michael@0 6130 static inline void
michael@0 6131 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
michael@0 6132 int32_t *primShift, int32_t *secShift, int32_t *terShift)
michael@0 6133 {
michael@0 6134 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
michael@0 6135 UBool reverseSecondary = FALSE;
michael@0 6136 UBool continuation = isContinuation(CE);
michael@0 6137 if(!continuation) {
michael@0 6138 tertiary = (uint8_t)((CE & coll->tertiaryMask));
michael@0 6139 tertiary ^= coll->caseSwitch;
michael@0 6140 reverseSecondary = TRUE;
michael@0 6141 } else {
michael@0 6142 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
michael@0 6143 tertiary &= UCOL_REMOVE_CASE;
michael@0 6144 reverseSecondary = FALSE;
michael@0 6145 }
michael@0 6146
michael@0 6147 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 6148 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
michael@0 6149 primary1 = (uint8_t)(CE >> 8);
michael@0 6150
michael@0 6151 if(primary1 != 0) {
michael@0 6152 if (coll->leadBytePermutationTable != NULL && !continuation) {
michael@0 6153 primary1 = coll->leadBytePermutationTable[primary1];
michael@0 6154 }
michael@0 6155
michael@0 6156 coll->latinOneCEs[ch] |= (primary1 << *primShift);
michael@0 6157 *primShift -= 8;
michael@0 6158 }
michael@0 6159 if(primary2 != 0) {
michael@0 6160 if(*primShift < 0) {
michael@0 6161 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
michael@0 6162 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
michael@0 6163 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
michael@0 6164 return;
michael@0 6165 }
michael@0 6166 coll->latinOneCEs[ch] |= (primary2 << *primShift);
michael@0 6167 *primShift -= 8;
michael@0 6168 }
michael@0 6169 if(secondary != 0) {
michael@0 6170 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
michael@0 6171 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
michael@0 6172 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
michael@0 6173 } else { // normal case
michael@0 6174 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
michael@0 6175 }
michael@0 6176 *secShift -= 8;
michael@0 6177 }
michael@0 6178 if(tertiary != 0) {
michael@0 6179 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
michael@0 6180 *terShift -= 8;
michael@0 6181 }
michael@0 6182 }
michael@0 6183
michael@0 6184 static inline UBool
michael@0 6185 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
michael@0 6186 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
michael@0 6187 if(newTable == NULL) {
michael@0 6188 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 6189 coll->latinOneFailed = TRUE;
michael@0 6190 return FALSE;
michael@0 6191 }
michael@0 6192 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
michael@0 6193 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
michael@0 6194 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
michael@0 6195 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
michael@0 6196 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
michael@0 6197 coll->latinOneTableLen = size;
michael@0 6198 uprv_free(coll->latinOneCEs);
michael@0 6199 coll->latinOneCEs = newTable;
michael@0 6200 return TRUE;
michael@0 6201 }
michael@0 6202
michael@0 6203 static UBool
michael@0 6204 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
michael@0 6205 UBool result = TRUE;
michael@0 6206 if(coll->latinOneCEs == NULL) {
michael@0 6207 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
michael@0 6208 if(coll->latinOneCEs == NULL) {
michael@0 6209 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 6210 return FALSE;
michael@0 6211 }
michael@0 6212 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
michael@0 6213 }
michael@0 6214 UChar ch = 0;
michael@0 6215 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
michael@0 6216 // Check for null pointer
michael@0 6217 if (U_FAILURE(*status)) {
michael@0 6218 ucol_closeElements(it);
michael@0 6219 return FALSE;
michael@0 6220 }
michael@0 6221 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
michael@0 6222
michael@0 6223 int32_t primShift = 24, secShift = 24, terShift = 24;
michael@0 6224 uint32_t CE = 0;
michael@0 6225 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
michael@0 6226
michael@0 6227 // TODO: make safe if you get more than you wanted...
michael@0 6228 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
michael@0 6229 primShift = 24; secShift = 24; terShift = 24;
michael@0 6230 if(ch < 0x100) {
michael@0 6231 CE = coll->latinOneMapping[ch];
michael@0 6232 } else {
michael@0 6233 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
michael@0 6234 if(CE == UCOL_NOT_FOUND && coll->UCA) {
michael@0 6235 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
michael@0 6236 }
michael@0 6237 }
michael@0 6238 if(CE < UCOL_NOT_FOUND) {
michael@0 6239 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
michael@0 6240 } else {
michael@0 6241 switch (getCETag(CE)) {
michael@0 6242 case EXPANSION_TAG:
michael@0 6243 case DIGIT_TAG:
michael@0 6244 ucol_setText(it, &ch, 1, status);
michael@0 6245 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
michael@0 6246 if(primShift < 0 || secShift < 0 || terShift < 0) {
michael@0 6247 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
michael@0 6248 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
michael@0 6249 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
michael@0 6250 break;
michael@0 6251 }
michael@0 6252 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
michael@0 6253 }
michael@0 6254 break;
michael@0 6255 case CONTRACTION_TAG:
michael@0 6256 // here is the trick
michael@0 6257 // F2 is contraction. We do something very similar to contractions
michael@0 6258 // but have two indices, one in the real contraction table and the
michael@0 6259 // other to where we stuffed things. This hopes that we don't have
michael@0 6260 // many contractions (this should work for latin-1 tables).
michael@0 6261 {
michael@0 6262 if((CE & 0x00FFF000) != 0) {
michael@0 6263 *status = U_UNSUPPORTED_ERROR;
michael@0 6264 goto cleanup_after_failure;
michael@0 6265 }
michael@0 6266
michael@0 6267 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
michael@0 6268
michael@0 6269 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
michael@0 6270
michael@0 6271 coll->latinOneCEs[ch] = CE;
michael@0 6272 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
michael@0 6273 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
michael@0 6274
michael@0 6275 // We're going to jump into contraction table, pick the elements
michael@0 6276 // and use them
michael@0 6277 do {
michael@0 6278 CE = *(coll->contractionCEs +
michael@0 6279 (UCharOffset - coll->contractionIndex));
michael@0 6280 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
michael@0 6281 uint32_t size;
michael@0 6282 uint32_t i; /* general counter */
michael@0 6283 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
michael@0 6284 size = getExpansionCount(CE);
michael@0 6285 //CE = *CEOffset++;
michael@0 6286 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
michael@0 6287 for(i = 0; i<size; i++) {
michael@0 6288 if(primShift < 0 || secShift < 0 || terShift < 0) {
michael@0 6289 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6290 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6291 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6292 break;
michael@0 6293 }
michael@0 6294 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
michael@0 6295 }
michael@0 6296 } else { /* else, we do */
michael@0 6297 while(*CEOffset != 0) {
michael@0 6298 if(primShift < 0 || secShift < 0 || terShift < 0) {
michael@0 6299 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6300 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6301 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6302 break;
michael@0 6303 }
michael@0 6304 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
michael@0 6305 }
michael@0 6306 }
michael@0 6307 contractionOffset++;
michael@0 6308 } else if(CE < UCOL_NOT_FOUND) {
michael@0 6309 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
michael@0 6310 } else {
michael@0 6311 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6312 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6313 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
michael@0 6314 contractionOffset++;
michael@0 6315 }
michael@0 6316 UCharOffset++;
michael@0 6317 primShift = 24; secShift = 24; terShift = 24;
michael@0 6318 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
michael@0 6319 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
michael@0 6320 goto cleanup_after_failure;
michael@0 6321 }
michael@0 6322 }
michael@0 6323 } while(*UCharOffset != 0xFFFF);
michael@0 6324 }
michael@0 6325 break;;
michael@0 6326 case SPEC_PROC_TAG:
michael@0 6327 {
michael@0 6328 // 0xB7 is a precontext character defined in UCA5.1, a special
michael@0 6329 // handle is implemeted in order to save LatinOne table for
michael@0 6330 // most locales.
michael@0 6331 if (ch==0xb7) {
michael@0 6332 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
michael@0 6333 }
michael@0 6334 else {
michael@0 6335 goto cleanup_after_failure;
michael@0 6336 }
michael@0 6337 }
michael@0 6338 break;
michael@0 6339 default:
michael@0 6340 goto cleanup_after_failure;
michael@0 6341 }
michael@0 6342 }
michael@0 6343 }
michael@0 6344 // compact table
michael@0 6345 if(contractionOffset < coll->latinOneTableLen) {
michael@0 6346 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
michael@0 6347 goto cleanup_after_failure;
michael@0 6348 }
michael@0 6349 }
michael@0 6350 ucol_closeElements(it);
michael@0 6351 return result;
michael@0 6352
michael@0 6353 cleanup_after_failure:
michael@0 6354 // status should already be set before arriving here.
michael@0 6355 coll->latinOneFailed = TRUE;
michael@0 6356 ucol_closeElements(it);
michael@0 6357 return FALSE;
michael@0 6358 }
michael@0 6359
michael@0 6360 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
michael@0 6361 if(U_SUCCESS(*status)) {
michael@0 6362 if(coll->caseFirst == UCOL_UPPER_FIRST) {
michael@0 6363 coll->caseSwitch = UCOL_CASE_SWITCH;
michael@0 6364 } else {
michael@0 6365 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
michael@0 6366 }
michael@0 6367
michael@0 6368 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
michael@0 6369 coll->tertiaryMask = UCOL_REMOVE_CASE;
michael@0 6370 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
michael@0 6371 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
michael@0 6372 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
michael@0 6373 coll->tertiaryBottom = UCOL_COMMON_BOT3;
michael@0 6374 } else {
michael@0 6375 coll->tertiaryMask = UCOL_KEEP_CASE;
michael@0 6376 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
michael@0 6377 if(coll->caseFirst == UCOL_UPPER_FIRST) {
michael@0 6378 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
michael@0 6379 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
michael@0 6380 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
michael@0 6381 } else {
michael@0 6382 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
michael@0 6383 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
michael@0 6384 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
michael@0 6385 }
michael@0 6386 }
michael@0 6387
michael@0 6388 /* Set the compression values */
michael@0 6389 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
michael@0 6390 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
michael@0 6391 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
michael@0 6392
michael@0 6393 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
michael@0 6394 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
michael@0 6395 {
michael@0 6396 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
michael@0 6397 } else {
michael@0 6398 coll->sortKeyGen = ucol_calcSortKey;
michael@0 6399 }
michael@0 6400 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
michael@0 6401 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
michael@0 6402 {
michael@0 6403 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
michael@0 6404 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
michael@0 6405 //fprintf(stderr, "F");
michael@0 6406 coll->latinOneUse = TRUE;
michael@0 6407 } else {
michael@0 6408 coll->latinOneUse = FALSE;
michael@0 6409 }
michael@0 6410 if(*status == U_UNSUPPORTED_ERROR) {
michael@0 6411 *status = U_ZERO_ERROR;
michael@0 6412 }
michael@0 6413 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
michael@0 6414 coll->latinOneUse = TRUE;
michael@0 6415 }
michael@0 6416 } else {
michael@0 6417 coll->latinOneUse = FALSE;
michael@0 6418 }
michael@0 6419 }
michael@0 6420 }
michael@0 6421
michael@0 6422 U_CAPI uint32_t U_EXPORT2
michael@0 6423 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
michael@0 6424 if(U_FAILURE(*status) || coll == NULL) {
michael@0 6425 return 0;
michael@0 6426 }
michael@0 6427 if(len == -1) {
michael@0 6428 len = u_strlen(varTop);
michael@0 6429 }
michael@0 6430 if(len == 0) {
michael@0 6431 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6432 return 0;
michael@0 6433 }
michael@0 6434
michael@0 6435 if(coll->delegate!=NULL) {
michael@0 6436 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
michael@0 6437 }
michael@0 6438
michael@0 6439
michael@0 6440 collIterate s;
michael@0 6441 IInit_collIterate(coll, varTop, len, &s, status);
michael@0 6442 if(U_FAILURE(*status)) {
michael@0 6443 return 0;
michael@0 6444 }
michael@0 6445
michael@0 6446 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
michael@0 6447
michael@0 6448 /* here we check if we have consumed all characters */
michael@0 6449 /* you can put in either one character or a contraction */
michael@0 6450 /* you shouldn't put more... */
michael@0 6451 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
michael@0 6452 *status = U_CE_NOT_FOUND_ERROR;
michael@0 6453 return 0;
michael@0 6454 }
michael@0 6455
michael@0 6456 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
michael@0 6457
michael@0 6458 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
michael@0 6459 *status = U_PRIMARY_TOO_LONG_ERROR;
michael@0 6460 return 0;
michael@0 6461 }
michael@0 6462 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
michael@0 6463 coll->variableTopValueisDefault = FALSE;
michael@0 6464 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
michael@0 6465 }
michael@0 6466
michael@0 6467 /* To avoid memory leak, free the offset buffer if necessary. */
michael@0 6468 ucol_freeOffsetBuffer(&s);
michael@0 6469
michael@0 6470 return CE & UCOL_PRIMARYMASK;
michael@0 6471 }
michael@0 6472
michael@0 6473 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
michael@0 6474 if(U_FAILURE(*status) || coll == NULL) {
michael@0 6475 return 0;
michael@0 6476 }
michael@0 6477 if(coll->delegate!=NULL) {
michael@0 6478 return ((const Collator*)coll->delegate)->getVariableTop(*status);
michael@0 6479 }
michael@0 6480 return coll->variableTopValue<<16;
michael@0 6481 }
michael@0 6482
michael@0 6483 U_CAPI void U_EXPORT2
michael@0 6484 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
michael@0 6485 if(U_FAILURE(*status) || coll == NULL) {
michael@0 6486 return;
michael@0 6487 }
michael@0 6488
michael@0 6489 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
michael@0 6490 coll->variableTopValueisDefault = FALSE;
michael@0 6491 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
michael@0 6492 }
michael@0 6493 }
michael@0 6494 /* Attribute setter API */
michael@0 6495 U_CAPI void U_EXPORT2
michael@0 6496 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
michael@0 6497 if(U_FAILURE(*status) || coll == NULL) {
michael@0 6498 return;
michael@0 6499 }
michael@0 6500
michael@0 6501 if(coll->delegate != NULL) {
michael@0 6502 ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
michael@0 6503 return;
michael@0 6504 }
michael@0 6505
michael@0 6506 UColAttributeValue oldFrench = coll->frenchCollation;
michael@0 6507 UColAttributeValue oldCaseFirst = coll->caseFirst;
michael@0 6508 switch(attr) {
michael@0 6509 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
michael@0 6510 if(value == UCOL_ON) {
michael@0 6511 coll->numericCollation = UCOL_ON;
michael@0 6512 coll->numericCollationisDefault = FALSE;
michael@0 6513 } else if (value == UCOL_OFF) {
michael@0 6514 coll->numericCollation = UCOL_OFF;
michael@0 6515 coll->numericCollationisDefault = FALSE;
michael@0 6516 } else if (value == UCOL_DEFAULT) {
michael@0 6517 coll->numericCollationisDefault = TRUE;
michael@0 6518 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
michael@0 6519 } else {
michael@0 6520 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6521 }
michael@0 6522 break;
michael@0 6523 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
michael@0 6524 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
michael@0 6525 // This attribute is an implementation detail of the CLDR Japanese tailoring.
michael@0 6526 // The implementation might change to use a different mechanism
michael@0 6527 // to achieve the same Japanese sort order.
michael@0 6528 // Since ICU 50, this attribute is not settable any more via API functions.
michael@0 6529 } else {
michael@0 6530 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6531 }
michael@0 6532 break;
michael@0 6533 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
michael@0 6534 if(value == UCOL_ON) {
michael@0 6535 coll->frenchCollation = UCOL_ON;
michael@0 6536 coll->frenchCollationisDefault = FALSE;
michael@0 6537 } else if (value == UCOL_OFF) {
michael@0 6538 coll->frenchCollation = UCOL_OFF;
michael@0 6539 coll->frenchCollationisDefault = FALSE;
michael@0 6540 } else if (value == UCOL_DEFAULT) {
michael@0 6541 coll->frenchCollationisDefault = TRUE;
michael@0 6542 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
michael@0 6543 } else {
michael@0 6544 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6545 }
michael@0 6546 break;
michael@0 6547 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
michael@0 6548 if(value == UCOL_SHIFTED) {
michael@0 6549 coll->alternateHandling = UCOL_SHIFTED;
michael@0 6550 coll->alternateHandlingisDefault = FALSE;
michael@0 6551 } else if (value == UCOL_NON_IGNORABLE) {
michael@0 6552 coll->alternateHandling = UCOL_NON_IGNORABLE;
michael@0 6553 coll->alternateHandlingisDefault = FALSE;
michael@0 6554 } else if (value == UCOL_DEFAULT) {
michael@0 6555 coll->alternateHandlingisDefault = TRUE;
michael@0 6556 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
michael@0 6557 } else {
michael@0 6558 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6559 }
michael@0 6560 break;
michael@0 6561 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
michael@0 6562 if(value == UCOL_LOWER_FIRST) {
michael@0 6563 coll->caseFirst = UCOL_LOWER_FIRST;
michael@0 6564 coll->caseFirstisDefault = FALSE;
michael@0 6565 } else if (value == UCOL_UPPER_FIRST) {
michael@0 6566 coll->caseFirst = UCOL_UPPER_FIRST;
michael@0 6567 coll->caseFirstisDefault = FALSE;
michael@0 6568 } else if (value == UCOL_OFF) {
michael@0 6569 coll->caseFirst = UCOL_OFF;
michael@0 6570 coll->caseFirstisDefault = FALSE;
michael@0 6571 } else if (value == UCOL_DEFAULT) {
michael@0 6572 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
michael@0 6573 coll->caseFirstisDefault = TRUE;
michael@0 6574 } else {
michael@0 6575 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6576 }
michael@0 6577 break;
michael@0 6578 case UCOL_CASE_LEVEL: /* do we have an extra case level */
michael@0 6579 if(value == UCOL_ON) {
michael@0 6580 coll->caseLevel = UCOL_ON;
michael@0 6581 coll->caseLevelisDefault = FALSE;
michael@0 6582 } else if (value == UCOL_OFF) {
michael@0 6583 coll->caseLevel = UCOL_OFF;
michael@0 6584 coll->caseLevelisDefault = FALSE;
michael@0 6585 } else if (value == UCOL_DEFAULT) {
michael@0 6586 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
michael@0 6587 coll->caseLevelisDefault = TRUE;
michael@0 6588 } else {
michael@0 6589 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6590 }
michael@0 6591 break;
michael@0 6592 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
michael@0 6593 if(value == UCOL_ON) {
michael@0 6594 coll->normalizationMode = UCOL_ON;
michael@0 6595 coll->normalizationModeisDefault = FALSE;
michael@0 6596 initializeFCD(status);
michael@0 6597 } else if (value == UCOL_OFF) {
michael@0 6598 coll->normalizationMode = UCOL_OFF;
michael@0 6599 coll->normalizationModeisDefault = FALSE;
michael@0 6600 } else if (value == UCOL_DEFAULT) {
michael@0 6601 coll->normalizationModeisDefault = TRUE;
michael@0 6602 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
michael@0 6603 if(coll->normalizationMode == UCOL_ON) {
michael@0 6604 initializeFCD(status);
michael@0 6605 }
michael@0 6606 } else {
michael@0 6607 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6608 }
michael@0 6609 break;
michael@0 6610 case UCOL_STRENGTH: /* attribute for strength */
michael@0 6611 if (value == UCOL_DEFAULT) {
michael@0 6612 coll->strengthisDefault = TRUE;
michael@0 6613 coll->strength = (UColAttributeValue)coll->options->strength;
michael@0 6614 } else if (value <= UCOL_IDENTICAL) {
michael@0 6615 coll->strengthisDefault = FALSE;
michael@0 6616 coll->strength = value;
michael@0 6617 } else {
michael@0 6618 *status = U_ILLEGAL_ARGUMENT_ERROR ;
michael@0 6619 }
michael@0 6620 break;
michael@0 6621 case UCOL_ATTRIBUTE_COUNT:
michael@0 6622 default:
michael@0 6623 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6624 break;
michael@0 6625 }
michael@0 6626 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
michael@0 6627 coll->latinOneRegenTable = TRUE;
michael@0 6628 } else {
michael@0 6629 coll->latinOneRegenTable = FALSE;
michael@0 6630 }
michael@0 6631 ucol_updateInternalState(coll, status);
michael@0 6632 }
michael@0 6633
michael@0 6634 U_CAPI UColAttributeValue U_EXPORT2
michael@0 6635 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
michael@0 6636 if(U_FAILURE(*status) || coll == NULL) {
michael@0 6637 return UCOL_DEFAULT;
michael@0 6638 }
michael@0 6639
michael@0 6640 if(coll->delegate != NULL) {
michael@0 6641 return ((Collator*)coll->delegate)->getAttribute(attr,*status);
michael@0 6642 }
michael@0 6643
michael@0 6644 switch(attr) {
michael@0 6645 case UCOL_NUMERIC_COLLATION:
michael@0 6646 return coll->numericCollation;
michael@0 6647 case UCOL_HIRAGANA_QUATERNARY_MODE:
michael@0 6648 return coll->hiraganaQ;
michael@0 6649 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
michael@0 6650 return coll->frenchCollation;
michael@0 6651 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
michael@0 6652 return coll->alternateHandling;
michael@0 6653 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
michael@0 6654 return coll->caseFirst;
michael@0 6655 case UCOL_CASE_LEVEL: /* do we have an extra case level */
michael@0 6656 return coll->caseLevel;
michael@0 6657 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
michael@0 6658 return coll->normalizationMode;
michael@0 6659 case UCOL_STRENGTH: /* attribute for strength */
michael@0 6660 return coll->strength;
michael@0 6661 case UCOL_ATTRIBUTE_COUNT:
michael@0 6662 default:
michael@0 6663 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6664 break;
michael@0 6665 }
michael@0 6666 return UCOL_DEFAULT;
michael@0 6667 }
michael@0 6668
michael@0 6669 U_CAPI void U_EXPORT2
michael@0 6670 ucol_setStrength( UCollator *coll,
michael@0 6671 UCollationStrength strength)
michael@0 6672 {
michael@0 6673 UErrorCode status = U_ZERO_ERROR;
michael@0 6674 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
michael@0 6675 }
michael@0 6676
michael@0 6677 U_CAPI UCollationStrength U_EXPORT2
michael@0 6678 ucol_getStrength(const UCollator *coll)
michael@0 6679 {
michael@0 6680 UErrorCode status = U_ZERO_ERROR;
michael@0 6681 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
michael@0 6682 }
michael@0 6683
michael@0 6684 U_CAPI int32_t U_EXPORT2
michael@0 6685 ucol_getReorderCodes(const UCollator *coll,
michael@0 6686 int32_t *dest,
michael@0 6687 int32_t destCapacity,
michael@0 6688 UErrorCode *status) {
michael@0 6689 if (U_FAILURE(*status)) {
michael@0 6690 return 0;
michael@0 6691 }
michael@0 6692
michael@0 6693 if(coll->delegate!=NULL) {
michael@0 6694 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
michael@0 6695 }
michael@0 6696
michael@0 6697 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
michael@0 6698 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6699 return 0;
michael@0 6700 }
michael@0 6701
michael@0 6702 #ifdef UCOL_DEBUG
michael@0 6703 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
michael@0 6704 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
michael@0 6705 #endif
michael@0 6706
michael@0 6707 if (coll->reorderCodesLength > destCapacity) {
michael@0 6708 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 6709 return coll->reorderCodesLength;
michael@0 6710 }
michael@0 6711 for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
michael@0 6712 dest[i] = coll->reorderCodes[i];
michael@0 6713 }
michael@0 6714 return coll->reorderCodesLength;
michael@0 6715 }
michael@0 6716
michael@0 6717 U_CAPI void U_EXPORT2
michael@0 6718 ucol_setReorderCodes(UCollator* coll,
michael@0 6719 const int32_t* reorderCodes,
michael@0 6720 int32_t reorderCodesLength,
michael@0 6721 UErrorCode *status) {
michael@0 6722 if (U_FAILURE(*status)) {
michael@0 6723 return;
michael@0 6724 }
michael@0 6725
michael@0 6726 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
michael@0 6727 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6728 return;
michael@0 6729 }
michael@0 6730
michael@0 6731 if(coll->delegate!=NULL) {
michael@0 6732 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
michael@0 6733 return;
michael@0 6734 }
michael@0 6735
michael@0 6736 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
michael@0 6737 uprv_free(coll->reorderCodes);
michael@0 6738 }
michael@0 6739 coll->reorderCodes = NULL;
michael@0 6740 coll->freeReorderCodesOnClose = FALSE;
michael@0 6741 coll->reorderCodesLength = 0;
michael@0 6742 if (reorderCodesLength == 0) {
michael@0 6743 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
michael@0 6744 uprv_free(coll->leadBytePermutationTable);
michael@0 6745 }
michael@0 6746 coll->leadBytePermutationTable = NULL;
michael@0 6747 coll->freeLeadBytePermutationTableOnClose = FALSE;
michael@0 6748 return;
michael@0 6749 }
michael@0 6750 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
michael@0 6751 if (coll->reorderCodes == NULL) {
michael@0 6752 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 6753 return;
michael@0 6754 }
michael@0 6755 coll->freeReorderCodesOnClose = TRUE;
michael@0 6756 for (int32_t i = 0; i < reorderCodesLength; i++) {
michael@0 6757 coll->reorderCodes[i] = reorderCodes[i];
michael@0 6758 }
michael@0 6759 coll->reorderCodesLength = reorderCodesLength;
michael@0 6760 ucol_buildPermutationTable(coll, status);
michael@0 6761 }
michael@0 6762
michael@0 6763 U_CAPI int32_t U_EXPORT2
michael@0 6764 ucol_getEquivalentReorderCodes(int32_t reorderCode,
michael@0 6765 int32_t* dest,
michael@0 6766 int32_t destCapacity,
michael@0 6767 UErrorCode *pErrorCode) {
michael@0 6768 bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
michael@0 6769 uint16_t leadBytes[256];
michael@0 6770 int leadBytesCount;
michael@0 6771 int leadByteIndex;
michael@0 6772 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
michael@0 6773 int reorderCodesForLeadByteCount;
michael@0 6774 int reorderCodeIndex;
michael@0 6775
michael@0 6776 int32_t equivalentCodesCount = 0;
michael@0 6777 int setIndex;
michael@0 6778
michael@0 6779 if (U_FAILURE(*pErrorCode)) {
michael@0 6780 return 0;
michael@0 6781 }
michael@0 6782
michael@0 6783 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
michael@0 6784 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 6785 return 0;
michael@0 6786 }
michael@0 6787
michael@0 6788 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
michael@0 6789
michael@0 6790 const UCollator* uca = ucol_initUCA(pErrorCode);
michael@0 6791 if (U_FAILURE(*pErrorCode)) {
michael@0 6792 return 0;
michael@0 6793 }
michael@0 6794 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
michael@0 6795 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
michael@0 6796 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
michael@0 6797 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
michael@0 6798 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
michael@0 6799 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
michael@0 6800 }
michael@0 6801 }
michael@0 6802
michael@0 6803 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
michael@0 6804 if (equivalentCodesSet[setIndex] == true) {
michael@0 6805 equivalentCodesCount++;
michael@0 6806 }
michael@0 6807 }
michael@0 6808
michael@0 6809 if (destCapacity == 0) {
michael@0 6810 return equivalentCodesCount;
michael@0 6811 }
michael@0 6812
michael@0 6813 equivalentCodesCount = 0;
michael@0 6814 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
michael@0 6815 if (equivalentCodesSet[setIndex] == true) {
michael@0 6816 dest[equivalentCodesCount++] = setIndex;
michael@0 6817 if (equivalentCodesCount >= destCapacity) {
michael@0 6818 break;
michael@0 6819 }
michael@0 6820 }
michael@0 6821 }
michael@0 6822 return equivalentCodesCount;
michael@0 6823 }
michael@0 6824
michael@0 6825
michael@0 6826 /****************************************************************************/
michael@0 6827 /* Following are misc functions */
michael@0 6828 /* there are new APIs and some compatibility APIs */
michael@0 6829 /****************************************************************************/
michael@0 6830
michael@0 6831 U_CAPI void U_EXPORT2
michael@0 6832 ucol_getVersion(const UCollator* coll,
michael@0 6833 UVersionInfo versionInfo)
michael@0 6834 {
michael@0 6835 if(coll->delegate!=NULL) {
michael@0 6836 ((const Collator*)coll->delegate)->getVersion(versionInfo);
michael@0 6837 return;
michael@0 6838 }
michael@0 6839 /* RunTime version */
michael@0 6840 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
michael@0 6841 /* Builder version*/
michael@0 6842 uint8_t bdVersion = coll->image->version[0];
michael@0 6843
michael@0 6844 /* Charset Version. Need to get the version from cnv files
michael@0 6845 * makeconv should populate cnv files with version and
michael@0 6846 * an api has to be provided in ucnv.h to obtain this version
michael@0 6847 */
michael@0 6848 uint8_t csVersion = 0;
michael@0 6849
michael@0 6850 /* combine the version info */
michael@0 6851 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
michael@0 6852
michael@0 6853 /* Tailoring rules */
michael@0 6854 versionInfo[0] = (uint8_t)(cmbVersion>>8);
michael@0 6855 versionInfo[1] = (uint8_t)cmbVersion;
michael@0 6856 versionInfo[2] = coll->image->version[1];
michael@0 6857 if(coll->UCA) {
michael@0 6858 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
michael@0 6859 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
michael@0 6860 } else {
michael@0 6861 versionInfo[3] = 0;
michael@0 6862 }
michael@0 6863 }
michael@0 6864
michael@0 6865
michael@0 6866 /* This internal API checks whether a character is tailored or not */
michael@0 6867 U_CAPI UBool U_EXPORT2
michael@0 6868 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
michael@0 6869 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
michael@0 6870 return FALSE;
michael@0 6871 }
michael@0 6872
michael@0 6873 uint32_t CE = UCOL_NOT_FOUND;
michael@0 6874 const UChar *ContractionStart = NULL;
michael@0 6875 if(u < 0x100) { /* latin-1 */
michael@0 6876 CE = coll->latinOneMapping[u];
michael@0 6877 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
michael@0 6878 return FALSE;
michael@0 6879 }
michael@0 6880 } else { /* regular */
michael@0 6881 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
michael@0 6882 }
michael@0 6883
michael@0 6884 if(isContraction(CE)) {
michael@0 6885 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
michael@0 6886 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
michael@0 6887 }
michael@0 6888
michael@0 6889 return (UBool)(CE != UCOL_NOT_FOUND);
michael@0 6890 }
michael@0 6891
michael@0 6892
michael@0 6893 /****************************************************************************/
michael@0 6894 /* Following are the string compare functions */
michael@0 6895 /* */
michael@0 6896 /****************************************************************************/
michael@0 6897
michael@0 6898
michael@0 6899 /* ucol_checkIdent internal function. Does byte level string compare. */
michael@0 6900 /* Used by strcoll if strength == identical and strings */
michael@0 6901 /* are otherwise equal. */
michael@0 6902 /* */
michael@0 6903 /* Comparison must be done on NFD normalized strings. */
michael@0 6904 /* FCD is not good enough. */
michael@0 6905
/*
 * Parameters:
 *   sColl, tColl - collation iterators for the source and target text. Only
 *                  sColl->flags decides whether the UCharIterator branch or
 *                  the string branch is taken.
 *   normalize    - string branch only: TRUE normalizes both strings to NFD
 *                  before comparing; FALSE compares the raw strings. The
 *                  iterator branch always compares through NFD iterators.
 *   status       - error code. It is reset to U_ZERO_ERROR before the strings
 *                  are normalized.
 * Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER according to the sign of the
 * comparison; UCOL_LESS is also returned when normalization fails.
 */
michael@0 6906 static
michael@0 6907 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
michael@0 6908 {
michael@0 6909 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
michael@0 6910 // of same type, but that doesn't really mean that it will stay that way.
michael@0 6911 int32_t comparison;
michael@0 6912
michael@0 6913 if (sColl->flags & UCOL_USE_ITERATOR) {
michael@0 6914 // The division for the array length may truncate the array size to
michael@0 6915 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
michael@0 6916 // for all platforms anyway.
michael@0 6917 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 6918 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 6919 UNormIterator *sNIt = NULL, *tNIt = NULL;
// The normalizing iterators are opened on the stack buffers declared above.
michael@0 6920 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
michael@0 6921 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
// Rewind both input iterators so the comparison covers the whole text.
michael@0 6922 sColl->iterator->move(sColl->iterator, 0, UITER_START);
michael@0 6923 tColl->iterator->move(tColl->iterator, 0, UITER_START);
michael@0 6924 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
michael@0 6925 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
// NOTE(review): status is not examined between the open/set calls and the
// comparison - confirm that the callees tolerate a failed open.
michael@0 6926 comparison = u_strCompareIter(sIt, tIt, TRUE);
michael@0 6927 unorm_closeIter(sNIt);
michael@0 6928 unorm_closeIter(tNIt);
michael@0 6929 } else {
// A length of -1 is passed on when the iterator carries no explicit length
// (UCOL_ITER_HASLEN not set).
michael@0 6930 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
michael@0 6931 const UChar *sBuf = sColl->string;
michael@0 6932 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
michael@0 6933 const UChar *tBuf = tColl->string;
michael@0 6934
michael@0 6935 if (normalize) {
// Any error recorded earlier is discarded here; only failures of the two
// normalize() calls below are reported.
michael@0 6936 *status = U_ZERO_ERROR;
michael@0 6937 // Note: We could use Normalizer::compare() or similar, but for short strings
michael@0 6938 // which may not be in FCD it might be faster to just NFD them.
michael@0 6939 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
michael@0 6940 // NFD'ing immediately might be faster for long strings,
michael@0 6941 // but string comparison is usually done on relatively short strings.
// The normalized text is written into each iterator's own writableBuffer.
michael@0 6942 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
michael@0 6943 sColl->writableBuffer,
michael@0 6944 *status);
michael@0 6945 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
michael@0 6946 tColl->writableBuffer,
michael@0 6947 *status);
michael@0 6948 if(U_FAILURE(*status)) {
michael@0 6949 return UCOL_LESS;
michael@0 6950 }
michael@0 6951 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
michael@0 6952 } else {
michael@0 6953 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
michael@0 6954 }
michael@0 6955 }
michael@0 6956
// Map the integer comparison result onto the three collation results.
michael@0 6957 if (comparison < 0) {
michael@0 6958 return UCOL_LESS;
michael@0 6959 } else if (comparison == 0) {
michael@0 6960 return UCOL_EQUAL;
michael@0 6961 } else /* comparison > 0 */ {
michael@0 6962 return UCOL_GREATER;
michael@0 6963 }
michael@0 6964 }
michael@0 6965
michael@0 6966 /* CEBuf - A struct and some inline functions to handle the saving */
michael@0 6967 /* of CEs in a buffer within ucol_strcoll */
michael@0 6968
/* Number of CEs held by the array embedded in each ucol_CEBuf. */
michael@0 6969 #define UCOL_CEBUF_SIZE 512
/*
 * Growable buffer of 32-bit collation elements. It starts out using the
 * embedded localArray and is moved to heap storage by ucol_CEBuf_Expand()
 * when UCOL_CEBUF_PUT() finds it full.
 */
michael@0 6970 typedef struct ucol_CEBuf {
// Start of the storage currently in use (localArray or a heap block).
michael@0 6971 uint32_t *buf;
// One past the last slot of the current storage.
michael@0 6972 uint32_t *endp;
// Next slot to be written.
michael@0 6973 uint32_t *pos;
// Embedded storage used until the first expansion.
michael@0 6974 uint32_t localArray[UCOL_CEBUF_SIZE];
michael@0 6975 } ucol_CEBuf;
michael@0 6976
michael@0 6977
michael@0 6978 static
michael@0 6979 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
michael@0 6980 (b)->buf = (b)->pos = (b)->localArray;
michael@0 6981 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
michael@0 6982 }
michael@0 6983
michael@0 6984 static
michael@0 6985 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
michael@0 6986 uint32_t oldSize;
michael@0 6987 uint32_t newSize;
michael@0 6988 uint32_t *newBuf;
michael@0 6989
michael@0 6990 ci->flags |= UCOL_ITER_ALLOCATED;
michael@0 6991 oldSize = (uint32_t)(b->pos - b->buf);
michael@0 6992 newSize = oldSize * 2;
michael@0 6993 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
michael@0 6994 if(newBuf == NULL) {
michael@0 6995 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 6996 }
michael@0 6997 else {
michael@0 6998 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
michael@0 6999 if (b->buf != b->localArray) {
michael@0 7000 uprv_free(b->buf);
michael@0 7001 }
michael@0 7002 b->buf = newBuf;
michael@0 7003 b->endp = b->buf + newSize;
michael@0 7004 b->pos = b->buf + oldSize;
michael@0 7005 }
michael@0 7006 }
michael@0 7007
michael@0 7008 static
michael@0 7009 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
michael@0 7010 if (b->pos == b->endp) {
michael@0 7011 ucol_CEBuf_Expand(b, ci, status);
michael@0 7012 }
michael@0 7013 if (U_SUCCESS(*status)) {
michael@0 7014 *(b)->pos++ = ce;
michael@0 7015 }
michael@0 7016 }
michael@0 7017
michael@0 7018 /* This is a trick string compare function that goes in and uses sortkeys to compare */
michael@0 7019 /* It is used when compare gets in trouble and needs to bail out */
/*
 * Builds a sort key for each text with ucol_getSortKey() and compares the two
 * keys with uprv_strcmp().
 *
 * Parameters:
 *   sColl, tColl - collation iterators for source and target; the collator is
 *                  taken from sColl, and sColl->flags decides whether the text
 *                  is read through the UCharIterators or from the string
 *                  pointers.
 *   status       - set to U_MEMORY_ALLOCATION_ERROR when a key buffer cannot
 *                  be allocated; the comparison is then skipped and result
 *                  keeps its initial UCOL_EQUAL value.
 * Returns UCOL_LESS, UCOL_GREATER or UCOL_EQUAL according to the sign of result.
 */
michael@0 7020 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
michael@0 7021 collIterate *tColl,
michael@0 7022 UErrorCode *status)
michael@0 7023 {
// Keys are first written into these stack buffers; a heap buffer replaces
// them only when a key is longer than UCOL_MAX_BUFFER.
michael@0 7024 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
michael@0 7025 uint8_t *sourceKeyP = sourceKey;
michael@0 7026 uint8_t *targetKeyP = targetKey;
michael@0 7027 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
michael@0 7028 const UCollator *coll = sColl->coll;
michael@0 7029 const UChar *source = NULL;
michael@0 7030 const UChar *target = NULL;
michael@0 7031 int32_t result = UCOL_EQUAL;
michael@0 7032 UnicodeString sourceString, targetString;
michael@0 7033 int32_t sourceLength;
michael@0 7034 int32_t targetLength;
michael@0 7035
michael@0 7036 if(sColl->flags & UCOL_USE_ITERATOR) {
// Rewind both iterators and drain them into UnicodeStrings so that the text
// can be handed to ucol_getSortKey() as plain buffers.
michael@0 7037 sColl->iterator->move(sColl->iterator, 0, UITER_START);
michael@0 7038 tColl->iterator->move(tColl->iterator, 0, UITER_START);
michael@0 7039 UChar32 c;
// next() is read until it returns a negative value.
michael@0 7040 while((c=sColl->iterator->next(sColl->iterator))>=0) {
michael@0 7041 sourceString.append((UChar)c);
michael@0 7042 }
michael@0 7043 while((c=tColl->iterator->next(tColl->iterator))>=0) {
michael@0 7044 targetString.append((UChar)c);
michael@0 7045 }
michael@0 7046 source = sourceString.getBuffer();
michael@0 7047 sourceLength = sourceString.length();
michael@0 7048 target = targetString.getBuffer();
michael@0 7049 targetLength = targetString.length();
michael@0 7050 } else { // no iterators
// A length of -1 is passed on when the iterator carries no explicit length
// (UCOL_ITER_HASLEN not set).
michael@0 7051 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
michael@0 7052 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
michael@0 7053 source = sColl->string;
michael@0 7054 target = tColl->string;
michael@0 7055 }
michael@0 7056
michael@0 7057
michael@0 7058
// First attempt uses the stack buffer; if the reported key length exceeds it,
// a buffer of exactly that length is allocated and the key is computed again.
// NOTE(review): a return value of 0 from ucol_getSortKey() is not checked here;
// presumably the stack buffer could then be compared uninitialized - confirm
// against the ucol_getSortKey() contract.
michael@0 7059 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
michael@0 7060 if(sourceKeyLen > UCOL_MAX_BUFFER) {
michael@0 7061 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
michael@0 7062 if(sourceKeyP == NULL) {
michael@0 7063 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 7064 goto cleanup_and_do_compare;
michael@0 7065 }
michael@0 7066 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
michael@0 7067 }
michael@0 7068
// Same two-step procedure for the target key.
michael@0 7069 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
michael@0 7070 if(targetKeyLen > UCOL_MAX_BUFFER) {
michael@0 7071 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
michael@0 7072 if(targetKeyP == NULL) {
michael@0 7073 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 7074 goto cleanup_and_do_compare;
michael@0 7075 }
michael@0 7076 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
michael@0 7077 }
michael@0 7078
// The keys are compared as NUL-terminated byte strings.
michael@0 7079 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
michael@0 7080
// Shared exit: reached after the comparison and from both allocation failures.
// Only heap buffers are freed; the stack buffers are recognized by address.
michael@0 7081 cleanup_and_do_compare:
michael@0 7082 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
michael@0 7083 uprv_free(sourceKeyP);
michael@0 7084 }
michael@0 7085
michael@0 7086 if(targetKeyP != NULL && targetKeyP != targetKey) {
michael@0 7087 uprv_free(targetKeyP);
michael@0 7088 }
michael@0 7089
michael@0 7090 if(result<0) {
michael@0 7091 return UCOL_LESS;
michael@0 7092 } else if(result>0) {
michael@0 7093 return UCOL_GREATER;
michael@0 7094 } else {
michael@0 7095 return UCOL_EQUAL;
michael@0 7096 }
michael@0 7097 }
michael@0 7098
michael@0 7099
michael@0 7100 static UCollationResult
michael@0 7101 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
michael@0 7102 {
michael@0 7103 U_ALIGN_CODE(16);
michael@0 7104
michael@0 7105 const UCollator *coll = sColl->coll;
michael@0 7106
michael@0 7107
michael@0 7108 // setting up the collator parameters
michael@0 7109 UColAttributeValue strength = coll->strength;
michael@0 7110 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
michael@0 7111
michael@0 7112 UBool checkSecTer = initialCheckSecTer;
michael@0 7113 UBool checkTertiary = (strength >= UCOL_TERTIARY);
michael@0 7114 UBool checkQuad = (strength >= UCOL_QUATERNARY);
michael@0 7115 UBool checkIdent = (strength == UCOL_IDENTICAL);
michael@0 7116 UBool checkCase = (coll->caseLevel == UCOL_ON);
michael@0 7117 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
michael@0 7118 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
michael@0 7119 UBool qShifted = shifted && checkQuad;
michael@0 7120 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
michael@0 7121
michael@0 7122 if(doHiragana && shifted) {
michael@0 7123 return (ucol_compareUsingSortKeys(sColl, tColl, status));
michael@0 7124 }
michael@0 7125 uint8_t caseSwitch = coll->caseSwitch;
michael@0 7126 uint8_t tertiaryMask = coll->tertiaryMask;
michael@0 7127
michael@0 7128 // This is the lowest primary value that will not be ignored if shifted
michael@0 7129 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
michael@0 7130
michael@0 7131 UCollationResult result = UCOL_EQUAL;
michael@0 7132 UCollationResult hirResult = UCOL_EQUAL;
michael@0 7133
michael@0 7134 // Preparing the CE buffers. They will be filled during the primary phase
michael@0 7135 ucol_CEBuf sCEs;
michael@0 7136 ucol_CEBuf tCEs;
michael@0 7137 UCOL_INIT_CEBUF(&sCEs);
michael@0 7138 UCOL_INIT_CEBUF(&tCEs);
michael@0 7139
michael@0 7140 uint32_t secS = 0, secT = 0;
michael@0 7141 uint32_t sOrder=0, tOrder=0;
michael@0 7142
michael@0 7143 // Non shifted primary processing is quite simple
michael@0 7144 if(!shifted) {
michael@0 7145 for(;;) {
michael@0 7146 // We fetch CEs until we hit a non ignorable primary or end.
michael@0 7147 uint32_t sPrimary;
michael@0 7148 do {
michael@0 7149 // We get the next CE
michael@0 7150 sOrder = ucol_IGetNextCE(coll, sColl, status);
michael@0 7151 // Stuff it in the buffer
michael@0 7152 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7153 // And keep just the primary part.
michael@0 7154 sPrimary = sOrder & UCOL_PRIMARYMASK;
michael@0 7155 } while(sPrimary == 0);
michael@0 7156
michael@0 7157 // see the comments on the above block
michael@0 7158 uint32_t tPrimary;
michael@0 7159 do {
michael@0 7160 tOrder = ucol_IGetNextCE(coll, tColl, status);
michael@0 7161 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7162 tPrimary = tOrder & UCOL_PRIMARYMASK;
michael@0 7163 } while(tPrimary == 0);
michael@0 7164
michael@0 7165 // if both primaries are the same
michael@0 7166 if(sPrimary == tPrimary) {
michael@0 7167 // and there are no more CEs, we advance to the next level
michael@0 7168 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
michael@0 7169 break;
michael@0 7170 }
michael@0 7171 if(doHiragana && hirResult == UCOL_EQUAL) {
michael@0 7172 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
michael@0 7173 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
michael@0 7174 ? UCOL_LESS:UCOL_GREATER;
michael@0 7175 }
michael@0 7176 }
michael@0 7177 } else {
michael@0 7178 // only need to check one for continuation
michael@0 7179 // if one is then the other must be or the preceding CE would be a prefix of the other
michael@0 7180 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
michael@0 7181 sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
michael@0 7182 tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
michael@0 7183 }
michael@0 7184 // if two primaries are different, we are done
michael@0 7185 result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER;
michael@0 7186 goto commonReturn;
michael@0 7187 }
michael@0 7188 } // no primary difference... do the rest from the buffers
michael@0 7189 } else { // shifted - do a slightly more complicated processing :)
michael@0 7190 for(;;) {
michael@0 7191 UBool sInShifted = FALSE;
michael@0 7192 UBool tInShifted = FALSE;
michael@0 7193 // This version of code can be refactored. However, it seems easier to understand this way.
michael@0 7194 // Source loop. Same as the target loop.
michael@0 7195 for(;;) {
michael@0 7196 sOrder = ucol_IGetNextCE(coll, sColl, status);
michael@0 7197 if(sOrder == UCOL_NO_MORE_CES) {
michael@0 7198 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7199 break;
michael@0 7200 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
michael@0 7201 /* UCA amendment - ignore ignorables that follow shifted code points */
michael@0 7202 continue;
michael@0 7203 } else if(isContinuation(sOrder)) {
michael@0 7204 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
michael@0 7205 if(sInShifted) {
michael@0 7206 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
michael@0 7207 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7208 continue;
michael@0 7209 } else {
michael@0 7210 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7211 break;
michael@0 7212 }
michael@0 7213 } else { /* Just lower level values */
michael@0 7214 if(sInShifted) {
michael@0 7215 continue;
michael@0 7216 } else {
michael@0 7217 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7218 continue;
michael@0 7219 }
michael@0 7220 }
michael@0 7221 } else { /* regular */
michael@0 7222 if(coll->leadBytePermutationTable != NULL){
michael@0 7223 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
michael@0 7224 }
michael@0 7225 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
michael@0 7226 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7227 break;
michael@0 7228 } else {
michael@0 7229 if((sOrder & UCOL_PRIMARYMASK) > 0) {
michael@0 7230 sInShifted = TRUE;
michael@0 7231 sOrder &= UCOL_PRIMARYMASK;
michael@0 7232 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7233 continue;
michael@0 7234 } else {
michael@0 7235 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
michael@0 7236 sInShifted = FALSE;
michael@0 7237 continue;
michael@0 7238 }
michael@0 7239 }
michael@0 7240 }
michael@0 7241 }
michael@0 7242 sOrder &= UCOL_PRIMARYMASK;
michael@0 7243 sInShifted = FALSE;
michael@0 7244
michael@0 7245 for(;;) {
michael@0 7246 tOrder = ucol_IGetNextCE(coll, tColl, status);
michael@0 7247 if(tOrder == UCOL_NO_MORE_CES) {
michael@0 7248 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7249 break;
michael@0 7250 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
michael@0 7251 /* UCA amendment - ignore ignorables that follow shifted code points */
michael@0 7252 continue;
michael@0 7253 } else if(isContinuation(tOrder)) {
michael@0 7254 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
michael@0 7255 if(tInShifted) {
michael@0 7256 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
michael@0 7257 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7258 continue;
michael@0 7259 } else {
michael@0 7260 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7261 break;
michael@0 7262 }
michael@0 7263 } else { /* Just lower level values */
michael@0 7264 if(tInShifted) {
michael@0 7265 continue;
michael@0 7266 } else {
michael@0 7267 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7268 continue;
michael@0 7269 }
michael@0 7270 }
michael@0 7271 } else { /* regular */
michael@0 7272 if(coll->leadBytePermutationTable != NULL){
michael@0 7273 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
michael@0 7274 }
michael@0 7275 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
michael@0 7276 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7277 break;
michael@0 7278 } else {
michael@0 7279 if((tOrder & UCOL_PRIMARYMASK) > 0) {
michael@0 7280 tInShifted = TRUE;
michael@0 7281 tOrder &= UCOL_PRIMARYMASK;
michael@0 7282 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7283 continue;
michael@0 7284 } else {
michael@0 7285 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
michael@0 7286 tInShifted = FALSE;
michael@0 7287 continue;
michael@0 7288 }
michael@0 7289 }
michael@0 7290 }
michael@0 7291 }
michael@0 7292 tOrder &= UCOL_PRIMARYMASK;
michael@0 7293 tInShifted = FALSE;
michael@0 7294
michael@0 7295 if(sOrder == tOrder) {
michael@0 7296 /*
michael@0 7297 if(doHiragana && hirResult == UCOL_EQUAL) {
michael@0 7298 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
michael@0 7299 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
michael@0 7300 ? UCOL_LESS:UCOL_GREATER;
michael@0 7301 }
michael@0 7302 }
michael@0 7303 */
michael@0 7304 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
michael@0 7305 break;
michael@0 7306 } else {
michael@0 7307 sOrder = 0;
michael@0 7308 tOrder = 0;
michael@0 7309 continue;
michael@0 7310 }
michael@0 7311 } else {
michael@0 7312 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
michael@0 7313 goto commonReturn;
michael@0 7314 }
michael@0 7315 } /* no primary difference... do the rest from the buffers */
michael@0 7316 }
michael@0 7317
michael@0 7318 /* now, we're gonna reexamine collected CEs */
michael@0 7319 uint32_t *sCE;
michael@0 7320 uint32_t *tCE;
michael@0 7321
michael@0 7322 /* This is the secondary level of comparison */
michael@0 7323 if(checkSecTer) {
michael@0 7324 if(!isFrenchSec) { /* normal */
michael@0 7325 sCE = sCEs.buf;
michael@0 7326 tCE = tCEs.buf;
michael@0 7327 for(;;) {
michael@0 7328 while (secS == 0) {
michael@0 7329 secS = *(sCE++) & UCOL_SECONDARYMASK;
michael@0 7330 }
michael@0 7331
michael@0 7332 while(secT == 0) {
michael@0 7333 secT = *(tCE++) & UCOL_SECONDARYMASK;
michael@0 7334 }
michael@0 7335
michael@0 7336 if(secS == secT) {
michael@0 7337 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
michael@0 7338 break;
michael@0 7339 } else {
michael@0 7340 secS = 0; secT = 0;
michael@0 7341 continue;
michael@0 7342 }
michael@0 7343 } else {
michael@0 7344 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
michael@0 7345 goto commonReturn;
michael@0 7346 }
michael@0 7347 }
michael@0 7348 } else { /* do the French */
michael@0 7349 uint32_t *sCESave = NULL;
michael@0 7350 uint32_t *tCESave = NULL;
michael@0 7351 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
michael@0 7352 tCE = tCEs.pos-2;
michael@0 7353 for(;;) {
michael@0 7354 while (secS == 0 && sCE >= sCEs.buf) {
michael@0 7355 if(sCESave == NULL) {
michael@0 7356 secS = *(sCE--);
michael@0 7357 if(isContinuation(secS)) {
michael@0 7358 while(isContinuation(secS = *(sCE--)))
michael@0 7359 ;
michael@0 7360 /* after this, secS has the start of continuation, and sCEs points before that */
michael@0 7361 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
michael@0 7362 sCE+=2; /* need to point to the first continuation CP */
michael@0 7363 /* However, now you can just continue doing stuff */
michael@0 7364 }
michael@0 7365 } else {
michael@0 7366 secS = *(sCE++);
michael@0 7367 if(!isContinuation(secS)) { /* This means we have finished with this cont */
michael@0 7368 sCE = sCESave; /* reset the pointer to before continuation */
michael@0 7369 sCESave = NULL;
michael@0 7370 secS = 0; /* Fetch a fresh CE before the continuation sequence. */
michael@0 7371 continue;
michael@0 7372 }
michael@0 7373 }
michael@0 7374 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
michael@0 7375 }
michael@0 7376
michael@0 7377 while(secT == 0 && tCE >= tCEs.buf) {
michael@0 7378 if(tCESave == NULL) {
michael@0 7379 secT = *(tCE--);
michael@0 7380 if(isContinuation(secT)) {
michael@0 7381 while(isContinuation(secT = *(tCE--)))
michael@0 7382 ;
michael@0 7383 /* after this, secS has the start of continuation, and sCEs points before that */
michael@0 7384 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
michael@0 7385 tCE+=2; /* need to point to the first continuation CP */
michael@0 7386 /* However, now you can just continue doing stuff */
michael@0 7387 }
michael@0 7388 } else {
michael@0 7389 secT = *(tCE++);
michael@0 7390 if(!isContinuation(secT)) { /* This means we have finished with this cont */
michael@0 7391 tCE = tCESave; /* reset the pointer to before continuation */
michael@0 7392 tCESave = NULL;
michael@0 7393 secT = 0; /* Fetch a fresh CE before the continuation sequence. */
michael@0 7394 continue;
michael@0 7395 }
michael@0 7396 }
michael@0 7397 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
michael@0 7398 }
michael@0 7399
michael@0 7400 if(secS == secT) {
michael@0 7401 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
michael@0 7402 break;
michael@0 7403 } else {
michael@0 7404 secS = 0; secT = 0;
michael@0 7405 continue;
michael@0 7406 }
michael@0 7407 } else {
michael@0 7408 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
michael@0 7409 goto commonReturn;
michael@0 7410 }
michael@0 7411 }
michael@0 7412 }
michael@0 7413 }
michael@0 7414
michael@0 7415 /* doing the case bit */
michael@0 7416 if(checkCase) {
michael@0 7417 sCE = sCEs.buf;
michael@0 7418 tCE = tCEs.buf;
michael@0 7419 for(;;) {
michael@0 7420 while((secS & UCOL_REMOVE_CASE) == 0) {
michael@0 7421 if(!isContinuation(*sCE++)) {
michael@0 7422 secS =*(sCE-1);
michael@0 7423 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
michael@0 7424 // primary ignorables should not be considered on the case level when the strength is primary
michael@0 7425 // otherwise, the CEs stop being well-formed
michael@0 7426 secS &= UCOL_TERT_CASE_MASK;
michael@0 7427 secS ^= caseSwitch;
michael@0 7428 } else {
michael@0 7429 secS = 0;
michael@0 7430 }
michael@0 7431 } else {
michael@0 7432 secS = 0;
michael@0 7433 }
michael@0 7434 }
michael@0 7435
michael@0 7436 while((secT & UCOL_REMOVE_CASE) == 0) {
michael@0 7437 if(!isContinuation(*tCE++)) {
michael@0 7438 secT = *(tCE-1);
michael@0 7439 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
michael@0 7440 // primary ignorables should not be considered on the case level when the strength is primary
michael@0 7441 // otherwise, the CEs stop being well-formed
michael@0 7442 secT &= UCOL_TERT_CASE_MASK;
michael@0 7443 secT ^= caseSwitch;
michael@0 7444 } else {
michael@0 7445 secT = 0;
michael@0 7446 }
michael@0 7447 } else {
michael@0 7448 secT = 0;
michael@0 7449 }
michael@0 7450 }
michael@0 7451
michael@0 7452 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
michael@0 7453 result = UCOL_LESS;
michael@0 7454 goto commonReturn;
michael@0 7455 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
michael@0 7456 result = UCOL_GREATER;
michael@0 7457 goto commonReturn;
michael@0 7458 }
michael@0 7459
michael@0 7460 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
michael@0 7461 break;
michael@0 7462 } else {
michael@0 7463 secS = 0;
michael@0 7464 secT = 0;
michael@0 7465 }
michael@0 7466 }
michael@0 7467 }
michael@0 7468
michael@0 7469 /* Tertiary level */
michael@0 7470 if(checkTertiary) {
michael@0 7471 secS = 0;
michael@0 7472 secT = 0;
michael@0 7473 sCE = sCEs.buf;
michael@0 7474 tCE = tCEs.buf;
michael@0 7475 for(;;) {
michael@0 7476 while((secS & UCOL_REMOVE_CASE) == 0) {
michael@0 7477 sOrder = *sCE++;
michael@0 7478 secS = sOrder & tertiaryMask;
michael@0 7479 if(!isContinuation(sOrder)) {
michael@0 7480 secS ^= caseSwitch;
michael@0 7481 } else {
michael@0 7482 secS &= UCOL_REMOVE_CASE;
michael@0 7483 }
michael@0 7484 }
michael@0 7485
michael@0 7486 while((secT & UCOL_REMOVE_CASE) == 0) {
michael@0 7487 tOrder = *tCE++;
michael@0 7488 secT = tOrder & tertiaryMask;
michael@0 7489 if(!isContinuation(tOrder)) {
michael@0 7490 secT ^= caseSwitch;
michael@0 7491 } else {
michael@0 7492 secT &= UCOL_REMOVE_CASE;
michael@0 7493 }
michael@0 7494 }
michael@0 7495
michael@0 7496 if(secS == secT) {
michael@0 7497 if((secS & UCOL_REMOVE_CASE) == 1) {
michael@0 7498 break;
michael@0 7499 } else {
michael@0 7500 secS = 0; secT = 0;
michael@0 7501 continue;
michael@0 7502 }
michael@0 7503 } else {
michael@0 7504 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
michael@0 7505 goto commonReturn;
michael@0 7506 }
michael@0 7507 }
michael@0 7508 }
michael@0 7509
michael@0 7510
michael@0 7511 if(qShifted /*checkQuad*/) {
michael@0 7512 UBool sInShifted = TRUE;
michael@0 7513 UBool tInShifted = TRUE;
michael@0 7514 secS = 0;
michael@0 7515 secT = 0;
michael@0 7516 sCE = sCEs.buf;
michael@0 7517 tCE = tCEs.buf;
michael@0 7518 for(;;) {
michael@0 7519 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
michael@0 7520 secS = *(sCE++);
michael@0 7521 if(isContinuation(secS)) {
michael@0 7522 if(!sInShifted) {
michael@0 7523 continue;
michael@0 7524 }
michael@0 7525 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
michael@0 7526 secS = UCOL_PRIMARYMASK;
michael@0 7527 sInShifted = FALSE;
michael@0 7528 } else {
michael@0 7529 sInShifted = TRUE;
michael@0 7530 }
michael@0 7531 }
michael@0 7532 secS &= UCOL_PRIMARYMASK;
michael@0 7533
michael@0 7534
michael@0 7535 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
michael@0 7536 secT = *(tCE++);
michael@0 7537 if(isContinuation(secT)) {
michael@0 7538 if(!tInShifted) {
michael@0 7539 continue;
michael@0 7540 }
michael@0 7541 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
michael@0 7542 secT = UCOL_PRIMARYMASK;
michael@0 7543 tInShifted = FALSE;
michael@0 7544 } else {
michael@0 7545 tInShifted = TRUE;
michael@0 7546 }
michael@0 7547 }
michael@0 7548 secT &= UCOL_PRIMARYMASK;
michael@0 7549
michael@0 7550 if(secS == secT) {
michael@0 7551 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
michael@0 7552 break;
michael@0 7553 } else {
michael@0 7554 secS = 0; secT = 0;
michael@0 7555 continue;
michael@0 7556 }
michael@0 7557 } else {
michael@0 7558 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
michael@0 7559 goto commonReturn;
michael@0 7560 }
michael@0 7561 }
michael@0 7562 } else if(doHiragana && hirResult != UCOL_EQUAL) {
michael@0 7563 // If we're fine on quaternaries, we might be different
michael@0 7564 // on Hiragana. This, however, might fail us in shifted.
michael@0 7565 result = hirResult;
michael@0 7566 goto commonReturn;
michael@0 7567 }
michael@0 7568
michael@0 7569 /* For IDENTICAL comparisons, we use a bitwise character comparison */
michael@0 7570 /* as a tiebreaker if all else is equal. */
michael@0 7571 /* Getting here should be quite rare - strings are not identical - */
michael@0 7572 /* that is checked first, but compared == through all other checks. */
michael@0 7573 if(checkIdent)
michael@0 7574 {
michael@0 7575 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
michael@0 7576 result = ucol_checkIdent(sColl, tColl, TRUE, status);
michael@0 7577 }
michael@0 7578
michael@0 7579 commonReturn:
michael@0 7580 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
michael@0 7581 if (sCEs.buf != sCEs.localArray ) {
michael@0 7582 uprv_free(sCEs.buf);
michael@0 7583 }
michael@0 7584 if (tCEs.buf != tCEs.localArray ) {
michael@0 7585 uprv_free(tCEs.buf);
michael@0 7586 }
michael@0 7587 }
michael@0 7588
michael@0 7589 return result;
michael@0 7590 }
michael@0 7591
michael@0 7592 static UCollationResult
michael@0 7593 ucol_strcollRegular(const UCollator *coll,
michael@0 7594 const UChar *source, int32_t sourceLength,
michael@0 7595 const UChar *target, int32_t targetLength,
michael@0 7596 UErrorCode *status) {
michael@0 7597 collIterate sColl, tColl;
michael@0 7598 // Preparing the context objects for iterating over strings
michael@0 7599 IInit_collIterate(coll, source, sourceLength, &sColl, status);
michael@0 7600 IInit_collIterate(coll, target, targetLength, &tColl, status);
michael@0 7601 if(U_FAILURE(*status)) {
michael@0 7602 return UCOL_LESS;
michael@0 7603 }
michael@0 7604 return ucol_strcollRegular(&sColl, &tColl, status);
michael@0 7605 }
michael@0 7606
michael@0 7607 static inline uint32_t
michael@0 7608 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
michael@0 7609 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
michael@0 7610 {
michael@0 7611 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
michael@0 7612 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
michael@0 7613 int32_t offset = 1;
michael@0 7614 UChar schar = 0, tchar = 0;
michael@0 7615
michael@0 7616 for(;;) {
michael@0 7617 if(len == -1) {
michael@0 7618 if(s[*index] == 0) { // end of string
michael@0 7619 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 7620 } else {
michael@0 7621 schar = s[*index];
michael@0 7622 }
michael@0 7623 } else {
michael@0 7624 if(*index == len) {
michael@0 7625 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 7626 } else {
michael@0 7627 schar = s[*index];
michael@0 7628 }
michael@0 7629 }
michael@0 7630
michael@0 7631 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
michael@0 7632 offset++;
michael@0 7633 }
michael@0 7634
michael@0 7635 if (schar == tchar) {
michael@0 7636 (*index)++;
michael@0 7637 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
michael@0 7638 }
michael@0 7639 else
michael@0 7640 {
michael@0 7641 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
michael@0 7642 return UCOL_BAIL_OUT_CE;
michael@0 7643 }
michael@0 7644 // skip completely ignorables
michael@0 7645 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
michael@0 7646 if(isZeroCE == 0) { // we have to ignore completely ignorables
michael@0 7647 (*index)++;
michael@0 7648 continue;
michael@0 7649 }
michael@0 7650
michael@0 7651 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 7652 }
michael@0 7653 }
michael@0 7654 }
michael@0 7655
michael@0 7656
michael@0 7657 /**
michael@0 7658 * This is a fast strcoll, geared towards text in Latin-1.
michael@0 7659 * It supports contractions of size two, French secondaries
michael@0 7660 * and case switching. You can use it with strengths primary
michael@0 7661 * to tertiary. It does not support shifted and case level.
michael@0 7662 * It relies on the table build by setupLatin1Table. If it
michael@0 7663 * doesn't understand something, it will go to the regular
michael@0 7664 * strcoll.
michael@0 7665 */
michael@0 7666 static UCollationResult
michael@0 7667 ucol_strcollUseLatin1( const UCollator *coll,
michael@0 7668 const UChar *source,
michael@0 7669 int32_t sLen,
michael@0 7670 const UChar *target,
michael@0 7671 int32_t tLen,
michael@0 7672 UErrorCode *status)
michael@0 7673 {
michael@0 7674 U_ALIGN_CODE(16);
michael@0 7675 int32_t strength = coll->strength;
michael@0 7676
michael@0 7677 int32_t sIndex = 0, tIndex = 0;
michael@0 7678 UChar sChar = 0, tChar = 0;
michael@0 7679 uint32_t sOrder=0, tOrder=0;
michael@0 7680
michael@0 7681 UBool endOfSource = FALSE;
michael@0 7682
michael@0 7683 uint32_t *elements = coll->latinOneCEs;
michael@0 7684
michael@0 7685 UBool haveContractions = FALSE; // if we have contractions in our string
michael@0 7686 // we cannot do French secondary
michael@0 7687
michael@0 7688 // Do the primary level
michael@0 7689 for(;;) {
michael@0 7690 while(sOrder==0) { // this loop skips primary ignorables
michael@0 7691 // sOrder=getNextlatinOneCE(source);
michael@0 7692 if(sLen==-1) { // handling zero terminated strings
michael@0 7693 sChar=source[sIndex++];
michael@0 7694 if(sChar==0) {
michael@0 7695 endOfSource = TRUE;
michael@0 7696 break;
michael@0 7697 }
michael@0 7698 } else { // handling strings with known length
michael@0 7699 if(sIndex==sLen) {
michael@0 7700 endOfSource = TRUE;
michael@0 7701 break;
michael@0 7702 }
michael@0 7703 sChar=source[sIndex++];
michael@0 7704 }
michael@0 7705 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
michael@0 7706 //fprintf(stderr, "R");
michael@0 7707 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
michael@0 7708 }
michael@0 7709 sOrder = elements[sChar];
michael@0 7710 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
michael@0 7711 // specials can basically be either contractions or bail-out signs. If we get anything
michael@0 7712 // else, we'll bail out anywasy
michael@0 7713 if(getCETag(sOrder) == CONTRACTION_TAG) {
michael@0 7714 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
michael@0 7715 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
michael@0 7716 // However, if there are contractions in the table, but we always use just one char,
michael@0 7717 // we might be able to do French. This should be checked out.
michael@0 7718 }
michael@0 7719 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
michael@0 7720 //fprintf(stderr, "S");
michael@0 7721 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
michael@0 7722 }
michael@0 7723 }
michael@0 7724 }
michael@0 7725
michael@0 7726 while(tOrder==0) { // this loop skips primary ignorables
michael@0 7727 // tOrder=getNextlatinOneCE(target);
michael@0 7728 if(tLen==-1) { // handling zero terminated strings
michael@0 7729 tChar=target[tIndex++];
michael@0 7730 if(tChar==0) {
michael@0 7731 if(endOfSource) { // this is different than source loop,
michael@0 7732 // as we already know that source loop is done here,
michael@0 7733 // so we can either finish the primary loop if both
michael@0 7734 // strings are done or anounce the result if only
michael@0 7735 // target is done. Same below.
michael@0 7736 goto endOfPrimLoop;
michael@0 7737 } else {
michael@0 7738 return UCOL_GREATER;
michael@0 7739 }
michael@0 7740 }
michael@0 7741 } else { // handling strings with known length
michael@0 7742 if(tIndex==tLen) {
michael@0 7743 if(endOfSource) {
michael@0 7744 goto endOfPrimLoop;
michael@0 7745 } else {
michael@0 7746 return UCOL_GREATER;
michael@0 7747 }
michael@0 7748 }
michael@0 7749 tChar=target[tIndex++];
michael@0 7750 }
michael@0 7751 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
michael@0 7752 //fprintf(stderr, "R");
michael@0 7753 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
michael@0 7754 }
michael@0 7755 tOrder = elements[tChar];
michael@0 7756 if(tOrder >= UCOL_NOT_FOUND) {
michael@0 7757 // Handling specials, see the comments for source
michael@0 7758 if(getCETag(tOrder) == CONTRACTION_TAG) {
michael@0 7759 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
michael@0 7760 haveContractions = TRUE;
michael@0 7761 }
michael@0 7762 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
michael@0 7763 //fprintf(stderr, "S");
michael@0 7764 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
michael@0 7765 }
michael@0 7766 }
michael@0 7767 }
michael@0 7768 if(endOfSource) { // source is finished, but target is not, say the result.
michael@0 7769 return UCOL_LESS;
michael@0 7770 }
michael@0 7771
michael@0 7772 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
michael@0 7773 sOrder = 0; tOrder = 0;
michael@0 7774 continue;
michael@0 7775 } else {
michael@0 7776 // compare current top bytes
michael@0 7777 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 7778 // top bytes differ, return difference
michael@0 7779 if(sOrder < tOrder) {
michael@0 7780 return UCOL_LESS;
michael@0 7781 } else if(sOrder > tOrder) {
michael@0 7782 return UCOL_GREATER;
michael@0 7783 }
michael@0 7784 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
michael@0 7785 // since we must return enum value
michael@0 7786 }
michael@0 7787
michael@0 7788 // top bytes match, continue with following bytes
michael@0 7789 sOrder<<=8;
michael@0 7790 tOrder<<=8;
michael@0 7791 }
michael@0 7792 }
michael@0 7793
michael@0 7794 endOfPrimLoop:
michael@0 7795 // after primary loop, we definitely know the sizes of strings,
michael@0 7796 // so we set it and use simpler loop for secondaries and tertiaries
michael@0 7797 sLen = sIndex; tLen = tIndex;
michael@0 7798 if(strength >= UCOL_SECONDARY) {
michael@0 7799 // adjust the table beggining
michael@0 7800 elements += coll->latinOneTableLen;
michael@0 7801 endOfSource = FALSE;
michael@0 7802
michael@0 7803 if(coll->frenchCollation == UCOL_OFF) { // non French
michael@0 7804 // This loop is a simplified copy of primary loop
michael@0 7805 // at this point we know that whole strings are latin-1, so we don't
michael@0 7806 // check for that. We also know that we only have contractions as
michael@0 7807 // specials.
michael@0 7808 sIndex = 0; tIndex = 0;
michael@0 7809 for(;;) {
michael@0 7810 while(sOrder==0) {
michael@0 7811 if(sIndex==sLen) {
michael@0 7812 endOfSource = TRUE;
michael@0 7813 break;
michael@0 7814 }
michael@0 7815 sChar=source[sIndex++];
michael@0 7816 sOrder = elements[sChar];
michael@0 7817 if(sOrder > UCOL_NOT_FOUND) {
michael@0 7818 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
michael@0 7819 }
michael@0 7820 }
michael@0 7821
michael@0 7822 while(tOrder==0) {
michael@0 7823 if(tIndex==tLen) {
michael@0 7824 if(endOfSource) {
michael@0 7825 goto endOfSecLoop;
michael@0 7826 } else {
michael@0 7827 return UCOL_GREATER;
michael@0 7828 }
michael@0 7829 }
michael@0 7830 tChar=target[tIndex++];
michael@0 7831 tOrder = elements[tChar];
michael@0 7832 if(tOrder > UCOL_NOT_FOUND) {
michael@0 7833 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
michael@0 7834 }
michael@0 7835 }
michael@0 7836 if(endOfSource) {
michael@0 7837 return UCOL_LESS;
michael@0 7838 }
michael@0 7839
michael@0 7840 if(sOrder == tOrder) {
michael@0 7841 sOrder = 0; tOrder = 0;
michael@0 7842 continue;
michael@0 7843 } else {
michael@0 7844 // see primary loop for comments on this
michael@0 7845 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 7846 if(sOrder < tOrder) {
michael@0 7847 return UCOL_LESS;
michael@0 7848 } else if(sOrder > tOrder) {
michael@0 7849 return UCOL_GREATER;
michael@0 7850 }
michael@0 7851 }
michael@0 7852 sOrder<<=8;
michael@0 7853 tOrder<<=8;
michael@0 7854 }
michael@0 7855 }
michael@0 7856 } else { // French
michael@0 7857 if(haveContractions) { // if we have contractions, we have to bail out
michael@0 7858 // since we don't really know how to handle them here
michael@0 7859 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
michael@0 7860 }
michael@0 7861 // For French, we go backwards
michael@0 7862 sIndex = sLen; tIndex = tLen;
michael@0 7863 for(;;) {
michael@0 7864 while(sOrder==0) {
michael@0 7865 if(sIndex==0) {
michael@0 7866 endOfSource = TRUE;
michael@0 7867 break;
michael@0 7868 }
michael@0 7869 sChar=source[--sIndex];
michael@0 7870 sOrder = elements[sChar];
michael@0 7871 // don't even look for contractions
michael@0 7872 }
michael@0 7873
michael@0 7874 while(tOrder==0) {
michael@0 7875 if(tIndex==0) {
michael@0 7876 if(endOfSource) {
michael@0 7877 goto endOfSecLoop;
michael@0 7878 } else {
michael@0 7879 return UCOL_GREATER;
michael@0 7880 }
michael@0 7881 }
michael@0 7882 tChar=target[--tIndex];
michael@0 7883 tOrder = elements[tChar];
michael@0 7884 // don't even look for contractions
michael@0 7885 }
michael@0 7886 if(endOfSource) {
michael@0 7887 return UCOL_LESS;
michael@0 7888 }
michael@0 7889
michael@0 7890 if(sOrder == tOrder) {
michael@0 7891 sOrder = 0; tOrder = 0;
michael@0 7892 continue;
michael@0 7893 } else {
michael@0 7894 // see the primary loop for comments
michael@0 7895 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 7896 if(sOrder < tOrder) {
michael@0 7897 return UCOL_LESS;
michael@0 7898 } else if(sOrder > tOrder) {
michael@0 7899 return UCOL_GREATER;
michael@0 7900 }
michael@0 7901 }
michael@0 7902 sOrder<<=8;
michael@0 7903 tOrder<<=8;
michael@0 7904 }
michael@0 7905 }
michael@0 7906 }
michael@0 7907 }
michael@0 7908
michael@0 7909 endOfSecLoop:
michael@0 7910 if(strength >= UCOL_TERTIARY) {
michael@0 7911 // tertiary loop is the same as secondary (except no French)
michael@0 7912 elements += coll->latinOneTableLen;
michael@0 7913 sIndex = 0; tIndex = 0;
michael@0 7914 endOfSource = FALSE;
michael@0 7915 for(;;) {
michael@0 7916 while(sOrder==0) {
michael@0 7917 if(sIndex==sLen) {
michael@0 7918 endOfSource = TRUE;
michael@0 7919 break;
michael@0 7920 }
michael@0 7921 sChar=source[sIndex++];
michael@0 7922 sOrder = elements[sChar];
michael@0 7923 if(sOrder > UCOL_NOT_FOUND) {
michael@0 7924 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
michael@0 7925 }
michael@0 7926 }
michael@0 7927 while(tOrder==0) {
michael@0 7928 if(tIndex==tLen) {
michael@0 7929 if(endOfSource) {
michael@0 7930 return UCOL_EQUAL; // if both strings are at the end, they are equal
michael@0 7931 } else {
michael@0 7932 return UCOL_GREATER;
michael@0 7933 }
michael@0 7934 }
michael@0 7935 tChar=target[tIndex++];
michael@0 7936 tOrder = elements[tChar];
michael@0 7937 if(tOrder > UCOL_NOT_FOUND) {
michael@0 7938 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
michael@0 7939 }
michael@0 7940 }
michael@0 7941 if(endOfSource) {
michael@0 7942 return UCOL_LESS;
michael@0 7943 }
michael@0 7944 if(sOrder == tOrder) {
michael@0 7945 sOrder = 0; tOrder = 0;
michael@0 7946 continue;
michael@0 7947 } else {
michael@0 7948 if(((sOrder^tOrder)&0xff000000)!=0) {
michael@0 7949 if(sOrder < tOrder) {
michael@0 7950 return UCOL_LESS;
michael@0 7951 } else if(sOrder > tOrder) {
michael@0 7952 return UCOL_GREATER;
michael@0 7953 }
michael@0 7954 }
michael@0 7955 sOrder<<=8;
michael@0 7956 tOrder<<=8;
michael@0 7957 }
michael@0 7958 }
michael@0 7959 }
michael@0 7960 return UCOL_EQUAL;
michael@0 7961 }
michael@0 7962
michael@0 7963 /*
michael@0 7964 Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
michael@0 7965 null terminated input string takes extra amount of CPU cycles.
michael@0 7966 */
michael@0 7967 static UCollationResult
michael@0 7968 ucol_strcollRegularUTF8(
michael@0 7969 const UCollator *coll,
michael@0 7970 const char *source,
michael@0 7971 int32_t sourceLength,
michael@0 7972 const char *target,
michael@0 7973 int32_t targetLength,
michael@0 7974 UErrorCode *status)
michael@0 7975 {
michael@0 7976 UCharIterator src;
michael@0 7977 UCharIterator tgt;
michael@0 7978
michael@0 7979 uiter_setUTF8(&src, source, sourceLength);
michael@0 7980 uiter_setUTF8(&tgt, target, targetLength);
michael@0 7981
michael@0 7982 // Preparing the context objects for iterating over strings
michael@0 7983 collIterate sColl, tColl;
michael@0 7984 IInit_collIterate(coll, NULL, -1, &sColl, status);
michael@0 7985 IInit_collIterate(coll, NULL, -1, &tColl, status);
michael@0 7986 if(U_FAILURE(*status)) {
michael@0 7987 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
michael@0 7988 return UCOL_EQUAL;
michael@0 7989 }
michael@0 7990 // The division for the array length may truncate the array size to
michael@0 7991 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
michael@0 7992 // for all platforms anyway.
michael@0 7993 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 7994 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 7995 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
michael@0 7996
michael@0 7997 sColl.iterator = &src;
michael@0 7998 sColl.flags |= UCOL_USE_ITERATOR;
michael@0 7999 tColl.flags |= UCOL_USE_ITERATOR;
michael@0 8000 tColl.iterator = &tgt;
michael@0 8001
michael@0 8002 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
michael@0 8003 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
michael@0 8004 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
michael@0 8005 sColl.flags &= ~UCOL_ITER_NORM;
michael@0 8006
michael@0 8007 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
michael@0 8008 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
michael@0 8009 tColl.flags &= ~UCOL_ITER_NORM;
michael@0 8010 }
michael@0 8011
michael@0 8012 return ucol_strcollRegular(&sColl, &tColl, status);
michael@0 8013 }
michael@0 8014
michael@0 8015 static inline uint32_t
michael@0 8016 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
michael@0 8017 uint32_t CE, const char *s, int32_t *index, int32_t len)
michael@0 8018 {
michael@0 8019 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
michael@0 8020 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
michael@0 8021 int32_t offset = 1;
michael@0 8022 UChar32 schar = 0, tchar = 0;
michael@0 8023
michael@0 8024 for(;;) {
michael@0 8025 if (*index == len) {
michael@0 8026 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 8027 }
michael@0 8028 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
michael@0 8029 if (len < 0 && schar == 0) {
michael@0 8030 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 8031 }
michael@0 8032
michael@0 8033 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
michael@0 8034 offset++;
michael@0 8035 }
michael@0 8036
michael@0 8037 if (schar == tchar) {
michael@0 8038 U8_FWD_1(s, *index, len);
michael@0 8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
michael@0 8040 }
michael@0 8041 else
michael@0 8042 {
michael@0 8043 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
michael@0 8044 return UCOL_BAIL_OUT_CE;
michael@0 8045 }
michael@0 8046 // skip completely ignorables
michael@0 8047 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
michael@0 8048 if(isZeroCE == 0) { // we have to ignore completely ignorables
michael@0 8049 U8_FWD_1(s, *index, len);
michael@0 8050 continue;
michael@0 8051 }
michael@0 8052
michael@0 8053 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
michael@0 8054 }
michael@0 8055 }
michael@0 8056 }
michael@0 8057
// Latin-1 fast path for comparing two UTF-8 strings.
//
// Compares source and target level by level using the coll->latinOneCEs table:
// first the primary level, then the secondary level if coll->strength is at
// least UCOL_SECONDARY (walking backwards when coll->frenchCollation is not
// UCOL_OFF), then the tertiary level if coll->strength is at least
// UCOL_TERTIARY. Each level uses the next latinOneTableLen-sized slice of the
// table. Within a level, a CE is compared byte by byte from the top byte down
// (the remaining bytes are shifted up by 8 after each equal top byte).
//
// Falls back to ucol_strcollRegularUTF8() when a code point above 0xFF is met,
// when a table entry is a special that does not resolve to an ordinary CE, or
// when French secondary ordering is requested and a contraction was seen.
//
// sLen / tLen are byte lengths; a negative value means the string is
// NUL-terminated. Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER.
michael@0 8058 static inline UCollationResult
michael@0 8059 ucol_strcollUseLatin1UTF8(
michael@0 8060 const UCollator *coll,
michael@0 8061 const char *source,
michael@0 8062 int32_t sLen,
michael@0 8063 const char *target,
michael@0 8064 int32_t tLen,
michael@0 8065 UErrorCode *status)
michael@0 8066 {
michael@0 8067 U_ALIGN_CODE(16);
michael@0 8068 int32_t strength = coll->strength;
michael@0 8069
michael@0 8070 int32_t sIndex = 0, tIndex = 0;
michael@0 8071 UChar32 sChar = 0, tChar = 0;
michael@0 8072 uint32_t sOrder=0, tOrder=0;
michael@0 8073
michael@0 8074 UBool endOfSource = FALSE;
michael@0 8075
michael@0 8076 uint32_t *elements = coll->latinOneCEs;
michael@0 8077
michael@0 8078 UBool haveContractions = FALSE; // if we have contractions in our string
michael@0 8079 // we cannot do French secondary
michael@0 8080
michael@0 8081 // Do the primary level
michael@0 8082 for(;;) {
michael@0 8083 while(sOrder==0) { // this loop skips primary ignorables
michael@0 8084 // sOrder=getNextlatinOneCE(source);
michael@0 8085 if (sIndex == sLen) {
michael@0 8086 endOfSource = TRUE;
michael@0 8087 break;
michael@0 8088 }
michael@0 8089 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
// NUL-terminated input: remember the now-known length for later use
michael@0 8090 if (sLen < 0 && sChar == 0) {
michael@0 8091 endOfSource = TRUE;
michael@0 8092 sLen = sIndex;
michael@0 8093 break;
michael@0 8094 }
michael@0 8095 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
michael@0 8096 //fprintf(stderr, "R");
michael@0 8097 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
michael@0 8098 }
michael@0 8099 sOrder = elements[sChar];
michael@0 8100 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
michael@0 8101 // specials can basically be either contractions or bail-out signs. If we get anything
michael@0 8102 // else, we'll bail out anyway
michael@0 8103 if(getCETag(sOrder) == CONTRACTION_TAG) {
michael@0 8104 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
michael@0 8105 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
michael@0 8106 // However, if there are contractions in the table, but we always use just one char,
michael@0 8107 // we might be able to do French. This should be checked out.
michael@0 8108 }
michael@0 8109 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
michael@0 8110 //fprintf(stderr, "S");
michael@0 8111 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
michael@0 8112 }
michael@0 8113 }
michael@0 8114 }
michael@0 8115
michael@0 8116 while(tOrder==0) { // this loop skips primary ignorables
michael@0 8117 // tOrder=getNextlatinOneCE(target);
michael@0 8118 if (tIndex == tLen) {
michael@0 8119 if(endOfSource) {
michael@0 8120 goto endOfPrimLoopU8;
michael@0 8121 } else {
michael@0 8122 return UCOL_GREATER;
michael@0 8123 }
michael@0 8124 }
michael@0 8125 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
michael@0 8126 if (tLen < 0 && tChar == 0) {
michael@0 8127 if(endOfSource) {
michael@0 8128 tLen = tIndex;
michael@0 8129 goto endOfPrimLoopU8;
michael@0 8130 } else {
michael@0 8131 return UCOL_GREATER;
michael@0 8132 }
michael@0 8133 }
michael@0 8134 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
michael@0 8135 //fprintf(stderr, "R");
michael@0 8136 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
michael@0 8137 }
michael@0 8138 tOrder = elements[tChar];
michael@0 8139 if(tOrder >= UCOL_NOT_FOUND) {
michael@0 8140 // Handling specials, see the comments for source
michael@0 8141 if(getCETag(tOrder) == CONTRACTION_TAG) {
michael@0 8142 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
michael@0 8143 haveContractions = TRUE;
michael@0 8144 }
michael@0 8145 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
michael@0 8146 //fprintf(stderr, "S");
michael@0 8147 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
michael@0 8148 }
michael@0 8149 }
michael@0 8150 }
michael@0 8151 if(endOfSource) { // source is finished, but target is not, say the result.
michael@0 8152 return UCOL_LESS;
michael@0 8153 }
michael@0 8154
michael@0 8155 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
michael@0 8156 sOrder = 0; tOrder = 0;
michael@0 8157 continue;
michael@0 8158 } else {
michael@0 8159 // compare current top bytes
michael@0 8160 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 8161 // top bytes differ, return difference
michael@0 8162 if(sOrder < tOrder) {
michael@0 8163 return UCOL_LESS;
michael@0 8164 } else if(sOrder > tOrder) {
michael@0 8165 return UCOL_GREATER;
michael@0 8166 }
michael@0 8167 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
michael@0 8168 // since we must return enum value
michael@0 8169 }
michael@0 8170
michael@0 8171 // top bytes match, continue with following bytes
michael@0 8172 sOrder<<=8;
michael@0 8173 tOrder<<=8;
michael@0 8174 }
michael@0 8175 }
michael@0 8176
michael@0 8177 endOfPrimLoopU8:
michael@0 8178 // after primary loop, we definitely know the sizes of strings,
michael@0 8179 // so we set it and use simpler loop for secondaries and tertiaries
michael@0 8180 sLen = sIndex; tLen = tIndex;
michael@0 8181 if(strength >= UCOL_SECONDARY) {
michael@0 8182 // adjust the table beginning
michael@0 8183 elements += coll->latinOneTableLen;
michael@0 8184 endOfSource = FALSE;
michael@0 8185
michael@0 8186 if(coll->frenchCollation == UCOL_OFF) { // non French
michael@0 8187 // This loop is a simplified copy of primary loop
michael@0 8188 // at this point we know that whole strings are latin-1, so we don't
michael@0 8189 // check for that. We also know that we only have contractions as
michael@0 8190 // specials.
michael@0 8191 sIndex = 0; tIndex = 0;
michael@0 8192 for(;;) {
michael@0 8193 while(sOrder==0) {
michael@0 8194 if(sIndex==sLen) {
michael@0 8195 endOfSource = TRUE;
michael@0 8196 break;
michael@0 8197 }
michael@0 8198 U_ASSERT(sLen >= 0);
michael@0 8199 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
michael@0 8200 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
michael@0 8201 sOrder = elements[sChar];
michael@0 8202 if(sOrder > UCOL_NOT_FOUND) {
michael@0 8203 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
michael@0 8204 }
michael@0 8205 }
michael@0 8206
michael@0 8207 while(tOrder==0) {
michael@0 8208 if(tIndex==tLen) {
michael@0 8209 if(endOfSource) {
michael@0 8210 goto endOfSecLoopU8;
michael@0 8211 } else {
michael@0 8212 return UCOL_GREATER;
michael@0 8213 }
michael@0 8214 }
michael@0 8215 U_ASSERT(tLen >= 0);
michael@0 8216 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
michael@0 8217 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
michael@0 8218 tOrder = elements[tChar];
michael@0 8219 if(tOrder > UCOL_NOT_FOUND) {
michael@0 8220 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
michael@0 8221 }
michael@0 8222 }
michael@0 8223 if(endOfSource) {
michael@0 8224 return UCOL_LESS;
michael@0 8225 }
michael@0 8226
michael@0 8227 if(sOrder == tOrder) {
michael@0 8228 sOrder = 0; tOrder = 0;
michael@0 8229 continue;
michael@0 8230 } else {
michael@0 8231 // see primary loop for comments on this
michael@0 8232 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 8233 if(sOrder < tOrder) {
michael@0 8234 return UCOL_LESS;
michael@0 8235 } else if(sOrder > tOrder) {
michael@0 8236 return UCOL_GREATER;
michael@0 8237 }
michael@0 8238 }
michael@0 8239 sOrder<<=8;
michael@0 8240 tOrder<<=8;
michael@0 8241 }
michael@0 8242 }
michael@0 8243 } else { // French
michael@0 8244 if(haveContractions) { // if we have contractions, we have to bail out
michael@0 8245 // since we don't really know how to handle them here
michael@0 8246 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
michael@0 8247 }
michael@0 8248 // For French, we go backwards
michael@0 8249 sIndex = sLen; tIndex = tLen;
michael@0 8250 for(;;) {
michael@0 8251 while(sOrder==0) {
michael@0 8252 if(sIndex==0) {
michael@0 8253 endOfSource = TRUE;
michael@0 8254 break;
michael@0 8255 }
michael@0 8256 U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
michael@0 8257 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
michael@0 8258 sOrder = elements[sChar];
michael@0 8259 // don't even look for contractions
michael@0 8260 }
michael@0 8261
michael@0 8262 while(tOrder==0) {
michael@0 8263 if(tIndex==0) {
michael@0 8264 if(endOfSource) {
michael@0 8265 goto endOfSecLoopU8;
michael@0 8266 } else {
michael@0 8267 return UCOL_GREATER;
michael@0 8268 }
michael@0 8269 }
michael@0 8270 U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
michael@0 8271 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
michael@0 8272 tOrder = elements[tChar];
michael@0 8273 // don't even look for contractions
michael@0 8274 }
michael@0 8275 if(endOfSource) {
michael@0 8276 return UCOL_LESS;
michael@0 8277 }
michael@0 8278
michael@0 8279 if(sOrder == tOrder) {
michael@0 8280 sOrder = 0; tOrder = 0;
michael@0 8281 continue;
michael@0 8282 } else {
michael@0 8283 // see the primary loop for comments
michael@0 8284 if(((sOrder^tOrder)&0xFF000000)!=0) {
michael@0 8285 if(sOrder < tOrder) {
michael@0 8286 return UCOL_LESS;
michael@0 8287 } else if(sOrder > tOrder) {
michael@0 8288 return UCOL_GREATER;
michael@0 8289 }
michael@0 8290 }
michael@0 8291 sOrder<<=8;
michael@0 8292 tOrder<<=8;
michael@0 8293 }
michael@0 8294 }
michael@0 8295 }
michael@0 8296 }
michael@0 8297
michael@0 8298 endOfSecLoopU8:
michael@0 8299 if(strength >= UCOL_TERTIARY) {
michael@0 8300 // tertiary loop is the same as secondary (except no French)
michael@0 8301 elements += coll->latinOneTableLen;
michael@0 8302 sIndex = 0; tIndex = 0;
michael@0 8303 endOfSource = FALSE;
michael@0 8304 for(;;) {
michael@0 8305 while(sOrder==0) {
michael@0 8306 if(sIndex==sLen) {
michael@0 8307 endOfSource = TRUE;
michael@0 8308 break;
michael@0 8309 }
michael@0 8310 U_ASSERT(sLen >= 0);
michael@0 8311 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
michael@0 8312 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
michael@0 8313 sOrder = elements[sChar];
michael@0 8314 if(sOrder > UCOL_NOT_FOUND) {
michael@0 8315 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
michael@0 8316 }
michael@0 8317 }
michael@0 8318 while(tOrder==0) {
michael@0 8319 if(tIndex==tLen) {
michael@0 8320 if(endOfSource) {
michael@0 8321 return UCOL_EQUAL; // if both strings are at the end, they are equal
michael@0 8322 } else {
michael@0 8323 return UCOL_GREATER;
michael@0 8324 }
michael@0 8325 }
michael@0 8326 U_ASSERT(tLen >= 0);
michael@0 8327 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
michael@0 8328 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
michael@0 8329 tOrder = elements[tChar];
michael@0 8330 if(tOrder > UCOL_NOT_FOUND) {
michael@0 8331 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
michael@0 8332 }
michael@0 8333 }
michael@0 8334 if(endOfSource) {
michael@0 8335 return UCOL_LESS;
michael@0 8336 }
michael@0 8337 if(sOrder == tOrder) {
michael@0 8338 sOrder = 0; tOrder = 0;
michael@0 8339 continue;
michael@0 8340 } else {
michael@0 8341 if(((sOrder^tOrder)&0xff000000)!=0) {
michael@0 8342 if(sOrder < tOrder) {
michael@0 8343 return UCOL_LESS;
michael@0 8344 } else if(sOrder > tOrder) {
michael@0 8345 return UCOL_GREATER;
michael@0 8346 }
michael@0 8347 }
michael@0 8348 sOrder<<=8;
michael@0 8349 tOrder<<=8;
michael@0 8350 }
michael@0 8351 }
michael@0 8352 }
// reached when strength is below tertiary and no lower level found a difference
michael@0 8353 return UCOL_EQUAL;
michael@0 8354 }
michael@0 8355
michael@0 8356 U_CAPI UCollationResult U_EXPORT2
michael@0 8357 ucol_strcollIter( const UCollator *coll,
michael@0 8358 UCharIterator *sIter,
michael@0 8359 UCharIterator *tIter,
michael@0 8360 UErrorCode *status)
michael@0 8361 {
michael@0 8362 if(!status || U_FAILURE(*status)) {
michael@0 8363 return UCOL_EQUAL;
michael@0 8364 }
michael@0 8365
michael@0 8366 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
michael@0 8367 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
michael@0 8368
michael@0 8369 if (sIter == tIter) {
michael@0 8370 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
michael@0 8371 return UCOL_EQUAL;
michael@0 8372 }
michael@0 8373 if(sIter == NULL || tIter == NULL || coll == NULL) {
michael@0 8374 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 8375 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
michael@0 8376 return UCOL_EQUAL;
michael@0 8377 }
michael@0 8378
michael@0 8379 UCollationResult result = UCOL_EQUAL;
michael@0 8380
michael@0 8381 // Preparing the context objects for iterating over strings
michael@0 8382 collIterate sColl, tColl;
michael@0 8383 IInit_collIterate(coll, NULL, -1, &sColl, status);
michael@0 8384 IInit_collIterate(coll, NULL, -1, &tColl, status);
michael@0 8385 if(U_FAILURE(*status)) {
michael@0 8386 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
michael@0 8387 return UCOL_EQUAL;
michael@0 8388 }
michael@0 8389 // The division for the array length may truncate the array size to
michael@0 8390 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
michael@0 8391 // for all platforms anyway.
michael@0 8392 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 8393 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
michael@0 8394 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
michael@0 8395
michael@0 8396 sColl.iterator = sIter;
michael@0 8397 sColl.flags |= UCOL_USE_ITERATOR;
michael@0 8398 tColl.flags |= UCOL_USE_ITERATOR;
michael@0 8399 tColl.iterator = tIter;
michael@0 8400
michael@0 8401 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
michael@0 8402 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
michael@0 8403 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
michael@0 8404 sColl.flags &= ~UCOL_ITER_NORM;
michael@0 8405
michael@0 8406 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
michael@0 8407 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
michael@0 8408 tColl.flags &= ~UCOL_ITER_NORM;
michael@0 8409 }
michael@0 8410
michael@0 8411 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
michael@0 8412
michael@0 8413 while((sChar = sColl.iterator->next(sColl.iterator)) ==
michael@0 8414 (tChar = tColl.iterator->next(tColl.iterator))) {
michael@0 8415 if(sChar == U_SENTINEL) {
michael@0 8416 result = UCOL_EQUAL;
michael@0 8417 goto end_compare;
michael@0 8418 }
michael@0 8419 }
michael@0 8420
michael@0 8421 if(sChar == U_SENTINEL) {
michael@0 8422 tChar = tColl.iterator->previous(tColl.iterator);
michael@0 8423 }
michael@0 8424
michael@0 8425 if(tChar == U_SENTINEL) {
michael@0 8426 sChar = sColl.iterator->previous(sColl.iterator);
michael@0 8427 }
michael@0 8428
michael@0 8429 sChar = sColl.iterator->previous(sColl.iterator);
michael@0 8430 tChar = tColl.iterator->previous(tColl.iterator);
michael@0 8431
michael@0 8432 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
michael@0 8433 {
michael@0 8434 // We are stopped in the middle of a contraction.
michael@0 8435 // Scan backwards through the == part of the string looking for the start of the contraction.
michael@0 8436 // It doesn't matter which string we scan, since they are the same in this region.
michael@0 8437 do
michael@0 8438 {
michael@0 8439 sChar = sColl.iterator->previous(sColl.iterator);
michael@0 8440 tChar = tColl.iterator->previous(tColl.iterator);
michael@0 8441 }
michael@0 8442 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
michael@0 8443 }
michael@0 8444
michael@0 8445
michael@0 8446 if(U_SUCCESS(*status)) {
michael@0 8447 result = ucol_strcollRegular(&sColl, &tColl, status);
michael@0 8448 }
michael@0 8449
michael@0 8450 end_compare:
michael@0 8451 if(sNormIter || tNormIter) {
michael@0 8452 unorm_closeIter(sNormIter);
michael@0 8453 unorm_closeIter(tNormIter);
michael@0 8454 }
michael@0 8455
michael@0 8456 UTRACE_EXIT_VALUE_STATUS(result, *status)
michael@0 8457 return result;
michael@0 8458 }
michael@0 8459
michael@0 8460
michael@0 8461 /* */
michael@0 8462 /* ucol_strcoll Main public API string comparison function */
michael@0 8463 /* */
michael@0 8464 U_CAPI UCollationResult U_EXPORT2
michael@0 8465 ucol_strcoll( const UCollator *coll,
michael@0 8466 const UChar *source,
michael@0 8467 int32_t sourceLength,
michael@0 8468 const UChar *target,
michael@0 8469 int32_t targetLength)
michael@0 8470 {
michael@0 8471 U_ALIGN_CODE(16);
michael@0 8472
michael@0 8473 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
michael@0 8474 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
michael@0 8475 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
michael@0 8476 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
michael@0 8477 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
michael@0 8478 }
michael@0 8479
michael@0 8480 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
michael@0 8481 // do not crash, but return. Should have
michael@0 8482 // status argument to return error.
michael@0 8483 UTRACE_EXIT_VALUE(UCOL_EQUAL);
michael@0 8484 return UCOL_EQUAL;
michael@0 8485 }
michael@0 8486
michael@0 8487 /* Quick check if source and target are same strings. */
michael@0 8488 /* They should either both be NULL terminated or the explicit length should be set on both. */
michael@0 8489 if (source==target && sourceLength==targetLength) {
michael@0 8490 UTRACE_EXIT_VALUE(UCOL_EQUAL);
michael@0 8491 return UCOL_EQUAL;
michael@0 8492 }
michael@0 8493
michael@0 8494 if(coll->delegate != NULL) {
michael@0 8495 UErrorCode status = U_ZERO_ERROR;
michael@0 8496 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
michael@0 8497 }
michael@0 8498
michael@0 8499 /* Scan the strings. Find: */
michael@0 8500 /* The length of any leading portion that is equal */
michael@0 8501 /* Whether they are exactly equal. (in which case we just return) */
michael@0 8502 const UChar *pSrc = source;
michael@0 8503 const UChar *pTarg = target;
michael@0 8504 int32_t equalLength;
michael@0 8505
michael@0 8506 if (sourceLength == -1 && targetLength == -1) {
michael@0 8507 // Both strings are null terminated.
michael@0 8508 // Scan through any leading equal portion.
michael@0 8509 while (*pSrc == *pTarg && *pSrc != 0) {
michael@0 8510 pSrc++;
michael@0 8511 pTarg++;
michael@0 8512 }
michael@0 8513 if (*pSrc == 0 && *pTarg == 0) {
michael@0 8514 UTRACE_EXIT_VALUE(UCOL_EQUAL);
michael@0 8515 return UCOL_EQUAL;
michael@0 8516 }
michael@0 8517 equalLength = (int32_t)(pSrc - source);
michael@0 8518 }
michael@0 8519 else
michael@0 8520 {
michael@0 8521 // One or both strings has an explicit length.
michael@0 8522 const UChar *pSrcEnd = source + sourceLength;
michael@0 8523 const UChar *pTargEnd = target + targetLength;
michael@0 8524
michael@0 8525 // Scan while the strings are bitwise ==, or until one is exhausted.
michael@0 8526 for (;;) {
michael@0 8527 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
michael@0 8528 break;
michael@0 8529 }
michael@0 8530 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
michael@0 8531 break;
michael@0 8532 }
michael@0 8533 if (*pSrc != *pTarg) {
michael@0 8534 break;
michael@0 8535 }
michael@0 8536 pSrc++;
michael@0 8537 pTarg++;
michael@0 8538 }
michael@0 8539 equalLength = (int32_t)(pSrc - source);
michael@0 8540
michael@0 8541 // If we made it all the way through both strings, we are done. They are ==
michael@0 8542 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
michael@0 8543 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
michael@0 8544 {
michael@0 8545 UTRACE_EXIT_VALUE(UCOL_EQUAL);
michael@0 8546 return UCOL_EQUAL;
michael@0 8547 }
michael@0 8548 }
michael@0 8549 if (equalLength > 0) {
michael@0 8550 /* There is an identical portion at the beginning of the two strings. */
michael@0 8551 /* If the identical portion ends within a contraction or a comibining */
michael@0 8552 /* character sequence, back up to the start of that sequence. */
michael@0 8553
michael@0 8554 // These values should already be set by the code above.
michael@0 8555 //pSrc = source + equalLength; /* point to the first differing chars */
michael@0 8556 //pTarg = target + equalLength;
michael@0 8557 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
michael@0 8558 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
michael@0 8559 {
michael@0 8560 // We are stopped in the middle of a contraction.
michael@0 8561 // Scan backwards through the == part of the string looking for the start of the contraction.
michael@0 8562 // It doesn't matter which string we scan, since they are the same in this region.
michael@0 8563 do
michael@0 8564 {
michael@0 8565 equalLength--;
michael@0 8566 pSrc--;
michael@0 8567 }
michael@0 8568 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
michael@0 8569 }
michael@0 8570
michael@0 8571 source += equalLength;
michael@0 8572 target += equalLength;
michael@0 8573 if (sourceLength > 0) {
michael@0 8574 sourceLength -= equalLength;
michael@0 8575 }
michael@0 8576 if (targetLength > 0) {
michael@0 8577 targetLength -= equalLength;
michael@0 8578 }
michael@0 8579 }
michael@0 8580
michael@0 8581 UErrorCode status = U_ZERO_ERROR;
michael@0 8582 UCollationResult returnVal;
michael@0 8583 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
michael@0 8584 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
michael@0 8585 } else {
michael@0 8586 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
michael@0 8587 }
michael@0 8588 UTRACE_EXIT_VALUE(returnVal);
michael@0 8589 return returnVal;
michael@0 8590 }
michael@0 8591
michael@0 8592 U_CAPI UCollationResult U_EXPORT2
michael@0 8593 ucol_strcollUTF8(
michael@0 8594 const UCollator *coll,
michael@0 8595 const char *source,
michael@0 8596 int32_t sourceLength,
michael@0 8597 const char *target,
michael@0 8598 int32_t targetLength,
michael@0 8599 UErrorCode *status)
michael@0 8600 {
michael@0 8601 U_ALIGN_CODE(16);
michael@0 8602
michael@0 8603 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
michael@0 8604 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
michael@0 8605 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
michael@0 8606 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
michael@0 8607 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
michael@0 8608 }
michael@0 8609
michael@0 8610 if (U_FAILURE(*status)) {
michael@0 8611 /* do nothing */
michael@0 8612 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
michael@0 8613 return UCOL_EQUAL;
michael@0 8614 }
michael@0 8615
michael@0 8616 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
michael@0 8617 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 8618 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
michael@0 8619 return UCOL_EQUAL;
michael@0 8620 }
michael@0 8621
michael@0 8622 /* Quick check if source and target are same strings. */
michael@0 8623 /* They should either both be NULL terminated or the explicit length should be set on both. */
michael@0 8624 if (source==target && sourceLength==targetLength) {
michael@0 8625 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
michael@0 8626 return UCOL_EQUAL;
michael@0 8627 }
michael@0 8628
michael@0 8629 if(coll->delegate != NULL) {
michael@0 8630 return ((const Collator*)coll->delegate)->compareUTF8(
michael@0 8631 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
michael@0 8632 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
michael@0 8633 *status);
michael@0 8634 }
michael@0 8635
michael@0 8636 /* Scan the strings. Find: */
michael@0 8637 /* The length of any leading portion that is equal */
michael@0 8638 /* Whether they are exactly equal. (in which case we just return) */
michael@0 8639 const char *pSrc = source;
michael@0 8640 const char *pTarg = target;
michael@0 8641 UBool bSrcLimit = FALSE;
michael@0 8642 UBool bTargLimit = FALSE;
michael@0 8643
michael@0 8644 if (sourceLength == -1 && targetLength == -1) {
michael@0 8645 // Both strings are null terminated.
michael@0 8646 // Scan through any leading equal portion.
michael@0 8647 while (*pSrc == *pTarg && *pSrc != 0) {
michael@0 8648 pSrc++;
michael@0 8649 pTarg++;
michael@0 8650 }
michael@0 8651 if (*pSrc == 0 && *pTarg == 0) {
michael@0 8652 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
michael@0 8653 return UCOL_EQUAL;
michael@0 8654 }
michael@0 8655 bSrcLimit = (*pSrc == 0);
michael@0 8656 bTargLimit = (*pTarg == 0);
michael@0 8657 }
michael@0 8658 else
michael@0 8659 {
michael@0 8660 // One or both strings has an explicit length.
michael@0 8661 const char *pSrcEnd = source + sourceLength;
michael@0 8662 const char *pTargEnd = target + targetLength;
michael@0 8663
michael@0 8664 // Scan while the strings are bitwise ==, or until one is exhausted.
michael@0 8665 for (;;) {
michael@0 8666 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
michael@0 8667 break;
michael@0 8668 }
michael@0 8669 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
michael@0 8670 break;
michael@0 8671 }
michael@0 8672 if (*pSrc != *pTarg) {
michael@0 8673 break;
michael@0 8674 }
michael@0 8675 pSrc++;
michael@0 8676 pTarg++;
michael@0 8677 }
michael@0 8678 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0));
michael@0 8679 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
michael@0 8680
michael@0 8681 // If we made it all the way through both strings, we are done. They are ==
michael@0 8682 if (bSrcLimit && /* At end of src string, however it was specified. */
michael@0 8683 bTargLimit) /* and also at end of dest string */
michael@0 8684 {
michael@0 8685 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
michael@0 8686 return UCOL_EQUAL;
michael@0 8687 }
michael@0 8688 }
michael@0 8689
michael@0 8690 U_ASSERT(!(bSrcLimit && bTargLimit));
michael@0 8691
michael@0 8692 int32_t equalLength = pSrc - source;
michael@0 8693 UBool bSawNonLatin1 = FALSE;
michael@0 8694
michael@0 8695 if (equalLength > 0) {
michael@0 8696 // Align position to the start of UTF-8 code point.
michael@0 8697 if (bTargLimit) {
michael@0 8698 U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
michael@0 8699 } else {
michael@0 8700 U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
michael@0 8701 }
michael@0 8702 pSrc = source + equalLength;
michael@0 8703 pTarg = target + equalLength;
michael@0 8704 }
michael@0 8705
michael@0 8706 if (equalLength > 0) {
michael@0 8707 /* There is an identical portion at the beginning of the two strings. */
michael@0 8708 /* If the identical portion ends within a contraction or a comibining */
michael@0 8709 /* character sequence, back up to the start of that sequence. */
michael@0 8710 UBool bUnsafeCP = FALSE;
michael@0 8711 UChar32 uc32 = -1;
michael@0 8712
michael@0 8713 if (!bSrcLimit) {
michael@0 8714 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
michael@0 8715 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
michael@0 8716 bUnsafeCP = TRUE;
michael@0 8717 }
michael@0 8718 bSawNonLatin1 |= (uc32 > 0xff);
michael@0 8719 }
michael@0 8720 if (!bTargLimit) {
michael@0 8721 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
michael@0 8722 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
michael@0 8723 bUnsafeCP = TRUE;
michael@0 8724 }
michael@0 8725 bSawNonLatin1 |= (uc32 > 0xff);
michael@0 8726 }
michael@0 8727
michael@0 8728 if (bUnsafeCP) {
michael@0 8729 while (equalLength > 0) {
michael@0 8730 // We are stopped in the middle of a contraction.
michael@0 8731 // Scan backwards through the == part of the string looking for the start of the contraction.
michael@0 8732 // It doesn't matter which string we scan, since they are the same in this region.
michael@0 8733 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
michael@0 8734 bSawNonLatin1 |= (uc32 > 0xff);
michael@0 8735 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
michael@0 8736 break;
michael@0 8737 }
michael@0 8738 }
michael@0 8739 }
michael@0 8740 source += equalLength;
michael@0 8741 target += equalLength;
michael@0 8742 if (sourceLength > 0) {
michael@0 8743 sourceLength -= equalLength;
michael@0 8744 }
michael@0 8745 if (targetLength > 0) {
michael@0 8746 targetLength -= equalLength;
michael@0 8747 }
michael@0 8748 } else {
michael@0 8749 // Lead byte of Latin 1 character is 0x00 - 0xC3
michael@0 8750 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
michael@0 8751 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
michael@0 8752 }
michael@0 8753
michael@0 8754 UCollationResult returnVal;
michael@0 8755
michael@0 8756 if(!coll->latinOneUse || bSawNonLatin1) {
michael@0 8757 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
michael@0 8758 } else {
michael@0 8759 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
michael@0 8760 }
michael@0 8761 UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
michael@0 8762 return returnVal;
michael@0 8763 }
michael@0 8764
michael@0 8765
michael@0 8766 /* convenience function for comparing strings */
michael@0 8767 U_CAPI UBool U_EXPORT2
michael@0 8768 ucol_greater( const UCollator *coll,
michael@0 8769 const UChar *source,
michael@0 8770 int32_t sourceLength,
michael@0 8771 const UChar *target,
michael@0 8772 int32_t targetLength)
michael@0 8773 {
michael@0 8774 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
michael@0 8775 == UCOL_GREATER);
michael@0 8776 }
michael@0 8777
michael@0 8778 /* convenience function for comparing strings */
michael@0 8779 U_CAPI UBool U_EXPORT2
michael@0 8780 ucol_greaterOrEqual( const UCollator *coll,
michael@0 8781 const UChar *source,
michael@0 8782 int32_t sourceLength,
michael@0 8783 const UChar *target,
michael@0 8784 int32_t targetLength)
michael@0 8785 {
michael@0 8786 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
michael@0 8787 != UCOL_LESS);
michael@0 8788 }
michael@0 8789
michael@0 8790 /* convenience function for comparing strings */
michael@0 8791 U_CAPI UBool U_EXPORT2
michael@0 8792 ucol_equal( const UCollator *coll,
michael@0 8793 const UChar *source,
michael@0 8794 int32_t sourceLength,
michael@0 8795 const UChar *target,
michael@0 8796 int32_t targetLength)
michael@0 8797 {
michael@0 8798 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
michael@0 8799 == UCOL_EQUAL);
michael@0 8800 }
michael@0 8801
michael@0 8802 U_CAPI void U_EXPORT2
michael@0 8803 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
michael@0 8804 if(coll && coll->UCA) {
michael@0 8805 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
michael@0 8806 }
michael@0 8807 }
michael@0 8808
michael@0 8809 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial