intl/icu/source/i18n/alphaindex.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2009-2013, International Business Machines Corporation and
michael@0 4 * others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
michael@0 11
michael@0 12 #include "unicode/alphaindex.h"
michael@0 13 #include "unicode/coleitr.h"
michael@0 14 #include "unicode/coll.h"
michael@0 15 #include "unicode/localpointer.h"
michael@0 16 #include "unicode/normalizer2.h"
michael@0 17 #include "unicode/tblcoll.h"
michael@0 18 #include "unicode/ulocdata.h"
michael@0 19 #include "unicode/uniset.h"
michael@0 20 #include "unicode/uobject.h"
michael@0 21 #include "unicode/usetiter.h"
michael@0 22 #include "unicode/utf16.h"
michael@0 23
michael@0 24 #include "cmemory.h"
michael@0 25 #include "cstring.h"
michael@0 26 #include "uassert.h"
michael@0 27 #include "uvector.h"
michael@0 28
michael@0 29 //#include <string>
michael@0 30 //#include <iostream>
michael@0 31
michael@0 32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 33
michael@0 34 U_NAMESPACE_BEGIN
michael@0 35
michael@0 36 namespace {
michael@0 37
michael@0 38 /**
michael@0 39 * Prefix string for Chinese index buckets.
michael@0 40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
michael@0 41 */
michael@0 42 const UChar BASE[1] = { 0xFDD0 };
michael@0 43 const int32_t BASE_LENGTH = 1;
michael@0 44
michael@0 45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
michael@0 46 const UnicodeString &one, const UnicodeString &other);
michael@0 47
michael@0 48 } // namespace
michael@0 49
michael@0 50 static int32_t U_CALLCONV
michael@0 51 collatorComparator(const void *context, const void *left, const void *right);
michael@0 52
michael@0 53 static int32_t U_CALLCONV
michael@0 54 recordCompareFn(const void *context, const void *left, const void *right);
michael@0 55
michael@0 56 // UVector<Record *> support function, delete a Record.
michael@0 57 static void U_CALLCONV
michael@0 58 alphaIndex_deleteRecord(void *obj) {
michael@0 59 delete static_cast<AlphabeticIndex::Record *>(obj);
michael@0 60 }
michael@0 61
michael@0 62 namespace {
michael@0 63
michael@0 64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
michael@0 65 UErrorCode &errorCode) {
michael@0 66 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 67 if (owned.isValid()) {
michael@0 68 return owned.orphan();
michael@0 69 }
michael@0 70 UnicodeString *p = new UnicodeString(s);
michael@0 71 if (p == NULL) {
michael@0 72 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 73 }
michael@0 74 return p;
michael@0 75 }
michael@0 76
michael@0 77 inline UnicodeString *getString(const UVector &list, int32_t i) {
michael@0 78 return static_cast<UnicodeString *>(list[i]);
michael@0 79 }
michael@0 80
michael@0 81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
michael@0 82 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
michael@0 83 }
michael@0 84
michael@0 85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
michael@0 86 return static_cast<AlphabeticIndex::Record *>(list[i]);
michael@0 87 }
michael@0 88
michael@0 89 /**
michael@0 90 * Like Java Collections.binarySearch(List, String, Comparator).
michael@0 91 *
michael@0 92 * @return the index>=0 where the item was found,
michael@0 93 * or the index<0 for inserting the string at ~index in sorted order
michael@0 94 */
michael@0 95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
michael@0 96 if (list.size() == 0) { return ~0; }
michael@0 97 int32_t start = 0;
michael@0 98 int32_t limit = list.size();
michael@0 99 for (;;) {
michael@0 100 int32_t i = (start + limit) / 2;
michael@0 101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
michael@0 102 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 103 UCollationResult cmp = coll.compare(s, *si, errorCode);
michael@0 104 if (cmp == UCOL_EQUAL) {
michael@0 105 return i;
michael@0 106 } else if (cmp < 0) {
michael@0 107 if (i == start) {
michael@0 108 return ~start; // insert s before *si
michael@0 109 }
michael@0 110 limit = i;
michael@0 111 } else {
michael@0 112 if (i == start) {
michael@0 113 return ~(start + 1); // insert s after *si
michael@0 114 }
michael@0 115 start = i;
michael@0 116 }
michael@0 117 }
michael@0 118 }
michael@0 119
michael@0 120 } // namespace
michael@0 121
michael@0 122 // The BucketList is not in the anonymous namespace because only Clang
michael@0 123 // seems to support its use in other classes from there.
michael@0 124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
michael@0 125 class BucketList : public UObject {
michael@0 126 public:
michael@0 127 BucketList(UVector *bucketList, UVector *publicBucketList)
michael@0 128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
michael@0 129 int32_t displayIndex = 0;
michael@0 130 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
michael@0 131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
michael@0 132 }
michael@0 133 }
michael@0 134
michael@0 135 // The virtual destructor must not be inline.
michael@0 136 // See ticket #8454 for details.
michael@0 137 virtual ~BucketList();
michael@0 138
michael@0 139 int32_t getBucketCount() const {
michael@0 140 return immutableVisibleList_->size();
michael@0 141 }
michael@0 142
michael@0 143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
michael@0 144 UErrorCode &errorCode) {
michael@0 145 // binary search
michael@0 146 int32_t start = 0;
michael@0 147 int32_t limit = bucketList_->size();
michael@0 148 while ((start + 1) < limit) {
michael@0 149 int32_t i = (start + limit) / 2;
michael@0 150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
michael@0 151 UCollationResult nameVsBucket =
michael@0 152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
michael@0 153 if (nameVsBucket < 0) {
michael@0 154 limit = i;
michael@0 155 } else {
michael@0 156 start = i;
michael@0 157 }
michael@0 158 }
michael@0 159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
michael@0 160 if (bucket->displayBucket_ != NULL) {
michael@0 161 bucket = bucket->displayBucket_;
michael@0 162 }
michael@0 163 return bucket->displayIndex_;
michael@0 164 }
michael@0 165
michael@0 166 /** All of the buckets, visible and invisible. */
michael@0 167 UVector *bucketList_;
michael@0 168 /** Just the visible buckets. */
michael@0 169 UVector *immutableVisibleList_;
michael@0 170 };
michael@0 171
michael@0 172 BucketList::~BucketList() {
michael@0 173 delete bucketList_;
michael@0 174 if (immutableVisibleList_ != bucketList_) {
michael@0 175 delete immutableVisibleList_;
michael@0 176 }
michael@0 177 }
michael@0 178
michael@0 179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
michael@0 180 delete buckets_;
michael@0 181 delete collatorPrimaryOnly_;
michael@0 182 }
michael@0 183
michael@0 184 int32_t
michael@0 185 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
michael@0 186 return buckets_->getBucketCount();
michael@0 187 }
michael@0 188
michael@0 189 int32_t
michael@0 190 AlphabeticIndex::ImmutableIndex::getBucketIndex(
michael@0 191 const UnicodeString &name, UErrorCode &errorCode) const {
michael@0 192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
michael@0 193 }
michael@0 194
michael@0 195 const AlphabeticIndex::Bucket *
michael@0 196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
michael@0 197 if (0 <= index && index < buckets_->getBucketCount()) {
michael@0 198 return icu::getBucket(*buckets_->immutableVisibleList_, index);
michael@0 199 } else {
michael@0 200 return NULL;
michael@0 201 }
michael@0 202 }
michael@0 203
michael@0 204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
michael@0 205 : inputList_(NULL),
michael@0 206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
michael@0 207 maxLabelCount_(99),
michael@0 208 initialLabels_(NULL), firstCharsInScripts_(NULL),
michael@0 209 collator_(NULL), collatorPrimaryOnly_(NULL),
michael@0 210 buckets_(NULL) {
michael@0 211 init(&locale, status);
michael@0 212 }
michael@0 213
michael@0 214
michael@0 215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
michael@0 216 : inputList_(NULL),
michael@0 217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
michael@0 218 maxLabelCount_(99),
michael@0 219 initialLabels_(NULL), firstCharsInScripts_(NULL),
michael@0 220 collator_(collator), collatorPrimaryOnly_(NULL),
michael@0 221 buckets_(NULL) {
michael@0 222 init(NULL, status);
michael@0 223 }
michael@0 224
michael@0 225
michael@0 226
michael@0 227 AlphabeticIndex::~AlphabeticIndex() {
michael@0 228 delete collator_;
michael@0 229 delete collatorPrimaryOnly_;
michael@0 230 delete firstCharsInScripts_;
michael@0 231 delete buckets_;
michael@0 232 delete inputList_;
michael@0 233 delete initialLabels_;
michael@0 234 }
michael@0 235
michael@0 236
michael@0 237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
michael@0 238 if (U_FAILURE(status)) {
michael@0 239 return *this;
michael@0 240 }
michael@0 241 initialLabels_->addAll(additions);
michael@0 242 clearBuckets();
michael@0 243 return *this;
michael@0 244 }
michael@0 245
michael@0 246
michael@0 247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
michael@0 248 addIndexExemplars(locale, status);
michael@0 249 clearBuckets();
michael@0 250 return *this;
michael@0 251 }
michael@0 252
michael@0 253
michael@0 254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
michael@0 255 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 256 // In C++, the ImmutableIndex must own its copy of the BucketList,
michael@0 257 // even if it contains no records, for proper memory management.
michael@0 258 // We could clone the buckets_ if they are not NULL,
michael@0 259 // but that would be worth it only if this method is called multiple times,
michael@0 260 // or called after using the old-style bucket iterator API.
michael@0 261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
michael@0 262 LocalPointer<RuleBasedCollator> coll(
michael@0 263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
michael@0 264 if (immutableBucketList.isNull() || coll.isNull()) {
michael@0 265 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 266 return NULL;
michael@0 267 }
michael@0 268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
michael@0 269 if (immIndex == NULL) {
michael@0 270 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 271 return NULL;
michael@0 272 }
michael@0 273 // The ImmutableIndex adopted its parameter objects.
michael@0 274 immutableBucketList.orphan();
michael@0 275 coll.orphan();
michael@0 276 return immIndex;
michael@0 277 }
michael@0 278
michael@0 279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
michael@0 280 initBuckets(status);
michael@0 281 if (U_FAILURE(status)) {
michael@0 282 return 0;
michael@0 283 }
michael@0 284 return buckets_->getBucketCount();
michael@0 285 }
michael@0 286
michael@0 287
michael@0 288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
michael@0 289 if (U_FAILURE(status) || inputList_ == NULL) {
michael@0 290 return 0;
michael@0 291 }
michael@0 292 return inputList_->size();
michael@0 293 }
michael@0 294
michael@0 295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
michael@0 296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
michael@0 297 if (U_FAILURE(errorCode)) { return; }
michael@0 298
michael@0 299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
michael@0 300 const UnicodeString &overflowBoundary =
michael@0 301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
michael@0 302
michael@0 303 // We make a sorted array of elements.
michael@0 304 // Some of the input may be redundant.
michael@0 305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
michael@0 306 // We filter out those cases.
michael@0 307 UnicodeSetIterator iter(*initialLabels_);
michael@0 308 while (iter.next()) {
michael@0 309 const UnicodeString *item = &iter.getString();
michael@0 310 LocalPointer<UnicodeString> ownedItem;
michael@0 311 UBool checkDistinct;
michael@0 312 int32_t itemLength = item->length();
michael@0 313 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
michael@0 314 checkDistinct = FALSE;
michael@0 315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
michael@0 316 item->charAt(itemLength - 2) != 0x2a) {
michael@0 317 // Use a label if it is marked with one trailing star,
michael@0 318 // even if the label string sorts the same when all contractions are suppressed.
michael@0 319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
michael@0 320 item = ownedItem.getAlias();
michael@0 321 if (item == NULL) {
michael@0 322 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 323 return;
michael@0 324 }
michael@0 325 checkDistinct = FALSE;
michael@0 326 } else {
michael@0 327 checkDistinct = TRUE;
michael@0 328 }
michael@0 329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
michael@0 330 // Ignore a primary-ignorable or non-alphabetic index character.
michael@0 331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
michael@0 332 // Ignore an index characters that will land in the overflow bucket.
michael@0 333 } else if (checkDistinct &&
michael@0 334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
michael@0 335 // Ignore a multi-code point index character that does not sort distinctly
michael@0 336 // from the sequence of its separate characters.
michael@0 337 } else {
michael@0 338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
michael@0 339 if (insertionPoint < 0) {
michael@0 340 indexCharacters.insertElementAt(
michael@0 341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
michael@0 342 } else {
michael@0 343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
michael@0 344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
michael@0 345 indexCharacters.setElementAt(
michael@0 346 ownedString(*item, ownedItem, errorCode), insertionPoint);
michael@0 347 }
michael@0 348 }
michael@0 349 }
michael@0 350 }
michael@0 351 if (U_FAILURE(errorCode)) { return; }
michael@0 352
michael@0 353 // if the result is still too large, cut down to maxCount elements, by removing every nth element
michael@0 354
michael@0 355 int32_t size = indexCharacters.size() - 1;
michael@0 356 if (size > maxLabelCount_) {
michael@0 357 int32_t count = 0;
michael@0 358 int32_t old = -1;
michael@0 359 for (int32_t i = 0; i < indexCharacters.size();) {
michael@0 360 ++count;
michael@0 361 int32_t bump = count * maxLabelCount_ / size;
michael@0 362 if (bump == old) {
michael@0 363 indexCharacters.removeElementAt(i);
michael@0 364 } else {
michael@0 365 old = bump;
michael@0 366 ++i;
michael@0 367 }
michael@0 368 }
michael@0 369 }
michael@0 370 }
michael@0 371
michael@0 372 namespace {
michael@0 373
michael@0 374 const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
michael@0 375 if (!current.startsWith(BASE, BASE_LENGTH)) {
michael@0 376 return current;
michael@0 377 }
michael@0 378 UChar rest = current.charAt(BASE_LENGTH);
michael@0 379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
michael@0 380 int32_t count = rest-0x2800;
michael@0 381 temp.setTo((UChar)(0x30 + count % 10));
michael@0 382 if (count >= 10) {
michael@0 383 count /= 10;
michael@0 384 temp.insert(0, (UChar)(0x30 + count % 10));
michael@0 385 if (count >= 10) {
michael@0 386 count /= 10;
michael@0 387 temp.insert(0, (UChar)(0x30 + count));
michael@0 388 }
michael@0 389 }
michael@0 390 return temp.append((UChar)0x5283);
michael@0 391 }
michael@0 392 return temp.setTo(current, BASE_LENGTH);
michael@0 393 }
michael@0 394
michael@0 395 UBool hasMultiplePrimaryWeights(
michael@0 396 CollationElementIterator &cei, int32_t variableTop,
michael@0 397 const UnicodeString &s, UErrorCode &errorCode) {
michael@0 398 cei.setText(s, errorCode);
michael@0 399 UBool seenPrimary = FALSE;
michael@0 400 for (;;) {
michael@0 401 int32_t ce32 = cei.next(errorCode);
michael@0 402 if (ce32 == CollationElementIterator::NULLORDER) {
michael@0 403 break;
michael@0 404 }
michael@0 405 int32_t p = CollationElementIterator::primaryOrder(ce32);
michael@0 406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
michael@0 407 // not primary ignorable, and not a continuation CE
michael@0 408 if (seenPrimary) {
michael@0 409 return TRUE;
michael@0 410 }
michael@0 411 seenPrimary = TRUE;
michael@0 412 }
michael@0 413 }
michael@0 414 return FALSE;
michael@0 415 }
michael@0 416
michael@0 417 } // namespace
michael@0 418
michael@0 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
michael@0 420 // Initialize indexCharacters.
michael@0 421 UVector indexCharacters(errorCode);
michael@0 422 indexCharacters.setDeleter(uprv_deleteUObject);
michael@0 423 initLabels(indexCharacters, errorCode);
michael@0 424 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 425
michael@0 426 // Variables for hasMultiplePrimaryWeights().
michael@0 427 LocalPointer<CollationElementIterator> cei(
michael@0 428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
michael@0 429 if (cei.isNull()) {
michael@0 430 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 431 return NULL;
michael@0 432 }
michael@0 433 int32_t variableTop;
michael@0 434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
michael@0 435 variableTop = CollationElementIterator::primaryOrder(
michael@0 436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
michael@0 437 } else {
michael@0 438 variableTop = 0;
michael@0 439 }
michael@0 440 UBool hasInvisibleBuckets = FALSE;
michael@0 441
michael@0 442 // Helper arrays for Chinese Pinyin collation.
michael@0 443 Bucket *asciiBuckets[26] = {
michael@0 444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
michael@0 445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
michael@0 446 };
michael@0 447 Bucket *pinyinBuckets[26] = {
michael@0 448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
michael@0 449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
michael@0 450 };
michael@0 451 UBool hasPinyin = FALSE;
michael@0 452
michael@0 453 LocalPointer<UVector> bucketList(new UVector(errorCode));
michael@0 454 if (bucketList.isNull()) {
michael@0 455 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 456 return NULL;
michael@0 457 }
michael@0 458 bucketList->setDeleter(uprv_deleteUObject);
michael@0 459
michael@0 460 // underflow bucket
michael@0 461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
michael@0 462 if (bucket == NULL) {
michael@0 463 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 464 return NULL;
michael@0 465 }
michael@0 466 bucketList->addElement(bucket, errorCode);
michael@0 467 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 468
michael@0 469 UnicodeString temp;
michael@0 470
michael@0 471 // fix up the list, adding underflow, additions, overflow
michael@0 472 // Insert inflow labels as needed.
michael@0 473 int32_t scriptIndex = -1;
michael@0 474 const UnicodeString *scriptUpperBoundary = &emptyString_;
michael@0 475 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
michael@0 476 UnicodeString &current = *getString(indexCharacters, i);
michael@0 477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
michael@0 478 // We crossed the script boundary into a new script.
michael@0 479 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
michael@0 480 UBool skippedScript = FALSE;
michael@0 481 for (;;) {
michael@0 482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
michael@0 483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
michael@0 484 break;
michael@0 485 }
michael@0 486 skippedScript = TRUE;
michael@0 487 }
michael@0 488 if (skippedScript && bucketList->size() > 1) {
michael@0 489 // We are skipping one or more scripts,
michael@0 490 // and we are not just getting out of the underflow label.
michael@0 491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
michael@0 492 if (bucket == NULL) {
michael@0 493 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 494 return NULL;
michael@0 495 }
michael@0 496 bucketList->addElement(bucket, errorCode);
michael@0 497 }
michael@0 498 }
michael@0 499 // Add a bucket with the current label.
michael@0 500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
michael@0 501 if (bucket == NULL) {
michael@0 502 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 503 return NULL;
michael@0 504 }
michael@0 505 bucketList->addElement(bucket, errorCode);
michael@0 506 // Remember ASCII and Pinyin buckets for Pinyin redirects.
michael@0 507 UChar c;
michael@0 508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
michael@0 509 asciiBuckets[c - 0x41] = bucket;
michael@0 510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
michael@0 511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
michael@0 512 pinyinBuckets[c - 0x41] = bucket;
michael@0 513 hasPinyin = TRUE;
michael@0 514 }
michael@0 515 // Check for multiple primary weights.
michael@0 516 if (!current.startsWith(BASE, BASE_LENGTH) &&
michael@0 517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
michael@0 518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
michael@0 519 // "AE-ligature" or "Sch" etc.
michael@0 520 for (int32_t i = bucketList->size() - 2;; --i) {
michael@0 521 Bucket *singleBucket = getBucket(*bucketList, i);
michael@0 522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
michael@0 523 // There is no single-character bucket since the last
michael@0 524 // underflow or inflow label.
michael@0 525 break;
michael@0 526 }
michael@0 527 if (singleBucket->displayBucket_ == NULL &&
michael@0 528 !hasMultiplePrimaryWeights(
michael@0 529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
michael@0 530 // Add an invisible bucket that redirects strings greater than the expansion
michael@0 531 // to the previous single-character bucket.
michael@0 532 // For example, after ... Q R S Sch we add Sch\uFFFF->S
michael@0 533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
michael@0 534 bucket = new Bucket(emptyString_,
michael@0 535 UnicodeString(current).append((UChar)0xFFFF),
michael@0 536 U_ALPHAINDEX_NORMAL);
michael@0 537 if (bucket == NULL) {
michael@0 538 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 539 return NULL;
michael@0 540 }
michael@0 541 bucket->displayBucket_ = singleBucket;
michael@0 542 bucketList->addElement(bucket, errorCode);
michael@0 543 hasInvisibleBuckets = TRUE;
michael@0 544 break;
michael@0 545 }
michael@0 546 }
michael@0 547 }
michael@0 548 }
michael@0 549 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 550 if (bucketList->size() == 1) {
michael@0 551 // No real labels, show only the underflow label.
michael@0 552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
michael@0 553 if (bl == NULL) {
michael@0 554 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 555 return NULL;
michael@0 556 }
michael@0 557 bucketList.orphan();
michael@0 558 return bl;
michael@0 559 }
michael@0 560 // overflow bucket
michael@0 561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
michael@0 562 if (bucket == NULL) {
michael@0 563 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 564 return NULL;
michael@0 565 }
michael@0 566 bucketList->addElement(bucket, errorCode); // final
michael@0 567
michael@0 568 if (hasPinyin) {
michael@0 569 // Redirect Pinyin buckets.
michael@0 570 Bucket *asciiBucket = NULL;
michael@0 571 for (int32_t i = 0; i < 26; ++i) {
michael@0 572 if (asciiBuckets[i] != NULL) {
michael@0 573 asciiBucket = asciiBuckets[i];
michael@0 574 }
michael@0 575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
michael@0 576 pinyinBuckets[i]->displayBucket_ = asciiBucket;
michael@0 577 hasInvisibleBuckets = TRUE;
michael@0 578 }
michael@0 579 }
michael@0 580 }
michael@0 581
michael@0 582 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 583 if (!hasInvisibleBuckets) {
michael@0 584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
michael@0 585 if (bl == NULL) {
michael@0 586 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 587 return NULL;
michael@0 588 }
michael@0 589 bucketList.orphan();
michael@0 590 return bl;
michael@0 591 }
michael@0 592 // Merge inflow buckets that are visually adjacent.
michael@0 593 // Iterate backwards: Merge inflow into overflow rather than the other way around.
michael@0 594 int32_t i = bucketList->size() - 1;
michael@0 595 Bucket *nextBucket = getBucket(*bucketList, i);
michael@0 596 while (--i > 0) {
michael@0 597 bucket = getBucket(*bucketList, i);
michael@0 598 if (bucket->displayBucket_ != NULL) {
michael@0 599 continue; // skip invisible buckets
michael@0 600 }
michael@0 601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
michael@0 602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
michael@0 603 bucket->displayBucket_ = nextBucket;
michael@0 604 continue;
michael@0 605 }
michael@0 606 }
michael@0 607 nextBucket = bucket;
michael@0 608 }
michael@0 609
michael@0 610 LocalPointer<UVector> publicBucketList(new UVector(errorCode));
michael@0 611 if (bucketList.isNull()) {
michael@0 612 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 613 return NULL;
michael@0 614 }
michael@0 615 // Do not call publicBucketList->setDeleter():
michael@0 616 // This vector shares its objects with the bucketList.
michael@0 617 for (int32_t i = 0; i < bucketList->size(); ++i) {
michael@0 618 bucket = getBucket(*bucketList, i);
michael@0 619 if (bucket->displayBucket_ == NULL) {
michael@0 620 publicBucketList->addElement(bucket, errorCode);
michael@0 621 }
michael@0 622 }
michael@0 623 if (U_FAILURE(errorCode)) { return NULL; }
michael@0 624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
michael@0 625 if (bl == NULL) {
michael@0 626 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 627 return NULL;
michael@0 628 }
michael@0 629 bucketList.orphan();
michael@0 630 publicBucketList.orphan();
michael@0 631 return bl;
michael@0 632 }
michael@0 633
michael@0 634 /**
michael@0 635 * Creates an index, and buckets and sorts the list of records into the index.
michael@0 636 */
michael@0 637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
michael@0 638 if (U_FAILURE(errorCode) || buckets_ != NULL) {
michael@0 639 return;
michael@0 640 }
michael@0 641 buckets_ = createBucketList(errorCode);
michael@0 642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
michael@0 643 return;
michael@0 644 }
michael@0 645
michael@0 646 // Sort the records by name.
michael@0 647 // Stable sort preserves input order of collation duplicates.
michael@0 648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
michael@0 649
michael@0 650 // Now, we traverse all of the input, which is now sorted.
michael@0 651 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
michael@0 652 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
michael@0 653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
michael@0 654 // we need to improve it for that case.
michael@0 655
michael@0 656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
michael@0 657 int32_t bucketIndex = 1;
michael@0 658 Bucket *nextBucket;
michael@0 659 const UnicodeString *upperBoundary;
michael@0 660 if (bucketIndex < buckets_->bucketList_->size()) {
michael@0 661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
michael@0 662 upperBoundary = &nextBucket->lowerBoundary_;
michael@0 663 } else {
michael@0 664 nextBucket = NULL;
michael@0 665 upperBoundary = NULL;
michael@0 666 }
michael@0 667 for (int32_t i = 0; i < inputList_->size(); ++i) {
michael@0 668 Record *r = getRecord(*inputList_, i);
michael@0 669 // if the current bucket isn't the right one, find the one that is
michael@0 670 // We have a special flag for the last bucket so that we don't look any further
michael@0 671 while (upperBoundary != NULL &&
michael@0 672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
michael@0 673 currentBucket = nextBucket;
michael@0 674 // now reset the boundary that we compare against
michael@0 675 if (bucketIndex < buckets_->bucketList_->size()) {
michael@0 676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
michael@0 677 upperBoundary = &nextBucket->lowerBoundary_;
michael@0 678 } else {
michael@0 679 upperBoundary = NULL;
michael@0 680 }
michael@0 681 }
michael@0 682 // now put the record into the bucket.
michael@0 683 Bucket *bucket = currentBucket;
michael@0 684 if (bucket->displayBucket_ != NULL) {
michael@0 685 bucket = bucket->displayBucket_;
michael@0 686 }
michael@0 687 if (bucket->records_ == NULL) {
michael@0 688 bucket->records_ = new UVector(errorCode);
michael@0 689 if (bucket->records_ == NULL) {
michael@0 690 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 691 return;
michael@0 692 }
michael@0 693 }
michael@0 694 bucket->records_->addElement(r, errorCode);
michael@0 695 }
michael@0 696 }
michael@0 697
michael@0 698 void AlphabeticIndex::clearBuckets() {
michael@0 699 if (buckets_ != NULL) {
michael@0 700 delete buckets_;
michael@0 701 buckets_ = NULL;
michael@0 702 internalResetBucketIterator();
michael@0 703 }
michael@0 704 }
michael@0 705
michael@0 706 void AlphabeticIndex::internalResetBucketIterator() {
michael@0 707 labelsIterIndex_ = -1;
michael@0 708 currentBucket_ = NULL;
michael@0 709 }
michael@0 710
michael@0 711
michael@0 712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
michael@0 713 if (U_FAILURE(status)) { return; }
michael@0 714 // Chinese index characters, which are specific to each of the several Chinese tailorings,
michael@0 715 // take precedence over the single locale data exemplar set per language.
michael@0 716 const char *language = locale.getLanguage();
michael@0 717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
michael@0 718 uprv_strcmp(language, "ko") == 0) {
michael@0 719 // TODO: This should be done regardless of the language, but it's expensive.
michael@0 720 // We should add a Collator function (can be @internal)
michael@0 721 // to enumerate just the contractions that start with a given code point or string.
michael@0 722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
michael@0 723 return;
michael@0 724 }
michael@0 725 }
michael@0 726
michael@0 727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
michael@0 728 if (U_FAILURE(status)) {
michael@0 729 return;
michael@0 730 }
michael@0 731
michael@0 732 UnicodeSet exemplars;
michael@0 733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
michael@0 734 if (U_SUCCESS(status)) {
michael@0 735 initialLabels_->addAll(exemplars);
michael@0 736 return;
michael@0 737 }
michael@0 738 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
michael@0 739
michael@0 740 // The locale data did not include explicit Index characters.
michael@0 741 // Synthesize a set of them from the locale's standard exemplar characters.
michael@0 742 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
michael@0 743 if (U_FAILURE(status)) {
michael@0 744 return;
michael@0 745 }
michael@0 746
michael@0 747 // question: should we add auxiliary exemplars?
michael@0 748 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
michael@0 749 exemplars.add(0x61, 0x7A);
michael@0 750 }
michael@0 751 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
michael@0 752 // cut down to small list
michael@0 753 exemplars.remove(0xAC00, 0xD7A3).
michael@0 754 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
michael@0 755 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
michael@0 756 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
michael@0 757 add(0xD30C).add(0xD558);
michael@0 758 }
michael@0 759 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
michael@0 760 // cut down to small list
michael@0 761 // make use of the fact that Ethiopic is allocated in 8's, where
michael@0 762 // the base is 0 mod 8.
michael@0 763 UnicodeSet ethiopic(
michael@0 764 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
michael@0 765 UnicodeSetIterator it(ethiopic);
michael@0 766 while (it.next() && !it.isString()) {
michael@0 767 if ((it.getCodepoint() & 0x7) != 0) {
michael@0 768 exemplars.remove(it.getCodepoint());
michael@0 769 }
michael@0 770 }
michael@0 771 }
michael@0 772
michael@0 773 // Upper-case any that aren't already so.
michael@0 774 // (We only do this for synthesized index characters.)
michael@0 775 UnicodeSetIterator it(exemplars);
michael@0 776 UnicodeString upperC;
michael@0 777 while (it.next()) {
michael@0 778 const UnicodeString &exemplarC = it.getString();
michael@0 779 upperC = exemplarC;
michael@0 780 upperC.toUpper(locale);
michael@0 781 initialLabels_->add(upperC);
michael@0 782 }
michael@0 783 }
michael@0 784
michael@0 785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
michael@0 786 UnicodeSet contractions;
michael@0 787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
michael@0 788 contractions.toUSet(), NULL, FALSE, &errorCode);
michael@0 789 if (U_FAILURE(errorCode)) { return FALSE; }
michael@0 790 UnicodeString firstHanBoundary;
michael@0 791 UBool hasPinyin = FALSE;
michael@0 792 UnicodeSetIterator iter(contractions);
michael@0 793 while (iter.next()) {
michael@0 794 const UnicodeString &s = iter.getString();
michael@0 795 if (s.startsWith(BASE, BASE_LENGTH)) {
michael@0 796 initialLabels_->add(s);
michael@0 797 if (firstHanBoundary.isEmpty() ||
michael@0 798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
michael@0 799 firstHanBoundary = s;
michael@0 800 }
michael@0 801 UChar c = s.charAt(s.length() - 1);
michael@0 802 if (0x41 <= c && c <= 0x5A) { // A-Z
michael@0 803 hasPinyin = TRUE;
michael@0 804 }
michael@0 805 }
michael@0 806 }
michael@0 807 if (hasPinyin) {
michael@0 808 initialLabels_->add(0x41, 0x5A); // A-Z
michael@0 809 }
michael@0 810 if (!firstHanBoundary.isEmpty()) {
michael@0 811 // The hardcoded list of script boundaries includes U+4E00
michael@0 812 // which is tailored to not be the first primary
michael@0 813 // in all Chinese tailorings except "unihan".
michael@0 814 // Replace U+4E00 with the first boundary string from the tailoring.
michael@0 815 // TODO: This becomes obsolete when the root collator gets
michael@0 816 // reliable script-first-primary mappings.
michael@0 817 int32_t hanIndex = binarySearch(
michael@0 818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
michael@0 819 if (hanIndex >= 0) {
michael@0 820 UnicodeString *fh = new UnicodeString(firstHanBoundary);
michael@0 821 if (fh == NULL) {
michael@0 822 errorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 823 return FALSE;
michael@0 824 }
michael@0 825 firstCharsInScripts_->setElementAt(fh, hanIndex);
michael@0 826 }
michael@0 827 return TRUE;
michael@0 828 } else {
michael@0 829 return FALSE;
michael@0 830 }
michael@0 831 }
michael@0 832
michael@0 833
michael@0 834 /*
michael@0 835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
michael@0 836 */
michael@0 837 static const UChar CGJ = 0x034F;
michael@0 838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
michael@0 839 UnicodeString result;
michael@0 840 if (item.length() == 0) {
michael@0 841 return result;
michael@0 842 }
michael@0 843 int32_t i = 0;
michael@0 844 for (;;) {
michael@0 845 UChar32 cp = item.char32At(i);
michael@0 846 result.append(cp);
michael@0 847 i = item.moveIndex32(i, 1);
michael@0 848 if (i >= item.length()) {
michael@0 849 break;
michael@0 850 }
michael@0 851 result.append(CGJ);
michael@0 852 }
michael@0 853 return result;
michael@0 854 }
michael@0 855
michael@0 856
michael@0 857 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
michael@0 858 return FALSE;
michael@0 859 }
michael@0 860
michael@0 861
michael@0 862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
michael@0 863 return FALSE;
michael@0 864 }
michael@0 865
michael@0 866
michael@0 867 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
michael@0 868 // There are no known non-RuleBasedCollator collators, and none ever expected.
michael@0 869 // But, in case that changes, better a null pointer than a wrong type.
michael@0 870 return *dynamic_cast<RuleBasedCollator *>(collator_);
michael@0 871 }
michael@0 872
michael@0 873
michael@0 874 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
michael@0 875 return inflowLabel_;
michael@0 876 }
michael@0 877
michael@0 878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
michael@0 879 return overflowLabel_;
michael@0 880 }
michael@0 881
michael@0 882
michael@0 883 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
michael@0 884 return underflowLabel_;
michael@0 885 }
michael@0 886
michael@0 887
michael@0 888 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
michael@0 889 inflowLabel_ = label;
michael@0 890 clearBuckets();
michael@0 891 return *this;
michael@0 892 }
michael@0 893
michael@0 894
michael@0 895 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
michael@0 896 overflowLabel_ = label;
michael@0 897 clearBuckets();
michael@0 898 return *this;
michael@0 899 }
michael@0 900
michael@0 901
michael@0 902 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
michael@0 903 underflowLabel_ = label;
michael@0 904 clearBuckets();
michael@0 905 return *this;
michael@0 906 }
michael@0 907
michael@0 908
michael@0 909 int32_t AlphabeticIndex::getMaxLabelCount() const {
michael@0 910 return maxLabelCount_;
michael@0 911 }
michael@0 912
michael@0 913
michael@0 914 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
michael@0 915 if (U_FAILURE(status)) {
michael@0 916 return *this;
michael@0 917 }
michael@0 918 if (maxLabelCount <= 0) {
michael@0 919 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 920 return *this;
michael@0 921 }
michael@0 922 maxLabelCount_ = maxLabelCount;
michael@0 923 clearBuckets();
michael@0 924 return *this;
michael@0 925 }
michael@0 926
michael@0 927
michael@0 928 //
michael@0 929 // init() - Common code for constructors.
michael@0 930 //
michael@0 931
michael@0 932 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
michael@0 933 if (U_FAILURE(status)) { return; }
michael@0 934 if (locale == NULL && collator_ == NULL) {
michael@0 935 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 936 return;
michael@0 937 }
michael@0 938
michael@0 939 initialLabels_ = new UnicodeSet();
michael@0 940 if (initialLabels_ == NULL) {
michael@0 941 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 942 return;
michael@0 943 }
michael@0 944
michael@0 945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
michael@0 946 overflowLabel_ = inflowLabel_;
michael@0 947 underflowLabel_ = inflowLabel_;
michael@0 948
michael@0 949 if (collator_ == NULL) {
michael@0 950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
michael@0 951 if (U_FAILURE(status)) { return; }
michael@0 952 if (collator_ == NULL) {
michael@0 953 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 954 return;
michael@0 955 }
michael@0 956 }
michael@0 957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
michael@0 958 if (collatorPrimaryOnly_ == NULL) {
michael@0 959 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 960 return;
michael@0 961 }
michael@0 962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
michael@0 963 firstCharsInScripts_ = firstStringsInScript(status);
michael@0 964 if (U_FAILURE(status)) { return; }
michael@0 965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
michael@0 966 UnicodeString _4E00((UChar)0x4E00);
michael@0 967 UnicodeString _1100((UChar)0x1100);
michael@0 968 UnicodeString _1112((UChar)0x1112);
michael@0 969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
michael@0 970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
michael@0 971 // The standard Korean tailoring sorts Hanja (Han characters)
michael@0 972 // as secondary differences from Hangul syllables.
michael@0 973 // This makes U+4E00 not useful as a Han-script boundary.
michael@0 974 // TODO: This becomes obsolete when the root collator gets
michael@0 975 // reliable script-first-primary mappings.
michael@0 976 int32_t hanIndex = binarySearch(
michael@0 977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
michael@0 978 if (hanIndex >= 0) {
michael@0 979 firstCharsInScripts_->removeElementAt(hanIndex);
michael@0 980 }
michael@0 981 }
michael@0 982 // Guard against a degenerate collator where
michael@0 983 // some script boundary strings are primary ignorable.
michael@0 984 for (;;) {
michael@0 985 if (U_FAILURE(status)) { return; }
michael@0 986 if (firstCharsInScripts_->isEmpty()) {
michael@0 987 // AlphabeticIndex requires some non-ignorable script boundary strings.
michael@0 988 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 989 return;
michael@0 990 }
michael@0 991 if (collatorPrimaryOnly_->compare(
michael@0 992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
michael@0 993 emptyString_, status) == UCOL_EQUAL) {
michael@0 994 firstCharsInScripts_->removeElementAt(0);
michael@0 995 } else {
michael@0 996 break;
michael@0 997 }
michael@0 998 }
michael@0 999
michael@0 1000 if (locale != NULL) {
michael@0 1001 addIndexExemplars(*locale, status);
michael@0 1002 }
michael@0 1003 }
michael@0 1004
michael@0 1005
michael@0 1006 //
michael@0 1007 // Comparison function for UVector<UnicodeString *> sorting with a collator.
michael@0 1008 //
michael@0 1009 static int32_t U_CALLCONV
michael@0 1010 collatorComparator(const void *context, const void *left, const void *right) {
michael@0 1011 const UElement *leftElement = static_cast<const UElement *>(left);
michael@0 1012 const UElement *rightElement = static_cast<const UElement *>(right);
michael@0 1013 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
michael@0 1014 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
michael@0 1015
michael@0 1016 if (leftString == rightString) {
michael@0 1017 // Catches case where both are NULL
michael@0 1018 return 0;
michael@0 1019 }
michael@0 1020 if (leftString == NULL) {
michael@0 1021 return 1;
michael@0 1022 };
michael@0 1023 if (rightString == NULL) {
michael@0 1024 return -1;
michael@0 1025 }
michael@0 1026 const Collator *col = static_cast<const Collator *>(context);
michael@0 1027 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 1028 return col->compare(*leftString, *rightString, errorCode);
michael@0 1029 }
michael@0 1030
michael@0 1031 //
michael@0 1032 // Comparison function for UVector<Record *> sorting with a collator.
michael@0 1033 //
michael@0 1034 static int32_t U_CALLCONV
michael@0 1035 recordCompareFn(const void *context, const void *left, const void *right) {
michael@0 1036 const UElement *leftElement = static_cast<const UElement *>(left);
michael@0 1037 const UElement *rightElement = static_cast<const UElement *>(right);
michael@0 1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
michael@0 1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
michael@0 1040 const Collator *col = static_cast<const Collator *>(context);
michael@0 1041 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 1042 return col->compare(leftRec->name_, rightRec->name_, errorCode);
michael@0 1043 }
michael@0 1044
michael@0 1045
michael@0 1046 /**
michael@0 1047 * This list contains one character per script that has the
michael@0 1048 * lowest primary weight for that script in the root collator.
michael@0 1049 * This list will be copied and sorted to account for script reordering.
michael@0 1050 *
michael@0 1051 * <p>TODO: This is fragile. If the first character of a script is tailored
michael@0 1052 * so that it does not map to the script's lowest primary weight any more,
michael@0 1053 * then the buckets will be off.
michael@0 1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
michael@0 1055 *
michael@0 1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
michael@0 1057 *
michael@0 1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
michael@0 1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
michael@0 1060 */
michael@0 1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
michael@0 1062 0x41, 0, 0x03B1, 0,
michael@0 1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
michael@0 1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
michael@0 1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
michael@0 1066 0xAAF2, 0, // Meetei Mayek
michael@0 1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
michael@0 1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
michael@0 1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
michael@0 1070 0x1B83, 0,
michael@0 1071 0xD802, 0xDE00, 0, 0x0E01, 0,
michael@0 1072 0x0EDE, 0, // Lao
michael@0 1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
michael@0 1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
michael@0 1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
michael@0 1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
michael@0 1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
michael@0 1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
michael@0 1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
michael@0 1080 0xD800, 0xDE80, 0,
michael@0 1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
michael@0 1082 0xD801, 0xDC80, 0,
michael@0 1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
michael@0 1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
michael@0 1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
michael@0 1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
michael@0 1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
michael@0 1088 0x4E00, 0,
michael@0 1089 // TODO: The overflow bucket's lowerBoundary string should be the
michael@0 1090 // first item after the last reordering group in the collator's script order.
michael@0 1091 // This should normally be the first Unicode code point
michael@0 1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
michael@0 1093 // However, at least up to ICU 51 the Hani reordering group includes
michael@0 1094 // unassigned code points,
michael@0 1095 // and there is no stable string for the start of the trailing-weights range.
michael@0 1096 // The only known string that sorts "high" is U+FFFF.
michael@0 1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
michael@0 1098 // and fix relevant test code.
michael@0 1099 // Ideally, FractionalUCA.txt will have a "script first primary"
michael@0 1100 // for unassigned code points.
michael@0 1101 0xFFFF, 0
michael@0 1102 };
michael@0 1103
michael@0 1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
michael@0 1105 if (U_FAILURE(status)) {
michael@0 1106 return NULL;
michael@0 1107 }
michael@0 1108 UVector *dest = new UVector(status);
michael@0 1109 if (dest == NULL) {
michael@0 1110 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1111 return NULL;
michael@0 1112 }
michael@0 1113 dest->setDeleter(uprv_deleteUObject);
michael@0 1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
michael@0 1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
michael@0 1116 do {
michael@0 1117 if (U_FAILURE(status)) {
michael@0 1118 return dest;
michael@0 1119 }
michael@0 1120 UnicodeString *str = new UnicodeString(src, -1);
michael@0 1121 if (str == NULL) {
michael@0 1122 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1123 return dest;
michael@0 1124 }
michael@0 1125 dest->addElement(str, status);
michael@0 1126 src += str->length() + 1;
michael@0 1127 } while (src < limit);
michael@0 1128 return dest;
michael@0 1129 }
michael@0 1130
michael@0 1131
michael@0 1132 namespace {
michael@0 1133
michael@0 1134 /**
michael@0 1135 * Returns true if one index character string is "better" than the other.
michael@0 1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
michael@0 1137 * better, and otherwise binary-less-than is better.
michael@0 1138 */
michael@0 1139 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
michael@0 1140 const UnicodeString &one, const UnicodeString &other) {
michael@0 1141 // This is called with primary-equal strings, but never with one.equals(other).
michael@0 1142 UErrorCode status = U_ZERO_ERROR;
michael@0 1143 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
michael@0 1144 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
michael@0 1145 if (U_FAILURE(status)) { return FALSE; }
michael@0 1146 int32_t result = n1.countChar32() - n2.countChar32();
michael@0 1147 if (result != 0) {
michael@0 1148 return result < 0;
michael@0 1149 }
michael@0 1150 result = n1.compareCodePointOrder(n2);
michael@0 1151 if (result != 0) {
michael@0 1152 return result < 0;
michael@0 1153 }
michael@0 1154 return one.compareCodePointOrder(other) < 0;
michael@0 1155 }
michael@0 1156
michael@0 1157 } // namespace
michael@0 1158
michael@0 1159 //
michael@0 1160 // Constructor & Destructor for AlphabeticIndex::Record
michael@0 1161 //
michael@0 1162 // Records are internal only, instances are not directly surfaced in the public API.
michael@0 1163 // This class is mostly struct-like, with all public fields.
michael@0 1164
michael@0 1165 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
michael@0 1166 : name_(name), data_(data) {}
michael@0 1167
michael@0 1168 AlphabeticIndex::Record::~Record() {
michael@0 1169 }
michael@0 1170
michael@0 1171
michael@0 1172 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
michael@0 1173 if (U_FAILURE(status)) {
michael@0 1174 return *this;
michael@0 1175 }
michael@0 1176 if (inputList_ == NULL) {
michael@0 1177 inputList_ = new UVector(status);
michael@0 1178 if (inputList_ == NULL) {
michael@0 1179 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1180 return *this;
michael@0 1181 }
michael@0 1182 inputList_->setDeleter(alphaIndex_deleteRecord);
michael@0 1183 }
michael@0 1184 Record *r = new Record(name, data);
michael@0 1185 if (r == NULL) {
michael@0 1186 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1187 return *this;
michael@0 1188 }
michael@0 1189 inputList_->addElement(r, status);
michael@0 1190 clearBuckets();
michael@0 1191 //std::string ss;
michael@0 1192 //std::string ss2;
michael@0 1193 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
michael@0 1194 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
michael@0 1195 return *this;
michael@0 1196 }
michael@0 1197
michael@0 1198
michael@0 1199 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
michael@0 1200 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
michael@0 1201 inputList_->removeAllElements();
michael@0 1202 clearBuckets();
michael@0 1203 }
michael@0 1204 return *this;
michael@0 1205 }
michael@0 1206
michael@0 1207 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
michael@0 1208 initBuckets(status);
michael@0 1209 if (U_FAILURE(status)) {
michael@0 1210 return 0;
michael@0 1211 }
michael@0 1212 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
michael@0 1213 }
michael@0 1214
michael@0 1215
michael@0 1216 int32_t AlphabeticIndex::getBucketIndex() const {
michael@0 1217 return labelsIterIndex_;
michael@0 1218 }
michael@0 1219
michael@0 1220
michael@0 1221 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
michael@0 1222 if (U_FAILURE(status)) {
michael@0 1223 return FALSE;
michael@0 1224 }
michael@0 1225 if (buckets_ == NULL && currentBucket_ != NULL) {
michael@0 1226 status = U_ENUM_OUT_OF_SYNC_ERROR;
michael@0 1227 return FALSE;
michael@0 1228 }
michael@0 1229 initBuckets(status);
michael@0 1230 if (U_FAILURE(status)) {
michael@0 1231 return FALSE;
michael@0 1232 }
michael@0 1233 ++labelsIterIndex_;
michael@0 1234 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
michael@0 1235 labelsIterIndex_ = buckets_->getBucketCount();
michael@0 1236 return FALSE;
michael@0 1237 }
michael@0 1238 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
michael@0 1239 resetRecordIterator();
michael@0 1240 return TRUE;
michael@0 1241 }
michael@0 1242
michael@0 1243 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
michael@0 1244 if (currentBucket_ != NULL) {
michael@0 1245 return currentBucket_->label_;
michael@0 1246 } else {
michael@0 1247 return emptyString_;
michael@0 1248 }
michael@0 1249 }
michael@0 1250
michael@0 1251
michael@0 1252 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
michael@0 1253 if (currentBucket_ != NULL) {
michael@0 1254 return currentBucket_->labelType_;
michael@0 1255 } else {
michael@0 1256 return U_ALPHAINDEX_NORMAL;
michael@0 1257 }
michael@0 1258 }
michael@0 1259
michael@0 1260
michael@0 1261 int32_t AlphabeticIndex::getBucketRecordCount() const {
michael@0 1262 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
michael@0 1263 return currentBucket_->records_->size();
michael@0 1264 } else {
michael@0 1265 return 0;
michael@0 1266 }
michael@0 1267 }
michael@0 1268
michael@0 1269
michael@0 1270 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
michael@0 1271 if (U_FAILURE(status)) {
michael@0 1272 return *this;
michael@0 1273 }
michael@0 1274 internalResetBucketIterator();
michael@0 1275 return *this;
michael@0 1276 }
michael@0 1277
michael@0 1278
michael@0 1279 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
michael@0 1280 if (U_FAILURE(status)) {
michael@0 1281 return FALSE;
michael@0 1282 }
michael@0 1283 if (currentBucket_ == NULL) {
michael@0 1284 // We are trying to iterate over the items in a bucket, but there is no
michael@0 1285 // current bucket from the enumeration of buckets.
michael@0 1286 status = U_INVALID_STATE_ERROR;
michael@0 1287 return FALSE;
michael@0 1288 }
michael@0 1289 if (buckets_ == NULL) {
michael@0 1290 status = U_ENUM_OUT_OF_SYNC_ERROR;
michael@0 1291 return FALSE;
michael@0 1292 }
michael@0 1293 if (currentBucket_->records_ == NULL) {
michael@0 1294 return FALSE;
michael@0 1295 }
michael@0 1296 ++itemsIterIndex_;
michael@0 1297 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
michael@0 1298 itemsIterIndex_ = currentBucket_->records_->size();
michael@0 1299 return FALSE;
michael@0 1300 }
michael@0 1301 return TRUE;
michael@0 1302 }
michael@0 1303
michael@0 1304
michael@0 1305 const UnicodeString &AlphabeticIndex::getRecordName() const {
michael@0 1306 const UnicodeString *retStr = &emptyString_;
michael@0 1307 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
michael@0 1308 itemsIterIndex_ >= 0 &&
michael@0 1309 itemsIterIndex_ < currentBucket_->records_->size()) {
michael@0 1310 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
michael@0 1311 retStr = &item->name_;
michael@0 1312 }
michael@0 1313 return *retStr;
michael@0 1314 }
michael@0 1315
michael@0 1316 const void *AlphabeticIndex::getRecordData() const {
michael@0 1317 const void *retPtr = NULL;
michael@0 1318 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
michael@0 1319 itemsIterIndex_ >= 0 &&
michael@0 1320 itemsIterIndex_ < currentBucket_->records_->size()) {
michael@0 1321 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
michael@0 1322 retPtr = item->data_;
michael@0 1323 }
michael@0 1324 return retPtr;
michael@0 1325 }
michael@0 1326
michael@0 1327
michael@0 1328 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
michael@0 1329 itemsIterIndex_ = -1;
michael@0 1330 return *this;
michael@0 1331 }
michael@0 1332
michael@0 1333
michael@0 1334
michael@0 1335 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
michael@0 1336 const UnicodeString &lowerBoundary,
michael@0 1337 UAlphabeticIndexLabelType type)
michael@0 1338 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
michael@0 1339 displayBucket_(NULL), displayIndex_(-1),
michael@0 1340 records_(NULL) {
michael@0 1341 }
michael@0 1342
michael@0 1343
michael@0 1344 AlphabeticIndex::Bucket::~Bucket() {
michael@0 1345 delete records_;
michael@0 1346 }
michael@0 1347
michael@0 1348 U_NAMESPACE_END
michael@0 1349
michael@0 1350 #endif

mercurial