Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 2009-2013, International Business Machines Corporation and |
michael@0 | 4 | * others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | */ |
michael@0 | 7 | |
michael@0 | 8 | #include "unicode/utypes.h" |
michael@0 | 9 | |
michael@0 | 10 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION |
michael@0 | 11 | |
michael@0 | 12 | #include "unicode/alphaindex.h" |
michael@0 | 13 | #include "unicode/coleitr.h" |
michael@0 | 14 | #include "unicode/coll.h" |
michael@0 | 15 | #include "unicode/localpointer.h" |
michael@0 | 16 | #include "unicode/normalizer2.h" |
michael@0 | 17 | #include "unicode/tblcoll.h" |
michael@0 | 18 | #include "unicode/ulocdata.h" |
michael@0 | 19 | #include "unicode/uniset.h" |
michael@0 | 20 | #include "unicode/uobject.h" |
michael@0 | 21 | #include "unicode/usetiter.h" |
michael@0 | 22 | #include "unicode/utf16.h" |
michael@0 | 23 | |
michael@0 | 24 | #include "cmemory.h" |
michael@0 | 25 | #include "cstring.h" |
michael@0 | 26 | #include "uassert.h" |
michael@0 | 27 | #include "uvector.h" |
michael@0 | 28 | |
michael@0 | 29 | //#include <string> |
michael@0 | 30 | //#include <iostream> |
michael@0 | 31 | |
michael@0 | 32 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
michael@0 | 33 | |
michael@0 | 34 | U_NAMESPACE_BEGIN |
michael@0 | 35 | |
michael@0 | 36 | namespace { |
michael@0 | 37 | |
michael@0 | 38 | /** |
michael@0 | 39 | * Prefix string for Chinese index buckets. |
michael@0 | 40 | * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes |
michael@0 | 41 | */ |
michael@0 | 42 | const UChar BASE[1] = { 0xFDD0 }; |
michael@0 | 43 | const int32_t BASE_LENGTH = 1; |
michael@0 | 44 | |
michael@0 | 45 | UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
michael@0 | 46 | const UnicodeString &one, const UnicodeString &other); |
michael@0 | 47 | |
michael@0 | 48 | } // namespace |
michael@0 | 49 | |
michael@0 | 50 | static int32_t U_CALLCONV |
michael@0 | 51 | collatorComparator(const void *context, const void *left, const void *right); |
michael@0 | 52 | |
michael@0 | 53 | static int32_t U_CALLCONV |
michael@0 | 54 | recordCompareFn(const void *context, const void *left, const void *right); |
michael@0 | 55 | |
michael@0 | 56 | // UVector<Record *> support function, delete a Record. |
michael@0 | 57 | static void U_CALLCONV |
michael@0 | 58 | alphaIndex_deleteRecord(void *obj) { |
michael@0 | 59 | delete static_cast<AlphabeticIndex::Record *>(obj); |
michael@0 | 60 | } |
michael@0 | 61 | |
michael@0 | 62 | namespace { |
michael@0 | 63 | |
michael@0 | 64 | UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, |
michael@0 | 65 | UErrorCode &errorCode) { |
michael@0 | 66 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 67 | if (owned.isValid()) { |
michael@0 | 68 | return owned.orphan(); |
michael@0 | 69 | } |
michael@0 | 70 | UnicodeString *p = new UnicodeString(s); |
michael@0 | 71 | if (p == NULL) { |
michael@0 | 72 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 73 | } |
michael@0 | 74 | return p; |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | inline UnicodeString *getString(const UVector &list, int32_t i) { |
michael@0 | 78 | return static_cast<UnicodeString *>(list[i]); |
michael@0 | 79 | } |
michael@0 | 80 | |
michael@0 | 81 | inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { |
michael@0 | 82 | return static_cast<AlphabeticIndex::Bucket *>(list[i]); |
michael@0 | 83 | } |
michael@0 | 84 | |
michael@0 | 85 | inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { |
michael@0 | 86 | return static_cast<AlphabeticIndex::Record *>(list[i]); |
michael@0 | 87 | } |
michael@0 | 88 | |
michael@0 | 89 | /** |
michael@0 | 90 | * Like Java Collections.binarySearch(List, String, Comparator). |
michael@0 | 91 | * |
michael@0 | 92 | * @return the index>=0 where the item was found, |
michael@0 | 93 | * or the index<0 for inserting the string at ~index in sorted order |
michael@0 | 94 | */ |
michael@0 | 95 | int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { |
michael@0 | 96 | if (list.size() == 0) { return ~0; } |
michael@0 | 97 | int32_t start = 0; |
michael@0 | 98 | int32_t limit = list.size(); |
michael@0 | 99 | for (;;) { |
michael@0 | 100 | int32_t i = (start + limit) / 2; |
michael@0 | 101 | const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); |
michael@0 | 102 | UErrorCode errorCode = U_ZERO_ERROR; |
michael@0 | 103 | UCollationResult cmp = coll.compare(s, *si, errorCode); |
michael@0 | 104 | if (cmp == UCOL_EQUAL) { |
michael@0 | 105 | return i; |
michael@0 | 106 | } else if (cmp < 0) { |
michael@0 | 107 | if (i == start) { |
michael@0 | 108 | return ~start; // insert s before *si |
michael@0 | 109 | } |
michael@0 | 110 | limit = i; |
michael@0 | 111 | } else { |
michael@0 | 112 | if (i == start) { |
michael@0 | 113 | return ~(start + 1); // insert s after *si |
michael@0 | 114 | } |
michael@0 | 115 | start = i; |
michael@0 | 116 | } |
michael@0 | 117 | } |
michael@0 | 118 | } |
michael@0 | 119 | |
michael@0 | 120 | } // namespace |
michael@0 | 121 | |
michael@0 | 122 | // The BucketList is not in the anonymous namespace because only Clang |
michael@0 | 123 | // seems to support its use in other classes from there. |
michael@0 | 124 | // However, we also don't need U_I18N_API because it is not used from outside the i18n library. |
michael@0 | 125 | class BucketList : public UObject { |
michael@0 | 126 | public: |
michael@0 | 127 | BucketList(UVector *bucketList, UVector *publicBucketList) |
michael@0 | 128 | : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { |
michael@0 | 129 | int32_t displayIndex = 0; |
michael@0 | 130 | for (int32_t i = 0; i < publicBucketList->size(); ++i) { |
michael@0 | 131 | getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; |
michael@0 | 132 | } |
michael@0 | 133 | } |
michael@0 | 134 | |
michael@0 | 135 | // The virtual destructor must not be inline. |
michael@0 | 136 | // See ticket #8454 for details. |
michael@0 | 137 | virtual ~BucketList(); |
michael@0 | 138 | |
michael@0 | 139 | int32_t getBucketCount() const { |
michael@0 | 140 | return immutableVisibleList_->size(); |
michael@0 | 141 | } |
michael@0 | 142 | |
michael@0 | 143 | int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, |
michael@0 | 144 | UErrorCode &errorCode) { |
michael@0 | 145 | // binary search |
michael@0 | 146 | int32_t start = 0; |
michael@0 | 147 | int32_t limit = bucketList_->size(); |
michael@0 | 148 | while ((start + 1) < limit) { |
michael@0 | 149 | int32_t i = (start + limit) / 2; |
michael@0 | 150 | const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); |
michael@0 | 151 | UCollationResult nameVsBucket = |
michael@0 | 152 | collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); |
michael@0 | 153 | if (nameVsBucket < 0) { |
michael@0 | 154 | limit = i; |
michael@0 | 155 | } else { |
michael@0 | 156 | start = i; |
michael@0 | 157 | } |
michael@0 | 158 | } |
michael@0 | 159 | const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); |
michael@0 | 160 | if (bucket->displayBucket_ != NULL) { |
michael@0 | 161 | bucket = bucket->displayBucket_; |
michael@0 | 162 | } |
michael@0 | 163 | return bucket->displayIndex_; |
michael@0 | 164 | } |
michael@0 | 165 | |
michael@0 | 166 | /** All of the buckets, visible and invisible. */ |
michael@0 | 167 | UVector *bucketList_; |
michael@0 | 168 | /** Just the visible buckets. */ |
michael@0 | 169 | UVector *immutableVisibleList_; |
michael@0 | 170 | }; |
michael@0 | 171 | |
michael@0 | 172 | BucketList::~BucketList() { |
michael@0 | 173 | delete bucketList_; |
michael@0 | 174 | if (immutableVisibleList_ != bucketList_) { |
michael@0 | 175 | delete immutableVisibleList_; |
michael@0 | 176 | } |
michael@0 | 177 | } |
michael@0 | 178 | |
michael@0 | 179 | AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { |
michael@0 | 180 | delete buckets_; |
michael@0 | 181 | delete collatorPrimaryOnly_; |
michael@0 | 182 | } |
michael@0 | 183 | |
michael@0 | 184 | int32_t |
michael@0 | 185 | AlphabeticIndex::ImmutableIndex::getBucketCount() const { |
michael@0 | 186 | return buckets_->getBucketCount(); |
michael@0 | 187 | } |
michael@0 | 188 | |
michael@0 | 189 | int32_t |
michael@0 | 190 | AlphabeticIndex::ImmutableIndex::getBucketIndex( |
michael@0 | 191 | const UnicodeString &name, UErrorCode &errorCode) const { |
michael@0 | 192 | return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); |
michael@0 | 193 | } |
michael@0 | 194 | |
michael@0 | 195 | const AlphabeticIndex::Bucket * |
michael@0 | 196 | AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { |
michael@0 | 197 | if (0 <= index && index < buckets_->getBucketCount()) { |
michael@0 | 198 | return icu::getBucket(*buckets_->immutableVisibleList_, index); |
michael@0 | 199 | } else { |
michael@0 | 200 | return NULL; |
michael@0 | 201 | } |
michael@0 | 202 | } |
michael@0 | 203 | |
michael@0 | 204 | AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) |
michael@0 | 205 | : inputList_(NULL), |
michael@0 | 206 | labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
michael@0 | 207 | maxLabelCount_(99), |
michael@0 | 208 | initialLabels_(NULL), firstCharsInScripts_(NULL), |
michael@0 | 209 | collator_(NULL), collatorPrimaryOnly_(NULL), |
michael@0 | 210 | buckets_(NULL) { |
michael@0 | 211 | init(&locale, status); |
michael@0 | 212 | } |
michael@0 | 213 | |
michael@0 | 214 | |
michael@0 | 215 | AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) |
michael@0 | 216 | : inputList_(NULL), |
michael@0 | 217 | labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
michael@0 | 218 | maxLabelCount_(99), |
michael@0 | 219 | initialLabels_(NULL), firstCharsInScripts_(NULL), |
michael@0 | 220 | collator_(collator), collatorPrimaryOnly_(NULL), |
michael@0 | 221 | buckets_(NULL) { |
michael@0 | 222 | init(NULL, status); |
michael@0 | 223 | } |
michael@0 | 224 | |
michael@0 | 225 | |
michael@0 | 226 | |
michael@0 | 227 | AlphabeticIndex::~AlphabeticIndex() { |
michael@0 | 228 | delete collator_; |
michael@0 | 229 | delete collatorPrimaryOnly_; |
michael@0 | 230 | delete firstCharsInScripts_; |
michael@0 | 231 | delete buckets_; |
michael@0 | 232 | delete inputList_; |
michael@0 | 233 | delete initialLabels_; |
michael@0 | 234 | } |
michael@0 | 235 | |
michael@0 | 236 | |
michael@0 | 237 | AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { |
michael@0 | 238 | if (U_FAILURE(status)) { |
michael@0 | 239 | return *this; |
michael@0 | 240 | } |
michael@0 | 241 | initialLabels_->addAll(additions); |
michael@0 | 242 | clearBuckets(); |
michael@0 | 243 | return *this; |
michael@0 | 244 | } |
michael@0 | 245 | |
michael@0 | 246 | |
michael@0 | 247 | AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { |
michael@0 | 248 | addIndexExemplars(locale, status); |
michael@0 | 249 | clearBuckets(); |
michael@0 | 250 | return *this; |
michael@0 | 251 | } |
michael@0 | 252 | |
michael@0 | 253 | |
michael@0 | 254 | AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { |
michael@0 | 255 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 256 | // In C++, the ImmutableIndex must own its copy of the BucketList, |
michael@0 | 257 | // even if it contains no records, for proper memory management. |
michael@0 | 258 | // We could clone the buckets_ if they are not NULL, |
michael@0 | 259 | // but that would be worth it only if this method is called multiple times, |
michael@0 | 260 | // or called after using the old-style bucket iterator API. |
michael@0 | 261 | LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); |
michael@0 | 262 | LocalPointer<RuleBasedCollator> coll( |
michael@0 | 263 | static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); |
michael@0 | 264 | if (immutableBucketList.isNull() || coll.isNull()) { |
michael@0 | 265 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 266 | return NULL; |
michael@0 | 267 | } |
michael@0 | 268 | ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); |
michael@0 | 269 | if (immIndex == NULL) { |
michael@0 | 270 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 271 | return NULL; |
michael@0 | 272 | } |
michael@0 | 273 | // The ImmutableIndex adopted its parameter objects. |
michael@0 | 274 | immutableBucketList.orphan(); |
michael@0 | 275 | coll.orphan(); |
michael@0 | 276 | return immIndex; |
michael@0 | 277 | } |
michael@0 | 278 | |
michael@0 | 279 | int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { |
michael@0 | 280 | initBuckets(status); |
michael@0 | 281 | if (U_FAILURE(status)) { |
michael@0 | 282 | return 0; |
michael@0 | 283 | } |
michael@0 | 284 | return buckets_->getBucketCount(); |
michael@0 | 285 | } |
michael@0 | 286 | |
michael@0 | 287 | |
michael@0 | 288 | int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { |
michael@0 | 289 | if (U_FAILURE(status) || inputList_ == NULL) { |
michael@0 | 290 | return 0; |
michael@0 | 291 | } |
michael@0 | 292 | return inputList_->size(); |
michael@0 | 293 | } |
michael@0 | 294 | |
michael@0 | 295 | void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { |
michael@0 | 296 | const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); |
michael@0 | 297 | if (U_FAILURE(errorCode)) { return; } |
michael@0 | 298 | |
michael@0 | 299 | const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); |
michael@0 | 300 | const UnicodeString &overflowBoundary = |
michael@0 | 301 | *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); |
michael@0 | 302 | |
michael@0 | 303 | // We make a sorted array of elements. |
michael@0 | 304 | // Some of the input may be redundant. |
michael@0 | 305 | // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". |
michael@0 | 306 | // We filter out those cases. |
michael@0 | 307 | UnicodeSetIterator iter(*initialLabels_); |
michael@0 | 308 | while (iter.next()) { |
michael@0 | 309 | const UnicodeString *item = &iter.getString(); |
michael@0 | 310 | LocalPointer<UnicodeString> ownedItem; |
michael@0 | 311 | UBool checkDistinct; |
michael@0 | 312 | int32_t itemLength = item->length(); |
michael@0 | 313 | if (!item->hasMoreChar32Than(0, itemLength, 1)) { |
michael@0 | 314 | checkDistinct = FALSE; |
michael@0 | 315 | } else if(item->charAt(itemLength - 1) == 0x2a && // '*' |
michael@0 | 316 | item->charAt(itemLength - 2) != 0x2a) { |
michael@0 | 317 | // Use a label if it is marked with one trailing star, |
michael@0 | 318 | // even if the label string sorts the same when all contractions are suppressed. |
michael@0 | 319 | ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); |
michael@0 | 320 | item = ownedItem.getAlias(); |
michael@0 | 321 | if (item == NULL) { |
michael@0 | 322 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 323 | return; |
michael@0 | 324 | } |
michael@0 | 325 | checkDistinct = FALSE; |
michael@0 | 326 | } else { |
michael@0 | 327 | checkDistinct = TRUE; |
michael@0 | 328 | } |
michael@0 | 329 | if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { |
michael@0 | 330 | // Ignore a primary-ignorable or non-alphabetic index character. |
michael@0 | 331 | } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { |
michael@0 | 332 | // Ignore an index characters that will land in the overflow bucket. |
michael@0 | 333 | } else if (checkDistinct && |
michael@0 | 334 | collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { |
michael@0 | 335 | // Ignore a multi-code point index character that does not sort distinctly |
michael@0 | 336 | // from the sequence of its separate characters. |
michael@0 | 337 | } else { |
michael@0 | 338 | int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); |
michael@0 | 339 | if (insertionPoint < 0) { |
michael@0 | 340 | indexCharacters.insertElementAt( |
michael@0 | 341 | ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); |
michael@0 | 342 | } else { |
michael@0 | 343 | const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); |
michael@0 | 344 | if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { |
michael@0 | 345 | indexCharacters.setElementAt( |
michael@0 | 346 | ownedString(*item, ownedItem, errorCode), insertionPoint); |
michael@0 | 347 | } |
michael@0 | 348 | } |
michael@0 | 349 | } |
michael@0 | 350 | } |
michael@0 | 351 | if (U_FAILURE(errorCode)) { return; } |
michael@0 | 352 | |
michael@0 | 353 | // if the result is still too large, cut down to maxCount elements, by removing every nth element |
michael@0 | 354 | |
michael@0 | 355 | int32_t size = indexCharacters.size() - 1; |
michael@0 | 356 | if (size > maxLabelCount_) { |
michael@0 | 357 | int32_t count = 0; |
michael@0 | 358 | int32_t old = -1; |
michael@0 | 359 | for (int32_t i = 0; i < indexCharacters.size();) { |
michael@0 | 360 | ++count; |
michael@0 | 361 | int32_t bump = count * maxLabelCount_ / size; |
michael@0 | 362 | if (bump == old) { |
michael@0 | 363 | indexCharacters.removeElementAt(i); |
michael@0 | 364 | } else { |
michael@0 | 365 | old = bump; |
michael@0 | 366 | ++i; |
michael@0 | 367 | } |
michael@0 | 368 | } |
michael@0 | 369 | } |
michael@0 | 370 | } |
michael@0 | 371 | |
michael@0 | 372 | namespace { |
michael@0 | 373 | |
michael@0 | 374 | const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { |
michael@0 | 375 | if (!current.startsWith(BASE, BASE_LENGTH)) { |
michael@0 | 376 | return current; |
michael@0 | 377 | } |
michael@0 | 378 | UChar rest = current.charAt(BASE_LENGTH); |
michael@0 | 379 | if (0x2800 < rest && rest <= 0x28FF) { // stroke count |
michael@0 | 380 | int32_t count = rest-0x2800; |
michael@0 | 381 | temp.setTo((UChar)(0x30 + count % 10)); |
michael@0 | 382 | if (count >= 10) { |
michael@0 | 383 | count /= 10; |
michael@0 | 384 | temp.insert(0, (UChar)(0x30 + count % 10)); |
michael@0 | 385 | if (count >= 10) { |
michael@0 | 386 | count /= 10; |
michael@0 | 387 | temp.insert(0, (UChar)(0x30 + count)); |
michael@0 | 388 | } |
michael@0 | 389 | } |
michael@0 | 390 | return temp.append((UChar)0x5283); |
michael@0 | 391 | } |
michael@0 | 392 | return temp.setTo(current, BASE_LENGTH); |
michael@0 | 393 | } |
michael@0 | 394 | |
michael@0 | 395 | UBool hasMultiplePrimaryWeights( |
michael@0 | 396 | CollationElementIterator &cei, int32_t variableTop, |
michael@0 | 397 | const UnicodeString &s, UErrorCode &errorCode) { |
michael@0 | 398 | cei.setText(s, errorCode); |
michael@0 | 399 | UBool seenPrimary = FALSE; |
michael@0 | 400 | for (;;) { |
michael@0 | 401 | int32_t ce32 = cei.next(errorCode); |
michael@0 | 402 | if (ce32 == CollationElementIterator::NULLORDER) { |
michael@0 | 403 | break; |
michael@0 | 404 | } |
michael@0 | 405 | int32_t p = CollationElementIterator::primaryOrder(ce32); |
michael@0 | 406 | if (p > variableTop && (ce32 & 0xc0) != 0xc0) { |
michael@0 | 407 | // not primary ignorable, and not a continuation CE |
michael@0 | 408 | if (seenPrimary) { |
michael@0 | 409 | return TRUE; |
michael@0 | 410 | } |
michael@0 | 411 | seenPrimary = TRUE; |
michael@0 | 412 | } |
michael@0 | 413 | } |
michael@0 | 414 | return FALSE; |
michael@0 | 415 | } |
michael@0 | 416 | |
michael@0 | 417 | } // namespace |
michael@0 | 418 | |
michael@0 | 419 | BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
michael@0 | 420 | // Initialize indexCharacters. |
michael@0 | 421 | UVector indexCharacters(errorCode); |
michael@0 | 422 | indexCharacters.setDeleter(uprv_deleteUObject); |
michael@0 | 423 | initLabels(indexCharacters, errorCode); |
michael@0 | 424 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 425 | |
michael@0 | 426 | // Variables for hasMultiplePrimaryWeights(). |
michael@0 | 427 | LocalPointer<CollationElementIterator> cei( |
michael@0 | 428 | collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); |
michael@0 | 429 | if (cei.isNull()) { |
michael@0 | 430 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 431 | return NULL; |
michael@0 | 432 | } |
michael@0 | 433 | int32_t variableTop; |
michael@0 | 434 | if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { |
michael@0 | 435 | variableTop = CollationElementIterator::primaryOrder( |
michael@0 | 436 | (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); |
michael@0 | 437 | } else { |
michael@0 | 438 | variableTop = 0; |
michael@0 | 439 | } |
michael@0 | 440 | UBool hasInvisibleBuckets = FALSE; |
michael@0 | 441 | |
michael@0 | 442 | // Helper arrays for Chinese Pinyin collation. |
michael@0 | 443 | Bucket *asciiBuckets[26] = { |
michael@0 | 444 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
michael@0 | 445 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
michael@0 | 446 | }; |
michael@0 | 447 | Bucket *pinyinBuckets[26] = { |
michael@0 | 448 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
michael@0 | 449 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
michael@0 | 450 | }; |
michael@0 | 451 | UBool hasPinyin = FALSE; |
michael@0 | 452 | |
michael@0 | 453 | LocalPointer<UVector> bucketList(new UVector(errorCode)); |
michael@0 | 454 | if (bucketList.isNull()) { |
michael@0 | 455 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 456 | return NULL; |
michael@0 | 457 | } |
michael@0 | 458 | bucketList->setDeleter(uprv_deleteUObject); |
michael@0 | 459 | |
michael@0 | 460 | // underflow bucket |
michael@0 | 461 | Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); |
michael@0 | 462 | if (bucket == NULL) { |
michael@0 | 463 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 464 | return NULL; |
michael@0 | 465 | } |
michael@0 | 466 | bucketList->addElement(bucket, errorCode); |
michael@0 | 467 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 468 | |
michael@0 | 469 | UnicodeString temp; |
michael@0 | 470 | |
michael@0 | 471 | // fix up the list, adding underflow, additions, overflow |
michael@0 | 472 | // Insert inflow labels as needed. |
michael@0 | 473 | int32_t scriptIndex = -1; |
michael@0 | 474 | const UnicodeString *scriptUpperBoundary = &emptyString_; |
michael@0 | 475 | for (int32_t i = 0; i < indexCharacters.size(); ++i) { |
michael@0 | 476 | UnicodeString ¤t = *getString(indexCharacters, i); |
michael@0 | 477 | if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { |
michael@0 | 478 | // We crossed the script boundary into a new script. |
michael@0 | 479 | const UnicodeString &inflowBoundary = *scriptUpperBoundary; |
michael@0 | 480 | UBool skippedScript = FALSE; |
michael@0 | 481 | for (;;) { |
michael@0 | 482 | scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); |
michael@0 | 483 | if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { |
michael@0 | 484 | break; |
michael@0 | 485 | } |
michael@0 | 486 | skippedScript = TRUE; |
michael@0 | 487 | } |
michael@0 | 488 | if (skippedScript && bucketList->size() > 1) { |
michael@0 | 489 | // We are skipping one or more scripts, |
michael@0 | 490 | // and we are not just getting out of the underflow label. |
michael@0 | 491 | bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); |
michael@0 | 492 | if (bucket == NULL) { |
michael@0 | 493 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 494 | return NULL; |
michael@0 | 495 | } |
michael@0 | 496 | bucketList->addElement(bucket, errorCode); |
michael@0 | 497 | } |
michael@0 | 498 | } |
michael@0 | 499 | // Add a bucket with the current label. |
michael@0 | 500 | bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); |
michael@0 | 501 | if (bucket == NULL) { |
michael@0 | 502 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 503 | return NULL; |
michael@0 | 504 | } |
michael@0 | 505 | bucketList->addElement(bucket, errorCode); |
michael@0 | 506 | // Remember ASCII and Pinyin buckets for Pinyin redirects. |
michael@0 | 507 | UChar c; |
michael@0 | 508 | if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z |
michael@0 | 509 | asciiBuckets[c - 0x41] = bucket; |
michael@0 | 510 | } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && |
michael@0 | 511 | 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { |
michael@0 | 512 | pinyinBuckets[c - 0x41] = bucket; |
michael@0 | 513 | hasPinyin = TRUE; |
michael@0 | 514 | } |
michael@0 | 515 | // Check for multiple primary weights. |
michael@0 | 516 | if (!current.startsWith(BASE, BASE_LENGTH) && |
michael@0 | 517 | hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && |
michael@0 | 518 | current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { |
michael@0 | 519 | // "AE-ligature" or "Sch" etc. |
michael@0 | 520 | for (int32_t i = bucketList->size() - 2;; --i) { |
michael@0 | 521 | Bucket *singleBucket = getBucket(*bucketList, i); |
michael@0 | 522 | if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
michael@0 | 523 | // There is no single-character bucket since the last |
michael@0 | 524 | // underflow or inflow label. |
michael@0 | 525 | break; |
michael@0 | 526 | } |
michael@0 | 527 | if (singleBucket->displayBucket_ == NULL && |
michael@0 | 528 | !hasMultiplePrimaryWeights( |
michael@0 | 529 | *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { |
michael@0 | 530 | // Add an invisible bucket that redirects strings greater than the expansion |
michael@0 | 531 | // to the previous single-character bucket. |
michael@0 | 532 | // For example, after ... Q R S Sch we add Sch\uFFFF->S |
michael@0 | 533 | // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. |
michael@0 | 534 | bucket = new Bucket(emptyString_, |
michael@0 | 535 | UnicodeString(current).append((UChar)0xFFFF), |
michael@0 | 536 | U_ALPHAINDEX_NORMAL); |
michael@0 | 537 | if (bucket == NULL) { |
michael@0 | 538 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 539 | return NULL; |
michael@0 | 540 | } |
michael@0 | 541 | bucket->displayBucket_ = singleBucket; |
michael@0 | 542 | bucketList->addElement(bucket, errorCode); |
michael@0 | 543 | hasInvisibleBuckets = TRUE; |
michael@0 | 544 | break; |
michael@0 | 545 | } |
michael@0 | 546 | } |
michael@0 | 547 | } |
michael@0 | 548 | } |
michael@0 | 549 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 550 | if (bucketList->size() == 1) { |
michael@0 | 551 | // No real labels, show only the underflow label. |
michael@0 | 552 | BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
michael@0 | 553 | if (bl == NULL) { |
michael@0 | 554 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 555 | return NULL; |
michael@0 | 556 | } |
michael@0 | 557 | bucketList.orphan(); |
michael@0 | 558 | return bl; |
michael@0 | 559 | } |
michael@0 | 560 | // overflow bucket |
michael@0 | 561 | bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); |
michael@0 | 562 | if (bucket == NULL) { |
michael@0 | 563 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 564 | return NULL; |
michael@0 | 565 | } |
michael@0 | 566 | bucketList->addElement(bucket, errorCode); // final |
michael@0 | 567 | |
michael@0 | 568 | if (hasPinyin) { |
michael@0 | 569 | // Redirect Pinyin buckets. |
michael@0 | 570 | Bucket *asciiBucket = NULL; |
michael@0 | 571 | for (int32_t i = 0; i < 26; ++i) { |
michael@0 | 572 | if (asciiBuckets[i] != NULL) { |
michael@0 | 573 | asciiBucket = asciiBuckets[i]; |
michael@0 | 574 | } |
michael@0 | 575 | if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { |
michael@0 | 576 | pinyinBuckets[i]->displayBucket_ = asciiBucket; |
michael@0 | 577 | hasInvisibleBuckets = TRUE; |
michael@0 | 578 | } |
michael@0 | 579 | } |
michael@0 | 580 | } |
michael@0 | 581 | |
michael@0 | 582 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 583 | if (!hasInvisibleBuckets) { |
michael@0 | 584 | BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
michael@0 | 585 | if (bl == NULL) { |
michael@0 | 586 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 587 | return NULL; |
michael@0 | 588 | } |
michael@0 | 589 | bucketList.orphan(); |
michael@0 | 590 | return bl; |
michael@0 | 591 | } |
michael@0 | 592 | // Merge inflow buckets that are visually adjacent. |
michael@0 | 593 | // Iterate backwards: Merge inflow into overflow rather than the other way around. |
michael@0 | 594 | int32_t i = bucketList->size() - 1; |
michael@0 | 595 | Bucket *nextBucket = getBucket(*bucketList, i); |
michael@0 | 596 | while (--i > 0) { |
michael@0 | 597 | bucket = getBucket(*bucketList, i); |
michael@0 | 598 | if (bucket->displayBucket_ != NULL) { |
michael@0 | 599 | continue; // skip invisible buckets |
michael@0 | 600 | } |
michael@0 | 601 | if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { |
michael@0 | 602 | if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
michael@0 | 603 | bucket->displayBucket_ = nextBucket; |
michael@0 | 604 | continue; |
michael@0 | 605 | } |
michael@0 | 606 | } |
michael@0 | 607 | nextBucket = bucket; |
michael@0 | 608 | } |
michael@0 | 609 | |
michael@0 | 610 | LocalPointer<UVector> publicBucketList(new UVector(errorCode)); |
michael@0 | 611 | if (bucketList.isNull()) { |
michael@0 | 612 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 613 | return NULL; |
michael@0 | 614 | } |
michael@0 | 615 | // Do not call publicBucketList->setDeleter(): |
michael@0 | 616 | // This vector shares its objects with the bucketList. |
michael@0 | 617 | for (int32_t i = 0; i < bucketList->size(); ++i) { |
michael@0 | 618 | bucket = getBucket(*bucketList, i); |
michael@0 | 619 | if (bucket->displayBucket_ == NULL) { |
michael@0 | 620 | publicBucketList->addElement(bucket, errorCode); |
michael@0 | 621 | } |
michael@0 | 622 | } |
michael@0 | 623 | if (U_FAILURE(errorCode)) { return NULL; } |
michael@0 | 624 | BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); |
michael@0 | 625 | if (bl == NULL) { |
michael@0 | 626 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 627 | return NULL; |
michael@0 | 628 | } |
michael@0 | 629 | bucketList.orphan(); |
michael@0 | 630 | publicBucketList.orphan(); |
michael@0 | 631 | return bl; |
michael@0 | 632 | } |
michael@0 | 633 | |
michael@0 | 634 | /** |
michael@0 | 635 | * Creates an index, and buckets and sorts the list of records into the index. |
michael@0 | 636 | */ |
michael@0 | 637 | void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { |
michael@0 | 638 | if (U_FAILURE(errorCode) || buckets_ != NULL) { |
michael@0 | 639 | return; |
michael@0 | 640 | } |
michael@0 | 641 | buckets_ = createBucketList(errorCode); |
michael@0 | 642 | if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { |
michael@0 | 643 | return; |
michael@0 | 644 | } |
michael@0 | 645 | |
michael@0 | 646 | // Sort the records by name. |
michael@0 | 647 | // Stable sort preserves input order of collation duplicates. |
michael@0 | 648 | inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); |
michael@0 | 649 | |
michael@0 | 650 | // Now, we traverse all of the input, which is now sorted. |
michael@0 | 651 | // If the item doesn't go in the current bucket, we find the next bucket that contains it. |
michael@0 | 652 | // This makes the process order n*log(n), since we just sort the list and then do a linear process. |
michael@0 | 653 | // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so |
michael@0 | 654 | // we need to improve it for that case. |
michael@0 | 655 | |
michael@0 | 656 | Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); |
michael@0 | 657 | int32_t bucketIndex = 1; |
michael@0 | 658 | Bucket *nextBucket; |
michael@0 | 659 | const UnicodeString *upperBoundary; |
michael@0 | 660 | if (bucketIndex < buckets_->bucketList_->size()) { |
michael@0 | 661 | nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
michael@0 | 662 | upperBoundary = &nextBucket->lowerBoundary_; |
michael@0 | 663 | } else { |
michael@0 | 664 | nextBucket = NULL; |
michael@0 | 665 | upperBoundary = NULL; |
michael@0 | 666 | } |
michael@0 | 667 | for (int32_t i = 0; i < inputList_->size(); ++i) { |
michael@0 | 668 | Record *r = getRecord(*inputList_, i); |
michael@0 | 669 | // if the current bucket isn't the right one, find the one that is |
michael@0 | 670 | // We have a special flag for the last bucket so that we don't look any further |
michael@0 | 671 | while (upperBoundary != NULL && |
michael@0 | 672 | collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { |
michael@0 | 673 | currentBucket = nextBucket; |
michael@0 | 674 | // now reset the boundary that we compare against |
michael@0 | 675 | if (bucketIndex < buckets_->bucketList_->size()) { |
michael@0 | 676 | nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
michael@0 | 677 | upperBoundary = &nextBucket->lowerBoundary_; |
michael@0 | 678 | } else { |
michael@0 | 679 | upperBoundary = NULL; |
michael@0 | 680 | } |
michael@0 | 681 | } |
michael@0 | 682 | // now put the record into the bucket. |
michael@0 | 683 | Bucket *bucket = currentBucket; |
michael@0 | 684 | if (bucket->displayBucket_ != NULL) { |
michael@0 | 685 | bucket = bucket->displayBucket_; |
michael@0 | 686 | } |
michael@0 | 687 | if (bucket->records_ == NULL) { |
michael@0 | 688 | bucket->records_ = new UVector(errorCode); |
michael@0 | 689 | if (bucket->records_ == NULL) { |
michael@0 | 690 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 691 | return; |
michael@0 | 692 | } |
michael@0 | 693 | } |
michael@0 | 694 | bucket->records_->addElement(r, errorCode); |
michael@0 | 695 | } |
michael@0 | 696 | } |
michael@0 | 697 | |
michael@0 | 698 | void AlphabeticIndex::clearBuckets() { |
michael@0 | 699 | if (buckets_ != NULL) { |
michael@0 | 700 | delete buckets_; |
michael@0 | 701 | buckets_ = NULL; |
michael@0 | 702 | internalResetBucketIterator(); |
michael@0 | 703 | } |
michael@0 | 704 | } |
michael@0 | 705 | |
michael@0 | 706 | void AlphabeticIndex::internalResetBucketIterator() { |
michael@0 | 707 | labelsIterIndex_ = -1; |
michael@0 | 708 | currentBucket_ = NULL; |
michael@0 | 709 | } |
michael@0 | 710 | |
michael@0 | 711 | |
michael@0 | 712 | void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { |
michael@0 | 713 | if (U_FAILURE(status)) { return; } |
michael@0 | 714 | // Chinese index characters, which are specific to each of the several Chinese tailorings, |
michael@0 | 715 | // take precedence over the single locale data exemplar set per language. |
michael@0 | 716 | const char *language = locale.getLanguage(); |
michael@0 | 717 | if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || |
michael@0 | 718 | uprv_strcmp(language, "ko") == 0) { |
michael@0 | 719 | // TODO: This should be done regardless of the language, but it's expensive. |
michael@0 | 720 | // We should add a Collator function (can be @internal) |
michael@0 | 721 | // to enumerate just the contractions that start with a given code point or string. |
michael@0 | 722 | if (addChineseIndexCharacters(status) || U_FAILURE(status)) { |
michael@0 | 723 | return; |
michael@0 | 724 | } |
michael@0 | 725 | } |
michael@0 | 726 | |
michael@0 | 727 | LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
michael@0 | 728 | if (U_FAILURE(status)) { |
michael@0 | 729 | return; |
michael@0 | 730 | } |
michael@0 | 731 | |
michael@0 | 732 | UnicodeSet exemplars; |
michael@0 | 733 | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); |
michael@0 | 734 | if (U_SUCCESS(status)) { |
michael@0 | 735 | initialLabels_->addAll(exemplars); |
michael@0 | 736 | return; |
michael@0 | 737 | } |
michael@0 | 738 | status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR |
michael@0 | 739 | |
michael@0 | 740 | // The locale data did not include explicit Index characters. |
michael@0 | 741 | // Synthesize a set of them from the locale's standard exemplar characters. |
michael@0 | 742 | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); |
michael@0 | 743 | if (U_FAILURE(status)) { |
michael@0 | 744 | return; |
michael@0 | 745 | } |
michael@0 | 746 | |
michael@0 | 747 | // question: should we add auxiliary exemplars? |
michael@0 | 748 | if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { |
michael@0 | 749 | exemplars.add(0x61, 0x7A); |
michael@0 | 750 | } |
michael@0 | 751 | if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables |
michael@0 | 752 | // cut down to small list |
michael@0 | 753 | exemplars.remove(0xAC00, 0xD7A3). |
michael@0 | 754 | add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). |
michael@0 | 755 | add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). |
michael@0 | 756 | add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). |
michael@0 | 757 | add(0xD30C).add(0xD558); |
michael@0 | 758 | } |
michael@0 | 759 | if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block |
michael@0 | 760 | // cut down to small list |
michael@0 | 761 | // make use of the fact that Ethiopic is allocated in 8's, where |
michael@0 | 762 | // the base is 0 mod 8. |
michael@0 | 763 | UnicodeSet ethiopic( |
michael@0 | 764 | UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); |
michael@0 | 765 | UnicodeSetIterator it(ethiopic); |
michael@0 | 766 | while (it.next() && !it.isString()) { |
michael@0 | 767 | if ((it.getCodepoint() & 0x7) != 0) { |
michael@0 | 768 | exemplars.remove(it.getCodepoint()); |
michael@0 | 769 | } |
michael@0 | 770 | } |
michael@0 | 771 | } |
michael@0 | 772 | |
michael@0 | 773 | // Upper-case any that aren't already so. |
michael@0 | 774 | // (We only do this for synthesized index characters.) |
michael@0 | 775 | UnicodeSetIterator it(exemplars); |
michael@0 | 776 | UnicodeString upperC; |
michael@0 | 777 | while (it.next()) { |
michael@0 | 778 | const UnicodeString &exemplarC = it.getString(); |
michael@0 | 779 | upperC = exemplarC; |
michael@0 | 780 | upperC.toUpper(locale); |
michael@0 | 781 | initialLabels_->add(upperC); |
michael@0 | 782 | } |
michael@0 | 783 | } |
michael@0 | 784 | |
michael@0 | 785 | UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { |
michael@0 | 786 | UnicodeSet contractions; |
michael@0 | 787 | ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), |
michael@0 | 788 | contractions.toUSet(), NULL, FALSE, &errorCode); |
michael@0 | 789 | if (U_FAILURE(errorCode)) { return FALSE; } |
michael@0 | 790 | UnicodeString firstHanBoundary; |
michael@0 | 791 | UBool hasPinyin = FALSE; |
michael@0 | 792 | UnicodeSetIterator iter(contractions); |
michael@0 | 793 | while (iter.next()) { |
michael@0 | 794 | const UnicodeString &s = iter.getString(); |
michael@0 | 795 | if (s.startsWith(BASE, BASE_LENGTH)) { |
michael@0 | 796 | initialLabels_->add(s); |
michael@0 | 797 | if (firstHanBoundary.isEmpty() || |
michael@0 | 798 | collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { |
michael@0 | 799 | firstHanBoundary = s; |
michael@0 | 800 | } |
michael@0 | 801 | UChar c = s.charAt(s.length() - 1); |
michael@0 | 802 | if (0x41 <= c && c <= 0x5A) { // A-Z |
michael@0 | 803 | hasPinyin = TRUE; |
michael@0 | 804 | } |
michael@0 | 805 | } |
michael@0 | 806 | } |
michael@0 | 807 | if (hasPinyin) { |
michael@0 | 808 | initialLabels_->add(0x41, 0x5A); // A-Z |
michael@0 | 809 | } |
michael@0 | 810 | if (!firstHanBoundary.isEmpty()) { |
michael@0 | 811 | // The hardcoded list of script boundaries includes U+4E00 |
michael@0 | 812 | // which is tailored to not be the first primary |
michael@0 | 813 | // in all Chinese tailorings except "unihan". |
michael@0 | 814 | // Replace U+4E00 with the first boundary string from the tailoring. |
michael@0 | 815 | // TODO: This becomes obsolete when the root collator gets |
michael@0 | 816 | // reliable script-first-primary mappings. |
michael@0 | 817 | int32_t hanIndex = binarySearch( |
michael@0 | 818 | *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); |
michael@0 | 819 | if (hanIndex >= 0) { |
michael@0 | 820 | UnicodeString *fh = new UnicodeString(firstHanBoundary); |
michael@0 | 821 | if (fh == NULL) { |
michael@0 | 822 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 823 | return FALSE; |
michael@0 | 824 | } |
michael@0 | 825 | firstCharsInScripts_->setElementAt(fh, hanIndex); |
michael@0 | 826 | } |
michael@0 | 827 | return TRUE; |
michael@0 | 828 | } else { |
michael@0 | 829 | return FALSE; |
michael@0 | 830 | } |
michael@0 | 831 | } |
michael@0 | 832 | |
michael@0 | 833 | |
michael@0 | 834 | /* |
michael@0 | 835 | * Return the string with interspersed CGJs. Input must have more than 2 codepoints. |
michael@0 | 836 | */ |
michael@0 | 837 | static const UChar CGJ = 0x034F; |
michael@0 | 838 | UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { |
michael@0 | 839 | UnicodeString result; |
michael@0 | 840 | if (item.length() == 0) { |
michael@0 | 841 | return result; |
michael@0 | 842 | } |
michael@0 | 843 | int32_t i = 0; |
michael@0 | 844 | for (;;) { |
michael@0 | 845 | UChar32 cp = item.char32At(i); |
michael@0 | 846 | result.append(cp); |
michael@0 | 847 | i = item.moveIndex32(i, 1); |
michael@0 | 848 | if (i >= item.length()) { |
michael@0 | 849 | break; |
michael@0 | 850 | } |
michael@0 | 851 | result.append(CGJ); |
michael@0 | 852 | } |
michael@0 | 853 | return result; |
michael@0 | 854 | } |
michael@0 | 855 | |
michael@0 | 856 | |
michael@0 | 857 | UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { |
michael@0 | 858 | return FALSE; |
michael@0 | 859 | } |
michael@0 | 860 | |
michael@0 | 861 | |
michael@0 | 862 | UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
michael@0 | 863 | return FALSE; |
michael@0 | 864 | } |
michael@0 | 865 | |
michael@0 | 866 | |
michael@0 | 867 | const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
michael@0 | 868 | // There are no known non-RuleBasedCollator collators, and none ever expected. |
michael@0 | 869 | // But, in case that changes, better a null pointer than a wrong type. |
michael@0 | 870 | return *dynamic_cast<RuleBasedCollator *>(collator_); |
michael@0 | 871 | } |
michael@0 | 872 | |
michael@0 | 873 | |
michael@0 | 874 | const UnicodeString &AlphabeticIndex::getInflowLabel() const { |
michael@0 | 875 | return inflowLabel_; |
michael@0 | 876 | } |
michael@0 | 877 | |
michael@0 | 878 | const UnicodeString &AlphabeticIndex::getOverflowLabel() const { |
michael@0 | 879 | return overflowLabel_; |
michael@0 | 880 | } |
michael@0 | 881 | |
michael@0 | 882 | |
michael@0 | 883 | const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { |
michael@0 | 884 | return underflowLabel_; |
michael@0 | 885 | } |
michael@0 | 886 | |
michael@0 | 887 | |
michael@0 | 888 | AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
michael@0 | 889 | inflowLabel_ = label; |
michael@0 | 890 | clearBuckets(); |
michael@0 | 891 | return *this; |
michael@0 | 892 | } |
michael@0 | 893 | |
michael@0 | 894 | |
michael@0 | 895 | AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
michael@0 | 896 | overflowLabel_ = label; |
michael@0 | 897 | clearBuckets(); |
michael@0 | 898 | return *this; |
michael@0 | 899 | } |
michael@0 | 900 | |
michael@0 | 901 | |
michael@0 | 902 | AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
michael@0 | 903 | underflowLabel_ = label; |
michael@0 | 904 | clearBuckets(); |
michael@0 | 905 | return *this; |
michael@0 | 906 | } |
michael@0 | 907 | |
michael@0 | 908 | |
michael@0 | 909 | int32_t AlphabeticIndex::getMaxLabelCount() const { |
michael@0 | 910 | return maxLabelCount_; |
michael@0 | 911 | } |
michael@0 | 912 | |
michael@0 | 913 | |
michael@0 | 914 | AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { |
michael@0 | 915 | if (U_FAILURE(status)) { |
michael@0 | 916 | return *this; |
michael@0 | 917 | } |
michael@0 | 918 | if (maxLabelCount <= 0) { |
michael@0 | 919 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 920 | return *this; |
michael@0 | 921 | } |
michael@0 | 922 | maxLabelCount_ = maxLabelCount; |
michael@0 | 923 | clearBuckets(); |
michael@0 | 924 | return *this; |
michael@0 | 925 | } |
michael@0 | 926 | |
michael@0 | 927 | |
michael@0 | 928 | // |
michael@0 | 929 | // init() - Common code for constructors. |
michael@0 | 930 | // |
michael@0 | 931 | |
michael@0 | 932 | void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
michael@0 | 933 | if (U_FAILURE(status)) { return; } |
michael@0 | 934 | if (locale == NULL && collator_ == NULL) { |
michael@0 | 935 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 936 | return; |
michael@0 | 937 | } |
michael@0 | 938 | |
michael@0 | 939 | initialLabels_ = new UnicodeSet(); |
michael@0 | 940 | if (initialLabels_ == NULL) { |
michael@0 | 941 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 942 | return; |
michael@0 | 943 | } |
michael@0 | 944 | |
michael@0 | 945 | inflowLabel_.setTo((UChar)0x2026); // Ellipsis |
michael@0 | 946 | overflowLabel_ = inflowLabel_; |
michael@0 | 947 | underflowLabel_ = inflowLabel_; |
michael@0 | 948 | |
michael@0 | 949 | if (collator_ == NULL) { |
michael@0 | 950 | collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); |
michael@0 | 951 | if (U_FAILURE(status)) { return; } |
michael@0 | 952 | if (collator_ == NULL) { |
michael@0 | 953 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 954 | return; |
michael@0 | 955 | } |
michael@0 | 956 | } |
michael@0 | 957 | collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); |
michael@0 | 958 | if (collatorPrimaryOnly_ == NULL) { |
michael@0 | 959 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 960 | return; |
michael@0 | 961 | } |
michael@0 | 962 | collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); |
michael@0 | 963 | firstCharsInScripts_ = firstStringsInScript(status); |
michael@0 | 964 | if (U_FAILURE(status)) { return; } |
michael@0 | 965 | firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); |
michael@0 | 966 | UnicodeString _4E00((UChar)0x4E00); |
michael@0 | 967 | UnicodeString _1100((UChar)0x1100); |
michael@0 | 968 | UnicodeString _1112((UChar)0x1112); |
michael@0 | 969 | if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && |
michael@0 | 970 | collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { |
michael@0 | 971 | // The standard Korean tailoring sorts Hanja (Han characters) |
michael@0 | 972 | // as secondary differences from Hangul syllables. |
michael@0 | 973 | // This makes U+4E00 not useful as a Han-script boundary. |
michael@0 | 974 | // TODO: This becomes obsolete when the root collator gets |
michael@0 | 975 | // reliable script-first-primary mappings. |
michael@0 | 976 | int32_t hanIndex = binarySearch( |
michael@0 | 977 | *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); |
michael@0 | 978 | if (hanIndex >= 0) { |
michael@0 | 979 | firstCharsInScripts_->removeElementAt(hanIndex); |
michael@0 | 980 | } |
michael@0 | 981 | } |
michael@0 | 982 | // Guard against a degenerate collator where |
michael@0 | 983 | // some script boundary strings are primary ignorable. |
michael@0 | 984 | for (;;) { |
michael@0 | 985 | if (U_FAILURE(status)) { return; } |
michael@0 | 986 | if (firstCharsInScripts_->isEmpty()) { |
michael@0 | 987 | // AlphabeticIndex requires some non-ignorable script boundary strings. |
michael@0 | 988 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 989 | return; |
michael@0 | 990 | } |
michael@0 | 991 | if (collatorPrimaryOnly_->compare( |
michael@0 | 992 | *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), |
michael@0 | 993 | emptyString_, status) == UCOL_EQUAL) { |
michael@0 | 994 | firstCharsInScripts_->removeElementAt(0); |
michael@0 | 995 | } else { |
michael@0 | 996 | break; |
michael@0 | 997 | } |
michael@0 | 998 | } |
michael@0 | 999 | |
michael@0 | 1000 | if (locale != NULL) { |
michael@0 | 1001 | addIndexExemplars(*locale, status); |
michael@0 | 1002 | } |
michael@0 | 1003 | } |
michael@0 | 1004 | |
michael@0 | 1005 | |
michael@0 | 1006 | // |
michael@0 | 1007 | // Comparison function for UVector<UnicodeString *> sorting with a collator. |
michael@0 | 1008 | // |
michael@0 | 1009 | static int32_t U_CALLCONV |
michael@0 | 1010 | collatorComparator(const void *context, const void *left, const void *right) { |
michael@0 | 1011 | const UElement *leftElement = static_cast<const UElement *>(left); |
michael@0 | 1012 | const UElement *rightElement = static_cast<const UElement *>(right); |
michael@0 | 1013 | const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); |
michael@0 | 1014 | const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); |
michael@0 | 1015 | |
michael@0 | 1016 | if (leftString == rightString) { |
michael@0 | 1017 | // Catches case where both are NULL |
michael@0 | 1018 | return 0; |
michael@0 | 1019 | } |
michael@0 | 1020 | if (leftString == NULL) { |
michael@0 | 1021 | return 1; |
michael@0 | 1022 | }; |
michael@0 | 1023 | if (rightString == NULL) { |
michael@0 | 1024 | return -1; |
michael@0 | 1025 | } |
michael@0 | 1026 | const Collator *col = static_cast<const Collator *>(context); |
michael@0 | 1027 | UErrorCode errorCode = U_ZERO_ERROR; |
michael@0 | 1028 | return col->compare(*leftString, *rightString, errorCode); |
michael@0 | 1029 | } |
michael@0 | 1030 | |
michael@0 | 1031 | // |
michael@0 | 1032 | // Comparison function for UVector<Record *> sorting with a collator. |
michael@0 | 1033 | // |
michael@0 | 1034 | static int32_t U_CALLCONV |
michael@0 | 1035 | recordCompareFn(const void *context, const void *left, const void *right) { |
michael@0 | 1036 | const UElement *leftElement = static_cast<const UElement *>(left); |
michael@0 | 1037 | const UElement *rightElement = static_cast<const UElement *>(right); |
michael@0 | 1038 | const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); |
michael@0 | 1039 | const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); |
michael@0 | 1040 | const Collator *col = static_cast<const Collator *>(context); |
michael@0 | 1041 | UErrorCode errorCode = U_ZERO_ERROR; |
michael@0 | 1042 | return col->compare(leftRec->name_, rightRec->name_, errorCode); |
michael@0 | 1043 | } |
michael@0 | 1044 | |
michael@0 | 1045 | |
michael@0 | 1046 | /** |
michael@0 | 1047 | * This list contains one character per script that has the |
michael@0 | 1048 | * lowest primary weight for that script in the root collator. |
michael@0 | 1049 | * This list will be copied and sorted to account for script reordering. |
michael@0 | 1050 | * |
michael@0 | 1051 | * <p>TODO: This is fragile. If the first character of a script is tailored |
michael@0 | 1052 | * so that it does not map to the script's lowest primary weight any more, |
michael@0 | 1053 | * then the buckets will be off. |
michael@0 | 1054 | * There are hacks in the code to handle the known CJK tailorings of U+4E00. |
michael@0 | 1055 | * |
michael@0 | 1056 | * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. |
michael@0 | 1057 | * |
michael@0 | 1058 | * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in |
michael@0 | 1059 | * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java |
michael@0 | 1060 | */ |
michael@0 | 1061 | static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { |
michael@0 | 1062 | 0x41, 0, 0x03B1, 0, |
michael@0 | 1063 | 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, |
michael@0 | 1064 | 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, |
michael@0 | 1065 | 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, |
michael@0 | 1066 | 0xAAF2, 0, // Meetei Mayek |
michael@0 | 1067 | 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, |
michael@0 | 1068 | U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada |
michael@0 | 1069 | U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri |
michael@0 | 1070 | 0x1B83, 0, |
michael@0 | 1071 | 0xD802, 0xDE00, 0, 0x0E01, 0, |
michael@0 | 1072 | 0x0EDE, 0, // Lao |
michael@0 | 1073 | 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, |
michael@0 | 1074 | 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, |
michael@0 | 1075 | U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma |
michael@0 | 1076 | 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, |
michael@0 | 1077 | 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, |
michael@0 | 1078 | 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, |
michael@0 | 1079 | U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao |
michael@0 | 1080 | 0xD800, 0xDE80, 0, |
michael@0 | 1081 | 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, |
michael@0 | 1082 | 0xD801, 0xDC80, 0, |
michael@0 | 1083 | U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng |
michael@0 | 1084 | 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, |
michael@0 | 1085 | 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, |
michael@0 | 1086 | U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive |
michael@0 | 1087 | U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs |
michael@0 | 1088 | 0x4E00, 0, |
michael@0 | 1089 | // TODO: The overflow bucket's lowerBoundary string should be the |
michael@0 | 1090 | // first item after the last reordering group in the collator's script order. |
michael@0 | 1091 | // This should normally be the first Unicode code point |
michael@0 | 1092 | // that is unassigned (U+0378 in Unicode 6.3) and untailored. |
michael@0 | 1093 | // However, at least up to ICU 51 the Hani reordering group includes |
michael@0 | 1094 | // unassigned code points, |
michael@0 | 1095 | // and there is no stable string for the start of the trailing-weights range. |
michael@0 | 1096 | // The only known string that sorts "high" is U+FFFF. |
michael@0 | 1097 | // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, |
michael@0 | 1098 | // and fix relevant test code. |
michael@0 | 1099 | // Ideally, FractionalUCA.txt will have a "script first primary" |
michael@0 | 1100 | // for unassigned code points. |
michael@0 | 1101 | 0xFFFF, 0 |
michael@0 | 1102 | }; |
michael@0 | 1103 | |
michael@0 | 1104 | UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
michael@0 | 1105 | if (U_FAILURE(status)) { |
michael@0 | 1106 | return NULL; |
michael@0 | 1107 | } |
michael@0 | 1108 | UVector *dest = new UVector(status); |
michael@0 | 1109 | if (dest == NULL) { |
michael@0 | 1110 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1111 | return NULL; |
michael@0 | 1112 | } |
michael@0 | 1113 | dest->setDeleter(uprv_deleteUObject); |
michael@0 | 1114 | const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; |
michael@0 | 1115 | const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); |
michael@0 | 1116 | do { |
michael@0 | 1117 | if (U_FAILURE(status)) { |
michael@0 | 1118 | return dest; |
michael@0 | 1119 | } |
michael@0 | 1120 | UnicodeString *str = new UnicodeString(src, -1); |
michael@0 | 1121 | if (str == NULL) { |
michael@0 | 1122 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1123 | return dest; |
michael@0 | 1124 | } |
michael@0 | 1125 | dest->addElement(str, status); |
michael@0 | 1126 | src += str->length() + 1; |
michael@0 | 1127 | } while (src < limit); |
michael@0 | 1128 | return dest; |
michael@0 | 1129 | } |
michael@0 | 1130 | |
michael@0 | 1131 | |
michael@0 | 1132 | namespace { |
michael@0 | 1133 | |
michael@0 | 1134 | /** |
michael@0 | 1135 | * Returns true if one index character string is "better" than the other. |
michael@0 | 1136 | * Shorter NFKD is better, and otherwise NFKD-binary-less-than is |
michael@0 | 1137 | * better, and otherwise binary-less-than is better. |
michael@0 | 1138 | */ |
michael@0 | 1139 | UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
michael@0 | 1140 | const UnicodeString &one, const UnicodeString &other) { |
michael@0 | 1141 | // This is called with primary-equal strings, but never with one.equals(other). |
michael@0 | 1142 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1143 | UnicodeString n1 = nfkdNormalizer.normalize(one, status); |
michael@0 | 1144 | UnicodeString n2 = nfkdNormalizer.normalize(other, status); |
michael@0 | 1145 | if (U_FAILURE(status)) { return FALSE; } |
michael@0 | 1146 | int32_t result = n1.countChar32() - n2.countChar32(); |
michael@0 | 1147 | if (result != 0) { |
michael@0 | 1148 | return result < 0; |
michael@0 | 1149 | } |
michael@0 | 1150 | result = n1.compareCodePointOrder(n2); |
michael@0 | 1151 | if (result != 0) { |
michael@0 | 1152 | return result < 0; |
michael@0 | 1153 | } |
michael@0 | 1154 | return one.compareCodePointOrder(other) < 0; |
michael@0 | 1155 | } |
michael@0 | 1156 | |
michael@0 | 1157 | } // namespace |
michael@0 | 1158 | |
michael@0 | 1159 | // |
michael@0 | 1160 | // Constructor & Destructor for AlphabeticIndex::Record |
michael@0 | 1161 | // |
michael@0 | 1162 | // Records are internal only, instances are not directly surfaced in the public API. |
michael@0 | 1163 | // This class is mostly struct-like, with all public fields. |
michael@0 | 1164 | |
michael@0 | 1165 | AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) |
michael@0 | 1166 | : name_(name), data_(data) {} |
michael@0 | 1167 | |
michael@0 | 1168 | AlphabeticIndex::Record::~Record() { |
michael@0 | 1169 | } |
michael@0 | 1170 | |
michael@0 | 1171 | |
michael@0 | 1172 | AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { |
michael@0 | 1173 | if (U_FAILURE(status)) { |
michael@0 | 1174 | return *this; |
michael@0 | 1175 | } |
michael@0 | 1176 | if (inputList_ == NULL) { |
michael@0 | 1177 | inputList_ = new UVector(status); |
michael@0 | 1178 | if (inputList_ == NULL) { |
michael@0 | 1179 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1180 | return *this; |
michael@0 | 1181 | } |
michael@0 | 1182 | inputList_->setDeleter(alphaIndex_deleteRecord); |
michael@0 | 1183 | } |
michael@0 | 1184 | Record *r = new Record(name, data); |
michael@0 | 1185 | if (r == NULL) { |
michael@0 | 1186 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1187 | return *this; |
michael@0 | 1188 | } |
michael@0 | 1189 | inputList_->addElement(r, status); |
michael@0 | 1190 | clearBuckets(); |
michael@0 | 1191 | //std::string ss; |
michael@0 | 1192 | //std::string ss2; |
michael@0 | 1193 | //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << |
michael@0 | 1194 | // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; |
michael@0 | 1195 | return *this; |
michael@0 | 1196 | } |
michael@0 | 1197 | |
michael@0 | 1198 | |
michael@0 | 1199 | AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { |
michael@0 | 1200 | if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { |
michael@0 | 1201 | inputList_->removeAllElements(); |
michael@0 | 1202 | clearBuckets(); |
michael@0 | 1203 | } |
michael@0 | 1204 | return *this; |
michael@0 | 1205 | } |
michael@0 | 1206 | |
michael@0 | 1207 | int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { |
michael@0 | 1208 | initBuckets(status); |
michael@0 | 1209 | if (U_FAILURE(status)) { |
michael@0 | 1210 | return 0; |
michael@0 | 1211 | } |
michael@0 | 1212 | return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | |
michael@0 | 1216 | int32_t AlphabeticIndex::getBucketIndex() const { |
michael@0 | 1217 | return labelsIterIndex_; |
michael@0 | 1218 | } |
michael@0 | 1219 | |
michael@0 | 1220 | |
michael@0 | 1221 | UBool AlphabeticIndex::nextBucket(UErrorCode &status) { |
michael@0 | 1222 | if (U_FAILURE(status)) { |
michael@0 | 1223 | return FALSE; |
michael@0 | 1224 | } |
michael@0 | 1225 | if (buckets_ == NULL && currentBucket_ != NULL) { |
michael@0 | 1226 | status = U_ENUM_OUT_OF_SYNC_ERROR; |
michael@0 | 1227 | return FALSE; |
michael@0 | 1228 | } |
michael@0 | 1229 | initBuckets(status); |
michael@0 | 1230 | if (U_FAILURE(status)) { |
michael@0 | 1231 | return FALSE; |
michael@0 | 1232 | } |
michael@0 | 1233 | ++labelsIterIndex_; |
michael@0 | 1234 | if (labelsIterIndex_ >= buckets_->getBucketCount()) { |
michael@0 | 1235 | labelsIterIndex_ = buckets_->getBucketCount(); |
michael@0 | 1236 | return FALSE; |
michael@0 | 1237 | } |
michael@0 | 1238 | currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); |
michael@0 | 1239 | resetRecordIterator(); |
michael@0 | 1240 | return TRUE; |
michael@0 | 1241 | } |
michael@0 | 1242 | |
michael@0 | 1243 | const UnicodeString &AlphabeticIndex::getBucketLabel() const { |
michael@0 | 1244 | if (currentBucket_ != NULL) { |
michael@0 | 1245 | return currentBucket_->label_; |
michael@0 | 1246 | } else { |
michael@0 | 1247 | return emptyString_; |
michael@0 | 1248 | } |
michael@0 | 1249 | } |
michael@0 | 1250 | |
michael@0 | 1251 | |
michael@0 | 1252 | UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { |
michael@0 | 1253 | if (currentBucket_ != NULL) { |
michael@0 | 1254 | return currentBucket_->labelType_; |
michael@0 | 1255 | } else { |
michael@0 | 1256 | return U_ALPHAINDEX_NORMAL; |
michael@0 | 1257 | } |
michael@0 | 1258 | } |
michael@0 | 1259 | |
michael@0 | 1260 | |
michael@0 | 1261 | int32_t AlphabeticIndex::getBucketRecordCount() const { |
michael@0 | 1262 | if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { |
michael@0 | 1263 | return currentBucket_->records_->size(); |
michael@0 | 1264 | } else { |
michael@0 | 1265 | return 0; |
michael@0 | 1266 | } |
michael@0 | 1267 | } |
michael@0 | 1268 | |
michael@0 | 1269 | |
michael@0 | 1270 | AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { |
michael@0 | 1271 | if (U_FAILURE(status)) { |
michael@0 | 1272 | return *this; |
michael@0 | 1273 | } |
michael@0 | 1274 | internalResetBucketIterator(); |
michael@0 | 1275 | return *this; |
michael@0 | 1276 | } |
michael@0 | 1277 | |
michael@0 | 1278 | |
michael@0 | 1279 | UBool AlphabeticIndex::nextRecord(UErrorCode &status) { |
michael@0 | 1280 | if (U_FAILURE(status)) { |
michael@0 | 1281 | return FALSE; |
michael@0 | 1282 | } |
michael@0 | 1283 | if (currentBucket_ == NULL) { |
michael@0 | 1284 | // We are trying to iterate over the items in a bucket, but there is no |
michael@0 | 1285 | // current bucket from the enumeration of buckets. |
michael@0 | 1286 | status = U_INVALID_STATE_ERROR; |
michael@0 | 1287 | return FALSE; |
michael@0 | 1288 | } |
michael@0 | 1289 | if (buckets_ == NULL) { |
michael@0 | 1290 | status = U_ENUM_OUT_OF_SYNC_ERROR; |
michael@0 | 1291 | return FALSE; |
michael@0 | 1292 | } |
michael@0 | 1293 | if (currentBucket_->records_ == NULL) { |
michael@0 | 1294 | return FALSE; |
michael@0 | 1295 | } |
michael@0 | 1296 | ++itemsIterIndex_; |
michael@0 | 1297 | if (itemsIterIndex_ >= currentBucket_->records_->size()) { |
michael@0 | 1298 | itemsIterIndex_ = currentBucket_->records_->size(); |
michael@0 | 1299 | return FALSE; |
michael@0 | 1300 | } |
michael@0 | 1301 | return TRUE; |
michael@0 | 1302 | } |
michael@0 | 1303 | |
michael@0 | 1304 | |
michael@0 | 1305 | const UnicodeString &AlphabeticIndex::getRecordName() const { |
michael@0 | 1306 | const UnicodeString *retStr = &emptyString_; |
michael@0 | 1307 | if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
michael@0 | 1308 | itemsIterIndex_ >= 0 && |
michael@0 | 1309 | itemsIterIndex_ < currentBucket_->records_->size()) { |
michael@0 | 1310 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
michael@0 | 1311 | retStr = &item->name_; |
michael@0 | 1312 | } |
michael@0 | 1313 | return *retStr; |
michael@0 | 1314 | } |
michael@0 | 1315 | |
michael@0 | 1316 | const void *AlphabeticIndex::getRecordData() const { |
michael@0 | 1317 | const void *retPtr = NULL; |
michael@0 | 1318 | if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
michael@0 | 1319 | itemsIterIndex_ >= 0 && |
michael@0 | 1320 | itemsIterIndex_ < currentBucket_->records_->size()) { |
michael@0 | 1321 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
michael@0 | 1322 | retPtr = item->data_; |
michael@0 | 1323 | } |
michael@0 | 1324 | return retPtr; |
michael@0 | 1325 | } |
michael@0 | 1326 | |
michael@0 | 1327 | |
michael@0 | 1328 | AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { |
michael@0 | 1329 | itemsIterIndex_ = -1; |
michael@0 | 1330 | return *this; |
michael@0 | 1331 | } |
michael@0 | 1332 | |
michael@0 | 1333 | |
michael@0 | 1334 | |
michael@0 | 1335 | AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, |
michael@0 | 1336 | const UnicodeString &lowerBoundary, |
michael@0 | 1337 | UAlphabeticIndexLabelType type) |
michael@0 | 1338 | : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), |
michael@0 | 1339 | displayBucket_(NULL), displayIndex_(-1), |
michael@0 | 1340 | records_(NULL) { |
michael@0 | 1341 | } |
michael@0 | 1342 | |
michael@0 | 1343 | |
michael@0 | 1344 | AlphabeticIndex::Bucket::~Bucket() { |
michael@0 | 1345 | delete records_; |
michael@0 | 1346 | } |
michael@0 | 1347 | |
michael@0 | 1348 | U_NAMESPACE_END |
michael@0 | 1349 | |
michael@0 | 1350 | #endif |