1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/alphaindex.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1350 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2009-2013, International Business Machines Corporation and 1.7 +* others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +*/ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 1.14 + 1.15 +#include "unicode/alphaindex.h" 1.16 +#include "unicode/coleitr.h" 1.17 +#include "unicode/coll.h" 1.18 +#include "unicode/localpointer.h" 1.19 +#include "unicode/normalizer2.h" 1.20 +#include "unicode/tblcoll.h" 1.21 +#include "unicode/ulocdata.h" 1.22 +#include "unicode/uniset.h" 1.23 +#include "unicode/uobject.h" 1.24 +#include "unicode/usetiter.h" 1.25 +#include "unicode/utf16.h" 1.26 + 1.27 +#include "cmemory.h" 1.28 +#include "cstring.h" 1.29 +#include "uassert.h" 1.30 +#include "uvector.h" 1.31 + 1.32 +//#include <string> 1.33 +//#include <iostream> 1.34 + 1.35 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.36 + 1.37 +U_NAMESPACE_BEGIN 1.38 + 1.39 +namespace { 1.40 + 1.41 +/** 1.42 + * Prefix string for Chinese index buckets. 1.43 + * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 1.44 + */ 1.45 +const UChar BASE[1] = { 0xFDD0 }; 1.46 +const int32_t BASE_LENGTH = 1; 1.47 + 1.48 +UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1.49 + const UnicodeString &one, const UnicodeString &other); 1.50 + 1.51 +} // namespace 1.52 + 1.53 +static int32_t U_CALLCONV 1.54 +collatorComparator(const void *context, const void *left, const void *right); 1.55 + 1.56 +static int32_t U_CALLCONV 1.57 +recordCompareFn(const void *context, const void *left, const void *right); 1.58 + 1.59 +// UVector<Record *> support function, delete a Record. 1.60 +static void U_CALLCONV 1.61 +alphaIndex_deleteRecord(void *obj) { 1.62 + delete static_cast<AlphabeticIndex::Record *>(obj); 1.63 +} 1.64 + 1.65 +namespace { 1.66 + 1.67 +UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 1.68 + UErrorCode &errorCode) { 1.69 + if (U_FAILURE(errorCode)) { return NULL; } 1.70 + if (owned.isValid()) { 1.71 + return owned.orphan(); 1.72 + } 1.73 + UnicodeString *p = new UnicodeString(s); 1.74 + if (p == NULL) { 1.75 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.76 + } 1.77 + return p; 1.78 +} 1.79 + 1.80 +inline UnicodeString *getString(const UVector &list, int32_t i) { 1.81 + return static_cast<UnicodeString *>(list[i]); 1.82 +} 1.83 + 1.84 +inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 1.85 + return static_cast<AlphabeticIndex::Bucket *>(list[i]); 1.86 +} 1.87 + 1.88 +inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 1.89 + return static_cast<AlphabeticIndex::Record *>(list[i]); 1.90 +} 1.91 + 1.92 +/** 1.93 + * Like Java Collections.binarySearch(List, String, Comparator). 1.94 + * 1.95 + * @return the index>=0 where the item was found, 1.96 + * or the index<0 for inserting the string at ~index in sorted order 1.97 + */ 1.98 +int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 1.99 + if (list.size() == 0) { return ~0; } 1.100 + int32_t start = 0; 1.101 + int32_t limit = list.size(); 1.102 + for (;;) { 1.103 + int32_t i = (start + limit) / 2; 1.104 + const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 1.105 + UErrorCode errorCode = U_ZERO_ERROR; 1.106 + UCollationResult cmp = coll.compare(s, *si, errorCode); 1.107 + if (cmp == UCOL_EQUAL) { 1.108 + return i; 1.109 + } else if (cmp < 0) { 1.110 + if (i == start) { 1.111 + return ~start; // insert s before *si 1.112 + } 1.113 + limit = i; 1.114 + } else { 1.115 + if (i == start) { 1.116 + return ~(start + 1); // insert s after *si 1.117 + } 1.118 + start = i; 1.119 + } 1.120 + } 1.121 +} 1.122 + 1.123 +} // namespace 1.124 + 1.125 +// The BucketList is not in the anonymous namespace because only Clang 1.126 +// seems to support its use in other classes from there. 1.127 +// However, we also don't need U_I18N_API because it is not used from outside the i18n library. 1.128 +class BucketList : public UObject { 1.129 +public: 1.130 + BucketList(UVector *bucketList, UVector *publicBucketList) 1.131 + : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 1.132 + int32_t displayIndex = 0; 1.133 + for (int32_t i = 0; i < publicBucketList->size(); ++i) { 1.134 + getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 1.135 + } 1.136 + } 1.137 + 1.138 + // The virtual destructor must not be inline. 1.139 + // See ticket #8454 for details. 1.140 + virtual ~BucketList(); 1.141 + 1.142 + int32_t getBucketCount() const { 1.143 + return immutableVisibleList_->size(); 1.144 + } 1.145 + 1.146 + int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 1.147 + UErrorCode &errorCode) { 1.148 + // binary search 1.149 + int32_t start = 0; 1.150 + int32_t limit = bucketList_->size(); 1.151 + while ((start + 1) < limit) { 1.152 + int32_t i = (start + limit) / 2; 1.153 + const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 1.154 + UCollationResult nameVsBucket = 1.155 + collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 1.156 + if (nameVsBucket < 0) { 1.157 + limit = i; 1.158 + } else { 1.159 + start = i; 1.160 + } 1.161 + } 1.162 + const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 1.163 + if (bucket->displayBucket_ != NULL) { 1.164 + bucket = bucket->displayBucket_; 1.165 + } 1.166 + return bucket->displayIndex_; 1.167 + } 1.168 + 1.169 + /** All of the buckets, visible and invisible. */ 1.170 + UVector *bucketList_; 1.171 + /** Just the visible buckets. */ 1.172 + UVector *immutableVisibleList_; 1.173 +}; 1.174 + 1.175 +BucketList::~BucketList() { 1.176 + delete bucketList_; 1.177 + if (immutableVisibleList_ != bucketList_) { 1.178 + delete immutableVisibleList_; 1.179 + } 1.180 +} 1.181 + 1.182 +AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 1.183 + delete buckets_; 1.184 + delete collatorPrimaryOnly_; 1.185 +} 1.186 + 1.187 +int32_t 1.188 +AlphabeticIndex::ImmutableIndex::getBucketCount() const { 1.189 + return buckets_->getBucketCount(); 1.190 +} 1.191 + 1.192 +int32_t 1.193 +AlphabeticIndex::ImmutableIndex::getBucketIndex( 1.194 + const UnicodeString &name, UErrorCode &errorCode) const { 1.195 + return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 1.196 +} 1.197 + 1.198 +const AlphabeticIndex::Bucket * 1.199 +AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 1.200 + if (0 <= index && index < buckets_->getBucketCount()) { 1.201 + return icu::getBucket(*buckets_->immutableVisibleList_, index); 1.202 + } else { 1.203 + return NULL; 1.204 + } 1.205 +} 1.206 + 1.207 +AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 1.208 + : inputList_(NULL), 1.209 + labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 1.210 + maxLabelCount_(99), 1.211 + initialLabels_(NULL), firstCharsInScripts_(NULL), 1.212 + collator_(NULL), collatorPrimaryOnly_(NULL), 1.213 + buckets_(NULL) { 1.214 + init(&locale, status); 1.215 +} 1.216 + 1.217 + 1.218 +AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 1.219 + : inputList_(NULL), 1.220 + labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 1.221 + maxLabelCount_(99), 1.222 + initialLabels_(NULL), firstCharsInScripts_(NULL), 1.223 + collator_(collator), collatorPrimaryOnly_(NULL), 1.224 + buckets_(NULL) { 1.225 + init(NULL, status); 1.226 +} 1.227 + 1.228 + 1.229 + 1.230 +AlphabeticIndex::~AlphabeticIndex() { 1.231 + delete collator_; 1.232 + delete collatorPrimaryOnly_; 1.233 + delete firstCharsInScripts_; 1.234 + delete buckets_; 1.235 + delete inputList_; 1.236 + delete initialLabels_; 1.237 +} 1.238 + 1.239 + 1.240 +AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 1.241 + if (U_FAILURE(status)) { 1.242 + return *this; 1.243 + } 1.244 + initialLabels_->addAll(additions); 1.245 + clearBuckets(); 1.246 + return *this; 1.247 +} 1.248 + 1.249 + 1.250 +AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 1.251 + addIndexExemplars(locale, status); 1.252 + clearBuckets(); 1.253 + return *this; 1.254 +} 1.255 + 1.256 + 1.257 +AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 1.258 + if (U_FAILURE(errorCode)) { return NULL; } 1.259 + // In C++, the ImmutableIndex must own its copy of the BucketList, 1.260 + // even if it contains no records, for proper memory management. 1.261 + // We could clone the buckets_ if they are not NULL, 1.262 + // but that would be worth it only if this method is called multiple times, 1.263 + // or called after using the old-style bucket iterator API. 1.264 + LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 1.265 + LocalPointer<RuleBasedCollator> coll( 1.266 + static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 1.267 + if (immutableBucketList.isNull() || coll.isNull()) { 1.268 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.269 + return NULL; 1.270 + } 1.271 + ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 1.272 + if (immIndex == NULL) { 1.273 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.274 + return NULL; 1.275 + } 1.276 + // The ImmutableIndex adopted its parameter objects. 1.277 + immutableBucketList.orphan(); 1.278 + coll.orphan(); 1.279 + return immIndex; 1.280 +} 1.281 + 1.282 +int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 1.283 + initBuckets(status); 1.284 + if (U_FAILURE(status)) { 1.285 + return 0; 1.286 + } 1.287 + return buckets_->getBucketCount(); 1.288 +} 1.289 + 1.290 + 1.291 +int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 1.292 + if (U_FAILURE(status) || inputList_ == NULL) { 1.293 + return 0; 1.294 + } 1.295 + return inputList_->size(); 1.296 +} 1.297 + 1.298 +void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 1.299 + const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 1.300 + if (U_FAILURE(errorCode)) { return; } 1.301 + 1.302 + const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 1.303 + const UnicodeString &overflowBoundary = 1.304 + *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 1.305 + 1.306 + // We make a sorted array of elements. 1.307 + // Some of the input may be redundant. 1.308 + // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 1.309 + // We filter out those cases. 1.310 + UnicodeSetIterator iter(*initialLabels_); 1.311 + while (iter.next()) { 1.312 + const UnicodeString *item = &iter.getString(); 1.313 + LocalPointer<UnicodeString> ownedItem; 1.314 + UBool checkDistinct; 1.315 + int32_t itemLength = item->length(); 1.316 + if (!item->hasMoreChar32Than(0, itemLength, 1)) { 1.317 + checkDistinct = FALSE; 1.318 + } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 1.319 + item->charAt(itemLength - 2) != 0x2a) { 1.320 + // Use a label if it is marked with one trailing star, 1.321 + // even if the label string sorts the same when all contractions are suppressed. 1.322 + ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 1.323 + item = ownedItem.getAlias(); 1.324 + if (item == NULL) { 1.325 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.326 + return; 1.327 + } 1.328 + checkDistinct = FALSE; 1.329 + } else { 1.330 + checkDistinct = TRUE; 1.331 + } 1.332 + if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 1.333 + // Ignore a primary-ignorable or non-alphabetic index character. 1.334 + } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 1.335 + // Ignore an index characters that will land in the overflow bucket. 1.336 + } else if (checkDistinct && 1.337 + collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 1.338 + // Ignore a multi-code point index character that does not sort distinctly 1.339 + // from the sequence of its separate characters. 1.340 + } else { 1.341 + int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 1.342 + if (insertionPoint < 0) { 1.343 + indexCharacters.insertElementAt( 1.344 + ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 1.345 + } else { 1.346 + const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 1.347 + if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 1.348 + indexCharacters.setElementAt( 1.349 + ownedString(*item, ownedItem, errorCode), insertionPoint); 1.350 + } 1.351 + } 1.352 + } 1.353 + } 1.354 + if (U_FAILURE(errorCode)) { return; } 1.355 + 1.356 + // if the result is still too large, cut down to maxCount elements, by removing every nth element 1.357 + 1.358 + int32_t size = indexCharacters.size() - 1; 1.359 + if (size > maxLabelCount_) { 1.360 + int32_t count = 0; 1.361 + int32_t old = -1; 1.362 + for (int32_t i = 0; i < indexCharacters.size();) { 1.363 + ++count; 1.364 + int32_t bump = count * maxLabelCount_ / size; 1.365 + if (bump == old) { 1.366 + indexCharacters.removeElementAt(i); 1.367 + } else { 1.368 + old = bump; 1.369 + ++i; 1.370 + } 1.371 + } 1.372 + } 1.373 +} 1.374 + 1.375 +namespace { 1.376 + 1.377 +const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 1.378 + if (!current.startsWith(BASE, BASE_LENGTH)) { 1.379 + return current; 1.380 + } 1.381 + UChar rest = current.charAt(BASE_LENGTH); 1.382 + if (0x2800 < rest && rest <= 0x28FF) { // stroke count 1.383 + int32_t count = rest-0x2800; 1.384 + temp.setTo((UChar)(0x30 + count % 10)); 1.385 + if (count >= 10) { 1.386 + count /= 10; 1.387 + temp.insert(0, (UChar)(0x30 + count % 10)); 1.388 + if (count >= 10) { 1.389 + count /= 10; 1.390 + temp.insert(0, (UChar)(0x30 + count)); 1.391 + } 1.392 + } 1.393 + return temp.append((UChar)0x5283); 1.394 + } 1.395 + return temp.setTo(current, BASE_LENGTH); 1.396 +} 1.397 + 1.398 +UBool hasMultiplePrimaryWeights( 1.399 + CollationElementIterator &cei, int32_t variableTop, 1.400 + const UnicodeString &s, UErrorCode &errorCode) { 1.401 + cei.setText(s, errorCode); 1.402 + UBool seenPrimary = FALSE; 1.403 + for (;;) { 1.404 + int32_t ce32 = cei.next(errorCode); 1.405 + if (ce32 == CollationElementIterator::NULLORDER) { 1.406 + break; 1.407 + } 1.408 + int32_t p = CollationElementIterator::primaryOrder(ce32); 1.409 + if (p > variableTop && (ce32 & 0xc0) != 0xc0) { 1.410 + // not primary ignorable, and not a continuation CE 1.411 + if (seenPrimary) { 1.412 + return TRUE; 1.413 + } 1.414 + seenPrimary = TRUE; 1.415 + } 1.416 + } 1.417 + return FALSE; 1.418 +} 1.419 + 1.420 +} // namespace 1.421 + 1.422 +BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 1.423 + // Initialize indexCharacters. 1.424 + UVector indexCharacters(errorCode); 1.425 + indexCharacters.setDeleter(uprv_deleteUObject); 1.426 + initLabels(indexCharacters, errorCode); 1.427 + if (U_FAILURE(errorCode)) { return NULL; } 1.428 + 1.429 + // Variables for hasMultiplePrimaryWeights(). 1.430 + LocalPointer<CollationElementIterator> cei( 1.431 + collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); 1.432 + if (cei.isNull()) { 1.433 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.434 + return NULL; 1.435 + } 1.436 + int32_t variableTop; 1.437 + if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 1.438 + variableTop = CollationElementIterator::primaryOrder( 1.439 + (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); 1.440 + } else { 1.441 + variableTop = 0; 1.442 + } 1.443 + UBool hasInvisibleBuckets = FALSE; 1.444 + 1.445 + // Helper arrays for Chinese Pinyin collation. 1.446 + Bucket *asciiBuckets[26] = { 1.447 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1.448 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 1.449 + }; 1.450 + Bucket *pinyinBuckets[26] = { 1.451 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1.452 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 1.453 + }; 1.454 + UBool hasPinyin = FALSE; 1.455 + 1.456 + LocalPointer<UVector> bucketList(new UVector(errorCode)); 1.457 + if (bucketList.isNull()) { 1.458 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.459 + return NULL; 1.460 + } 1.461 + bucketList->setDeleter(uprv_deleteUObject); 1.462 + 1.463 + // underflow bucket 1.464 + Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 1.465 + if (bucket == NULL) { 1.466 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.467 + return NULL; 1.468 + } 1.469 + bucketList->addElement(bucket, errorCode); 1.470 + if (U_FAILURE(errorCode)) { return NULL; } 1.471 + 1.472 + UnicodeString temp; 1.473 + 1.474 + // fix up the list, adding underflow, additions, overflow 1.475 + // Insert inflow labels as needed. 1.476 + int32_t scriptIndex = -1; 1.477 + const UnicodeString *scriptUpperBoundary = &emptyString_; 1.478 + for (int32_t i = 0; i < indexCharacters.size(); ++i) { 1.479 + UnicodeString ¤t = *getString(indexCharacters, i); 1.480 + if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 1.481 + // We crossed the script boundary into a new script. 1.482 + const UnicodeString &inflowBoundary = *scriptUpperBoundary; 1.483 + UBool skippedScript = FALSE; 1.484 + for (;;) { 1.485 + scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 1.486 + if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 1.487 + break; 1.488 + } 1.489 + skippedScript = TRUE; 1.490 + } 1.491 + if (skippedScript && bucketList->size() > 1) { 1.492 + // We are skipping one or more scripts, 1.493 + // and we are not just getting out of the underflow label. 1.494 + bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 1.495 + if (bucket == NULL) { 1.496 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.497 + return NULL; 1.498 + } 1.499 + bucketList->addElement(bucket, errorCode); 1.500 + } 1.501 + } 1.502 + // Add a bucket with the current label. 1.503 + bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 1.504 + if (bucket == NULL) { 1.505 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.506 + return NULL; 1.507 + } 1.508 + bucketList->addElement(bucket, errorCode); 1.509 + // Remember ASCII and Pinyin buckets for Pinyin redirects. 1.510 + UChar c; 1.511 + if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 1.512 + asciiBuckets[c - 0x41] = bucket; 1.513 + } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 1.514 + 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 1.515 + pinyinBuckets[c - 0x41] = bucket; 1.516 + hasPinyin = TRUE; 1.517 + } 1.518 + // Check for multiple primary weights. 1.519 + if (!current.startsWith(BASE, BASE_LENGTH) && 1.520 + hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && 1.521 + current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 1.522 + // "AE-ligature" or "Sch" etc. 1.523 + for (int32_t i = bucketList->size() - 2;; --i) { 1.524 + Bucket *singleBucket = getBucket(*bucketList, i); 1.525 + if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 1.526 + // There is no single-character bucket since the last 1.527 + // underflow or inflow label. 1.528 + break; 1.529 + } 1.530 + if (singleBucket->displayBucket_ == NULL && 1.531 + !hasMultiplePrimaryWeights( 1.532 + *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { 1.533 + // Add an invisible bucket that redirects strings greater than the expansion 1.534 + // to the previous single-character bucket. 1.535 + // For example, after ... Q R S Sch we add Sch\uFFFF->S 1.536 + // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 1.537 + bucket = new Bucket(emptyString_, 1.538 + UnicodeString(current).append((UChar)0xFFFF), 1.539 + U_ALPHAINDEX_NORMAL); 1.540 + if (bucket == NULL) { 1.541 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.542 + return NULL; 1.543 + } 1.544 + bucket->displayBucket_ = singleBucket; 1.545 + bucketList->addElement(bucket, errorCode); 1.546 + hasInvisibleBuckets = TRUE; 1.547 + break; 1.548 + } 1.549 + } 1.550 + } 1.551 + } 1.552 + if (U_FAILURE(errorCode)) { return NULL; } 1.553 + if (bucketList->size() == 1) { 1.554 + // No real labels, show only the underflow label. 1.555 + BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 1.556 + if (bl == NULL) { 1.557 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.558 + return NULL; 1.559 + } 1.560 + bucketList.orphan(); 1.561 + return bl; 1.562 + } 1.563 + // overflow bucket 1.564 + bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 1.565 + if (bucket == NULL) { 1.566 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.567 + return NULL; 1.568 + } 1.569 + bucketList->addElement(bucket, errorCode); // final 1.570 + 1.571 + if (hasPinyin) { 1.572 + // Redirect Pinyin buckets. 1.573 + Bucket *asciiBucket = NULL; 1.574 + for (int32_t i = 0; i < 26; ++i) { 1.575 + if (asciiBuckets[i] != NULL) { 1.576 + asciiBucket = asciiBuckets[i]; 1.577 + } 1.578 + if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 1.579 + pinyinBuckets[i]->displayBucket_ = asciiBucket; 1.580 + hasInvisibleBuckets = TRUE; 1.581 + } 1.582 + } 1.583 + } 1.584 + 1.585 + if (U_FAILURE(errorCode)) { return NULL; } 1.586 + if (!hasInvisibleBuckets) { 1.587 + BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 1.588 + if (bl == NULL) { 1.589 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.590 + return NULL; 1.591 + } 1.592 + bucketList.orphan(); 1.593 + return bl; 1.594 + } 1.595 + // Merge inflow buckets that are visually adjacent. 1.596 + // Iterate backwards: Merge inflow into overflow rather than the other way around. 1.597 + int32_t i = bucketList->size() - 1; 1.598 + Bucket *nextBucket = getBucket(*bucketList, i); 1.599 + while (--i > 0) { 1.600 + bucket = getBucket(*bucketList, i); 1.601 + if (bucket->displayBucket_ != NULL) { 1.602 + continue; // skip invisible buckets 1.603 + } 1.604 + if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 1.605 + if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 1.606 + bucket->displayBucket_ = nextBucket; 1.607 + continue; 1.608 + } 1.609 + } 1.610 + nextBucket = bucket; 1.611 + } 1.612 + 1.613 + LocalPointer<UVector> publicBucketList(new UVector(errorCode)); 1.614 + if (bucketList.isNull()) { 1.615 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.616 + return NULL; 1.617 + } 1.618 + // Do not call publicBucketList->setDeleter(): 1.619 + // This vector shares its objects with the bucketList. 1.620 + for (int32_t i = 0; i < bucketList->size(); ++i) { 1.621 + bucket = getBucket(*bucketList, i); 1.622 + if (bucket->displayBucket_ == NULL) { 1.623 + publicBucketList->addElement(bucket, errorCode); 1.624 + } 1.625 + } 1.626 + if (U_FAILURE(errorCode)) { return NULL; } 1.627 + BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 1.628 + if (bl == NULL) { 1.629 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.630 + return NULL; 1.631 + } 1.632 + bucketList.orphan(); 1.633 + publicBucketList.orphan(); 1.634 + return bl; 1.635 +} 1.636 + 1.637 +/** 1.638 + * Creates an index, and buckets and sorts the list of records into the index. 1.639 + */ 1.640 +void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 1.641 + if (U_FAILURE(errorCode) || buckets_ != NULL) { 1.642 + return; 1.643 + } 1.644 + buckets_ = createBucketList(errorCode); 1.645 + if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 1.646 + return; 1.647 + } 1.648 + 1.649 + // Sort the records by name. 1.650 + // Stable sort preserves input order of collation duplicates. 1.651 + inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 1.652 + 1.653 + // Now, we traverse all of the input, which is now sorted. 1.654 + // If the item doesn't go in the current bucket, we find the next bucket that contains it. 1.655 + // This makes the process order n*log(n), since we just sort the list and then do a linear process. 1.656 + // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 1.657 + // we need to improve it for that case. 1.658 + 1.659 + Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 1.660 + int32_t bucketIndex = 1; 1.661 + Bucket *nextBucket; 1.662 + const UnicodeString *upperBoundary; 1.663 + if (bucketIndex < buckets_->bucketList_->size()) { 1.664 + nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 1.665 + upperBoundary = &nextBucket->lowerBoundary_; 1.666 + } else { 1.667 + nextBucket = NULL; 1.668 + upperBoundary = NULL; 1.669 + } 1.670 + for (int32_t i = 0; i < inputList_->size(); ++i) { 1.671 + Record *r = getRecord(*inputList_, i); 1.672 + // if the current bucket isn't the right one, find the one that is 1.673 + // We have a special flag for the last bucket so that we don't look any further 1.674 + while (upperBoundary != NULL && 1.675 + collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 1.676 + currentBucket = nextBucket; 1.677 + // now reset the boundary that we compare against 1.678 + if (bucketIndex < buckets_->bucketList_->size()) { 1.679 + nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 1.680 + upperBoundary = &nextBucket->lowerBoundary_; 1.681 + } else { 1.682 + upperBoundary = NULL; 1.683 + } 1.684 + } 1.685 + // now put the record into the bucket. 1.686 + Bucket *bucket = currentBucket; 1.687 + if (bucket->displayBucket_ != NULL) { 1.688 + bucket = bucket->displayBucket_; 1.689 + } 1.690 + if (bucket->records_ == NULL) { 1.691 + bucket->records_ = new UVector(errorCode); 1.692 + if (bucket->records_ == NULL) { 1.693 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.694 + return; 1.695 + } 1.696 + } 1.697 + bucket->records_->addElement(r, errorCode); 1.698 + } 1.699 +} 1.700 + 1.701 +void AlphabeticIndex::clearBuckets() { 1.702 + if (buckets_ != NULL) { 1.703 + delete buckets_; 1.704 + buckets_ = NULL; 1.705 + internalResetBucketIterator(); 1.706 + } 1.707 +} 1.708 + 1.709 +void AlphabeticIndex::internalResetBucketIterator() { 1.710 + labelsIterIndex_ = -1; 1.711 + currentBucket_ = NULL; 1.712 +} 1.713 + 1.714 + 1.715 +void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 1.716 + if (U_FAILURE(status)) { return; } 1.717 + // Chinese index characters, which are specific to each of the several Chinese tailorings, 1.718 + // take precedence over the single locale data exemplar set per language. 1.719 + const char *language = locale.getLanguage(); 1.720 + if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || 1.721 + uprv_strcmp(language, "ko") == 0) { 1.722 + // TODO: This should be done regardless of the language, but it's expensive. 1.723 + // We should add a Collator function (can be @internal) 1.724 + // to enumerate just the contractions that start with a given code point or string. 1.725 + if (addChineseIndexCharacters(status) || U_FAILURE(status)) { 1.726 + return; 1.727 + } 1.728 + } 1.729 + 1.730 + LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 1.731 + if (U_FAILURE(status)) { 1.732 + return; 1.733 + } 1.734 + 1.735 + UnicodeSet exemplars; 1.736 + ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 1.737 + if (U_SUCCESS(status)) { 1.738 + initialLabels_->addAll(exemplars); 1.739 + return; 1.740 + } 1.741 + status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 1.742 + 1.743 + // The locale data did not include explicit Index characters. 1.744 + // Synthesize a set of them from the locale's standard exemplar characters. 1.745 + ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 1.746 + if (U_FAILURE(status)) { 1.747 + return; 1.748 + } 1.749 + 1.750 + // question: should we add auxiliary exemplars? 1.751 + if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 1.752 + exemplars.add(0x61, 0x7A); 1.753 + } 1.754 + if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 1.755 + // cut down to small list 1.756 + exemplars.remove(0xAC00, 0xD7A3). 1.757 + add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 1.758 + add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 1.759 + add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 1.760 + add(0xD30C).add(0xD558); 1.761 + } 1.762 + if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 1.763 + // cut down to small list 1.764 + // make use of the fact that Ethiopic is allocated in 8's, where 1.765 + // the base is 0 mod 8. 1.766 + UnicodeSet ethiopic( 1.767 + UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 1.768 + UnicodeSetIterator it(ethiopic); 1.769 + while (it.next() && !it.isString()) { 1.770 + if ((it.getCodepoint() & 0x7) != 0) { 1.771 + exemplars.remove(it.getCodepoint()); 1.772 + } 1.773 + } 1.774 + } 1.775 + 1.776 + // Upper-case any that aren't already so. 1.777 + // (We only do this for synthesized index characters.) 1.778 + UnicodeSetIterator it(exemplars); 1.779 + UnicodeString upperC; 1.780 + while (it.next()) { 1.781 + const UnicodeString &exemplarC = it.getString(); 1.782 + upperC = exemplarC; 1.783 + upperC.toUpper(locale); 1.784 + initialLabels_->add(upperC); 1.785 + } 1.786 +} 1.787 + 1.788 +UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 1.789 + UnicodeSet contractions; 1.790 + ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), 1.791 + contractions.toUSet(), NULL, FALSE, &errorCode); 1.792 + if (U_FAILURE(errorCode)) { return FALSE; } 1.793 + UnicodeString firstHanBoundary; 1.794 + UBool hasPinyin = FALSE; 1.795 + UnicodeSetIterator iter(contractions); 1.796 + while (iter.next()) { 1.797 + const UnicodeString &s = iter.getString(); 1.798 + if (s.startsWith(BASE, BASE_LENGTH)) { 1.799 + initialLabels_->add(s); 1.800 + if (firstHanBoundary.isEmpty() || 1.801 + collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { 1.802 + firstHanBoundary = s; 1.803 + } 1.804 + UChar c = s.charAt(s.length() - 1); 1.805 + if (0x41 <= c && c <= 0x5A) { // A-Z 1.806 + hasPinyin = TRUE; 1.807 + } 1.808 + } 1.809 + } 1.810 + if (hasPinyin) { 1.811 + initialLabels_->add(0x41, 0x5A); // A-Z 1.812 + } 1.813 + if (!firstHanBoundary.isEmpty()) { 1.814 + // The hardcoded list of script boundaries includes U+4E00 1.815 + // which is tailored to not be the first primary 1.816 + // in all Chinese tailorings except "unihan". 1.817 + // Replace U+4E00 with the first boundary string from the tailoring. 1.818 + // TODO: This becomes obsolete when the root collator gets 1.819 + // reliable script-first-primary mappings. 1.820 + int32_t hanIndex = binarySearch( 1.821 + *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); 1.822 + if (hanIndex >= 0) { 1.823 + UnicodeString *fh = new UnicodeString(firstHanBoundary); 1.824 + if (fh == NULL) { 1.825 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.826 + return FALSE; 1.827 + } 1.828 + firstCharsInScripts_->setElementAt(fh, hanIndex); 1.829 + } 1.830 + return TRUE; 1.831 + } else { 1.832 + return FALSE; 1.833 + } 1.834 +} 1.835 + 1.836 + 1.837 +/* 1.838 + * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 1.839 + */ 1.840 +static const UChar CGJ = 0x034F; 1.841 +UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 1.842 + UnicodeString result; 1.843 + if (item.length() == 0) { 1.844 + return result; 1.845 + } 1.846 + int32_t i = 0; 1.847 + for (;;) { 1.848 + UChar32 cp = item.char32At(i); 1.849 + result.append(cp); 1.850 + i = item.moveIndex32(i, 1); 1.851 + if (i >= item.length()) { 1.852 + break; 1.853 + } 1.854 + result.append(CGJ); 1.855 + } 1.856 + return result; 1.857 +} 1.858 + 1.859 + 1.860 +UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 1.861 + return FALSE; 1.862 +} 1.863 + 1.864 + 1.865 +UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 1.866 + return FALSE; 1.867 +} 1.868 + 1.869 + 1.870 +const RuleBasedCollator &AlphabeticIndex::getCollator() const { 1.871 + // There are no known non-RuleBasedCollator collators, and none ever expected. 1.872 + // But, in case that changes, better a null pointer than a wrong type. 1.873 + return *dynamic_cast<RuleBasedCollator *>(collator_); 1.874 +} 1.875 + 1.876 + 1.877 +const UnicodeString &AlphabeticIndex::getInflowLabel() const { 1.878 + return inflowLabel_; 1.879 +} 1.880 + 1.881 +const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 1.882 + return overflowLabel_; 1.883 +} 1.884 + 1.885 + 1.886 +const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 1.887 + return underflowLabel_; 1.888 +} 1.889 + 1.890 + 1.891 +AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 1.892 + inflowLabel_ = label; 1.893 + clearBuckets(); 1.894 + return *this; 1.895 +} 1.896 + 1.897 + 1.898 +AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 1.899 + overflowLabel_ = label; 1.900 + clearBuckets(); 1.901 + return *this; 1.902 +} 1.903 + 1.904 + 1.905 +AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 1.906 + underflowLabel_ = label; 1.907 + clearBuckets(); 1.908 + return *this; 1.909 +} 1.910 + 1.911 + 1.912 +int32_t AlphabeticIndex::getMaxLabelCount() const { 1.913 + return maxLabelCount_; 1.914 +} 1.915 + 1.916 + 1.917 +AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 1.918 + if (U_FAILURE(status)) { 1.919 + return *this; 1.920 + } 1.921 + if (maxLabelCount <= 0) { 1.922 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.923 + return *this; 1.924 + } 1.925 + maxLabelCount_ = maxLabelCount; 1.926 + clearBuckets(); 1.927 + return *this; 1.928 +} 1.929 + 1.930 + 1.931 +// 1.932 +// init() - Common code for constructors. 1.933 +// 1.934 + 1.935 +void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 1.936 + if (U_FAILURE(status)) { return; } 1.937 + if (locale == NULL && collator_ == NULL) { 1.938 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.939 + return; 1.940 + } 1.941 + 1.942 + initialLabels_ = new UnicodeSet(); 1.943 + if (initialLabels_ == NULL) { 1.944 + status = U_MEMORY_ALLOCATION_ERROR; 1.945 + return; 1.946 + } 1.947 + 1.948 + inflowLabel_.setTo((UChar)0x2026); // Ellipsis 1.949 + overflowLabel_ = inflowLabel_; 1.950 + underflowLabel_ = inflowLabel_; 1.951 + 1.952 + if (collator_ == NULL) { 1.953 + collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); 1.954 + if (U_FAILURE(status)) { return; } 1.955 + if (collator_ == NULL) { 1.956 + status = U_MEMORY_ALLOCATION_ERROR; 1.957 + return; 1.958 + } 1.959 + } 1.960 + collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 1.961 + if (collatorPrimaryOnly_ == NULL) { 1.962 + status = U_MEMORY_ALLOCATION_ERROR; 1.963 + return; 1.964 + } 1.965 + collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 1.966 + firstCharsInScripts_ = firstStringsInScript(status); 1.967 + if (U_FAILURE(status)) { return; } 1.968 + firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 1.969 + UnicodeString _4E00((UChar)0x4E00); 1.970 + UnicodeString _1100((UChar)0x1100); 1.971 + UnicodeString _1112((UChar)0x1112); 1.972 + if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && 1.973 + collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { 1.974 + // The standard Korean tailoring sorts Hanja (Han characters) 1.975 + // as secondary differences from Hangul syllables. 1.976 + // This makes U+4E00 not useful as a Han-script boundary. 1.977 + // TODO: This becomes obsolete when the root collator gets 1.978 + // reliable script-first-primary mappings. 1.979 + int32_t hanIndex = binarySearch( 1.980 + *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); 1.981 + if (hanIndex >= 0) { 1.982 + firstCharsInScripts_->removeElementAt(hanIndex); 1.983 + } 1.984 + } 1.985 + // Guard against a degenerate collator where 1.986 + // some script boundary strings are primary ignorable. 1.987 + for (;;) { 1.988 + if (U_FAILURE(status)) { return; } 1.989 + if (firstCharsInScripts_->isEmpty()) { 1.990 + // AlphabeticIndex requires some non-ignorable script boundary strings. 1.991 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.992 + return; 1.993 + } 1.994 + if (collatorPrimaryOnly_->compare( 1.995 + *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 1.996 + emptyString_, status) == UCOL_EQUAL) { 1.997 + firstCharsInScripts_->removeElementAt(0); 1.998 + } else { 1.999 + break; 1.1000 + } 1.1001 + } 1.1002 + 1.1003 + if (locale != NULL) { 1.1004 + addIndexExemplars(*locale, status); 1.1005 + } 1.1006 +} 1.1007 + 1.1008 + 1.1009 +// 1.1010 +// Comparison function for UVector<UnicodeString *> sorting with a collator. 1.1011 +// 1.1012 +static int32_t U_CALLCONV 1.1013 +collatorComparator(const void *context, const void *left, const void *right) { 1.1014 + const UElement *leftElement = static_cast<const UElement *>(left); 1.1015 + const UElement *rightElement = static_cast<const UElement *>(right); 1.1016 + const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 1.1017 + const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 1.1018 + 1.1019 + if (leftString == rightString) { 1.1020 + // Catches case where both are NULL 1.1021 + return 0; 1.1022 + } 1.1023 + if (leftString == NULL) { 1.1024 + return 1; 1.1025 + }; 1.1026 + if (rightString == NULL) { 1.1027 + return -1; 1.1028 + } 1.1029 + const Collator *col = static_cast<const Collator *>(context); 1.1030 + UErrorCode errorCode = U_ZERO_ERROR; 1.1031 + return col->compare(*leftString, *rightString, errorCode); 1.1032 +} 1.1033 + 1.1034 +// 1.1035 +// Comparison function for UVector<Record *> sorting with a collator. 1.1036 +// 1.1037 +static int32_t U_CALLCONV 1.1038 +recordCompareFn(const void *context, const void *left, const void *right) { 1.1039 + const UElement *leftElement = static_cast<const UElement *>(left); 1.1040 + const UElement *rightElement = static_cast<const UElement *>(right); 1.1041 + const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 1.1042 + const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 1.1043 + const Collator *col = static_cast<const Collator *>(context); 1.1044 + UErrorCode errorCode = U_ZERO_ERROR; 1.1045 + return col->compare(leftRec->name_, rightRec->name_, errorCode); 1.1046 +} 1.1047 + 1.1048 + 1.1049 +/** 1.1050 + * This list contains one character per script that has the 1.1051 + * lowest primary weight for that script in the root collator. 1.1052 + * This list will be copied and sorted to account for script reordering. 1.1053 + * 1.1054 + * <p>TODO: This is fragile. If the first character of a script is tailored 1.1055 + * so that it does not map to the script's lowest primary weight any more, 1.1056 + * then the buckets will be off. 1.1057 + * There are hacks in the code to handle the known CJK tailorings of U+4E00. 1.1058 + * 1.1059 + * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. 1.1060 + * 1.1061 + * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in 1.1062 + * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java 1.1063 + */ 1.1064 +static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 1.1065 + 0x41, 0, 0x03B1, 0, 1.1066 + 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, 1.1067 + 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, 1.1068 + 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 1.1069 + 0xAAF2, 0, // Meetei Mayek 1.1070 + 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 1.1071 + U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada 1.1072 + U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri 1.1073 + 0x1B83, 0, 1.1074 + 0xD802, 0xDE00, 0, 0x0E01, 0, 1.1075 + 0x0EDE, 0, // Lao 1.1076 + 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, 1.1077 + 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 1.1078 + U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma 1.1079 + 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, 1.1080 + 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, 1.1081 + 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 1.1082 + U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao 1.1083 + 0xD800, 0xDE80, 0, 1.1084 + 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, 1.1085 + 0xD801, 0xDC80, 0, 1.1086 + U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng 1.1087 + 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, 1.1088 + 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 1.1089 + U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive 1.1090 + U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs 1.1091 + 0x4E00, 0, 1.1092 + // TODO: The overflow bucket's lowerBoundary string should be the 1.1093 + // first item after the last reordering group in the collator's script order. 1.1094 + // This should normally be the first Unicode code point 1.1095 + // that is unassigned (U+0378 in Unicode 6.3) and untailored. 1.1096 + // However, at least up to ICU 51 the Hani reordering group includes 1.1097 + // unassigned code points, 1.1098 + // and there is no stable string for the start of the trailing-weights range. 1.1099 + // The only known string that sorts "high" is U+FFFF. 1.1100 + // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, 1.1101 + // and fix relevant test code. 1.1102 + // Ideally, FractionalUCA.txt will have a "script first primary" 1.1103 + // for unassigned code points. 1.1104 + 0xFFFF, 0 1.1105 +}; 1.1106 + 1.1107 +UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 1.1108 + if (U_FAILURE(status)) { 1.1109 + return NULL; 1.1110 + } 1.1111 + UVector *dest = new UVector(status); 1.1112 + if (dest == NULL) { 1.1113 + status = U_MEMORY_ALLOCATION_ERROR; 1.1114 + return NULL; 1.1115 + } 1.1116 + dest->setDeleter(uprv_deleteUObject); 1.1117 + const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; 1.1118 + const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); 1.1119 + do { 1.1120 + if (U_FAILURE(status)) { 1.1121 + return dest; 1.1122 + } 1.1123 + UnicodeString *str = new UnicodeString(src, -1); 1.1124 + if (str == NULL) { 1.1125 + status = U_MEMORY_ALLOCATION_ERROR; 1.1126 + return dest; 1.1127 + } 1.1128 + dest->addElement(str, status); 1.1129 + src += str->length() + 1; 1.1130 + } while (src < limit); 1.1131 + return dest; 1.1132 +} 1.1133 + 1.1134 + 1.1135 +namespace { 1.1136 + 1.1137 +/** 1.1138 + * Returns true if one index character string is "better" than the other. 1.1139 + * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1.1140 + * better, and otherwise binary-less-than is better. 1.1141 + */ 1.1142 +UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1.1143 + const UnicodeString &one, const UnicodeString &other) { 1.1144 + // This is called with primary-equal strings, but never with one.equals(other). 1.1145 + UErrorCode status = U_ZERO_ERROR; 1.1146 + UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1.1147 + UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1.1148 + if (U_FAILURE(status)) { return FALSE; } 1.1149 + int32_t result = n1.countChar32() - n2.countChar32(); 1.1150 + if (result != 0) { 1.1151 + return result < 0; 1.1152 + } 1.1153 + result = n1.compareCodePointOrder(n2); 1.1154 + if (result != 0) { 1.1155 + return result < 0; 1.1156 + } 1.1157 + return one.compareCodePointOrder(other) < 0; 1.1158 +} 1.1159 + 1.1160 +} // namespace 1.1161 + 1.1162 +// 1.1163 +// Constructor & Destructor for AlphabeticIndex::Record 1.1164 +// 1.1165 +// Records are internal only, instances are not directly surfaced in the public API. 1.1166 +// This class is mostly struct-like, with all public fields. 1.1167 + 1.1168 +AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1.1169 + : name_(name), data_(data) {} 1.1170 + 1.1171 +AlphabeticIndex::Record::~Record() { 1.1172 +} 1.1173 + 1.1174 + 1.1175 +AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1.1176 + if (U_FAILURE(status)) { 1.1177 + return *this; 1.1178 + } 1.1179 + if (inputList_ == NULL) { 1.1180 + inputList_ = new UVector(status); 1.1181 + if (inputList_ == NULL) { 1.1182 + status = U_MEMORY_ALLOCATION_ERROR; 1.1183 + return *this; 1.1184 + } 1.1185 + inputList_->setDeleter(alphaIndex_deleteRecord); 1.1186 + } 1.1187 + Record *r = new Record(name, data); 1.1188 + if (r == NULL) { 1.1189 + status = U_MEMORY_ALLOCATION_ERROR; 1.1190 + return *this; 1.1191 + } 1.1192 + inputList_->addElement(r, status); 1.1193 + clearBuckets(); 1.1194 + //std::string ss; 1.1195 + //std::string ss2; 1.1196 + //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1.1197 + // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1.1198 + return *this; 1.1199 +} 1.1200 + 1.1201 + 1.1202 +AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1.1203 + if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1.1204 + inputList_->removeAllElements(); 1.1205 + clearBuckets(); 1.1206 + } 1.1207 + return *this; 1.1208 +} 1.1209 + 1.1210 +int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1.1211 + initBuckets(status); 1.1212 + if (U_FAILURE(status)) { 1.1213 + return 0; 1.1214 + } 1.1215 + return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1.1216 +} 1.1217 + 1.1218 + 1.1219 +int32_t AlphabeticIndex::getBucketIndex() const { 1.1220 + return labelsIterIndex_; 1.1221 +} 1.1222 + 1.1223 + 1.1224 +UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1.1225 + if (U_FAILURE(status)) { 1.1226 + return FALSE; 1.1227 + } 1.1228 + if (buckets_ == NULL && currentBucket_ != NULL) { 1.1229 + status = U_ENUM_OUT_OF_SYNC_ERROR; 1.1230 + return FALSE; 1.1231 + } 1.1232 + initBuckets(status); 1.1233 + if (U_FAILURE(status)) { 1.1234 + return FALSE; 1.1235 + } 1.1236 + ++labelsIterIndex_; 1.1237 + if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1.1238 + labelsIterIndex_ = buckets_->getBucketCount(); 1.1239 + return FALSE; 1.1240 + } 1.1241 + currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1.1242 + resetRecordIterator(); 1.1243 + return TRUE; 1.1244 +} 1.1245 + 1.1246 +const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1.1247 + if (currentBucket_ != NULL) { 1.1248 + return currentBucket_->label_; 1.1249 + } else { 1.1250 + return emptyString_; 1.1251 + } 1.1252 +} 1.1253 + 1.1254 + 1.1255 +UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1.1256 + if (currentBucket_ != NULL) { 1.1257 + return currentBucket_->labelType_; 1.1258 + } else { 1.1259 + return U_ALPHAINDEX_NORMAL; 1.1260 + } 1.1261 +} 1.1262 + 1.1263 + 1.1264 +int32_t AlphabeticIndex::getBucketRecordCount() const { 1.1265 + if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1.1266 + return currentBucket_->records_->size(); 1.1267 + } else { 1.1268 + return 0; 1.1269 + } 1.1270 +} 1.1271 + 1.1272 + 1.1273 +AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1.1274 + if (U_FAILURE(status)) { 1.1275 + return *this; 1.1276 + } 1.1277 + internalResetBucketIterator(); 1.1278 + return *this; 1.1279 +} 1.1280 + 1.1281 + 1.1282 +UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1.1283 + if (U_FAILURE(status)) { 1.1284 + return FALSE; 1.1285 + } 1.1286 + if (currentBucket_ == NULL) { 1.1287 + // We are trying to iterate over the items in a bucket, but there is no 1.1288 + // current bucket from the enumeration of buckets. 1.1289 + status = U_INVALID_STATE_ERROR; 1.1290 + return FALSE; 1.1291 + } 1.1292 + if (buckets_ == NULL) { 1.1293 + status = U_ENUM_OUT_OF_SYNC_ERROR; 1.1294 + return FALSE; 1.1295 + } 1.1296 + if (currentBucket_->records_ == NULL) { 1.1297 + return FALSE; 1.1298 + } 1.1299 + ++itemsIterIndex_; 1.1300 + if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1.1301 + itemsIterIndex_ = currentBucket_->records_->size(); 1.1302 + return FALSE; 1.1303 + } 1.1304 + return TRUE; 1.1305 +} 1.1306 + 1.1307 + 1.1308 +const UnicodeString &AlphabeticIndex::getRecordName() const { 1.1309 + const UnicodeString *retStr = &emptyString_; 1.1310 + if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1.1311 + itemsIterIndex_ >= 0 && 1.1312 + itemsIterIndex_ < currentBucket_->records_->size()) { 1.1313 + Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1.1314 + retStr = &item->name_; 1.1315 + } 1.1316 + return *retStr; 1.1317 +} 1.1318 + 1.1319 +const void *AlphabeticIndex::getRecordData() const { 1.1320 + const void *retPtr = NULL; 1.1321 + if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1.1322 + itemsIterIndex_ >= 0 && 1.1323 + itemsIterIndex_ < currentBucket_->records_->size()) { 1.1324 + Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1.1325 + retPtr = item->data_; 1.1326 + } 1.1327 + return retPtr; 1.1328 +} 1.1329 + 1.1330 + 1.1331 +AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1.1332 + itemsIterIndex_ = -1; 1.1333 + return *this; 1.1334 +} 1.1335 + 1.1336 + 1.1337 + 1.1338 +AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1.1339 + const UnicodeString &lowerBoundary, 1.1340 + UAlphabeticIndexLabelType type) 1.1341 + : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1.1342 + displayBucket_(NULL), displayIndex_(-1), 1.1343 + records_(NULL) { 1.1344 +} 1.1345 + 1.1346 + 1.1347 +AlphabeticIndex::Bucket::~Bucket() { 1.1348 + delete records_; 1.1349 +} 1.1350 + 1.1351 +U_NAMESPACE_END 1.1352 + 1.1353 +#endif