michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2009-2013, International Business Machines Corporation and michael@0: * others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/alphaindex.h" michael@0: #include "unicode/coleitr.h" michael@0: #include "unicode/coll.h" michael@0: #include "unicode/localpointer.h" michael@0: #include "unicode/normalizer2.h" michael@0: #include "unicode/tblcoll.h" michael@0: #include "unicode/ulocdata.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/usetiter.h" michael@0: #include "unicode/utf16.h" michael@0: michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "uassert.h" michael@0: #include "uvector.h" michael@0: michael@0: //#include michael@0: //#include michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: namespace { michael@0: michael@0: /** michael@0: * Prefix string for Chinese index buckets. michael@0: * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes michael@0: */ michael@0: const UChar BASE[1] = { 0xFDD0 }; michael@0: const int32_t BASE_LENGTH = 1; michael@0: michael@0: UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, michael@0: const UnicodeString &one, const UnicodeString &other); michael@0: michael@0: } // namespace michael@0: michael@0: static int32_t U_CALLCONV michael@0: collatorComparator(const void *context, const void *left, const void *right); michael@0: michael@0: static int32_t U_CALLCONV michael@0: recordCompareFn(const void *context, const void *left, const void *right); michael@0: michael@0: // UVector support function, delete a Record. michael@0: static void U_CALLCONV michael@0: alphaIndex_deleteRecord(void *obj) { michael@0: delete static_cast(obj); michael@0: } michael@0: michael@0: namespace { michael@0: michael@0: UnicodeString *ownedString(const UnicodeString &s, LocalPointer &owned, michael@0: UErrorCode &errorCode) { michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: if (owned.isValid()) { michael@0: return owned.orphan(); michael@0: } michael@0: UnicodeString *p = new UnicodeString(s); michael@0: if (p == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return p; michael@0: } michael@0: michael@0: inline UnicodeString *getString(const UVector &list, int32_t i) { michael@0: return static_cast(list[i]); michael@0: } michael@0: michael@0: inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { michael@0: return static_cast(list[i]); michael@0: } michael@0: michael@0: inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { michael@0: return static_cast(list[i]); michael@0: } michael@0: michael@0: /** michael@0: * Like Java Collections.binarySearch(List, String, Comparator). michael@0: * michael@0: * @return the index>=0 where the item was found, michael@0: * or the index<0 for inserting the string at ~index in sorted order michael@0: */ michael@0: int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { michael@0: if (list.size() == 0) { return ~0; } michael@0: int32_t start = 0; michael@0: int32_t limit = list.size(); michael@0: for (;;) { michael@0: int32_t i = (start + limit) / 2; michael@0: const UnicodeString *si = static_cast(list.elementAt(i)); michael@0: UErrorCode errorCode = U_ZERO_ERROR; michael@0: UCollationResult cmp = coll.compare(s, *si, errorCode); michael@0: if (cmp == UCOL_EQUAL) { michael@0: return i; michael@0: } else if (cmp < 0) { michael@0: if (i == start) { michael@0: return ~start; // insert s before *si michael@0: } michael@0: limit = i; michael@0: } else { michael@0: if (i == start) { michael@0: return ~(start + 1); // insert s after *si michael@0: } michael@0: start = i; michael@0: } michael@0: } michael@0: } michael@0: michael@0: } // namespace michael@0: michael@0: // The BucketList is not in the anonymous namespace because only Clang michael@0: // seems to support its use in other classes from there. michael@0: // However, we also don't need U_I18N_API because it is not used from outside the i18n library. michael@0: class BucketList : public UObject { michael@0: public: michael@0: BucketList(UVector *bucketList, UVector *publicBucketList) michael@0: : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { michael@0: int32_t displayIndex = 0; michael@0: for (int32_t i = 0; i < publicBucketList->size(); ++i) { michael@0: getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; michael@0: } michael@0: } michael@0: michael@0: // The virtual destructor must not be inline. michael@0: // See ticket #8454 for details. michael@0: virtual ~BucketList(); michael@0: michael@0: int32_t getBucketCount() const { michael@0: return immutableVisibleList_->size(); michael@0: } michael@0: michael@0: int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, michael@0: UErrorCode &errorCode) { michael@0: // binary search michael@0: int32_t start = 0; michael@0: int32_t limit = bucketList_->size(); michael@0: while ((start + 1) < limit) { michael@0: int32_t i = (start + limit) / 2; michael@0: const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); michael@0: UCollationResult nameVsBucket = michael@0: collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); michael@0: if (nameVsBucket < 0) { michael@0: limit = i; michael@0: } else { michael@0: start = i; michael@0: } michael@0: } michael@0: const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); michael@0: if (bucket->displayBucket_ != NULL) { michael@0: bucket = bucket->displayBucket_; michael@0: } michael@0: return bucket->displayIndex_; michael@0: } michael@0: michael@0: /** All of the buckets, visible and invisible. */ michael@0: UVector *bucketList_; michael@0: /** Just the visible buckets. */ michael@0: UVector *immutableVisibleList_; michael@0: }; michael@0: michael@0: BucketList::~BucketList() { michael@0: delete bucketList_; michael@0: if (immutableVisibleList_ != bucketList_) { michael@0: delete immutableVisibleList_; michael@0: } michael@0: } michael@0: michael@0: AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { michael@0: delete buckets_; michael@0: delete collatorPrimaryOnly_; michael@0: } michael@0: michael@0: int32_t michael@0: AlphabeticIndex::ImmutableIndex::getBucketCount() const { michael@0: return buckets_->getBucketCount(); michael@0: } michael@0: michael@0: int32_t michael@0: AlphabeticIndex::ImmutableIndex::getBucketIndex( michael@0: const UnicodeString &name, UErrorCode &errorCode) const { michael@0: return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); michael@0: } michael@0: michael@0: const AlphabeticIndex::Bucket * michael@0: AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { michael@0: if (0 <= index && index < buckets_->getBucketCount()) { michael@0: return icu::getBucket(*buckets_->immutableVisibleList_, index); michael@0: } else { michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) michael@0: : inputList_(NULL), michael@0: labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), michael@0: maxLabelCount_(99), michael@0: initialLabels_(NULL), firstCharsInScripts_(NULL), michael@0: collator_(NULL), collatorPrimaryOnly_(NULL), michael@0: buckets_(NULL) { michael@0: init(&locale, status); michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) michael@0: : inputList_(NULL), michael@0: labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), michael@0: maxLabelCount_(99), michael@0: initialLabels_(NULL), firstCharsInScripts_(NULL), michael@0: collator_(collator), collatorPrimaryOnly_(NULL), michael@0: buckets_(NULL) { michael@0: init(NULL, status); michael@0: } michael@0: michael@0: michael@0: michael@0: AlphabeticIndex::~AlphabeticIndex() { michael@0: delete collator_; michael@0: delete collatorPrimaryOnly_; michael@0: delete firstCharsInScripts_; michael@0: delete buckets_; michael@0: delete inputList_; michael@0: delete initialLabels_; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: initialLabels_->addAll(additions); michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { michael@0: addIndexExemplars(locale, status); michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: // In C++, the ImmutableIndex must own its copy of the BucketList, michael@0: // even if it contains no records, for proper memory management. michael@0: // We could clone the buckets_ if they are not NULL, michael@0: // but that would be worth it only if this method is called multiple times, michael@0: // or called after using the old-style bucket iterator API. michael@0: LocalPointer immutableBucketList(createBucketList(errorCode)); michael@0: LocalPointer coll( michael@0: static_cast(collatorPrimaryOnly_->clone())); michael@0: if (immutableBucketList.isNull() || coll.isNull()) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); michael@0: if (immIndex == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: // The ImmutableIndex adopted its parameter objects. michael@0: immutableBucketList.orphan(); michael@0: coll.orphan(); michael@0: return immIndex; michael@0: } michael@0: michael@0: int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { michael@0: initBuckets(status); michael@0: if (U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: return buckets_->getBucketCount(); michael@0: } michael@0: michael@0: michael@0: int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { michael@0: if (U_FAILURE(status) || inputList_ == NULL) { michael@0: return 0; michael@0: } michael@0: return inputList_->size(); michael@0: } michael@0: michael@0: void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { michael@0: const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); michael@0: if (U_FAILURE(errorCode)) { return; } michael@0: michael@0: const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); michael@0: const UnicodeString &overflowBoundary = michael@0: *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); michael@0: michael@0: // We make a sorted array of elements. michael@0: // Some of the input may be redundant. michael@0: // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". michael@0: // We filter out those cases. michael@0: UnicodeSetIterator iter(*initialLabels_); michael@0: while (iter.next()) { michael@0: const UnicodeString *item = &iter.getString(); michael@0: LocalPointer ownedItem; michael@0: UBool checkDistinct; michael@0: int32_t itemLength = item->length(); michael@0: if (!item->hasMoreChar32Than(0, itemLength, 1)) { michael@0: checkDistinct = FALSE; michael@0: } else if(item->charAt(itemLength - 1) == 0x2a && // '*' michael@0: item->charAt(itemLength - 2) != 0x2a) { michael@0: // Use a label if it is marked with one trailing star, michael@0: // even if the label string sorts the same when all contractions are suppressed. michael@0: ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); michael@0: item = ownedItem.getAlias(); michael@0: if (item == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: checkDistinct = FALSE; michael@0: } else { michael@0: checkDistinct = TRUE; michael@0: } michael@0: if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { michael@0: // Ignore a primary-ignorable or non-alphabetic index character. michael@0: } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { michael@0: // Ignore an index characters that will land in the overflow bucket. michael@0: } else if (checkDistinct && michael@0: collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { michael@0: // Ignore a multi-code point index character that does not sort distinctly michael@0: // from the sequence of its separate characters. michael@0: } else { michael@0: int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); michael@0: if (insertionPoint < 0) { michael@0: indexCharacters.insertElementAt( michael@0: ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); michael@0: } else { michael@0: const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); michael@0: if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { michael@0: indexCharacters.setElementAt( michael@0: ownedString(*item, ownedItem, errorCode), insertionPoint); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: if (U_FAILURE(errorCode)) { return; } michael@0: michael@0: // if the result is still too large, cut down to maxCount elements, by removing every nth element michael@0: michael@0: int32_t size = indexCharacters.size() - 1; michael@0: if (size > maxLabelCount_) { michael@0: int32_t count = 0; michael@0: int32_t old = -1; michael@0: for (int32_t i = 0; i < indexCharacters.size();) { michael@0: ++count; michael@0: int32_t bump = count * maxLabelCount_ / size; michael@0: if (bump == old) { michael@0: indexCharacters.removeElementAt(i); michael@0: } else { michael@0: old = bump; michael@0: ++i; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: namespace { michael@0: michael@0: const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { michael@0: if (!current.startsWith(BASE, BASE_LENGTH)) { michael@0: return current; michael@0: } michael@0: UChar rest = current.charAt(BASE_LENGTH); michael@0: if (0x2800 < rest && rest <= 0x28FF) { // stroke count michael@0: int32_t count = rest-0x2800; michael@0: temp.setTo((UChar)(0x30 + count % 10)); michael@0: if (count >= 10) { michael@0: count /= 10; michael@0: temp.insert(0, (UChar)(0x30 + count % 10)); michael@0: if (count >= 10) { michael@0: count /= 10; michael@0: temp.insert(0, (UChar)(0x30 + count)); michael@0: } michael@0: } michael@0: return temp.append((UChar)0x5283); michael@0: } michael@0: return temp.setTo(current, BASE_LENGTH); michael@0: } michael@0: michael@0: UBool hasMultiplePrimaryWeights( michael@0: CollationElementIterator &cei, int32_t variableTop, michael@0: const UnicodeString &s, UErrorCode &errorCode) { michael@0: cei.setText(s, errorCode); michael@0: UBool seenPrimary = FALSE; michael@0: for (;;) { michael@0: int32_t ce32 = cei.next(errorCode); michael@0: if (ce32 == CollationElementIterator::NULLORDER) { michael@0: break; michael@0: } michael@0: int32_t p = CollationElementIterator::primaryOrder(ce32); michael@0: if (p > variableTop && (ce32 & 0xc0) != 0xc0) { michael@0: // not primary ignorable, and not a continuation CE michael@0: if (seenPrimary) { michael@0: return TRUE; michael@0: } michael@0: seenPrimary = TRUE; michael@0: } michael@0: } michael@0: return FALSE; michael@0: } michael@0: michael@0: } // namespace michael@0: michael@0: BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { michael@0: // Initialize indexCharacters. michael@0: UVector indexCharacters(errorCode); michael@0: indexCharacters.setDeleter(uprv_deleteUObject); michael@0: initLabels(indexCharacters, errorCode); michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: michael@0: // Variables for hasMultiplePrimaryWeights(). michael@0: LocalPointer cei( michael@0: collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); michael@0: if (cei.isNull()) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: int32_t variableTop; michael@0: if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { michael@0: variableTop = CollationElementIterator::primaryOrder( michael@0: (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); michael@0: } else { michael@0: variableTop = 0; michael@0: } michael@0: UBool hasInvisibleBuckets = FALSE; michael@0: michael@0: // Helper arrays for Chinese Pinyin collation. michael@0: Bucket *asciiBuckets[26] = { michael@0: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, michael@0: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL michael@0: }; michael@0: Bucket *pinyinBuckets[26] = { michael@0: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, michael@0: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL michael@0: }; michael@0: UBool hasPinyin = FALSE; michael@0: michael@0: LocalPointer bucketList(new UVector(errorCode)); michael@0: if (bucketList.isNull()) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList->setDeleter(uprv_deleteUObject); michael@0: michael@0: // underflow bucket michael@0: Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); michael@0: if (bucket == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList->addElement(bucket, errorCode); michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: michael@0: UnicodeString temp; michael@0: michael@0: // fix up the list, adding underflow, additions, overflow michael@0: // Insert inflow labels as needed. michael@0: int32_t scriptIndex = -1; michael@0: const UnicodeString *scriptUpperBoundary = &emptyString_; michael@0: for (int32_t i = 0; i < indexCharacters.size(); ++i) { michael@0: UnicodeString ¤t = *getString(indexCharacters, i); michael@0: if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { michael@0: // We crossed the script boundary into a new script. michael@0: const UnicodeString &inflowBoundary = *scriptUpperBoundary; michael@0: UBool skippedScript = FALSE; michael@0: for (;;) { michael@0: scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); michael@0: if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { michael@0: break; michael@0: } michael@0: skippedScript = TRUE; michael@0: } michael@0: if (skippedScript && bucketList->size() > 1) { michael@0: // We are skipping one or more scripts, michael@0: // and we are not just getting out of the underflow label. michael@0: bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); michael@0: if (bucket == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList->addElement(bucket, errorCode); michael@0: } michael@0: } michael@0: // Add a bucket with the current label. michael@0: bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); michael@0: if (bucket == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList->addElement(bucket, errorCode); michael@0: // Remember ASCII and Pinyin buckets for Pinyin redirects. michael@0: UChar c; michael@0: if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z michael@0: asciiBuckets[c - 0x41] = bucket; michael@0: } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && michael@0: 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { michael@0: pinyinBuckets[c - 0x41] = bucket; michael@0: hasPinyin = TRUE; michael@0: } michael@0: // Check for multiple primary weights. michael@0: if (!current.startsWith(BASE, BASE_LENGTH) && michael@0: hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && michael@0: current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { michael@0: // "AE-ligature" or "Sch" etc. michael@0: for (int32_t i = bucketList->size() - 2;; --i) { michael@0: Bucket *singleBucket = getBucket(*bucketList, i); michael@0: if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { michael@0: // There is no single-character bucket since the last michael@0: // underflow or inflow label. michael@0: break; michael@0: } michael@0: if (singleBucket->displayBucket_ == NULL && michael@0: !hasMultiplePrimaryWeights( michael@0: *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { michael@0: // Add an invisible bucket that redirects strings greater than the expansion michael@0: // to the previous single-character bucket. michael@0: // For example, after ... Q R S Sch we add Sch\uFFFF->S michael@0: // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. michael@0: bucket = new Bucket(emptyString_, michael@0: UnicodeString(current).append((UChar)0xFFFF), michael@0: U_ALPHAINDEX_NORMAL); michael@0: if (bucket == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucket->displayBucket_ = singleBucket; michael@0: bucketList->addElement(bucket, errorCode); michael@0: hasInvisibleBuckets = TRUE; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: if (bucketList->size() == 1) { michael@0: // No real labels, show only the underflow label. michael@0: BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); michael@0: if (bl == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList.orphan(); michael@0: return bl; michael@0: } michael@0: // overflow bucket michael@0: bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); michael@0: if (bucket == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList->addElement(bucket, errorCode); // final michael@0: michael@0: if (hasPinyin) { michael@0: // Redirect Pinyin buckets. michael@0: Bucket *asciiBucket = NULL; michael@0: for (int32_t i = 0; i < 26; ++i) { michael@0: if (asciiBuckets[i] != NULL) { michael@0: asciiBucket = asciiBuckets[i]; michael@0: } michael@0: if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { michael@0: pinyinBuckets[i]->displayBucket_ = asciiBucket; michael@0: hasInvisibleBuckets = TRUE; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: if (!hasInvisibleBuckets) { michael@0: BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); michael@0: if (bl == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList.orphan(); michael@0: return bl; michael@0: } michael@0: // Merge inflow buckets that are visually adjacent. michael@0: // Iterate backwards: Merge inflow into overflow rather than the other way around. michael@0: int32_t i = bucketList->size() - 1; michael@0: Bucket *nextBucket = getBucket(*bucketList, i); michael@0: while (--i > 0) { michael@0: bucket = getBucket(*bucketList, i); michael@0: if (bucket->displayBucket_ != NULL) { michael@0: continue; // skip invisible buckets michael@0: } michael@0: if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { michael@0: if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { michael@0: bucket->displayBucket_ = nextBucket; michael@0: continue; michael@0: } michael@0: } michael@0: nextBucket = bucket; michael@0: } michael@0: michael@0: LocalPointer publicBucketList(new UVector(errorCode)); michael@0: if (bucketList.isNull()) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: // Do not call publicBucketList->setDeleter(): michael@0: // This vector shares its objects with the bucketList. michael@0: for (int32_t i = 0; i < bucketList->size(); ++i) { michael@0: bucket = getBucket(*bucketList, i); michael@0: if (bucket->displayBucket_ == NULL) { michael@0: publicBucketList->addElement(bucket, errorCode); michael@0: } michael@0: } michael@0: if (U_FAILURE(errorCode)) { return NULL; } michael@0: BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); michael@0: if (bl == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: bucketList.orphan(); michael@0: publicBucketList.orphan(); michael@0: return bl; michael@0: } michael@0: michael@0: /** michael@0: * Creates an index, and buckets and sorts the list of records into the index. michael@0: */ michael@0: void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { michael@0: if (U_FAILURE(errorCode) || buckets_ != NULL) { michael@0: return; michael@0: } michael@0: buckets_ = createBucketList(errorCode); michael@0: if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { michael@0: return; michael@0: } michael@0: michael@0: // Sort the records by name. michael@0: // Stable sort preserves input order of collation duplicates. michael@0: inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); michael@0: michael@0: // Now, we traverse all of the input, which is now sorted. michael@0: // If the item doesn't go in the current bucket, we find the next bucket that contains it. michael@0: // This makes the process order n*log(n), since we just sort the list and then do a linear process. michael@0: // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so michael@0: // we need to improve it for that case. michael@0: michael@0: Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); michael@0: int32_t bucketIndex = 1; michael@0: Bucket *nextBucket; michael@0: const UnicodeString *upperBoundary; michael@0: if (bucketIndex < buckets_->bucketList_->size()) { michael@0: nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); michael@0: upperBoundary = &nextBucket->lowerBoundary_; michael@0: } else { michael@0: nextBucket = NULL; michael@0: upperBoundary = NULL; michael@0: } michael@0: for (int32_t i = 0; i < inputList_->size(); ++i) { michael@0: Record *r = getRecord(*inputList_, i); michael@0: // if the current bucket isn't the right one, find the one that is michael@0: // We have a special flag for the last bucket so that we don't look any further michael@0: while (upperBoundary != NULL && michael@0: collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { michael@0: currentBucket = nextBucket; michael@0: // now reset the boundary that we compare against michael@0: if (bucketIndex < buckets_->bucketList_->size()) { michael@0: nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); michael@0: upperBoundary = &nextBucket->lowerBoundary_; michael@0: } else { michael@0: upperBoundary = NULL; michael@0: } michael@0: } michael@0: // now put the record into the bucket. michael@0: Bucket *bucket = currentBucket; michael@0: if (bucket->displayBucket_ != NULL) { michael@0: bucket = bucket->displayBucket_; michael@0: } michael@0: if (bucket->records_ == NULL) { michael@0: bucket->records_ = new UVector(errorCode); michael@0: if (bucket->records_ == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: bucket->records_->addElement(r, errorCode); michael@0: } michael@0: } michael@0: michael@0: void AlphabeticIndex::clearBuckets() { michael@0: if (buckets_ != NULL) { michael@0: delete buckets_; michael@0: buckets_ = NULL; michael@0: internalResetBucketIterator(); michael@0: } michael@0: } michael@0: michael@0: void AlphabeticIndex::internalResetBucketIterator() { michael@0: labelsIterIndex_ = -1; michael@0: currentBucket_ = NULL; michael@0: } michael@0: michael@0: michael@0: void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { return; } michael@0: // Chinese index characters, which are specific to each of the several Chinese tailorings, michael@0: // take precedence over the single locale data exemplar set per language. michael@0: const char *language = locale.getLanguage(); michael@0: if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || michael@0: uprv_strcmp(language, "ko") == 0) { michael@0: // TODO: This should be done regardless of the language, but it's expensive. michael@0: // We should add a Collator function (can be @internal) michael@0: // to enumerate just the contractions that start with a given code point or string. michael@0: if (addChineseIndexCharacters(status) || U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: UnicodeSet exemplars; michael@0: ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); michael@0: if (U_SUCCESS(status)) { michael@0: initialLabels_->addAll(exemplars); michael@0: return; michael@0: } michael@0: status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR michael@0: michael@0: // The locale data did not include explicit Index characters. michael@0: // Synthesize a set of them from the locale's standard exemplar characters. michael@0: ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: // question: should we add auxiliary exemplars? michael@0: if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { michael@0: exemplars.add(0x61, 0x7A); michael@0: } michael@0: if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables michael@0: // cut down to small list michael@0: exemplars.remove(0xAC00, 0xD7A3). michael@0: add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). michael@0: add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). michael@0: add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). michael@0: add(0xD30C).add(0xD558); michael@0: } michael@0: if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block michael@0: // cut down to small list michael@0: // make use of the fact that Ethiopic is allocated in 8's, where michael@0: // the base is 0 mod 8. michael@0: UnicodeSet ethiopic( michael@0: UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); michael@0: UnicodeSetIterator it(ethiopic); michael@0: while (it.next() && !it.isString()) { michael@0: if ((it.getCodepoint() & 0x7) != 0) { michael@0: exemplars.remove(it.getCodepoint()); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Upper-case any that aren't already so. michael@0: // (We only do this for synthesized index characters.) michael@0: UnicodeSetIterator it(exemplars); michael@0: UnicodeString upperC; michael@0: while (it.next()) { michael@0: const UnicodeString &exemplarC = it.getString(); michael@0: upperC = exemplarC; michael@0: upperC.toUpper(locale); michael@0: initialLabels_->add(upperC); michael@0: } michael@0: } michael@0: michael@0: UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { michael@0: UnicodeSet contractions; michael@0: ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), michael@0: contractions.toUSet(), NULL, FALSE, &errorCode); michael@0: if (U_FAILURE(errorCode)) { return FALSE; } michael@0: UnicodeString firstHanBoundary; michael@0: UBool hasPinyin = FALSE; michael@0: UnicodeSetIterator iter(contractions); michael@0: while (iter.next()) { michael@0: const UnicodeString &s = iter.getString(); michael@0: if (s.startsWith(BASE, BASE_LENGTH)) { michael@0: initialLabels_->add(s); michael@0: if (firstHanBoundary.isEmpty() || michael@0: collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { michael@0: firstHanBoundary = s; michael@0: } michael@0: UChar c = s.charAt(s.length() - 1); michael@0: if (0x41 <= c && c <= 0x5A) { // A-Z michael@0: hasPinyin = TRUE; michael@0: } michael@0: } michael@0: } michael@0: if (hasPinyin) { michael@0: initialLabels_->add(0x41, 0x5A); // A-Z michael@0: } michael@0: if (!firstHanBoundary.isEmpty()) { michael@0: // The hardcoded list of script boundaries includes U+4E00 michael@0: // which is tailored to not be the first primary michael@0: // in all Chinese tailorings except "unihan". michael@0: // Replace U+4E00 with the first boundary string from the tailoring. michael@0: // TODO: This becomes obsolete when the root collator gets michael@0: // reliable script-first-primary mappings. michael@0: int32_t hanIndex = binarySearch( michael@0: *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); michael@0: if (hanIndex >= 0) { michael@0: UnicodeString *fh = new UnicodeString(firstHanBoundary); michael@0: if (fh == NULL) { michael@0: errorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return FALSE; michael@0: } michael@0: firstCharsInScripts_->setElementAt(fh, hanIndex); michael@0: } michael@0: return TRUE; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: michael@0: /* michael@0: * Return the string with interspersed CGJs. Input must have more than 2 codepoints. michael@0: */ michael@0: static const UChar CGJ = 0x034F; michael@0: UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { michael@0: UnicodeString result; michael@0: if (item.length() == 0) { michael@0: return result; michael@0: } michael@0: int32_t i = 0; michael@0: for (;;) { michael@0: UChar32 cp = item.char32At(i); michael@0: result.append(cp); michael@0: i = item.moveIndex32(i, 1); michael@0: if (i >= item.length()) { michael@0: break; michael@0: } michael@0: result.append(CGJ); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: michael@0: UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { michael@0: return FALSE; michael@0: } michael@0: michael@0: michael@0: UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { michael@0: return FALSE; michael@0: } michael@0: michael@0: michael@0: const RuleBasedCollator &AlphabeticIndex::getCollator() const { michael@0: // There are no known non-RuleBasedCollator collators, and none ever expected. michael@0: // But, in case that changes, better a null pointer than a wrong type. michael@0: return *dynamic_cast(collator_); michael@0: } michael@0: michael@0: michael@0: const UnicodeString &AlphabeticIndex::getInflowLabel() const { michael@0: return inflowLabel_; michael@0: } michael@0: michael@0: const UnicodeString &AlphabeticIndex::getOverflowLabel() const { michael@0: return overflowLabel_; michael@0: } michael@0: michael@0: michael@0: const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { michael@0: return underflowLabel_; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { michael@0: inflowLabel_ = label; michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { michael@0: overflowLabel_ = label; michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { michael@0: underflowLabel_ = label; michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: int32_t AlphabeticIndex::getMaxLabelCount() const { michael@0: return maxLabelCount_; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: if (maxLabelCount <= 0) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return *this; michael@0: } michael@0: maxLabelCount_ = maxLabelCount; michael@0: clearBuckets(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: // michael@0: // init() - Common code for constructors. michael@0: // michael@0: michael@0: void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { return; } michael@0: if (locale == NULL && collator_ == NULL) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: initialLabels_ = new UnicodeSet(); michael@0: if (initialLabels_ == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: inflowLabel_.setTo((UChar)0x2026); // Ellipsis michael@0: overflowLabel_ = inflowLabel_; michael@0: underflowLabel_ = inflowLabel_; michael@0: michael@0: if (collator_ == NULL) { michael@0: collator_ = static_cast(Collator::createInstance(*locale, status)); michael@0: if (U_FAILURE(status)) { return; } michael@0: if (collator_ == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: collatorPrimaryOnly_ = static_cast(collator_->clone()); michael@0: if (collatorPrimaryOnly_ == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); michael@0: firstCharsInScripts_ = firstStringsInScript(status); michael@0: if (U_FAILURE(status)) { return; } michael@0: firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); michael@0: UnicodeString _4E00((UChar)0x4E00); michael@0: UnicodeString _1100((UChar)0x1100); michael@0: UnicodeString _1112((UChar)0x1112); michael@0: if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && michael@0: collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { michael@0: // The standard Korean tailoring sorts Hanja (Han characters) michael@0: // as secondary differences from Hangul syllables. michael@0: // This makes U+4E00 not useful as a Han-script boundary. michael@0: // TODO: This becomes obsolete when the root collator gets michael@0: // reliable script-first-primary mappings. michael@0: int32_t hanIndex = binarySearch( michael@0: *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); michael@0: if (hanIndex >= 0) { michael@0: firstCharsInScripts_->removeElementAt(hanIndex); michael@0: } michael@0: } michael@0: // Guard against a degenerate collator where michael@0: // some script boundary strings are primary ignorable. michael@0: for (;;) { michael@0: if (U_FAILURE(status)) { return; } michael@0: if (firstCharsInScripts_->isEmpty()) { michael@0: // AlphabeticIndex requires some non-ignorable script boundary strings. michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: if (collatorPrimaryOnly_->compare( michael@0: *static_cast(firstCharsInScripts_->elementAt(0)), michael@0: emptyString_, status) == UCOL_EQUAL) { michael@0: firstCharsInScripts_->removeElementAt(0); michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (locale != NULL) { michael@0: addIndexExemplars(*locale, status); michael@0: } michael@0: } michael@0: michael@0: michael@0: // michael@0: // Comparison function for UVector sorting with a collator. michael@0: // michael@0: static int32_t U_CALLCONV michael@0: collatorComparator(const void *context, const void *left, const void *right) { michael@0: const UElement *leftElement = static_cast(left); michael@0: const UElement *rightElement = static_cast(right); michael@0: const UnicodeString *leftString = static_cast(leftElement->pointer); michael@0: const UnicodeString *rightString = static_cast(rightElement->pointer); michael@0: michael@0: if (leftString == rightString) { michael@0: // Catches case where both are NULL michael@0: return 0; michael@0: } michael@0: if (leftString == NULL) { michael@0: return 1; michael@0: }; michael@0: if (rightString == NULL) { michael@0: return -1; michael@0: } michael@0: const Collator *col = static_cast(context); michael@0: UErrorCode errorCode = U_ZERO_ERROR; michael@0: return col->compare(*leftString, *rightString, errorCode); michael@0: } michael@0: michael@0: // michael@0: // Comparison function for UVector sorting with a collator. michael@0: // michael@0: static int32_t U_CALLCONV michael@0: recordCompareFn(const void *context, const void *left, const void *right) { michael@0: const UElement *leftElement = static_cast(left); michael@0: const UElement *rightElement = static_cast(right); michael@0: const AlphabeticIndex::Record *leftRec = static_cast(leftElement->pointer); michael@0: const AlphabeticIndex::Record *rightRec = static_cast(rightElement->pointer); michael@0: const Collator *col = static_cast(context); michael@0: UErrorCode errorCode = U_ZERO_ERROR; michael@0: return col->compare(leftRec->name_, rightRec->name_, errorCode); michael@0: } michael@0: michael@0: michael@0: /** michael@0: * This list contains one character per script that has the michael@0: * lowest primary weight for that script in the root collator. michael@0: * This list will be copied and sorted to account for script reordering. michael@0: * michael@0: *

TODO: This is fragile. If the first character of a script is tailored michael@0: * so that it does not map to the script's lowest primary weight any more, michael@0: * then the buckets will be off. michael@0: * There are hacks in the code to handle the known CJK tailorings of U+4E00. michael@0: * michael@0: *

We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. michael@0: * michael@0: * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in michael@0: * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java michael@0: */ michael@0: static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { michael@0: 0x41, 0, 0x03B1, 0, michael@0: 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, michael@0: 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, michael@0: 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, michael@0: 0xAAF2, 0, // Meetei Mayek michael@0: 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, michael@0: U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada michael@0: U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri michael@0: 0x1B83, 0, michael@0: 0xD802, 0xDE00, 0, 0x0E01, 0, michael@0: 0x0EDE, 0, // Lao michael@0: 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, michael@0: 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, michael@0: U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma michael@0: 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, michael@0: 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, michael@0: 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, michael@0: U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao michael@0: 0xD800, 0xDE80, 0, michael@0: 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, michael@0: 0xD801, 0xDC80, 0, michael@0: U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng michael@0: 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, michael@0: 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, michael@0: U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive michael@0: U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs michael@0: 0x4E00, 0, michael@0: // TODO: The overflow bucket's lowerBoundary string should be the michael@0: // first item after the last reordering group in the collator's script order. michael@0: // This should normally be the first Unicode code point michael@0: // that is unassigned (U+0378 in Unicode 6.3) and untailored. michael@0: // However, at least up to ICU 51 the Hani reordering group includes michael@0: // unassigned code points, michael@0: // and there is no stable string for the start of the trailing-weights range. michael@0: // The only known string that sorts "high" is U+FFFF. michael@0: // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, michael@0: // and fix relevant test code. michael@0: // Ideally, FractionalUCA.txt will have a "script first primary" michael@0: // for unassigned code points. michael@0: 0xFFFF, 0 michael@0: }; michael@0: michael@0: UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return NULL; michael@0: } michael@0: UVector *dest = new UVector(status); michael@0: if (dest == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: dest->setDeleter(uprv_deleteUObject); michael@0: const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; michael@0: const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); michael@0: do { michael@0: if (U_FAILURE(status)) { michael@0: return dest; michael@0: } michael@0: UnicodeString *str = new UnicodeString(src, -1); michael@0: if (str == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return dest; michael@0: } michael@0: dest->addElement(str, status); michael@0: src += str->length() + 1; michael@0: } while (src < limit); michael@0: return dest; michael@0: } michael@0: michael@0: michael@0: namespace { michael@0: michael@0: /** michael@0: * Returns true if one index character string is "better" than the other. michael@0: * Shorter NFKD is better, and otherwise NFKD-binary-less-than is michael@0: * better, and otherwise binary-less-than is better. michael@0: */ michael@0: UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, michael@0: const UnicodeString &one, const UnicodeString &other) { michael@0: // This is called with primary-equal strings, but never with one.equals(other). michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UnicodeString n1 = nfkdNormalizer.normalize(one, status); michael@0: UnicodeString n2 = nfkdNormalizer.normalize(other, status); michael@0: if (U_FAILURE(status)) { return FALSE; } michael@0: int32_t result = n1.countChar32() - n2.countChar32(); michael@0: if (result != 0) { michael@0: return result < 0; michael@0: } michael@0: result = n1.compareCodePointOrder(n2); michael@0: if (result != 0) { michael@0: return result < 0; michael@0: } michael@0: return one.compareCodePointOrder(other) < 0; michael@0: } michael@0: michael@0: } // namespace michael@0: michael@0: // michael@0: // Constructor & Destructor for AlphabeticIndex::Record michael@0: // michael@0: // Records are internal only, instances are not directly surfaced in the public API. michael@0: // This class is mostly struct-like, with all public fields. michael@0: michael@0: AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) michael@0: : name_(name), data_(data) {} michael@0: michael@0: AlphabeticIndex::Record::~Record() { michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: if (inputList_ == NULL) { michael@0: inputList_ = new UVector(status); michael@0: if (inputList_ == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return *this; michael@0: } michael@0: inputList_->setDeleter(alphaIndex_deleteRecord); michael@0: } michael@0: Record *r = new Record(name, data); michael@0: if (r == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return *this; michael@0: } michael@0: inputList_->addElement(r, status); michael@0: clearBuckets(); michael@0: //std::string ss; michael@0: //std::string ss2; michael@0: //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << michael@0: // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { michael@0: if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { michael@0: inputList_->removeAllElements(); michael@0: clearBuckets(); michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { michael@0: initBuckets(status); michael@0: if (U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); michael@0: } michael@0: michael@0: michael@0: int32_t AlphabeticIndex::getBucketIndex() const { michael@0: return labelsIterIndex_; michael@0: } michael@0: michael@0: michael@0: UBool AlphabeticIndex::nextBucket(UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return FALSE; michael@0: } michael@0: if (buckets_ == NULL && currentBucket_ != NULL) { michael@0: status = U_ENUM_OUT_OF_SYNC_ERROR; michael@0: return FALSE; michael@0: } michael@0: initBuckets(status); michael@0: if (U_FAILURE(status)) { michael@0: return FALSE; michael@0: } michael@0: ++labelsIterIndex_; michael@0: if (labelsIterIndex_ >= buckets_->getBucketCount()) { michael@0: labelsIterIndex_ = buckets_->getBucketCount(); michael@0: return FALSE; michael@0: } michael@0: currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); michael@0: resetRecordIterator(); michael@0: return TRUE; michael@0: } michael@0: michael@0: const UnicodeString &AlphabeticIndex::getBucketLabel() const { michael@0: if (currentBucket_ != NULL) { michael@0: return currentBucket_->label_; michael@0: } else { michael@0: return emptyString_; michael@0: } michael@0: } michael@0: michael@0: michael@0: UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { michael@0: if (currentBucket_ != NULL) { michael@0: return currentBucket_->labelType_; michael@0: } else { michael@0: return U_ALPHAINDEX_NORMAL; michael@0: } michael@0: } michael@0: michael@0: michael@0: int32_t AlphabeticIndex::getBucketRecordCount() const { michael@0: if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { michael@0: return currentBucket_->records_->size(); michael@0: } else { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: internalResetBucketIterator(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: UBool AlphabeticIndex::nextRecord(UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return FALSE; michael@0: } michael@0: if (currentBucket_ == NULL) { michael@0: // We are trying to iterate over the items in a bucket, but there is no michael@0: // current bucket from the enumeration of buckets. michael@0: status = U_INVALID_STATE_ERROR; michael@0: return FALSE; michael@0: } michael@0: if (buckets_ == NULL) { michael@0: status = U_ENUM_OUT_OF_SYNC_ERROR; michael@0: return FALSE; michael@0: } michael@0: if (currentBucket_->records_ == NULL) { michael@0: return FALSE; michael@0: } michael@0: ++itemsIterIndex_; michael@0: if (itemsIterIndex_ >= currentBucket_->records_->size()) { michael@0: itemsIterIndex_ = currentBucket_->records_->size(); michael@0: return FALSE; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: michael@0: const UnicodeString &AlphabeticIndex::getRecordName() const { michael@0: const UnicodeString *retStr = &emptyString_; michael@0: if (currentBucket_ != NULL && currentBucket_->records_ != NULL && michael@0: itemsIterIndex_ >= 0 && michael@0: itemsIterIndex_ < currentBucket_->records_->size()) { michael@0: Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); michael@0: retStr = &item->name_; michael@0: } michael@0: return *retStr; michael@0: } michael@0: michael@0: const void *AlphabeticIndex::getRecordData() const { michael@0: const void *retPtr = NULL; michael@0: if (currentBucket_ != NULL && currentBucket_->records_ != NULL && michael@0: itemsIterIndex_ >= 0 && michael@0: itemsIterIndex_ < currentBucket_->records_->size()) { michael@0: Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); michael@0: retPtr = item->data_; michael@0: } michael@0: return retPtr; michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { michael@0: itemsIterIndex_ = -1; michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: michael@0: AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, michael@0: const UnicodeString &lowerBoundary, michael@0: UAlphabeticIndexLabelType type) michael@0: : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), michael@0: displayBucket_(NULL), displayIndex_(-1), michael@0: records_(NULL) { michael@0: } michael@0: michael@0: michael@0: AlphabeticIndex::Bucket::~Bucket() { michael@0: delete records_; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif