diff -r 000000000000 -r 6474c204b198 intl/icu/source/i18n/alphaindex.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/intl/icu/source/i18n/alphaindex.cpp Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,1350 @@ +/* +******************************************************************************* +* Copyright (C) 2009-2013, International Business Machines Corporation and +* others. All Rights Reserved. +******************************************************************************* +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION + +#include "unicode/alphaindex.h" +#include "unicode/coleitr.h" +#include "unicode/coll.h" +#include "unicode/localpointer.h" +#include "unicode/normalizer2.h" +#include "unicode/tblcoll.h" +#include "unicode/ulocdata.h" +#include "unicode/uniset.h" +#include "unicode/uobject.h" +#include "unicode/usetiter.h" +#include "unicode/utf16.h" + +#include "cmemory.h" +#include "cstring.h" +#include "uassert.h" +#include "uvector.h" + +//#include +//#include + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +U_NAMESPACE_BEGIN + +namespace { + +/** + * Prefix string for Chinese index buckets. + * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes + */ +const UChar BASE[1] = { 0xFDD0 }; +const int32_t BASE_LENGTH = 1; + +UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, + const UnicodeString &one, const UnicodeString &other); + +} // namespace + +static int32_t U_CALLCONV +collatorComparator(const void *context, const void *left, const void *right); + +static int32_t U_CALLCONV +recordCompareFn(const void *context, const void *left, const void *right); + +// UVector support function, delete a Record. +static void U_CALLCONV +alphaIndex_deleteRecord(void *obj) { + delete static_cast(obj); +} + +namespace { + +UnicodeString *ownedString(const UnicodeString &s, LocalPointer &owned, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return NULL; } + if (owned.isValid()) { + return owned.orphan(); + } + UnicodeString *p = new UnicodeString(s); + if (p == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + return p; +} + +inline UnicodeString *getString(const UVector &list, int32_t i) { + return static_cast(list[i]); +} + +inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { + return static_cast(list[i]); +} + +inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { + return static_cast(list[i]); +} + +/** + * Like Java Collections.binarySearch(List, String, Comparator). + * + * @return the index>=0 where the item was found, + * or the index<0 for inserting the string at ~index in sorted order + */ +int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { + if (list.size() == 0) { return ~0; } + int32_t start = 0; + int32_t limit = list.size(); + for (;;) { + int32_t i = (start + limit) / 2; + const UnicodeString *si = static_cast(list.elementAt(i)); + UErrorCode errorCode = U_ZERO_ERROR; + UCollationResult cmp = coll.compare(s, *si, errorCode); + if (cmp == UCOL_EQUAL) { + return i; + } else if (cmp < 0) { + if (i == start) { + return ~start; // insert s before *si + } + limit = i; + } else { + if (i == start) { + return ~(start + 1); // insert s after *si + } + start = i; + } + } +} + +} // namespace + +// The BucketList is not in the anonymous namespace because only Clang +// seems to support its use in other classes from there. +// However, we also don't need U_I18N_API because it is not used from outside the i18n library. +class BucketList : public UObject { +public: + BucketList(UVector *bucketList, UVector *publicBucketList) + : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { + int32_t displayIndex = 0; + for (int32_t i = 0; i < publicBucketList->size(); ++i) { + getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; + } + } + + // The virtual destructor must not be inline. + // See ticket #8454 for details. + virtual ~BucketList(); + + int32_t getBucketCount() const { + return immutableVisibleList_->size(); + } + + int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, + UErrorCode &errorCode) { + // binary search + int32_t start = 0; + int32_t limit = bucketList_->size(); + while ((start + 1) < limit) { + int32_t i = (start + limit) / 2; + const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); + UCollationResult nameVsBucket = + collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); + if (nameVsBucket < 0) { + limit = i; + } else { + start = i; + } + } + const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); + if (bucket->displayBucket_ != NULL) { + bucket = bucket->displayBucket_; + } + return bucket->displayIndex_; + } + + /** All of the buckets, visible and invisible. */ + UVector *bucketList_; + /** Just the visible buckets. */ + UVector *immutableVisibleList_; +}; + +BucketList::~BucketList() { + delete bucketList_; + if (immutableVisibleList_ != bucketList_) { + delete immutableVisibleList_; + } +} + +AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { + delete buckets_; + delete collatorPrimaryOnly_; +} + +int32_t +AlphabeticIndex::ImmutableIndex::getBucketCount() const { + return buckets_->getBucketCount(); +} + +int32_t +AlphabeticIndex::ImmutableIndex::getBucketIndex( + const UnicodeString &name, UErrorCode &errorCode) const { + return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); +} + +const AlphabeticIndex::Bucket * +AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { + if (0 <= index && index < buckets_->getBucketCount()) { + return icu::getBucket(*buckets_->immutableVisibleList_, index); + } else { + return NULL; + } +} + +AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) + : inputList_(NULL), + labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), + maxLabelCount_(99), + initialLabels_(NULL), firstCharsInScripts_(NULL), + collator_(NULL), collatorPrimaryOnly_(NULL), + buckets_(NULL) { + init(&locale, status); +} + + +AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) + : inputList_(NULL), + labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), + maxLabelCount_(99), + initialLabels_(NULL), firstCharsInScripts_(NULL), + collator_(collator), collatorPrimaryOnly_(NULL), + buckets_(NULL) { + init(NULL, status); +} + + + +AlphabeticIndex::~AlphabeticIndex() { + delete collator_; + delete collatorPrimaryOnly_; + delete firstCharsInScripts_; + delete buckets_; + delete inputList_; + delete initialLabels_; +} + + +AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + initialLabels_->addAll(additions); + clearBuckets(); + return *this; +} + + +AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { + addIndexExemplars(locale, status); + clearBuckets(); + return *this; +} + + +AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return NULL; } + // In C++, the ImmutableIndex must own its copy of the BucketList, + // even if it contains no records, for proper memory management. + // We could clone the buckets_ if they are not NULL, + // but that would be worth it only if this method is called multiple times, + // or called after using the old-style bucket iterator API. + LocalPointer immutableBucketList(createBucketList(errorCode)); + LocalPointer coll( + static_cast(collatorPrimaryOnly_->clone())); + if (immutableBucketList.isNull() || coll.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); + if (immIndex == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // The ImmutableIndex adopted its parameter objects. + immutableBucketList.orphan(); + coll.orphan(); + return immIndex; +} + +int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { + initBuckets(status); + if (U_FAILURE(status)) { + return 0; + } + return buckets_->getBucketCount(); +} + + +int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { + if (U_FAILURE(status) || inputList_ == NULL) { + return 0; + } + return inputList_->size(); +} + +void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { + const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); + if (U_FAILURE(errorCode)) { return; } + + const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); + const UnicodeString &overflowBoundary = + *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); + + // We make a sorted array of elements. + // Some of the input may be redundant. + // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". + // We filter out those cases. + UnicodeSetIterator iter(*initialLabels_); + while (iter.next()) { + const UnicodeString *item = &iter.getString(); + LocalPointer ownedItem; + UBool checkDistinct; + int32_t itemLength = item->length(); + if (!item->hasMoreChar32Than(0, itemLength, 1)) { + checkDistinct = FALSE; + } else if(item->charAt(itemLength - 1) == 0x2a && // '*' + item->charAt(itemLength - 2) != 0x2a) { + // Use a label if it is marked with one trailing star, + // even if the label string sorts the same when all contractions are suppressed. + ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); + item = ownedItem.getAlias(); + if (item == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + checkDistinct = FALSE; + } else { + checkDistinct = TRUE; + } + if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { + // Ignore a primary-ignorable or non-alphabetic index character. + } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { + // Ignore an index characters that will land in the overflow bucket. + } else if (checkDistinct && + collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { + // Ignore a multi-code point index character that does not sort distinctly + // from the sequence of its separate characters. + } else { + int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); + if (insertionPoint < 0) { + indexCharacters.insertElementAt( + ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); + } else { + const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); + if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { + indexCharacters.setElementAt( + ownedString(*item, ownedItem, errorCode), insertionPoint); + } + } + } + } + if (U_FAILURE(errorCode)) { return; } + + // if the result is still too large, cut down to maxCount elements, by removing every nth element + + int32_t size = indexCharacters.size() - 1; + if (size > maxLabelCount_) { + int32_t count = 0; + int32_t old = -1; + for (int32_t i = 0; i < indexCharacters.size();) { + ++count; + int32_t bump = count * maxLabelCount_ / size; + if (bump == old) { + indexCharacters.removeElementAt(i); + } else { + old = bump; + ++i; + } + } + } +} + +namespace { + +const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { + if (!current.startsWith(BASE, BASE_LENGTH)) { + return current; + } + UChar rest = current.charAt(BASE_LENGTH); + if (0x2800 < rest && rest <= 0x28FF) { // stroke count + int32_t count = rest-0x2800; + temp.setTo((UChar)(0x30 + count % 10)); + if (count >= 10) { + count /= 10; + temp.insert(0, (UChar)(0x30 + count % 10)); + if (count >= 10) { + count /= 10; + temp.insert(0, (UChar)(0x30 + count)); + } + } + return temp.append((UChar)0x5283); + } + return temp.setTo(current, BASE_LENGTH); +} + +UBool hasMultiplePrimaryWeights( + CollationElementIterator &cei, int32_t variableTop, + const UnicodeString &s, UErrorCode &errorCode) { + cei.setText(s, errorCode); + UBool seenPrimary = FALSE; + for (;;) { + int32_t ce32 = cei.next(errorCode); + if (ce32 == CollationElementIterator::NULLORDER) { + break; + } + int32_t p = CollationElementIterator::primaryOrder(ce32); + if (p > variableTop && (ce32 & 0xc0) != 0xc0) { + // not primary ignorable, and not a continuation CE + if (seenPrimary) { + return TRUE; + } + seenPrimary = TRUE; + } + } + return FALSE; +} + +} // namespace + +BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { + // Initialize indexCharacters. + UVector indexCharacters(errorCode); + indexCharacters.setDeleter(uprv_deleteUObject); + initLabels(indexCharacters, errorCode); + if (U_FAILURE(errorCode)) { return NULL; } + + // Variables for hasMultiplePrimaryWeights(). + LocalPointer cei( + collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); + if (cei.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + int32_t variableTop; + if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { + variableTop = CollationElementIterator::primaryOrder( + (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); + } else { + variableTop = 0; + } + UBool hasInvisibleBuckets = FALSE; + + // Helper arrays for Chinese Pinyin collation. + Bucket *asciiBuckets[26] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + Bucket *pinyinBuckets[26] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + UBool hasPinyin = FALSE; + + LocalPointer bucketList(new UVector(errorCode)); + if (bucketList.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList->setDeleter(uprv_deleteUObject); + + // underflow bucket + Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); + if (bucket == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList->addElement(bucket, errorCode); + if (U_FAILURE(errorCode)) { return NULL; } + + UnicodeString temp; + + // fix up the list, adding underflow, additions, overflow + // Insert inflow labels as needed. + int32_t scriptIndex = -1; + const UnicodeString *scriptUpperBoundary = &emptyString_; + for (int32_t i = 0; i < indexCharacters.size(); ++i) { + UnicodeString ¤t = *getString(indexCharacters, i); + if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { + // We crossed the script boundary into a new script. + const UnicodeString &inflowBoundary = *scriptUpperBoundary; + UBool skippedScript = FALSE; + for (;;) { + scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); + if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { + break; + } + skippedScript = TRUE; + } + if (skippedScript && bucketList->size() > 1) { + // We are skipping one or more scripts, + // and we are not just getting out of the underflow label. + bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); + if (bucket == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList->addElement(bucket, errorCode); + } + } + // Add a bucket with the current label. + bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); + if (bucket == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList->addElement(bucket, errorCode); + // Remember ASCII and Pinyin buckets for Pinyin redirects. + UChar c; + if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z + asciiBuckets[c - 0x41] = bucket; + } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && + 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { + pinyinBuckets[c - 0x41] = bucket; + hasPinyin = TRUE; + } + // Check for multiple primary weights. + if (!current.startsWith(BASE, BASE_LENGTH) && + hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && + current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { + // "AE-ligature" or "Sch" etc. + for (int32_t i = bucketList->size() - 2;; --i) { + Bucket *singleBucket = getBucket(*bucketList, i); + if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { + // There is no single-character bucket since the last + // underflow or inflow label. + break; + } + if (singleBucket->displayBucket_ == NULL && + !hasMultiplePrimaryWeights( + *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { + // Add an invisible bucket that redirects strings greater than the expansion + // to the previous single-character bucket. + // For example, after ... Q R S Sch we add Sch\uFFFF->S + // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. + bucket = new Bucket(emptyString_, + UnicodeString(current).append((UChar)0xFFFF), + U_ALPHAINDEX_NORMAL); + if (bucket == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucket->displayBucket_ = singleBucket; + bucketList->addElement(bucket, errorCode); + hasInvisibleBuckets = TRUE; + break; + } + } + } + } + if (U_FAILURE(errorCode)) { return NULL; } + if (bucketList->size() == 1) { + // No real labels, show only the underflow label. + BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); + if (bl == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList.orphan(); + return bl; + } + // overflow bucket + bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); + if (bucket == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList->addElement(bucket, errorCode); // final + + if (hasPinyin) { + // Redirect Pinyin buckets. + Bucket *asciiBucket = NULL; + for (int32_t i = 0; i < 26; ++i) { + if (asciiBuckets[i] != NULL) { + asciiBucket = asciiBuckets[i]; + } + if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { + pinyinBuckets[i]->displayBucket_ = asciiBucket; + hasInvisibleBuckets = TRUE; + } + } + } + + if (U_FAILURE(errorCode)) { return NULL; } + if (!hasInvisibleBuckets) { + BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); + if (bl == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList.orphan(); + return bl; + } + // Merge inflow buckets that are visually adjacent. + // Iterate backwards: Merge inflow into overflow rather than the other way around. + int32_t i = bucketList->size() - 1; + Bucket *nextBucket = getBucket(*bucketList, i); + while (--i > 0) { + bucket = getBucket(*bucketList, i); + if (bucket->displayBucket_ != NULL) { + continue; // skip invisible buckets + } + if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { + if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { + bucket->displayBucket_ = nextBucket; + continue; + } + } + nextBucket = bucket; + } + + LocalPointer publicBucketList(new UVector(errorCode)); + if (bucketList.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // Do not call publicBucketList->setDeleter(): + // This vector shares its objects with the bucketList. + for (int32_t i = 0; i < bucketList->size(); ++i) { + bucket = getBucket(*bucketList, i); + if (bucket->displayBucket_ == NULL) { + publicBucketList->addElement(bucket, errorCode); + } + } + if (U_FAILURE(errorCode)) { return NULL; } + BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); + if (bl == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + bucketList.orphan(); + publicBucketList.orphan(); + return bl; +} + +/** + * Creates an index, and buckets and sorts the list of records into the index. + */ +void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { + if (U_FAILURE(errorCode) || buckets_ != NULL) { + return; + } + buckets_ = createBucketList(errorCode); + if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { + return; + } + + // Sort the records by name. + // Stable sort preserves input order of collation duplicates. + inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); + + // Now, we traverse all of the input, which is now sorted. + // If the item doesn't go in the current bucket, we find the next bucket that contains it. + // This makes the process order n*log(n), since we just sort the list and then do a linear process. + // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so + // we need to improve it for that case. + + Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); + int32_t bucketIndex = 1; + Bucket *nextBucket; + const UnicodeString *upperBoundary; + if (bucketIndex < buckets_->bucketList_->size()) { + nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); + upperBoundary = &nextBucket->lowerBoundary_; + } else { + nextBucket = NULL; + upperBoundary = NULL; + } + for (int32_t i = 0; i < inputList_->size(); ++i) { + Record *r = getRecord(*inputList_, i); + // if the current bucket isn't the right one, find the one that is + // We have a special flag for the last bucket so that we don't look any further + while (upperBoundary != NULL && + collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { + currentBucket = nextBucket; + // now reset the boundary that we compare against + if (bucketIndex < buckets_->bucketList_->size()) { + nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); + upperBoundary = &nextBucket->lowerBoundary_; + } else { + upperBoundary = NULL; + } + } + // now put the record into the bucket. + Bucket *bucket = currentBucket; + if (bucket->displayBucket_ != NULL) { + bucket = bucket->displayBucket_; + } + if (bucket->records_ == NULL) { + bucket->records_ = new UVector(errorCode); + if (bucket->records_ == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + bucket->records_->addElement(r, errorCode); + } +} + +void AlphabeticIndex::clearBuckets() { + if (buckets_ != NULL) { + delete buckets_; + buckets_ = NULL; + internalResetBucketIterator(); + } +} + +void AlphabeticIndex::internalResetBucketIterator() { + labelsIterIndex_ = -1; + currentBucket_ = NULL; +} + + +void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { + if (U_FAILURE(status)) { return; } + // Chinese index characters, which are specific to each of the several Chinese tailorings, + // take precedence over the single locale data exemplar set per language. + const char *language = locale.getLanguage(); + if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || + uprv_strcmp(language, "ko") == 0) { + // TODO: This should be done regardless of the language, but it's expensive. + // We should add a Collator function (can be @internal) + // to enumerate just the contractions that start with a given code point or string. + if (addChineseIndexCharacters(status) || U_FAILURE(status)) { + return; + } + } + + LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); + if (U_FAILURE(status)) { + return; + } + + UnicodeSet exemplars; + ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); + if (U_SUCCESS(status)) { + initialLabels_->addAll(exemplars); + return; + } + status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR + + // The locale data did not include explicit Index characters. + // Synthesize a set of them from the locale's standard exemplar characters. + ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); + if (U_FAILURE(status)) { + return; + } + + // question: should we add auxiliary exemplars? + if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { + exemplars.add(0x61, 0x7A); + } + if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables + // cut down to small list + exemplars.remove(0xAC00, 0xD7A3). + add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). + add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). + add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). + add(0xD30C).add(0xD558); + } + if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block + // cut down to small list + // make use of the fact that Ethiopic is allocated in 8's, where + // the base is 0 mod 8. + UnicodeSet ethiopic( + UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); + UnicodeSetIterator it(ethiopic); + while (it.next() && !it.isString()) { + if ((it.getCodepoint() & 0x7) != 0) { + exemplars.remove(it.getCodepoint()); + } + } + } + + // Upper-case any that aren't already so. + // (We only do this for synthesized index characters.) + UnicodeSetIterator it(exemplars); + UnicodeString upperC; + while (it.next()) { + const UnicodeString &exemplarC = it.getString(); + upperC = exemplarC; + upperC.toUpper(locale); + initialLabels_->add(upperC); + } +} + +UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { + UnicodeSet contractions; + ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), + contractions.toUSet(), NULL, FALSE, &errorCode); + if (U_FAILURE(errorCode)) { return FALSE; } + UnicodeString firstHanBoundary; + UBool hasPinyin = FALSE; + UnicodeSetIterator iter(contractions); + while (iter.next()) { + const UnicodeString &s = iter.getString(); + if (s.startsWith(BASE, BASE_LENGTH)) { + initialLabels_->add(s); + if (firstHanBoundary.isEmpty() || + collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { + firstHanBoundary = s; + } + UChar c = s.charAt(s.length() - 1); + if (0x41 <= c && c <= 0x5A) { // A-Z + hasPinyin = TRUE; + } + } + } + if (hasPinyin) { + initialLabels_->add(0x41, 0x5A); // A-Z + } + if (!firstHanBoundary.isEmpty()) { + // The hardcoded list of script boundaries includes U+4E00 + // which is tailored to not be the first primary + // in all Chinese tailorings except "unihan". + // Replace U+4E00 with the first boundary string from the tailoring. + // TODO: This becomes obsolete when the root collator gets + // reliable script-first-primary mappings. + int32_t hanIndex = binarySearch( + *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); + if (hanIndex >= 0) { + UnicodeString *fh = new UnicodeString(firstHanBoundary); + if (fh == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + firstCharsInScripts_->setElementAt(fh, hanIndex); + } + return TRUE; + } else { + return FALSE; + } +} + + +/* + * Return the string with interspersed CGJs. Input must have more than 2 codepoints. + */ +static const UChar CGJ = 0x034F; +UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { + UnicodeString result; + if (item.length() == 0) { + return result; + } + int32_t i = 0; + for (;;) { + UChar32 cp = item.char32At(i); + result.append(cp); + i = item.moveIndex32(i, 1); + if (i >= item.length()) { + break; + } + result.append(CGJ); + } + return result; +} + + +UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { + return FALSE; +} + + +UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { + return FALSE; +} + + +const RuleBasedCollator &AlphabeticIndex::getCollator() const { + // There are no known non-RuleBasedCollator collators, and none ever expected. + // But, in case that changes, better a null pointer than a wrong type. + return *dynamic_cast(collator_); +} + + +const UnicodeString &AlphabeticIndex::getInflowLabel() const { + return inflowLabel_; +} + +const UnicodeString &AlphabeticIndex::getOverflowLabel() const { + return overflowLabel_; +} + + +const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { + return underflowLabel_; +} + + +AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { + inflowLabel_ = label; + clearBuckets(); + return *this; +} + + +AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { + overflowLabel_ = label; + clearBuckets(); + return *this; +} + + +AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { + underflowLabel_ = label; + clearBuckets(); + return *this; +} + + +int32_t AlphabeticIndex::getMaxLabelCount() const { + return maxLabelCount_; +} + + +AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (maxLabelCount <= 0) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + maxLabelCount_ = maxLabelCount; + clearBuckets(); + return *this; +} + + +// +// init() - Common code for constructors. +// + +void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { + if (U_FAILURE(status)) { return; } + if (locale == NULL && collator_ == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + initialLabels_ = new UnicodeSet(); + if (initialLabels_ == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + inflowLabel_.setTo((UChar)0x2026); // Ellipsis + overflowLabel_ = inflowLabel_; + underflowLabel_ = inflowLabel_; + + if (collator_ == NULL) { + collator_ = static_cast(Collator::createInstance(*locale, status)); + if (U_FAILURE(status)) { return; } + if (collator_ == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + collatorPrimaryOnly_ = static_cast(collator_->clone()); + if (collatorPrimaryOnly_ == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); + firstCharsInScripts_ = firstStringsInScript(status); + if (U_FAILURE(status)) { return; } + firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); + UnicodeString _4E00((UChar)0x4E00); + UnicodeString _1100((UChar)0x1100); + UnicodeString _1112((UChar)0x1112); + if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && + collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { + // The standard Korean tailoring sorts Hanja (Han characters) + // as secondary differences from Hangul syllables. + // This makes U+4E00 not useful as a Han-script boundary. + // TODO: This becomes obsolete when the root collator gets + // reliable script-first-primary mappings. + int32_t hanIndex = binarySearch( + *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); + if (hanIndex >= 0) { + firstCharsInScripts_->removeElementAt(hanIndex); + } + } + // Guard against a degenerate collator where + // some script boundary strings are primary ignorable. + for (;;) { + if (U_FAILURE(status)) { return; } + if (firstCharsInScripts_->isEmpty()) { + // AlphabeticIndex requires some non-ignorable script boundary strings. + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + if (collatorPrimaryOnly_->compare( + *static_cast(firstCharsInScripts_->elementAt(0)), + emptyString_, status) == UCOL_EQUAL) { + firstCharsInScripts_->removeElementAt(0); + } else { + break; + } + } + + if (locale != NULL) { + addIndexExemplars(*locale, status); + } +} + + +// +// Comparison function for UVector sorting with a collator. +// +static int32_t U_CALLCONV +collatorComparator(const void *context, const void *left, const void *right) { + const UElement *leftElement = static_cast(left); + const UElement *rightElement = static_cast(right); + const UnicodeString *leftString = static_cast(leftElement->pointer); + const UnicodeString *rightString = static_cast(rightElement->pointer); + + if (leftString == rightString) { + // Catches case where both are NULL + return 0; + } + if (leftString == NULL) { + return 1; + }; + if (rightString == NULL) { + return -1; + } + const Collator *col = static_cast(context); + UErrorCode errorCode = U_ZERO_ERROR; + return col->compare(*leftString, *rightString, errorCode); +} + +// +// Comparison function for UVector sorting with a collator. +// +static int32_t U_CALLCONV +recordCompareFn(const void *context, const void *left, const void *right) { + const UElement *leftElement = static_cast(left); + const UElement *rightElement = static_cast(right); + const AlphabeticIndex::Record *leftRec = static_cast(leftElement->pointer); + const AlphabeticIndex::Record *rightRec = static_cast(rightElement->pointer); + const Collator *col = static_cast(context); + UErrorCode errorCode = U_ZERO_ERROR; + return col->compare(leftRec->name_, rightRec->name_, errorCode); +} + + +/** + * This list contains one character per script that has the + * lowest primary weight for that script in the root collator. + * This list will be copied and sorted to account for script reordering. + * + *

TODO: This is fragile. If the first character of a script is tailored + * so that it does not map to the script's lowest primary weight any more, + * then the buckets will be off. + * There are hacks in the code to handle the known CJK tailorings of U+4E00. + * + *

We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. + * + * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in + * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java + */ +static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { + 0x41, 0, 0x03B1, 0, + 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, + 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, + 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, + 0xAAF2, 0, // Meetei Mayek + 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, + U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada + U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri + 0x1B83, 0, + 0xD802, 0xDE00, 0, 0x0E01, 0, + 0x0EDE, 0, // Lao + 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, + 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, + U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma + 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, + 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, + 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, + U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao + 0xD800, 0xDE80, 0, + 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, + 0xD801, 0xDC80, 0, + U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng + 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, + 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, + U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive + U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs + 0x4E00, 0, + // TODO: The overflow bucket's lowerBoundary string should be the + // first item after the last reordering group in the collator's script order. + // This should normally be the first Unicode code point + // that is unassigned (U+0378 in Unicode 6.3) and untailored. + // However, at least up to ICU 51 the Hani reordering group includes + // unassigned code points, + // and there is no stable string for the start of the trailing-weights range. + // The only known string that sorts "high" is U+FFFF. + // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, + // and fix relevant test code. + // Ideally, FractionalUCA.txt will have a "script first primary" + // for unassigned code points. + 0xFFFF, 0 +}; + +UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { + if (U_FAILURE(status)) { + return NULL; + } + UVector *dest = new UVector(status); + if (dest == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + dest->setDeleter(uprv_deleteUObject); + const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; + const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); + do { + if (U_FAILURE(status)) { + return dest; + } + UnicodeString *str = new UnicodeString(src, -1); + if (str == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return dest; + } + dest->addElement(str, status); + src += str->length() + 1; + } while (src < limit); + return dest; +} + + +namespace { + +/** + * Returns true if one index character string is "better" than the other. + * Shorter NFKD is better, and otherwise NFKD-binary-less-than is + * better, and otherwise binary-less-than is better. + */ +UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, + const UnicodeString &one, const UnicodeString &other) { + // This is called with primary-equal strings, but never with one.equals(other). + UErrorCode status = U_ZERO_ERROR; + UnicodeString n1 = nfkdNormalizer.normalize(one, status); + UnicodeString n2 = nfkdNormalizer.normalize(other, status); + if (U_FAILURE(status)) { return FALSE; } + int32_t result = n1.countChar32() - n2.countChar32(); + if (result != 0) { + return result < 0; + } + result = n1.compareCodePointOrder(n2); + if (result != 0) { + return result < 0; + } + return one.compareCodePointOrder(other) < 0; +} + +} // namespace + +// +// Constructor & Destructor for AlphabeticIndex::Record +// +// Records are internal only, instances are not directly surfaced in the public API. +// This class is mostly struct-like, with all public fields. + +AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) + : name_(name), data_(data) {} + +AlphabeticIndex::Record::~Record() { +} + + +AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (inputList_ == NULL) { + inputList_ = new UVector(status); + if (inputList_ == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + inputList_->setDeleter(alphaIndex_deleteRecord); + } + Record *r = new Record(name, data); + if (r == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + inputList_->addElement(r, status); + clearBuckets(); + //std::string ss; + //std::string ss2; + //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << + // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; + return *this; +} + + +AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { + if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { + inputList_->removeAllElements(); + clearBuckets(); + } + return *this; +} + +int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { + initBuckets(status); + if (U_FAILURE(status)) { + return 0; + } + return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); +} + + +int32_t AlphabeticIndex::getBucketIndex() const { + return labelsIterIndex_; +} + + +UBool AlphabeticIndex::nextBucket(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (buckets_ == NULL && currentBucket_ != NULL) { + status = U_ENUM_OUT_OF_SYNC_ERROR; + return FALSE; + } + initBuckets(status); + if (U_FAILURE(status)) { + return FALSE; + } + ++labelsIterIndex_; + if (labelsIterIndex_ >= buckets_->getBucketCount()) { + labelsIterIndex_ = buckets_->getBucketCount(); + return FALSE; + } + currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); + resetRecordIterator(); + return TRUE; +} + +const UnicodeString &AlphabeticIndex::getBucketLabel() const { + if (currentBucket_ != NULL) { + return currentBucket_->label_; + } else { + return emptyString_; + } +} + + +UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { + if (currentBucket_ != NULL) { + return currentBucket_->labelType_; + } else { + return U_ALPHAINDEX_NORMAL; + } +} + + +int32_t AlphabeticIndex::getBucketRecordCount() const { + if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { + return currentBucket_->records_->size(); + } else { + return 0; + } +} + + +AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + internalResetBucketIterator(); + return *this; +} + + +UBool AlphabeticIndex::nextRecord(UErrorCode &status) { + if (U_FAILURE(status)) { + return FALSE; + } + if (currentBucket_ == NULL) { + // We are trying to iterate over the items in a bucket, but there is no + // current bucket from the enumeration of buckets. + status = U_INVALID_STATE_ERROR; + return FALSE; + } + if (buckets_ == NULL) { + status = U_ENUM_OUT_OF_SYNC_ERROR; + return FALSE; + } + if (currentBucket_->records_ == NULL) { + return FALSE; + } + ++itemsIterIndex_; + if (itemsIterIndex_ >= currentBucket_->records_->size()) { + itemsIterIndex_ = currentBucket_->records_->size(); + return FALSE; + } + return TRUE; +} + + +const UnicodeString &AlphabeticIndex::getRecordName() const { + const UnicodeString *retStr = &emptyString_; + if (currentBucket_ != NULL && currentBucket_->records_ != NULL && + itemsIterIndex_ >= 0 && + itemsIterIndex_ < currentBucket_->records_->size()) { + Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); + retStr = &item->name_; + } + return *retStr; +} + +const void *AlphabeticIndex::getRecordData() const { + const void *retPtr = NULL; + if (currentBucket_ != NULL && currentBucket_->records_ != NULL && + itemsIterIndex_ >= 0 && + itemsIterIndex_ < currentBucket_->records_->size()) { + Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); + retPtr = item->data_; + } + return retPtr; +} + + +AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { + itemsIterIndex_ = -1; + return *this; +} + + + +AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, + const UnicodeString &lowerBoundary, + UAlphabeticIndexLabelType type) + : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), + displayBucket_(NULL), displayIndex_(-1), + records_(NULL) { +} + + +AlphabeticIndex::Bucket::~Bucket() { + delete records_; +} + +U_NAMESPACE_END + +#endif