|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2009-2013, International Business Machines Corporation and |
|
4 * others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION |
|
11 |
|
12 #include "unicode/alphaindex.h" |
|
13 #include "unicode/coleitr.h" |
|
14 #include "unicode/coll.h" |
|
15 #include "unicode/localpointer.h" |
|
16 #include "unicode/normalizer2.h" |
|
17 #include "unicode/tblcoll.h" |
|
18 #include "unicode/ulocdata.h" |
|
19 #include "unicode/uniset.h" |
|
20 #include "unicode/uobject.h" |
|
21 #include "unicode/usetiter.h" |
|
22 #include "unicode/utf16.h" |
|
23 |
|
24 #include "cmemory.h" |
|
25 #include "cstring.h" |
|
26 #include "uassert.h" |
|
27 #include "uvector.h" |
|
28 |
|
29 //#include <string> |
|
30 //#include <iostream> |
|
31 |
|
// Number of elements of a C array, as an int32_t.
// The whole expansion is parenthesized so that the macro always behaves as a
// single operand, whatever expression it is used in.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
33 |
|
34 U_NAMESPACE_BEGIN |
|
35 |
|
36 namespace { |
|
37 |
|
38 /** |
|
39 * Prefix string for Chinese index buckets. |
|
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes |
|
41 */ |
|
42 const UChar BASE[1] = { 0xFDD0 }; |
|
43 const int32_t BASE_LENGTH = 1; |
|
44 |
|
45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
|
46 const UnicodeString &one, const UnicodeString &other); |
|
47 |
|
48 } // namespace |
|
49 |
|
50 static int32_t U_CALLCONV |
|
51 collatorComparator(const void *context, const void *left, const void *right); |
|
52 |
|
53 static int32_t U_CALLCONV |
|
54 recordCompareFn(const void *context, const void *left, const void *right); |
|
55 |
|
56 // UVector<Record *> support function, delete a Record. |
|
57 static void U_CALLCONV |
|
58 alphaIndex_deleteRecord(void *obj) { |
|
59 delete static_cast<AlphabeticIndex::Record *>(obj); |
|
60 } |
|
61 |
|
62 namespace { |
|
63 |
|
64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, |
|
65 UErrorCode &errorCode) { |
|
66 if (U_FAILURE(errorCode)) { return NULL; } |
|
67 if (owned.isValid()) { |
|
68 return owned.orphan(); |
|
69 } |
|
70 UnicodeString *p = new UnicodeString(s); |
|
71 if (p == NULL) { |
|
72 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
73 } |
|
74 return p; |
|
75 } |
|
76 |
|
77 inline UnicodeString *getString(const UVector &list, int32_t i) { |
|
78 return static_cast<UnicodeString *>(list[i]); |
|
79 } |
|
80 |
|
81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { |
|
82 return static_cast<AlphabeticIndex::Bucket *>(list[i]); |
|
83 } |
|
84 |
|
85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { |
|
86 return static_cast<AlphabeticIndex::Record *>(list[i]); |
|
87 } |
|
88 |
|
89 /** |
|
90 * Like Java Collections.binarySearch(List, String, Comparator). |
|
91 * |
|
92 * @return the index>=0 where the item was found, |
|
93 * or the index<0 for inserting the string at ~index in sorted order |
|
94 */ |
|
95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { |
|
96 if (list.size() == 0) { return ~0; } |
|
97 int32_t start = 0; |
|
98 int32_t limit = list.size(); |
|
99 for (;;) { |
|
100 int32_t i = (start + limit) / 2; |
|
101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); |
|
102 UErrorCode errorCode = U_ZERO_ERROR; |
|
103 UCollationResult cmp = coll.compare(s, *si, errorCode); |
|
104 if (cmp == UCOL_EQUAL) { |
|
105 return i; |
|
106 } else if (cmp < 0) { |
|
107 if (i == start) { |
|
108 return ~start; // insert s before *si |
|
109 } |
|
110 limit = i; |
|
111 } else { |
|
112 if (i == start) { |
|
113 return ~(start + 1); // insert s after *si |
|
114 } |
|
115 start = i; |
|
116 } |
|
117 } |
|
118 } |
|
119 |
|
120 } // namespace |
|
121 |
|
122 // The BucketList is not in the anonymous namespace because only Clang |
|
123 // seems to support its use in other classes from there. |
|
124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. |
|
125 class BucketList : public UObject { |
|
126 public: |
|
127 BucketList(UVector *bucketList, UVector *publicBucketList) |
|
128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { |
|
129 int32_t displayIndex = 0; |
|
130 for (int32_t i = 0; i < publicBucketList->size(); ++i) { |
|
131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; |
|
132 } |
|
133 } |
|
134 |
|
135 // The virtual destructor must not be inline. |
|
136 // See ticket #8454 for details. |
|
137 virtual ~BucketList(); |
|
138 |
|
139 int32_t getBucketCount() const { |
|
140 return immutableVisibleList_->size(); |
|
141 } |
|
142 |
|
143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, |
|
144 UErrorCode &errorCode) { |
|
145 // binary search |
|
146 int32_t start = 0; |
|
147 int32_t limit = bucketList_->size(); |
|
148 while ((start + 1) < limit) { |
|
149 int32_t i = (start + limit) / 2; |
|
150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); |
|
151 UCollationResult nameVsBucket = |
|
152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); |
|
153 if (nameVsBucket < 0) { |
|
154 limit = i; |
|
155 } else { |
|
156 start = i; |
|
157 } |
|
158 } |
|
159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); |
|
160 if (bucket->displayBucket_ != NULL) { |
|
161 bucket = bucket->displayBucket_; |
|
162 } |
|
163 return bucket->displayIndex_; |
|
164 } |
|
165 |
|
166 /** All of the buckets, visible and invisible. */ |
|
167 UVector *bucketList_; |
|
168 /** Just the visible buckets. */ |
|
169 UVector *immutableVisibleList_; |
|
170 }; |
|
171 |
|
172 BucketList::~BucketList() { |
|
173 delete bucketList_; |
|
174 if (immutableVisibleList_ != bucketList_) { |
|
175 delete immutableVisibleList_; |
|
176 } |
|
177 } |
|
178 |
|
179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { |
|
180 delete buckets_; |
|
181 delete collatorPrimaryOnly_; |
|
182 } |
|
183 |
|
184 int32_t |
|
185 AlphabeticIndex::ImmutableIndex::getBucketCount() const { |
|
186 return buckets_->getBucketCount(); |
|
187 } |
|
188 |
|
189 int32_t |
|
190 AlphabeticIndex::ImmutableIndex::getBucketIndex( |
|
191 const UnicodeString &name, UErrorCode &errorCode) const { |
|
192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); |
|
193 } |
|
194 |
|
195 const AlphabeticIndex::Bucket * |
|
196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { |
|
197 if (0 <= index && index < buckets_->getBucketCount()) { |
|
198 return icu::getBucket(*buckets_->immutableVisibleList_, index); |
|
199 } else { |
|
200 return NULL; |
|
201 } |
|
202 } |
|
203 |
|
204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) |
|
205 : inputList_(NULL), |
|
206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
|
207 maxLabelCount_(99), |
|
208 initialLabels_(NULL), firstCharsInScripts_(NULL), |
|
209 collator_(NULL), collatorPrimaryOnly_(NULL), |
|
210 buckets_(NULL) { |
|
211 init(&locale, status); |
|
212 } |
|
213 |
|
214 |
|
215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) |
|
216 : inputList_(NULL), |
|
217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), |
|
218 maxLabelCount_(99), |
|
219 initialLabels_(NULL), firstCharsInScripts_(NULL), |
|
220 collator_(collator), collatorPrimaryOnly_(NULL), |
|
221 buckets_(NULL) { |
|
222 init(NULL, status); |
|
223 } |
|
224 |
|
225 |
|
226 |
|
227 AlphabeticIndex::~AlphabeticIndex() { |
|
228 delete collator_; |
|
229 delete collatorPrimaryOnly_; |
|
230 delete firstCharsInScripts_; |
|
231 delete buckets_; |
|
232 delete inputList_; |
|
233 delete initialLabels_; |
|
234 } |
|
235 |
|
236 |
|
237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { |
|
238 if (U_FAILURE(status)) { |
|
239 return *this; |
|
240 } |
|
241 initialLabels_->addAll(additions); |
|
242 clearBuckets(); |
|
243 return *this; |
|
244 } |
|
245 |
|
246 |
|
247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { |
|
248 addIndexExemplars(locale, status); |
|
249 clearBuckets(); |
|
250 return *this; |
|
251 } |
|
252 |
|
253 |
|
254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { |
|
255 if (U_FAILURE(errorCode)) { return NULL; } |
|
256 // In C++, the ImmutableIndex must own its copy of the BucketList, |
|
257 // even if it contains no records, for proper memory management. |
|
258 // We could clone the buckets_ if they are not NULL, |
|
259 // but that would be worth it only if this method is called multiple times, |
|
260 // or called after using the old-style bucket iterator API. |
|
261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); |
|
262 LocalPointer<RuleBasedCollator> coll( |
|
263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); |
|
264 if (immutableBucketList.isNull() || coll.isNull()) { |
|
265 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
266 return NULL; |
|
267 } |
|
268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); |
|
269 if (immIndex == NULL) { |
|
270 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
271 return NULL; |
|
272 } |
|
273 // The ImmutableIndex adopted its parameter objects. |
|
274 immutableBucketList.orphan(); |
|
275 coll.orphan(); |
|
276 return immIndex; |
|
277 } |
|
278 |
|
279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { |
|
280 initBuckets(status); |
|
281 if (U_FAILURE(status)) { |
|
282 return 0; |
|
283 } |
|
284 return buckets_->getBucketCount(); |
|
285 } |
|
286 |
|
287 |
|
288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { |
|
289 if (U_FAILURE(status) || inputList_ == NULL) { |
|
290 return 0; |
|
291 } |
|
292 return inputList_->size(); |
|
293 } |
|
294 |
|
295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { |
|
296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); |
|
297 if (U_FAILURE(errorCode)) { return; } |
|
298 |
|
299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); |
|
300 const UnicodeString &overflowBoundary = |
|
301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); |
|
302 |
|
303 // We make a sorted array of elements. |
|
304 // Some of the input may be redundant. |
|
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". |
|
306 // We filter out those cases. |
|
307 UnicodeSetIterator iter(*initialLabels_); |
|
308 while (iter.next()) { |
|
309 const UnicodeString *item = &iter.getString(); |
|
310 LocalPointer<UnicodeString> ownedItem; |
|
311 UBool checkDistinct; |
|
312 int32_t itemLength = item->length(); |
|
313 if (!item->hasMoreChar32Than(0, itemLength, 1)) { |
|
314 checkDistinct = FALSE; |
|
315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' |
|
316 item->charAt(itemLength - 2) != 0x2a) { |
|
317 // Use a label if it is marked with one trailing star, |
|
318 // even if the label string sorts the same when all contractions are suppressed. |
|
319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); |
|
320 item = ownedItem.getAlias(); |
|
321 if (item == NULL) { |
|
322 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
323 return; |
|
324 } |
|
325 checkDistinct = FALSE; |
|
326 } else { |
|
327 checkDistinct = TRUE; |
|
328 } |
|
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { |
|
330 // Ignore a primary-ignorable or non-alphabetic index character. |
|
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { |
|
332 // Ignore an index characters that will land in the overflow bucket. |
|
333 } else if (checkDistinct && |
|
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { |
|
335 // Ignore a multi-code point index character that does not sort distinctly |
|
336 // from the sequence of its separate characters. |
|
337 } else { |
|
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); |
|
339 if (insertionPoint < 0) { |
|
340 indexCharacters.insertElementAt( |
|
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); |
|
342 } else { |
|
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); |
|
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { |
|
345 indexCharacters.setElementAt( |
|
346 ownedString(*item, ownedItem, errorCode), insertionPoint); |
|
347 } |
|
348 } |
|
349 } |
|
350 } |
|
351 if (U_FAILURE(errorCode)) { return; } |
|
352 |
|
353 // if the result is still too large, cut down to maxCount elements, by removing every nth element |
|
354 |
|
355 int32_t size = indexCharacters.size() - 1; |
|
356 if (size > maxLabelCount_) { |
|
357 int32_t count = 0; |
|
358 int32_t old = -1; |
|
359 for (int32_t i = 0; i < indexCharacters.size();) { |
|
360 ++count; |
|
361 int32_t bump = count * maxLabelCount_ / size; |
|
362 if (bump == old) { |
|
363 indexCharacters.removeElementAt(i); |
|
364 } else { |
|
365 old = bump; |
|
366 ++i; |
|
367 } |
|
368 } |
|
369 } |
|
370 } |
|
371 |
|
372 namespace { |
|
373 |
|
374 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { |
|
375 if (!current.startsWith(BASE, BASE_LENGTH)) { |
|
376 return current; |
|
377 } |
|
378 UChar rest = current.charAt(BASE_LENGTH); |
|
379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count |
|
380 int32_t count = rest-0x2800; |
|
381 temp.setTo((UChar)(0x30 + count % 10)); |
|
382 if (count >= 10) { |
|
383 count /= 10; |
|
384 temp.insert(0, (UChar)(0x30 + count % 10)); |
|
385 if (count >= 10) { |
|
386 count /= 10; |
|
387 temp.insert(0, (UChar)(0x30 + count)); |
|
388 } |
|
389 } |
|
390 return temp.append((UChar)0x5283); |
|
391 } |
|
392 return temp.setTo(current, BASE_LENGTH); |
|
393 } |
|
394 |
|
395 UBool hasMultiplePrimaryWeights( |
|
396 CollationElementIterator &cei, int32_t variableTop, |
|
397 const UnicodeString &s, UErrorCode &errorCode) { |
|
398 cei.setText(s, errorCode); |
|
399 UBool seenPrimary = FALSE; |
|
400 for (;;) { |
|
401 int32_t ce32 = cei.next(errorCode); |
|
402 if (ce32 == CollationElementIterator::NULLORDER) { |
|
403 break; |
|
404 } |
|
405 int32_t p = CollationElementIterator::primaryOrder(ce32); |
|
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) { |
|
407 // not primary ignorable, and not a continuation CE |
|
408 if (seenPrimary) { |
|
409 return TRUE; |
|
410 } |
|
411 seenPrimary = TRUE; |
|
412 } |
|
413 } |
|
414 return FALSE; |
|
415 } |
|
416 |
|
417 } // namespace |
|
418 |
|
419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
|
420 // Initialize indexCharacters. |
|
421 UVector indexCharacters(errorCode); |
|
422 indexCharacters.setDeleter(uprv_deleteUObject); |
|
423 initLabels(indexCharacters, errorCode); |
|
424 if (U_FAILURE(errorCode)) { return NULL; } |
|
425 |
|
426 // Variables for hasMultiplePrimaryWeights(). |
|
427 LocalPointer<CollationElementIterator> cei( |
|
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); |
|
429 if (cei.isNull()) { |
|
430 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
431 return NULL; |
|
432 } |
|
433 int32_t variableTop; |
|
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { |
|
435 variableTop = CollationElementIterator::primaryOrder( |
|
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); |
|
437 } else { |
|
438 variableTop = 0; |
|
439 } |
|
440 UBool hasInvisibleBuckets = FALSE; |
|
441 |
|
442 // Helper arrays for Chinese Pinyin collation. |
|
443 Bucket *asciiBuckets[26] = { |
|
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
|
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
|
446 }; |
|
447 Bucket *pinyinBuckets[26] = { |
|
448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
|
449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
|
450 }; |
|
451 UBool hasPinyin = FALSE; |
|
452 |
|
453 LocalPointer<UVector> bucketList(new UVector(errorCode)); |
|
454 if (bucketList.isNull()) { |
|
455 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
456 return NULL; |
|
457 } |
|
458 bucketList->setDeleter(uprv_deleteUObject); |
|
459 |
|
460 // underflow bucket |
|
461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); |
|
462 if (bucket == NULL) { |
|
463 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
464 return NULL; |
|
465 } |
|
466 bucketList->addElement(bucket, errorCode); |
|
467 if (U_FAILURE(errorCode)) { return NULL; } |
|
468 |
|
469 UnicodeString temp; |
|
470 |
|
471 // fix up the list, adding underflow, additions, overflow |
|
472 // Insert inflow labels as needed. |
|
473 int32_t scriptIndex = -1; |
|
474 const UnicodeString *scriptUpperBoundary = &emptyString_; |
|
475 for (int32_t i = 0; i < indexCharacters.size(); ++i) { |
|
476 UnicodeString ¤t = *getString(indexCharacters, i); |
|
477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { |
|
478 // We crossed the script boundary into a new script. |
|
479 const UnicodeString &inflowBoundary = *scriptUpperBoundary; |
|
480 UBool skippedScript = FALSE; |
|
481 for (;;) { |
|
482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); |
|
483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { |
|
484 break; |
|
485 } |
|
486 skippedScript = TRUE; |
|
487 } |
|
488 if (skippedScript && bucketList->size() > 1) { |
|
489 // We are skipping one or more scripts, |
|
490 // and we are not just getting out of the underflow label. |
|
491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); |
|
492 if (bucket == NULL) { |
|
493 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
494 return NULL; |
|
495 } |
|
496 bucketList->addElement(bucket, errorCode); |
|
497 } |
|
498 } |
|
499 // Add a bucket with the current label. |
|
500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); |
|
501 if (bucket == NULL) { |
|
502 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
503 return NULL; |
|
504 } |
|
505 bucketList->addElement(bucket, errorCode); |
|
506 // Remember ASCII and Pinyin buckets for Pinyin redirects. |
|
507 UChar c; |
|
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z |
|
509 asciiBuckets[c - 0x41] = bucket; |
|
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && |
|
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { |
|
512 pinyinBuckets[c - 0x41] = bucket; |
|
513 hasPinyin = TRUE; |
|
514 } |
|
515 // Check for multiple primary weights. |
|
516 if (!current.startsWith(BASE, BASE_LENGTH) && |
|
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && |
|
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { |
|
519 // "AE-ligature" or "Sch" etc. |
|
520 for (int32_t i = bucketList->size() - 2;; --i) { |
|
521 Bucket *singleBucket = getBucket(*bucketList, i); |
|
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
|
523 // There is no single-character bucket since the last |
|
524 // underflow or inflow label. |
|
525 break; |
|
526 } |
|
527 if (singleBucket->displayBucket_ == NULL && |
|
528 !hasMultiplePrimaryWeights( |
|
529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { |
|
530 // Add an invisible bucket that redirects strings greater than the expansion |
|
531 // to the previous single-character bucket. |
|
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S |
|
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. |
|
534 bucket = new Bucket(emptyString_, |
|
535 UnicodeString(current).append((UChar)0xFFFF), |
|
536 U_ALPHAINDEX_NORMAL); |
|
537 if (bucket == NULL) { |
|
538 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
539 return NULL; |
|
540 } |
|
541 bucket->displayBucket_ = singleBucket; |
|
542 bucketList->addElement(bucket, errorCode); |
|
543 hasInvisibleBuckets = TRUE; |
|
544 break; |
|
545 } |
|
546 } |
|
547 } |
|
548 } |
|
549 if (U_FAILURE(errorCode)) { return NULL; } |
|
550 if (bucketList->size() == 1) { |
|
551 // No real labels, show only the underflow label. |
|
552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
|
553 if (bl == NULL) { |
|
554 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
555 return NULL; |
|
556 } |
|
557 bucketList.orphan(); |
|
558 return bl; |
|
559 } |
|
560 // overflow bucket |
|
561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); |
|
562 if (bucket == NULL) { |
|
563 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
564 return NULL; |
|
565 } |
|
566 bucketList->addElement(bucket, errorCode); // final |
|
567 |
|
568 if (hasPinyin) { |
|
569 // Redirect Pinyin buckets. |
|
570 Bucket *asciiBucket = NULL; |
|
571 for (int32_t i = 0; i < 26; ++i) { |
|
572 if (asciiBuckets[i] != NULL) { |
|
573 asciiBucket = asciiBuckets[i]; |
|
574 } |
|
575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { |
|
576 pinyinBuckets[i]->displayBucket_ = asciiBucket; |
|
577 hasInvisibleBuckets = TRUE; |
|
578 } |
|
579 } |
|
580 } |
|
581 |
|
582 if (U_FAILURE(errorCode)) { return NULL; } |
|
583 if (!hasInvisibleBuckets) { |
|
584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); |
|
585 if (bl == NULL) { |
|
586 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
587 return NULL; |
|
588 } |
|
589 bucketList.orphan(); |
|
590 return bl; |
|
591 } |
|
592 // Merge inflow buckets that are visually adjacent. |
|
593 // Iterate backwards: Merge inflow into overflow rather than the other way around. |
|
594 int32_t i = bucketList->size() - 1; |
|
595 Bucket *nextBucket = getBucket(*bucketList, i); |
|
596 while (--i > 0) { |
|
597 bucket = getBucket(*bucketList, i); |
|
598 if (bucket->displayBucket_ != NULL) { |
|
599 continue; // skip invisible buckets |
|
600 } |
|
601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { |
|
602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
|
603 bucket->displayBucket_ = nextBucket; |
|
604 continue; |
|
605 } |
|
606 } |
|
607 nextBucket = bucket; |
|
608 } |
|
609 |
|
610 LocalPointer<UVector> publicBucketList(new UVector(errorCode)); |
|
611 if (bucketList.isNull()) { |
|
612 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
613 return NULL; |
|
614 } |
|
615 // Do not call publicBucketList->setDeleter(): |
|
616 // This vector shares its objects with the bucketList. |
|
617 for (int32_t i = 0; i < bucketList->size(); ++i) { |
|
618 bucket = getBucket(*bucketList, i); |
|
619 if (bucket->displayBucket_ == NULL) { |
|
620 publicBucketList->addElement(bucket, errorCode); |
|
621 } |
|
622 } |
|
623 if (U_FAILURE(errorCode)) { return NULL; } |
|
624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); |
|
625 if (bl == NULL) { |
|
626 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
627 return NULL; |
|
628 } |
|
629 bucketList.orphan(); |
|
630 publicBucketList.orphan(); |
|
631 return bl; |
|
632 } |
|
633 |
|
634 /** |
|
635 * Creates an index, and buckets and sorts the list of records into the index. |
|
636 */ |
|
637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { |
|
638 if (U_FAILURE(errorCode) || buckets_ != NULL) { |
|
639 return; |
|
640 } |
|
641 buckets_ = createBucketList(errorCode); |
|
642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { |
|
643 return; |
|
644 } |
|
645 |
|
646 // Sort the records by name. |
|
647 // Stable sort preserves input order of collation duplicates. |
|
648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); |
|
649 |
|
650 // Now, we traverse all of the input, which is now sorted. |
|
651 // If the item doesn't go in the current bucket, we find the next bucket that contains it. |
|
652 // This makes the process order n*log(n), since we just sort the list and then do a linear process. |
|
653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so |
|
654 // we need to improve it for that case. |
|
655 |
|
656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); |
|
657 int32_t bucketIndex = 1; |
|
658 Bucket *nextBucket; |
|
659 const UnicodeString *upperBoundary; |
|
660 if (bucketIndex < buckets_->bucketList_->size()) { |
|
661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
|
662 upperBoundary = &nextBucket->lowerBoundary_; |
|
663 } else { |
|
664 nextBucket = NULL; |
|
665 upperBoundary = NULL; |
|
666 } |
|
667 for (int32_t i = 0; i < inputList_->size(); ++i) { |
|
668 Record *r = getRecord(*inputList_, i); |
|
669 // if the current bucket isn't the right one, find the one that is |
|
670 // We have a special flag for the last bucket so that we don't look any further |
|
671 while (upperBoundary != NULL && |
|
672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { |
|
673 currentBucket = nextBucket; |
|
674 // now reset the boundary that we compare against |
|
675 if (bucketIndex < buckets_->bucketList_->size()) { |
|
676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); |
|
677 upperBoundary = &nextBucket->lowerBoundary_; |
|
678 } else { |
|
679 upperBoundary = NULL; |
|
680 } |
|
681 } |
|
682 // now put the record into the bucket. |
|
683 Bucket *bucket = currentBucket; |
|
684 if (bucket->displayBucket_ != NULL) { |
|
685 bucket = bucket->displayBucket_; |
|
686 } |
|
687 if (bucket->records_ == NULL) { |
|
688 bucket->records_ = new UVector(errorCode); |
|
689 if (bucket->records_ == NULL) { |
|
690 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
691 return; |
|
692 } |
|
693 } |
|
694 bucket->records_->addElement(r, errorCode); |
|
695 } |
|
696 } |
|
697 |
|
698 void AlphabeticIndex::clearBuckets() { |
|
699 if (buckets_ != NULL) { |
|
700 delete buckets_; |
|
701 buckets_ = NULL; |
|
702 internalResetBucketIterator(); |
|
703 } |
|
704 } |
|
705 |
|
706 void AlphabeticIndex::internalResetBucketIterator() { |
|
707 labelsIterIndex_ = -1; |
|
708 currentBucket_ = NULL; |
|
709 } |
|
710 |
|
711 |
|
712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { |
|
713 if (U_FAILURE(status)) { return; } |
|
714 // Chinese index characters, which are specific to each of the several Chinese tailorings, |
|
715 // take precedence over the single locale data exemplar set per language. |
|
716 const char *language = locale.getLanguage(); |
|
717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || |
|
718 uprv_strcmp(language, "ko") == 0) { |
|
719 // TODO: This should be done regardless of the language, but it's expensive. |
|
720 // We should add a Collator function (can be @internal) |
|
721 // to enumerate just the contractions that start with a given code point or string. |
|
722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) { |
|
723 return; |
|
724 } |
|
725 } |
|
726 |
|
727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
|
728 if (U_FAILURE(status)) { |
|
729 return; |
|
730 } |
|
731 |
|
732 UnicodeSet exemplars; |
|
733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); |
|
734 if (U_SUCCESS(status)) { |
|
735 initialLabels_->addAll(exemplars); |
|
736 return; |
|
737 } |
|
738 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR |
|
739 |
|
740 // The locale data did not include explicit Index characters. |
|
741 // Synthesize a set of them from the locale's standard exemplar characters. |
|
742 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); |
|
743 if (U_FAILURE(status)) { |
|
744 return; |
|
745 } |
|
746 |
|
747 // question: should we add auxiliary exemplars? |
|
748 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { |
|
749 exemplars.add(0x61, 0x7A); |
|
750 } |
|
751 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables |
|
752 // cut down to small list |
|
753 exemplars.remove(0xAC00, 0xD7A3). |
|
754 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). |
|
755 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). |
|
756 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). |
|
757 add(0xD30C).add(0xD558); |
|
758 } |
|
759 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block |
|
760 // cut down to small list |
|
761 // make use of the fact that Ethiopic is allocated in 8's, where |
|
762 // the base is 0 mod 8. |
|
763 UnicodeSet ethiopic( |
|
764 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); |
|
765 UnicodeSetIterator it(ethiopic); |
|
766 while (it.next() && !it.isString()) { |
|
767 if ((it.getCodepoint() & 0x7) != 0) { |
|
768 exemplars.remove(it.getCodepoint()); |
|
769 } |
|
770 } |
|
771 } |
|
772 |
|
773 // Upper-case any that aren't already so. |
|
774 // (We only do this for synthesized index characters.) |
|
775 UnicodeSetIterator it(exemplars); |
|
776 UnicodeString upperC; |
|
777 while (it.next()) { |
|
778 const UnicodeString &exemplarC = it.getString(); |
|
779 upperC = exemplarC; |
|
780 upperC.toUpper(locale); |
|
781 initialLabels_->add(upperC); |
|
782 } |
|
783 } |
|
784 |
|
785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { |
|
786 UnicodeSet contractions; |
|
787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), |
|
788 contractions.toUSet(), NULL, FALSE, &errorCode); |
|
789 if (U_FAILURE(errorCode)) { return FALSE; } |
|
790 UnicodeString firstHanBoundary; |
|
791 UBool hasPinyin = FALSE; |
|
792 UnicodeSetIterator iter(contractions); |
|
793 while (iter.next()) { |
|
794 const UnicodeString &s = iter.getString(); |
|
795 if (s.startsWith(BASE, BASE_LENGTH)) { |
|
796 initialLabels_->add(s); |
|
797 if (firstHanBoundary.isEmpty() || |
|
798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { |
|
799 firstHanBoundary = s; |
|
800 } |
|
801 UChar c = s.charAt(s.length() - 1); |
|
802 if (0x41 <= c && c <= 0x5A) { // A-Z |
|
803 hasPinyin = TRUE; |
|
804 } |
|
805 } |
|
806 } |
|
807 if (hasPinyin) { |
|
808 initialLabels_->add(0x41, 0x5A); // A-Z |
|
809 } |
|
810 if (!firstHanBoundary.isEmpty()) { |
|
811 // The hardcoded list of script boundaries includes U+4E00 |
|
812 // which is tailored to not be the first primary |
|
813 // in all Chinese tailorings except "unihan". |
|
814 // Replace U+4E00 with the first boundary string from the tailoring. |
|
815 // TODO: This becomes obsolete when the root collator gets |
|
816 // reliable script-first-primary mappings. |
|
817 int32_t hanIndex = binarySearch( |
|
818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); |
|
819 if (hanIndex >= 0) { |
|
820 UnicodeString *fh = new UnicodeString(firstHanBoundary); |
|
821 if (fh == NULL) { |
|
822 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
823 return FALSE; |
|
824 } |
|
825 firstCharsInScripts_->setElementAt(fh, hanIndex); |
|
826 } |
|
827 return TRUE; |
|
828 } else { |
|
829 return FALSE; |
|
830 } |
|
831 } |
|
832 |
|
833 |
|
834 /* |
|
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. |
|
836 */ |
|
837 static const UChar CGJ = 0x034F; |
|
838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { |
|
839 UnicodeString result; |
|
840 if (item.length() == 0) { |
|
841 return result; |
|
842 } |
|
843 int32_t i = 0; |
|
844 for (;;) { |
|
845 UChar32 cp = item.char32At(i); |
|
846 result.append(cp); |
|
847 i = item.moveIndex32(i, 1); |
|
848 if (i >= item.length()) { |
|
849 break; |
|
850 } |
|
851 result.append(CGJ); |
|
852 } |
|
853 return result; |
|
854 } |
|
855 |
|
856 |
|
857 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { |
|
858 return FALSE; |
|
859 } |
|
860 |
|
861 |
|
862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
|
863 return FALSE; |
|
864 } |
|
865 |
|
866 |
|
867 const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
|
868 // There are no known non-RuleBasedCollator collators, and none ever expected. |
|
869 // But, in case that changes, better a null pointer than a wrong type. |
|
870 return *dynamic_cast<RuleBasedCollator *>(collator_); |
|
871 } |
|
872 |
|
873 |
|
874 const UnicodeString &AlphabeticIndex::getInflowLabel() const { |
|
875 return inflowLabel_; |
|
876 } |
|
877 |
|
878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { |
|
879 return overflowLabel_; |
|
880 } |
|
881 |
|
882 |
|
883 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { |
|
884 return underflowLabel_; |
|
885 } |
|
886 |
|
887 |
|
888 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
|
889 inflowLabel_ = label; |
|
890 clearBuckets(); |
|
891 return *this; |
|
892 } |
|
893 |
|
894 |
|
895 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
|
896 overflowLabel_ = label; |
|
897 clearBuckets(); |
|
898 return *this; |
|
899 } |
|
900 |
|
901 |
|
902 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
|
903 underflowLabel_ = label; |
|
904 clearBuckets(); |
|
905 return *this; |
|
906 } |
|
907 |
|
908 |
|
909 int32_t AlphabeticIndex::getMaxLabelCount() const { |
|
910 return maxLabelCount_; |
|
911 } |
|
912 |
|
913 |
|
914 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { |
|
915 if (U_FAILURE(status)) { |
|
916 return *this; |
|
917 } |
|
918 if (maxLabelCount <= 0) { |
|
919 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
920 return *this; |
|
921 } |
|
922 maxLabelCount_ = maxLabelCount; |
|
923 clearBuckets(); |
|
924 return *this; |
|
925 } |
|
926 |
|
927 |
|
928 // |
|
929 // init() - Common code for constructors. |
|
930 // |
|
931 |
|
932 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
|
933 if (U_FAILURE(status)) { return; } |
|
934 if (locale == NULL && collator_ == NULL) { |
|
935 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
936 return; |
|
937 } |
|
938 |
|
939 initialLabels_ = new UnicodeSet(); |
|
940 if (initialLabels_ == NULL) { |
|
941 status = U_MEMORY_ALLOCATION_ERROR; |
|
942 return; |
|
943 } |
|
944 |
|
945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis |
|
946 overflowLabel_ = inflowLabel_; |
|
947 underflowLabel_ = inflowLabel_; |
|
948 |
|
949 if (collator_ == NULL) { |
|
950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); |
|
951 if (U_FAILURE(status)) { return; } |
|
952 if (collator_ == NULL) { |
|
953 status = U_MEMORY_ALLOCATION_ERROR; |
|
954 return; |
|
955 } |
|
956 } |
|
957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); |
|
958 if (collatorPrimaryOnly_ == NULL) { |
|
959 status = U_MEMORY_ALLOCATION_ERROR; |
|
960 return; |
|
961 } |
|
962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); |
|
963 firstCharsInScripts_ = firstStringsInScript(status); |
|
964 if (U_FAILURE(status)) { return; } |
|
965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); |
|
966 UnicodeString _4E00((UChar)0x4E00); |
|
967 UnicodeString _1100((UChar)0x1100); |
|
968 UnicodeString _1112((UChar)0x1112); |
|
969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && |
|
970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { |
|
971 // The standard Korean tailoring sorts Hanja (Han characters) |
|
972 // as secondary differences from Hangul syllables. |
|
973 // This makes U+4E00 not useful as a Han-script boundary. |
|
974 // TODO: This becomes obsolete when the root collator gets |
|
975 // reliable script-first-primary mappings. |
|
976 int32_t hanIndex = binarySearch( |
|
977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); |
|
978 if (hanIndex >= 0) { |
|
979 firstCharsInScripts_->removeElementAt(hanIndex); |
|
980 } |
|
981 } |
|
982 // Guard against a degenerate collator where |
|
983 // some script boundary strings are primary ignorable. |
|
984 for (;;) { |
|
985 if (U_FAILURE(status)) { return; } |
|
986 if (firstCharsInScripts_->isEmpty()) { |
|
987 // AlphabeticIndex requires some non-ignorable script boundary strings. |
|
988 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
989 return; |
|
990 } |
|
991 if (collatorPrimaryOnly_->compare( |
|
992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), |
|
993 emptyString_, status) == UCOL_EQUAL) { |
|
994 firstCharsInScripts_->removeElementAt(0); |
|
995 } else { |
|
996 break; |
|
997 } |
|
998 } |
|
999 |
|
1000 if (locale != NULL) { |
|
1001 addIndexExemplars(*locale, status); |
|
1002 } |
|
1003 } |
|
1004 |
|
1005 |
|
1006 // |
|
1007 // Comparison function for UVector<UnicodeString *> sorting with a collator. |
|
1008 // |
|
1009 static int32_t U_CALLCONV |
|
1010 collatorComparator(const void *context, const void *left, const void *right) { |
|
1011 const UElement *leftElement = static_cast<const UElement *>(left); |
|
1012 const UElement *rightElement = static_cast<const UElement *>(right); |
|
1013 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); |
|
1014 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); |
|
1015 |
|
1016 if (leftString == rightString) { |
|
1017 // Catches case where both are NULL |
|
1018 return 0; |
|
1019 } |
|
1020 if (leftString == NULL) { |
|
1021 return 1; |
|
1022 }; |
|
1023 if (rightString == NULL) { |
|
1024 return -1; |
|
1025 } |
|
1026 const Collator *col = static_cast<const Collator *>(context); |
|
1027 UErrorCode errorCode = U_ZERO_ERROR; |
|
1028 return col->compare(*leftString, *rightString, errorCode); |
|
1029 } |
|
1030 |
|
1031 // |
|
1032 // Comparison function for UVector<Record *> sorting with a collator. |
|
1033 // |
|
1034 static int32_t U_CALLCONV |
|
1035 recordCompareFn(const void *context, const void *left, const void *right) { |
|
1036 const UElement *leftElement = static_cast<const UElement *>(left); |
|
1037 const UElement *rightElement = static_cast<const UElement *>(right); |
|
1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); |
|
1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); |
|
1040 const Collator *col = static_cast<const Collator *>(context); |
|
1041 UErrorCode errorCode = U_ZERO_ERROR; |
|
1042 return col->compare(leftRec->name_, rightRec->name_, errorCode); |
|
1043 } |
|
1044 |
|
1045 |
|
1046 /** |
|
1047 * This list contains one character per script that has the |
|
1048 * lowest primary weight for that script in the root collator. |
|
1049 * This list will be copied and sorted to account for script reordering. |
|
1050 * |
|
1051 * <p>TODO: This is fragile. If the first character of a script is tailored |
|
1052 * so that it does not map to the script's lowest primary weight any more, |
|
1053 * then the buckets will be off. |
|
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00. |
|
1055 * |
|
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. |
|
1057 * |
|
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in |
|
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java |
|
1060 */ |
|
1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { |
|
1062 0x41, 0, 0x03B1, 0, |
|
1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, |
|
1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, |
|
1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, |
|
1066 0xAAF2, 0, // Meetei Mayek |
|
1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, |
|
1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada |
|
1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri |
|
1070 0x1B83, 0, |
|
1071 0xD802, 0xDE00, 0, 0x0E01, 0, |
|
1072 0x0EDE, 0, // Lao |
|
1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, |
|
1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, |
|
1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma |
|
1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, |
|
1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, |
|
1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, |
|
1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao |
|
1080 0xD800, 0xDE80, 0, |
|
1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, |
|
1082 0xD801, 0xDC80, 0, |
|
1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng |
|
1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, |
|
1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, |
|
1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive |
|
1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs |
|
1088 0x4E00, 0, |
|
1089 // TODO: The overflow bucket's lowerBoundary string should be the |
|
1090 // first item after the last reordering group in the collator's script order. |
|
1091 // This should normally be the first Unicode code point |
|
1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored. |
|
1093 // However, at least up to ICU 51 the Hani reordering group includes |
|
1094 // unassigned code points, |
|
1095 // and there is no stable string for the start of the trailing-weights range. |
|
1096 // The only known string that sorts "high" is U+FFFF. |
|
1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, |
|
1098 // and fix relevant test code. |
|
1099 // Ideally, FractionalUCA.txt will have a "script first primary" |
|
1100 // for unassigned code points. |
|
1101 0xFFFF, 0 |
|
1102 }; |
|
1103 |
|
1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
|
1105 if (U_FAILURE(status)) { |
|
1106 return NULL; |
|
1107 } |
|
1108 UVector *dest = new UVector(status); |
|
1109 if (dest == NULL) { |
|
1110 status = U_MEMORY_ALLOCATION_ERROR; |
|
1111 return NULL; |
|
1112 } |
|
1113 dest->setDeleter(uprv_deleteUObject); |
|
1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; |
|
1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); |
|
1116 do { |
|
1117 if (U_FAILURE(status)) { |
|
1118 return dest; |
|
1119 } |
|
1120 UnicodeString *str = new UnicodeString(src, -1); |
|
1121 if (str == NULL) { |
|
1122 status = U_MEMORY_ALLOCATION_ERROR; |
|
1123 return dest; |
|
1124 } |
|
1125 dest->addElement(str, status); |
|
1126 src += str->length() + 1; |
|
1127 } while (src < limit); |
|
1128 return dest; |
|
1129 } |
|
1130 |
|
1131 |
|
1132 namespace { |
|
1133 |
|
1134 /** |
|
1135 * Returns true if one index character string is "better" than the other. |
|
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is |
|
1137 * better, and otherwise binary-less-than is better. |
|
1138 */ |
|
1139 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, |
|
1140 const UnicodeString &one, const UnicodeString &other) { |
|
1141 // This is called with primary-equal strings, but never with one.equals(other). |
|
1142 UErrorCode status = U_ZERO_ERROR; |
|
1143 UnicodeString n1 = nfkdNormalizer.normalize(one, status); |
|
1144 UnicodeString n2 = nfkdNormalizer.normalize(other, status); |
|
1145 if (U_FAILURE(status)) { return FALSE; } |
|
1146 int32_t result = n1.countChar32() - n2.countChar32(); |
|
1147 if (result != 0) { |
|
1148 return result < 0; |
|
1149 } |
|
1150 result = n1.compareCodePointOrder(n2); |
|
1151 if (result != 0) { |
|
1152 return result < 0; |
|
1153 } |
|
1154 return one.compareCodePointOrder(other) < 0; |
|
1155 } |
|
1156 |
|
1157 } // namespace |
|
1158 |
|
1159 // |
|
1160 // Constructor & Destructor for AlphabeticIndex::Record |
|
1161 // |
|
1162 // Records are internal only, instances are not directly surfaced in the public API. |
|
1163 // This class is mostly struct-like, with all public fields. |
|
1164 |
|
1165 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) |
|
1166 : name_(name), data_(data) {} |
|
1167 |
|
1168 AlphabeticIndex::Record::~Record() { |
|
1169 } |
|
1170 |
|
1171 |
|
1172 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { |
|
1173 if (U_FAILURE(status)) { |
|
1174 return *this; |
|
1175 } |
|
1176 if (inputList_ == NULL) { |
|
1177 inputList_ = new UVector(status); |
|
1178 if (inputList_ == NULL) { |
|
1179 status = U_MEMORY_ALLOCATION_ERROR; |
|
1180 return *this; |
|
1181 } |
|
1182 inputList_->setDeleter(alphaIndex_deleteRecord); |
|
1183 } |
|
1184 Record *r = new Record(name, data); |
|
1185 if (r == NULL) { |
|
1186 status = U_MEMORY_ALLOCATION_ERROR; |
|
1187 return *this; |
|
1188 } |
|
1189 inputList_->addElement(r, status); |
|
1190 clearBuckets(); |
|
1191 //std::string ss; |
|
1192 //std::string ss2; |
|
1193 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << |
|
1194 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; |
|
1195 return *this; |
|
1196 } |
|
1197 |
|
1198 |
|
1199 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { |
|
1200 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { |
|
1201 inputList_->removeAllElements(); |
|
1202 clearBuckets(); |
|
1203 } |
|
1204 return *this; |
|
1205 } |
|
1206 |
|
1207 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { |
|
1208 initBuckets(status); |
|
1209 if (U_FAILURE(status)) { |
|
1210 return 0; |
|
1211 } |
|
1212 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); |
|
1213 } |
|
1214 |
|
1215 |
|
1216 int32_t AlphabeticIndex::getBucketIndex() const { |
|
1217 return labelsIterIndex_; |
|
1218 } |
|
1219 |
|
1220 |
|
1221 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { |
|
1222 if (U_FAILURE(status)) { |
|
1223 return FALSE; |
|
1224 } |
|
1225 if (buckets_ == NULL && currentBucket_ != NULL) { |
|
1226 status = U_ENUM_OUT_OF_SYNC_ERROR; |
|
1227 return FALSE; |
|
1228 } |
|
1229 initBuckets(status); |
|
1230 if (U_FAILURE(status)) { |
|
1231 return FALSE; |
|
1232 } |
|
1233 ++labelsIterIndex_; |
|
1234 if (labelsIterIndex_ >= buckets_->getBucketCount()) { |
|
1235 labelsIterIndex_ = buckets_->getBucketCount(); |
|
1236 return FALSE; |
|
1237 } |
|
1238 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); |
|
1239 resetRecordIterator(); |
|
1240 return TRUE; |
|
1241 } |
|
1242 |
|
1243 const UnicodeString &AlphabeticIndex::getBucketLabel() const { |
|
1244 if (currentBucket_ != NULL) { |
|
1245 return currentBucket_->label_; |
|
1246 } else { |
|
1247 return emptyString_; |
|
1248 } |
|
1249 } |
|
1250 |
|
1251 |
|
1252 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { |
|
1253 if (currentBucket_ != NULL) { |
|
1254 return currentBucket_->labelType_; |
|
1255 } else { |
|
1256 return U_ALPHAINDEX_NORMAL; |
|
1257 } |
|
1258 } |
|
1259 |
|
1260 |
|
1261 int32_t AlphabeticIndex::getBucketRecordCount() const { |
|
1262 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { |
|
1263 return currentBucket_->records_->size(); |
|
1264 } else { |
|
1265 return 0; |
|
1266 } |
|
1267 } |
|
1268 |
|
1269 |
|
1270 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { |
|
1271 if (U_FAILURE(status)) { |
|
1272 return *this; |
|
1273 } |
|
1274 internalResetBucketIterator(); |
|
1275 return *this; |
|
1276 } |
|
1277 |
|
1278 |
|
1279 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { |
|
1280 if (U_FAILURE(status)) { |
|
1281 return FALSE; |
|
1282 } |
|
1283 if (currentBucket_ == NULL) { |
|
1284 // We are trying to iterate over the items in a bucket, but there is no |
|
1285 // current bucket from the enumeration of buckets. |
|
1286 status = U_INVALID_STATE_ERROR; |
|
1287 return FALSE; |
|
1288 } |
|
1289 if (buckets_ == NULL) { |
|
1290 status = U_ENUM_OUT_OF_SYNC_ERROR; |
|
1291 return FALSE; |
|
1292 } |
|
1293 if (currentBucket_->records_ == NULL) { |
|
1294 return FALSE; |
|
1295 } |
|
1296 ++itemsIterIndex_; |
|
1297 if (itemsIterIndex_ >= currentBucket_->records_->size()) { |
|
1298 itemsIterIndex_ = currentBucket_->records_->size(); |
|
1299 return FALSE; |
|
1300 } |
|
1301 return TRUE; |
|
1302 } |
|
1303 |
|
1304 |
|
1305 const UnicodeString &AlphabeticIndex::getRecordName() const { |
|
1306 const UnicodeString *retStr = &emptyString_; |
|
1307 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
|
1308 itemsIterIndex_ >= 0 && |
|
1309 itemsIterIndex_ < currentBucket_->records_->size()) { |
|
1310 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
|
1311 retStr = &item->name_; |
|
1312 } |
|
1313 return *retStr; |
|
1314 } |
|
1315 |
|
1316 const void *AlphabeticIndex::getRecordData() const { |
|
1317 const void *retPtr = NULL; |
|
1318 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && |
|
1319 itemsIterIndex_ >= 0 && |
|
1320 itemsIterIndex_ < currentBucket_->records_->size()) { |
|
1321 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
|
1322 retPtr = item->data_; |
|
1323 } |
|
1324 return retPtr; |
|
1325 } |
|
1326 |
|
1327 |
|
1328 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { |
|
1329 itemsIterIndex_ = -1; |
|
1330 return *this; |
|
1331 } |
|
1332 |
|
1333 |
|
1334 |
|
1335 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, |
|
1336 const UnicodeString &lowerBoundary, |
|
1337 UAlphabeticIndexLabelType type) |
|
1338 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), |
|
1339 displayBucket_(NULL), displayIndex_(-1), |
|
1340 records_(NULL) { |
|
1341 } |
|
1342 |
|
1343 |
|
1344 AlphabeticIndex::Bucket::~Bucket() { |
|
1345 delete records_; |
|
1346 } |
|
1347 |
|
1348 U_NAMESPACE_END |
|
1349 |
|
1350 #endif |