intl/icu/source/common/caniter.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *****************************************************************************
michael@0 3 * Copyright (C) 1996-2011, International Business Machines Corporation and *
michael@0 4 * others. All Rights Reserved. *
michael@0 5 *****************************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_NORMALIZATION
michael@0 11
michael@0 12 #include "unicode/caniter.h"
michael@0 13 #include "unicode/normalizer2.h"
michael@0 14 #include "unicode/uchar.h"
michael@0 15 #include "unicode/uniset.h"
michael@0 16 #include "unicode/usetiter.h"
michael@0 17 #include "unicode/ustring.h"
michael@0 18 #include "unicode/utf16.h"
michael@0 19 #include "cmemory.h"
michael@0 20 #include "hash.h"
michael@0 21 #include "normalizer2impl.h"
michael@0 22
michael@0 23 /**
michael@0 24 * This class allows one to iterate through all the strings that are canonically equivalent to a given
michael@0 25 * string. For example, here are some sample results:
michael@0 26 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
michael@0 27 1: \u0041\u030A\u0064\u0307\u0327
michael@0 28 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
michael@0 29 2: \u0041\u030A\u0064\u0327\u0307
michael@0 30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
michael@0 31 3: \u0041\u030A\u1E0B\u0327
michael@0 32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
michael@0 33 4: \u0041\u030A\u1E11\u0307
michael@0 34 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
michael@0 35 5: \u00C5\u0064\u0307\u0327
michael@0 36 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
michael@0 37 6: \u00C5\u0064\u0327\u0307
michael@0 38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
michael@0 39 7: \u00C5\u1E0B\u0327
michael@0 40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
michael@0 41 8: \u00C5\u1E11\u0307
michael@0 42 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
michael@0 43 9: \u212B\u0064\u0307\u0327
michael@0 44 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
michael@0 45 10: \u212B\u0064\u0327\u0307
michael@0 46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
michael@0 47 11: \u212B\u1E0B\u0327
michael@0 48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
michael@0 49 12: \u212B\u1E11\u0307
michael@0 50 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
michael@0 51 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
michael@0 52 * since it has not been optimized for that situation.
michael@0 53 *@author M. Davis
michael@0 54 *@draft
michael@0 55 */
michael@0 56
michael@0 57 // public
michael@0 58
michael@0 59 U_NAMESPACE_BEGIN
michael@0 60
michael@0 61 // TODO: add boilerplate methods.
michael@0 62
michael@0 63 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
michael@0 64
michael@0 65 /**
michael@0 66 *@param source string to get results for
michael@0 67 */
michael@0 68 CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
michael@0 69 pieces(NULL),
michael@0 70 pieces_length(0),
michael@0 71 pieces_lengths(NULL),
michael@0 72 current(NULL),
michael@0 73 current_length(0),
michael@0 74 nfd(*Normalizer2Factory::getNFDInstance(status)),
michael@0 75 nfcImpl(*Normalizer2Factory::getNFCImpl(status))
michael@0 76 {
michael@0 77 if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {
michael@0 78 setSource(sourceStr, status);
michael@0 79 }
michael@0 80 }
michael@0 81
michael@0 82 CanonicalIterator::~CanonicalIterator() {
michael@0 83 cleanPieces();
michael@0 84 }
michael@0 85
michael@0 86 void CanonicalIterator::cleanPieces() {
michael@0 87 int32_t i = 0;
michael@0 88 if(pieces != NULL) {
michael@0 89 for(i = 0; i < pieces_length; i++) {
michael@0 90 if(pieces[i] != NULL) {
michael@0 91 delete[] pieces[i];
michael@0 92 }
michael@0 93 }
michael@0 94 uprv_free(pieces);
michael@0 95 pieces = NULL;
michael@0 96 pieces_length = 0;
michael@0 97 }
michael@0 98 if(pieces_lengths != NULL) {
michael@0 99 uprv_free(pieces_lengths);
michael@0 100 pieces_lengths = NULL;
michael@0 101 }
michael@0 102 if(current != NULL) {
michael@0 103 uprv_free(current);
michael@0 104 current = NULL;
michael@0 105 current_length = 0;
michael@0 106 }
michael@0 107 }
michael@0 108
michael@0 109 /**
michael@0 110 *@return gets the source: NOTE: it is the NFD form of source
michael@0 111 */
michael@0 112 UnicodeString CanonicalIterator::getSource() {
michael@0 113 return source;
michael@0 114 }
michael@0 115
michael@0 116 /**
michael@0 117 * Resets the iterator so that one can start again from the beginning.
michael@0 118 */
michael@0 119 void CanonicalIterator::reset() {
michael@0 120 done = FALSE;
michael@0 121 for (int i = 0; i < current_length; ++i) {
michael@0 122 current[i] = 0;
michael@0 123 }
michael@0 124 }
michael@0 125
michael@0 126 /**
michael@0 127 *@return the next string that is canonically equivalent. The value null is returned when
michael@0 128 * the iteration is done.
michael@0 129 */
michael@0 130 UnicodeString CanonicalIterator::next() {
michael@0 131 int32_t i = 0;
michael@0 132
michael@0 133 if (done) {
michael@0 134 buffer.setToBogus();
michael@0 135 return buffer;
michael@0 136 }
michael@0 137
michael@0 138 // delete old contents
michael@0 139 buffer.remove();
michael@0 140
michael@0 141 // construct return value
michael@0 142
michael@0 143 for (i = 0; i < pieces_length; ++i) {
michael@0 144 buffer.append(pieces[i][current[i]]);
michael@0 145 }
michael@0 146 //String result = buffer.toString(); // not needed
michael@0 147
michael@0 148 // find next value for next time
michael@0 149
michael@0 150 for (i = current_length - 1; ; --i) {
michael@0 151 if (i < 0) {
michael@0 152 done = TRUE;
michael@0 153 break;
michael@0 154 }
michael@0 155 current[i]++;
michael@0 156 if (current[i] < pieces_lengths[i]) break; // got sequence
michael@0 157 current[i] = 0;
michael@0 158 }
michael@0 159 return buffer;
michael@0 160 }
michael@0 161
michael@0 162 /**
michael@0 163 *@param set the source string to iterate against. This allows the same iterator to be used
michael@0 164 * while changing the source string, saving object creation.
michael@0 165 */
michael@0 166 void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
michael@0 167 int32_t list_length = 0;
michael@0 168 UChar32 cp = 0;
michael@0 169 int32_t start = 0;
michael@0 170 int32_t i = 0;
michael@0 171 UnicodeString *list = NULL;
michael@0 172
michael@0 173 nfd.normalize(newSource, source, status);
michael@0 174 if(U_FAILURE(status)) {
michael@0 175 return;
michael@0 176 }
michael@0 177 done = FALSE;
michael@0 178
michael@0 179 cleanPieces();
michael@0 180
michael@0 181 // catch degenerate case
michael@0 182 if (newSource.length() == 0) {
michael@0 183 pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
michael@0 184 pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
michael@0 185 pieces_length = 1;
michael@0 186 current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
michael@0 187 current_length = 1;
michael@0 188 if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
michael@0 189 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 190 goto CleanPartialInitialization;
michael@0 191 }
michael@0 192 current[0] = 0;
michael@0 193 pieces[0] = new UnicodeString[1];
michael@0 194 pieces_lengths[0] = 1;
michael@0 195 if (pieces[0] == 0) {
michael@0 196 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 197 goto CleanPartialInitialization;
michael@0 198 }
michael@0 199 return;
michael@0 200 }
michael@0 201
michael@0 202
michael@0 203 list = new UnicodeString[source.length()];
michael@0 204 if (list == 0) {
michael@0 205 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 206 goto CleanPartialInitialization;
michael@0 207 }
michael@0 208
michael@0 209 // i should initialy be the number of code units at the
michael@0 210 // start of the string
michael@0 211 i = U16_LENGTH(source.char32At(0));
michael@0 212 //int32_t i = 1;
michael@0 213 // find the segments
michael@0 214 // This code iterates through the source string and
michael@0 215 // extracts segments that end up on a codepoint that
michael@0 216 // doesn't start any decompositions. (Analysis is done
michael@0 217 // on the NFD form - see above).
michael@0 218 for (; i < source.length(); i += U16_LENGTH(cp)) {
michael@0 219 cp = source.char32At(i);
michael@0 220 if (nfcImpl.isCanonSegmentStarter(cp)) {
michael@0 221 source.extract(start, i-start, list[list_length++]); // add up to i
michael@0 222 start = i;
michael@0 223 }
michael@0 224 }
michael@0 225 source.extract(start, i-start, list[list_length++]); // add last one
michael@0 226
michael@0 227
michael@0 228 // allocate the arrays, and find the strings that are CE to each segment
michael@0 229 pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
michael@0 230 pieces_length = list_length;
michael@0 231 pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
michael@0 232 current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
michael@0 233 current_length = list_length;
michael@0 234 if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
michael@0 235 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 236 goto CleanPartialInitialization;
michael@0 237 }
michael@0 238
michael@0 239 for (i = 0; i < current_length; i++) {
michael@0 240 current[i] = 0;
michael@0 241 }
michael@0 242 // for each segment, get all the combinations that can produce
michael@0 243 // it after NFD normalization
michael@0 244 for (i = 0; i < pieces_length; ++i) {
michael@0 245 //if (PROGRESS) printf("SEGMENT\n");
michael@0 246 pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
michael@0 247 }
michael@0 248
michael@0 249 delete[] list;
michael@0 250 return;
michael@0 251 // Common section to cleanup all local variables and reset object variables.
michael@0 252 CleanPartialInitialization:
michael@0 253 if (list != NULL) {
michael@0 254 delete[] list;
michael@0 255 }
michael@0 256 cleanPieces();
michael@0 257 }
michael@0 258
michael@0 259 /**
michael@0 260 * Dumb recursive implementation of permutation.
michael@0 261 * TODO: optimize
michael@0 262 * @param source the string to find permutations for
michael@0 263 * @return the results in a set.
michael@0 264 */
michael@0 265 void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
michael@0 266 if(U_FAILURE(status)) {
michael@0 267 return;
michael@0 268 }
michael@0 269 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
michael@0 270 int32_t i = 0;
michael@0 271
michael@0 272 // optimization:
michael@0 273 // if zero or one character, just return a set with it
michael@0 274 // we check for length < 2 to keep from counting code points all the time
michael@0 275 if (source.length() <= 2 && source.countChar32() <= 1) {
michael@0 276 UnicodeString *toPut = new UnicodeString(source);
michael@0 277 /* test for NULL */
michael@0 278 if (toPut == 0) {
michael@0 279 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 280 return;
michael@0 281 }
michael@0 282 result->put(source, toPut, status);
michael@0 283 return;
michael@0 284 }
michael@0 285
michael@0 286 // otherwise iterate through the string, and recursively permute all the other characters
michael@0 287 UChar32 cp;
michael@0 288 Hashtable subpermute(status);
michael@0 289 if(U_FAILURE(status)) {
michael@0 290 return;
michael@0 291 }
michael@0 292 subpermute.setValueDeleter(uprv_deleteUObject);
michael@0 293
michael@0 294 for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {
michael@0 295 cp = source.char32At(i);
michael@0 296 const UHashElement *ne = NULL;
michael@0 297 int32_t el = -1;
michael@0 298 UnicodeString subPermuteString = source;
michael@0 299
michael@0 300 // optimization:
michael@0 301 // if the character is canonical combining class zero,
michael@0 302 // don't permute it
michael@0 303 if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
michael@0 304 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
michael@0 305 continue;
michael@0 306 }
michael@0 307
michael@0 308 subpermute.removeAll();
michael@0 309
michael@0 310 // see what the permutations of the characters before and after this one are
michael@0 311 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
michael@0 312 permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
michael@0 313 /* Test for buffer overflows */
michael@0 314 if(U_FAILURE(status)) {
michael@0 315 return;
michael@0 316 }
michael@0 317 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents
michael@0 318 // of source at this point.
michael@0 319
michael@0 320 // prefix this character to all of them
michael@0 321 ne = subpermute.nextElement(el);
michael@0 322 while (ne != NULL) {
michael@0 323 UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
michael@0 324 UnicodeString *chStr = new UnicodeString(cp);
michael@0 325 //test for NULL
michael@0 326 if (chStr == NULL) {
michael@0 327 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 328 return;
michael@0 329 }
michael@0 330 chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
michael@0 331 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
michael@0 332 result->put(*chStr, chStr, status);
michael@0 333 ne = subpermute.nextElement(el);
michael@0 334 }
michael@0 335 }
michael@0 336 //return result;
michael@0 337 }
michael@0 338
michael@0 339 // privates
michael@0 340
michael@0 341 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
michael@0 342 UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
michael@0 343 Hashtable result(status);
michael@0 344 Hashtable permutations(status);
michael@0 345 Hashtable basic(status);
michael@0 346 if (U_FAILURE(status)) {
michael@0 347 return 0;
michael@0 348 }
michael@0 349 result.setValueDeleter(uprv_deleteUObject);
michael@0 350 permutations.setValueDeleter(uprv_deleteUObject);
michael@0 351 basic.setValueDeleter(uprv_deleteUObject);
michael@0 352
michael@0 353 UChar USeg[256];
michael@0 354 int32_t segLen = segment.extract(USeg, 256, status);
michael@0 355 getEquivalents2(&basic, USeg, segLen, status);
michael@0 356
michael@0 357 // now get all the permutations
michael@0 358 // add only the ones that are canonically equivalent
michael@0 359 // TODO: optimize by not permuting any class zero.
michael@0 360
michael@0 361 const UHashElement *ne = NULL;
michael@0 362 int32_t el = -1;
michael@0 363 //Iterator it = basic.iterator();
michael@0 364 ne = basic.nextElement(el);
michael@0 365 //while (it.hasNext())
michael@0 366 while (ne != NULL) {
michael@0 367 //String item = (String) it.next();
michael@0 368 UnicodeString item = *((UnicodeString *)(ne->value.pointer));
michael@0 369
michael@0 370 permutations.removeAll();
michael@0 371 permute(item, CANITER_SKIP_ZEROES, &permutations, status);
michael@0 372 const UHashElement *ne2 = NULL;
michael@0 373 int32_t el2 = -1;
michael@0 374 //Iterator it2 = permutations.iterator();
michael@0 375 ne2 = permutations.nextElement(el2);
michael@0 376 //while (it2.hasNext())
michael@0 377 while (ne2 != NULL) {
michael@0 378 //String possible = (String) it2.next();
michael@0 379 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
michael@0 380 UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
michael@0 381 UnicodeString attempt;
michael@0 382 nfd.normalize(possible, attempt, status);
michael@0 383
michael@0 384 // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
michael@0 385 if (attempt==segment) {
michael@0 386 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
michael@0 387 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
michael@0 388 result.put(possible, new UnicodeString(possible), status); //add(possible);
michael@0 389 } else {
michael@0 390 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
michael@0 391 }
michael@0 392
michael@0 393 ne2 = permutations.nextElement(el2);
michael@0 394 }
michael@0 395 ne = basic.nextElement(el);
michael@0 396 }
michael@0 397
michael@0 398 /* Test for buffer overflows */
michael@0 399 if(U_FAILURE(status)) {
michael@0 400 return 0;
michael@0 401 }
michael@0 402 // convert into a String[] to clean up storage
michael@0 403 //String[] finalResult = new String[result.size()];
michael@0 404 UnicodeString *finalResult = NULL;
michael@0 405 int32_t resultCount;
michael@0 406 if((resultCount = result.count())) {
michael@0 407 finalResult = new UnicodeString[resultCount];
michael@0 408 if (finalResult == 0) {
michael@0 409 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 410 return NULL;
michael@0 411 }
michael@0 412 }
michael@0 413 else {
michael@0 414 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 415 return NULL;
michael@0 416 }
michael@0 417 //result.toArray(finalResult);
michael@0 418 result_len = 0;
michael@0 419 el = -1;
michael@0 420 ne = result.nextElement(el);
michael@0 421 while(ne != NULL) {
michael@0 422 finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));
michael@0 423 ne = result.nextElement(el);
michael@0 424 }
michael@0 425
michael@0 426
michael@0 427 return finalResult;
michael@0 428 }
michael@0 429
michael@0 430 Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {
michael@0 431
michael@0 432 if (U_FAILURE(status)) {
michael@0 433 return NULL;
michael@0 434 }
michael@0 435
michael@0 436 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
michael@0 437
michael@0 438 UnicodeString toPut(segment, segLen);
michael@0 439
michael@0 440 fillinResult->put(toPut, new UnicodeString(toPut), status);
michael@0 441
michael@0 442 UnicodeSet starts;
michael@0 443
michael@0 444 // cycle through all the characters
michael@0 445 UChar32 cp;
michael@0 446 for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
michael@0 447 // see if any character is at the start of some decomposition
michael@0 448 U16_GET(segment, 0, i, segLen, cp);
michael@0 449 if (!nfcImpl.getCanonStartSet(cp, starts)) {
michael@0 450 continue;
michael@0 451 }
michael@0 452 // if so, see which decompositions match
michael@0 453 UnicodeSetIterator iter(starts);
michael@0 454 while (iter.next()) {
michael@0 455 UChar32 cp2 = iter.getCodepoint();
michael@0 456 Hashtable remainder(status);
michael@0 457 remainder.setValueDeleter(uprv_deleteUObject);
michael@0 458 if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {
michael@0 459 continue;
michael@0 460 }
michael@0 461
michael@0 462 // there were some matches, so add all the possibilities to the set.
michael@0 463 UnicodeString prefix(segment, i);
michael@0 464 prefix += cp2;
michael@0 465
michael@0 466 int32_t el = -1;
michael@0 467 const UHashElement *ne = remainder.nextElement(el);
michael@0 468 while (ne != NULL) {
michael@0 469 UnicodeString item = *((UnicodeString *)(ne->value.pointer));
michael@0 470 UnicodeString *toAdd = new UnicodeString(prefix);
michael@0 471 /* test for NULL */
michael@0 472 if (toAdd == 0) {
michael@0 473 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 474 return NULL;
michael@0 475 }
michael@0 476 *toAdd += item;
michael@0 477 fillinResult->put(*toAdd, toAdd, status);
michael@0 478
michael@0 479 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
michael@0 480
michael@0 481 ne = remainder.nextElement(el);
michael@0 482 }
michael@0 483 }
michael@0 484 }
michael@0 485
michael@0 486 /* Test for buffer overflows */
michael@0 487 if(U_FAILURE(status)) {
michael@0 488 return NULL;
michael@0 489 }
michael@0 490 return fillinResult;
michael@0 491 }
michael@0 492
michael@0 493 /**
michael@0 494 * See if the decomposition of cp2 is at segment starting at segmentPos
michael@0 495 * (with canonical rearrangment!)
michael@0 496 * If so, take the remainder, and return the equivalents
michael@0 497 */
michael@0 498 Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
michael@0 499 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
michael@0 500 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
michael@0 501 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
michael@0 502
michael@0 503 if (U_FAILURE(status)) {
michael@0 504 return NULL;
michael@0 505 }
michael@0 506
michael@0 507 UnicodeString temp(comp);
michael@0 508 int32_t inputLen=temp.length();
michael@0 509 UnicodeString decompString;
michael@0 510 nfd.normalize(temp, decompString, status);
michael@0 511 const UChar *decomp=decompString.getBuffer();
michael@0 512 int32_t decompLen=decompString.length();
michael@0 513
michael@0 514 // See if it matches the start of segment (at segmentPos)
michael@0 515 UBool ok = FALSE;
michael@0 516 UChar32 cp;
michael@0 517 int32_t decompPos = 0;
michael@0 518 UChar32 decompCp;
michael@0 519 U16_NEXT(decomp, decompPos, decompLen, decompCp);
michael@0 520
michael@0 521 int32_t i = segmentPos;
michael@0 522 while(i < segLen) {
michael@0 523 U16_NEXT(segment, i, segLen, cp);
michael@0 524
michael@0 525 if (cp == decompCp) { // if equal, eat another cp from decomp
michael@0 526
michael@0 527 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
michael@0 528
michael@0 529 if (decompPos == decompLen) { // done, have all decomp characters!
michael@0 530 temp.append(segment+i, segLen-i);
michael@0 531 ok = TRUE;
michael@0 532 break;
michael@0 533 }
michael@0 534 U16_NEXT(decomp, decompPos, decompLen, decompCp);
michael@0 535 } else {
michael@0 536 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
michael@0 537
michael@0 538 // brute force approach
michael@0 539 temp.append(cp);
michael@0 540
michael@0 541 /* TODO: optimize
michael@0 542 // since we know that the classes are monotonically increasing, after zero
michael@0 543 // e.g. 0 5 7 9 0 3
michael@0 544 // we can do an optimization
michael@0 545 // there are only a few cases that work: zero, less, same, greater
michael@0 546 // if both classes are the same, we fail
michael@0 547 // if the decomp class < the segment class, we fail
michael@0 548
michael@0 549 segClass = getClass(cp);
michael@0 550 if (decompClass <= segClass) return null;
michael@0 551 */
michael@0 552 }
michael@0 553 }
michael@0 554 if (!ok)
michael@0 555 return NULL; // we failed, characters left over
michael@0 556
michael@0 557 //if (PROGRESS) printf("Matches\n");
michael@0 558
michael@0 559 if (inputLen == temp.length()) {
michael@0 560 fillinResult->put(UnicodeString(), new UnicodeString(), status);
michael@0 561 return fillinResult; // succeed, but no remainder
michael@0 562 }
michael@0 563
michael@0 564 // brute force approach
michael@0 565 // check to make sure result is canonically equivalent
michael@0 566 UnicodeString trial;
michael@0 567 nfd.normalize(temp, trial, status);
michael@0 568 if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
michael@0 569 return NULL;
michael@0 570 }
michael@0 571
michael@0 572 return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
michael@0 573 }
michael@0 574
michael@0 575 U_NAMESPACE_END
michael@0 576
michael@0 577 #endif /* #if !UCONFIG_NO_NORMALIZATION */

mercurial