1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/caniter.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,577 @@ 1.4 +/* 1.5 + ***************************************************************************** 1.6 + * Copyright (C) 1996-2011, International Business Machines Corporation and * 1.7 + * others. All Rights Reserved. * 1.8 + ***************************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_NORMALIZATION 1.14 + 1.15 +#include "unicode/caniter.h" 1.16 +#include "unicode/normalizer2.h" 1.17 +#include "unicode/uchar.h" 1.18 +#include "unicode/uniset.h" 1.19 +#include "unicode/usetiter.h" 1.20 +#include "unicode/ustring.h" 1.21 +#include "unicode/utf16.h" 1.22 +#include "cmemory.h" 1.23 +#include "hash.h" 1.24 +#include "normalizer2impl.h" 1.25 + 1.26 +/** 1.27 + * This class allows one to iterate through all the strings that are canonically equivalent to a given 1.28 + * string. For example, here are some sample results: 1.29 +Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.30 +1: \u0041\u030A\u0064\u0307\u0327 1.31 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.32 +2: \u0041\u030A\u0064\u0327\u0307 1.33 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.34 +3: \u0041\u030A\u1E0B\u0327 1.35 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.36 +4: \u0041\u030A\u1E11\u0307 1.37 + = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.38 +5: \u00C5\u0064\u0307\u0327 1.39 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.40 +6: \u00C5\u0064\u0327\u0307 1.41 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.42 +7: \u00C5\u1E0B\u0327 1.43 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.44 +8: \u00C5\u1E11\u0307 1.45 + = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.46 +9: \u212B\u0064\u0307\u0327 1.47 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 1.48 +10: \u212B\u0064\u0327\u0307 1.49 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 1.50 +11: \u212B\u1E0B\u0327 1.51 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 1.52 +12: \u212B\u1E11\u0307 1.53 + = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 1.54 + *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 1.55 + * since it has not been optimized for that situation. 1.56 + *@author M. Davis 1.57 + *@draft 1.58 + */ 1.59 + 1.60 +// public 1.61 + 1.62 +U_NAMESPACE_BEGIN 1.63 + 1.64 +// TODO: add boilerplate methods. 1.65 + 1.66 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 1.67 + 1.68 +/** 1.69 + *@param source string to get results for 1.70 + */ 1.71 +CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 1.72 + pieces(NULL), 1.73 + pieces_length(0), 1.74 + pieces_lengths(NULL), 1.75 + current(NULL), 1.76 + current_length(0), 1.77 + nfd(*Normalizer2Factory::getNFDInstance(status)), 1.78 + nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 1.79 +{ 1.80 + if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 1.81 + setSource(sourceStr, status); 1.82 + } 1.83 +} 1.84 + 1.85 +CanonicalIterator::~CanonicalIterator() { 1.86 + cleanPieces(); 1.87 +} 1.88 + 1.89 +void CanonicalIterator::cleanPieces() { 1.90 + int32_t i = 0; 1.91 + if(pieces != NULL) { 1.92 + for(i = 0; i < pieces_length; i++) { 1.93 + if(pieces[i] != NULL) { 1.94 + delete[] pieces[i]; 1.95 + } 1.96 + } 1.97 + uprv_free(pieces); 1.98 + pieces = NULL; 1.99 + pieces_length = 0; 1.100 + } 1.101 + if(pieces_lengths != NULL) { 1.102 + uprv_free(pieces_lengths); 1.103 + pieces_lengths = NULL; 1.104 + } 1.105 + if(current != NULL) { 1.106 + uprv_free(current); 1.107 + current = NULL; 1.108 + current_length = 0; 1.109 + } 1.110 +} 1.111 + 1.112 +/** 1.113 + *@return gets the source: NOTE: it is the NFD form of source 1.114 + */ 1.115 +UnicodeString CanonicalIterator::getSource() { 1.116 + return source; 1.117 +} 1.118 + 1.119 +/** 1.120 + * Resets the iterator so that one can start again from the beginning. 1.121 + */ 1.122 +void CanonicalIterator::reset() { 1.123 + done = FALSE; 1.124 + for (int i = 0; i < current_length; ++i) { 1.125 + current[i] = 0; 1.126 + } 1.127 +} 1.128 + 1.129 +/** 1.130 + *@return the next string that is canonically equivalent. The value null is returned when 1.131 + * the iteration is done. 1.132 + */ 1.133 +UnicodeString CanonicalIterator::next() { 1.134 + int32_t i = 0; 1.135 + 1.136 + if (done) { 1.137 + buffer.setToBogus(); 1.138 + return buffer; 1.139 + } 1.140 + 1.141 + // delete old contents 1.142 + buffer.remove(); 1.143 + 1.144 + // construct return value 1.145 + 1.146 + for (i = 0; i < pieces_length; ++i) { 1.147 + buffer.append(pieces[i][current[i]]); 1.148 + } 1.149 + //String result = buffer.toString(); // not needed 1.150 + 1.151 + // find next value for next time 1.152 + 1.153 + for (i = current_length - 1; ; --i) { 1.154 + if (i < 0) { 1.155 + done = TRUE; 1.156 + break; 1.157 + } 1.158 + current[i]++; 1.159 + if (current[i] < pieces_lengths[i]) break; // got sequence 1.160 + current[i] = 0; 1.161 + } 1.162 + return buffer; 1.163 +} 1.164 + 1.165 +/** 1.166 + *@param set the source string to iterate against. This allows the same iterator to be used 1.167 + * while changing the source string, saving object creation. 1.168 + */ 1.169 +void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 1.170 + int32_t list_length = 0; 1.171 + UChar32 cp = 0; 1.172 + int32_t start = 0; 1.173 + int32_t i = 0; 1.174 + UnicodeString *list = NULL; 1.175 + 1.176 + nfd.normalize(newSource, source, status); 1.177 + if(U_FAILURE(status)) { 1.178 + return; 1.179 + } 1.180 + done = FALSE; 1.181 + 1.182 + cleanPieces(); 1.183 + 1.184 + // catch degenerate case 1.185 + if (newSource.length() == 0) { 1.186 + pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 1.187 + pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 1.188 + pieces_length = 1; 1.189 + current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 1.190 + current_length = 1; 1.191 + if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 1.192 + status = U_MEMORY_ALLOCATION_ERROR; 1.193 + goto CleanPartialInitialization; 1.194 + } 1.195 + current[0] = 0; 1.196 + pieces[0] = new UnicodeString[1]; 1.197 + pieces_lengths[0] = 1; 1.198 + if (pieces[0] == 0) { 1.199 + status = U_MEMORY_ALLOCATION_ERROR; 1.200 + goto CleanPartialInitialization; 1.201 + } 1.202 + return; 1.203 + } 1.204 + 1.205 + 1.206 + list = new UnicodeString[source.length()]; 1.207 + if (list == 0) { 1.208 + status = U_MEMORY_ALLOCATION_ERROR; 1.209 + goto CleanPartialInitialization; 1.210 + } 1.211 + 1.212 + // i should initialy be the number of code units at the 1.213 + // start of the string 1.214 + i = U16_LENGTH(source.char32At(0)); 1.215 + //int32_t i = 1; 1.216 + // find the segments 1.217 + // This code iterates through the source string and 1.218 + // extracts segments that end up on a codepoint that 1.219 + // doesn't start any decompositions. (Analysis is done 1.220 + // on the NFD form - see above). 1.221 + for (; i < source.length(); i += U16_LENGTH(cp)) { 1.222 + cp = source.char32At(i); 1.223 + if (nfcImpl.isCanonSegmentStarter(cp)) { 1.224 + source.extract(start, i-start, list[list_length++]); // add up to i 1.225 + start = i; 1.226 + } 1.227 + } 1.228 + source.extract(start, i-start, list[list_length++]); // add last one 1.229 + 1.230 + 1.231 + // allocate the arrays, and find the strings that are CE to each segment 1.232 + pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 1.233 + pieces_length = list_length; 1.234 + pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 1.235 + current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 1.236 + current_length = list_length; 1.237 + if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 1.238 + status = U_MEMORY_ALLOCATION_ERROR; 1.239 + goto CleanPartialInitialization; 1.240 + } 1.241 + 1.242 + for (i = 0; i < current_length; i++) { 1.243 + current[i] = 0; 1.244 + } 1.245 + // for each segment, get all the combinations that can produce 1.246 + // it after NFD normalization 1.247 + for (i = 0; i < pieces_length; ++i) { 1.248 + //if (PROGRESS) printf("SEGMENT\n"); 1.249 + pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 1.250 + } 1.251 + 1.252 + delete[] list; 1.253 + return; 1.254 +// Common section to cleanup all local variables and reset object variables. 1.255 +CleanPartialInitialization: 1.256 + if (list != NULL) { 1.257 + delete[] list; 1.258 + } 1.259 + cleanPieces(); 1.260 +} 1.261 + 1.262 +/** 1.263 + * Dumb recursive implementation of permutation. 1.264 + * TODO: optimize 1.265 + * @param source the string to find permutations for 1.266 + * @return the results in a set. 1.267 + */ 1.268 +void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 1.269 + if(U_FAILURE(status)) { 1.270 + return; 1.271 + } 1.272 + //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 1.273 + int32_t i = 0; 1.274 + 1.275 + // optimization: 1.276 + // if zero or one character, just return a set with it 1.277 + // we check for length < 2 to keep from counting code points all the time 1.278 + if (source.length() <= 2 && source.countChar32() <= 1) { 1.279 + UnicodeString *toPut = new UnicodeString(source); 1.280 + /* test for NULL */ 1.281 + if (toPut == 0) { 1.282 + status = U_MEMORY_ALLOCATION_ERROR; 1.283 + return; 1.284 + } 1.285 + result->put(source, toPut, status); 1.286 + return; 1.287 + } 1.288 + 1.289 + // otherwise iterate through the string, and recursively permute all the other characters 1.290 + UChar32 cp; 1.291 + Hashtable subpermute(status); 1.292 + if(U_FAILURE(status)) { 1.293 + return; 1.294 + } 1.295 + subpermute.setValueDeleter(uprv_deleteUObject); 1.296 + 1.297 + for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { 1.298 + cp = source.char32At(i); 1.299 + const UHashElement *ne = NULL; 1.300 + int32_t el = -1; 1.301 + UnicodeString subPermuteString = source; 1.302 + 1.303 + // optimization: 1.304 + // if the character is canonical combining class zero, 1.305 + // don't permute it 1.306 + if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 1.307 + //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 1.308 + continue; 1.309 + } 1.310 + 1.311 + subpermute.removeAll(); 1.312 + 1.313 + // see what the permutations of the characters before and after this one are 1.314 + //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 1.315 + permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 1.316 + /* Test for buffer overflows */ 1.317 + if(U_FAILURE(status)) { 1.318 + return; 1.319 + } 1.320 + // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 1.321 + // of source at this point. 1.322 + 1.323 + // prefix this character to all of them 1.324 + ne = subpermute.nextElement(el); 1.325 + while (ne != NULL) { 1.326 + UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 1.327 + UnicodeString *chStr = new UnicodeString(cp); 1.328 + //test for NULL 1.329 + if (chStr == NULL) { 1.330 + status = U_MEMORY_ALLOCATION_ERROR; 1.331 + return; 1.332 + } 1.333 + chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 1.334 + //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 1.335 + result->put(*chStr, chStr, status); 1.336 + ne = subpermute.nextElement(el); 1.337 + } 1.338 + } 1.339 + //return result; 1.340 +} 1.341 + 1.342 +// privates 1.343 + 1.344 +// we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 1.345 +UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 1.346 + Hashtable result(status); 1.347 + Hashtable permutations(status); 1.348 + Hashtable basic(status); 1.349 + if (U_FAILURE(status)) { 1.350 + return 0; 1.351 + } 1.352 + result.setValueDeleter(uprv_deleteUObject); 1.353 + permutations.setValueDeleter(uprv_deleteUObject); 1.354 + basic.setValueDeleter(uprv_deleteUObject); 1.355 + 1.356 + UChar USeg[256]; 1.357 + int32_t segLen = segment.extract(USeg, 256, status); 1.358 + getEquivalents2(&basic, USeg, segLen, status); 1.359 + 1.360 + // now get all the permutations 1.361 + // add only the ones that are canonically equivalent 1.362 + // TODO: optimize by not permuting any class zero. 1.363 + 1.364 + const UHashElement *ne = NULL; 1.365 + int32_t el = -1; 1.366 + //Iterator it = basic.iterator(); 1.367 + ne = basic.nextElement(el); 1.368 + //while (it.hasNext()) 1.369 + while (ne != NULL) { 1.370 + //String item = (String) it.next(); 1.371 + UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 1.372 + 1.373 + permutations.removeAll(); 1.374 + permute(item, CANITER_SKIP_ZEROES, &permutations, status); 1.375 + const UHashElement *ne2 = NULL; 1.376 + int32_t el2 = -1; 1.377 + //Iterator it2 = permutations.iterator(); 1.378 + ne2 = permutations.nextElement(el2); 1.379 + //while (it2.hasNext()) 1.380 + while (ne2 != NULL) { 1.381 + //String possible = (String) it2.next(); 1.382 + //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 1.383 + UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 1.384 + UnicodeString attempt; 1.385 + nfd.normalize(possible, attempt, status); 1.386 + 1.387 + // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 1.388 + if (attempt==segment) { 1.389 + //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 1.390 + // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 1.391 + result.put(possible, new UnicodeString(possible), status); //add(possible); 1.392 + } else { 1.393 + //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 1.394 + } 1.395 + 1.396 + ne2 = permutations.nextElement(el2); 1.397 + } 1.398 + ne = basic.nextElement(el); 1.399 + } 1.400 + 1.401 + /* Test for buffer overflows */ 1.402 + if(U_FAILURE(status)) { 1.403 + return 0; 1.404 + } 1.405 + // convert into a String[] to clean up storage 1.406 + //String[] finalResult = new String[result.size()]; 1.407 + UnicodeString *finalResult = NULL; 1.408 + int32_t resultCount; 1.409 + if((resultCount = result.count())) { 1.410 + finalResult = new UnicodeString[resultCount]; 1.411 + if (finalResult == 0) { 1.412 + status = U_MEMORY_ALLOCATION_ERROR; 1.413 + return NULL; 1.414 + } 1.415 + } 1.416 + else { 1.417 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.418 + return NULL; 1.419 + } 1.420 + //result.toArray(finalResult); 1.421 + result_len = 0; 1.422 + el = -1; 1.423 + ne = result.nextElement(el); 1.424 + while(ne != NULL) { 1.425 + finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 1.426 + ne = result.nextElement(el); 1.427 + } 1.428 + 1.429 + 1.430 + return finalResult; 1.431 +} 1.432 + 1.433 +Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 1.434 + 1.435 + if (U_FAILURE(status)) { 1.436 + return NULL; 1.437 + } 1.438 + 1.439 + //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 1.440 + 1.441 + UnicodeString toPut(segment, segLen); 1.442 + 1.443 + fillinResult->put(toPut, new UnicodeString(toPut), status); 1.444 + 1.445 + UnicodeSet starts; 1.446 + 1.447 + // cycle through all the characters 1.448 + UChar32 cp; 1.449 + for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { 1.450 + // see if any character is at the start of some decomposition 1.451 + U16_GET(segment, 0, i, segLen, cp); 1.452 + if (!nfcImpl.getCanonStartSet(cp, starts)) { 1.453 + continue; 1.454 + } 1.455 + // if so, see which decompositions match 1.456 + UnicodeSetIterator iter(starts); 1.457 + while (iter.next()) { 1.458 + UChar32 cp2 = iter.getCodepoint(); 1.459 + Hashtable remainder(status); 1.460 + remainder.setValueDeleter(uprv_deleteUObject); 1.461 + if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 1.462 + continue; 1.463 + } 1.464 + 1.465 + // there were some matches, so add all the possibilities to the set. 1.466 + UnicodeString prefix(segment, i); 1.467 + prefix += cp2; 1.468 + 1.469 + int32_t el = -1; 1.470 + const UHashElement *ne = remainder.nextElement(el); 1.471 + while (ne != NULL) { 1.472 + UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 1.473 + UnicodeString *toAdd = new UnicodeString(prefix); 1.474 + /* test for NULL */ 1.475 + if (toAdd == 0) { 1.476 + status = U_MEMORY_ALLOCATION_ERROR; 1.477 + return NULL; 1.478 + } 1.479 + *toAdd += item; 1.480 + fillinResult->put(*toAdd, toAdd, status); 1.481 + 1.482 + //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 1.483 + 1.484 + ne = remainder.nextElement(el); 1.485 + } 1.486 + } 1.487 + } 1.488 + 1.489 + /* Test for buffer overflows */ 1.490 + if(U_FAILURE(status)) { 1.491 + return NULL; 1.492 + } 1.493 + return fillinResult; 1.494 +} 1.495 + 1.496 +/** 1.497 + * See if the decomposition of cp2 is at segment starting at segmentPos 1.498 + * (with canonical rearrangment!) 1.499 + * If so, take the remainder, and return the equivalents 1.500 + */ 1.501 +Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 1.502 +//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 1.503 + //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 1.504 + //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 1.505 + 1.506 + if (U_FAILURE(status)) { 1.507 + return NULL; 1.508 + } 1.509 + 1.510 + UnicodeString temp(comp); 1.511 + int32_t inputLen=temp.length(); 1.512 + UnicodeString decompString; 1.513 + nfd.normalize(temp, decompString, status); 1.514 + const UChar *decomp=decompString.getBuffer(); 1.515 + int32_t decompLen=decompString.length(); 1.516 + 1.517 + // See if it matches the start of segment (at segmentPos) 1.518 + UBool ok = FALSE; 1.519 + UChar32 cp; 1.520 + int32_t decompPos = 0; 1.521 + UChar32 decompCp; 1.522 + U16_NEXT(decomp, decompPos, decompLen, decompCp); 1.523 + 1.524 + int32_t i = segmentPos; 1.525 + while(i < segLen) { 1.526 + U16_NEXT(segment, i, segLen, cp); 1.527 + 1.528 + if (cp == decompCp) { // if equal, eat another cp from decomp 1.529 + 1.530 + //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 1.531 + 1.532 + if (decompPos == decompLen) { // done, have all decomp characters! 1.533 + temp.append(segment+i, segLen-i); 1.534 + ok = TRUE; 1.535 + break; 1.536 + } 1.537 + U16_NEXT(decomp, decompPos, decompLen, decompCp); 1.538 + } else { 1.539 + //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 1.540 + 1.541 + // brute force approach 1.542 + temp.append(cp); 1.543 + 1.544 + /* TODO: optimize 1.545 + // since we know that the classes are monotonically increasing, after zero 1.546 + // e.g. 0 5 7 9 0 3 1.547 + // we can do an optimization 1.548 + // there are only a few cases that work: zero, less, same, greater 1.549 + // if both classes are the same, we fail 1.550 + // if the decomp class < the segment class, we fail 1.551 + 1.552 + segClass = getClass(cp); 1.553 + if (decompClass <= segClass) return null; 1.554 + */ 1.555 + } 1.556 + } 1.557 + if (!ok) 1.558 + return NULL; // we failed, characters left over 1.559 + 1.560 + //if (PROGRESS) printf("Matches\n"); 1.561 + 1.562 + if (inputLen == temp.length()) { 1.563 + fillinResult->put(UnicodeString(), new UnicodeString(), status); 1.564 + return fillinResult; // succeed, but no remainder 1.565 + } 1.566 + 1.567 + // brute force approach 1.568 + // check to make sure result is canonically equivalent 1.569 + UnicodeString trial; 1.570 + nfd.normalize(temp, trial, status); 1.571 + if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 1.572 + return NULL; 1.573 + } 1.574 + 1.575 + return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 1.576 +} 1.577 + 1.578 +U_NAMESPACE_END 1.579 + 1.580 +#endif /* #if !UCONFIG_NO_NORMALIZATION */