1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/tridpars.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,937 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (c) 2002-2012, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 01/14/2002 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "tridpars.h" 1.19 +#include "hash.h" 1.20 +#include "mutex.h" 1.21 +#include "ucln_in.h" 1.22 +#include "unicode/parsepos.h" 1.23 +#include "unicode/translit.h" 1.24 +#include "unicode/uchar.h" 1.25 +#include "unicode/uniset.h" 1.26 +#include "unicode/unistr.h" 1.27 +#include "unicode/utrans.h" 1.28 +#include "util.h" 1.29 +#include "uvector.h" 1.30 + 1.31 +U_NAMESPACE_BEGIN 1.32 + 1.33 +static const UChar ID_DELIM = 0x003B; // ; 1.34 +static const UChar TARGET_SEP = 0x002D; // - 1.35 +static const UChar VARIANT_SEP = 0x002F; // / 1.36 +static const UChar OPEN_REV = 0x0028; // ( 1.37 +static const UChar CLOSE_REV = 0x0029; // ) 1.38 + 1.39 +//static const UChar EMPTY[] = {0}; // "" 1.40 +static const UChar ANY[] = {65,110,121,0}; // "Any" 1.41 +static const UChar ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" 1.42 + 1.43 +static const int32_t FORWARD = UTRANS_FORWARD; 1.44 +static const int32_t REVERSE = UTRANS_REVERSE; 1.45 + 1.46 +static Hashtable* SPECIAL_INVERSES = NULL; 1.47 + 1.48 +/** 1.49 + * The mutex controlling access to SPECIAL_INVERSES 1.50 + */ 1.51 +static UMutex LOCK = U_MUTEX_INITIALIZER; 1.52 + 1.53 +TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, 1.54 + const UnicodeString& v, UBool sawS, 1.55 + const UnicodeString& f) { 1.56 + source = s; 1.57 + target = t; 1.58 + variant = v; 1.59 + sawSource = sawS; 1.60 + filter = f; 1.61 +} 1.62 + 1.63 +TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, 1.64 + const UnicodeString& f) { 1.65 + canonID = c; 1.66 + basicID = b; 1.67 + filter = f; 1.68 +} 1.69 + 1.70 +TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { 1.71 + canonID = c; 1.72 + basicID = b; 1.73 +} 1.74 + 1.75 +Transliterator* TransliteratorIDParser::SingleID::createInstance() { 1.76 + Transliterator* t; 1.77 + if (basicID.length() == 0) { 1.78 + t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID); 1.79 + } else { 1.80 + t = createBasicInstance(basicID, &canonID); 1.81 + } 1.82 + if (t != NULL) { 1.83 + if (filter.length() != 0) { 1.84 + UErrorCode ec = U_ZERO_ERROR; 1.85 + UnicodeSet *set = new UnicodeSet(filter, ec); 1.86 + if (U_FAILURE(ec)) { 1.87 + delete set; 1.88 + } else { 1.89 + t->adoptFilter(set); 1.90 + } 1.91 + } 1.92 + } 1.93 + return t; 1.94 +} 1.95 + 1.96 + 1.97 +/** 1.98 + * Parse a single ID, that is, an ID of the general form 1.99 + * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 1.100 + * optional, the filters optional, and the variants optional. 1.101 + * @param id the id to be parsed 1.102 + * @param pos INPUT-OUTPUT parameter. On input, the position of 1.103 + * the first character to parse. On output, the position after 1.104 + * the last character parsed. 1.105 + * @param dir the direction. If the direction is REVERSE then the 1.106 + * SingleID is constructed for the reverse direction. 1.107 + * @return a SingleID object or NULL 1.108 + */ 1.109 +TransliteratorIDParser::SingleID* 1.110 +TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, 1.111 + int32_t dir, UErrorCode& status) { 1.112 + 1.113 + int32_t start = pos; 1.114 + 1.115 + // The ID will be of the form A, A(), A(B), or (B), where 1.116 + // A and B are filter IDs. 1.117 + Specs* specsA = NULL; 1.118 + Specs* specsB = NULL; 1.119 + UBool sawParen = FALSE; 1.120 + 1.121 + // On the first pass, look for (B) or (). If this fails, then 1.122 + // on the second pass, look for A, A(B), or A(). 1.123 + for (int32_t pass=1; pass<=2; ++pass) { 1.124 + if (pass == 2) { 1.125 + specsA = parseFilterID(id, pos, TRUE); 1.126 + if (specsA == NULL) { 1.127 + pos = start; 1.128 + return NULL; 1.129 + } 1.130 + } 1.131 + if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { 1.132 + sawParen = TRUE; 1.133 + if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 1.134 + specsB = parseFilterID(id, pos, TRUE); 1.135 + // Must close with a ')' 1.136 + if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 1.137 + delete specsA; 1.138 + pos = start; 1.139 + return NULL; 1.140 + } 1.141 + } 1.142 + break; 1.143 + } 1.144 + } 1.145 + 1.146 + // Assemble return results 1.147 + SingleID* single; 1.148 + if (sawParen) { 1.149 + if (dir == FORWARD) { 1.150 + SingleID* b = specsToID(specsB, FORWARD); 1.151 + single = specsToID(specsA, FORWARD); 1.152 + // Null pointers check 1.153 + if (b == NULL || single == NULL) { 1.154 + delete b; 1.155 + delete single; 1.156 + status = U_MEMORY_ALLOCATION_ERROR; 1.157 + return NULL; 1.158 + } 1.159 + single->canonID.append(OPEN_REV) 1.160 + .append(b->canonID).append(CLOSE_REV); 1.161 + if (specsA != NULL) { 1.162 + single->filter = specsA->filter; 1.163 + } 1.164 + delete b; 1.165 + } else { 1.166 + SingleID* a = specsToID(specsA, FORWARD); 1.167 + single = specsToID(specsB, FORWARD); 1.168 + // Check for null pointer. 1.169 + if (a == NULL || single == NULL) { 1.170 + delete a; 1.171 + delete single; 1.172 + status = U_MEMORY_ALLOCATION_ERROR; 1.173 + return NULL; 1.174 + } 1.175 + single->canonID.append(OPEN_REV) 1.176 + .append(a->canonID).append(CLOSE_REV); 1.177 + if (specsB != NULL) { 1.178 + single->filter = specsB->filter; 1.179 + } 1.180 + delete a; 1.181 + } 1.182 + } else { 1.183 + // assert(specsA != NULL); 1.184 + if (dir == FORWARD) { 1.185 + single = specsToID(specsA, FORWARD); 1.186 + } else { 1.187 + single = specsToSpecialInverse(*specsA, status); 1.188 + if (single == NULL) { 1.189 + single = specsToID(specsA, REVERSE); 1.190 + } 1.191 + } 1.192 + // Check for NULL pointer 1.193 + if (single == NULL) { 1.194 + status = U_MEMORY_ALLOCATION_ERROR; 1.195 + return NULL; 1.196 + } 1.197 + single->filter = specsA->filter; 1.198 + } 1.199 + 1.200 + delete specsA; 1.201 + delete specsB; 1.202 + 1.203 + return single; 1.204 +} 1.205 + 1.206 +/** 1.207 + * Parse a filter ID, that is, an ID of the general form 1.208 + * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 1.209 + * @param id the id to be parsed 1.210 + * @param pos INPUT-OUTPUT parameter. On input, the position of 1.211 + * the first character to parse. On output, the position after 1.212 + * the last character parsed. 1.213 + * @return a SingleID object or null if the parse fails 1.214 + */ 1.215 +TransliteratorIDParser::SingleID* 1.216 +TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { 1.217 + 1.218 + int32_t start = pos; 1.219 + 1.220 + Specs* specs = parseFilterID(id, pos, TRUE); 1.221 + if (specs == NULL) { 1.222 + pos = start; 1.223 + return NULL; 1.224 + } 1.225 + 1.226 + // Assemble return results 1.227 + SingleID* single = specsToID(specs, FORWARD); 1.228 + if (single != NULL) { 1.229 + single->filter = specs->filter; 1.230 + } 1.231 + delete specs; 1.232 + return single; 1.233 +} 1.234 + 1.235 +/** 1.236 + * Parse a global filter of the form "[f]" or "([f])", depending 1.237 + * on 'withParens'. 1.238 + * @param id the pattern the parse 1.239 + * @param pos INPUT-OUTPUT parameter. On input, the position of 1.240 + * the first character to parse. On output, the position after 1.241 + * the last character parsed. 1.242 + * @param dir the direction. 1.243 + * @param withParens INPUT-OUTPUT parameter. On entry, if 1.244 + * withParens is 0, then parens are disallowed. If it is 1, 1.245 + * then parens are requires. If it is -1, then parens are 1.246 + * optional, and the return result will be set to 0 or 1. 1.247 + * @param canonID OUTPUT parameter. The pattern for the filter 1.248 + * added to the canonID, either at the end, if dir is FORWARD, or 1.249 + * at the start, if dir is REVERSE. The pattern will be enclosed 1.250 + * in parentheses if appropriate, and will be suffixed with an 1.251 + * ID_DELIM character. May be NULL. 1.252 + * @return a UnicodeSet object or NULL. A non-NULL results 1.253 + * indicates a successful parse, regardless of whether the filter 1.254 + * applies to the given direction. The caller should discard it 1.255 + * if withParens != (dir == REVERSE). 1.256 + */ 1.257 +UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, 1.258 + int32_t dir, 1.259 + int32_t& withParens, 1.260 + UnicodeString* canonID) { 1.261 + UnicodeSet* filter = NULL; 1.262 + int32_t start = pos; 1.263 + 1.264 + if (withParens == -1) { 1.265 + withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; 1.266 + } else if (withParens == 1) { 1.267 + if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { 1.268 + pos = start; 1.269 + return NULL; 1.270 + } 1.271 + } 1.272 + 1.273 + ICU_Utility::skipWhitespace(id, pos, TRUE); 1.274 + 1.275 + if (UnicodeSet::resemblesPattern(id, pos)) { 1.276 + ParsePosition ppos(pos); 1.277 + UErrorCode ec = U_ZERO_ERROR; 1.278 + filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec); 1.279 + /* test for NULL */ 1.280 + if (filter == 0) { 1.281 + pos = start; 1.282 + return 0; 1.283 + } 1.284 + if (U_FAILURE(ec)) { 1.285 + delete filter; 1.286 + pos = start; 1.287 + return NULL; 1.288 + } 1.289 + 1.290 + UnicodeString pattern; 1.291 + id.extractBetween(pos, ppos.getIndex(), pattern); 1.292 + pos = ppos.getIndex(); 1.293 + 1.294 + if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 1.295 + pos = start; 1.296 + return NULL; 1.297 + } 1.298 + 1.299 + // In the forward direction, append the pattern to the 1.300 + // canonID. In the reverse, insert it at zero, and invert 1.301 + // the presence of parens ("A" <-> "(A)"). 1.302 + if (canonID != NULL) { 1.303 + if (dir == FORWARD) { 1.304 + if (withParens == 1) { 1.305 + pattern.insert(0, OPEN_REV); 1.306 + pattern.append(CLOSE_REV); 1.307 + } 1.308 + canonID->append(pattern).append(ID_DELIM); 1.309 + } else { 1.310 + if (withParens == 0) { 1.311 + pattern.insert(0, OPEN_REV); 1.312 + pattern.append(CLOSE_REV); 1.313 + } 1.314 + canonID->insert(0, pattern); 1.315 + canonID->insert(pattern.length(), ID_DELIM); 1.316 + } 1.317 + } 1.318 + } 1.319 + 1.320 + return filter; 1.321 +} 1.322 + 1.323 +U_CDECL_BEGIN 1.324 +static void U_CALLCONV _deleteSingleID(void* obj) { 1.325 + delete (TransliteratorIDParser::SingleID*) obj; 1.326 +} 1.327 + 1.328 +static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { 1.329 + delete (Transliterator*) obj; 1.330 +} 1.331 +U_CDECL_END 1.332 + 1.333 +/** 1.334 + * Parse a compound ID, consisting of an optional forward global 1.335 + * filter, a separator, one or more single IDs delimited by 1.336 + * separators, an an optional reverse global filter. The 1.337 + * separator is a semicolon. The global filters are UnicodeSet 1.338 + * patterns. The reverse global filter must be enclosed in 1.339 + * parentheses. 1.340 + * @param id the pattern the parse 1.341 + * @param dir the direction. 1.342 + * @param canonID OUTPUT parameter that receives the canonical ID, 1.343 + * consisting of canonical IDs for all elements, as returned by 1.344 + * parseSingleID(), separated by semicolons. Previous contents 1.345 + * are discarded. 1.346 + * @param list OUTPUT parameter that receives a list of SingleID 1.347 + * objects representing the parsed IDs. Previous contents are 1.348 + * discarded. 1.349 + * @param globalFilter OUTPUT parameter that receives a pointer to 1.350 + * a newly created global filter for this ID in this direction, or 1.351 + * NULL if there is none. 1.352 + * @return TRUE if the parse succeeds, that is, if the entire 1.353 + * id is consumed without syntax error. 1.354 + */ 1.355 +UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, 1.356 + UnicodeString& canonID, 1.357 + UVector& list, 1.358 + UnicodeSet*& globalFilter) { 1.359 + UErrorCode ec = U_ZERO_ERROR; 1.360 + int32_t i; 1.361 + int32_t pos = 0; 1.362 + int32_t withParens = 1; 1.363 + list.removeAllElements(); 1.364 + UnicodeSet* filter; 1.365 + globalFilter = NULL; 1.366 + canonID.truncate(0); 1.367 + 1.368 + // Parse leading global filter, if any 1.369 + withParens = 0; // parens disallowed 1.370 + filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 1.371 + if (filter != NULL) { 1.372 + if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 1.373 + // Not a global filter; backup and resume 1.374 + canonID.truncate(0); 1.375 + pos = 0; 1.376 + } 1.377 + if (dir == FORWARD) { 1.378 + globalFilter = filter; 1.379 + } else { 1.380 + delete filter; 1.381 + } 1.382 + filter = NULL; 1.383 + } 1.384 + 1.385 + UBool sawDelimiter = TRUE; 1.386 + for (;;) { 1.387 + SingleID* single = parseSingleID(id, pos, dir, ec); 1.388 + if (single == NULL) { 1.389 + break; 1.390 + } 1.391 + if (dir == FORWARD) { 1.392 + list.addElement(single, ec); 1.393 + } else { 1.394 + list.insertElementAt(single, 0, ec); 1.395 + } 1.396 + if (U_FAILURE(ec)) { 1.397 + goto FAIL; 1.398 + } 1.399 + if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 1.400 + sawDelimiter = FALSE; 1.401 + break; 1.402 + } 1.403 + } 1.404 + 1.405 + if (list.size() == 0) { 1.406 + goto FAIL; 1.407 + } 1.408 + 1.409 + // Construct canonical ID 1.410 + for (i=0; i<list.size(); ++i) { 1.411 + SingleID* single = (SingleID*) list.elementAt(i); 1.412 + canonID.append(single->canonID); 1.413 + if (i != (list.size()-1)) { 1.414 + canonID.append(ID_DELIM); 1.415 + } 1.416 + } 1.417 + 1.418 + // Parse trailing global filter, if any, and only if we saw 1.419 + // a trailing delimiter after the IDs. 1.420 + if (sawDelimiter) { 1.421 + withParens = 1; // parens required 1.422 + filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 1.423 + if (filter != NULL) { 1.424 + // Don't require trailing ';', but parse it if present 1.425 + ICU_Utility::parseChar(id, pos, ID_DELIM); 1.426 + 1.427 + if (dir == REVERSE) { 1.428 + globalFilter = filter; 1.429 + } else { 1.430 + delete filter; 1.431 + } 1.432 + filter = NULL; 1.433 + } 1.434 + } 1.435 + 1.436 + // Trailing unparsed text is a syntax error 1.437 + ICU_Utility::skipWhitespace(id, pos, TRUE); 1.438 + if (pos != id.length()) { 1.439 + goto FAIL; 1.440 + } 1.441 + 1.442 + return TRUE; 1.443 + 1.444 + FAIL: 1.445 + UObjectDeleter *save = list.setDeleter(_deleteSingleID); 1.446 + list.removeAllElements(); 1.447 + list.setDeleter(save); 1.448 + delete globalFilter; 1.449 + globalFilter = NULL; 1.450 + return FALSE; 1.451 +} 1.452 + 1.453 +/** 1.454 + * Convert the elements of the 'list' vector, which are SingleID 1.455 + * objects, into actual Transliterator objects. In the course of 1.456 + * this, some (or all) entries may be removed. If all entries 1.457 + * are removed, the NULL transliterator will be added. 1.458 + * 1.459 + * Delete entries with empty basicIDs; these are generated by 1.460 + * elements like "(A)" in the forward direction, or "A()" in 1.461 + * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 1.462 + * SingleID entries to actual transliterators. 1.463 + * 1.464 + * @param list vector of SingleID objects. On exit, vector 1.465 + * of one or more Transliterators. 1.466 + * @return new value of insertIndex. The index will shift if 1.467 + * there are empty items, like "(Lower)", with indices less than 1.468 + * insertIndex. 1.469 + */ 1.470 +void TransliteratorIDParser::instantiateList(UVector& list, 1.471 + UErrorCode& ec) { 1.472 + UVector tlist(ec); 1.473 + if (U_FAILURE(ec)) { 1.474 + goto RETURN; 1.475 + } 1.476 + tlist.setDeleter(_deleteTransliteratorTrIDPars); 1.477 + 1.478 + Transliterator* t; 1.479 + int32_t i; 1.480 + for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() 1.481 + // We run the loop too long by one, so we can 1.482 + // do an insert after the last element 1.483 + if (i==list.size()) { 1.484 + break; 1.485 + } 1.486 + 1.487 + SingleID* single = (SingleID*) list.elementAt(i); 1.488 + if (single->basicID.length() != 0) { 1.489 + t = single->createInstance(); 1.490 + if (t == NULL) { 1.491 + ec = U_INVALID_ID; 1.492 + goto RETURN; 1.493 + } 1.494 + tlist.addElement(t, ec); 1.495 + if (U_FAILURE(ec)) { 1.496 + delete t; 1.497 + goto RETURN; 1.498 + } 1.499 + } 1.500 + } 1.501 + 1.502 + // An empty list is equivalent to a NULL transliterator. 1.503 + if (tlist.size() == 0) { 1.504 + t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL); 1.505 + if (t == NULL) { 1.506 + // Should never happen 1.507 + ec = U_INTERNAL_TRANSLITERATOR_ERROR; 1.508 + } 1.509 + tlist.addElement(t, ec); 1.510 + if (U_FAILURE(ec)) { 1.511 + delete t; 1.512 + } 1.513 + } 1.514 + 1.515 + RETURN: 1.516 + 1.517 + UObjectDeleter *save = list.setDeleter(_deleteSingleID); 1.518 + list.removeAllElements(); 1.519 + 1.520 + if (U_SUCCESS(ec)) { 1.521 + list.setDeleter(_deleteTransliteratorTrIDPars); 1.522 + 1.523 + while (tlist.size() > 0) { 1.524 + t = (Transliterator*) tlist.orphanElementAt(0); 1.525 + list.addElement(t, ec); 1.526 + if (U_FAILURE(ec)) { 1.527 + delete t; 1.528 + list.removeAllElements(); 1.529 + break; 1.530 + } 1.531 + } 1.532 + } 1.533 + 1.534 + list.setDeleter(save); 1.535 +} 1.536 + 1.537 +/** 1.538 + * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 1.539 + * S-T/V, or S/V-T. If the source is missing, return a source of 1.540 + * ANY. 1.541 + * @param id the id string, in any of several forms 1.542 + * @return an array of 4 strings: source, target, variant, and 1.543 + * isSourcePresent. If the source is not present, ANY will be 1.544 + * given as the source, and isSourcePresent will be NULL. Otherwise 1.545 + * isSourcePresent will be non-NULL. The target may be empty if the 1.546 + * id is not well-formed. The variant may be empty. 1.547 + */ 1.548 +void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, 1.549 + UnicodeString& source, 1.550 + UnicodeString& target, 1.551 + UnicodeString& variant, 1.552 + UBool& isSourcePresent) { 1.553 + source.setTo(ANY, 3); 1.554 + target.truncate(0); 1.555 + variant.truncate(0); 1.556 + 1.557 + int32_t sep = id.indexOf(TARGET_SEP); 1.558 + int32_t var = id.indexOf(VARIANT_SEP); 1.559 + if (var < 0) { 1.560 + var = id.length(); 1.561 + } 1.562 + isSourcePresent = FALSE; 1.563 + 1.564 + if (sep < 0) { 1.565 + // Form: T/V or T (or /V) 1.566 + id.extractBetween(0, var, target); 1.567 + id.extractBetween(var, id.length(), variant); 1.568 + } else if (sep < var) { 1.569 + // Form: S-T/V or S-T (or -T/V or -T) 1.570 + if (sep > 0) { 1.571 + id.extractBetween(0, sep, source); 1.572 + isSourcePresent = TRUE; 1.573 + } 1.574 + id.extractBetween(++sep, var, target); 1.575 + id.extractBetween(var, id.length(), variant); 1.576 + } else { 1.577 + // Form: (S/V-T or /V-T) 1.578 + if (var > 0) { 1.579 + id.extractBetween(0, var, source); 1.580 + isSourcePresent = TRUE; 1.581 + } 1.582 + id.extractBetween(var, sep++, variant); 1.583 + id.extractBetween(sep, id.length(), target); 1.584 + } 1.585 + 1.586 + if (variant.length() > 0) { 1.587 + variant.remove(0, 1); 1.588 + } 1.589 +} 1.590 + 1.591 +/** 1.592 + * Given source, target, and variant strings, concatenate them into a 1.593 + * full ID. If the source is empty, then "Any" will be used for the 1.594 + * source, so the ID will always be of the form s-t/v or s-t. 1.595 + */ 1.596 +void TransliteratorIDParser::STVtoID(const UnicodeString& source, 1.597 + const UnicodeString& target, 1.598 + const UnicodeString& variant, 1.599 + UnicodeString& id) { 1.600 + id = source; 1.601 + if (id.length() == 0) { 1.602 + id.setTo(ANY, 3); 1.603 + } 1.604 + id.append(TARGET_SEP).append(target); 1.605 + if (variant.length() != 0) { 1.606 + id.append(VARIANT_SEP).append(variant); 1.607 + } 1.608 + // NUL-terminate the ID string for getTerminatedBuffer. 1.609 + // This prevents valgrind and Purify warnings. 1.610 + id.append((UChar)0); 1.611 + id.truncate(id.length()-1); 1.612 +} 1.613 + 1.614 +/** 1.615 + * Register two targets as being inverses of one another. For 1.616 + * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes 1.617 + * Transliterator to form the following inverse relationships: 1.618 + * 1.619 + * <pre>NFC => NFD 1.620 + * Any-NFC => Any-NFD 1.621 + * NFD => NFC 1.622 + * Any-NFD => Any-NFC</pre> 1.623 + * 1.624 + * (Without the special inverse registration, the inverse of NFC 1.625 + * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 1.626 + * that the presence or absence of "Any-" is preserved. 1.627 + * 1.628 + * <p>The relationship is symmetrical; registering (a, b) is 1.629 + * equivalent to registering (b, a). 1.630 + * 1.631 + * <p>The relevant IDs must still be registered separately as 1.632 + * factories or classes. 1.633 + * 1.634 + * <p>Only the targets are specified. Special inverses always 1.635 + * have the form Any-Target1 <=> Any-Target2. The target should 1.636 + * have canonical casing (the casing desired to be produced when 1.637 + * an inverse is formed) and should contain no whitespace or other 1.638 + * extraneous characters. 1.639 + * 1.640 + * @param target the target against which to register the inverse 1.641 + * @param inverseTarget the inverse of target, that is 1.642 + * Any-target.getInverse() => Any-inverseTarget 1.643 + * @param bidirectional if TRUE, register the reverse relation 1.644 + * as well, that is, Any-inverseTarget.getInverse() => Any-target 1.645 + */ 1.646 +void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, 1.647 + const UnicodeString& inverseTarget, 1.648 + UBool bidirectional, 1.649 + UErrorCode &status) { 1.650 + init(status); 1.651 + if (U_FAILURE(status)) { 1.652 + return; 1.653 + } 1.654 + 1.655 + // If target == inverseTarget then force bidirectional => FALSE 1.656 + if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { 1.657 + bidirectional = FALSE; 1.658 + } 1.659 + 1.660 + Mutex lock(&LOCK); 1.661 + 1.662 + UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. 1.663 + if (tempus == NULL) { 1.664 + status = U_MEMORY_ALLOCATION_ERROR; 1.665 + return; 1.666 + } 1.667 + SPECIAL_INVERSES->put(target, tempus, status); 1.668 + if (bidirectional) { 1.669 + tempus = new UnicodeString(target); 1.670 + if (tempus == NULL) { 1.671 + status = U_MEMORY_ALLOCATION_ERROR; 1.672 + return; 1.673 + } 1.674 + SPECIAL_INVERSES->put(inverseTarget, tempus, status); 1.675 + } 1.676 +} 1.677 + 1.678 +//---------------------------------------------------------------- 1.679 +// Private implementation 1.680 +//---------------------------------------------------------------- 1.681 + 1.682 +/** 1.683 + * Parse an ID into component pieces. Take IDs of the form T, 1.684 + * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 1.685 + * source of ANY. 1.686 + * @param id the id string, in any of several forms 1.687 + * @param pos INPUT-OUTPUT parameter. On input, pos is the 1.688 + * offset of the first character to parse in id. On output, 1.689 + * pos is the offset after the last parsed character. If the 1.690 + * parse failed, pos will be unchanged. 1.691 + * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed 1.692 + * at any location between specs or delimiters, and is returned 1.693 + * as the fifth string in the array. 1.694 + * @return a Specs object, or NULL if the parse failed. If 1.695 + * neither source nor target was seen in the parsed id, then the 1.696 + * parse fails. If allowFilter is TRUE, then the parsed filter 1.697 + * pattern is returned in the Specs object, otherwise the returned 1.698 + * filter reference is NULL. If the parse fails for any reason 1.699 + * NULL is returned. 1.700 + */ 1.701 +TransliteratorIDParser::Specs* 1.702 +TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, 1.703 + UBool allowFilter) { 1.704 + UnicodeString first; 1.705 + UnicodeString source; 1.706 + UnicodeString target; 1.707 + UnicodeString variant; 1.708 + UnicodeString filter; 1.709 + UChar delimiter = 0; 1.710 + int32_t specCount = 0; 1.711 + int32_t start = pos; 1.712 + 1.713 + // This loop parses one of the following things with each 1.714 + // pass: a filter, a delimiter character (either '-' or '/'), 1.715 + // or a spec (source, target, or variant). 1.716 + for (;;) { 1.717 + ICU_Utility::skipWhitespace(id, pos, TRUE); 1.718 + if (pos == id.length()) { 1.719 + break; 1.720 + } 1.721 + 1.722 + // Parse filters 1.723 + if (allowFilter && filter.length() == 0 && 1.724 + UnicodeSet::resemblesPattern(id, pos)) { 1.725 + 1.726 + ParsePosition ppos(pos); 1.727 + UErrorCode ec = U_ZERO_ERROR; 1.728 + UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec); 1.729 + if (U_FAILURE(ec)) { 1.730 + pos = start; 1.731 + return NULL; 1.732 + } 1.733 + id.extractBetween(pos, ppos.getIndex(), filter); 1.734 + pos = ppos.getIndex(); 1.735 + continue; 1.736 + } 1.737 + 1.738 + if (delimiter == 0) { 1.739 + UChar c = id.charAt(pos); 1.740 + if ((c == TARGET_SEP && target.length() == 0) || 1.741 + (c == VARIANT_SEP && variant.length() == 0)) { 1.742 + delimiter = c; 1.743 + ++pos; 1.744 + continue; 1.745 + } 1.746 + } 1.747 + 1.748 + // We are about to try to parse a spec with no delimiter 1.749 + // when we can no longer do so (we can only do so at the 1.750 + // start); break. 1.751 + if (delimiter == 0 && specCount > 0) { 1.752 + break; 1.753 + } 1.754 + 1.755 + UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); 1.756 + if (spec.length() == 0) { 1.757 + // Note that if there was a trailing delimiter, we 1.758 + // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- 1.759 + // are legal. 1.760 + break; 1.761 + } 1.762 + 1.763 + switch (delimiter) { 1.764 + case 0: 1.765 + first = spec; 1.766 + break; 1.767 + case TARGET_SEP: 1.768 + target = spec; 1.769 + break; 1.770 + case VARIANT_SEP: 1.771 + variant = spec; 1.772 + break; 1.773 + } 1.774 + ++specCount; 1.775 + delimiter = 0; 1.776 + } 1.777 + 1.778 + // A spec with no prior character is either source or target, 1.779 + // depending on whether an explicit "-target" was seen. 1.780 + if (first.length() != 0) { 1.781 + if (target.length() == 0) { 1.782 + target = first; 1.783 + } else { 1.784 + source = first; 1.785 + } 1.786 + } 1.787 + 1.788 + // Must have either source or target 1.789 + if (source.length() == 0 && target.length() == 0) { 1.790 + pos = start; 1.791 + return NULL; 1.792 + } 1.793 + 1.794 + // Empty source or target defaults to ANY 1.795 + UBool sawSource = TRUE; 1.796 + if (source.length() == 0) { 1.797 + source.setTo(ANY, 3); 1.798 + sawSource = FALSE; 1.799 + } 1.800 + if (target.length() == 0) { 1.801 + target.setTo(ANY, 3); 1.802 + } 1.803 + 1.804 + return new Specs(source, target, variant, sawSource, filter); 1.805 +} 1.806 + 1.807 +/** 1.808 + * Givens a Spec object, convert it to a SingleID object. The 1.809 + * Spec object is a more unprocessed parse result. The SingleID 1.810 + * object contains information about canonical and basic IDs. 1.811 + * @return a SingleID; never returns NULL. Returned object always 1.812 + * has 'filter' field of NULL. 1.813 + */ 1.814 +TransliteratorIDParser::SingleID* 1.815 +TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { 1.816 + UnicodeString canonID; 1.817 + UnicodeString basicID; 1.818 + UnicodeString basicPrefix; 1.819 + if (specs != NULL) { 1.820 + UnicodeString buf; 1.821 + if (dir == FORWARD) { 1.822 + if (specs->sawSource) { 1.823 + buf.append(specs->source).append(TARGET_SEP); 1.824 + } else { 1.825 + basicPrefix = specs->source; 1.826 + basicPrefix.append(TARGET_SEP); 1.827 + } 1.828 + buf.append(specs->target); 1.829 + } else { 1.830 + buf.append(specs->target).append(TARGET_SEP).append(specs->source); 1.831 + } 1.832 + if (specs->variant.length() != 0) { 1.833 + buf.append(VARIANT_SEP).append(specs->variant); 1.834 + } 1.835 + basicID = basicPrefix; 1.836 + basicID.append(buf); 1.837 + if (specs->filter.length() != 0) { 1.838 + buf.insert(0, specs->filter); 1.839 + } 1.840 + canonID = buf; 1.841 + } 1.842 + return new SingleID(canonID, basicID); 1.843 +} 1.844 + 1.845 +/** 1.846 + * Given a Specs object, return a SingleID representing the 1.847 + * special inverse of that ID. If there is no special inverse 1.848 + * then return NULL. 1.849 + * @return a SingleID or NULL. Returned object always has 1.850 + * 'filter' field of NULL. 1.851 + */ 1.852 +TransliteratorIDParser::SingleID* 1.853 +TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { 1.854 + if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { 1.855 + return NULL; 1.856 + } 1.857 + init(status); 1.858 + 1.859 + UnicodeString* inverseTarget; 1.860 + 1.861 + umtx_lock(&LOCK); 1.862 + inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target); 1.863 + umtx_unlock(&LOCK); 1.864 + 1.865 + if (inverseTarget != NULL) { 1.866 + // If the original ID contained "Any-" then make the 1.867 + // special inverse "Any-Foo"; otherwise make it "Foo". 1.868 + // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". 1.869 + UnicodeString buf; 1.870 + if (specs.filter.length() != 0) { 1.871 + buf.append(specs.filter); 1.872 + } 1.873 + if (specs.sawSource) { 1.874 + buf.append(ANY, 3).append(TARGET_SEP); 1.875 + } 1.876 + buf.append(*inverseTarget); 1.877 + 1.878 + UnicodeString basicID(TRUE, ANY, 3); 1.879 + basicID.append(TARGET_SEP).append(*inverseTarget); 1.880 + 1.881 + if (specs.variant.length() != 0) { 1.882 + buf.append(VARIANT_SEP).append(specs.variant); 1.883 + basicID.append(VARIANT_SEP).append(specs.variant); 1.884 + } 1.885 + return new SingleID(buf, basicID); 1.886 + } 1.887 + return NULL; 1.888 +} 1.889 + 1.890 +/** 1.891 + * Glue method to get around access problems in C++. This would 1.892 + * ideally be inline but we want to avoid a circular header 1.893 + * dependency. 1.894 + */ 1.895 +Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1.896 + return Transliterator::createBasicInstance(id, canonID); 1.897 +} 1.898 + 1.899 +/** 1.900 + * Initialize static memory. 1.901 + */ 1.902 +void TransliteratorIDParser::init(UErrorCode &status) { 1.903 + if (SPECIAL_INVERSES != NULL) { 1.904 + return; 1.905 + } 1.906 + 1.907 + Hashtable* special_inverses = new Hashtable(TRUE, status); 1.908 + // Null pointer check 1.909 + if (special_inverses == NULL) { 1.910 + status = U_MEMORY_ALLOCATION_ERROR; 1.911 + return; 1.912 + } 1.913 + special_inverses->setValueDeleter(uprv_deleteUObject); 1.914 + 1.915 + umtx_lock(&LOCK); 1.916 + if (SPECIAL_INVERSES == NULL) { 1.917 + SPECIAL_INVERSES = special_inverses; 1.918 + special_inverses = NULL; 1.919 + } 1.920 + umtx_unlock(&LOCK); 1.921 + delete special_inverses; /*null instance*/ 1.922 + 1.923 + ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); 1.924 +} 1.925 + 1.926 +/** 1.927 + * Free static memory. 1.928 + */ 1.929 +void TransliteratorIDParser::cleanup() { 1.930 + if (SPECIAL_INVERSES) { 1.931 + delete SPECIAL_INVERSES; 1.932 + SPECIAL_INVERSES = NULL; 1.933 + } 1.934 +} 1.935 + 1.936 +U_NAMESPACE_END 1.937 + 1.938 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.939 + 1.940 +//eof