intl/icu/source/i18n/tridpars.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/tridpars.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,937 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (c) 2002-2012, International Business Machines Corporation
     1.7 +*   and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   01/14/2002  aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "tridpars.h"
    1.19 +#include "hash.h"
    1.20 +#include "mutex.h"
    1.21 +#include "ucln_in.h"
    1.22 +#include "unicode/parsepos.h"
    1.23 +#include "unicode/translit.h"
    1.24 +#include "unicode/uchar.h"
    1.25 +#include "unicode/uniset.h"
    1.26 +#include "unicode/unistr.h"
    1.27 +#include "unicode/utrans.h"
    1.28 +#include "util.h"
    1.29 +#include "uvector.h"
    1.30 +
    1.31 +U_NAMESPACE_BEGIN
    1.32 +
// Punctuation characters that delimit the parts of a transliterator ID.
static const UChar ID_DELIM    = 0x003B; // ;
static const UChar TARGET_SEP  = 0x002D; // -
static const UChar VARIANT_SEP = 0x002F; // /
static const UChar OPEN_REV    = 0x0028; // (
static const UChar CLOSE_REV   = 0x0029; // )

//static const UChar EMPTY[]     = {0}; // ""
// NUL-terminated UTF-16 literals.  Code in this file passes their lengths
// (3 and 8, excluding the terminator) explicitly.
static const UChar ANY[]       = {65,110,121,0}; // "Any"
static const UChar ANY_NULL[]  = {65,110,121,45,78,117,108,108,0}; // "Any-Null"

// Local aliases for the UTRANS_FORWARD / UTRANS_REVERSE direction values.
static const int32_t FORWARD = UTRANS_FORWARD;
static const int32_t REVERSE = UTRANS_REVERSE;

// Table filled by registerSpecialInverse(): each key is a target name and
// each value is a heap-allocated UnicodeString naming the inverse target.
static Hashtable* SPECIAL_INVERSES = NULL;

/**
 * The mutex controlling access to SPECIAL_INVERSES
 */
static UMutex LOCK = U_MUTEX_INITIALIZER;
    1.52 +
    1.53 +TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
    1.54 +                                     const UnicodeString& v, UBool sawS,
    1.55 +                                     const UnicodeString& f) {
    1.56 +    source = s;
    1.57 +    target = t;
    1.58 +    variant = v;
    1.59 +    sawSource = sawS;
    1.60 +    filter = f;
    1.61 +}
    1.62 +
    1.63 +TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
    1.64 +                                           const UnicodeString& f) {
    1.65 +    canonID = c;
    1.66 +    basicID = b;
    1.67 +    filter = f;
    1.68 +}
    1.69 +
    1.70 +TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
    1.71 +    canonID = c;
    1.72 +    basicID = b;
    1.73 +}
    1.74 +
    1.75 +Transliterator* TransliteratorIDParser::SingleID::createInstance() {
    1.76 +    Transliterator* t;
    1.77 +    if (basicID.length() == 0) {
    1.78 +        t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID);
    1.79 +    } else {
    1.80 +        t = createBasicInstance(basicID, &canonID);
    1.81 +    }
    1.82 +    if (t != NULL) {
    1.83 +        if (filter.length() != 0) {
    1.84 +            UErrorCode ec = U_ZERO_ERROR;
    1.85 +            UnicodeSet *set = new UnicodeSet(filter, ec);
    1.86 +            if (U_FAILURE(ec)) {
    1.87 +                delete set;
    1.88 +            } else {
    1.89 +                t->adoptFilter(set);
    1.90 +            }
    1.91 +        }
    1.92 +    }
    1.93 +    return t;
    1.94 +}
    1.95 +
    1.96 +
    1.97 +/**
    1.98 + * Parse a single ID, that is, an ID of the general form
    1.99 + * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
   1.100 + * optional, the filters optional, and the variants optional.
   1.101 + * @param id the id to be parsed
   1.102 + * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.103 + * the first character to parse.  On output, the position after
   1.104 + * the last character parsed.
   1.105 + * @param dir the direction.  If the direction is REVERSE then the
   1.106 + * SingleID is constructed for the reverse direction.
   1.107 + * @return a SingleID object or NULL
   1.108 + */
   1.109 +TransliteratorIDParser::SingleID*
   1.110 +TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
   1.111 +                                      int32_t dir, UErrorCode& status) {
   1.112 +
   1.113 +    int32_t start = pos;
   1.114 +
   1.115 +    // The ID will be of the form A, A(), A(B), or (B), where
   1.116 +    // A and B are filter IDs.
   1.117 +    Specs* specsA = NULL;
   1.118 +    Specs* specsB = NULL;
   1.119 +    UBool sawParen = FALSE;
   1.120 +
   1.121 +    // On the first pass, look for (B) or ().  If this fails, then
   1.122 +    // on the second pass, look for A, A(B), or A().
   1.123 +    for (int32_t pass=1; pass<=2; ++pass) {
   1.124 +        if (pass == 2) {
   1.125 +            specsA = parseFilterID(id, pos, TRUE);
   1.126 +            if (specsA == NULL) {
   1.127 +                pos = start;
   1.128 +                return NULL;
   1.129 +            }
   1.130 +        }
   1.131 +        if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
   1.132 +            sawParen = TRUE;
   1.133 +            if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
   1.134 +                specsB = parseFilterID(id, pos, TRUE);
   1.135 +                // Must close with a ')'
   1.136 +                if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
   1.137 +                    delete specsA;
   1.138 +                    pos = start;
   1.139 +                    return NULL;
   1.140 +                }
   1.141 +            }
   1.142 +            break;
   1.143 +        }
   1.144 +    }
   1.145 +
   1.146 +    // Assemble return results
   1.147 +    SingleID* single;
   1.148 +    if (sawParen) {
   1.149 +        if (dir == FORWARD) {
   1.150 +            SingleID* b = specsToID(specsB, FORWARD);
   1.151 +            single = specsToID(specsA, FORWARD);
   1.152 +            // Null pointers check
   1.153 +            if (b == NULL || single == NULL) {
   1.154 +            	delete b;
   1.155 +            	delete single;
   1.156 +            	status = U_MEMORY_ALLOCATION_ERROR;
   1.157 +            	return NULL;
   1.158 +            }
   1.159 +            single->canonID.append(OPEN_REV)
   1.160 +                .append(b->canonID).append(CLOSE_REV);
   1.161 +            if (specsA != NULL) {
   1.162 +                single->filter = specsA->filter;
   1.163 +            }
   1.164 +            delete b;
   1.165 +        } else {
   1.166 +            SingleID* a = specsToID(specsA, FORWARD);
   1.167 +            single = specsToID(specsB, FORWARD);
   1.168 +            // Check for null pointer.
   1.169 +            if (a == NULL || single == NULL) {
   1.170 +            	delete a;
   1.171 +            	delete single;
   1.172 +            	status = U_MEMORY_ALLOCATION_ERROR;
   1.173 +            	return NULL;
   1.174 +            }
   1.175 +            single->canonID.append(OPEN_REV)
   1.176 +                .append(a->canonID).append(CLOSE_REV);
   1.177 +            if (specsB != NULL) {
   1.178 +                single->filter = specsB->filter;
   1.179 +            }
   1.180 +            delete a;
   1.181 +        }
   1.182 +    } else {
   1.183 +        // assert(specsA != NULL);
   1.184 +        if (dir == FORWARD) {
   1.185 +            single = specsToID(specsA, FORWARD);
   1.186 +        } else {
   1.187 +            single = specsToSpecialInverse(*specsA, status);
   1.188 +            if (single == NULL) {
   1.189 +                single = specsToID(specsA, REVERSE);
   1.190 +            }
   1.191 +        }
   1.192 +        // Check for NULL pointer
   1.193 +        if (single == NULL) {
   1.194 +        	status = U_MEMORY_ALLOCATION_ERROR;
   1.195 +        	return NULL;
   1.196 +        }
   1.197 +        single->filter = specsA->filter;
   1.198 +    }
   1.199 +
   1.200 +    delete specsA;
   1.201 +    delete specsB;
   1.202 +
   1.203 +    return single;
   1.204 +}
   1.205 +
   1.206 +/**
   1.207 + * Parse a filter ID, that is, an ID of the general form
   1.208 + * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
   1.209 + * @param id the id to be parsed
   1.210 + * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.211 + * the first character to parse.  On output, the position after
   1.212 + * the last character parsed.
   1.213 + * @return a SingleID object or null if the parse fails
   1.214 + */
   1.215 +TransliteratorIDParser::SingleID*
   1.216 +TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {
   1.217 +
   1.218 +    int32_t start = pos;
   1.219 +
   1.220 +    Specs* specs = parseFilterID(id, pos, TRUE);
   1.221 +    if (specs == NULL) {
   1.222 +        pos = start;
   1.223 +        return NULL;
   1.224 +    }
   1.225 +
   1.226 +    // Assemble return results
   1.227 +    SingleID* single = specsToID(specs, FORWARD);
   1.228 +    if (single != NULL) {
   1.229 +        single->filter = specs->filter;
   1.230 +    }
   1.231 +    delete specs;
   1.232 +    return single;
   1.233 +}
   1.234 +
   1.235 +/**
   1.236 + * Parse a global filter of the form "[f]" or "([f])", depending
   1.237 + * on 'withParens'.
   1.238 + * @param id the pattern the parse
   1.239 + * @param pos INPUT-OUTPUT parameter.  On input, the position of
   1.240 + * the first character to parse.  On output, the position after
   1.241 + * the last character parsed.
   1.242 + * @param dir the direction.
   1.243 + * @param withParens INPUT-OUTPUT parameter.  On entry, if
   1.244 + * withParens is 0, then parens are disallowed.  If it is 1,
   1.245 + * then parens are requires.  If it is -1, then parens are
   1.246 + * optional, and the return result will be set to 0 or 1.
   1.247 + * @param canonID OUTPUT parameter.  The pattern for the filter
   1.248 + * added to the canonID, either at the end, if dir is FORWARD, or
   1.249 + * at the start, if dir is REVERSE.  The pattern will be enclosed
   1.250 + * in parentheses if appropriate, and will be suffixed with an
   1.251 + * ID_DELIM character.  May be NULL.
   1.252 + * @return a UnicodeSet object or NULL.  A non-NULL results
   1.253 + * indicates a successful parse, regardless of whether the filter
   1.254 + * applies to the given direction.  The caller should discard it
   1.255 + * if withParens != (dir == REVERSE).
   1.256 + */
   1.257 +UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
   1.258 +                                                      int32_t dir,
   1.259 +                                                      int32_t& withParens,
   1.260 +                                                      UnicodeString* canonID) {
   1.261 +    UnicodeSet* filter = NULL;
   1.262 +    int32_t start = pos;
   1.263 +
   1.264 +    if (withParens == -1) {
   1.265 +        withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0;
   1.266 +    } else if (withParens == 1) {
   1.267 +        if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
   1.268 +            pos = start;
   1.269 +            return NULL;
   1.270 +        }
   1.271 +    }
   1.272 +
   1.273 +    ICU_Utility::skipWhitespace(id, pos, TRUE);
   1.274 +
   1.275 +    if (UnicodeSet::resemblesPattern(id, pos)) {
   1.276 +        ParsePosition ppos(pos);
   1.277 +        UErrorCode ec = U_ZERO_ERROR;
   1.278 +        filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec);
   1.279 +        /* test for NULL */
   1.280 +        if (filter == 0) {
   1.281 +            pos = start;
   1.282 +            return 0;
   1.283 +        }
   1.284 +        if (U_FAILURE(ec)) {
   1.285 +            delete filter;
   1.286 +            pos = start;
   1.287 +            return NULL;
   1.288 +        }
   1.289 +
   1.290 +        UnicodeString pattern;
   1.291 +        id.extractBetween(pos, ppos.getIndex(), pattern);
   1.292 +        pos = ppos.getIndex();
   1.293 +
   1.294 +        if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
   1.295 +            pos = start;
   1.296 +            return NULL;
   1.297 +        }
   1.298 +
   1.299 +        // In the forward direction, append the pattern to the
   1.300 +        // canonID.  In the reverse, insert it at zero, and invert
   1.301 +        // the presence of parens ("A" <-> "(A)").
   1.302 +        if (canonID != NULL) {
   1.303 +            if (dir == FORWARD) {
   1.304 +                if (withParens == 1) {
   1.305 +                    pattern.insert(0, OPEN_REV);
   1.306 +                    pattern.append(CLOSE_REV);
   1.307 +                }
   1.308 +                canonID->append(pattern).append(ID_DELIM);
   1.309 +            } else {
   1.310 +                if (withParens == 0) {
   1.311 +                    pattern.insert(0, OPEN_REV);
   1.312 +                    pattern.append(CLOSE_REV);
   1.313 +                }
   1.314 +                canonID->insert(0, pattern);
   1.315 +                canonID->insert(pattern.length(), ID_DELIM);
   1.316 +            }
   1.317 +        }
   1.318 +    }
   1.319 +
   1.320 +    return filter;
   1.321 +}
   1.322 +
   1.323 +U_CDECL_BEGIN
   1.324 +static void U_CALLCONV _deleteSingleID(void* obj) {
   1.325 +    delete (TransliteratorIDParser::SingleID*) obj;
   1.326 +}
   1.327 +
   1.328 +static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) {
   1.329 +    delete (Transliterator*) obj;
   1.330 +}
   1.331 +U_CDECL_END
   1.332 +
   1.333 +/**
   1.334 + * Parse a compound ID, consisting of an optional forward global
   1.335 + * filter, a separator, one or more single IDs delimited by
   1.336 + * separators, an an optional reverse global filter.  The
   1.337 + * separator is a semicolon.  The global filters are UnicodeSet
   1.338 + * patterns.  The reverse global filter must be enclosed in
   1.339 + * parentheses.
   1.340 + * @param id the pattern the parse
   1.341 + * @param dir the direction.
   1.342 + * @param canonID OUTPUT parameter that receives the canonical ID,
   1.343 + * consisting of canonical IDs for all elements, as returned by
   1.344 + * parseSingleID(), separated by semicolons.  Previous contents
   1.345 + * are discarded.
   1.346 + * @param list OUTPUT parameter that receives a list of SingleID
   1.347 + * objects representing the parsed IDs.  Previous contents are
   1.348 + * discarded.
   1.349 + * @param globalFilter OUTPUT parameter that receives a pointer to
   1.350 + * a newly created global filter for this ID in this direction, or
   1.351 + * NULL if there is none.
   1.352 + * @return TRUE if the parse succeeds, that is, if the entire
   1.353 + * id is consumed without syntax error.
   1.354 + */
   1.355 +UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
   1.356 +                                              UnicodeString& canonID,
   1.357 +                                              UVector& list,
   1.358 +                                              UnicodeSet*& globalFilter) {
   1.359 +    UErrorCode ec = U_ZERO_ERROR;
   1.360 +    int32_t i;
   1.361 +    int32_t pos = 0;
   1.362 +    int32_t withParens = 1;
   1.363 +    list.removeAllElements();
   1.364 +    UnicodeSet* filter;
   1.365 +    globalFilter = NULL;
   1.366 +    canonID.truncate(0);
   1.367 +
   1.368 +    // Parse leading global filter, if any
   1.369 +    withParens = 0; // parens disallowed
   1.370 +    filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
   1.371 +    if (filter != NULL) {
   1.372 +        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
   1.373 +            // Not a global filter; backup and resume
   1.374 +            canonID.truncate(0);
   1.375 +            pos = 0;
   1.376 +        }
   1.377 +        if (dir == FORWARD) {
   1.378 +            globalFilter = filter;
   1.379 +        } else {
   1.380 +            delete filter;
   1.381 +        }
   1.382 +        filter = NULL;
   1.383 +    }
   1.384 +
   1.385 +    UBool sawDelimiter = TRUE;
   1.386 +    for (;;) {
   1.387 +        SingleID* single = parseSingleID(id, pos, dir, ec);
   1.388 +        if (single == NULL) {
   1.389 +            break;
   1.390 +        }
   1.391 +        if (dir == FORWARD) {
   1.392 +            list.addElement(single, ec);
   1.393 +        } else {
   1.394 +            list.insertElementAt(single, 0, ec);
   1.395 +        }
   1.396 +        if (U_FAILURE(ec)) {
   1.397 +            goto FAIL;
   1.398 +        }
   1.399 +        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
   1.400 +            sawDelimiter = FALSE;
   1.401 +            break;
   1.402 +        }
   1.403 +    }
   1.404 +
   1.405 +    if (list.size() == 0) {
   1.406 +        goto FAIL;
   1.407 +    }
   1.408 +
   1.409 +    // Construct canonical ID
   1.410 +    for (i=0; i<list.size(); ++i) {
   1.411 +        SingleID* single = (SingleID*) list.elementAt(i);
   1.412 +        canonID.append(single->canonID);
   1.413 +        if (i != (list.size()-1)) {
   1.414 +            canonID.append(ID_DELIM);
   1.415 +        }
   1.416 +    }
   1.417 +
   1.418 +    // Parse trailing global filter, if any, and only if we saw
   1.419 +    // a trailing delimiter after the IDs.
   1.420 +    if (sawDelimiter) {
   1.421 +        withParens = 1; // parens required
   1.422 +        filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
   1.423 +        if (filter != NULL) {
   1.424 +            // Don't require trailing ';', but parse it if present
   1.425 +            ICU_Utility::parseChar(id, pos, ID_DELIM);
   1.426 +
   1.427 +            if (dir == REVERSE) {
   1.428 +                globalFilter = filter;
   1.429 +            } else {
   1.430 +                delete filter;
   1.431 +            }
   1.432 +            filter = NULL;
   1.433 +        }
   1.434 +    }
   1.435 +
   1.436 +    // Trailing unparsed text is a syntax error
   1.437 +    ICU_Utility::skipWhitespace(id, pos, TRUE);
   1.438 +    if (pos != id.length()) {
   1.439 +        goto FAIL;
   1.440 +    }
   1.441 +
   1.442 +    return TRUE;
   1.443 +
   1.444 + FAIL:
   1.445 +    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
   1.446 +    list.removeAllElements();
   1.447 +    list.setDeleter(save);
   1.448 +    delete globalFilter;
   1.449 +    globalFilter = NULL;
   1.450 +    return FALSE;
   1.451 +}
   1.452 +
   1.453 +/**
   1.454 + * Convert the elements of the 'list' vector, which are SingleID
   1.455 + * objects, into actual Transliterator objects.  In the course of
   1.456 + * this, some (or all) entries may be removed.  If all entries
   1.457 + * are removed, the NULL transliterator will be added.
   1.458 + *
   1.459 + * Delete entries with empty basicIDs; these are generated by
   1.460 + * elements like "(A)" in the forward direction, or "A()" in
   1.461 + * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
   1.462 + * SingleID entries to actual transliterators.
   1.463 + *
   1.464 + * @param list vector of SingleID objects.  On exit, vector
   1.465 + * of one or more Transliterators.
   1.466 + * @return new value of insertIndex.  The index will shift if
   1.467 + * there are empty items, like "(Lower)", with indices less than
   1.468 + * insertIndex.
   1.469 + */
   1.470 +void TransliteratorIDParser::instantiateList(UVector& list,
   1.471 +                                                UErrorCode& ec) {
   1.472 +    UVector tlist(ec);
   1.473 +    if (U_FAILURE(ec)) {
   1.474 +        goto RETURN;
   1.475 +    }
   1.476 +    tlist.setDeleter(_deleteTransliteratorTrIDPars);
   1.477 +
   1.478 +    Transliterator* t;
   1.479 +    int32_t i;
   1.480 +    for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
   1.481 +        // We run the loop too long by one, so we can
   1.482 +        // do an insert after the last element
   1.483 +        if (i==list.size()) {
   1.484 +            break;
   1.485 +        }
   1.486 +
   1.487 +        SingleID* single = (SingleID*) list.elementAt(i);
   1.488 +        if (single->basicID.length() != 0) {
   1.489 +            t = single->createInstance();
   1.490 +            if (t == NULL) {
   1.491 +                ec = U_INVALID_ID;
   1.492 +                goto RETURN;
   1.493 +            }
   1.494 +            tlist.addElement(t, ec);
   1.495 +            if (U_FAILURE(ec)) {
   1.496 +                delete t;
   1.497 +                goto RETURN;
   1.498 +            }
   1.499 +        }
   1.500 +    }
   1.501 +
   1.502 +    // An empty list is equivalent to a NULL transliterator.
   1.503 +    if (tlist.size() == 0) {
   1.504 +        t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL);
   1.505 +        if (t == NULL) {
   1.506 +            // Should never happen
   1.507 +            ec = U_INTERNAL_TRANSLITERATOR_ERROR;
   1.508 +        }
   1.509 +        tlist.addElement(t, ec);
   1.510 +        if (U_FAILURE(ec)) {
   1.511 +            delete t;
   1.512 +        }
   1.513 +    }
   1.514 +
   1.515 + RETURN:
   1.516 +
   1.517 +    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
   1.518 +    list.removeAllElements();
   1.519 +
   1.520 +    if (U_SUCCESS(ec)) {
   1.521 +        list.setDeleter(_deleteTransliteratorTrIDPars);
   1.522 +
   1.523 +        while (tlist.size() > 0) {
   1.524 +            t = (Transliterator*) tlist.orphanElementAt(0);
   1.525 +            list.addElement(t, ec);
   1.526 +            if (U_FAILURE(ec)) {
   1.527 +                delete t;
   1.528 +                list.removeAllElements();
   1.529 +                break;
   1.530 +            }
   1.531 +        }
   1.532 +    }
   1.533 +
   1.534 +    list.setDeleter(save);
   1.535 +}
   1.536 +
   1.537 +/**
   1.538 + * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
   1.539 + * S-T/V, or S/V-T.  If the source is missing, return a source of
   1.540 + * ANY.
   1.541 + * @param id the id string, in any of several forms
   1.542 + * @return an array of 4 strings: source, target, variant, and
   1.543 + * isSourcePresent.  If the source is not present, ANY will be
   1.544 + * given as the source, and isSourcePresent will be NULL.  Otherwise
   1.545 + * isSourcePresent will be non-NULL.  The target may be empty if the
   1.546 + * id is not well-formed.  The variant may be empty.
   1.547 + */
   1.548 +void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
   1.549 +                                     UnicodeString& source,
   1.550 +                                     UnicodeString& target,
   1.551 +                                     UnicodeString& variant,
   1.552 +                                     UBool& isSourcePresent) {
   1.553 +    source.setTo(ANY, 3);
   1.554 +    target.truncate(0);
   1.555 +    variant.truncate(0);
   1.556 +
   1.557 +    int32_t sep = id.indexOf(TARGET_SEP);
   1.558 +    int32_t var = id.indexOf(VARIANT_SEP);
   1.559 +    if (var < 0) {
   1.560 +        var = id.length();
   1.561 +    }
   1.562 +    isSourcePresent = FALSE;
   1.563 +
   1.564 +    if (sep < 0) {
   1.565 +        // Form: T/V or T (or /V)
   1.566 +        id.extractBetween(0, var, target);
   1.567 +        id.extractBetween(var, id.length(), variant);
   1.568 +    } else if (sep < var) {
   1.569 +        // Form: S-T/V or S-T (or -T/V or -T)
   1.570 +        if (sep > 0) {
   1.571 +            id.extractBetween(0, sep, source);
   1.572 +            isSourcePresent = TRUE;
   1.573 +        }
   1.574 +        id.extractBetween(++sep, var, target);
   1.575 +        id.extractBetween(var, id.length(), variant);
   1.576 +    } else {
   1.577 +        // Form: (S/V-T or /V-T)
   1.578 +        if (var > 0) {
   1.579 +            id.extractBetween(0, var, source);
   1.580 +            isSourcePresent = TRUE;
   1.581 +        }
   1.582 +        id.extractBetween(var, sep++, variant);
   1.583 +        id.extractBetween(sep, id.length(), target);
   1.584 +    }
   1.585 +
   1.586 +    if (variant.length() > 0) {
   1.587 +        variant.remove(0, 1);
   1.588 +    }
   1.589 +}
   1.590 +
   1.591 +/**
   1.592 + * Given source, target, and variant strings, concatenate them into a
   1.593 + * full ID.  If the source is empty, then "Any" will be used for the
   1.594 + * source, so the ID will always be of the form s-t/v or s-t.
   1.595 + */
   1.596 +void TransliteratorIDParser::STVtoID(const UnicodeString& source,
   1.597 +                                     const UnicodeString& target,
   1.598 +                                     const UnicodeString& variant,
   1.599 +                                     UnicodeString& id) {
   1.600 +    id = source;
   1.601 +    if (id.length() == 0) {
   1.602 +        id.setTo(ANY, 3);
   1.603 +    }
   1.604 +    id.append(TARGET_SEP).append(target);
   1.605 +    if (variant.length() != 0) {
   1.606 +        id.append(VARIANT_SEP).append(variant);
   1.607 +    }
   1.608 +    // NUL-terminate the ID string for getTerminatedBuffer.
   1.609 +    // This prevents valgrind and Purify warnings.
   1.610 +    id.append((UChar)0);
   1.611 +    id.truncate(id.length()-1);
   1.612 +}
   1.613 +
   1.614 +/**
   1.615 + * Register two targets as being inverses of one another.  For
   1.616 + * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
   1.617 + * Transliterator to form the following inverse relationships:
   1.618 + *
   1.619 + * <pre>NFC => NFD
   1.620 + * Any-NFC => Any-NFD
   1.621 + * NFD => NFC
   1.622 + * Any-NFD => Any-NFC</pre>
   1.623 + *
   1.624 + * (Without the special inverse registration, the inverse of NFC
   1.625 + * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
   1.626 + * that the presence or absence of "Any-" is preserved.
   1.627 + *
   1.628 + * <p>The relationship is symmetrical; registering (a, b) is
   1.629 + * equivalent to registering (b, a).
   1.630 + *
   1.631 + * <p>The relevant IDs must still be registered separately as
   1.632 + * factories or classes.
   1.633 + *
   1.634 + * <p>Only the targets are specified.  Special inverses always
   1.635 + * have the form Any-Target1 <=> Any-Target2.  The target should
   1.636 + * have canonical casing (the casing desired to be produced when
   1.637 + * an inverse is formed) and should contain no whitespace or other
   1.638 + * extraneous characters.
   1.639 + *
   1.640 + * @param target the target against which to register the inverse
   1.641 + * @param inverseTarget the inverse of target, that is
   1.642 + * Any-target.getInverse() => Any-inverseTarget
   1.643 + * @param bidirectional if TRUE, register the reverse relation
   1.644 + * as well, that is, Any-inverseTarget.getInverse() => Any-target
   1.645 + */
   1.646 +void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
   1.647 +                                                    const UnicodeString& inverseTarget,
   1.648 +                                                    UBool bidirectional,
   1.649 +                                                    UErrorCode &status) {
   1.650 +    init(status);
   1.651 +    if (U_FAILURE(status)) {
   1.652 +        return;
   1.653 +    }
   1.654 +
   1.655 +    // If target == inverseTarget then force bidirectional => FALSE
   1.656 +    if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
   1.657 +        bidirectional = FALSE;
   1.658 +    }
   1.659 +
   1.660 +    Mutex lock(&LOCK);
   1.661 +
   1.662 +    UnicodeString *tempus = new UnicodeString(inverseTarget);  // Used for null pointer check before usage.
   1.663 +    if (tempus == NULL) {
   1.664 +    	status = U_MEMORY_ALLOCATION_ERROR;
   1.665 +    	return;
   1.666 +    }
   1.667 +    SPECIAL_INVERSES->put(target, tempus, status);
   1.668 +    if (bidirectional) {
   1.669 +    	tempus = new UnicodeString(target);
   1.670 +    	if (tempus == NULL) {
   1.671 +    		status = U_MEMORY_ALLOCATION_ERROR;
   1.672 +    		return;
   1.673 +    	}
   1.674 +        SPECIAL_INVERSES->put(inverseTarget, tempus, status);
   1.675 +    }
   1.676 +}
   1.677 +
   1.678 +//----------------------------------------------------------------
   1.679 +// Private implementation
   1.680 +//----------------------------------------------------------------
   1.681 +
   1.682 +/**
   1.683 + * Parse an ID into component pieces.  Take IDs of the form T,
   1.684 + * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
   1.685 + * source of ANY.
   1.686 + * @param id the id string, in any of several forms
   1.687 + * @param pos INPUT-OUTPUT parameter.  On input, pos is the
   1.688 + * offset of the first character to parse in id.  On output,
   1.689 + * pos is the offset after the last parsed character.  If the
   1.690 + * parse failed, pos will be unchanged.
 * @param allowFilter if TRUE, a UnicodeSet pattern is allowed
 * at any location between specs or delimiters, and is returned
 * in the filter field of the resulting Specs object.
   1.694 + * @return a Specs object, or NULL if the parse failed.  If
   1.695 + * neither source nor target was seen in the parsed id, then the
   1.696 + * parse fails.  If allowFilter is TRUE, then the parsed filter
   1.697 + * pattern is returned in the Specs object, otherwise the returned
   1.698 + * filter reference is NULL.  If the parse fails for any reason
   1.699 + * NULL is returned.
   1.700 + */
   1.701 +TransliteratorIDParser::Specs*
   1.702 +TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
   1.703 +                                      UBool allowFilter) {
   1.704 +    UnicodeString first;
   1.705 +    UnicodeString source;
   1.706 +    UnicodeString target;
   1.707 +    UnicodeString variant;
   1.708 +    UnicodeString filter;
   1.709 +    UChar delimiter = 0;
   1.710 +    int32_t specCount = 0;
   1.711 +    int32_t start = pos;
   1.712 +
   1.713 +    // This loop parses one of the following things with each
   1.714 +    // pass: a filter, a delimiter character (either '-' or '/'),
   1.715 +    // or a spec (source, target, or variant).
   1.716 +    for (;;) {
   1.717 +        ICU_Utility::skipWhitespace(id, pos, TRUE);
   1.718 +        if (pos == id.length()) {
   1.719 +            break;
   1.720 +        }
   1.721 +
   1.722 +        // Parse filters
   1.723 +        if (allowFilter && filter.length() == 0 &&
   1.724 +            UnicodeSet::resemblesPattern(id, pos)) {
   1.725 +
   1.726 +            ParsePosition ppos(pos);
   1.727 +            UErrorCode ec = U_ZERO_ERROR;
   1.728 +            UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec);
   1.729 +            if (U_FAILURE(ec)) {
   1.730 +                pos = start;
   1.731 +                return NULL;
   1.732 +            }
   1.733 +            id.extractBetween(pos, ppos.getIndex(), filter);
   1.734 +            pos = ppos.getIndex();
   1.735 +            continue;
   1.736 +        }
   1.737 +
   1.738 +        if (delimiter == 0) {
   1.739 +            UChar c = id.charAt(pos);
   1.740 +            if ((c == TARGET_SEP && target.length() == 0) ||
   1.741 +                (c == VARIANT_SEP && variant.length() == 0)) {
   1.742 +                delimiter = c;
   1.743 +                ++pos;
   1.744 +                continue;
   1.745 +            }
   1.746 +        }
   1.747 +
   1.748 +        // We are about to try to parse a spec with no delimiter
   1.749 +        // when we can no longer do so (we can only do so at the
   1.750 +        // start); break.
   1.751 +        if (delimiter == 0 && specCount > 0) {
   1.752 +            break;
   1.753 +        }
   1.754 +
   1.755 +        UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
   1.756 +        if (spec.length() == 0) {
   1.757 +            // Note that if there was a trailing delimiter, we
   1.758 +            // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
   1.759 +            // are legal.
   1.760 +            break;
   1.761 +        }
   1.762 +
   1.763 +        switch (delimiter) {
   1.764 +        case 0:
   1.765 +            first = spec;
   1.766 +            break;
   1.767 +        case TARGET_SEP:
   1.768 +            target = spec;
   1.769 +            break;
   1.770 +        case VARIANT_SEP:
   1.771 +            variant = spec;
   1.772 +            break;
   1.773 +        }
   1.774 +        ++specCount;
   1.775 +        delimiter = 0;
   1.776 +    }
   1.777 +
   1.778 +    // A spec with no prior character is either source or target,
   1.779 +    // depending on whether an explicit "-target" was seen.
   1.780 +    if (first.length() != 0) {
   1.781 +        if (target.length() == 0) {
   1.782 +            target = first;
   1.783 +        } else {
   1.784 +            source = first;
   1.785 +        }
   1.786 +    }
   1.787 +
   1.788 +    // Must have either source or target
   1.789 +    if (source.length() == 0 && target.length() == 0) {
   1.790 +        pos = start;
   1.791 +        return NULL;
   1.792 +    }
   1.793 +
   1.794 +    // Empty source or target defaults to ANY
   1.795 +    UBool sawSource = TRUE;
   1.796 +    if (source.length() == 0) {
   1.797 +        source.setTo(ANY, 3);
   1.798 +        sawSource = FALSE;
   1.799 +    }
   1.800 +    if (target.length() == 0) {
   1.801 +        target.setTo(ANY, 3);
   1.802 +    }
   1.803 +
   1.804 +    return new Specs(source, target, variant, sawSource, filter);
   1.805 +}
   1.806 +
   1.807 +/**
   1.808 + * Givens a Spec object, convert it to a SingleID object.  The
   1.809 + * Spec object is a more unprocessed parse result.  The SingleID
   1.810 + * object contains information about canonical and basic IDs.
   1.811 + * @return a SingleID; never returns NULL.  Returned object always
   1.812 + * has 'filter' field of NULL.
   1.813 + */
   1.814 +TransliteratorIDParser::SingleID*
   1.815 +TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
   1.816 +    UnicodeString canonID;
   1.817 +    UnicodeString basicID;
   1.818 +    UnicodeString basicPrefix;
   1.819 +    if (specs != NULL) {
   1.820 +        UnicodeString buf;
   1.821 +        if (dir == FORWARD) {
   1.822 +            if (specs->sawSource) {
   1.823 +                buf.append(specs->source).append(TARGET_SEP);
   1.824 +            } else {
   1.825 +                basicPrefix = specs->source;
   1.826 +                basicPrefix.append(TARGET_SEP);
   1.827 +            }
   1.828 +            buf.append(specs->target);
   1.829 +        } else {
   1.830 +            buf.append(specs->target).append(TARGET_SEP).append(specs->source);
   1.831 +        }
   1.832 +        if (specs->variant.length() != 0) {
   1.833 +            buf.append(VARIANT_SEP).append(specs->variant);
   1.834 +        }
   1.835 +        basicID = basicPrefix;
   1.836 +        basicID.append(buf);
   1.837 +        if (specs->filter.length() != 0) {
   1.838 +            buf.insert(0, specs->filter);
   1.839 +        }
   1.840 +        canonID = buf;
   1.841 +    }
   1.842 +    return new SingleID(canonID, basicID);
   1.843 +}
   1.844 +
   1.845 +/**
   1.846 + * Given a Specs object, return a SingleID representing the
   1.847 + * special inverse of that ID.  If there is no special inverse
   1.848 + * then return NULL.
   1.849 + * @return a SingleID or NULL.  Returned object always has
   1.850 + * 'filter' field of NULL.
   1.851 + */
   1.852 +TransliteratorIDParser::SingleID*
   1.853 +TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
   1.854 +    if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) {
   1.855 +        return NULL;
   1.856 +    }
   1.857 +    init(status);
   1.858 +
   1.859 +    UnicodeString* inverseTarget;
   1.860 +
   1.861 +    umtx_lock(&LOCK);
   1.862 +    inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
   1.863 +    umtx_unlock(&LOCK);
   1.864 +
   1.865 +    if (inverseTarget != NULL) {
   1.866 +        // If the original ID contained "Any-" then make the
   1.867 +        // special inverse "Any-Foo"; otherwise make it "Foo".
   1.868 +        // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
   1.869 +        UnicodeString buf;
   1.870 +        if (specs.filter.length() != 0) {
   1.871 +            buf.append(specs.filter);
   1.872 +        }
   1.873 +        if (specs.sawSource) {
   1.874 +            buf.append(ANY, 3).append(TARGET_SEP);
   1.875 +        }
   1.876 +        buf.append(*inverseTarget);
   1.877 +
   1.878 +        UnicodeString basicID(TRUE, ANY, 3);
   1.879 +        basicID.append(TARGET_SEP).append(*inverseTarget);
   1.880 +
   1.881 +        if (specs.variant.length() != 0) {
   1.882 +            buf.append(VARIANT_SEP).append(specs.variant);
   1.883 +            basicID.append(VARIANT_SEP).append(specs.variant);
   1.884 +        }
   1.885 +        return new SingleID(buf, basicID);
   1.886 +    }
   1.887 +    return NULL;
   1.888 +}
   1.889 +
   1.890 +/**
   1.891 + * Glue method to get around access problems in C++.  This would
   1.892 + * ideally be inline but we want to avoid a circular header
   1.893 + * dependency.
   1.894 + */
   1.895 +Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
   1.896 +    return Transliterator::createBasicInstance(id, canonID);
   1.897 +}
   1.898 +
   1.899 +/**
   1.900 + * Initialize static memory.
   1.901 + */
   1.902 +void TransliteratorIDParser::init(UErrorCode &status) {
   1.903 +    if (SPECIAL_INVERSES != NULL) {
   1.904 +        return;
   1.905 +    }
   1.906 +
   1.907 +    Hashtable* special_inverses = new Hashtable(TRUE, status);
   1.908 +    // Null pointer check
   1.909 +    if (special_inverses == NULL) {
   1.910 +    	status = U_MEMORY_ALLOCATION_ERROR;
   1.911 +    	return;
   1.912 +    }
   1.913 +    special_inverses->setValueDeleter(uprv_deleteUObject);
   1.914 +
   1.915 +    umtx_lock(&LOCK);
   1.916 +    if (SPECIAL_INVERSES == NULL) {
   1.917 +        SPECIAL_INVERSES = special_inverses;
   1.918 +        special_inverses = NULL;
   1.919 +    }
   1.920 +    umtx_unlock(&LOCK);
   1.921 +    delete special_inverses; /*null instance*/
   1.922 +
   1.923 +    ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
   1.924 +}
   1.925 +
   1.926 +/**
   1.927 + * Free static memory.
   1.928 + */
   1.929 +void TransliteratorIDParser::cleanup() {
   1.930 +    if (SPECIAL_INVERSES) {
   1.931 +        delete SPECIAL_INVERSES;
   1.932 +        SPECIAL_INVERSES = NULL;
   1.933 +    }
   1.934 +}
   1.935 +
   1.936 +U_NAMESPACE_END
   1.937 +
   1.938 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.939 +
   1.940 +//eof

mercurial