The Tor Browser: diff intl/icu/source/common/uniset

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/uniset_props.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1302 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1999-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  uniset_props.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2004aug25
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   Character property dependent functions moved here from uniset.cpp
    1.20 +*/
    1.21 +
    1.22 +#include "unicode/utypes.h"
    1.23 +#include "unicode/uniset.h"
    1.24 +#include "unicode/parsepos.h"
    1.25 +#include "unicode/uchar.h"
    1.26 +#include "unicode/uscript.h"
    1.27 +#include "unicode/symtable.h"
    1.28 +#include "unicode/uset.h"
    1.29 +#include "unicode/locid.h"
    1.30 +#include "unicode/brkiter.h"
    1.31 +#include "uset_imp.h"
    1.32 +#include "ruleiter.h"
    1.33 +#include "cmemory.h"
    1.34 +#include "ucln_cmn.h"
    1.35 +#include "util.h"
    1.36 +#include "uvector.h"
    1.37 +#include "uprops.h"
    1.38 +#include "propname.h"
    1.39 +#include "normalizer2impl.h"
    1.40 +#include "ucase.h"
    1.41 +#include "ubidi_props.h"
    1.42 +#include "uinvchar.h"
    1.43 +#include "uprops.h"
    1.44 +#include "charstr.h"
    1.45 +#include "cstring.h"
    1.46 +#include "mutex.h"
    1.47 +#include "umutex.h"
    1.48 +#include "uassert.h"
    1.49 +#include "hash.h"
    1.50 +
    1.51 +U_NAMESPACE_USE
    1.52 +
    1.53 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.54 +
    1.55 +// Initial capacity, in UChar32 elements, of the `list` array that the
    1.56 +// constructor in this file allocates. Must be >= 0. *** same as in uniset.cpp ! ***
    1.57 +#define START_EXTRA 16
    1.58 +
    1.59 +// Define UChar constants as numeric code points (not character literals) for EBCDIC compatibility.
    1.60 +// Used #define to reduce private static exports and memory access time.
    1.61 +#define SET_OPEN        ((UChar)0x005B) /*[*/
    1.62 +#define SET_CLOSE       ((UChar)0x005D) /*]*/
    1.63 +#define HYPHEN          ((UChar)0x002D) /*-*/
    1.64 +#define COMPLEMENT      ((UChar)0x005E) /*^*/
    1.65 +#define COLON           ((UChar)0x003A) /*:*/
    1.66 +#define BACKSLASH       ((UChar)0x005C) /*\*/
    1.67 +#define INTERSECTION    ((UChar)0x0026) /*&*/
    1.68 +#define UPPER_U         ((UChar)0x0055) /*U*/
    1.69 +#define LOWER_U         ((UChar)0x0075) /*u*/
    1.70 +#define OPEN_BRACE      ((UChar)123)    /*{*/
    1.71 +#define CLOSE_BRACE     ((UChar)125)    /*}*/
    1.72 +#define UPPER_P         ((UChar)0x0050) /*P*/
    1.73 +#define LOWER_P         ((UChar)0x0070) /*p*/
    1.74 +#define UPPER_N         ((UChar)78)     /*N*/
    1.75 +#define EQUALS          ((UChar)0x003D) /*=*/
    1.76 +
    1.77 +//static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
    1.78 +static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
    1.79 +//static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
    1.80 +//static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
    1.81 +//static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
    1.82 +static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
    1.83 +
    1.84 +// Special property set IDs, each with the set pattern it stands for
    1.85 +static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
    1.86 +static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
    1.87 +static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
    1.88 +
    1.89 +// Unicode name property alias and its length in chars (keep the two in sync)
    1.90 +#define NAME_PROP "na"
    1.91 +#define NAME_PROP_LENGTH 2
    1.92 +
    1.93 +/**
    1.94 + * Delimiter string used in patterns to close a category reference:
    1.95 + * ":]".  Example: "[:Lu:]".
    1.96 + */
    1.97 +//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
    1.98 +
    1.99 +// Cached sets ------------------------------------------------------------- ***
   1.100 +
   1.101 +U_CDECL_BEGIN
   1.102 +static UBool U_CALLCONV uset_cleanup(); // forward declaration; defined further down in this file
   1.103 +
   1.104 +struct Inclusion { // one cache slot: a lazily built set plus the guard used to build it
   1.105 +    UnicodeSet  *fSet; // NULL until UnicodeSet_initInclusion() succeeds for this slot
   1.106 +    UInitOnce    fInitOnce; // passed to umtx_initOnce() by getInclusions()
   1.107 +};
   1.108 +static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions(), indexed by property source
   1.109 +
   1.110 +static UnicodeSet *uni32Singleton; // built by createUni32Set(), freed by uset_cleanup()
   1.111 +static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; // guard for uni32Singleton
   1.112 +
   1.113 +//----------------------------------------------------------------
   1.114 +// Inclusions list
   1.115 +//----------------------------------------------------------------
   1.116 +
   1.117 +// USetAdder implementation
   1.118 +// Does not use uset.h to reduce code dependencies
   1.119 +static void U_CALLCONV _set_add(USet *set, UChar32 c) {  // USetAdder "add" callback
   1.120 +    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
   1.121 +    target->add(c);
   1.122 +}
   1.123 +
   1.124 +static void U_CALLCONV _set_addRange(USet *set, UChar32 start, UChar32 end) {  // USetAdder "addRange" callback
   1.125 +    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
   1.126 +    target->add(start, end);
   1.127 +}
   1.128 +
   1.129 +static void U_CALLCONV _set_addString(USet *set, const UChar *str, int32_t length) {  // USetAdder "addString" callback
   1.130 +    const UBool isTerminated = (UBool)(length<0);  // a negative length is passed on as "terminated"
   1.131 +    reinterpret_cast<UnicodeSet *>(set)->add(UnicodeString(isTerminated, str, length));
   1.132 +}
   1.133 +
   1.134 +/**
   1.135 + * Cleanup hook for this file: frees every cached set and resets its init-once guard.
   1.136 + */
   1.137 +static UBool U_CALLCONV uset_cleanup(void) {
   1.138 +    int32_t src = UPROPS_SRC_NONE;
   1.139 +    while(src < UPROPS_SRC_COUNT) {
   1.140 +        Inclusion &cached = gInclusions[src++];
   1.141 +        delete cached.fSet;
   1.142 +        cached.fSet = NULL;
   1.143 +        cached.fInitOnce.reset();
   1.144 +    }
   1.145 +    delete uni32Singleton;  // the "[:age=3.2:]" singleton is released the same way
   1.146 +    uni32Singleton = NULL;
   1.147 +    uni32InitOnce.reset();
   1.148 +    return TRUE;
   1.149 +}
   1.150 +
   1.151 +U_CDECL_END
   1.152 +
   1.153 +U_NAMESPACE_BEGIN
   1.154 +
   1.155 +/*
   1.156 +Reduce excessive reallocation, and make it easier to detect initialization problems.
   1.157 +Usually you don't see smaller sets than this for Unicode 5.0.
   1.158 +*/
   1.159 +#define DEFAULT_INCLUSION_CAPACITY 3072
   1.160 +
   1.161 +void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
   1.162 +    // Builds the cached set gInclusions[src].fSet for one property source.
   1.163 +    // Invoked only via umtx_initOnce(); this function is a friend of class UnicodeSet.
   1.164 +
   1.165 +    U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
   1.166 +    UnicodeSet * &incl = gInclusions[src].fSet; // reference to the cache slot: assignments below update gInclusions
   1.167 +    U_ASSERT(incl == NULL);
   1.168 +
   1.169 +    incl = new UnicodeSet();
   1.170 +    if (incl == NULL) {
   1.171 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.172 +        return;
   1.173 +    }
   1.174 +    USetAdder sa = { // callback table that routes additions into incl
   1.175 +        (USet *)incl,
   1.176 +        _set_add,
   1.177 +        _set_addRange,
   1.178 +        _set_addString,
   1.179 +        NULL, // don't need remove()
   1.180 +        NULL // don't need removeRange()
   1.181 +    };
   1.182 +
   1.183 +    incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); // pre-size before the property starts are added
   1.184 +    switch(src) { // each case adds the property starts of the data behind that source
   1.185 +    case UPROPS_SRC_CHAR:
   1.186 +        uchar_addPropertyStarts(&sa, &status);
   1.187 +        break;
   1.188 +    case UPROPS_SRC_PROPSVEC:
   1.189 +        upropsvec_addPropertyStarts(&sa, &status);
   1.190 +        break;
   1.191 +    case UPROPS_SRC_CHAR_AND_PROPSVEC: // union of the two sources above
   1.192 +        uchar_addPropertyStarts(&sa, &status);
   1.193 +        upropsvec_addPropertyStarts(&sa, &status);
   1.194 +        break;
   1.195 +#if !UCONFIG_NO_NORMALIZATION
   1.196 +    case UPROPS_SRC_CASE_AND_NORM: { // NFC starts plus case starts
   1.197 +        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
   1.198 +        if(U_SUCCESS(status)) {
   1.199 +            impl->addPropertyStarts(&sa, status);
   1.200 +        }
   1.201 +        ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); // called even if the NFC step failed; status carries that error
   1.202 +        break;
   1.203 +    }
   1.204 +    case UPROPS_SRC_NFC: {
   1.205 +        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
   1.206 +        if(U_SUCCESS(status)) {
   1.207 +            impl->addPropertyStarts(&sa, status);
   1.208 +        }
   1.209 +        break;
   1.210 +    }
   1.211 +    case UPROPS_SRC_NFKC: {
   1.212 +        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
   1.213 +        if(U_SUCCESS(status)) {
   1.214 +            impl->addPropertyStarts(&sa, status);
   1.215 +        }
   1.216 +        break;
   1.217 +    }
   1.218 +    case UPROPS_SRC_NFKC_CF: {
   1.219 +        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
   1.220 +        if(U_SUCCESS(status)) {
   1.221 +            impl->addPropertyStarts(&sa, status);
   1.222 +        }
   1.223 +        break;
   1.224 +    }
   1.225 +    case UPROPS_SRC_NFC_CANON_ITER: { // same NFC impl as above, but its canonical-iterator starts
   1.226 +        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
   1.227 +        if(U_SUCCESS(status)) {
   1.228 +            impl->addCanonIterPropertyStarts(&sa, status);
   1.229 +        }
   1.230 +        break;
   1.231 +    }
   1.232 +#endif
   1.233 +    case UPROPS_SRC_CASE:
   1.234 +        ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
   1.235 +        break;
   1.236 +    case UPROPS_SRC_BIDI:
   1.237 +        ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
   1.238 +        break;
   1.239 +    default: // any source value not handled above (including the normalization ones when compiled out)
   1.240 +        status = U_INTERNAL_PROGRAM_ERROR;
   1.241 +        break;
   1.242 +    }
   1.243 +
   1.244 +    if (U_FAILURE(status)) { // discard the partial set; the cache slot goes back to NULL
   1.245 +        delete incl;
   1.246 +        incl = NULL;
   1.247 +        return;
   1.248 +    }
   1.249 +    // Compact for caching
   1.250 +    incl->compact();
   1.251 +    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); // reached only on success
   1.252 +}
   1.253 +
   1.254 +
   1.255 +
   1.256 +const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
   1.257 +    U_ASSERT(0<=src && src<UPROPS_SRC_COUNT);
   1.258 +    Inclusion &slot = gInclusions[src];
   1.259 +    umtx_initOnce(slot.fInitOnce, &UnicodeSet_initInclusion, src, status);
   1.260 +    return slot.fSet;  // stays NULL when UnicodeSet_initInclusion() failed
   1.261 +}
   1.262 +
   1.263 +
   1.264 +// Cache some sets for other services -------------------------------------- ***
   1.265 +void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
   1.266 +    U_ASSERT(uni32Singleton == NULL);
   1.267 +    uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
   1.268 +    if(uni32Singleton!=NULL) {
   1.269 +        uni32Singleton->freeze();  // freeze the shared instance right after building it
   1.270 +    } else {
   1.271 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.272 +    }
   1.273 +    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
   1.274 +}
   1.275 +
   1.276 +
   1.277 +U_CFUNC UnicodeSet *uniset_getUnicode32Instance(UErrorCode &errorCode) {
   1.278 +    // Creation is routed through umtx_initOnce() with the uni32InitOnce guard.
   1.279 +    umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
   1.280 +    return uni32Singleton;
   1.281 +}
   1.282 +
   1.283 +// helper functions for matching of pattern syntax pieces ------------------ ***
   1.284 +// these functions are parallel to the PERL_OPEN etc. strings above
   1.285 +
   1.286 +// using these functions is not only faster than UnicodeString::compare() and
   1.287 +// caseCompare(), but they also make UnicodeSet work for simple patterns when
   1.288 +// no Unicode properties data is available - when caseCompare() fails
   1.289 +
   1.290 +static inline UBool isPerlOpen(const UnicodeString &pattern, int32_t pos) {  // backslash + 'p' or 'P' at pos
   1.291 +    if (pattern.charAt(pos)!=BACKSLASH) { return FALSE; }
   1.292 +    const UChar next = pattern.charAt(pos+1);
   1.293 +    return next==LOWER_P || next==UPPER_P;
   1.294 +}
   1.295 +
   1.296 +/*static inline UBool
   1.297 +isPerlClose(const UnicodeString &pattern, int32_t pos) {
   1.298 +    return pattern.charAt(pos)==CLOSE_BRACE;
   1.299 +}*/
   1.300 +
   1.301 +static inline UBool isNameOpen(const UnicodeString &pattern, int32_t pos) {  // backslash + 'N' at pos
   1.302 +    if (pattern.charAt(pos)!=BACKSLASH) { return FALSE; }
   1.303 +    return pattern.charAt(pos+1)==UPPER_N;
   1.304 +}
   1.305 +
   1.306 +static inline UBool isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {  // "[:" at pos
   1.307 +    if (pattern.charAt(pos)!=SET_OPEN) { return FALSE; }
   1.308 +    return pattern.charAt(pos+1)==COLON;
   1.309 +}
   1.310 +
   1.311 +/*static inline UBool
   1.312 +isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
   1.313 +    return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
   1.314 +}*/
   1.315 +
   1.316 +// TODO memory debugging provided inside uniset.cpp
   1.317 +// could be made available here but probably obsolete with use of modern
   1.318 +// memory leak checker tools
   1.319 +#define _dbgct(me)
   1.320 +
   1.321 +//----------------------------------------------------------------
   1.322 +// Constructors &c
   1.323 +//----------------------------------------------------------------
   1.324 +
   1.325 +/**
   1.326 + * Constructs a set from the given pattern by calling applyPattern(pattern, status).
   1.327 + * See the class description for the syntax of the pattern language.
   1.328 + * @param pattern a string specifying what characters are in the set
   1.329 + * @param status  in/out error code; no allocation or parsing happens if it is already a failure
   1.330 + */
   1.331 +UnicodeSet::UnicodeSet(const UnicodeString& pattern, UErrorCode& status)
   1.332 +    : len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
   1.333 +      bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
   1.334 +      fFlags(0)
   1.335 +{
   1.336 +    if(U_SUCCESS(status)) {
   1.337 +        // Allocate the initial list storage (START_EXTRA elements).
   1.338 +        list = static_cast<UChar32*>(uprv_malloc(sizeof(UChar32) * capacity));
   1.339 +        if(list != NULL) {
   1.340 +            // Storage is in place: call allocateStrings(), then parse the pattern.
   1.341 +            allocateStrings(status);
   1.342 +            applyPattern(pattern, status);
   1.343 +        } else {
   1.344 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.345 +        }
   1.346 +    }
   1.347 +    _dbgct(this);
   1.348 +}
   1.349 +
   1.350 +//----------------------------------------------------------------
   1.351 +// Public API
   1.352 +//----------------------------------------------------------------
   1.353 +
   1.354 +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
   1.355 +                                     UErrorCode& status) {
   1.356 +    // Same effect as
   1.357 +    //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
   1.358 +    // but written out here so that there is no dependency on closeOver().
   1.359 +    ParsePosition parsePos(0);
   1.360 +    applyPatternIgnoreSpace(pattern, parsePos, NULL, status);
   1.361 +    if (U_SUCCESS(status)) {
   1.362 +        // Only trailing white space may follow the parsed pattern.
   1.363 +        int32_t end = parsePos.getIndex();
   1.364 +        ICU_Utility::skipWhitespace(pattern, end, TRUE);
   1.365 +        if (end != pattern.length()) {
   1.366 +            status = U_ILLEGAL_ARGUMENT_ERROR;
   1.367 +        }
   1.368 +    }
   1.369 +    return *this;
   1.370 +}
   1.371 +
   1.372 +void
   1.373 +UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
   1.374 +                                    ParsePosition& pos,
   1.375 +                                    const SymbolTable* symbols,
   1.376 +                                    UErrorCode& status) {
   1.377 +    // An incoming failure short-circuits everything, including the frozen check.
   1.378 +    if (U_FAILURE(status)) return;
   1.379 +    if (isFrozen()) {
   1.380 +        status = U_NO_WRITE_PERMISSION;
   1.381 +        return;
   1.382 +    }
   1.383 +    // The pattern is rebuilt into a temporary string because _applyPattern
   1.384 +    // calls add() etc., which set pat to empty; it is installed only at the end.
   1.385 +    UnicodeString canonical;
   1.386 +    RuleCharacterIterator iter(pattern, symbols, pos);
   1.387 +    applyPattern(iter, symbols, canonical, USET_IGNORE_SPACE, NULL, status);
   1.388 +    if (U_SUCCESS(status)) {
   1.389 +        if (iter.inVariable()) {
   1.390 +            // syntaxError(chars, "Extra chars in variable value");
   1.391 +            status = U_MALFORMED_SET;
   1.392 +        } else {
   1.393 +            setPattern(canonical);
   1.394 +        }
   1.395 +    }
   1.396 +}
   1.397 +
   1.398 +/**
   1.399 + * Returns true if position pos in pattern looks like the start of a UnicodeSet
   1.400 + * pattern: a '[' with at least one character after it, or a property pattern.
   1.401 + */
   1.402 +UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
   1.403 +    const UBool opensSet = (pos+1) < pattern.length() && pattern.charAt(pos) == SET_OPEN;
   1.404 +    // resemblesPropertyPattern() is consulted only when no plain '[' was found.
   1.405 +    return opensSet || resemblesPropertyPattern(pattern, pos);
   1.406 +}
   1.407 +
   1.408 +//----------------------------------------------------------------
   1.409 +// Implementation: Pattern parsing
   1.410 +//----------------------------------------------------------------
   1.411 +
   1.412 +/**
   1.413 + * Minimal all-inline owner of a lazily created UnicodeSet; the set, if any,
   1.414 + * is deleted by the destructor.  Add operator->() etc. as needed.
   1.415 + */
   1.416 +class UnicodeSetPointer {
   1.417 +    UnicodeSet* fSet;
   1.418 +public:
   1.419 +    inline UnicodeSetPointer() : fSet(NULL) {}
   1.420 +    inline ~UnicodeSetPointer() { delete fSet; }
   1.421 +    inline UnicodeSet* pointer() { return fSet; }
   1.422 +    // Creates the set on first use; the result is FALSE only if no set exists afterwards.
   1.423 +    inline UBool allocate() {
   1.424 +        if (fSet != NULL) return TRUE;
   1.425 +        fSet = new UnicodeSet();
   1.426 +        return fSet != NULL;
   1.427 +    }
   1.428 +};
   1.429 +
   1.430 +/**
   1.431 + * Parse the pattern from the given RuleCharacterIterator.  The
   1.432 + * iterator is advanced over the parsed pattern.
   1.433 + * @param chars iterator over the pattern characters.  Upon return
   1.434 + * it will be advanced to the first character after the parsed
   1.435 + * pattern, or the end of the iteration if all characters are
   1.436 + * parsed.
   1.437 + * @param symbols symbol table to use to parse and dereference
   1.438 + * variables, or null if none.
   1.439 + * @param rebuiltPat the pattern that was parsed, rebuilt or
   1.440 + * copied from the input pattern, as appropriate.
   1.441 + * @param options a bit mask of zero or more of the following:
   1.442 + * IGNORE_SPACE, CASE.
   1.443 + */
   1.444 +void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
   1.445 +                              const SymbolTable* symbols,
   1.446 +                              UnicodeString& rebuiltPat,
   1.447 +                              uint32_t options,
   1.448 +                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
   1.449 +                              UErrorCode& ec) {
   1.450 +    if (U_FAILURE(ec)) return;
   1.451 +
   1.452 +    // Syntax characters: [ ] ^ - & { }
   1.453 +
   1.454 +    // Recognized special forms for chars, sets: c-c s-s s&s
   1.455 +
   1.456 +    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
   1.457 +                   RuleCharacterIterator::PARSE_ESCAPES;
   1.458 +    if ((options & USET_IGNORE_SPACE) != 0) {
   1.459 +        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
   1.460 +    }
   1.461 +
   1.462 +    UnicodeString patLocal, buf;
   1.463 +    UBool usePat = FALSE;
   1.464 +    UnicodeSetPointer scratch;
   1.465 +    RuleCharacterIterator::Pos backup;
   1.466 +
   1.467 +    // mode: 0=before [, 1=between [...], 2=after ]
   1.468 +    // lastItem: 0=none, 1=char, 2=set
   1.469 +    int8_t lastItem = 0, mode = 0;
   1.470 +    UChar32 lastChar = 0;
   1.471 +    UChar op = 0;
   1.472 +
   1.473 +    UBool invert = FALSE;
   1.474 +
   1.475 +    clear();
   1.476 +
   1.477 +    while (mode != 2 && !chars.atEnd()) {
   1.478 +        U_ASSERT((lastItem == 0 && op == 0) ||
   1.479 +                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
   1.480 +                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
   1.481 +                                    op == INTERSECTION /*'&'*/)));
   1.482 +
   1.483 +        UChar32 c = 0;
   1.484 +        UBool literal = FALSE;
   1.485 +        UnicodeSet* nested = 0; // alias - do not delete
   1.486 +
   1.487 +        // -------- Check for property pattern
   1.488 +
   1.489 +        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
   1.490 +        int8_t setMode = 0;
   1.491 +        if (resemblesPropertyPattern(chars, opts)) {
   1.492 +            setMode = 2;
   1.493 +        }
   1.494 +
   1.495 +        // -------- Parse '[' of opening delimiter OR nested set.
   1.496 +        // If there is a nested set, use `setMode' to define how
   1.497 +        // the set should be parsed.  If the '[' is part of the
   1.498 +        // opening delimiter for this pattern, parse special
   1.499 +        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
   1.500 +        // characters representing a nested set in the symbol
   1.501 +        // table.
   1.502 +
   1.503 +        else {
   1.504 +            // Prepare to backup if necessary
   1.505 +            chars.getPos(backup);
   1.506 +            c = chars.next(opts, literal, ec);
   1.507 +            if (U_FAILURE(ec)) return;
   1.508 +
   1.509 +            if (c == 0x5B /*'['*/ && !literal) {
   1.510 +                if (mode == 1) {
   1.511 +                    chars.setPos(backup); // backup
   1.512 +                    setMode = 1;
   1.513 +                } else {
   1.514 +                    // Handle opening '[' delimiter
   1.515 +                    mode = 1;
   1.516 +                    patLocal.append((UChar) 0x5B /*'['*/);
   1.517 +                    chars.getPos(backup); // prepare to backup
   1.518 +                    c = chars.next(opts, literal, ec); 
   1.519 +                    if (U_FAILURE(ec)) return;
   1.520 +                    if (c == 0x5E /*'^'*/ && !literal) {
   1.521 +                        invert = TRUE;
   1.522 +                        patLocal.append((UChar) 0x5E /*'^'*/);
   1.523 +                        chars.getPos(backup); // prepare to backup
   1.524 +                        c = chars.next(opts, literal, ec);
   1.525 +                        if (U_FAILURE(ec)) return;
   1.526 +                    }
   1.527 +                    // Fall through to handle special leading '-';
   1.528 +                    // otherwise restart loop for nested [], \p{}, etc.
   1.529 +                    if (c == HYPHEN /*'-'*/) {
   1.530 +                        literal = TRUE;
   1.531 +                        // Fall through to handle literal '-' below
   1.532 +                    } else {
   1.533 +                        chars.setPos(backup); // backup
   1.534 +                        continue;
   1.535 +                    }
   1.536 +                }
   1.537 +            } else if (symbols != 0) {
   1.538 +                const UnicodeFunctor *m = symbols->lookupMatcher(c);
   1.539 +                if (m != 0) {
   1.540 +                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
   1.541 +                    if (ms == NULL) {
   1.542 +                        ec = U_MALFORMED_SET;
   1.543 +                        return;
   1.544 +                    }
   1.545 +                    // casting away const, but `nested' won't be modified
   1.546 +                    // (important not to modify stored set)
   1.547 +                    nested = const_cast<UnicodeSet*>(ms);
   1.548 +                    setMode = 3;
   1.549 +                }
   1.550 +            }
   1.551 +        }
   1.552 +
   1.553 +        // -------- Handle a nested set.  This either is inline in
   1.554 +        // the pattern or represented by a stand-in that has
   1.555 +        // previously been parsed and was looked up in the symbol
   1.556 +        // table.
   1.557 +
   1.558 +        if (setMode != 0) {
   1.559 +            if (lastItem == 1) {
   1.560 +                if (op != 0) {
   1.561 +                    // syntaxError(chars, "Char expected after operator");
   1.562 +                    ec = U_MALFORMED_SET;
   1.563 +                    return;
   1.564 +                }
   1.565 +                add(lastChar, lastChar);
   1.566 +                _appendToPat(patLocal, lastChar, FALSE);
   1.567 +                lastItem = 0;
   1.568 +                op = 0;
   1.569 +            }
   1.570 +
   1.571 +            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
   1.572 +                patLocal.append(op);
   1.573 +            }
   1.574 +
   1.575 +            if (nested == 0) {
   1.576 +                // lazy allocation
   1.577 +                if (!scratch.allocate()) {
   1.578 +                    ec = U_MEMORY_ALLOCATION_ERROR;
   1.579 +                    return;
   1.580 +                }
   1.581 +                nested = scratch.pointer();
   1.582 +            }
   1.583 +            switch (setMode) {
   1.584 +            case 1:
   1.585 +                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
   1.586 +                break;
   1.587 +            case 2:
   1.588 +                chars.skipIgnored(opts);
   1.589 +                nested->applyPropertyPattern(chars, patLocal, ec);
   1.590 +                if (U_FAILURE(ec)) return;
   1.591 +                break;
   1.592 +            case 3: // `nested' already parsed
   1.593 +                nested->_toPattern(patLocal, FALSE);
   1.594 +                break;
   1.595 +            }
   1.596 +
   1.597 +            usePat = TRUE;
   1.598 +
   1.599 +            if (mode == 0) {
   1.600 +                // Entire pattern is a category; leave parse loop
   1.601 +                *this = *nested;
   1.602 +                mode = 2;
   1.603 +                break;
   1.604 +            }
   1.605 +
   1.606 +            switch (op) {
   1.607 +            case HYPHEN: /*'-'*/
   1.608 +                removeAll(*nested);
   1.609 +                break;
   1.610 +            case INTERSECTION: /*'&'*/
   1.611 +                retainAll(*nested);
   1.612 +                break;
   1.613 +            case 0:
   1.614 +                addAll(*nested);
   1.615 +                break;
   1.616 +            }
   1.617 +
   1.618 +            op = 0;
   1.619 +            lastItem = 2;
   1.620 +
   1.621 +            continue;
   1.622 +        }
   1.623 +
   1.624 +        if (mode == 0) {
   1.625 +            // syntaxError(chars, "Missing '['");
   1.626 +            ec = U_MALFORMED_SET;
   1.627 +            return;
   1.628 +        }
   1.629 +
   1.630 +        // -------- Parse special (syntax) characters.  If the
   1.631 +        // current character is not special, or if it is escaped,
   1.632 +        // then fall through and handle it below.
   1.633 +
   1.634 +        if (!literal) {
   1.635 +            switch (c) {
   1.636 +            case 0x5D /*']'*/:
   1.637 +                if (lastItem == 1) {
   1.638 +                    add(lastChar, lastChar);
   1.639 +                    _appendToPat(patLocal, lastChar, FALSE);
   1.640 +                }
   1.641 +                // Treat final trailing '-' as a literal
   1.642 +                if (op == HYPHEN /*'-'*/) {
   1.643 +                    add(op, op);
   1.644 +                    patLocal.append(op);
   1.645 +                } else if (op == INTERSECTION /*'&'*/) {
   1.646 +                    // syntaxError(chars, "Trailing '&'");
   1.647 +                    ec = U_MALFORMED_SET;
   1.648 +                    return;
   1.649 +                }
   1.650 +                patLocal.append((UChar) 0x5D /*']'*/);
   1.651 +                mode = 2;
   1.652 +                continue;
   1.653 +            case HYPHEN /*'-'*/:
   1.654 +                if (op == 0) {
   1.655 +                    if (lastItem != 0) {
   1.656 +                        op = (UChar) c;
   1.657 +                        continue;
   1.658 +                    } else {
   1.659 +                        // Treat final trailing '-' as a literal
   1.660 +                        add(c, c);
   1.661 +                        c = chars.next(opts, literal, ec);
   1.662 +                        if (U_FAILURE(ec)) return;
   1.663 +                        if (c == 0x5D /*']'*/ && !literal) {
   1.664 +                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
   1.665 +                            mode = 2;
   1.666 +                            continue;
   1.667 +                        }
   1.668 +                    }
   1.669 +                }
   1.670 +                // syntaxError(chars, "'-' not after char or set");
   1.671 +                ec = U_MALFORMED_SET;
   1.672 +                return;
   1.673 +            case INTERSECTION /*'&'*/:
   1.674 +                if (lastItem == 2 && op == 0) {
   1.675 +                    op = (UChar) c;
   1.676 +                    continue;
   1.677 +                }
   1.678 +                // syntaxError(chars, "'&' not after set");
   1.679 +                ec = U_MALFORMED_SET;
   1.680 +                return;
   1.681 +            case 0x5E /*'^'*/:
   1.682 +                // syntaxError(chars, "'^' not after '['");
   1.683 +                ec = U_MALFORMED_SET;
   1.684 +                return;
   1.685 +            case 0x7B /*'{'*/:
   1.686 +                if (op != 0) {
   1.687 +                    // syntaxError(chars, "Missing operand after operator");
   1.688 +                    ec = U_MALFORMED_SET;
   1.689 +                    return;
   1.690 +                }
   1.691 +                if (lastItem == 1) {
   1.692 +                    add(lastChar, lastChar);
   1.693 +                    _appendToPat(patLocal, lastChar, FALSE);
   1.694 +                }
   1.695 +                lastItem = 0;
   1.696 +                buf.truncate(0);
   1.697 +                {
   1.698 +                    UBool ok = FALSE;
   1.699 +                    while (!chars.atEnd()) {
   1.700 +                        c = chars.next(opts, literal, ec);
   1.701 +                        if (U_FAILURE(ec)) return;
   1.702 +                        if (c == 0x7D /*'}'*/ && !literal) {
   1.703 +                            ok = TRUE;
   1.704 +                            break;
   1.705 +                        }
   1.706 +                        buf.append(c);
   1.707 +                    }
   1.708 +                    if (buf.length() < 1 || !ok) {
   1.709 +                        // syntaxError(chars, "Invalid multicharacter string");
   1.710 +                        ec = U_MALFORMED_SET;
   1.711 +                        return;
   1.712 +                    }
   1.713 +                }
   1.714 +                // We have new string. Add it to set and continue;
   1.715 +                // we don't need to drop through to the further
   1.716 +                // processing
   1.717 +                add(buf);
   1.718 +                patLocal.append((UChar) 0x7B /*'{'*/);
   1.719 +                _appendToPat(patLocal, buf, FALSE);
   1.720 +                patLocal.append((UChar) 0x7D /*'}'*/);
   1.721 +                continue;
   1.722 +            case SymbolTable::SYMBOL_REF:
   1.723 +                //         symbols  nosymbols
   1.724 +                // [a-$]   error    error (ambiguous)
   1.725 +                // [a$]    anchor   anchor
   1.726 +                // [a-$x]  var "x"* literal '$'
   1.727 +                // [a-$.]  error    literal '$'
   1.728 +                // *We won't get here in the case of var "x"
   1.729 +                {
   1.730 +                    chars.getPos(backup);
   1.731 +                    c = chars.next(opts, literal, ec);
   1.732 +                    if (U_FAILURE(ec)) return;
   1.733 +                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
   1.734 +                    if (symbols == 0 && !anchor) {
   1.735 +                        c = SymbolTable::SYMBOL_REF;
   1.736 +                        chars.setPos(backup);
   1.737 +                        break; // literal '$'
   1.738 +                    }
   1.739 +                    if (anchor && op == 0) {
   1.740 +                        if (lastItem == 1) {
   1.741 +                            add(lastChar, lastChar);
   1.742 +                            _appendToPat(patLocal, lastChar, FALSE);
   1.743 +                        }
   1.744 +                        add(U_ETHER);
   1.745 +                        usePat = TRUE;
   1.746 +                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
   1.747 +                        patLocal.append((UChar) 0x5D /*']'*/);
   1.748 +                        mode = 2;
   1.749 +                        continue;
   1.750 +                    }
   1.751 +                    // syntaxError(chars, "Unquoted '$'");
   1.752 +                    ec = U_MALFORMED_SET;
   1.753 +                    return;
   1.754 +                }
   1.755 +            default:
   1.756 +                break;
   1.757 +            }
   1.758 +        }
   1.759 +
   1.760 +        // -------- Parse literal characters.  This includes both
   1.761 +        // escaped chars ("\u4E01") and non-syntax characters
   1.762 +        // ("a").
   1.763 +
   1.764 +        switch (lastItem) {
   1.765 +        case 0:
   1.766 +            lastItem = 1;
   1.767 +            lastChar = c;
   1.768 +            break;
   1.769 +        case 1:
   1.770 +            if (op == HYPHEN /*'-'*/) {
   1.771 +                if (lastChar >= c) {
   1.772 +                    // Don't allow redundant (a-a) or empty (b-a) ranges;
   1.773 +                    // these are most likely typos.
   1.774 +                    // syntaxError(chars, "Invalid range");
   1.775 +                    ec = U_MALFORMED_SET;
   1.776 +                    return;
   1.777 +                }
   1.778 +                add(lastChar, c);
   1.779 +                _appendToPat(patLocal, lastChar, FALSE);
   1.780 +                patLocal.append(op);
   1.781 +                _appendToPat(patLocal, c, FALSE);
   1.782 +                lastItem = 0;
   1.783 +                op = 0;
   1.784 +            } else {
   1.785 +                add(lastChar, lastChar);
   1.786 +                _appendToPat(patLocal, lastChar, FALSE);
   1.787 +                lastChar = c;
   1.788 +            }
   1.789 +            break;
   1.790 +        case 2:
   1.791 +            if (op != 0) {
   1.792 +                // syntaxError(chars, "Set expected after operator");
   1.793 +                ec = U_MALFORMED_SET;
   1.794 +                return;
   1.795 +            }
   1.796 +            lastChar = c;
   1.797 +            lastItem = 1;
   1.798 +            break;
   1.799 +        }
   1.800 +    }
   1.801 +
   1.802 +    if (mode != 2) {
   1.803 +        // syntaxError(chars, "Missing ']'");
   1.804 +        ec = U_MALFORMED_SET;
   1.805 +        return;
   1.806 +    }
   1.807 +
   1.808 +    chars.skipIgnored(opts);
   1.809 +
   1.810 +    /**
   1.811 +     * Handle global flags (invert, case insensitivity).  If this
   1.812 +     * pattern should be compiled case-insensitive, then we need
   1.813 +     * to close over case BEFORE COMPLEMENTING.  This makes
   1.814 +     * patterns like /[^abc]/i work.
   1.815 +     */
   1.816 +    if ((options & USET_CASE_INSENSITIVE) != 0) {
   1.817 +        (this->*caseClosure)(USET_CASE_INSENSITIVE);
   1.818 +    }
   1.819 +    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
   1.820 +        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
   1.821 +    }
   1.822 +    if (invert) {
   1.823 +        complement();
   1.824 +    }
   1.825 +
   1.826 +    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
   1.827 +    // generated pattern.
   1.828 +    if (usePat) {
   1.829 +        rebuiltPat.append(patLocal);
   1.830 +    } else {
   1.831 +        _generatePattern(rebuiltPat, FALSE);
   1.832 +    }
   1.833 +    if (isBogus() && U_SUCCESS(ec)) {
   1.834 +        // We likely ran out of memory. AHHH!
   1.835 +        ec = U_MEMORY_ALLOCATION_ERROR;
   1.836 +    }
   1.837 +}
   1.838 +
   1.839 +//----------------------------------------------------------------
   1.840 +// Property set implementation
   1.841 +//----------------------------------------------------------------
   1.842 +
   1.843 +static UBool numericValueFilter(UChar32 ch, void* context) {
              // Filter callback for applyFilter(): context points to a double.
              // Accept ch when u_getNumericValue(ch) compares equal to that double.
   1.844 +    return u_getNumericValue(ch) == *(double*)context;
   1.845 +}
   1.846 +
   1.847 +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
              // Filter callback for applyFilter(): context points to an int32_t bit mask.
              // Accept ch when U_GET_GC_MASK(ch) shares at least one bit with that mask.
   1.848 +    int32_t value = *(int32_t*)context;
   1.849 +    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
   1.850 +}
   1.851 +
   1.852 +static UBool versionFilter(UChar32 ch, void* context) {
              // Filter callback for applyFilter(): context points to a UVersionInfo.
              // Accept ch when the version reported by u_charAge() is not all zeros
              // and compares (bytewise, via uprv_memcmp) less than or equal to the
              // version that context points to.
   1.853 +    static const UVersionInfo none = { 0, 0, 0, 0 };
   1.854 +    UVersionInfo v;
   1.855 +    u_charAge(ch, v);
   1.856 +    UVersionInfo* version = (UVersionInfo*)context;
   1.857 +    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
   1.858 +}
   1.859 +
   1.860 +typedef struct {
              // Context handed to intPropertyFilter(): a property and the value
              // a code point must have for that property in order to match.
   1.861 +    UProperty prop;
   1.862 +    int32_t value;
   1.863 +} IntPropertyContext;
   1.864 +
   1.865 +static UBool intPropertyFilter(UChar32 ch, void* context) {
              // Filter callback for applyFilter(): context points to an
              // IntPropertyContext.  Accept ch when u_getIntPropertyValue() for
              // the context property equals the context value.
   1.866 +    IntPropertyContext* c = (IntPropertyContext*)context;
   1.867 +    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
   1.868 +}
   1.869 +
   1.870 +static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
              // Filter callback for applyFilter(): context points to a UScriptCode.
              // Accept ch when uscript_hasScript() reports that script for ch.
   1.871 +    return uscript_hasScript(ch, *(UScriptCode*)context);
   1.872 +}
   1.873 +
   1.874 +/**
   1.875 + * Generic filter-based scanning code for UCD property UnicodeSets.
           *
           * Replaces the contents of this set with the code points accepted by
           * the filter callback.  Only the code points contained in the
           * inclusions set returned by getInclusions(src, status) are tested.
           *
           * @param filter  callback invoked as (*filter)(ch, context); a TRUE
           *                result means ch belongs in the set
           * @param context opaque pointer handed unchanged to every filter call
           * @param src     property source selector forwarded to getInclusions()
           * @param status  in/out error code; nothing is done if it already
           *                indicates failure, and it is set to
           *                U_MEMORY_ALLOCATION_ERROR if the set ends up bogus
   1.876 + */
   1.877 +void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
   1.878 +                             void* context,
   1.879 +                             int32_t src,
   1.880 +                             UErrorCode &status) {
   1.881 +    if (U_FAILURE(status)) return;
   1.882 +
   1.883 +    // Logically, walk through all Unicode characters, noting the start
   1.884 +    // and end of each range for which (*filter)(c, context) is
   1.885 +    // true.  Add each range to a set.
   1.886 +    //
   1.887 +    // To improve performance, use an inclusions set which
   1.888 +    // encodes information about character ranges that are known
   1.889 +    // to have identical properties.
   1.890 +    // getInclusions(src) contains exactly the first characters of
   1.891 +    // same-value ranges for the given properties "source".
   1.892 +    const UnicodeSet* inclusions = getInclusions(src, status);
   1.893 +    if (U_FAILURE(status)) {
   1.894 +        return;
   1.895 +    }
   1.896 +
              // Previous contents are discarded only after the inclusions lookup succeeded.
   1.897 +    clear();
   1.898 +
              // Start of the currently open run of accepted code points, or -1
              // when no run is open.  It is not reset between inclusion ranges,
              // so a run may span the untested code points between two ranges.
   1.899 +    UChar32 startHasProperty = -1;
   1.900 +    int32_t limitRange = inclusions->getRangeCount();
   1.901 +
   1.902 +    for (int j=0; j<limitRange; ++j) {
   1.903 +        // get current range
   1.904 +        UChar32 start = inclusions->getRangeStart(j);
   1.905 +        UChar32 end = inclusions->getRangeEnd(j);
   1.906 +
   1.907 +        // for all the code points in the range, process
   1.908 +        for (UChar32 ch = start; ch <= end; ++ch) {
   1.909 +            // only add to this UnicodeSet on inflection points --
   1.910 +            // where the hasProperty value changes to false
   1.911 +            if ((*filter)(ch, context)) {
   1.912 +                if (startHasProperty < 0) {
   1.913 +                    startHasProperty = ch;
   1.914 +                }
   1.915 +            } else if (startHasProperty >= 0) {
                          // ch is the first rejected code point, so the run ends at ch-1.
   1.916 +                add(startHasProperty, ch-1);
   1.917 +                startHasProperty = -1;
   1.918 +            }
   1.919 +        }
   1.920 +    }
              // A run that is still open after the last tested code point is
              // extended to the last code point, U+10FFFF.
   1.921 +    if (startHasProperty >= 0) {
   1.922 +        add((UChar32)startHasProperty, (UChar32)0x10FFFF);
   1.923 +    }
   1.924 +    if (isBogus() && U_SUCCESS(status)) {
   1.925 +        // We likely ran out of memory. AHHH!
   1.926 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.927 +    }
   1.928 +}
   1.929 +
   1.930 +static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
              // Copy the zero-terminated string src into dst while normalizing
              // spaces: leading spaces are dropped, each run of consecutive
              // spaces is collapsed to a single space, and a trailing space is
              // removed.  dstCapacity is the size of dst including the
              // terminating zero.
              // Returns TRUE on success.  Returns FALSE as soon as the output
              // would not fit; in that case dst is left without a terminator.
   1.931 +    /* Note: we use ' ' in compiler code page */
   1.932 +    int32_t j = 0;
   1.933 +    char ch;
   1.934 +    --dstCapacity; /* make room for term. zero */
   1.935 +    while ((ch = *src++) != 0) {
                  // Skip a space at the very start or directly after another space.
   1.936 +        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
   1.937 +            continue;
   1.938 +        }
   1.939 +        if (j >= dstCapacity) return FALSE;
   1.940 +        dst[j++] = ch;
   1.941 +    }
              // After collapsing, at most one trailing space can remain; drop it.
   1.942 +    if (j > 0 && dst[j-1] == ' ') --j;
   1.943 +    dst[j] = 0;
   1.944 +    return TRUE;
   1.945 +}
   1.946 +
   1.947 +//----------------------------------------------------------------
   1.948 +// Property set API
   1.949 +//----------------------------------------------------------------
   1.950 +
   1.951 +#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}  // sets ec and leaves the enclosing member function, which must return *this
   1.952 +
   1.953 +UnicodeSet&
   1.954 +UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
              // Replace the contents of this set, via applyFilter(), with the
              // code points for which property prop matches value.
              // Does nothing when ec already indicates failure or when this
              // set is frozen.  Always returns *this; errors are reported in ec.
   1.955 +    if (U_FAILURE(ec) || isFrozen()) return *this;
   1.956 +
   1.957 +    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
                  // value is treated as a bit mask; any overlapping category bit matches.
   1.958 +        applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
   1.959 +    } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
                  // value is treated as a UScriptCode and tested with uscript_hasScript().
   1.960 +        UScriptCode script = (UScriptCode)value;
   1.961 +        applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
   1.962 +    } else {
                  // Every other property: exact match on u_getIntPropertyValue().
   1.963 +        IntPropertyContext c = {prop, value};
   1.964 +        applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
   1.965 +    }
   1.966 +    return *this;
   1.967 +}
   1.968 +
   1.969 +UnicodeSet&
   1.970 +UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
   1.971 +                               const UnicodeString& value,
   1.972 +                               UErrorCode& ec) {
              // Replace the contents of this set with the code points selected
              // by a property name and a property value name.
              //
              // prop:  property name or alias.  When value is empty, prop is
              //        instead interpreted as a General_Category value, a
              //        Script value, a binary property name, or one of the
              //        special names compared against ANY, ASCII and ASSIGNED.
              // value: property value name; for the combining class
              //        properties a non-negative integer is also accepted,
              //        for Numeric_Value a number, for Name a character
              //        name, and for Age a version string.
              // ec:    in/out error code.  Set to U_ILLEGAL_ARGUMENT_ERROR for
              //        unknown or unsupported names and values, and to
              //        U_MEMORY_ALLOCATION_ERROR if the set ends up bogus.
              //
              // Does nothing when ec already indicates failure or when this
              // set is frozen.  Always returns *this.
   1.973 +    if (U_FAILURE(ec) || isFrozen()) return *this;
   1.974 +
   1.975 +    // prop and value used to be converted to char * using the default
   1.976 +    // converter instead of the invariant conversion.
   1.977 +    // This should not be necessary because all Unicode property and value
   1.978 +    // names use only invariant characters.
   1.979 +    // If there are any variant characters, then we won't find them anyway.
   1.980 +    // Checking first avoids assertion failures in the conversion.
   1.981 +    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
   1.982 +        !uprv_isInvariantUString(value.getBuffer(), value.length())
   1.983 +    ) {
   1.984 +        FAIL(ec);
   1.985 +    }
   1.986 +    CharString pname, vname;
   1.987 +    pname.appendInvariantChars(prop, ec);
   1.988 +    vname.appendInvariantChars(value, ec);
   1.989 +    if (U_FAILURE(ec)) return *this;
   1.990 +
              // p and v are the resolved property and value that are finally
              // handed to applyIntPropertyValue() unless a case returns early.
   1.991 +    UProperty p;
   1.992 +    int32_t v;
   1.993 +    UBool mustNotBeEmpty = FALSE, invert = FALSE;
   1.994 +
   1.995 +    if (value.length() > 0) {
   1.996 +        p = u_getPropertyEnum(pname.data());
   1.997 +        if (p == UCHAR_INVALID_CODE) FAIL(ec);
   1.998 +
   1.999 +        // Treat gc as gcm
  1.1000 +        if (p == UCHAR_GENERAL_CATEGORY) {
  1.1001 +            p = UCHAR_GENERAL_CATEGORY_MASK;
  1.1002 +        }
  1.1003 +
  1.1004 +        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
  1.1005 +            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
  1.1006 +            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
  1.1007 +            v = u_getPropertyValueEnum(p, vname.data());
  1.1008 +            if (v == UCHAR_INVALID_CODE) {
  1.1009 +                // Handle numeric CCC
  1.1010 +                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
  1.1011 +                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
  1.1012 +                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
  1.1013 +                    char* end;
                              // Cast double->int only after a range check:
                              // converting NaN or an out-of-range double to
                              // int32_t is undefined behavior.  Rejected input
                              // becomes -1 and fails the test below.  NaN is
                              // rejected because every comparison with it is
                              // false.
  1.1014 +                    double val = uprv_strtod(vname.data(), &end);
  1.1015 +                    v = (0 <= val && val <= 0x7fffffff) ? (int32_t) val : -1;
  1.1016 +                    if (v != val || v < 0 || *end != 0) {
  1.1017 +                        // non-integral or negative value, or trailing junk
  1.1018 +                        FAIL(ec);
  1.1019 +                    }
  1.1020 +                    // If the resultant set is empty then the numeric value
  1.1021 +                    // was invalid.
  1.1022 +                    mustNotBeEmpty = TRUE;
  1.1023 +                } else {
  1.1024 +                    FAIL(ec);
  1.1025 +                }
  1.1026 +            }
  1.1027 +        }
  1.1028 +
  1.1029 +        else {
  1.1030 +
  1.1031 +            switch (p) {
  1.1032 +            case UCHAR_NUMERIC_VALUE:
  1.1033 +                {
  1.1034 +                    char* end;
  1.1035 +                    double val = uprv_strtod(vname.data(), &end);
  1.1036 +                    if (*end != 0) {
  1.1037 +                        FAIL(ec);
  1.1038 +                    }
  1.1039 +                    applyFilter(numericValueFilter, &val, UPROPS_SRC_CHAR, ec);
  1.1040 +                    return *this;
  1.1041 +                }
  1.1042 +            case UCHAR_NAME:
  1.1043 +                {
  1.1044 +                    // Must munge name, since u_charFromName() does not do
  1.1045 +                    // 'loose' matching.
  1.1046 +                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
  1.1047 +                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  1.1048 +                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
  1.1049 +                    if (U_SUCCESS(ec)) {
  1.1050 +                        clear();
  1.1051 +                        add(ch);
  1.1052 +                        return *this;
  1.1053 +                    } else {
  1.1054 +                        FAIL(ec);
  1.1055 +                    }
  1.1056 +                }
  1.1057 +            case UCHAR_UNICODE_1_NAME:
  1.1058 +                // ICU 49 deprecates the Unicode_1_Name property APIs.
  1.1059 +                FAIL(ec);
  1.1060 +            case UCHAR_AGE:
  1.1061 +                {
  1.1062 +                    // Must munge name, since u_versionFromString() does not do
  1.1063 +                    // 'loose' matching.
  1.1064 +                    char buf[128];
  1.1065 +                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  1.1066 +                    UVersionInfo version;
  1.1067 +                    u_versionFromString(version, buf);
  1.1068 +                    applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
  1.1069 +                    return *this;
  1.1070 +                }
  1.1071 +            case UCHAR_SCRIPT_EXTENSIONS:
                          // The value of a Script_Extensions query is a Script value name.
  1.1072 +                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
  1.1073 +                if (v == UCHAR_INVALID_CODE) {
  1.1074 +                    FAIL(ec);
  1.1075 +                }
  1.1076 +                // fall through to calling applyIntPropertyValue()
  1.1077 +                break;
  1.1078 +            default:
  1.1079 +                // p is a non-binary, non-enumerated property that we
  1.1080 +                // don't support (yet).
  1.1081 +                FAIL(ec);
  1.1082 +            }
  1.1083 +        }
  1.1084 +    }
  1.1085 +
  1.1086 +    else {
  1.1087 +        // value is empty.  Interpret as General Category, Script, or
  1.1088 +        // Binary property.
  1.1089 +        p = UCHAR_GENERAL_CATEGORY_MASK;
  1.1090 +        v = u_getPropertyValueEnum(p, pname.data());
  1.1091 +        if (v == UCHAR_INVALID_CODE) {
  1.1092 +            p = UCHAR_SCRIPT;
  1.1093 +            v = u_getPropertyValueEnum(p, pname.data());
  1.1094 +            if (v == UCHAR_INVALID_CODE) {
  1.1095 +                p = u_getPropertyEnum(pname.data());
  1.1096 +                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
  1.1097 +                    v = 1;
  1.1098 +                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
  1.1099 +                    set(MIN_VALUE, MAX_VALUE);
  1.1100 +                    return *this;
  1.1101 +                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
  1.1102 +                    set(0, 0x7F);
  1.1103 +                    return *this;
  1.1104 +                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
  1.1105 +                    // [:Assigned:]=[:^Cn:]
  1.1106 +                    p = UCHAR_GENERAL_CATEGORY_MASK;
  1.1107 +                    v = U_GC_CN_MASK;
  1.1108 +                    invert = TRUE;
  1.1109 +                } else {
  1.1110 +                    FAIL(ec);
  1.1111 +                }
  1.1112 +            }
  1.1113 +        }
  1.1114 +    }
  1.1115 +
  1.1116 +    applyIntPropertyValue(p, v, ec);
  1.1117 +    if(invert) {
  1.1118 +        complement();
  1.1119 +    }
  1.1120 +
  1.1121 +    if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
  1.1122 +        // mustNotBeEmpty is set to true if an empty set indicates
  1.1123 +        // invalid input.
  1.1124 +        ec = U_ILLEGAL_ARGUMENT_ERROR;
  1.1125 +    }
  1.1126 +
  1.1127 +    if (isBogus() && U_SUCCESS(ec)) {
  1.1128 +        // We likely ran out of memory. AHHH!
  1.1129 +        ec = U_MEMORY_ALLOCATION_ERROR;
  1.1130 +    }
  1.1131 +    return *this;
  1.1132 +}
  1.1133 +
  1.1134 +//----------------------------------------------------------------
  1.1135 +// Property set patterns
  1.1136 +//----------------------------------------------------------------
  1.1137 +
  1.1138 +/**
  1.1139 + * Return true if the given position, in the given pattern, appears
  1.1140 + * to be the start of a property set pattern.
           * Only the remaining length and the opening delimiter are checked;
           * the rest of the pattern is not validated here.
           * @param pattern the text to examine
           * @param pos     index into pattern at which an opener is expected
  1.1141 + */
  1.1142 +UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
  1.1143 +                                           int32_t pos) {
  1.1144 +    // Patterns are at least 5 characters long
  1.1145 +    if ((pos+5) > pattern.length()) {
  1.1146 +        return FALSE;
  1.1147 +    }
  1.1148 +
  1.1149 +    // Look for an opening [:, [:^, \p, \P, or \N
  1.1150 +    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
  1.1151 +}
  1.1152 +
  1.1153 +/**
  1.1154 + * Return true if the given iterator appears to point at a
  1.1155 + * property pattern.  Regardless of the result, return with the
  1.1156 + * iterator unchanged.
           * The test reads two characters: '[' followed by ':', or a
           * backslash followed by 'N', 'p' or 'P'.
  1.1157 + * @param chars iterator over the pattern characters.  Upon return
  1.1158 + * it will be unchanged.
  1.1159 + * @param iterOpts RuleCharacterIterator options
  1.1160 + */
  1.1161 +UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
  1.1162 +                                           int32_t iterOpts) {
  1.1163 +    // NOTE: literal will always be FALSE, because we don't parse escapes.
  1.1164 +    UBool result = FALSE, literal;
  1.1165 +    UErrorCode ec = U_ZERO_ERROR;
  1.1166 +    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
              // Remember the position so that it can be restored before returning.
  1.1167 +    RuleCharacterIterator::Pos pos;
  1.1168 +    chars.getPos(pos);
  1.1169 +    UChar32 c = chars.next(iterOpts, literal, ec);
  1.1170 +    if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
                  // The second character is read with SKIP_WHITESPACE cleared --
                  // presumably so that the two opener characters must be adjacent.
  1.1171 +        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
  1.1172 +                               literal, ec);
  1.1173 +        result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
  1.1174 +                 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
  1.1175 +    }
  1.1176 +    chars.setPos(pos);
              // An error reported by either next() call turns the answer into FALSE.
  1.1177 +    return result && U_SUCCESS(ec);
  1.1178 +}
  1.1179 +
  1.1180 +/**
  1.1181 + * Parse the given property pattern at the given parse position.
           *
           * Accepted forms are [:pat:], [:^pat:], \p{pat}, \P{pat} and \N{pat}.
           * Except for \N, pat is either "prop=value" or a single name; the
           * names are resolved by applyPropertyAlias().  The [:^ and \P forms
           * complement the resulting set.
           *
           * @param pattern the text containing the property pattern
           * @param ppos    on entry, the index of the opening delimiter; on
           *                success it is moved just past the closing
           *                delimiter, otherwise it is left unchanged
           * @param ec      in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR
           *                on a syntax error, or to whatever
           *                applyPropertyAlias() reports
           * @return *this
  1.1182 + */
  1.1183 +UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
  1.1184 +                                             ParsePosition& ppos,
  1.1185 +                                             UErrorCode &ec) {
  1.1186 +    int32_t pos = ppos.getIndex();
  1.1187 +
  1.1188 +    UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
  1.1189 +    UBool isName = FALSE; // true for \N{pat}, o/w false
  1.1190 +    UBool invert = FALSE;
  1.1191 +
  1.1192 +    if (U_FAILURE(ec)) return *this;
  1.1193 +
  1.1194 +    // Minimum length is 5 characters, e.g. \p{L}
  1.1195 +    if ((pos+5) > pattern.length()) {
  1.1196 +        FAIL(ec);
  1.1197 +    }
  1.1198 +
  1.1199 +    // On entry, ppos should point to one of the following openers:
  1.1200 +    // [:, [:^, \p, \P, or \N
  1.1201 +    if (isPOSIXOpen(pattern, pos)) {
  1.1202 +        posix = TRUE;
  1.1203 +        pos += 2;
  1.1204 +        pos = ICU_Utility::skipWhitespace(pattern, pos);
  1.1205 +        if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
  1.1206 +            ++pos;
  1.1207 +            invert = TRUE;
  1.1208 +        }
  1.1209 +    } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
                  // The character after the backslash selects the variant.
  1.1210 +        UChar c = pattern.charAt(pos+1);
  1.1211 +        invert = (c == UPPER_P);
  1.1212 +        isName = (c == UPPER_N);
  1.1213 +        pos += 2;
  1.1214 +        pos = ICU_Utility::skipWhitespace(pattern, pos);
  1.1215 +        if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
  1.1216 +            // Syntax error; "\p", "\P" or "\N" not followed by "{"
  1.1217 +            FAIL(ec);
  1.1218 +        }
  1.1219 +    } else {
  1.1220 +        // Open delimiter not seen
  1.1221 +        FAIL(ec);
  1.1222 +    }
  1.1223 +
  1.1224 +    // Look for the matching close delimiter, either :] or }
  1.1225 +    int32_t close;
  1.1226 +    if (posix) {
  1.1227 +      close = pattern.indexOf(POSIX_CLOSE, 2, pos);
  1.1228 +    } else {
  1.1229 +      close = pattern.indexOf(CLOSE_BRACE, pos);
  1.1230 +    }
  1.1231 +    if (close < 0) {
  1.1232 +        // Syntax error; close delimiter missing
  1.1233 +        FAIL(ec);
  1.1234 +    }
  1.1235 +
  1.1236 +    // Look for an '=' sign.  If this is present, we will parse a
  1.1237 +    // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
  1.1238 +    // pattern.
  1.1239 +    int32_t equals = pattern.indexOf(EQUALS, pos);
  1.1240 +    UnicodeString propName, valueName;
              // An '=' located after the close delimiter does not belong to this
              // pattern, and inside \N{...} an '=' is not treated as a separator.
  1.1241 +    if (equals >= 0 && equals < close && !isName) {
  1.1242 +        // Equals seen; parse medium/long pattern
  1.1243 +        pattern.extractBetween(pos, equals, propName);
  1.1244 +        pattern.extractBetween(equals+1, close, valueName);
  1.1245 +    }
  1.1246 +
  1.1247 +    else {
  1.1248 +        // Handle case where no '=' is seen, and \N{}
  1.1249 +        pattern.extractBetween(pos, close, propName);
  1.1250 +            
  1.1251 +        // Handle \N{name}
  1.1252 +        if (isName) {
  1.1253 +            // This is a little inefficient since it means we have to
  1.1254 +            // parse NAME_PROP back to UCHAR_NAME even though we already
  1.1255 +            // know it's UCHAR_NAME.  If we refactor the API to
  1.1256 +            // support args of (UProperty, char*) then we can remove
  1.1257 +            // NAME_PROP and make this a little more efficient.
  1.1258 +            valueName = propName;
  1.1259 +            propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
  1.1260 +        }
  1.1261 +    }
  1.1262 +
  1.1263 +    applyPropertyAlias(propName, valueName, ec);
  1.1264 +
  1.1265 +    if (U_SUCCESS(ec)) {
  1.1266 +        if (invert) {
  1.1267 +            complement();
  1.1268 +        }
  1.1269 +            
  1.1270 +        // Move to the limit position after the close delimiter if the
  1.1271 +        // parse succeeded.
  1.1272 +        ppos.setIndex(close + (posix ? 2 : 1));
  1.1273 +    }
  1.1274 +
  1.1275 +    return *this;
  1.1276 +}
  1.1277 +
  1.1278 +/**
  1.1279 + * Parse a property pattern.
           * The text obtained from chars.lookahead() is parsed by the
           * (pattern, ParsePosition, UErrorCode) overload, starting at index 0.
  1.1280 + * @param chars iterator over the pattern characters.  Upon return
  1.1281 + * it will be advanced to the first character after the parsed
  1.1282 + * pattern, or the end of the iteration if all characters are
  1.1283 + * parsed.
  1.1284 + * @param rebuiltPat the pattern that was parsed, rebuilt or
  1.1285 + * copied from the input pattern, as appropriate.
           * @param ec in/out error code; nothing is done if it already indicates
           * failure.  Set to U_MALFORMED_SET when no characters were consumed.
  1.1286 + */
  1.1287 +void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
  1.1288 +                                      UnicodeString& rebuiltPat,
  1.1289 +                                      UErrorCode& ec) {
  1.1290 +    if (U_FAILURE(ec)) return;
  1.1291 +    UnicodeString pattern;
  1.1292 +    chars.lookahead(pattern);
  1.1293 +    ParsePosition pos(0);
  1.1294 +    applyPropertyPattern(pattern, pos, ec);
  1.1295 +    if (U_FAILURE(ec)) return;
              // An index that is still 0 means that nothing was parsed.
  1.1296 +    if (pos.getIndex() == 0) {
  1.1297 +        // syntaxError(chars, "Invalid property pattern");
  1.1298 +        ec = U_MALFORMED_SET;
  1.1299 +        return;
  1.1300 +    }
              // Advance the iterator past the consumed text and copy exactly
              // that text to the rebuilt pattern.
  1.1301 +    chars.jumpahead(pos.getIndex());
  1.1302 +    rebuiltPat.append(pattern, 0, pos.getIndex());
  1.1303 +}
  1.1304 +
  1.1305 +U_NAMESPACE_END
The Tor Browser / file diff

diff: intl/icu/source/common/uniset_props.cpp

intl/icu/source/common/uniset_props.cpp