1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/uniset_props.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1302 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1999-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: uniset_props.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2004aug25 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Character property dependent functions moved here from uniset.cpp 1.20 +*/ 1.21 + 1.22 +#include "unicode/utypes.h" 1.23 +#include "unicode/uniset.h" 1.24 +#include "unicode/parsepos.h" 1.25 +#include "unicode/uchar.h" 1.26 +#include "unicode/uscript.h" 1.27 +#include "unicode/symtable.h" 1.28 +#include "unicode/uset.h" 1.29 +#include "unicode/locid.h" 1.30 +#include "unicode/brkiter.h" 1.31 +#include "uset_imp.h" 1.32 +#include "ruleiter.h" 1.33 +#include "cmemory.h" 1.34 +#include "ucln_cmn.h" 1.35 +#include "util.h" 1.36 +#include "uvector.h" 1.37 +#include "uprops.h" 1.38 +#include "propname.h" 1.39 +#include "normalizer2impl.h" 1.40 +#include "ucase.h" 1.41 +#include "ubidi_props.h" 1.42 +#include "uinvchar.h" 1.43 +#include "uprops.h" 1.44 +#include "charstr.h" 1.45 +#include "cstring.h" 1.46 +#include "mutex.h" 1.47 +#include "umutex.h" 1.48 +#include "uassert.h" 1.49 +#include "hash.h" 1.50 + 1.51 +U_NAMESPACE_USE 1.52 + 1.53 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.54 + 1.55 +// initial storage. Must be >= 0 1.56 +// *** same as in uniset.cpp ! *** 1.57 +#define START_EXTRA 16 1.58 + 1.59 +// Define UChar constants using hex for EBCDIC compatibility 1.60 +// Used #define to reduce private static exports and memory access time. 
1.61 +#define SET_OPEN ((UChar)0x005B) /*[*/ 1.62 +#define SET_CLOSE ((UChar)0x005D) /*]*/ 1.63 +#define HYPHEN ((UChar)0x002D) /*-*/ 1.64 +#define COMPLEMENT ((UChar)0x005E) /*^*/ 1.65 +#define COLON ((UChar)0x003A) /*:*/ 1.66 +#define BACKSLASH ((UChar)0x005C) /*\*/ 1.67 +#define INTERSECTION ((UChar)0x0026) /*&*/ 1.68 +#define UPPER_U ((UChar)0x0055) /*U*/ 1.69 +#define LOWER_U ((UChar)0x0075) /*u*/ 1.70 +#define OPEN_BRACE ((UChar)123) /*{*/ 1.71 +#define CLOSE_BRACE ((UChar)125) /*}*/ 1.72 +#define UPPER_P ((UChar)0x0050) /*P*/ 1.73 +#define LOWER_P ((UChar)0x0070) /*p*/ 1.74 +#define UPPER_N ((UChar)78) /*N*/ 1.75 +#define EQUALS ((UChar)0x003D) /*=*/ 1.76 + 1.77 +//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" 1.78 +static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" 1.79 +//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" 1.80 +//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" 1.81 +//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" 1.82 +static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ 1.83 + 1.84 +// Special property set IDs 1.85 +static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 1.86 +static const char ASCII[] = "ASCII"; // [\u0000-\u007F] 1.87 +static const char ASSIGNED[] = "Assigned"; // [:^Cn:] 1.88 + 1.89 +// Unicode name property alias 1.90 +#define NAME_PROP "na" 1.91 +#define NAME_PROP_LENGTH 2 1.92 + 1.93 +/** 1.94 + * Delimiter string used in patterns to close a category reference: 1.95 + * ":]". Example: "[:Lu:]". 
1.96 + */ 1.97 +//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 1.98 + 1.99 +// Cached sets ------------------------------------------------------------- *** 1.100 + 1.101 +U_CDECL_BEGIN 1.102 +static UBool U_CALLCONV uset_cleanup(); 1.103 + 1.104 +struct Inclusion { 1.105 + UnicodeSet *fSet; 1.106 + UInitOnce fInitOnce; 1.107 +}; 1.108 +static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() 1.109 + 1.110 +static UnicodeSet *uni32Singleton; 1.111 +static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; 1.112 + 1.113 +//---------------------------------------------------------------- 1.114 +// Inclusions list 1.115 +//---------------------------------------------------------------- 1.116 + 1.117 +// USetAdder implementation 1.118 +// Does not use uset.h to reduce code dependencies 1.119 +static void U_CALLCONV 1.120 +_set_add(USet *set, UChar32 c) { 1.121 + ((UnicodeSet *)set)->add(c); 1.122 +} 1.123 + 1.124 +static void U_CALLCONV 1.125 +_set_addRange(USet *set, UChar32 start, UChar32 end) { 1.126 + ((UnicodeSet *)set)->add(start, end); 1.127 +} 1.128 + 1.129 +static void U_CALLCONV 1.130 +_set_addString(USet *set, const UChar *str, int32_t length) { 1.131 + ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 1.132 +} 1.133 + 1.134 +/** 1.135 + * Cleanup function for UnicodeSet 1.136 + */ 1.137 +static UBool U_CALLCONV uset_cleanup(void) { 1.138 + for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 1.139 + Inclusion &in = gInclusions[i]; 1.140 + delete in.fSet; 1.141 + in.fSet = NULL; 1.142 + in.fInitOnce.reset(); 1.143 + } 1.144 + 1.145 + delete uni32Singleton; 1.146 + uni32Singleton = NULL; 1.147 + uni32InitOnce.reset(); 1.148 + return TRUE; 1.149 +} 1.150 + 1.151 +U_CDECL_END 1.152 + 1.153 +U_NAMESPACE_BEGIN 1.154 + 1.155 +/* 1.156 +Reduce excessive reallocation, and make it easier to detect initialization problems. 
1.157 +Usually you don't see smaller sets than this for Unicode 5.0. 1.158 +*/ 1.159 +#define DEFAULT_INCLUSION_CAPACITY 3072 1.160 + 1.161 +void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { 1.162 + // This function is invoked only via umtx_initOnce(). 1.163 + // This function is a friend of class UnicodeSet. 1.164 + 1.165 + U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); 1.166 + UnicodeSet * &incl = gInclusions[src].fSet; 1.167 + U_ASSERT(incl == NULL); 1.168 + 1.169 + incl = new UnicodeSet(); 1.170 + if (incl == NULL) { 1.171 + status = U_MEMORY_ALLOCATION_ERROR; 1.172 + return; 1.173 + } 1.174 + USetAdder sa = { 1.175 + (USet *)incl, 1.176 + _set_add, 1.177 + _set_addRange, 1.178 + _set_addString, 1.179 + NULL, // don't need remove() 1.180 + NULL // don't need removeRange() 1.181 + }; 1.182 + 1.183 + incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 1.184 + switch(src) { 1.185 + case UPROPS_SRC_CHAR: 1.186 + uchar_addPropertyStarts(&sa, &status); 1.187 + break; 1.188 + case UPROPS_SRC_PROPSVEC: 1.189 + upropsvec_addPropertyStarts(&sa, &status); 1.190 + break; 1.191 + case UPROPS_SRC_CHAR_AND_PROPSVEC: 1.192 + uchar_addPropertyStarts(&sa, &status); 1.193 + upropsvec_addPropertyStarts(&sa, &status); 1.194 + break; 1.195 +#if !UCONFIG_NO_NORMALIZATION 1.196 + case UPROPS_SRC_CASE_AND_NORM: { 1.197 + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 1.198 + if(U_SUCCESS(status)) { 1.199 + impl->addPropertyStarts(&sa, status); 1.200 + } 1.201 + ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 1.202 + break; 1.203 + } 1.204 + case UPROPS_SRC_NFC: { 1.205 + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 1.206 + if(U_SUCCESS(status)) { 1.207 + impl->addPropertyStarts(&sa, status); 1.208 + } 1.209 + break; 1.210 + } 1.211 + case UPROPS_SRC_NFKC: { 1.212 + const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 1.213 + if(U_SUCCESS(status)) { 1.214 + impl->addPropertyStarts(&sa, 
status); 1.215 + } 1.216 + break; 1.217 + } 1.218 + case UPROPS_SRC_NFKC_CF: { 1.219 + const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 1.220 + if(U_SUCCESS(status)) { 1.221 + impl->addPropertyStarts(&sa, status); 1.222 + } 1.223 + break; 1.224 + } 1.225 + case UPROPS_SRC_NFC_CANON_ITER: { 1.226 + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 1.227 + if(U_SUCCESS(status)) { 1.228 + impl->addCanonIterPropertyStarts(&sa, status); 1.229 + } 1.230 + break; 1.231 + } 1.232 +#endif 1.233 + case UPROPS_SRC_CASE: 1.234 + ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 1.235 + break; 1.236 + case UPROPS_SRC_BIDI: 1.237 + ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); 1.238 + break; 1.239 + default: 1.240 + status = U_INTERNAL_PROGRAM_ERROR; 1.241 + break; 1.242 + } 1.243 + 1.244 + if (U_FAILURE(status)) { 1.245 + delete incl; 1.246 + incl = NULL; 1.247 + return; 1.248 + } 1.249 + // Compact for caching 1.250 + incl->compact(); 1.251 + ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 1.252 +} 1.253 + 1.254 + 1.255 + 1.256 +const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 1.257 + U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); 1.258 + Inclusion &i = gInclusions[src]; 1.259 + umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status); 1.260 + return i.fSet; 1.261 +} 1.262 + 1.263 + 1.264 +// Cache some sets for other services -------------------------------------- *** 1.265 +void U_CALLCONV createUni32Set(UErrorCode &errorCode) { 1.266 + U_ASSERT(uni32Singleton == NULL); 1.267 + uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); 1.268 + if(uni32Singleton==NULL) { 1.269 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.270 + } else { 1.271 + uni32Singleton->freeze(); 1.272 + } 1.273 + ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 1.274 +} 1.275 + 1.276 + 1.277 +U_CFUNC UnicodeSet * 1.278 +uniset_getUnicode32Instance(UErrorCode 
&errorCode) { 1.279 + umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); 1.280 + return uni32Singleton; 1.281 +} 1.282 + 1.283 +// helper functions for matching of pattern syntax pieces ------------------ *** 1.284 +// these functions are parallel to the PERL_OPEN etc. strings above 1.285 + 1.286 +// using these functions is not only faster than UnicodeString::compare() and 1.287 +// caseCompare(), but they also make UnicodeSet work for simple patterns when 1.288 +// no Unicode properties data is available - when caseCompare() fails 1.289 + 1.290 +static inline UBool 1.291 +isPerlOpen(const UnicodeString &pattern, int32_t pos) { 1.292 + UChar c; 1.293 + return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 1.294 +} 1.295 + 1.296 +/*static inline UBool 1.297 +isPerlClose(const UnicodeString &pattern, int32_t pos) { 1.298 + return pattern.charAt(pos)==CLOSE_BRACE; 1.299 +}*/ 1.300 + 1.301 +static inline UBool 1.302 +isNameOpen(const UnicodeString &pattern, int32_t pos) { 1.303 + return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 1.304 +} 1.305 + 1.306 +static inline UBool 1.307 +isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 1.308 + return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 1.309 +} 1.310 + 1.311 +/*static inline UBool 1.312 +isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 1.313 + return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 1.314 +}*/ 1.315 + 1.316 +// TODO memory debugging provided inside uniset.cpp 1.317 +// could be made available here but probably obsolete with use of modern 1.318 +// memory leak checker tools 1.319 +#define _dbgct(me) 1.320 + 1.321 +//---------------------------------------------------------------- 1.322 +// Constructors &c 1.323 +//---------------------------------------------------------------- 1.324 + 1.325 +/** 1.326 + * Constructs a set from the given pattern, optionally ignoring 1.327 + * white 
space. See the class description for the syntax of the 1.328 + * pattern language. 1.329 + * @param pattern a string specifying what characters are in the set 1.330 + */ 1.331 +UnicodeSet::UnicodeSet(const UnicodeString& pattern, 1.332 + UErrorCode& status) : 1.333 + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1.334 + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.335 + fFlags(0) 1.336 +{ 1.337 + if(U_SUCCESS(status)){ 1.338 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.339 + /* test for NULL */ 1.340 + if(list == NULL) { 1.341 + status = U_MEMORY_ALLOCATION_ERROR; 1.342 + }else{ 1.343 + allocateStrings(status); 1.344 + applyPattern(pattern, status); 1.345 + } 1.346 + } 1.347 + _dbgct(this); 1.348 +} 1.349 + 1.350 +//---------------------------------------------------------------- 1.351 +// Public API 1.352 +//---------------------------------------------------------------- 1.353 + 1.354 +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 1.355 + UErrorCode& status) { 1.356 + // Equivalent to 1.357 + // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 1.358 + // but without dependency on closeOver(). 
1.359 + ParsePosition pos(0); 1.360 + applyPatternIgnoreSpace(pattern, pos, NULL, status); 1.361 + if (U_FAILURE(status)) return *this; 1.362 + 1.363 + int32_t i = pos.getIndex(); 1.364 + // Skip over trailing whitespace 1.365 + ICU_Utility::skipWhitespace(pattern, i, TRUE); 1.366 + if (i != pattern.length()) { 1.367 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.368 + } 1.369 + return *this; 1.370 +} 1.371 + 1.372 +void 1.373 +UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, 1.374 + ParsePosition& pos, 1.375 + const SymbolTable* symbols, 1.376 + UErrorCode& status) { 1.377 + if (U_FAILURE(status)) { 1.378 + return; 1.379 + } 1.380 + if (isFrozen()) { 1.381 + status = U_NO_WRITE_PERMISSION; 1.382 + return; 1.383 + } 1.384 + // Need to build the pattern in a temporary string because 1.385 + // _applyPattern calls add() etc., which set pat to empty. 1.386 + UnicodeString rebuiltPat; 1.387 + RuleCharacterIterator chars(pattern, symbols, pos); 1.388 + applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); 1.389 + if (U_FAILURE(status)) return; 1.390 + if (chars.inVariable()) { 1.391 + // syntaxError(chars, "Extra chars in variable value"); 1.392 + status = U_MALFORMED_SET; 1.393 + return; 1.394 + } 1.395 + setPattern(rebuiltPat); 1.396 +} 1.397 + 1.398 +/** 1.399 + * Return true if the given position, in the given pattern, appears 1.400 + * to be the start of a UnicodeSet pattern. 1.401 + */ 1.402 +UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 1.403 + return ((pos+1) < pattern.length() && 1.404 + pattern.charAt(pos) == (UChar)91/*[*/) || 1.405 + resemblesPropertyPattern(pattern, pos); 1.406 +} 1.407 + 1.408 +//---------------------------------------------------------------- 1.409 +// Implementation: Pattern parsing 1.410 +//---------------------------------------------------------------- 1.411 + 1.412 +/** 1.413 + * A small all-inline class to manage a UnicodeSet pointer. Add 1.414 + * operator->() etc. 
as needed.
 *
 * The managed set is created on demand by allocate() and deleted by the
 * destructor.  Copying is disabled: a copy would hold the same pointer and
 * delete the set a second time.
 */
class UnicodeSetPointer {
    UnicodeSet* p;
    // Not copyable; declared but intentionally not defined.
    UnicodeSetPointer(const UnicodeSetPointer& other);
    UnicodeSetPointer& operator=(const UnicodeSetPointer& other);
public:
    inline UnicodeSetPointer() : p(0) {}
    inline ~UnicodeSetPointer() { delete p; }
    // Returns the managed set, or 0 if allocate() has not succeeded yet.
    inline UnicodeSet* pointer() { return p; }
    // Creates the managed set if there is none yet.
    // Returns FALSE if the set could not be allocated.
    inline UBool allocate() {
        if (p == 0) {
            p = new UnicodeSet();
        }
        return p != 0;
    }
};

/**
 * Parse the pattern from the given RuleCharacterIterator.  The
 * iterator is advanced over the parsed pattern.
 * @param chars iterator over the pattern characters.  Upon return
 * it will be advanced to the first character after the parsed
 * pattern, or the end of the iteration if all characters are
 * parsed.
 * @param symbols symbol table to use to parse and dereference
 * variables, or null if none.
 * @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate.
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.
1.443 + */ 1.444 +void UnicodeSet::applyPattern(RuleCharacterIterator& chars, 1.445 + const SymbolTable* symbols, 1.446 + UnicodeString& rebuiltPat, 1.447 + uint32_t options, 1.448 + UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 1.449 + UErrorCode& ec) { 1.450 + if (U_FAILURE(ec)) return; 1.451 + 1.452 + // Syntax characters: [ ] ^ - & { } 1.453 + 1.454 + // Recognized special forms for chars, sets: c-c s-s s&s 1.455 + 1.456 + int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | 1.457 + RuleCharacterIterator::PARSE_ESCAPES; 1.458 + if ((options & USET_IGNORE_SPACE) != 0) { 1.459 + opts |= RuleCharacterIterator::SKIP_WHITESPACE; 1.460 + } 1.461 + 1.462 + UnicodeString patLocal, buf; 1.463 + UBool usePat = FALSE; 1.464 + UnicodeSetPointer scratch; 1.465 + RuleCharacterIterator::Pos backup; 1.466 + 1.467 + // mode: 0=before [, 1=between [...], 2=after ] 1.468 + // lastItem: 0=none, 1=char, 2=set 1.469 + int8_t lastItem = 0, mode = 0; 1.470 + UChar32 lastChar = 0; 1.471 + UChar op = 0; 1.472 + 1.473 + UBool invert = FALSE; 1.474 + 1.475 + clear(); 1.476 + 1.477 + while (mode != 2 && !chars.atEnd()) { 1.478 + U_ASSERT((lastItem == 0 && op == 0) || 1.479 + (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || 1.480 + (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || 1.481 + op == INTERSECTION /*'&'*/))); 1.482 + 1.483 + UChar32 c = 0; 1.484 + UBool literal = FALSE; 1.485 + UnicodeSet* nested = 0; // alias - do not delete 1.486 + 1.487 + // -------- Check for property pattern 1.488 + 1.489 + // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 1.490 + int8_t setMode = 0; 1.491 + if (resemblesPropertyPattern(chars, opts)) { 1.492 + setMode = 2; 1.493 + } 1.494 + 1.495 + // -------- Parse '[' of opening delimiter OR nested set. 1.496 + // If there is a nested set, use `setMode' to define how 1.497 + // the set should be parsed. 
If the '[' is part of the 1.498 + // opening delimiter for this pattern, parse special 1.499 + // strings "[", "[^", "[-", and "[^-". Check for stand-in 1.500 + // characters representing a nested set in the symbol 1.501 + // table. 1.502 + 1.503 + else { 1.504 + // Prepare to backup if necessary 1.505 + chars.getPos(backup); 1.506 + c = chars.next(opts, literal, ec); 1.507 + if (U_FAILURE(ec)) return; 1.508 + 1.509 + if (c == 0x5B /*'['*/ && !literal) { 1.510 + if (mode == 1) { 1.511 + chars.setPos(backup); // backup 1.512 + setMode = 1; 1.513 + } else { 1.514 + // Handle opening '[' delimiter 1.515 + mode = 1; 1.516 + patLocal.append((UChar) 0x5B /*'['*/); 1.517 + chars.getPos(backup); // prepare to backup 1.518 + c = chars.next(opts, literal, ec); 1.519 + if (U_FAILURE(ec)) return; 1.520 + if (c == 0x5E /*'^'*/ && !literal) { 1.521 + invert = TRUE; 1.522 + patLocal.append((UChar) 0x5E /*'^'*/); 1.523 + chars.getPos(backup); // prepare to backup 1.524 + c = chars.next(opts, literal, ec); 1.525 + if (U_FAILURE(ec)) return; 1.526 + } 1.527 + // Fall through to handle special leading '-'; 1.528 + // otherwise restart loop for nested [], \p{}, etc. 1.529 + if (c == HYPHEN /*'-'*/) { 1.530 + literal = TRUE; 1.531 + // Fall through to handle literal '-' below 1.532 + } else { 1.533 + chars.setPos(backup); // backup 1.534 + continue; 1.535 + } 1.536 + } 1.537 + } else if (symbols != 0) { 1.538 + const UnicodeFunctor *m = symbols->lookupMatcher(c); 1.539 + if (m != 0) { 1.540 + const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); 1.541 + if (ms == NULL) { 1.542 + ec = U_MALFORMED_SET; 1.543 + return; 1.544 + } 1.545 + // casting away const, but `nested' won't be modified 1.546 + // (important not to modify stored set) 1.547 + nested = const_cast<UnicodeSet*>(ms); 1.548 + setMode = 3; 1.549 + } 1.550 + } 1.551 + } 1.552 + 1.553 + // -------- Handle a nested set. 
This either is inline in 1.554 + // the pattern or represented by a stand-in that has 1.555 + // previously been parsed and was looked up in the symbol 1.556 + // table. 1.557 + 1.558 + if (setMode != 0) { 1.559 + if (lastItem == 1) { 1.560 + if (op != 0) { 1.561 + // syntaxError(chars, "Char expected after operator"); 1.562 + ec = U_MALFORMED_SET; 1.563 + return; 1.564 + } 1.565 + add(lastChar, lastChar); 1.566 + _appendToPat(patLocal, lastChar, FALSE); 1.567 + lastItem = 0; 1.568 + op = 0; 1.569 + } 1.570 + 1.571 + if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { 1.572 + patLocal.append(op); 1.573 + } 1.574 + 1.575 + if (nested == 0) { 1.576 + // lazy allocation 1.577 + if (!scratch.allocate()) { 1.578 + ec = U_MEMORY_ALLOCATION_ERROR; 1.579 + return; 1.580 + } 1.581 + nested = scratch.pointer(); 1.582 + } 1.583 + switch (setMode) { 1.584 + case 1: 1.585 + nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); 1.586 + break; 1.587 + case 2: 1.588 + chars.skipIgnored(opts); 1.589 + nested->applyPropertyPattern(chars, patLocal, ec); 1.590 + if (U_FAILURE(ec)) return; 1.591 + break; 1.592 + case 3: // `nested' already parsed 1.593 + nested->_toPattern(patLocal, FALSE); 1.594 + break; 1.595 + } 1.596 + 1.597 + usePat = TRUE; 1.598 + 1.599 + if (mode == 0) { 1.600 + // Entire pattern is a category; leave parse loop 1.601 + *this = *nested; 1.602 + mode = 2; 1.603 + break; 1.604 + } 1.605 + 1.606 + switch (op) { 1.607 + case HYPHEN: /*'-'*/ 1.608 + removeAll(*nested); 1.609 + break; 1.610 + case INTERSECTION: /*'&'*/ 1.611 + retainAll(*nested); 1.612 + break; 1.613 + case 0: 1.614 + addAll(*nested); 1.615 + break; 1.616 + } 1.617 + 1.618 + op = 0; 1.619 + lastItem = 2; 1.620 + 1.621 + continue; 1.622 + } 1.623 + 1.624 + if (mode == 0) { 1.625 + // syntaxError(chars, "Missing '['"); 1.626 + ec = U_MALFORMED_SET; 1.627 + return; 1.628 + } 1.629 + 1.630 + // -------- Parse special (syntax) characters. 
If the 1.631 + // current character is not special, or if it is escaped, 1.632 + // then fall through and handle it below. 1.633 + 1.634 + if (!literal) { 1.635 + switch (c) { 1.636 + case 0x5D /*']'*/: 1.637 + if (lastItem == 1) { 1.638 + add(lastChar, lastChar); 1.639 + _appendToPat(patLocal, lastChar, FALSE); 1.640 + } 1.641 + // Treat final trailing '-' as a literal 1.642 + if (op == HYPHEN /*'-'*/) { 1.643 + add(op, op); 1.644 + patLocal.append(op); 1.645 + } else if (op == INTERSECTION /*'&'*/) { 1.646 + // syntaxError(chars, "Trailing '&'"); 1.647 + ec = U_MALFORMED_SET; 1.648 + return; 1.649 + } 1.650 + patLocal.append((UChar) 0x5D /*']'*/); 1.651 + mode = 2; 1.652 + continue; 1.653 + case HYPHEN /*'-'*/: 1.654 + if (op == 0) { 1.655 + if (lastItem != 0) { 1.656 + op = (UChar) c; 1.657 + continue; 1.658 + } else { 1.659 + // Treat final trailing '-' as a literal 1.660 + add(c, c); 1.661 + c = chars.next(opts, literal, ec); 1.662 + if (U_FAILURE(ec)) return; 1.663 + if (c == 0x5D /*']'*/ && !literal) { 1.664 + patLocal.append(HYPHEN_RIGHT_BRACE, 2); 1.665 + mode = 2; 1.666 + continue; 1.667 + } 1.668 + } 1.669 + } 1.670 + // syntaxError(chars, "'-' not after char or set"); 1.671 + ec = U_MALFORMED_SET; 1.672 + return; 1.673 + case INTERSECTION /*'&'*/: 1.674 + if (lastItem == 2 && op == 0) { 1.675 + op = (UChar) c; 1.676 + continue; 1.677 + } 1.678 + // syntaxError(chars, "'&' not after set"); 1.679 + ec = U_MALFORMED_SET; 1.680 + return; 1.681 + case 0x5E /*'^'*/: 1.682 + // syntaxError(chars, "'^' not after '['"); 1.683 + ec = U_MALFORMED_SET; 1.684 + return; 1.685 + case 0x7B /*'{'*/: 1.686 + if (op != 0) { 1.687 + // syntaxError(chars, "Missing operand after operator"); 1.688 + ec = U_MALFORMED_SET; 1.689 + return; 1.690 + } 1.691 + if (lastItem == 1) { 1.692 + add(lastChar, lastChar); 1.693 + _appendToPat(patLocal, lastChar, FALSE); 1.694 + } 1.695 + lastItem = 0; 1.696 + buf.truncate(0); 1.697 + { 1.698 + UBool ok = FALSE; 1.699 + while 
(!chars.atEnd()) { 1.700 + c = chars.next(opts, literal, ec); 1.701 + if (U_FAILURE(ec)) return; 1.702 + if (c == 0x7D /*'}'*/ && !literal) { 1.703 + ok = TRUE; 1.704 + break; 1.705 + } 1.706 + buf.append(c); 1.707 + } 1.708 + if (buf.length() < 1 || !ok) { 1.709 + // syntaxError(chars, "Invalid multicharacter string"); 1.710 + ec = U_MALFORMED_SET; 1.711 + return; 1.712 + } 1.713 + } 1.714 + // We have new string. Add it to set and continue; 1.715 + // we don't need to drop through to the further 1.716 + // processing 1.717 + add(buf); 1.718 + patLocal.append((UChar) 0x7B /*'{'*/); 1.719 + _appendToPat(patLocal, buf, FALSE); 1.720 + patLocal.append((UChar) 0x7D /*'}'*/); 1.721 + continue; 1.722 + case SymbolTable::SYMBOL_REF: 1.723 + // symbols nosymbols 1.724 + // [a-$] error error (ambiguous) 1.725 + // [a$] anchor anchor 1.726 + // [a-$x] var "x"* literal '$' 1.727 + // [a-$.] error literal '$' 1.728 + // *We won't get here in the case of var "x" 1.729 + { 1.730 + chars.getPos(backup); 1.731 + c = chars.next(opts, literal, ec); 1.732 + if (U_FAILURE(ec)) return; 1.733 + UBool anchor = (c == 0x5D /*']'*/ && !literal); 1.734 + if (symbols == 0 && !anchor) { 1.735 + c = SymbolTable::SYMBOL_REF; 1.736 + chars.setPos(backup); 1.737 + break; // literal '$' 1.738 + } 1.739 + if (anchor && op == 0) { 1.740 + if (lastItem == 1) { 1.741 + add(lastChar, lastChar); 1.742 + _appendToPat(patLocal, lastChar, FALSE); 1.743 + } 1.744 + add(U_ETHER); 1.745 + usePat = TRUE; 1.746 + patLocal.append((UChar) SymbolTable::SYMBOL_REF); 1.747 + patLocal.append((UChar) 0x5D /*']'*/); 1.748 + mode = 2; 1.749 + continue; 1.750 + } 1.751 + // syntaxError(chars, "Unquoted '$'"); 1.752 + ec = U_MALFORMED_SET; 1.753 + return; 1.754 + } 1.755 + default: 1.756 + break; 1.757 + } 1.758 + } 1.759 + 1.760 + // -------- Parse literal characters. This includes both 1.761 + // escaped chars ("\u4E01") and non-syntax characters 1.762 + // ("a"). 
1.763 + 1.764 + switch (lastItem) { 1.765 + case 0: 1.766 + lastItem = 1; 1.767 + lastChar = c; 1.768 + break; 1.769 + case 1: 1.770 + if (op == HYPHEN /*'-'*/) { 1.771 + if (lastChar >= c) { 1.772 + // Don't allow redundant (a-a) or empty (b-a) ranges; 1.773 + // these are most likely typos. 1.774 + // syntaxError(chars, "Invalid range"); 1.775 + ec = U_MALFORMED_SET; 1.776 + return; 1.777 + } 1.778 + add(lastChar, c); 1.779 + _appendToPat(patLocal, lastChar, FALSE); 1.780 + patLocal.append(op); 1.781 + _appendToPat(patLocal, c, FALSE); 1.782 + lastItem = 0; 1.783 + op = 0; 1.784 + } else { 1.785 + add(lastChar, lastChar); 1.786 + _appendToPat(patLocal, lastChar, FALSE); 1.787 + lastChar = c; 1.788 + } 1.789 + break; 1.790 + case 2: 1.791 + if (op != 0) { 1.792 + // syntaxError(chars, "Set expected after operator"); 1.793 + ec = U_MALFORMED_SET; 1.794 + return; 1.795 + } 1.796 + lastChar = c; 1.797 + lastItem = 1; 1.798 + break; 1.799 + } 1.800 + } 1.801 + 1.802 + if (mode != 2) { 1.803 + // syntaxError(chars, "Missing ']'"); 1.804 + ec = U_MALFORMED_SET; 1.805 + return; 1.806 + } 1.807 + 1.808 + chars.skipIgnored(opts); 1.809 + 1.810 + /** 1.811 + * Handle global flags (invert, case insensitivity). If this 1.812 + * pattern should be compiled case-insensitive, then we need 1.813 + * to close over case BEFORE COMPLEMENTING. This makes 1.814 + * patterns like /[^abc]/i work. 1.815 + */ 1.816 + if ((options & USET_CASE_INSENSITIVE) != 0) { 1.817 + (this->*caseClosure)(USET_CASE_INSENSITIVE); 1.818 + } 1.819 + else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { 1.820 + (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); 1.821 + } 1.822 + if (invert) { 1.823 + complement(); 1.824 + } 1.825 + 1.826 + // Use the rebuilt pattern (patLocal) only if necessary. Prefer the 1.827 + // generated pattern. 
1.828 + if (usePat) { 1.829 + rebuiltPat.append(patLocal); 1.830 + } else { 1.831 + _generatePattern(rebuiltPat, FALSE); 1.832 + } 1.833 + if (isBogus() && U_SUCCESS(ec)) { 1.834 + // We likely ran out of memory. AHHH! 1.835 + ec = U_MEMORY_ALLOCATION_ERROR; 1.836 + } 1.837 +} 1.838 + 1.839 +//---------------------------------------------------------------- 1.840 +// Property set implementation 1.841 +//---------------------------------------------------------------- 1.842 + 1.843 +static UBool numericValueFilter(UChar32 ch, void* context) { 1.844 + return u_getNumericValue(ch) == *(double*)context; 1.845 +} 1.846 + 1.847 +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 1.848 + int32_t value = *(int32_t*)context; 1.849 + return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 1.850 +} 1.851 + 1.852 +static UBool versionFilter(UChar32 ch, void* context) { 1.853 + static const UVersionInfo none = { 0, 0, 0, 0 }; 1.854 + UVersionInfo v; 1.855 + u_charAge(ch, v); 1.856 + UVersionInfo* version = (UVersionInfo*)context; 1.857 + return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 1.858 +} 1.859 + 1.860 +typedef struct { 1.861 + UProperty prop; 1.862 + int32_t value; 1.863 +} IntPropertyContext; 1.864 + 1.865 +static UBool intPropertyFilter(UChar32 ch, void* context) { 1.866 + IntPropertyContext* c = (IntPropertyContext*)context; 1.867 + return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 1.868 +} 1.869 + 1.870 +static UBool scriptExtensionsFilter(UChar32 ch, void* context) { 1.871 + return uscript_hasScript(ch, *(UScriptCode*)context); 1.872 +} 1.873 + 1.874 +/** 1.875 + * Generic filter-based scanning code for UCD property UnicodeSets. 
1.876 + */ 1.877 +void UnicodeSet::applyFilter(UnicodeSet::Filter filter, 1.878 + void* context, 1.879 + int32_t src, 1.880 + UErrorCode &status) { 1.881 + if (U_FAILURE(status)) return; 1.882 + 1.883 + // Logically, walk through all Unicode characters, noting the start 1.884 + // and end of each range for which filter.contain(c) is 1.885 + // true. Add each range to a set. 1.886 + // 1.887 + // To improve performance, use an inclusions set which 1.888 + // encodes information about character ranges that are known 1.889 + // to have identical properties. 1.890 + // getInclusions(src) contains exactly the first characters of 1.891 + // same-value ranges for the given properties "source". 1.892 + const UnicodeSet* inclusions = getInclusions(src, status); 1.893 + if (U_FAILURE(status)) { 1.894 + return; 1.895 + } 1.896 + 1.897 + clear(); 1.898 + 1.899 + UChar32 startHasProperty = -1; 1.900 + int32_t limitRange = inclusions->getRangeCount(); 1.901 + 1.902 + for (int j=0; j<limitRange; ++j) { 1.903 + // get current range 1.904 + UChar32 start = inclusions->getRangeStart(j); 1.905 + UChar32 end = inclusions->getRangeEnd(j); 1.906 + 1.907 + // for all the code points in the range, process 1.908 + for (UChar32 ch = start; ch <= end; ++ch) { 1.909 + // only add to this UnicodeSet on inflection points -- 1.910 + // where the hasProperty value changes to false 1.911 + if ((*filter)(ch, context)) { 1.912 + if (startHasProperty < 0) { 1.913 + startHasProperty = ch; 1.914 + } 1.915 + } else if (startHasProperty >= 0) { 1.916 + add(startHasProperty, ch-1); 1.917 + startHasProperty = -1; 1.918 + } 1.919 + } 1.920 + } 1.921 + if (startHasProperty >= 0) { 1.922 + add((UChar32)startHasProperty, (UChar32)0x10FFFF); 1.923 + } 1.924 + if (isBogus() && U_SUCCESS(status)) { 1.925 + // We likely ran out of memory. AHHH! 
1.926 + status = U_MEMORY_ALLOCATION_ERROR; 1.927 + } 1.928 +} 1.929 + 1.930 +static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 1.931 + /* Note: we use ' ' in compiler code page */ 1.932 + int32_t j = 0; 1.933 + char ch; 1.934 + --dstCapacity; /* make room for term. zero */ 1.935 + while ((ch = *src++) != 0) { 1.936 + if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 1.937 + continue; 1.938 + } 1.939 + if (j >= dstCapacity) return FALSE; 1.940 + dst[j++] = ch; 1.941 + } 1.942 + if (j > 0 && dst[j-1] == ' ') --j; 1.943 + dst[j] = 0; 1.944 + return TRUE; 1.945 +} 1.946 + 1.947 +//---------------------------------------------------------------- 1.948 +// Property set API 1.949 +//---------------------------------------------------------------- 1.950 + 1.951 +#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 1.952 + 1.953 +UnicodeSet& 1.954 +UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 1.955 + if (U_FAILURE(ec) || isFrozen()) return *this; 1.956 + 1.957 + if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 1.958 + applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); 1.959 + } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 1.960 + UScriptCode script = (UScriptCode)value; 1.961 + applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); 1.962 + } else { 1.963 + IntPropertyContext c = {prop, value}; 1.964 + applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); 1.965 + } 1.966 + return *this; 1.967 +} 1.968 + 1.969 +UnicodeSet& 1.970 +UnicodeSet::applyPropertyAlias(const UnicodeString& prop, 1.971 + const UnicodeString& value, 1.972 + UErrorCode& ec) { 1.973 + if (U_FAILURE(ec) || isFrozen()) return *this; 1.974 + 1.975 + // prop and value used to be converted to char * using the default 1.976 + // converter instead of the invariant conversion. 
1.977 + // This should not be necessary because all Unicode property and value 1.978 + // names use only invariant characters. 1.979 + // If there are any variant characters, then we won't find them anyway. 1.980 + // Checking first avoids assertion failures in the conversion. 1.981 + if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 1.982 + !uprv_isInvariantUString(value.getBuffer(), value.length()) 1.983 + ) { 1.984 + FAIL(ec); 1.985 + } 1.986 + CharString pname, vname; 1.987 + pname.appendInvariantChars(prop, ec); 1.988 + vname.appendInvariantChars(value, ec); 1.989 + if (U_FAILURE(ec)) return *this; 1.990 + 1.991 + UProperty p; 1.992 + int32_t v; 1.993 + UBool mustNotBeEmpty = FALSE, invert = FALSE; 1.994 + 1.995 + if (value.length() > 0) { 1.996 + p = u_getPropertyEnum(pname.data()); 1.997 + if (p == UCHAR_INVALID_CODE) FAIL(ec); 1.998 + 1.999 + // Treat gc as gcm 1.1000 + if (p == UCHAR_GENERAL_CATEGORY) { 1.1001 + p = UCHAR_GENERAL_CATEGORY_MASK; 1.1002 + } 1.1003 + 1.1004 + if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1.1005 + (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1.1006 + (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1.1007 + v = u_getPropertyValueEnum(p, vname.data()); 1.1008 + if (v == UCHAR_INVALID_CODE) { 1.1009 + // Handle numeric CCC 1.1010 + if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1.1011 + p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1.1012 + p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1.1013 + char* end; 1.1014 + double value = uprv_strtod(vname.data(), &end); 1.1015 + v = (int32_t) value; 1.1016 + if (v != value || v < 0 || *end != 0) { 1.1017 + // non-integral or negative value, or trailing junk 1.1018 + FAIL(ec); 1.1019 + } 1.1020 + // If the resultant set is empty then the numeric value 1.1021 + // was invalid. 
1.1022 + mustNotBeEmpty = TRUE; 1.1023 + } else { 1.1024 + FAIL(ec); 1.1025 + } 1.1026 + } 1.1027 + } 1.1028 + 1.1029 + else { 1.1030 + 1.1031 + switch (p) { 1.1032 + case UCHAR_NUMERIC_VALUE: 1.1033 + { 1.1034 + char* end; 1.1035 + double value = uprv_strtod(vname.data(), &end); 1.1036 + if (*end != 0) { 1.1037 + FAIL(ec); 1.1038 + } 1.1039 + applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1.1040 + return *this; 1.1041 + } 1.1042 + case UCHAR_NAME: 1.1043 + { 1.1044 + // Must munge name, since u_charFromName() does not do 1.1045 + // 'loose' matching. 1.1046 + char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1.1047 + if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1.1048 + UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); 1.1049 + if (U_SUCCESS(ec)) { 1.1050 + clear(); 1.1051 + add(ch); 1.1052 + return *this; 1.1053 + } else { 1.1054 + FAIL(ec); 1.1055 + } 1.1056 + } 1.1057 + case UCHAR_UNICODE_1_NAME: 1.1058 + // ICU 49 deprecates the Unicode_1_Name property APIs. 1.1059 + FAIL(ec); 1.1060 + case UCHAR_AGE: 1.1061 + { 1.1062 + // Must munge name, since u_versionFromString() does not do 1.1063 + // 'loose' matching. 1.1064 + char buf[128]; 1.1065 + if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1.1066 + UVersionInfo version; 1.1067 + u_versionFromString(version, buf); 1.1068 + applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1.1069 + return *this; 1.1070 + } 1.1071 + case UCHAR_SCRIPT_EXTENSIONS: 1.1072 + v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 1.1073 + if (v == UCHAR_INVALID_CODE) { 1.1074 + FAIL(ec); 1.1075 + } 1.1076 + // fall through to calling applyIntPropertyValue() 1.1077 + break; 1.1078 + default: 1.1079 + // p is a non-binary, non-enumerated property that we 1.1080 + // don't support (yet). 1.1081 + FAIL(ec); 1.1082 + } 1.1083 + } 1.1084 + } 1.1085 + 1.1086 + else { 1.1087 + // value is empty. 
Interpret as General Category, Script, or 1.1088 + // Binary property. 1.1089 + p = UCHAR_GENERAL_CATEGORY_MASK; 1.1090 + v = u_getPropertyValueEnum(p, pname.data()); 1.1091 + if (v == UCHAR_INVALID_CODE) { 1.1092 + p = UCHAR_SCRIPT; 1.1093 + v = u_getPropertyValueEnum(p, pname.data()); 1.1094 + if (v == UCHAR_INVALID_CODE) { 1.1095 + p = u_getPropertyEnum(pname.data()); 1.1096 + if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1.1097 + v = 1; 1.1098 + } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 1.1099 + set(MIN_VALUE, MAX_VALUE); 1.1100 + return *this; 1.1101 + } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 1.1102 + set(0, 0x7F); 1.1103 + return *this; 1.1104 + } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 1.1105 + // [:Assigned:]=[:^Cn:] 1.1106 + p = UCHAR_GENERAL_CATEGORY_MASK; 1.1107 + v = U_GC_CN_MASK; 1.1108 + invert = TRUE; 1.1109 + } else { 1.1110 + FAIL(ec); 1.1111 + } 1.1112 + } 1.1113 + } 1.1114 + } 1.1115 + 1.1116 + applyIntPropertyValue(p, v, ec); 1.1117 + if(invert) { 1.1118 + complement(); 1.1119 + } 1.1120 + 1.1121 + if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1.1122 + // mustNotBeEmpty is set to true if an empty set indicates 1.1123 + // invalid input. 1.1124 + ec = U_ILLEGAL_ARGUMENT_ERROR; 1.1125 + } 1.1126 + 1.1127 + if (isBogus() && U_SUCCESS(ec)) { 1.1128 + // We likely ran out of memory. AHHH! 1.1129 + ec = U_MEMORY_ALLOCATION_ERROR; 1.1130 + } 1.1131 + return *this; 1.1132 +} 1.1133 + 1.1134 +//---------------------------------------------------------------- 1.1135 +// Property set patterns 1.1136 +//---------------------------------------------------------------- 1.1137 + 1.1138 +/** 1.1139 + * Return true if the given position, in the given pattern, appears 1.1140 + * to be the start of a property set pattern. 
1.1141 + */ 1.1142 +UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1.1143 + int32_t pos) { 1.1144 + // Patterns are at least 5 characters long 1.1145 + if ((pos+5) > pattern.length()) { 1.1146 + return FALSE; 1.1147 + } 1.1148 + 1.1149 + // Look for an opening [:, [:^, \p, or \P 1.1150 + return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1.1151 +} 1.1152 + 1.1153 +/** 1.1154 + * Return true if the given iterator appears to point at a 1.1155 + * property pattern. Regardless of the result, return with the 1.1156 + * iterator unchanged. 1.1157 + * @param chars iterator over the pattern characters. Upon return 1.1158 + * it will be unchanged. 1.1159 + * @param iterOpts RuleCharacterIterator options 1.1160 + */ 1.1161 +UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1.1162 + int32_t iterOpts) { 1.1163 + // NOTE: literal will always be FALSE, because we don't parse escapes. 1.1164 + UBool result = FALSE, literal; 1.1165 + UErrorCode ec = U_ZERO_ERROR; 1.1166 + iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1.1167 + RuleCharacterIterator::Pos pos; 1.1168 + chars.getPos(pos); 1.1169 + UChar32 c = chars.next(iterOpts, literal, ec); 1.1170 + if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1.1171 + UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1.1172 + literal, ec); 1.1173 + result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1.1174 + (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1.1175 + } 1.1176 + chars.setPos(pos); 1.1177 + return result && U_SUCCESS(ec); 1.1178 +} 1.1179 + 1.1180 +/** 1.1181 + * Parse the given property pattern at the given parse position. 
1.1182 + */ 1.1183 +UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1.1184 + ParsePosition& ppos, 1.1185 + UErrorCode &ec) { 1.1186 + int32_t pos = ppos.getIndex(); 1.1187 + 1.1188 + UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1.1189 + UBool isName = FALSE; // true for \N{pat}, o/w false 1.1190 + UBool invert = FALSE; 1.1191 + 1.1192 + if (U_FAILURE(ec)) return *this; 1.1193 + 1.1194 + // Minimum length is 5 characters, e.g. \p{L} 1.1195 + if ((pos+5) > pattern.length()) { 1.1196 + FAIL(ec); 1.1197 + } 1.1198 + 1.1199 + // On entry, ppos should point to one of the following locations: 1.1200 + // Look for an opening [:, [:^, \p, or \P 1.1201 + if (isPOSIXOpen(pattern, pos)) { 1.1202 + posix = TRUE; 1.1203 + pos += 2; 1.1204 + pos = ICU_Utility::skipWhitespace(pattern, pos); 1.1205 + if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1.1206 + ++pos; 1.1207 + invert = TRUE; 1.1208 + } 1.1209 + } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1.1210 + UChar c = pattern.charAt(pos+1); 1.1211 + invert = (c == UPPER_P); 1.1212 + isName = (c == UPPER_N); 1.1213 + pos += 2; 1.1214 + pos = ICU_Utility::skipWhitespace(pattern, pos); 1.1215 + if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1.1216 + // Syntax error; "\p" or "\P" not followed by "{" 1.1217 + FAIL(ec); 1.1218 + } 1.1219 + } else { 1.1220 + // Open delimiter not seen 1.1221 + FAIL(ec); 1.1222 + } 1.1223 + 1.1224 + // Look for the matching close delimiter, either :] or } 1.1225 + int32_t close; 1.1226 + if (posix) { 1.1227 + close = pattern.indexOf(POSIX_CLOSE, 2, pos); 1.1228 + } else { 1.1229 + close = pattern.indexOf(CLOSE_BRACE, pos); 1.1230 + } 1.1231 + if (close < 0) { 1.1232 + // Syntax error; close delimiter missing 1.1233 + FAIL(ec); 1.1234 + } 1.1235 + 1.1236 + // Look for an '=' sign. 
If this is present, we will parse a 1.1237 + // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1.1238 + // pattern. 1.1239 + int32_t equals = pattern.indexOf(EQUALS, pos); 1.1240 + UnicodeString propName, valueName; 1.1241 + if (equals >= 0 && equals < close && !isName) { 1.1242 + // Equals seen; parse medium/long pattern 1.1243 + pattern.extractBetween(pos, equals, propName); 1.1244 + pattern.extractBetween(equals+1, close, valueName); 1.1245 + } 1.1246 + 1.1247 + else { 1.1248 + // Handle case where no '=' is seen, and \N{} 1.1249 + pattern.extractBetween(pos, close, propName); 1.1250 + 1.1251 + // Handle \N{name} 1.1252 + if (isName) { 1.1253 + // This is a little inefficient since it means we have to 1.1254 + // parse NAME_PROP back to UCHAR_NAME even though we already 1.1255 + // know it's UCHAR_NAME. If we refactor the API to 1.1256 + // support args of (UProperty, char*) then we can remove 1.1257 + // NAME_PROP and make this a little more efficient. 1.1258 + valueName = propName; 1.1259 + propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1.1260 + } 1.1261 + } 1.1262 + 1.1263 + applyPropertyAlias(propName, valueName, ec); 1.1264 + 1.1265 + if (U_SUCCESS(ec)) { 1.1266 + if (invert) { 1.1267 + complement(); 1.1268 + } 1.1269 + 1.1270 + // Move to the limit position after the close delimiter if the 1.1271 + // parse succeeded. 1.1272 + ppos.setIndex(close + (posix ? 2 : 1)); 1.1273 + } 1.1274 + 1.1275 + return *this; 1.1276 +} 1.1277 + 1.1278 +/** 1.1279 + * Parse a property pattern. 1.1280 + * @param chars iterator over the pattern characters. Upon return 1.1281 + * it will be advanced to the first character after the parsed 1.1282 + * pattern, or the end of the iteration if all characters are 1.1283 + * parsed. 1.1284 + * @param rebuiltPat the pattern that was parsed, rebuilt or 1.1285 + * copied from the input pattern, as appropriate. 
1.1286 + */ 1.1287 +void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1.1288 + UnicodeString& rebuiltPat, 1.1289 + UErrorCode& ec) { 1.1290 + if (U_FAILURE(ec)) return; 1.1291 + UnicodeString pattern; 1.1292 + chars.lookahead(pattern); 1.1293 + ParsePosition pos(0); 1.1294 + applyPropertyPattern(pattern, pos, ec); 1.1295 + if (U_FAILURE(ec)) return; 1.1296 + if (pos.getIndex() == 0) { 1.1297 + // syntaxError(chars, "Invalid property pattern"); 1.1298 + ec = U_MALFORMED_SET; 1.1299 + return; 1.1300 + } 1.1301 + chars.jumpahead(pos.getIndex()); 1.1302 + rebuiltPat.append(pattern, 0, pos.getIndex()); 1.1303 +} 1.1304 + 1.1305 +U_NAMESPACE_END