Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 1999-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: uniset_props.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2004aug25 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * Character property dependent functions moved here from uniset.cpp |
michael@0 | 17 | */ |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/utypes.h" |
michael@0 | 20 | #include "unicode/uniset.h" |
michael@0 | 21 | #include "unicode/parsepos.h" |
michael@0 | 22 | #include "unicode/uchar.h" |
michael@0 | 23 | #include "unicode/uscript.h" |
michael@0 | 24 | #include "unicode/symtable.h" |
michael@0 | 25 | #include "unicode/uset.h" |
michael@0 | 26 | #include "unicode/locid.h" |
michael@0 | 27 | #include "unicode/brkiter.h" |
michael@0 | 28 | #include "uset_imp.h" |
michael@0 | 29 | #include "ruleiter.h" |
michael@0 | 30 | #include "cmemory.h" |
michael@0 | 31 | #include "ucln_cmn.h" |
michael@0 | 32 | #include "util.h" |
michael@0 | 33 | #include "uvector.h" |
michael@0 | 34 | #include "uprops.h" |
michael@0 | 35 | #include "propname.h" |
michael@0 | 36 | #include "normalizer2impl.h" |
michael@0 | 37 | #include "ucase.h" |
michael@0 | 38 | #include "ubidi_props.h" |
michael@0 | 39 | #include "uinvchar.h" |
michael@0 | 40 | #include "uprops.h" |
michael@0 | 41 | #include "charstr.h" |
michael@0 | 42 | #include "cstring.h" |
michael@0 | 43 | #include "mutex.h" |
michael@0 | 44 | #include "umutex.h" |
michael@0 | 45 | #include "uassert.h" |
michael@0 | 46 | #include "hash.h" |
michael@0 | 47 | |
michael@0 | 48 | U_NAMESPACE_USE |
michael@0 | 49 | |
// Number of elements in a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))
michael@0 | 51 | |
// Initial capacity of a set's storage (used as the starting `capacity`
// in the constructors). Must be >= 0.
// *** Keep in sync with the identical definition in uniset.cpp! ***
#define START_EXTRA 16
michael@0 | 55 | |
// Pattern syntax characters, spelled as numeric code points instead of
// character literals for EBCDIC compatibility.
// They are #defines (not static objects) to reduce private static exports
// and memory access time.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
michael@0 | 73 | |
michael@0 | 74 | //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" |
michael@0 | 75 | static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" |
michael@0 | 76 | //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" |
michael@0 | 77 | //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" |
michael@0 | 78 | //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" |
michael@0 | 79 | static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ |
michael@0 | 80 | |
// Names of the special property sets, with the pattern each one stands for.
static const char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static const char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Short alias of the Unicode name property, and its length in chars.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
michael@0 | 89 | |
michael@0 | 90 | /** |
michael@0 | 91 | * Delimiter string used in patterns to close a category reference: |
michael@0 | 92 | * ":]". Example: "[:Lu:]". |
michael@0 | 93 | */ |
michael@0 | 94 | //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ |
michael@0 | 95 | |
michael@0 | 96 | // Cached sets ------------------------------------------------------------- *** |
michael@0 | 97 | |
michael@0 | 98 | U_CDECL_BEGIN |
michael@0 | 99 | static UBool U_CALLCONV uset_cleanup(); |
michael@0 | 100 | |
michael@0 | 101 | struct Inclusion { |
michael@0 | 102 | UnicodeSet *fSet; |
michael@0 | 103 | UInitOnce fInitOnce; |
michael@0 | 104 | }; |
michael@0 | 105 | static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() |
michael@0 | 106 | |
michael@0 | 107 | static UnicodeSet *uni32Singleton; |
michael@0 | 108 | static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; |
michael@0 | 109 | |
michael@0 | 110 | //---------------------------------------------------------------- |
michael@0 | 111 | // Inclusions list |
michael@0 | 112 | //---------------------------------------------------------------- |
michael@0 | 113 | |
michael@0 | 114 | // USetAdder implementation |
michael@0 | 115 | // Does not use uset.h to reduce code dependencies |
michael@0 | 116 | static void U_CALLCONV |
michael@0 | 117 | _set_add(USet *set, UChar32 c) { |
michael@0 | 118 | ((UnicodeSet *)set)->add(c); |
michael@0 | 119 | } |
michael@0 | 120 | |
michael@0 | 121 | static void U_CALLCONV |
michael@0 | 122 | _set_addRange(USet *set, UChar32 start, UChar32 end) { |
michael@0 | 123 | ((UnicodeSet *)set)->add(start, end); |
michael@0 | 124 | } |
michael@0 | 125 | |
michael@0 | 126 | static void U_CALLCONV |
michael@0 | 127 | _set_addString(USet *set, const UChar *str, int32_t length) { |
michael@0 | 128 | ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); |
michael@0 | 129 | } |
michael@0 | 130 | |
michael@0 | 131 | /** |
michael@0 | 132 | * Cleanup function for UnicodeSet |
michael@0 | 133 | */ |
michael@0 | 134 | static UBool U_CALLCONV uset_cleanup(void) { |
michael@0 | 135 | for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { |
michael@0 | 136 | Inclusion &in = gInclusions[i]; |
michael@0 | 137 | delete in.fSet; |
michael@0 | 138 | in.fSet = NULL; |
michael@0 | 139 | in.fInitOnce.reset(); |
michael@0 | 140 | } |
michael@0 | 141 | |
michael@0 | 142 | delete uni32Singleton; |
michael@0 | 143 | uni32Singleton = NULL; |
michael@0 | 144 | uni32InitOnce.reset(); |
michael@0 | 145 | return TRUE; |
michael@0 | 146 | } |
michael@0 | 147 | |
michael@0 | 148 | U_CDECL_END |
michael@0 | 149 | |
michael@0 | 150 | U_NAMESPACE_BEGIN |
michael@0 | 151 | |
michael@0 | 152 | /* |
michael@0 | 153 | Reduce excessive reallocation, and make it easier to detect initialization problems. |
michael@0 | 154 | Usually you don't see smaller sets than this for Unicode 5.0. |
michael@0 | 155 | */ |
michael@0 | 156 | #define DEFAULT_INCLUSION_CAPACITY 3072 |
michael@0 | 157 | |
michael@0 | 158 | void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { |
michael@0 | 159 | // This function is invoked only via umtx_initOnce(). |
michael@0 | 160 | // This function is a friend of class UnicodeSet. |
michael@0 | 161 | |
michael@0 | 162 | U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); |
michael@0 | 163 | UnicodeSet * &incl = gInclusions[src].fSet; |
michael@0 | 164 | U_ASSERT(incl == NULL); |
michael@0 | 165 | |
michael@0 | 166 | incl = new UnicodeSet(); |
michael@0 | 167 | if (incl == NULL) { |
michael@0 | 168 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 169 | return; |
michael@0 | 170 | } |
michael@0 | 171 | USetAdder sa = { |
michael@0 | 172 | (USet *)incl, |
michael@0 | 173 | _set_add, |
michael@0 | 174 | _set_addRange, |
michael@0 | 175 | _set_addString, |
michael@0 | 176 | NULL, // don't need remove() |
michael@0 | 177 | NULL // don't need removeRange() |
michael@0 | 178 | }; |
michael@0 | 179 | |
michael@0 | 180 | incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); |
michael@0 | 181 | switch(src) { |
michael@0 | 182 | case UPROPS_SRC_CHAR: |
michael@0 | 183 | uchar_addPropertyStarts(&sa, &status); |
michael@0 | 184 | break; |
michael@0 | 185 | case UPROPS_SRC_PROPSVEC: |
michael@0 | 186 | upropsvec_addPropertyStarts(&sa, &status); |
michael@0 | 187 | break; |
michael@0 | 188 | case UPROPS_SRC_CHAR_AND_PROPSVEC: |
michael@0 | 189 | uchar_addPropertyStarts(&sa, &status); |
michael@0 | 190 | upropsvec_addPropertyStarts(&sa, &status); |
michael@0 | 191 | break; |
michael@0 | 192 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 193 | case UPROPS_SRC_CASE_AND_NORM: { |
michael@0 | 194 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
michael@0 | 195 | if(U_SUCCESS(status)) { |
michael@0 | 196 | impl->addPropertyStarts(&sa, status); |
michael@0 | 197 | } |
michael@0 | 198 | ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
michael@0 | 199 | break; |
michael@0 | 200 | } |
michael@0 | 201 | case UPROPS_SRC_NFC: { |
michael@0 | 202 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
michael@0 | 203 | if(U_SUCCESS(status)) { |
michael@0 | 204 | impl->addPropertyStarts(&sa, status); |
michael@0 | 205 | } |
michael@0 | 206 | break; |
michael@0 | 207 | } |
michael@0 | 208 | case UPROPS_SRC_NFKC: { |
michael@0 | 209 | const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); |
michael@0 | 210 | if(U_SUCCESS(status)) { |
michael@0 | 211 | impl->addPropertyStarts(&sa, status); |
michael@0 | 212 | } |
michael@0 | 213 | break; |
michael@0 | 214 | } |
michael@0 | 215 | case UPROPS_SRC_NFKC_CF: { |
michael@0 | 216 | const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); |
michael@0 | 217 | if(U_SUCCESS(status)) { |
michael@0 | 218 | impl->addPropertyStarts(&sa, status); |
michael@0 | 219 | } |
michael@0 | 220 | break; |
michael@0 | 221 | } |
michael@0 | 222 | case UPROPS_SRC_NFC_CANON_ITER: { |
michael@0 | 223 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
michael@0 | 224 | if(U_SUCCESS(status)) { |
michael@0 | 225 | impl->addCanonIterPropertyStarts(&sa, status); |
michael@0 | 226 | } |
michael@0 | 227 | break; |
michael@0 | 228 | } |
michael@0 | 229 | #endif |
michael@0 | 230 | case UPROPS_SRC_CASE: |
michael@0 | 231 | ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
michael@0 | 232 | break; |
michael@0 | 233 | case UPROPS_SRC_BIDI: |
michael@0 | 234 | ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); |
michael@0 | 235 | break; |
michael@0 | 236 | default: |
michael@0 | 237 | status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 238 | break; |
michael@0 | 239 | } |
michael@0 | 240 | |
michael@0 | 241 | if (U_FAILURE(status)) { |
michael@0 | 242 | delete incl; |
michael@0 | 243 | incl = NULL; |
michael@0 | 244 | return; |
michael@0 | 245 | } |
michael@0 | 246 | // Compact for caching |
michael@0 | 247 | incl->compact(); |
michael@0 | 248 | ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
michael@0 | 249 | } |
michael@0 | 250 | |
michael@0 | 251 | |
michael@0 | 252 | |
michael@0 | 253 | const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { |
michael@0 | 254 | U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); |
michael@0 | 255 | Inclusion &i = gInclusions[src]; |
michael@0 | 256 | umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status); |
michael@0 | 257 | return i.fSet; |
michael@0 | 258 | } |
michael@0 | 259 | |
michael@0 | 260 | |
michael@0 | 261 | // Cache some sets for other services -------------------------------------- *** |
michael@0 | 262 | void U_CALLCONV createUni32Set(UErrorCode &errorCode) { |
michael@0 | 263 | U_ASSERT(uni32Singleton == NULL); |
michael@0 | 264 | uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); |
michael@0 | 265 | if(uni32Singleton==NULL) { |
michael@0 | 266 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 267 | } else { |
michael@0 | 268 | uni32Singleton->freeze(); |
michael@0 | 269 | } |
michael@0 | 270 | ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
michael@0 | 271 | } |
michael@0 | 272 | |
michael@0 | 273 | |
michael@0 | 274 | U_CFUNC UnicodeSet * |
michael@0 | 275 | uniset_getUnicode32Instance(UErrorCode &errorCode) { |
michael@0 | 276 | umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); |
michael@0 | 277 | return uni32Singleton; |
michael@0 | 278 | } |
michael@0 | 279 | |
michael@0 | 280 | // helper functions for matching of pattern syntax pieces ------------------ *** |
michael@0 | 281 | // these functions are parallel to the PERL_OPEN etc. strings above |
michael@0 | 282 | |
michael@0 | 283 | // using these functions is not only faster than UnicodeString::compare() and |
michael@0 | 284 | // caseCompare(), but they also make UnicodeSet work for simple patterns when |
michael@0 | 285 | // no Unicode properties data is available - when caseCompare() fails |
michael@0 | 286 | |
michael@0 | 287 | static inline UBool |
michael@0 | 288 | isPerlOpen(const UnicodeString &pattern, int32_t pos) { |
michael@0 | 289 | UChar c; |
michael@0 | 290 | return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | /*static inline UBool |
michael@0 | 294 | isPerlClose(const UnicodeString &pattern, int32_t pos) { |
michael@0 | 295 | return pattern.charAt(pos)==CLOSE_BRACE; |
michael@0 | 296 | }*/ |
michael@0 | 297 | |
michael@0 | 298 | static inline UBool |
michael@0 | 299 | isNameOpen(const UnicodeString &pattern, int32_t pos) { |
michael@0 | 300 | return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; |
michael@0 | 301 | } |
michael@0 | 302 | |
michael@0 | 303 | static inline UBool |
michael@0 | 304 | isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { |
michael@0 | 305 | return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; |
michael@0 | 306 | } |
michael@0 | 307 | |
michael@0 | 308 | /*static inline UBool |
michael@0 | 309 | isPOSIXClose(const UnicodeString &pattern, int32_t pos) { |
michael@0 | 310 | return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; |
michael@0 | 311 | }*/ |
michael@0 | 312 | |
// TODO: uniset.cpp provides memory debugging; it could be made available
// here as well, but it is probably obsolete now that modern memory leak
// checker tools exist. Until then this hook expands to nothing.
#define _dbgct(me)
michael@0 | 317 | |
michael@0 | 318 | //---------------------------------------------------------------- |
michael@0 | 319 | // Constructors &c |
michael@0 | 320 | //---------------------------------------------------------------- |
michael@0 | 321 | |
michael@0 | 322 | /** |
michael@0 | 323 | * Constructs a set from the given pattern, optionally ignoring |
michael@0 | 324 | * white space. See the class description for the syntax of the |
michael@0 | 325 | * pattern language. |
michael@0 | 326 | * @param pattern a string specifying what characters are in the set |
michael@0 | 327 | */ |
michael@0 | 328 | UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
michael@0 | 329 | UErrorCode& status) : |
michael@0 | 330 | len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
michael@0 | 331 | bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 332 | fFlags(0) |
michael@0 | 333 | { |
michael@0 | 334 | if(U_SUCCESS(status)){ |
michael@0 | 335 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 336 | /* test for NULL */ |
michael@0 | 337 | if(list == NULL) { |
michael@0 | 338 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 339 | }else{ |
michael@0 | 340 | allocateStrings(status); |
michael@0 | 341 | applyPattern(pattern, status); |
michael@0 | 342 | } |
michael@0 | 343 | } |
michael@0 | 344 | _dbgct(this); |
michael@0 | 345 | } |
michael@0 | 346 | |
michael@0 | 347 | //---------------------------------------------------------------- |
michael@0 | 348 | // Public API |
michael@0 | 349 | //---------------------------------------------------------------- |
michael@0 | 350 | |
michael@0 | 351 | UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
michael@0 | 352 | UErrorCode& status) { |
michael@0 | 353 | // Equivalent to |
michael@0 | 354 | // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); |
michael@0 | 355 | // but without dependency on closeOver(). |
michael@0 | 356 | ParsePosition pos(0); |
michael@0 | 357 | applyPatternIgnoreSpace(pattern, pos, NULL, status); |
michael@0 | 358 | if (U_FAILURE(status)) return *this; |
michael@0 | 359 | |
michael@0 | 360 | int32_t i = pos.getIndex(); |
michael@0 | 361 | // Skip over trailing whitespace |
michael@0 | 362 | ICU_Utility::skipWhitespace(pattern, i, TRUE); |
michael@0 | 363 | if (i != pattern.length()) { |
michael@0 | 364 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 365 | } |
michael@0 | 366 | return *this; |
michael@0 | 367 | } |
michael@0 | 368 | |
michael@0 | 369 | void |
michael@0 | 370 | UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, |
michael@0 | 371 | ParsePosition& pos, |
michael@0 | 372 | const SymbolTable* symbols, |
michael@0 | 373 | UErrorCode& status) { |
michael@0 | 374 | if (U_FAILURE(status)) { |
michael@0 | 375 | return; |
michael@0 | 376 | } |
michael@0 | 377 | if (isFrozen()) { |
michael@0 | 378 | status = U_NO_WRITE_PERMISSION; |
michael@0 | 379 | return; |
michael@0 | 380 | } |
michael@0 | 381 | // Need to build the pattern in a temporary string because |
michael@0 | 382 | // _applyPattern calls add() etc., which set pat to empty. |
michael@0 | 383 | UnicodeString rebuiltPat; |
michael@0 | 384 | RuleCharacterIterator chars(pattern, symbols, pos); |
michael@0 | 385 | applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); |
michael@0 | 386 | if (U_FAILURE(status)) return; |
michael@0 | 387 | if (chars.inVariable()) { |
michael@0 | 388 | // syntaxError(chars, "Extra chars in variable value"); |
michael@0 | 389 | status = U_MALFORMED_SET; |
michael@0 | 390 | return; |
michael@0 | 391 | } |
michael@0 | 392 | setPattern(rebuiltPat); |
michael@0 | 393 | } |
michael@0 | 394 | |
michael@0 | 395 | /** |
michael@0 | 396 | * Return true if the given position, in the given pattern, appears |
michael@0 | 397 | * to be the start of a UnicodeSet pattern. |
michael@0 | 398 | */ |
michael@0 | 399 | UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { |
michael@0 | 400 | return ((pos+1) < pattern.length() && |
michael@0 | 401 | pattern.charAt(pos) == (UChar)91/*[*/) || |
michael@0 | 402 | resemblesPropertyPattern(pattern, pos); |
michael@0 | 403 | } |
michael@0 | 404 | |
michael@0 | 405 | //---------------------------------------------------------------- |
michael@0 | 406 | // Implementation: Pattern parsing |
michael@0 | 407 | //---------------------------------------------------------------- |
michael@0 | 408 | |
michael@0 | 409 | /** |
michael@0 | 410 | * A small all-inline class to manage a UnicodeSet pointer. Add |
michael@0 | 411 | * operator->() etc. as needed. |
michael@0 | 412 | */ |
michael@0 | 413 | class UnicodeSetPointer { |
michael@0 | 414 | UnicodeSet* p; |
michael@0 | 415 | public: |
michael@0 | 416 | inline UnicodeSetPointer() : p(0) {} |
michael@0 | 417 | inline ~UnicodeSetPointer() { delete p; } |
michael@0 | 418 | inline UnicodeSet* pointer() { return p; } |
michael@0 | 419 | inline UBool allocate() { |
michael@0 | 420 | if (p == 0) { |
michael@0 | 421 | p = new UnicodeSet(); |
michael@0 | 422 | } |
michael@0 | 423 | return p != 0; |
michael@0 | 424 | } |
michael@0 | 425 | }; |
michael@0 | 426 | |
michael@0 | 427 | /** |
michael@0 | 428 | * Parse the pattern from the given RuleCharacterIterator. The |
michael@0 | 429 | * iterator is advanced over the parsed pattern. |
michael@0 | 430 | * @param chars iterator over the pattern characters. Upon return |
michael@0 | 431 | * it will be advanced to the first character after the parsed |
michael@0 | 432 | * pattern, or the end of the iteration if all characters are |
michael@0 | 433 | * parsed. |
michael@0 | 434 | * @param symbols symbol table to use to parse and dereference |
michael@0 | 435 | * variables, or null if none. |
michael@0 | 436 | * @param rebuiltPat the pattern that was parsed, rebuilt or |
michael@0 | 437 | * copied from the input pattern, as appropriate. |
michael@0 | 438 | * @param options a bit mask of zero or more of the following: |
michael@0 | 439 | * IGNORE_SPACE, CASE. |
michael@0 | 440 | */ |
michael@0 | 441 | void UnicodeSet::applyPattern(RuleCharacterIterator& chars, |
michael@0 | 442 | const SymbolTable* symbols, |
michael@0 | 443 | UnicodeString& rebuiltPat, |
michael@0 | 444 | uint32_t options, |
michael@0 | 445 | UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), |
michael@0 | 446 | UErrorCode& ec) { |
michael@0 | 447 | if (U_FAILURE(ec)) return; |
michael@0 | 448 | |
michael@0 | 449 | // Syntax characters: [ ] ^ - & { } |
michael@0 | 450 | |
michael@0 | 451 | // Recognized special forms for chars, sets: c-c s-s s&s |
michael@0 | 452 | |
michael@0 | 453 | int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | |
michael@0 | 454 | RuleCharacterIterator::PARSE_ESCAPES; |
michael@0 | 455 | if ((options & USET_IGNORE_SPACE) != 0) { |
michael@0 | 456 | opts |= RuleCharacterIterator::SKIP_WHITESPACE; |
michael@0 | 457 | } |
michael@0 | 458 | |
michael@0 | 459 | UnicodeString patLocal, buf; |
michael@0 | 460 | UBool usePat = FALSE; |
michael@0 | 461 | UnicodeSetPointer scratch; |
michael@0 | 462 | RuleCharacterIterator::Pos backup; |
michael@0 | 463 | |
michael@0 | 464 | // mode: 0=before [, 1=between [...], 2=after ] |
michael@0 | 465 | // lastItem: 0=none, 1=char, 2=set |
michael@0 | 466 | int8_t lastItem = 0, mode = 0; |
michael@0 | 467 | UChar32 lastChar = 0; |
michael@0 | 468 | UChar op = 0; |
michael@0 | 469 | |
michael@0 | 470 | UBool invert = FALSE; |
michael@0 | 471 | |
michael@0 | 472 | clear(); |
michael@0 | 473 | |
michael@0 | 474 | while (mode != 2 && !chars.atEnd()) { |
michael@0 | 475 | U_ASSERT((lastItem == 0 && op == 0) || |
michael@0 | 476 | (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || |
michael@0 | 477 | (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || |
michael@0 | 478 | op == INTERSECTION /*'&'*/))); |
michael@0 | 479 | |
michael@0 | 480 | UChar32 c = 0; |
michael@0 | 481 | UBool literal = FALSE; |
michael@0 | 482 | UnicodeSet* nested = 0; // alias - do not delete |
michael@0 | 483 | |
michael@0 | 484 | // -------- Check for property pattern |
michael@0 | 485 | |
michael@0 | 486 | // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed |
michael@0 | 487 | int8_t setMode = 0; |
michael@0 | 488 | if (resemblesPropertyPattern(chars, opts)) { |
michael@0 | 489 | setMode = 2; |
michael@0 | 490 | } |
michael@0 | 491 | |
michael@0 | 492 | // -------- Parse '[' of opening delimiter OR nested set. |
michael@0 | 493 | // If there is a nested set, use `setMode' to define how |
michael@0 | 494 | // the set should be parsed. If the '[' is part of the |
michael@0 | 495 | // opening delimiter for this pattern, parse special |
michael@0 | 496 | // strings "[", "[^", "[-", and "[^-". Check for stand-in |
michael@0 | 497 | // characters representing a nested set in the symbol |
michael@0 | 498 | // table. |
michael@0 | 499 | |
michael@0 | 500 | else { |
michael@0 | 501 | // Prepare to backup if necessary |
michael@0 | 502 | chars.getPos(backup); |
michael@0 | 503 | c = chars.next(opts, literal, ec); |
michael@0 | 504 | if (U_FAILURE(ec)) return; |
michael@0 | 505 | |
michael@0 | 506 | if (c == 0x5B /*'['*/ && !literal) { |
michael@0 | 507 | if (mode == 1) { |
michael@0 | 508 | chars.setPos(backup); // backup |
michael@0 | 509 | setMode = 1; |
michael@0 | 510 | } else { |
michael@0 | 511 | // Handle opening '[' delimiter |
michael@0 | 512 | mode = 1; |
michael@0 | 513 | patLocal.append((UChar) 0x5B /*'['*/); |
michael@0 | 514 | chars.getPos(backup); // prepare to backup |
michael@0 | 515 | c = chars.next(opts, literal, ec); |
michael@0 | 516 | if (U_FAILURE(ec)) return; |
michael@0 | 517 | if (c == 0x5E /*'^'*/ && !literal) { |
michael@0 | 518 | invert = TRUE; |
michael@0 | 519 | patLocal.append((UChar) 0x5E /*'^'*/); |
michael@0 | 520 | chars.getPos(backup); // prepare to backup |
michael@0 | 521 | c = chars.next(opts, literal, ec); |
michael@0 | 522 | if (U_FAILURE(ec)) return; |
michael@0 | 523 | } |
michael@0 | 524 | // Fall through to handle special leading '-'; |
michael@0 | 525 | // otherwise restart loop for nested [], \p{}, etc. |
michael@0 | 526 | if (c == HYPHEN /*'-'*/) { |
michael@0 | 527 | literal = TRUE; |
michael@0 | 528 | // Fall through to handle literal '-' below |
michael@0 | 529 | } else { |
michael@0 | 530 | chars.setPos(backup); // backup |
michael@0 | 531 | continue; |
michael@0 | 532 | } |
michael@0 | 533 | } |
michael@0 | 534 | } else if (symbols != 0) { |
michael@0 | 535 | const UnicodeFunctor *m = symbols->lookupMatcher(c); |
michael@0 | 536 | if (m != 0) { |
michael@0 | 537 | const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); |
michael@0 | 538 | if (ms == NULL) { |
michael@0 | 539 | ec = U_MALFORMED_SET; |
michael@0 | 540 | return; |
michael@0 | 541 | } |
michael@0 | 542 | // casting away const, but `nested' won't be modified |
michael@0 | 543 | // (important not to modify stored set) |
michael@0 | 544 | nested = const_cast<UnicodeSet*>(ms); |
michael@0 | 545 | setMode = 3; |
michael@0 | 546 | } |
michael@0 | 547 | } |
michael@0 | 548 | } |
michael@0 | 549 | |
michael@0 | 550 | // -------- Handle a nested set. This either is inline in |
michael@0 | 551 | // the pattern or represented by a stand-in that has |
michael@0 | 552 | // previously been parsed and was looked up in the symbol |
michael@0 | 553 | // table. |
michael@0 | 554 | |
michael@0 | 555 | if (setMode != 0) { |
michael@0 | 556 | if (lastItem == 1) { |
michael@0 | 557 | if (op != 0) { |
michael@0 | 558 | // syntaxError(chars, "Char expected after operator"); |
michael@0 | 559 | ec = U_MALFORMED_SET; |
michael@0 | 560 | return; |
michael@0 | 561 | } |
michael@0 | 562 | add(lastChar, lastChar); |
michael@0 | 563 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 564 | lastItem = 0; |
michael@0 | 565 | op = 0; |
michael@0 | 566 | } |
michael@0 | 567 | |
michael@0 | 568 | if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { |
michael@0 | 569 | patLocal.append(op); |
michael@0 | 570 | } |
michael@0 | 571 | |
michael@0 | 572 | if (nested == 0) { |
michael@0 | 573 | // lazy allocation |
michael@0 | 574 | if (!scratch.allocate()) { |
michael@0 | 575 | ec = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 576 | return; |
michael@0 | 577 | } |
michael@0 | 578 | nested = scratch.pointer(); |
michael@0 | 579 | } |
michael@0 | 580 | switch (setMode) { |
michael@0 | 581 | case 1: |
michael@0 | 582 | nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); |
michael@0 | 583 | break; |
michael@0 | 584 | case 2: |
michael@0 | 585 | chars.skipIgnored(opts); |
michael@0 | 586 | nested->applyPropertyPattern(chars, patLocal, ec); |
michael@0 | 587 | if (U_FAILURE(ec)) return; |
michael@0 | 588 | break; |
michael@0 | 589 | case 3: // `nested' already parsed |
michael@0 | 590 | nested->_toPattern(patLocal, FALSE); |
michael@0 | 591 | break; |
michael@0 | 592 | } |
michael@0 | 593 | |
michael@0 | 594 | usePat = TRUE; |
michael@0 | 595 | |
michael@0 | 596 | if (mode == 0) { |
michael@0 | 597 | // Entire pattern is a category; leave parse loop |
michael@0 | 598 | *this = *nested; |
michael@0 | 599 | mode = 2; |
michael@0 | 600 | break; |
michael@0 | 601 | } |
michael@0 | 602 | |
michael@0 | 603 | switch (op) { |
michael@0 | 604 | case HYPHEN: /*'-'*/ |
michael@0 | 605 | removeAll(*nested); |
michael@0 | 606 | break; |
michael@0 | 607 | case INTERSECTION: /*'&'*/ |
michael@0 | 608 | retainAll(*nested); |
michael@0 | 609 | break; |
michael@0 | 610 | case 0: |
michael@0 | 611 | addAll(*nested); |
michael@0 | 612 | break; |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | op = 0; |
michael@0 | 616 | lastItem = 2; |
michael@0 | 617 | |
michael@0 | 618 | continue; |
michael@0 | 619 | } |
michael@0 | 620 | |
michael@0 | 621 | if (mode == 0) { |
michael@0 | 622 | // syntaxError(chars, "Missing '['"); |
michael@0 | 623 | ec = U_MALFORMED_SET; |
michael@0 | 624 | return; |
michael@0 | 625 | } |
michael@0 | 626 | |
michael@0 | 627 | // -------- Parse special (syntax) characters. If the |
michael@0 | 628 | // current character is not special, or if it is escaped, |
michael@0 | 629 | // then fall through and handle it below. |
michael@0 | 630 | |
michael@0 | 631 | if (!literal) { |
michael@0 | 632 | switch (c) { |
michael@0 | 633 | case 0x5D /*']'*/: |
michael@0 | 634 | if (lastItem == 1) { |
michael@0 | 635 | add(lastChar, lastChar); |
michael@0 | 636 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 637 | } |
michael@0 | 638 | // Treat final trailing '-' as a literal |
michael@0 | 639 | if (op == HYPHEN /*'-'*/) { |
michael@0 | 640 | add(op, op); |
michael@0 | 641 | patLocal.append(op); |
michael@0 | 642 | } else if (op == INTERSECTION /*'&'*/) { |
michael@0 | 643 | // syntaxError(chars, "Trailing '&'"); |
michael@0 | 644 | ec = U_MALFORMED_SET; |
michael@0 | 645 | return; |
michael@0 | 646 | } |
michael@0 | 647 | patLocal.append((UChar) 0x5D /*']'*/); |
michael@0 | 648 | mode = 2; |
michael@0 | 649 | continue; |
michael@0 | 650 | case HYPHEN /*'-'*/: |
michael@0 | 651 | if (op == 0) { |
michael@0 | 652 | if (lastItem != 0) { |
michael@0 | 653 | op = (UChar) c; |
michael@0 | 654 | continue; |
michael@0 | 655 | } else { |
michael@0 | 656 | // Treat final trailing '-' as a literal |
michael@0 | 657 | add(c, c); |
michael@0 | 658 | c = chars.next(opts, literal, ec); |
michael@0 | 659 | if (U_FAILURE(ec)) return; |
michael@0 | 660 | if (c == 0x5D /*']'*/ && !literal) { |
michael@0 | 661 | patLocal.append(HYPHEN_RIGHT_BRACE, 2); |
michael@0 | 662 | mode = 2; |
michael@0 | 663 | continue; |
michael@0 | 664 | } |
michael@0 | 665 | } |
michael@0 | 666 | } |
michael@0 | 667 | // syntaxError(chars, "'-' not after char or set"); |
michael@0 | 668 | ec = U_MALFORMED_SET; |
michael@0 | 669 | return; |
michael@0 | 670 | case INTERSECTION /*'&'*/: |
michael@0 | 671 | if (lastItem == 2 && op == 0) { |
michael@0 | 672 | op = (UChar) c; |
michael@0 | 673 | continue; |
michael@0 | 674 | } |
michael@0 | 675 | // syntaxError(chars, "'&' not after set"); |
michael@0 | 676 | ec = U_MALFORMED_SET; |
michael@0 | 677 | return; |
michael@0 | 678 | case 0x5E /*'^'*/: |
michael@0 | 679 | // syntaxError(chars, "'^' not after '['"); |
michael@0 | 680 | ec = U_MALFORMED_SET; |
michael@0 | 681 | return; |
michael@0 | 682 | case 0x7B /*'{'*/: |
michael@0 | 683 | if (op != 0) { |
michael@0 | 684 | // syntaxError(chars, "Missing operand after operator"); |
michael@0 | 685 | ec = U_MALFORMED_SET; |
michael@0 | 686 | return; |
michael@0 | 687 | } |
michael@0 | 688 | if (lastItem == 1) { |
michael@0 | 689 | add(lastChar, lastChar); |
michael@0 | 690 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 691 | } |
michael@0 | 692 | lastItem = 0; |
michael@0 | 693 | buf.truncate(0); |
michael@0 | 694 | { |
michael@0 | 695 | UBool ok = FALSE; |
michael@0 | 696 | while (!chars.atEnd()) { |
michael@0 | 697 | c = chars.next(opts, literal, ec); |
michael@0 | 698 | if (U_FAILURE(ec)) return; |
michael@0 | 699 | if (c == 0x7D /*'}'*/ && !literal) { |
michael@0 | 700 | ok = TRUE; |
michael@0 | 701 | break; |
michael@0 | 702 | } |
michael@0 | 703 | buf.append(c); |
michael@0 | 704 | } |
michael@0 | 705 | if (buf.length() < 1 || !ok) { |
michael@0 | 706 | // syntaxError(chars, "Invalid multicharacter string"); |
michael@0 | 707 | ec = U_MALFORMED_SET; |
michael@0 | 708 | return; |
michael@0 | 709 | } |
michael@0 | 710 | } |
michael@0 | 711 | // We have new string. Add it to set and continue; |
michael@0 | 712 | // we don't need to drop through to the further |
michael@0 | 713 | // processing |
michael@0 | 714 | add(buf); |
michael@0 | 715 | patLocal.append((UChar) 0x7B /*'{'*/); |
michael@0 | 716 | _appendToPat(patLocal, buf, FALSE); |
michael@0 | 717 | patLocal.append((UChar) 0x7D /*'}'*/); |
michael@0 | 718 | continue; |
michael@0 | 719 | case SymbolTable::SYMBOL_REF: |
michael@0 | 720 | // symbols nosymbols |
michael@0 | 721 | // [a-$] error error (ambiguous) |
michael@0 | 722 | // [a$] anchor anchor |
michael@0 | 723 | // [a-$x] var "x"* literal '$' |
michael@0 | 724 | // [a-$.] error literal '$' |
michael@0 | 725 | // *We won't get here in the case of var "x" |
michael@0 | 726 | { |
michael@0 | 727 | chars.getPos(backup); |
michael@0 | 728 | c = chars.next(opts, literal, ec); |
michael@0 | 729 | if (U_FAILURE(ec)) return; |
michael@0 | 730 | UBool anchor = (c == 0x5D /*']'*/ && !literal); |
michael@0 | 731 | if (symbols == 0 && !anchor) { |
michael@0 | 732 | c = SymbolTable::SYMBOL_REF; |
michael@0 | 733 | chars.setPos(backup); |
michael@0 | 734 | break; // literal '$' |
michael@0 | 735 | } |
michael@0 | 736 | if (anchor && op == 0) { |
michael@0 | 737 | if (lastItem == 1) { |
michael@0 | 738 | add(lastChar, lastChar); |
michael@0 | 739 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 740 | } |
michael@0 | 741 | add(U_ETHER); |
michael@0 | 742 | usePat = TRUE; |
michael@0 | 743 | patLocal.append((UChar) SymbolTable::SYMBOL_REF); |
michael@0 | 744 | patLocal.append((UChar) 0x5D /*']'*/); |
michael@0 | 745 | mode = 2; |
michael@0 | 746 | continue; |
michael@0 | 747 | } |
michael@0 | 748 | // syntaxError(chars, "Unquoted '$'"); |
michael@0 | 749 | ec = U_MALFORMED_SET; |
michael@0 | 750 | return; |
michael@0 | 751 | } |
michael@0 | 752 | default: |
michael@0 | 753 | break; |
michael@0 | 754 | } |
michael@0 | 755 | } |
michael@0 | 756 | |
michael@0 | 757 | // -------- Parse literal characters. This includes both |
michael@0 | 758 | // escaped chars ("\u4E01") and non-syntax characters |
michael@0 | 759 | // ("a"). |
michael@0 | 760 | |
michael@0 | 761 | switch (lastItem) { |
michael@0 | 762 | case 0: |
michael@0 | 763 | lastItem = 1; |
michael@0 | 764 | lastChar = c; |
michael@0 | 765 | break; |
michael@0 | 766 | case 1: |
michael@0 | 767 | if (op == HYPHEN /*'-'*/) { |
michael@0 | 768 | if (lastChar >= c) { |
michael@0 | 769 | // Don't allow redundant (a-a) or empty (b-a) ranges; |
michael@0 | 770 | // these are most likely typos. |
michael@0 | 771 | // syntaxError(chars, "Invalid range"); |
michael@0 | 772 | ec = U_MALFORMED_SET; |
michael@0 | 773 | return; |
michael@0 | 774 | } |
michael@0 | 775 | add(lastChar, c); |
michael@0 | 776 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 777 | patLocal.append(op); |
michael@0 | 778 | _appendToPat(patLocal, c, FALSE); |
michael@0 | 779 | lastItem = 0; |
michael@0 | 780 | op = 0; |
michael@0 | 781 | } else { |
michael@0 | 782 | add(lastChar, lastChar); |
michael@0 | 783 | _appendToPat(patLocal, lastChar, FALSE); |
michael@0 | 784 | lastChar = c; |
michael@0 | 785 | } |
michael@0 | 786 | break; |
michael@0 | 787 | case 2: |
michael@0 | 788 | if (op != 0) { |
michael@0 | 789 | // syntaxError(chars, "Set expected after operator"); |
michael@0 | 790 | ec = U_MALFORMED_SET; |
michael@0 | 791 | return; |
michael@0 | 792 | } |
michael@0 | 793 | lastChar = c; |
michael@0 | 794 | lastItem = 1; |
michael@0 | 795 | break; |
michael@0 | 796 | } |
michael@0 | 797 | } |
michael@0 | 798 | |
michael@0 | 799 | if (mode != 2) { |
michael@0 | 800 | // syntaxError(chars, "Missing ']'"); |
michael@0 | 801 | ec = U_MALFORMED_SET; |
michael@0 | 802 | return; |
michael@0 | 803 | } |
michael@0 | 804 | |
michael@0 | 805 | chars.skipIgnored(opts); |
michael@0 | 806 | |
michael@0 | 807 | /** |
michael@0 | 808 | * Handle global flags (invert, case insensitivity). If this |
michael@0 | 809 | * pattern should be compiled case-insensitive, then we need |
michael@0 | 810 | * to close over case BEFORE COMPLEMENTING. This makes |
michael@0 | 811 | * patterns like /[^abc]/i work. |
michael@0 | 812 | */ |
michael@0 | 813 | if ((options & USET_CASE_INSENSITIVE) != 0) { |
michael@0 | 814 | (this->*caseClosure)(USET_CASE_INSENSITIVE); |
michael@0 | 815 | } |
michael@0 | 816 | else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { |
michael@0 | 817 | (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); |
michael@0 | 818 | } |
michael@0 | 819 | if (invert) { |
michael@0 | 820 | complement(); |
michael@0 | 821 | } |
michael@0 | 822 | |
michael@0 | 823 | // Use the rebuilt pattern (patLocal) only if necessary. Prefer the |
michael@0 | 824 | // generated pattern. |
michael@0 | 825 | if (usePat) { |
michael@0 | 826 | rebuiltPat.append(patLocal); |
michael@0 | 827 | } else { |
michael@0 | 828 | _generatePattern(rebuiltPat, FALSE); |
michael@0 | 829 | } |
michael@0 | 830 | if (isBogus() && U_SUCCESS(ec)) { |
michael@0 | 831 | // We likely ran out of memory. AHHH! |
michael@0 | 832 | ec = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 833 | } |
michael@0 | 834 | } |
michael@0 | 835 | |
michael@0 | 836 | //---------------------------------------------------------------- |
michael@0 | 837 | // Property set implementation |
michael@0 | 838 | //---------------------------------------------------------------- |
michael@0 | 839 | |
michael@0 | 840 | static UBool numericValueFilter(UChar32 ch, void* context) { |
michael@0 | 841 | return u_getNumericValue(ch) == *(double*)context; |
michael@0 | 842 | } |
michael@0 | 843 | |
michael@0 | 844 | static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { |
michael@0 | 845 | int32_t value = *(int32_t*)context; |
michael@0 | 846 | return (U_GET_GC_MASK((UChar32) ch) & value) != 0; |
michael@0 | 847 | } |
michael@0 | 848 | |
michael@0 | 849 | static UBool versionFilter(UChar32 ch, void* context) { |
michael@0 | 850 | static const UVersionInfo none = { 0, 0, 0, 0 }; |
michael@0 | 851 | UVersionInfo v; |
michael@0 | 852 | u_charAge(ch, v); |
michael@0 | 853 | UVersionInfo* version = (UVersionInfo*)context; |
michael@0 | 854 | return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; |
michael@0 | 855 | } |
michael@0 | 856 | |
michael@0 | 857 | typedef struct { |
michael@0 | 858 | UProperty prop; |
michael@0 | 859 | int32_t value; |
michael@0 | 860 | } IntPropertyContext; |
michael@0 | 861 | |
michael@0 | 862 | static UBool intPropertyFilter(UChar32 ch, void* context) { |
michael@0 | 863 | IntPropertyContext* c = (IntPropertyContext*)context; |
michael@0 | 864 | return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; |
michael@0 | 865 | } |
michael@0 | 866 | |
michael@0 | 867 | static UBool scriptExtensionsFilter(UChar32 ch, void* context) { |
michael@0 | 868 | return uscript_hasScript(ch, *(UScriptCode*)context); |
michael@0 | 869 | } |
michael@0 | 870 | |
michael@0 | 871 | /** |
michael@0 | 872 | * Generic filter-based scanning code for UCD property UnicodeSets. |
michael@0 | 873 | */ |
michael@0 | 874 | void UnicodeSet::applyFilter(UnicodeSet::Filter filter, |
michael@0 | 875 | void* context, |
michael@0 | 876 | int32_t src, |
michael@0 | 877 | UErrorCode &status) { |
michael@0 | 878 | if (U_FAILURE(status)) return; |
michael@0 | 879 | |
michael@0 | 880 | // Logically, walk through all Unicode characters, noting the start |
michael@0 | 881 | // and end of each range for which filter.contain(c) is |
michael@0 | 882 | // true. Add each range to a set. |
michael@0 | 883 | // |
michael@0 | 884 | // To improve performance, use an inclusions set which |
michael@0 | 885 | // encodes information about character ranges that are known |
michael@0 | 886 | // to have identical properties. |
michael@0 | 887 | // getInclusions(src) contains exactly the first characters of |
michael@0 | 888 | // same-value ranges for the given properties "source". |
michael@0 | 889 | const UnicodeSet* inclusions = getInclusions(src, status); |
michael@0 | 890 | if (U_FAILURE(status)) { |
michael@0 | 891 | return; |
michael@0 | 892 | } |
michael@0 | 893 | |
michael@0 | 894 | clear(); |
michael@0 | 895 | |
michael@0 | 896 | UChar32 startHasProperty = -1; |
michael@0 | 897 | int32_t limitRange = inclusions->getRangeCount(); |
michael@0 | 898 | |
michael@0 | 899 | for (int j=0; j<limitRange; ++j) { |
michael@0 | 900 | // get current range |
michael@0 | 901 | UChar32 start = inclusions->getRangeStart(j); |
michael@0 | 902 | UChar32 end = inclusions->getRangeEnd(j); |
michael@0 | 903 | |
michael@0 | 904 | // for all the code points in the range, process |
michael@0 | 905 | for (UChar32 ch = start; ch <= end; ++ch) { |
michael@0 | 906 | // only add to this UnicodeSet on inflection points -- |
michael@0 | 907 | // where the hasProperty value changes to false |
michael@0 | 908 | if ((*filter)(ch, context)) { |
michael@0 | 909 | if (startHasProperty < 0) { |
michael@0 | 910 | startHasProperty = ch; |
michael@0 | 911 | } |
michael@0 | 912 | } else if (startHasProperty >= 0) { |
michael@0 | 913 | add(startHasProperty, ch-1); |
michael@0 | 914 | startHasProperty = -1; |
michael@0 | 915 | } |
michael@0 | 916 | } |
michael@0 | 917 | } |
michael@0 | 918 | if (startHasProperty >= 0) { |
michael@0 | 919 | add((UChar32)startHasProperty, (UChar32)0x10FFFF); |
michael@0 | 920 | } |
michael@0 | 921 | if (isBogus() && U_SUCCESS(status)) { |
michael@0 | 922 | // We likely ran out of memory. AHHH! |
michael@0 | 923 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 924 | } |
michael@0 | 925 | } |
michael@0 | 926 | |
michael@0 | 927 | static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { |
michael@0 | 928 | /* Note: we use ' ' in compiler code page */ |
michael@0 | 929 | int32_t j = 0; |
michael@0 | 930 | char ch; |
michael@0 | 931 | --dstCapacity; /* make room for term. zero */ |
michael@0 | 932 | while ((ch = *src++) != 0) { |
michael@0 | 933 | if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { |
michael@0 | 934 | continue; |
michael@0 | 935 | } |
michael@0 | 936 | if (j >= dstCapacity) return FALSE; |
michael@0 | 937 | dst[j++] = ch; |
michael@0 | 938 | } |
michael@0 | 939 | if (j > 0 && dst[j-1] == ' ') --j; |
michael@0 | 940 | dst[j] = 0; |
michael@0 | 941 | return TRUE; |
michael@0 | 942 | } |
michael@0 | 943 | |
michael@0 | 944 | //---------------------------------------------------------------- |
michael@0 | 945 | // Property set API |
michael@0 | 946 | //---------------------------------------------------------------- |
michael@0 | 947 | |
michael@0 | 948 | #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} |
michael@0 | 949 | |
michael@0 | 950 | UnicodeSet& |
michael@0 | 951 | UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { |
michael@0 | 952 | if (U_FAILURE(ec) || isFrozen()) return *this; |
michael@0 | 953 | |
michael@0 | 954 | if (prop == UCHAR_GENERAL_CATEGORY_MASK) { |
michael@0 | 955 | applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); |
michael@0 | 956 | } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { |
michael@0 | 957 | UScriptCode script = (UScriptCode)value; |
michael@0 | 958 | applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); |
michael@0 | 959 | } else { |
michael@0 | 960 | IntPropertyContext c = {prop, value}; |
michael@0 | 961 | applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); |
michael@0 | 962 | } |
michael@0 | 963 | return *this; |
michael@0 | 964 | } |
michael@0 | 965 | |
michael@0 | 966 | UnicodeSet& |
michael@0 | 967 | UnicodeSet::applyPropertyAlias(const UnicodeString& prop, |
michael@0 | 968 | const UnicodeString& value, |
michael@0 | 969 | UErrorCode& ec) { |
michael@0 | 970 | if (U_FAILURE(ec) || isFrozen()) return *this; |
michael@0 | 971 | |
michael@0 | 972 | // prop and value used to be converted to char * using the default |
michael@0 | 973 | // converter instead of the invariant conversion. |
michael@0 | 974 | // This should not be necessary because all Unicode property and value |
michael@0 | 975 | // names use only invariant characters. |
michael@0 | 976 | // If there are any variant characters, then we won't find them anyway. |
michael@0 | 977 | // Checking first avoids assertion failures in the conversion. |
michael@0 | 978 | if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || |
michael@0 | 979 | !uprv_isInvariantUString(value.getBuffer(), value.length()) |
michael@0 | 980 | ) { |
michael@0 | 981 | FAIL(ec); |
michael@0 | 982 | } |
michael@0 | 983 | CharString pname, vname; |
michael@0 | 984 | pname.appendInvariantChars(prop, ec); |
michael@0 | 985 | vname.appendInvariantChars(value, ec); |
michael@0 | 986 | if (U_FAILURE(ec)) return *this; |
michael@0 | 987 | |
michael@0 | 988 | UProperty p; |
michael@0 | 989 | int32_t v; |
michael@0 | 990 | UBool mustNotBeEmpty = FALSE, invert = FALSE; |
michael@0 | 991 | |
michael@0 | 992 | if (value.length() > 0) { |
michael@0 | 993 | p = u_getPropertyEnum(pname.data()); |
michael@0 | 994 | if (p == UCHAR_INVALID_CODE) FAIL(ec); |
michael@0 | 995 | |
michael@0 | 996 | // Treat gc as gcm |
michael@0 | 997 | if (p == UCHAR_GENERAL_CATEGORY) { |
michael@0 | 998 | p = UCHAR_GENERAL_CATEGORY_MASK; |
michael@0 | 999 | } |
michael@0 | 1000 | |
michael@0 | 1001 | if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || |
michael@0 | 1002 | (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || |
michael@0 | 1003 | (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { |
michael@0 | 1004 | v = u_getPropertyValueEnum(p, vname.data()); |
michael@0 | 1005 | if (v == UCHAR_INVALID_CODE) { |
michael@0 | 1006 | // Handle numeric CCC |
michael@0 | 1007 | if (p == UCHAR_CANONICAL_COMBINING_CLASS || |
michael@0 | 1008 | p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || |
michael@0 | 1009 | p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { |
michael@0 | 1010 | char* end; |
michael@0 | 1011 | double value = uprv_strtod(vname.data(), &end); |
michael@0 | 1012 | v = (int32_t) value; |
michael@0 | 1013 | if (v != value || v < 0 || *end != 0) { |
michael@0 | 1014 | // non-integral or negative value, or trailing junk |
michael@0 | 1015 | FAIL(ec); |
michael@0 | 1016 | } |
michael@0 | 1017 | // If the resultant set is empty then the numeric value |
michael@0 | 1018 | // was invalid. |
michael@0 | 1019 | mustNotBeEmpty = TRUE; |
michael@0 | 1020 | } else { |
michael@0 | 1021 | FAIL(ec); |
michael@0 | 1022 | } |
michael@0 | 1023 | } |
michael@0 | 1024 | } |
michael@0 | 1025 | |
michael@0 | 1026 | else { |
michael@0 | 1027 | |
michael@0 | 1028 | switch (p) { |
michael@0 | 1029 | case UCHAR_NUMERIC_VALUE: |
michael@0 | 1030 | { |
michael@0 | 1031 | char* end; |
michael@0 | 1032 | double value = uprv_strtod(vname.data(), &end); |
michael@0 | 1033 | if (*end != 0) { |
michael@0 | 1034 | FAIL(ec); |
michael@0 | 1035 | } |
michael@0 | 1036 | applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); |
michael@0 | 1037 | return *this; |
michael@0 | 1038 | } |
michael@0 | 1039 | case UCHAR_NAME: |
michael@0 | 1040 | { |
michael@0 | 1041 | // Must munge name, since u_charFromName() does not do |
michael@0 | 1042 | // 'loose' matching. |
michael@0 | 1043 | char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength |
michael@0 | 1044 | if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); |
michael@0 | 1045 | UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); |
michael@0 | 1046 | if (U_SUCCESS(ec)) { |
michael@0 | 1047 | clear(); |
michael@0 | 1048 | add(ch); |
michael@0 | 1049 | return *this; |
michael@0 | 1050 | } else { |
michael@0 | 1051 | FAIL(ec); |
michael@0 | 1052 | } |
michael@0 | 1053 | } |
michael@0 | 1054 | case UCHAR_UNICODE_1_NAME: |
michael@0 | 1055 | // ICU 49 deprecates the Unicode_1_Name property APIs. |
michael@0 | 1056 | FAIL(ec); |
michael@0 | 1057 | case UCHAR_AGE: |
michael@0 | 1058 | { |
michael@0 | 1059 | // Must munge name, since u_versionFromString() does not do |
michael@0 | 1060 | // 'loose' matching. |
michael@0 | 1061 | char buf[128]; |
michael@0 | 1062 | if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); |
michael@0 | 1063 | UVersionInfo version; |
michael@0 | 1064 | u_versionFromString(version, buf); |
michael@0 | 1065 | applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); |
michael@0 | 1066 | return *this; |
michael@0 | 1067 | } |
michael@0 | 1068 | case UCHAR_SCRIPT_EXTENSIONS: |
michael@0 | 1069 | v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); |
michael@0 | 1070 | if (v == UCHAR_INVALID_CODE) { |
michael@0 | 1071 | FAIL(ec); |
michael@0 | 1072 | } |
michael@0 | 1073 | // fall through to calling applyIntPropertyValue() |
michael@0 | 1074 | break; |
michael@0 | 1075 | default: |
michael@0 | 1076 | // p is a non-binary, non-enumerated property that we |
michael@0 | 1077 | // don't support (yet). |
michael@0 | 1078 | FAIL(ec); |
michael@0 | 1079 | } |
michael@0 | 1080 | } |
michael@0 | 1081 | } |
michael@0 | 1082 | |
michael@0 | 1083 | else { |
michael@0 | 1084 | // value is empty. Interpret as General Category, Script, or |
michael@0 | 1085 | // Binary property. |
michael@0 | 1086 | p = UCHAR_GENERAL_CATEGORY_MASK; |
michael@0 | 1087 | v = u_getPropertyValueEnum(p, pname.data()); |
michael@0 | 1088 | if (v == UCHAR_INVALID_CODE) { |
michael@0 | 1089 | p = UCHAR_SCRIPT; |
michael@0 | 1090 | v = u_getPropertyValueEnum(p, pname.data()); |
michael@0 | 1091 | if (v == UCHAR_INVALID_CODE) { |
michael@0 | 1092 | p = u_getPropertyEnum(pname.data()); |
michael@0 | 1093 | if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { |
michael@0 | 1094 | v = 1; |
michael@0 | 1095 | } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { |
michael@0 | 1096 | set(MIN_VALUE, MAX_VALUE); |
michael@0 | 1097 | return *this; |
michael@0 | 1098 | } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { |
michael@0 | 1099 | set(0, 0x7F); |
michael@0 | 1100 | return *this; |
michael@0 | 1101 | } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { |
michael@0 | 1102 | // [:Assigned:]=[:^Cn:] |
michael@0 | 1103 | p = UCHAR_GENERAL_CATEGORY_MASK; |
michael@0 | 1104 | v = U_GC_CN_MASK; |
michael@0 | 1105 | invert = TRUE; |
michael@0 | 1106 | } else { |
michael@0 | 1107 | FAIL(ec); |
michael@0 | 1108 | } |
michael@0 | 1109 | } |
michael@0 | 1110 | } |
michael@0 | 1111 | } |
michael@0 | 1112 | |
michael@0 | 1113 | applyIntPropertyValue(p, v, ec); |
michael@0 | 1114 | if(invert) { |
michael@0 | 1115 | complement(); |
michael@0 | 1116 | } |
michael@0 | 1117 | |
michael@0 | 1118 | if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { |
michael@0 | 1119 | // mustNotBeEmpty is set to true if an empty set indicates |
michael@0 | 1120 | // invalid input. |
michael@0 | 1121 | ec = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1122 | } |
michael@0 | 1123 | |
michael@0 | 1124 | if (isBogus() && U_SUCCESS(ec)) { |
michael@0 | 1125 | // We likely ran out of memory. AHHH! |
michael@0 | 1126 | ec = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1127 | } |
michael@0 | 1128 | return *this; |
michael@0 | 1129 | } |
michael@0 | 1130 | |
michael@0 | 1131 | //---------------------------------------------------------------- |
michael@0 | 1132 | // Property set patterns |
michael@0 | 1133 | //---------------------------------------------------------------- |
michael@0 | 1134 | |
michael@0 | 1135 | /** |
michael@0 | 1136 | * Return true if the given position, in the given pattern, appears |
michael@0 | 1137 | * to be the start of a property set pattern. |
michael@0 | 1138 | */ |
michael@0 | 1139 | UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, |
michael@0 | 1140 | int32_t pos) { |
michael@0 | 1141 | // Patterns are at least 5 characters long |
michael@0 | 1142 | if ((pos+5) > pattern.length()) { |
michael@0 | 1143 | return FALSE; |
michael@0 | 1144 | } |
michael@0 | 1145 | |
michael@0 | 1146 | // Look for an opening [:, [:^, \p, or \P |
michael@0 | 1147 | return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); |
michael@0 | 1148 | } |
michael@0 | 1149 | |
michael@0 | 1150 | /** |
michael@0 | 1151 | * Return true if the given iterator appears to point at a |
michael@0 | 1152 | * property pattern. Regardless of the result, return with the |
michael@0 | 1153 | * iterator unchanged. |
michael@0 | 1154 | * @param chars iterator over the pattern characters. Upon return |
michael@0 | 1155 | * it will be unchanged. |
michael@0 | 1156 | * @param iterOpts RuleCharacterIterator options |
michael@0 | 1157 | */ |
michael@0 | 1158 | UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, |
michael@0 | 1159 | int32_t iterOpts) { |
michael@0 | 1160 | // NOTE: literal will always be FALSE, because we don't parse escapes. |
michael@0 | 1161 | UBool result = FALSE, literal; |
michael@0 | 1162 | UErrorCode ec = U_ZERO_ERROR; |
michael@0 | 1163 | iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; |
michael@0 | 1164 | RuleCharacterIterator::Pos pos; |
michael@0 | 1165 | chars.getPos(pos); |
michael@0 | 1166 | UChar32 c = chars.next(iterOpts, literal, ec); |
michael@0 | 1167 | if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { |
michael@0 | 1168 | UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, |
michael@0 | 1169 | literal, ec); |
michael@0 | 1170 | result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : |
michael@0 | 1171 | (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); |
michael@0 | 1172 | } |
michael@0 | 1173 | chars.setPos(pos); |
michael@0 | 1174 | return result && U_SUCCESS(ec); |
michael@0 | 1175 | } |
michael@0 | 1176 | |
michael@0 | 1177 | /** |
michael@0 | 1178 | * Parse the given property pattern at the given parse position. |
michael@0 | 1179 | */ |
michael@0 | 1180 | UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, |
michael@0 | 1181 | ParsePosition& ppos, |
michael@0 | 1182 | UErrorCode &ec) { |
michael@0 | 1183 | int32_t pos = ppos.getIndex(); |
michael@0 | 1184 | |
michael@0 | 1185 | UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} |
michael@0 | 1186 | UBool isName = FALSE; // true for \N{pat}, o/w false |
michael@0 | 1187 | UBool invert = FALSE; |
michael@0 | 1188 | |
michael@0 | 1189 | if (U_FAILURE(ec)) return *this; |
michael@0 | 1190 | |
michael@0 | 1191 | // Minimum length is 5 characters, e.g. \p{L} |
michael@0 | 1192 | if ((pos+5) > pattern.length()) { |
michael@0 | 1193 | FAIL(ec); |
michael@0 | 1194 | } |
michael@0 | 1195 | |
michael@0 | 1196 | // On entry, ppos should point to one of the following locations: |
michael@0 | 1197 | // Look for an opening [:, [:^, \p, or \P |
michael@0 | 1198 | if (isPOSIXOpen(pattern, pos)) { |
michael@0 | 1199 | posix = TRUE; |
michael@0 | 1200 | pos += 2; |
michael@0 | 1201 | pos = ICU_Utility::skipWhitespace(pattern, pos); |
michael@0 | 1202 | if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { |
michael@0 | 1203 | ++pos; |
michael@0 | 1204 | invert = TRUE; |
michael@0 | 1205 | } |
michael@0 | 1206 | } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { |
michael@0 | 1207 | UChar c = pattern.charAt(pos+1); |
michael@0 | 1208 | invert = (c == UPPER_P); |
michael@0 | 1209 | isName = (c == UPPER_N); |
michael@0 | 1210 | pos += 2; |
michael@0 | 1211 | pos = ICU_Utility::skipWhitespace(pattern, pos); |
michael@0 | 1212 | if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { |
michael@0 | 1213 | // Syntax error; "\p" or "\P" not followed by "{" |
michael@0 | 1214 | FAIL(ec); |
michael@0 | 1215 | } |
michael@0 | 1216 | } else { |
michael@0 | 1217 | // Open delimiter not seen |
michael@0 | 1218 | FAIL(ec); |
michael@0 | 1219 | } |
michael@0 | 1220 | |
michael@0 | 1221 | // Look for the matching close delimiter, either :] or } |
michael@0 | 1222 | int32_t close; |
michael@0 | 1223 | if (posix) { |
michael@0 | 1224 | close = pattern.indexOf(POSIX_CLOSE, 2, pos); |
michael@0 | 1225 | } else { |
michael@0 | 1226 | close = pattern.indexOf(CLOSE_BRACE, pos); |
michael@0 | 1227 | } |
michael@0 | 1228 | if (close < 0) { |
michael@0 | 1229 | // Syntax error; close delimiter missing |
michael@0 | 1230 | FAIL(ec); |
michael@0 | 1231 | } |
michael@0 | 1232 | |
michael@0 | 1233 | // Look for an '=' sign. If this is present, we will parse a |
michael@0 | 1234 | // medium \p{gc=Cf} or long \p{GeneralCategory=Format} |
michael@0 | 1235 | // pattern. |
michael@0 | 1236 | int32_t equals = pattern.indexOf(EQUALS, pos); |
michael@0 | 1237 | UnicodeString propName, valueName; |
michael@0 | 1238 | if (equals >= 0 && equals < close && !isName) { |
michael@0 | 1239 | // Equals seen; parse medium/long pattern |
michael@0 | 1240 | pattern.extractBetween(pos, equals, propName); |
michael@0 | 1241 | pattern.extractBetween(equals+1, close, valueName); |
michael@0 | 1242 | } |
michael@0 | 1243 | |
michael@0 | 1244 | else { |
michael@0 | 1245 | // Handle case where no '=' is seen, and \N{} |
michael@0 | 1246 | pattern.extractBetween(pos, close, propName); |
michael@0 | 1247 | |
michael@0 | 1248 | // Handle \N{name} |
michael@0 | 1249 | if (isName) { |
michael@0 | 1250 | // This is a little inefficient since it means we have to |
michael@0 | 1251 | // parse NAME_PROP back to UCHAR_NAME even though we already |
michael@0 | 1252 | // know it's UCHAR_NAME. If we refactor the API to |
michael@0 | 1253 | // support args of (UProperty, char*) then we can remove |
michael@0 | 1254 | // NAME_PROP and make this a little more efficient. |
michael@0 | 1255 | valueName = propName; |
michael@0 | 1256 | propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); |
michael@0 | 1257 | } |
michael@0 | 1258 | } |
michael@0 | 1259 | |
michael@0 | 1260 | applyPropertyAlias(propName, valueName, ec); |
michael@0 | 1261 | |
michael@0 | 1262 | if (U_SUCCESS(ec)) { |
michael@0 | 1263 | if (invert) { |
michael@0 | 1264 | complement(); |
michael@0 | 1265 | } |
michael@0 | 1266 | |
michael@0 | 1267 | // Move to the limit position after the close delimiter if the |
michael@0 | 1268 | // parse succeeded. |
michael@0 | 1269 | ppos.setIndex(close + (posix ? 2 : 1)); |
michael@0 | 1270 | } |
michael@0 | 1271 | |
michael@0 | 1272 | return *this; |
michael@0 | 1273 | } |
michael@0 | 1274 | |
michael@0 | 1275 | /** |
michael@0 | 1276 | * Parse a property pattern. |
michael@0 | 1277 | * @param chars iterator over the pattern characters. Upon return |
michael@0 | 1278 | * it will be advanced to the first character after the parsed |
michael@0 | 1279 | * pattern, or the end of the iteration if all characters are |
michael@0 | 1280 | * parsed. |
michael@0 | 1281 | * @param rebuiltPat the pattern that was parsed, rebuilt or |
michael@0 | 1282 | * copied from the input pattern, as appropriate. |
michael@0 | 1283 | */ |
michael@0 | 1284 | void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, |
michael@0 | 1285 | UnicodeString& rebuiltPat, |
michael@0 | 1286 | UErrorCode& ec) { |
michael@0 | 1287 | if (U_FAILURE(ec)) return; |
michael@0 | 1288 | UnicodeString pattern; |
michael@0 | 1289 | chars.lookahead(pattern); |
michael@0 | 1290 | ParsePosition pos(0); |
michael@0 | 1291 | applyPropertyPattern(pattern, pos, ec); |
michael@0 | 1292 | if (U_FAILURE(ec)) return; |
michael@0 | 1293 | if (pos.getIndex() == 0) { |
michael@0 | 1294 | // syntaxError(chars, "Invalid property pattern"); |
michael@0 | 1295 | ec = U_MALFORMED_SET; |
michael@0 | 1296 | return; |
michael@0 | 1297 | } |
michael@0 | 1298 | chars.jumpahead(pos.getIndex()); |
michael@0 | 1299 | rebuiltPat.append(pattern, 0, pos.getIndex()); |
michael@0 | 1300 | } |
michael@0 | 1301 | |
michael@0 | 1302 | U_NAMESPACE_END |