intl/icu/source/common/uniset_props.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 1999-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: uniset_props.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004aug25
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * Character property dependent functions moved here from uniset.cpp
michael@0 17 */
michael@0 18
michael@0 19 #include "unicode/utypes.h"
michael@0 20 #include "unicode/uniset.h"
michael@0 21 #include "unicode/parsepos.h"
michael@0 22 #include "unicode/uchar.h"
michael@0 23 #include "unicode/uscript.h"
michael@0 24 #include "unicode/symtable.h"
michael@0 25 #include "unicode/uset.h"
michael@0 26 #include "unicode/locid.h"
michael@0 27 #include "unicode/brkiter.h"
michael@0 28 #include "uset_imp.h"
michael@0 29 #include "ruleiter.h"
michael@0 30 #include "cmemory.h"
michael@0 31 #include "ucln_cmn.h"
michael@0 32 #include "util.h"
michael@0 33 #include "uvector.h"
michael@0 34 #include "uprops.h"
michael@0 35 #include "propname.h"
michael@0 36 #include "normalizer2impl.h"
michael@0 37 #include "ucase.h"
michael@0 38 #include "ubidi_props.h"
michael@0 39 #include "uinvchar.h"
michael@0 40 #include "uprops.h"
michael@0 41 #include "charstr.h"
michael@0 42 #include "cstring.h"
michael@0 43 #include "mutex.h"
michael@0 44 #include "umutex.h"
michael@0 45 #include "uassert.h"
michael@0 46 #include "hash.h"
michael@0 47
michael@0 48 U_NAMESPACE_USE
michael@0 49
michael@0 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 51
michael@0 52 // initial storage. Must be >= 0
michael@0 53 // *** same as in uniset.cpp ! ***
michael@0 54 #define START_EXTRA 16
michael@0 55
michael@0 56 // Define UChar constants using hex for EBCDIC compatibility
michael@0 57 // Used #define to reduce private static exports and memory access time.
michael@0 58 #define SET_OPEN ((UChar)0x005B) /*[*/
michael@0 59 #define SET_CLOSE ((UChar)0x005D) /*]*/
michael@0 60 #define HYPHEN ((UChar)0x002D) /*-*/
michael@0 61 #define COMPLEMENT ((UChar)0x005E) /*^*/
michael@0 62 #define COLON ((UChar)0x003A) /*:*/
michael@0 63 #define BACKSLASH ((UChar)0x005C) /*\*/
michael@0 64 #define INTERSECTION ((UChar)0x0026) /*&*/
michael@0 65 #define UPPER_U ((UChar)0x0055) /*U*/
michael@0 66 #define LOWER_U ((UChar)0x0075) /*u*/
michael@0 67 #define OPEN_BRACE ((UChar)123) /*{*/
michael@0 68 #define CLOSE_BRACE ((UChar)125) /*}*/
michael@0 69 #define UPPER_P ((UChar)0x0050) /*P*/
michael@0 70 #define LOWER_P ((UChar)0x0070) /*p*/
michael@0 71 #define UPPER_N ((UChar)78) /*N*/
michael@0 72 #define EQUALS ((UChar)0x003D) /*=*/
michael@0 73
michael@0 74 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
michael@0 75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
michael@0 76 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
michael@0 77 //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
michael@0 78 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
michael@0 79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
michael@0 80
michael@0 81 // Special property set IDs
michael@0 82 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
michael@0 83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
michael@0 84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
michael@0 85
michael@0 86 // Unicode name property alias
michael@0 87 #define NAME_PROP "na"
michael@0 88 #define NAME_PROP_LENGTH 2
michael@0 89
michael@0 90 /**
michael@0 91 * Delimiter string used in patterns to close a category reference:
michael@0 92 * ":]". Example: "[:Lu:]".
michael@0 93 */
michael@0 94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
michael@0 95
michael@0 96 // Cached sets ------------------------------------------------------------- ***
michael@0 97
michael@0 98 U_CDECL_BEGIN
michael@0 99 static UBool U_CALLCONV uset_cleanup();
michael@0 100
michael@0 101 struct Inclusion {
michael@0 102 UnicodeSet *fSet;
michael@0 103 UInitOnce fInitOnce;
michael@0 104 };
michael@0 105 static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
michael@0 106
michael@0 107 static UnicodeSet *uni32Singleton;
michael@0 108 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
michael@0 109
michael@0 110 //----------------------------------------------------------------
michael@0 111 // Inclusions list
michael@0 112 //----------------------------------------------------------------
michael@0 113
michael@0 114 // USetAdder implementation
michael@0 115 // Does not use uset.h to reduce code dependencies
michael@0 116 static void U_CALLCONV
michael@0 117 _set_add(USet *set, UChar32 c) {
michael@0 118 ((UnicodeSet *)set)->add(c);
michael@0 119 }
michael@0 120
michael@0 121 static void U_CALLCONV
michael@0 122 _set_addRange(USet *set, UChar32 start, UChar32 end) {
michael@0 123 ((UnicodeSet *)set)->add(start, end);
michael@0 124 }
michael@0 125
michael@0 126 static void U_CALLCONV
michael@0 127 _set_addString(USet *set, const UChar *str, int32_t length) {
michael@0 128 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
michael@0 129 }
michael@0 130
michael@0 131 /**
michael@0 132 * Cleanup function for UnicodeSet
michael@0 133 */
michael@0 134 static UBool U_CALLCONV uset_cleanup(void) {
michael@0 135 for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
michael@0 136 Inclusion &in = gInclusions[i];
michael@0 137 delete in.fSet;
michael@0 138 in.fSet = NULL;
michael@0 139 in.fInitOnce.reset();
michael@0 140 }
michael@0 141
michael@0 142 delete uni32Singleton;
michael@0 143 uni32Singleton = NULL;
michael@0 144 uni32InitOnce.reset();
michael@0 145 return TRUE;
michael@0 146 }
michael@0 147
michael@0 148 U_CDECL_END
michael@0 149
michael@0 150 U_NAMESPACE_BEGIN
michael@0 151
michael@0 152 /*
michael@0 153 Reduce excessive reallocation, and make it easier to detect initialization problems.
michael@0 154 Usually you don't see smaller sets than this for Unicode 5.0.
michael@0 155 */
michael@0 156 #define DEFAULT_INCLUSION_CAPACITY 3072
michael@0 157
michael@0 158 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
michael@0 159 // This function is invoked only via umtx_initOnce().
michael@0 160 // This function is a friend of class UnicodeSet.
michael@0 161
michael@0 162 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
michael@0 163 UnicodeSet * &incl = gInclusions[src].fSet;
michael@0 164 U_ASSERT(incl == NULL);
michael@0 165
michael@0 166 incl = new UnicodeSet();
michael@0 167 if (incl == NULL) {
michael@0 168 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 169 return;
michael@0 170 }
michael@0 171 USetAdder sa = {
michael@0 172 (USet *)incl,
michael@0 173 _set_add,
michael@0 174 _set_addRange,
michael@0 175 _set_addString,
michael@0 176 NULL, // don't need remove()
michael@0 177 NULL // don't need removeRange()
michael@0 178 };
michael@0 179
michael@0 180 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
michael@0 181 switch(src) {
michael@0 182 case UPROPS_SRC_CHAR:
michael@0 183 uchar_addPropertyStarts(&sa, &status);
michael@0 184 break;
michael@0 185 case UPROPS_SRC_PROPSVEC:
michael@0 186 upropsvec_addPropertyStarts(&sa, &status);
michael@0 187 break;
michael@0 188 case UPROPS_SRC_CHAR_AND_PROPSVEC:
michael@0 189 uchar_addPropertyStarts(&sa, &status);
michael@0 190 upropsvec_addPropertyStarts(&sa, &status);
michael@0 191 break;
michael@0 192 #if !UCONFIG_NO_NORMALIZATION
michael@0 193 case UPROPS_SRC_CASE_AND_NORM: {
michael@0 194 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
michael@0 195 if(U_SUCCESS(status)) {
michael@0 196 impl->addPropertyStarts(&sa, status);
michael@0 197 }
michael@0 198 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
michael@0 199 break;
michael@0 200 }
michael@0 201 case UPROPS_SRC_NFC: {
michael@0 202 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
michael@0 203 if(U_SUCCESS(status)) {
michael@0 204 impl->addPropertyStarts(&sa, status);
michael@0 205 }
michael@0 206 break;
michael@0 207 }
michael@0 208 case UPROPS_SRC_NFKC: {
michael@0 209 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
michael@0 210 if(U_SUCCESS(status)) {
michael@0 211 impl->addPropertyStarts(&sa, status);
michael@0 212 }
michael@0 213 break;
michael@0 214 }
michael@0 215 case UPROPS_SRC_NFKC_CF: {
michael@0 216 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
michael@0 217 if(U_SUCCESS(status)) {
michael@0 218 impl->addPropertyStarts(&sa, status);
michael@0 219 }
michael@0 220 break;
michael@0 221 }
michael@0 222 case UPROPS_SRC_NFC_CANON_ITER: {
michael@0 223 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
michael@0 224 if(U_SUCCESS(status)) {
michael@0 225 impl->addCanonIterPropertyStarts(&sa, status);
michael@0 226 }
michael@0 227 break;
michael@0 228 }
michael@0 229 #endif
michael@0 230 case UPROPS_SRC_CASE:
michael@0 231 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
michael@0 232 break;
michael@0 233 case UPROPS_SRC_BIDI:
michael@0 234 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
michael@0 235 break;
michael@0 236 default:
michael@0 237 status = U_INTERNAL_PROGRAM_ERROR;
michael@0 238 break;
michael@0 239 }
michael@0 240
michael@0 241 if (U_FAILURE(status)) {
michael@0 242 delete incl;
michael@0 243 incl = NULL;
michael@0 244 return;
michael@0 245 }
michael@0 246 // Compact for caching
michael@0 247 incl->compact();
michael@0 248 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
michael@0 249 }
michael@0 250
michael@0 251
michael@0 252
michael@0 253 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
michael@0 254 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
michael@0 255 Inclusion &i = gInclusions[src];
michael@0 256 umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status);
michael@0 257 return i.fSet;
michael@0 258 }
michael@0 259
michael@0 260
michael@0 261 // Cache some sets for other services -------------------------------------- ***
michael@0 262 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
michael@0 263 U_ASSERT(uni32Singleton == NULL);
michael@0 264 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
michael@0 265 if(uni32Singleton==NULL) {
michael@0 266 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 267 } else {
michael@0 268 uni32Singleton->freeze();
michael@0 269 }
michael@0 270 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
michael@0 271 }
michael@0 272
michael@0 273
michael@0 274 U_CFUNC UnicodeSet *
michael@0 275 uniset_getUnicode32Instance(UErrorCode &errorCode) {
michael@0 276 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
michael@0 277 return uni32Singleton;
michael@0 278 }
michael@0 279
michael@0 280 // helper functions for matching of pattern syntax pieces ------------------ ***
michael@0 281 // these functions are parallel to the PERL_OPEN etc. strings above
michael@0 282
michael@0 283 // using these functions is not only faster than UnicodeString::compare() and
michael@0 284 // caseCompare(), but they also make UnicodeSet work for simple patterns when
michael@0 285 // no Unicode properties data is available - when caseCompare() fails
michael@0 286
michael@0 287 static inline UBool
michael@0 288 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
michael@0 289 UChar c;
michael@0 290 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
michael@0 291 }
michael@0 292
michael@0 293 /*static inline UBool
michael@0 294 isPerlClose(const UnicodeString &pattern, int32_t pos) {
michael@0 295 return pattern.charAt(pos)==CLOSE_BRACE;
michael@0 296 }*/
michael@0 297
michael@0 298 static inline UBool
michael@0 299 isNameOpen(const UnicodeString &pattern, int32_t pos) {
michael@0 300 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
michael@0 301 }
michael@0 302
michael@0 303 static inline UBool
michael@0 304 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
michael@0 305 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
michael@0 306 }
michael@0 307
michael@0 308 /*static inline UBool
michael@0 309 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
michael@0 310 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
michael@0 311 }*/
michael@0 312
michael@0 313 // TODO memory debugging provided inside uniset.cpp
michael@0 314 // could be made available here but probably obsolete with use of modern
michael@0 315 // memory leak checker tools
michael@0 316 #define _dbgct(me)
michael@0 317
michael@0 318 //----------------------------------------------------------------
michael@0 319 // Constructors &c
michael@0 320 //----------------------------------------------------------------
michael@0 321
michael@0 322 /**
michael@0 323 * Constructs a set from the given pattern, optionally ignoring
michael@0 324 * white space. See the class description for the syntax of the
michael@0 325 * pattern language.
michael@0 326 * @param pattern a string specifying what characters are in the set
michael@0 327 */
michael@0 328 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
michael@0 329 UErrorCode& status) :
michael@0 330 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
michael@0 331 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0 332 fFlags(0)
michael@0 333 {
michael@0 334 if(U_SUCCESS(status)){
michael@0 335 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0 336 /* test for NULL */
michael@0 337 if(list == NULL) {
michael@0 338 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 339 }else{
michael@0 340 allocateStrings(status);
michael@0 341 applyPattern(pattern, status);
michael@0 342 }
michael@0 343 }
michael@0 344 _dbgct(this);
michael@0 345 }
michael@0 346
michael@0 347 //----------------------------------------------------------------
michael@0 348 // Public API
michael@0 349 //----------------------------------------------------------------
michael@0 350
michael@0 351 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
michael@0 352 UErrorCode& status) {
michael@0 353 // Equivalent to
michael@0 354 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
michael@0 355 // but without dependency on closeOver().
michael@0 356 ParsePosition pos(0);
michael@0 357 applyPatternIgnoreSpace(pattern, pos, NULL, status);
michael@0 358 if (U_FAILURE(status)) return *this;
michael@0 359
michael@0 360 int32_t i = pos.getIndex();
michael@0 361 // Skip over trailing whitespace
michael@0 362 ICU_Utility::skipWhitespace(pattern, i, TRUE);
michael@0 363 if (i != pattern.length()) {
michael@0 364 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 365 }
michael@0 366 return *this;
michael@0 367 }
michael@0 368
michael@0 369 void
michael@0 370 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
michael@0 371 ParsePosition& pos,
michael@0 372 const SymbolTable* symbols,
michael@0 373 UErrorCode& status) {
michael@0 374 if (U_FAILURE(status)) {
michael@0 375 return;
michael@0 376 }
michael@0 377 if (isFrozen()) {
michael@0 378 status = U_NO_WRITE_PERMISSION;
michael@0 379 return;
michael@0 380 }
michael@0 381 // Need to build the pattern in a temporary string because
michael@0 382 // _applyPattern calls add() etc., which set pat to empty.
michael@0 383 UnicodeString rebuiltPat;
michael@0 384 RuleCharacterIterator chars(pattern, symbols, pos);
michael@0 385 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);
michael@0 386 if (U_FAILURE(status)) return;
michael@0 387 if (chars.inVariable()) {
michael@0 388 // syntaxError(chars, "Extra chars in variable value");
michael@0 389 status = U_MALFORMED_SET;
michael@0 390 return;
michael@0 391 }
michael@0 392 setPattern(rebuiltPat);
michael@0 393 }
michael@0 394
michael@0 395 /**
michael@0 396 * Return true if the given position, in the given pattern, appears
michael@0 397 * to be the start of a UnicodeSet pattern.
michael@0 398 */
michael@0 399 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
michael@0 400 return ((pos+1) < pattern.length() &&
michael@0 401 pattern.charAt(pos) == (UChar)91/*[*/) ||
michael@0 402 resemblesPropertyPattern(pattern, pos);
michael@0 403 }
michael@0 404
michael@0 405 //----------------------------------------------------------------
michael@0 406 // Implementation: Pattern parsing
michael@0 407 //----------------------------------------------------------------
michael@0 408
michael@0 409 /**
michael@0 410 * A small all-inline class to manage a UnicodeSet pointer. Add
michael@0 411 * operator->() etc. as needed.
michael@0 412 */
michael@0 413 class UnicodeSetPointer {
michael@0 414 UnicodeSet* p;
michael@0 415 public:
michael@0 416 inline UnicodeSetPointer() : p(0) {}
michael@0 417 inline ~UnicodeSetPointer() { delete p; }
michael@0 418 inline UnicodeSet* pointer() { return p; }
michael@0 419 inline UBool allocate() {
michael@0 420 if (p == 0) {
michael@0 421 p = new UnicodeSet();
michael@0 422 }
michael@0 423 return p != 0;
michael@0 424 }
michael@0 425 };
michael@0 426
michael@0 427 /**
michael@0 428 * Parse the pattern from the given RuleCharacterIterator. The
michael@0 429 * iterator is advanced over the parsed pattern.
michael@0 430 * @param chars iterator over the pattern characters. Upon return
michael@0 431 * it will be advanced to the first character after the parsed
michael@0 432 * pattern, or the end of the iteration if all characters are
michael@0 433 * parsed.
michael@0 434 * @param symbols symbol table to use to parse and dereference
michael@0 435 * variables, or null if none.
michael@0 436 * @param rebuiltPat the pattern that was parsed, rebuilt or
michael@0 437 * copied from the input pattern, as appropriate.
michael@0 438 * @param options a bit mask of zero or more of the following:
michael@0 439 * IGNORE_SPACE, CASE.
michael@0 440 */
// Core set-pattern parser. Clears this set and then consumes characters from
// `chars` until the pattern's closing ']' has been read, or until a property
// pattern / symbol-table stand-in that forms the entire pattern has been
// applied. The pattern text is appended to rebuiltPat.
// On a syntax error ec is set to U_MALFORMED_SET and the function returns
// immediately; this set is not restored, so it may hold a partial result.
// caseClosure is only invoked when options contains USET_CASE_INSENSITIVE or
// USET_ADD_CASE_MAPPINGS.
michael@0 441 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
michael@0 442 const SymbolTable* symbols,
michael@0 443 UnicodeString& rebuiltPat,
michael@0 444 uint32_t options,
michael@0 445 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
michael@0 446 UErrorCode& ec) {
michael@0 447 if (U_FAILURE(ec)) return;
michael@0 448
michael@0 449 // Syntax characters: [ ] ^ - & { }
michael@0 450
michael@0 451 // Recognized special forms for chars, sets: c-c s-s s&s
michael@0 452
michael@0 453 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
michael@0 454 RuleCharacterIterator::PARSE_ESCAPES;
michael@0 455 if ((options & USET_IGNORE_SPACE) != 0) {
michael@0 456 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
michael@0 457 }
michael@0 458
// Parser state:
//   patLocal - pattern text accumulated while parsing
//   buf      - contents of the {multi-character string} being read
//   usePat   - TRUE once patLocal (rather than a generated pattern) must be
//              used for rebuiltPat
//   scratch  - owns the temporary set used for inline nested sets
//   backup   - saved iterator position for one-character lookahead
michael@0 459 UnicodeString patLocal, buf;
michael@0 460 UBool usePat = FALSE;
michael@0 461 UnicodeSetPointer scratch;
michael@0 462 RuleCharacterIterator::Pos backup;
michael@0 463
michael@0 464 // mode: 0=before [, 1=between [...], 2=after ]
michael@0 465 // lastItem: 0=none, 1=char, 2=set
michael@0 466 int8_t lastItem = 0, mode = 0;
michael@0 467 UChar32 lastChar = 0;
michael@0 468 UChar op = 0;
michael@0 469
michael@0 470 UBool invert = FALSE;
michael@0 471
michael@0 472 clear();
michael@0 473
// Main loop: each iteration handles one item (nested set, syntax character,
// multi-character string, or literal character). A pending literal is held
// in lastChar and only added to the set once the following item shows
// whether it starts a range.
michael@0 474 while (mode != 2 && !chars.atEnd()) {
michael@0 475 U_ASSERT((lastItem == 0 && op == 0) ||
michael@0 476 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
michael@0 477 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
michael@0 478 op == INTERSECTION /*'&'*/)));
michael@0 479
michael@0 480 UChar32 c = 0;
michael@0 481 UBool literal = FALSE;
michael@0 482 UnicodeSet* nested = 0; // alias - do not delete
michael@0 483
michael@0 484 // -------- Check for property pattern
michael@0 485
michael@0 486 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
michael@0 487 int8_t setMode = 0;
michael@0 488 if (resemblesPropertyPattern(chars, opts)) {
michael@0 489 setMode = 2;
michael@0 490 }
michael@0 491
michael@0 492 // -------- Parse '[' of opening delimiter OR nested set.
michael@0 493 // If there is a nested set, use `setMode' to define how
michael@0 494 // the set should be parsed. If the '[' is part of the
michael@0 495 // opening delimiter for this pattern, parse special
michael@0 496 // strings "[", "[^", "[-", and "[^-". Check for stand-in
michael@0 497 // characters representing a nested set in the symbol
michael@0 498 // table.
michael@0 499
michael@0 500 else {
michael@0 501 // Prepare to backup if necessary
michael@0 502 chars.getPos(backup);
michael@0 503 c = chars.next(opts, literal, ec);
michael@0 504 if (U_FAILURE(ec)) return;
michael@0 505
michael@0 506 if (c == 0x5B /*'['*/ && !literal) {
michael@0 507 if (mode == 1) {
michael@0 508 chars.setPos(backup); // backup
michael@0 509 setMode = 1;
michael@0 510 } else {
michael@0 511 // Handle opening '[' delimiter
michael@0 512 mode = 1;
michael@0 513 patLocal.append((UChar) 0x5B /*'['*/);
michael@0 514 chars.getPos(backup); // prepare to backup
michael@0 515 c = chars.next(opts, literal, ec);
michael@0 516 if (U_FAILURE(ec)) return;
michael@0 517 if (c == 0x5E /*'^'*/ && !literal) {
michael@0 518 invert = TRUE;
michael@0 519 patLocal.append((UChar) 0x5E /*'^'*/);
michael@0 520 chars.getPos(backup); // prepare to backup
michael@0 521 c = chars.next(opts, literal, ec);
michael@0 522 if (U_FAILURE(ec)) return;
michael@0 523 }
michael@0 524 // Fall through to handle special leading '-';
michael@0 525 // otherwise restart loop for nested [], \p{}, etc.
michael@0 526 if (c == HYPHEN /*'-'*/) {
michael@0 527 literal = TRUE;
michael@0 528 // Fall through to handle literal '-' below
michael@0 529 } else {
michael@0 530 chars.setPos(backup); // backup
michael@0 531 continue;
michael@0 532 }
michael@0 533 }
michael@0 534 } else if (symbols != 0) {
michael@0 535 const UnicodeFunctor *m = symbols->lookupMatcher(c);
michael@0 536 if (m != 0) {
michael@0 537 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
michael@0 538 if (ms == NULL) {
michael@0 539 ec = U_MALFORMED_SET;
michael@0 540 return;
michael@0 541 }
michael@0 542 // casting away const, but `nested' won't be modified
michael@0 543 // (important not to modify stored set)
michael@0 544 nested = const_cast<UnicodeSet*>(ms);
michael@0 545 setMode = 3;
michael@0 546 }
michael@0 547 }
michael@0 548 }
michael@0 549
michael@0 550 // -------- Handle a nested set. This either is inline in
michael@0 551 // the pattern or represented by a stand-in that has
michael@0 552 // previously been parsed and was looked up in the symbol
michael@0 553 // table.
michael@0 554
michael@0 555 if (setMode != 0) {
// A literal still pending from the previous iteration is flushed into the
// set first; an operator between a char and a set is an error.
michael@0 556 if (lastItem == 1) {
michael@0 557 if (op != 0) {
michael@0 558 // syntaxError(chars, "Char expected after operator");
michael@0 559 ec = U_MALFORMED_SET;
michael@0 560 return;
michael@0 561 }
michael@0 562 add(lastChar, lastChar);
michael@0 563 _appendToPat(patLocal, lastChar, FALSE);
michael@0 564 lastItem = 0;
michael@0 565 op = 0;
michael@0 566 }
michael@0 567
michael@0 568 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
michael@0 569 patLocal.append(op);
michael@0 570 }
michael@0 571
michael@0 572 if (nested == 0) {
michael@0 573 // lazy allocation
michael@0 574 if (!scratch.allocate()) {
michael@0 575 ec = U_MEMORY_ALLOCATION_ERROR;
michael@0 576 return;
michael@0 577 }
michael@0 578 nested = scratch.pointer();
michael@0 579 }
michael@0 580 switch (setMode) {
michael@0 581 case 1:
michael@0 582 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
michael@0 583 break;
michael@0 584 case 2:
michael@0 585 chars.skipIgnored(opts);
michael@0 586 nested->applyPropertyPattern(chars, patLocal, ec);
michael@0 587 if (U_FAILURE(ec)) return;
michael@0 588 break;
michael@0 589 case 3: // `nested' already parsed
michael@0 590 nested->_toPattern(patLocal, FALSE);
michael@0 591 break;
michael@0 592 }
michael@0 593
michael@0 594 usePat = TRUE;
michael@0 595
michael@0 596 if (mode == 0) {
michael@0 597 // Entire pattern is a category; leave parse loop
michael@0 598 *this = *nested;
michael@0 599 mode = 2;
michael@0 600 break;
michael@0 601 }
michael@0 602
// Combine the nested set with this set according to the pending operator.
michael@0 603 switch (op) {
michael@0 604 case HYPHEN: /*'-'*/
michael@0 605 removeAll(*nested);
michael@0 606 break;
michael@0 607 case INTERSECTION: /*'&'*/
michael@0 608 retainAll(*nested);
michael@0 609 break;
michael@0 610 case 0:
michael@0 611 addAll(*nested);
michael@0 612 break;
michael@0 613 }
michael@0 614
michael@0 615 op = 0;
michael@0 616 lastItem = 2;
michael@0 617
michael@0 618 continue;
michael@0 619 }
michael@0 620
michael@0 621 if (mode == 0) {
michael@0 622 // syntaxError(chars, "Missing '['");
michael@0 623 ec = U_MALFORMED_SET;
michael@0 624 return;
michael@0 625 }
michael@0 626
michael@0 627 // -------- Parse special (syntax) characters. If the
michael@0 628 // current character is not special, or if it is escaped,
michael@0 629 // then fall through and handle it below.
michael@0 630
michael@0 631 if (!literal) {
michael@0 632 switch (c) {
michael@0 633 case 0x5D /*']'*/:
michael@0 634 if (lastItem == 1) {
michael@0 635 add(lastChar, lastChar);
michael@0 636 _appendToPat(patLocal, lastChar, FALSE);
michael@0 637 }
michael@0 638 // Treat final trailing '-' as a literal
michael@0 639 if (op == HYPHEN /*'-'*/) {
michael@0 640 add(op, op);
michael@0 641 patLocal.append(op);
michael@0 642 } else if (op == INTERSECTION /*'&'*/) {
michael@0 643 // syntaxError(chars, "Trailing '&'");
michael@0 644 ec = U_MALFORMED_SET;
michael@0 645 return;
michael@0 646 }
michael@0 647 patLocal.append((UChar) 0x5D /*']'*/);
michael@0 648 mode = 2;
michael@0 649 continue;
michael@0 650 case HYPHEN /*'-'*/:
michael@0 651 if (op == 0) {
michael@0 652 if (lastItem != 0) {
michael@0 653 op = (UChar) c;
michael@0 654 continue;
michael@0 655 } else {
// No preceding item: the '-' is accepted here only as a literal that is
// immediately followed by the closing ']'. Any other following character
// drops out of this if/else to the error return below.
michael@0 656 // Treat final trailing '-' as a literal
michael@0 657 add(c, c);
michael@0 658 c = chars.next(opts, literal, ec);
michael@0 659 if (U_FAILURE(ec)) return;
michael@0 660 if (c == 0x5D /*']'*/ && !literal) {
michael@0 661 patLocal.append(HYPHEN_RIGHT_BRACE, 2);
michael@0 662 mode = 2;
michael@0 663 continue;
michael@0 664 }
michael@0 665 }
michael@0 666 }
michael@0 667 // syntaxError(chars, "'-' not after char or set");
michael@0 668 ec = U_MALFORMED_SET;
michael@0 669 return;
michael@0 670 case INTERSECTION /*'&'*/:
michael@0 671 if (lastItem == 2 && op == 0) {
michael@0 672 op = (UChar) c;
michael@0 673 continue;
michael@0 674 }
michael@0 675 // syntaxError(chars, "'&' not after set");
michael@0 676 ec = U_MALFORMED_SET;
michael@0 677 return;
michael@0 678 case 0x5E /*'^'*/:
michael@0 679 // syntaxError(chars, "'^' not after '['");
michael@0 680 ec = U_MALFORMED_SET;
michael@0 681 return;
michael@0 682 case 0x7B /*'{'*/:
michael@0 683 if (op != 0) {
michael@0 684 // syntaxError(chars, "Missing operand after operator");
michael@0 685 ec = U_MALFORMED_SET;
michael@0 686 return;
michael@0 687 }
michael@0 688 if (lastItem == 1) {
michael@0 689 add(lastChar, lastChar);
michael@0 690 _appendToPat(patLocal, lastChar, FALSE);
michael@0 691 }
michael@0 692 lastItem = 0;
michael@0 693 buf.truncate(0);
michael@0 694 {
// Collect characters up to the unescaped '}'. An empty string or a missing
// '}' is rejected below.
michael@0 695 UBool ok = FALSE;
michael@0 696 while (!chars.atEnd()) {
michael@0 697 c = chars.next(opts, literal, ec);
michael@0 698 if (U_FAILURE(ec)) return;
michael@0 699 if (c == 0x7D /*'}'*/ && !literal) {
michael@0 700 ok = TRUE;
michael@0 701 break;
michael@0 702 }
michael@0 703 buf.append(c);
michael@0 704 }
michael@0 705 if (buf.length() < 1 || !ok) {
michael@0 706 // syntaxError(chars, "Invalid multicharacter string");
michael@0 707 ec = U_MALFORMED_SET;
michael@0 708 return;
michael@0 709 }
michael@0 710 }
michael@0 711 // We have new string. Add it to set and continue;
michael@0 712 // we don't need to drop through to the further
michael@0 713 // processing
michael@0 714 add(buf);
michael@0 715 patLocal.append((UChar) 0x7B /*'{'*/);
michael@0 716 _appendToPat(patLocal, buf, FALSE);
michael@0 717 patLocal.append((UChar) 0x7D /*'}'*/);
michael@0 718 continue;
michael@0 719 case SymbolTable::SYMBOL_REF:
michael@0 720 // symbols nosymbols
michael@0 721 // [a-$] error error (ambiguous)
michael@0 722 // [a$] anchor anchor
michael@0 723 // [a-$x] var "x"* literal '$'
michael@0 724 // [a-$.] error literal '$'
michael@0 725 // *We won't get here in the case of var "x"
michael@0 726 {
michael@0 727 chars.getPos(backup);
michael@0 728 c = chars.next(opts, literal, ec);
michael@0 729 if (U_FAILURE(ec)) return;
michael@0 730 UBool anchor = (c == 0x5D /*']'*/ && !literal);
michael@0 731 if (symbols == 0 && !anchor) {
michael@0 732 c = SymbolTable::SYMBOL_REF;
michael@0 733 chars.setPos(backup);
michael@0 734 break; // literal '$'
michael@0 735 }
michael@0 736 if (anchor && op == 0) {
michael@0 737 if (lastItem == 1) {
michael@0 738 add(lastChar, lastChar);
michael@0 739 _appendToPat(patLocal, lastChar, FALSE);
michael@0 740 }
michael@0 741 add(U_ETHER);
michael@0 742 usePat = TRUE;
michael@0 743 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
michael@0 744 patLocal.append((UChar) 0x5D /*']'*/);
michael@0 745 mode = 2;
michael@0 746 continue;
michael@0 747 }
michael@0 748 // syntaxError(chars, "Unquoted '$'");
michael@0 749 ec = U_MALFORMED_SET;
michael@0 750 return;
michael@0 751 }
michael@0 752 default:
michael@0 753 break;
michael@0 754 }
michael@0 755 }
michael@0 756
michael@0 757 // -------- Parse literal characters. This includes both
michael@0 758 // escaped chars ("\u4E01") and non-syntax characters
michael@0 759 // ("a").
michael@0 760
michael@0 761 switch (lastItem) {
michael@0 762 case 0:
michael@0 763 lastItem = 1;
michael@0 764 lastChar = c;
michael@0 765 break;
michael@0 766 case 1:
michael@0 767 if (op == HYPHEN /*'-'*/) {
michael@0 768 if (lastChar >= c) {
michael@0 769 // Don't allow redundant (a-a) or empty (b-a) ranges;
michael@0 770 // these are most likely typos.
michael@0 771 // syntaxError(chars, "Invalid range");
michael@0 772 ec = U_MALFORMED_SET;
michael@0 773 return;
michael@0 774 }
michael@0 775 add(lastChar, c);
michael@0 776 _appendToPat(patLocal, lastChar, FALSE);
michael@0 777 patLocal.append(op);
michael@0 778 _appendToPat(patLocal, c, FALSE);
michael@0 779 lastItem = 0;
michael@0 780 op = 0;
michael@0 781 } else {
michael@0 782 add(lastChar, lastChar);
michael@0 783 _appendToPat(patLocal, lastChar, FALSE);
michael@0 784 lastChar = c;
michael@0 785 }
michael@0 786 break;
michael@0 787 case 2:
michael@0 788 if (op != 0) {
michael@0 789 // syntaxError(chars, "Set expected after operator");
michael@0 790 ec = U_MALFORMED_SET;
michael@0 791 return;
michael@0 792 }
michael@0 793 lastChar = c;
michael@0 794 lastItem = 1;
michael@0 795 break;
michael@0 796 }
michael@0 797 }
michael@0 798
// The loop ended without reaching mode 2, i.e. the input ran out before the
// closing ']' was seen.
michael@0 799 if (mode != 2) {
michael@0 800 // syntaxError(chars, "Missing ']'");
michael@0 801 ec = U_MALFORMED_SET;
michael@0 802 return;
michael@0 803 }
michael@0 804
michael@0 805 chars.skipIgnored(opts);
michael@0 806
michael@0 807 /**
michael@0 808 * Handle global flags (invert, case insensitivity). If this
michael@0 809 * pattern should be compiled case-insensitive, then we need
michael@0 810 * to close over case BEFORE COMPLEMENTING. This makes
michael@0 811 * patterns like /[^abc]/i work.
michael@0 812 */
michael@0 813 if ((options & USET_CASE_INSENSITIVE) != 0) {
michael@0 814 (this->*caseClosure)(USET_CASE_INSENSITIVE);
michael@0 815 }
michael@0 816 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
michael@0 817 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
michael@0 818 }
michael@0 819 if (invert) {
michael@0 820 complement();
michael@0 821 }
michael@0 822
michael@0 823 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
michael@0 824 // generated pattern.
michael@0 825 if (usePat) {
michael@0 826 rebuiltPat.append(patLocal);
michael@0 827 } else {
michael@0 828 _generatePattern(rebuiltPat, FALSE);
michael@0 829 }
michael@0 830 if (isBogus() && U_SUCCESS(ec)) {
michael@0 831 // We likely ran out of memory. AHHH!
michael@0 832 ec = U_MEMORY_ALLOCATION_ERROR;
michael@0 833 }
michael@0 834 }
michael@0 835
michael@0 836 //----------------------------------------------------------------
michael@0 837 // Property set implementation
michael@0 838 //----------------------------------------------------------------
michael@0 839
michael@0 840 static UBool numericValueFilter(UChar32 ch, void* context) {
michael@0 841 return u_getNumericValue(ch) == *(double*)context;
michael@0 842 }
michael@0 843
michael@0 844 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
michael@0 845 int32_t value = *(int32_t*)context;
michael@0 846 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
michael@0 847 }
michael@0 848
michael@0 849 static UBool versionFilter(UChar32 ch, void* context) {
michael@0 850 static const UVersionInfo none = { 0, 0, 0, 0 };
michael@0 851 UVersionInfo v;
michael@0 852 u_charAge(ch, v);
michael@0 853 UVersionInfo* version = (UVersionInfo*)context;
michael@0 854 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
michael@0 855 }
michael@0 856
michael@0 857 typedef struct {
michael@0 858 UProperty prop;
michael@0 859 int32_t value;
michael@0 860 } IntPropertyContext;
michael@0 861
michael@0 862 static UBool intPropertyFilter(UChar32 ch, void* context) {
michael@0 863 IntPropertyContext* c = (IntPropertyContext*)context;
michael@0 864 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
michael@0 865 }
michael@0 866
michael@0 867 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
michael@0 868 return uscript_hasScript(ch, *(UScriptCode*)context);
michael@0 869 }
michael@0 870
michael@0 871 /**
michael@0 872 * Generic filter-based scanning code for UCD property UnicodeSets.
michael@0 873 */
michael@0 874 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
michael@0 875 void* context,
michael@0 876 int32_t src,
michael@0 877 UErrorCode &status) {
michael@0 878 if (U_FAILURE(status)) return;
michael@0 879
michael@0 880 // Logically, walk through all Unicode characters, noting the start
michael@0 881 // and end of each range for which filter.contain(c) is
michael@0 882 // true. Add each range to a set.
michael@0 883 //
michael@0 884 // To improve performance, use an inclusions set which
michael@0 885 // encodes information about character ranges that are known
michael@0 886 // to have identical properties.
michael@0 887 // getInclusions(src) contains exactly the first characters of
michael@0 888 // same-value ranges for the given properties "source".
michael@0 889 const UnicodeSet* inclusions = getInclusions(src, status);
michael@0 890 if (U_FAILURE(status)) {
michael@0 891 return;
michael@0 892 }
michael@0 893
michael@0 894 clear();
michael@0 895
michael@0 896 UChar32 startHasProperty = -1;
michael@0 897 int32_t limitRange = inclusions->getRangeCount();
michael@0 898
michael@0 899 for (int j=0; j<limitRange; ++j) {
michael@0 900 // get current range
michael@0 901 UChar32 start = inclusions->getRangeStart(j);
michael@0 902 UChar32 end = inclusions->getRangeEnd(j);
michael@0 903
michael@0 904 // for all the code points in the range, process
michael@0 905 for (UChar32 ch = start; ch <= end; ++ch) {
michael@0 906 // only add to this UnicodeSet on inflection points --
michael@0 907 // where the hasProperty value changes to false
michael@0 908 if ((*filter)(ch, context)) {
michael@0 909 if (startHasProperty < 0) {
michael@0 910 startHasProperty = ch;
michael@0 911 }
michael@0 912 } else if (startHasProperty >= 0) {
michael@0 913 add(startHasProperty, ch-1);
michael@0 914 startHasProperty = -1;
michael@0 915 }
michael@0 916 }
michael@0 917 }
michael@0 918 if (startHasProperty >= 0) {
michael@0 919 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
michael@0 920 }
michael@0 921 if (isBogus() && U_SUCCESS(status)) {
michael@0 922 // We likely ran out of memory. AHHH!
michael@0 923 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 924 }
michael@0 925 }
michael@0 926
michael@0 927 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
michael@0 928 /* Note: we use ' ' in compiler code page */
michael@0 929 int32_t j = 0;
michael@0 930 char ch;
michael@0 931 --dstCapacity; /* make room for term. zero */
michael@0 932 while ((ch = *src++) != 0) {
michael@0 933 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
michael@0 934 continue;
michael@0 935 }
michael@0 936 if (j >= dstCapacity) return FALSE;
michael@0 937 dst[j++] = ch;
michael@0 938 }
michael@0 939 if (j > 0 && dst[j-1] == ' ') --j;
michael@0 940 dst[j] = 0;
michael@0 941 return TRUE;
michael@0 942 }
michael@0 943
michael@0 944 //----------------------------------------------------------------
michael@0 945 // Property set API
michael@0 946 //----------------------------------------------------------------
michael@0 947
// Set ec to U_ILLEGAL_ARGUMENT_ERROR and return *this from the enclosing
// member function.
#define FAIL(ec) { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
}
michael@0 949
michael@0 950 UnicodeSet&
michael@0 951 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
michael@0 952 if (U_FAILURE(ec) || isFrozen()) return *this;
michael@0 953
michael@0 954 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
michael@0 955 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
michael@0 956 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
michael@0 957 UScriptCode script = (UScriptCode)value;
michael@0 958 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
michael@0 959 } else {
michael@0 960 IntPropertyContext c = {prop, value};
michael@0 961 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
michael@0 962 }
michael@0 963 return *this;
michael@0 964 }
michael@0 965
michael@0 966 UnicodeSet&
michael@0 967 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
michael@0 968 const UnicodeString& value,
michael@0 969 UErrorCode& ec) {
michael@0 970 if (U_FAILURE(ec) || isFrozen()) return *this;
michael@0 971
michael@0 972 // prop and value used to be converted to char * using the default
michael@0 973 // converter instead of the invariant conversion.
michael@0 974 // This should not be necessary because all Unicode property and value
michael@0 975 // names use only invariant characters.
michael@0 976 // If there are any variant characters, then we won't find them anyway.
michael@0 977 // Checking first avoids assertion failures in the conversion.
michael@0 978 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
michael@0 979 !uprv_isInvariantUString(value.getBuffer(), value.length())
michael@0 980 ) {
michael@0 981 FAIL(ec);
michael@0 982 }
michael@0 983 CharString pname, vname;
michael@0 984 pname.appendInvariantChars(prop, ec);
michael@0 985 vname.appendInvariantChars(value, ec);
michael@0 986 if (U_FAILURE(ec)) return *this;
michael@0 987
michael@0 988 UProperty p;
michael@0 989 int32_t v;
michael@0 990 UBool mustNotBeEmpty = FALSE, invert = FALSE;
michael@0 991
michael@0 992 if (value.length() > 0) {
michael@0 993 p = u_getPropertyEnum(pname.data());
michael@0 994 if (p == UCHAR_INVALID_CODE) FAIL(ec);
michael@0 995
michael@0 996 // Treat gc as gcm
michael@0 997 if (p == UCHAR_GENERAL_CATEGORY) {
michael@0 998 p = UCHAR_GENERAL_CATEGORY_MASK;
michael@0 999 }
michael@0 1000
michael@0 1001 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
michael@0 1002 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
michael@0 1003 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
michael@0 1004 v = u_getPropertyValueEnum(p, vname.data());
michael@0 1005 if (v == UCHAR_INVALID_CODE) {
michael@0 1006 // Handle numeric CCC
michael@0 1007 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
michael@0 1008 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
michael@0 1009 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
michael@0 1010 char* end;
michael@0 1011 double value = uprv_strtod(vname.data(), &end);
michael@0 1012 v = (int32_t) value;
michael@0 1013 if (v != value || v < 0 || *end != 0) {
michael@0 1014 // non-integral or negative value, or trailing junk
michael@0 1015 FAIL(ec);
michael@0 1016 }
michael@0 1017 // If the resultant set is empty then the numeric value
michael@0 1018 // was invalid.
michael@0 1019 mustNotBeEmpty = TRUE;
michael@0 1020 } else {
michael@0 1021 FAIL(ec);
michael@0 1022 }
michael@0 1023 }
michael@0 1024 }
michael@0 1025
michael@0 1026 else {
michael@0 1027
michael@0 1028 switch (p) {
michael@0 1029 case UCHAR_NUMERIC_VALUE:
michael@0 1030 {
michael@0 1031 char* end;
michael@0 1032 double value = uprv_strtod(vname.data(), &end);
michael@0 1033 if (*end != 0) {
michael@0 1034 FAIL(ec);
michael@0 1035 }
michael@0 1036 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
michael@0 1037 return *this;
michael@0 1038 }
michael@0 1039 case UCHAR_NAME:
michael@0 1040 {
michael@0 1041 // Must munge name, since u_charFromName() does not do
michael@0 1042 // 'loose' matching.
michael@0 1043 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
michael@0 1044 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
michael@0 1045 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
michael@0 1046 if (U_SUCCESS(ec)) {
michael@0 1047 clear();
michael@0 1048 add(ch);
michael@0 1049 return *this;
michael@0 1050 } else {
michael@0 1051 FAIL(ec);
michael@0 1052 }
michael@0 1053 }
michael@0 1054 case UCHAR_UNICODE_1_NAME:
michael@0 1055 // ICU 49 deprecates the Unicode_1_Name property APIs.
michael@0 1056 FAIL(ec);
michael@0 1057 case UCHAR_AGE:
michael@0 1058 {
michael@0 1059 // Must munge name, since u_versionFromString() does not do
michael@0 1060 // 'loose' matching.
michael@0 1061 char buf[128];
michael@0 1062 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
michael@0 1063 UVersionInfo version;
michael@0 1064 u_versionFromString(version, buf);
michael@0 1065 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
michael@0 1066 return *this;
michael@0 1067 }
michael@0 1068 case UCHAR_SCRIPT_EXTENSIONS:
michael@0 1069 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
michael@0 1070 if (v == UCHAR_INVALID_CODE) {
michael@0 1071 FAIL(ec);
michael@0 1072 }
michael@0 1073 // fall through to calling applyIntPropertyValue()
michael@0 1074 break;
michael@0 1075 default:
michael@0 1076 // p is a non-binary, non-enumerated property that we
michael@0 1077 // don't support (yet).
michael@0 1078 FAIL(ec);
michael@0 1079 }
michael@0 1080 }
michael@0 1081 }
michael@0 1082
michael@0 1083 else {
michael@0 1084 // value is empty. Interpret as General Category, Script, or
michael@0 1085 // Binary property.
michael@0 1086 p = UCHAR_GENERAL_CATEGORY_MASK;
michael@0 1087 v = u_getPropertyValueEnum(p, pname.data());
michael@0 1088 if (v == UCHAR_INVALID_CODE) {
michael@0 1089 p = UCHAR_SCRIPT;
michael@0 1090 v = u_getPropertyValueEnum(p, pname.data());
michael@0 1091 if (v == UCHAR_INVALID_CODE) {
michael@0 1092 p = u_getPropertyEnum(pname.data());
michael@0 1093 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
michael@0 1094 v = 1;
michael@0 1095 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
michael@0 1096 set(MIN_VALUE, MAX_VALUE);
michael@0 1097 return *this;
michael@0 1098 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
michael@0 1099 set(0, 0x7F);
michael@0 1100 return *this;
michael@0 1101 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
michael@0 1102 // [:Assigned:]=[:^Cn:]
michael@0 1103 p = UCHAR_GENERAL_CATEGORY_MASK;
michael@0 1104 v = U_GC_CN_MASK;
michael@0 1105 invert = TRUE;
michael@0 1106 } else {
michael@0 1107 FAIL(ec);
michael@0 1108 }
michael@0 1109 }
michael@0 1110 }
michael@0 1111 }
michael@0 1112
michael@0 1113 applyIntPropertyValue(p, v, ec);
michael@0 1114 if(invert) {
michael@0 1115 complement();
michael@0 1116 }
michael@0 1117
michael@0 1118 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
michael@0 1119 // mustNotBeEmpty is set to true if an empty set indicates
michael@0 1120 // invalid input.
michael@0 1121 ec = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1122 }
michael@0 1123
michael@0 1124 if (isBogus() && U_SUCCESS(ec)) {
michael@0 1125 // We likely ran out of memory. AHHH!
michael@0 1126 ec = U_MEMORY_ALLOCATION_ERROR;
michael@0 1127 }
michael@0 1128 return *this;
michael@0 1129 }
michael@0 1130
michael@0 1131 //----------------------------------------------------------------
michael@0 1132 // Property set patterns
michael@0 1133 //----------------------------------------------------------------
michael@0 1134
michael@0 1135 /**
michael@0 1136 * Return true if the given position, in the given pattern, appears
michael@0 1137 * to be the start of a property set pattern.
michael@0 1138 */
michael@0 1139 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
michael@0 1140 int32_t pos) {
michael@0 1141 // Patterns are at least 5 characters long
michael@0 1142 if ((pos+5) > pattern.length()) {
michael@0 1143 return FALSE;
michael@0 1144 }
michael@0 1145
michael@0 1146 // Look for an opening [:, [:^, \p, or \P
michael@0 1147 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
michael@0 1148 }
michael@0 1149
michael@0 1150 /**
michael@0 1151 * Return true if the given iterator appears to point at a
michael@0 1152 * property pattern. Regardless of the result, return with the
michael@0 1153 * iterator unchanged.
michael@0 1154 * @param chars iterator over the pattern characters. Upon return
michael@0 1155 * it will be unchanged.
michael@0 1156 * @param iterOpts RuleCharacterIterator options
michael@0 1157 */
michael@0 1158 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
michael@0 1159 int32_t iterOpts) {
michael@0 1160 // NOTE: literal will always be FALSE, because we don't parse escapes.
michael@0 1161 UBool result = FALSE, literal;
michael@0 1162 UErrorCode ec = U_ZERO_ERROR;
michael@0 1163 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
michael@0 1164 RuleCharacterIterator::Pos pos;
michael@0 1165 chars.getPos(pos);
michael@0 1166 UChar32 c = chars.next(iterOpts, literal, ec);
michael@0 1167 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
michael@0 1168 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
michael@0 1169 literal, ec);
michael@0 1170 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
michael@0 1171 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
michael@0 1172 }
michael@0 1173 chars.setPos(pos);
michael@0 1174 return result && U_SUCCESS(ec);
michael@0 1175 }
michael@0 1176
michael@0 1177 /**
michael@0 1178 * Parse the given property pattern at the given parse position.
michael@0 1179 */
michael@0 1180 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
michael@0 1181 ParsePosition& ppos,
michael@0 1182 UErrorCode &ec) {
michael@0 1183 int32_t pos = ppos.getIndex();
michael@0 1184
michael@0 1185 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
michael@0 1186 UBool isName = FALSE; // true for \N{pat}, o/w false
michael@0 1187 UBool invert = FALSE;
michael@0 1188
michael@0 1189 if (U_FAILURE(ec)) return *this;
michael@0 1190
michael@0 1191 // Minimum length is 5 characters, e.g. \p{L}
michael@0 1192 if ((pos+5) > pattern.length()) {
michael@0 1193 FAIL(ec);
michael@0 1194 }
michael@0 1195
michael@0 1196 // On entry, ppos should point to one of the following locations:
michael@0 1197 // Look for an opening [:, [:^, \p, or \P
michael@0 1198 if (isPOSIXOpen(pattern, pos)) {
michael@0 1199 posix = TRUE;
michael@0 1200 pos += 2;
michael@0 1201 pos = ICU_Utility::skipWhitespace(pattern, pos);
michael@0 1202 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
michael@0 1203 ++pos;
michael@0 1204 invert = TRUE;
michael@0 1205 }
michael@0 1206 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
michael@0 1207 UChar c = pattern.charAt(pos+1);
michael@0 1208 invert = (c == UPPER_P);
michael@0 1209 isName = (c == UPPER_N);
michael@0 1210 pos += 2;
michael@0 1211 pos = ICU_Utility::skipWhitespace(pattern, pos);
michael@0 1212 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
michael@0 1213 // Syntax error; "\p" or "\P" not followed by "{"
michael@0 1214 FAIL(ec);
michael@0 1215 }
michael@0 1216 } else {
michael@0 1217 // Open delimiter not seen
michael@0 1218 FAIL(ec);
michael@0 1219 }
michael@0 1220
michael@0 1221 // Look for the matching close delimiter, either :] or }
michael@0 1222 int32_t close;
michael@0 1223 if (posix) {
michael@0 1224 close = pattern.indexOf(POSIX_CLOSE, 2, pos);
michael@0 1225 } else {
michael@0 1226 close = pattern.indexOf(CLOSE_BRACE, pos);
michael@0 1227 }
michael@0 1228 if (close < 0) {
michael@0 1229 // Syntax error; close delimiter missing
michael@0 1230 FAIL(ec);
michael@0 1231 }
michael@0 1232
michael@0 1233 // Look for an '=' sign. If this is present, we will parse a
michael@0 1234 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
michael@0 1235 // pattern.
michael@0 1236 int32_t equals = pattern.indexOf(EQUALS, pos);
michael@0 1237 UnicodeString propName, valueName;
michael@0 1238 if (equals >= 0 && equals < close && !isName) {
michael@0 1239 // Equals seen; parse medium/long pattern
michael@0 1240 pattern.extractBetween(pos, equals, propName);
michael@0 1241 pattern.extractBetween(equals+1, close, valueName);
michael@0 1242 }
michael@0 1243
michael@0 1244 else {
michael@0 1245 // Handle case where no '=' is seen, and \N{}
michael@0 1246 pattern.extractBetween(pos, close, propName);
michael@0 1247
michael@0 1248 // Handle \N{name}
michael@0 1249 if (isName) {
michael@0 1250 // This is a little inefficient since it means we have to
michael@0 1251 // parse NAME_PROP back to UCHAR_NAME even though we already
michael@0 1252 // know it's UCHAR_NAME. If we refactor the API to
michael@0 1253 // support args of (UProperty, char*) then we can remove
michael@0 1254 // NAME_PROP and make this a little more efficient.
michael@0 1255 valueName = propName;
michael@0 1256 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
michael@0 1257 }
michael@0 1258 }
michael@0 1259
michael@0 1260 applyPropertyAlias(propName, valueName, ec);
michael@0 1261
michael@0 1262 if (U_SUCCESS(ec)) {
michael@0 1263 if (invert) {
michael@0 1264 complement();
michael@0 1265 }
michael@0 1266
michael@0 1267 // Move to the limit position after the close delimiter if the
michael@0 1268 // parse succeeded.
michael@0 1269 ppos.setIndex(close + (posix ? 2 : 1));
michael@0 1270 }
michael@0 1271
michael@0 1272 return *this;
michael@0 1273 }
michael@0 1274
michael@0 1275 /**
michael@0 1276 * Parse a property pattern.
michael@0 1277 * @param chars iterator over the pattern characters. Upon return
michael@0 1278 * it will be advanced to the first character after the parsed
michael@0 1279 * pattern, or the end of the iteration if all characters are
michael@0 1280 * parsed.
michael@0 1281 * @param rebuiltPat the pattern that was parsed, rebuilt or
michael@0 1282 * copied from the input pattern, as appropriate.
michael@0 1283 */
michael@0 1284 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
michael@0 1285 UnicodeString& rebuiltPat,
michael@0 1286 UErrorCode& ec) {
michael@0 1287 if (U_FAILURE(ec)) return;
michael@0 1288 UnicodeString pattern;
michael@0 1289 chars.lookahead(pattern);
michael@0 1290 ParsePosition pos(0);
michael@0 1291 applyPropertyPattern(pattern, pos, ec);
michael@0 1292 if (U_FAILURE(ec)) return;
michael@0 1293 if (pos.getIndex() == 0) {
michael@0 1294 // syntaxError(chars, "Invalid property pattern");
michael@0 1295 ec = U_MALFORMED_SET;
michael@0 1296 return;
michael@0 1297 }
michael@0 1298 chars.jumpahead(pos.getIndex());
michael@0 1299 rebuiltPat.append(pattern, 0, pos.getIndex());
michael@0 1300 }
michael@0 1301
michael@0 1302 U_NAMESPACE_END

mercurial