michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 1999-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: uniset_props.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2004aug25 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Character property dependent functions moved here from uniset.cpp michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/parsepos.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uscript.h" michael@0: #include "unicode/symtable.h" michael@0: #include "unicode/uset.h" michael@0: #include "unicode/locid.h" michael@0: #include "unicode/brkiter.h" michael@0: #include "uset_imp.h" michael@0: #include "ruleiter.h" michael@0: #include "cmemory.h" michael@0: #include "ucln_cmn.h" michael@0: #include "util.h" michael@0: #include "uvector.h" michael@0: #include "uprops.h" michael@0: #include "propname.h" michael@0: #include "normalizer2impl.h" michael@0: #include "ucase.h" michael@0: #include "ubidi_props.h" michael@0: #include "uinvchar.h" michael@0: #include "uprops.h" michael@0: #include "charstr.h" michael@0: #include "cstring.h" michael@0: #include "mutex.h" michael@0: #include "umutex.h" michael@0: #include "uassert.h" michael@0: #include "hash.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: // initial storage. Must be >= 0 michael@0: // *** same as in uniset.cpp ! *** michael@0: #define START_EXTRA 16 michael@0: michael@0: // Define UChar constants using hex for EBCDIC compatibility michael@0: // Used #define to reduce private static exports and memory access time. michael@0: #define SET_OPEN ((UChar)0x005B) /*[*/ michael@0: #define SET_CLOSE ((UChar)0x005D) /*]*/ michael@0: #define HYPHEN ((UChar)0x002D) /*-*/ michael@0: #define COMPLEMENT ((UChar)0x005E) /*^*/ michael@0: #define COLON ((UChar)0x003A) /*:*/ michael@0: #define BACKSLASH ((UChar)0x005C) /*\*/ michael@0: #define INTERSECTION ((UChar)0x0026) /*&*/ michael@0: #define UPPER_U ((UChar)0x0055) /*U*/ michael@0: #define LOWER_U ((UChar)0x0075) /*u*/ michael@0: #define OPEN_BRACE ((UChar)123) /*{*/ michael@0: #define CLOSE_BRACE ((UChar)125) /*}*/ michael@0: #define UPPER_P ((UChar)0x0050) /*P*/ michael@0: #define LOWER_P ((UChar)0x0070) /*p*/ michael@0: #define UPPER_N ((UChar)78) /*N*/ michael@0: #define EQUALS ((UChar)0x003D) /*=*/ michael@0: michael@0: //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" michael@0: static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" michael@0: //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" michael@0: //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" michael@0: //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" michael@0: static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ michael@0: michael@0: // Special property set IDs michael@0: static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] michael@0: static const char ASCII[] = "ASCII"; // [\u0000-\u007F] michael@0: static const char ASSIGNED[] = "Assigned"; // [:^Cn:] michael@0: michael@0: // Unicode name property alias michael@0: #define NAME_PROP "na" michael@0: #define NAME_PROP_LENGTH 2 michael@0: michael@0: /** michael@0: * Delimiter string used in patterns to close a category reference: michael@0: * ":]". Example: "[:Lu:]". michael@0: */ michael@0: //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ michael@0: michael@0: // Cached sets ------------------------------------------------------------- *** michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV uset_cleanup(); michael@0: michael@0: struct Inclusion { michael@0: UnicodeSet *fSet; michael@0: UInitOnce fInitOnce; michael@0: }; michael@0: static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() michael@0: michael@0: static UnicodeSet *uni32Singleton; michael@0: static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Inclusions list michael@0: //---------------------------------------------------------------- michael@0: michael@0: // USetAdder implementation michael@0: // Does not use uset.h to reduce code dependencies michael@0: static void U_CALLCONV michael@0: _set_add(USet *set, UChar32 c) { michael@0: ((UnicodeSet *)set)->add(c); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: _set_addRange(USet *set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet *)set)->add(start, end); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: _set_addString(USet *set, const UChar *str, int32_t length) { michael@0: ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); michael@0: } michael@0: michael@0: /** michael@0: * Cleanup function for UnicodeSet michael@0: */ michael@0: static UBool U_CALLCONV uset_cleanup(void) { michael@0: for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { michael@0: Inclusion &in = gInclusions[i]; michael@0: delete in.fSet; michael@0: in.fSet = NULL; michael@0: in.fInitOnce.reset(); michael@0: } michael@0: michael@0: delete uni32Singleton; michael@0: uni32Singleton = NULL; michael@0: uni32InitOnce.reset(); michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: Reduce excessive reallocation, and make it easier to detect initialization problems. michael@0: Usually you don't see smaller sets than this for Unicode 5.0. michael@0: */ michael@0: #define DEFAULT_INCLUSION_CAPACITY 3072 michael@0: michael@0: void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { michael@0: // This function is invoked only via umtx_initOnce(). michael@0: // This function is a friend of class UnicodeSet. michael@0: michael@0: U_ASSERT(src >=0 && srcensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); michael@0: switch(src) { michael@0: case UPROPS_SRC_CHAR: michael@0: uchar_addPropertyStarts(&sa, &status); michael@0: break; michael@0: case UPROPS_SRC_PROPSVEC: michael@0: upropsvec_addPropertyStarts(&sa, &status); michael@0: break; michael@0: case UPROPS_SRC_CHAR_AND_PROPSVEC: michael@0: uchar_addPropertyStarts(&sa, &status); michael@0: upropsvec_addPropertyStarts(&sa, &status); michael@0: break; michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: case UPROPS_SRC_CASE_AND_NORM: { michael@0: const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); michael@0: if(U_SUCCESS(status)) { michael@0: impl->addPropertyStarts(&sa, status); michael@0: } michael@0: ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); michael@0: break; michael@0: } michael@0: case UPROPS_SRC_NFC: { michael@0: const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); michael@0: if(U_SUCCESS(status)) { michael@0: impl->addPropertyStarts(&sa, status); michael@0: } michael@0: break; michael@0: } michael@0: case UPROPS_SRC_NFKC: { michael@0: const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); michael@0: if(U_SUCCESS(status)) { michael@0: impl->addPropertyStarts(&sa, status); michael@0: } michael@0: break; michael@0: } michael@0: case UPROPS_SRC_NFKC_CF: { michael@0: const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); michael@0: if(U_SUCCESS(status)) { michael@0: impl->addPropertyStarts(&sa, status); michael@0: } michael@0: break; michael@0: } michael@0: case UPROPS_SRC_NFC_CANON_ITER: { michael@0: const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); michael@0: if(U_SUCCESS(status)) { michael@0: impl->addCanonIterPropertyStarts(&sa, status); michael@0: } michael@0: break; michael@0: } michael@0: #endif michael@0: case UPROPS_SRC_CASE: michael@0: ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); michael@0: break; michael@0: case UPROPS_SRC_BIDI: michael@0: ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); michael@0: break; michael@0: default: michael@0: status = U_INTERNAL_PROGRAM_ERROR; michael@0: break; michael@0: } michael@0: michael@0: if (U_FAILURE(status)) { michael@0: delete incl; michael@0: incl = NULL; michael@0: return; michael@0: } michael@0: // Compact for caching michael@0: incl->compact(); michael@0: ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); michael@0: } michael@0: michael@0: michael@0: michael@0: const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { michael@0: U_ASSERT(src >=0 && srcfreeze(); michael@0: } michael@0: ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); michael@0: } michael@0: michael@0: michael@0: U_CFUNC UnicodeSet * michael@0: uniset_getUnicode32Instance(UErrorCode &errorCode) { michael@0: umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); michael@0: return uni32Singleton; michael@0: } michael@0: michael@0: // helper functions for matching of pattern syntax pieces ------------------ *** michael@0: // these functions are parallel to the PERL_OPEN etc. strings above michael@0: michael@0: // using these functions is not only faster than UnicodeString::compare() and michael@0: // caseCompare(), but they also make UnicodeSet work for simple patterns when michael@0: // no Unicode properties data is available - when caseCompare() fails michael@0: michael@0: static inline UBool michael@0: isPerlOpen(const UnicodeString &pattern, int32_t pos) { michael@0: UChar c; michael@0: return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); michael@0: } michael@0: michael@0: /*static inline UBool michael@0: isPerlClose(const UnicodeString &pattern, int32_t pos) { michael@0: return pattern.charAt(pos)==CLOSE_BRACE; michael@0: }*/ michael@0: michael@0: static inline UBool michael@0: isNameOpen(const UnicodeString &pattern, int32_t pos) { michael@0: return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; michael@0: } michael@0: michael@0: static inline UBool michael@0: isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { michael@0: return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; michael@0: } michael@0: michael@0: /*static inline UBool michael@0: isPOSIXClose(const UnicodeString &pattern, int32_t pos) { michael@0: return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; michael@0: }*/ michael@0: michael@0: // TODO memory debugging provided inside uniset.cpp michael@0: // could be made available here but probably obsolete with use of modern michael@0: // memory leak checker tools michael@0: #define _dbgct(me) michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Constructors &c michael@0: //---------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Constructs a set from the given pattern, optionally ignoring michael@0: * white space. See the class description for the syntax of the michael@0: * pattern language. michael@0: * @param pattern a string specifying what characters are in the set michael@0: */ michael@0: UnicodeSet::UnicodeSet(const UnicodeString& pattern, michael@0: UErrorCode& status) : michael@0: len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), michael@0: bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), michael@0: fFlags(0) michael@0: { michael@0: if(U_SUCCESS(status)){ michael@0: list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); michael@0: /* test for NULL */ michael@0: if(list == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: }else{ michael@0: allocateStrings(status); michael@0: applyPattern(pattern, status); michael@0: } michael@0: } michael@0: _dbgct(this); michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Public API michael@0: //---------------------------------------------------------------- michael@0: michael@0: UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, michael@0: UErrorCode& status) { michael@0: // Equivalent to michael@0: // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); michael@0: // but without dependency on closeOver(). michael@0: ParsePosition pos(0); michael@0: applyPatternIgnoreSpace(pattern, pos, NULL, status); michael@0: if (U_FAILURE(status)) return *this; michael@0: michael@0: int32_t i = pos.getIndex(); michael@0: // Skip over trailing whitespace michael@0: ICU_Utility::skipWhitespace(pattern, i, TRUE); michael@0: if (i != pattern.length()) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: void michael@0: UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, michael@0: ParsePosition& pos, michael@0: const SymbolTable* symbols, michael@0: UErrorCode& status) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if (isFrozen()) { michael@0: status = U_NO_WRITE_PERMISSION; michael@0: return; michael@0: } michael@0: // Need to build the pattern in a temporary string because michael@0: // _applyPattern calls add() etc., which set pat to empty. michael@0: UnicodeString rebuiltPat; michael@0: RuleCharacterIterator chars(pattern, symbols, pos); michael@0: applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); michael@0: if (U_FAILURE(status)) return; michael@0: if (chars.inVariable()) { michael@0: // syntaxError(chars, "Extra chars in variable value"); michael@0: status = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: setPattern(rebuiltPat); michael@0: } michael@0: michael@0: /** michael@0: * Return true if the given position, in the given pattern, appears michael@0: * to be the start of a UnicodeSet pattern. michael@0: */ michael@0: UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { michael@0: return ((pos+1) < pattern.length() && michael@0: pattern.charAt(pos) == (UChar)91/*[*/) || michael@0: resemblesPropertyPattern(pattern, pos); michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Implementation: Pattern parsing michael@0: //---------------------------------------------------------------- michael@0: michael@0: /** michael@0: * A small all-inline class to manage a UnicodeSet pointer. Add michael@0: * operator->() etc. as needed. michael@0: */ michael@0: class UnicodeSetPointer { michael@0: UnicodeSet* p; michael@0: public: michael@0: inline UnicodeSetPointer() : p(0) {} michael@0: inline ~UnicodeSetPointer() { delete p; } michael@0: inline UnicodeSet* pointer() { return p; } michael@0: inline UBool allocate() { michael@0: if (p == 0) { michael@0: p = new UnicodeSet(); michael@0: } michael@0: return p != 0; michael@0: } michael@0: }; michael@0: michael@0: /** michael@0: * Parse the pattern from the given RuleCharacterIterator. The michael@0: * iterator is advanced over the parsed pattern. michael@0: * @param chars iterator over the pattern characters. Upon return michael@0: * it will be advanced to the first character after the parsed michael@0: * pattern, or the end of the iteration if all characters are michael@0: * parsed. michael@0: * @param symbols symbol table to use to parse and dereference michael@0: * variables, or null if none. michael@0: * @param rebuiltPat the pattern that was parsed, rebuilt or michael@0: * copied from the input pattern, as appropriate. michael@0: * @param options a bit mask of zero or more of the following: michael@0: * IGNORE_SPACE, CASE. michael@0: */ michael@0: void UnicodeSet::applyPattern(RuleCharacterIterator& chars, michael@0: const SymbolTable* symbols, michael@0: UnicodeString& rebuiltPat, michael@0: uint32_t options, michael@0: UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), michael@0: UErrorCode& ec) { michael@0: if (U_FAILURE(ec)) return; michael@0: michael@0: // Syntax characters: [ ] ^ - & { } michael@0: michael@0: // Recognized special forms for chars, sets: c-c s-s s&s michael@0: michael@0: int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | michael@0: RuleCharacterIterator::PARSE_ESCAPES; michael@0: if ((options & USET_IGNORE_SPACE) != 0) { michael@0: opts |= RuleCharacterIterator::SKIP_WHITESPACE; michael@0: } michael@0: michael@0: UnicodeString patLocal, buf; michael@0: UBool usePat = FALSE; michael@0: UnicodeSetPointer scratch; michael@0: RuleCharacterIterator::Pos backup; michael@0: michael@0: // mode: 0=before [, 1=between [...], 2=after ] michael@0: // lastItem: 0=none, 1=char, 2=set michael@0: int8_t lastItem = 0, mode = 0; michael@0: UChar32 lastChar = 0; michael@0: UChar op = 0; michael@0: michael@0: UBool invert = FALSE; michael@0: michael@0: clear(); michael@0: michael@0: while (mode != 2 && !chars.atEnd()) { michael@0: U_ASSERT((lastItem == 0 && op == 0) || michael@0: (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || michael@0: (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || michael@0: op == INTERSECTION /*'&'*/))); michael@0: michael@0: UChar32 c = 0; michael@0: UBool literal = FALSE; michael@0: UnicodeSet* nested = 0; // alias - do not delete michael@0: michael@0: // -------- Check for property pattern michael@0: michael@0: // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed michael@0: int8_t setMode = 0; michael@0: if (resemblesPropertyPattern(chars, opts)) { michael@0: setMode = 2; michael@0: } michael@0: michael@0: // -------- Parse '[' of opening delimiter OR nested set. michael@0: // If there is a nested set, use `setMode' to define how michael@0: // the set should be parsed. If the '[' is part of the michael@0: // opening delimiter for this pattern, parse special michael@0: // strings "[", "[^", "[-", and "[^-". Check for stand-in michael@0: // characters representing a nested set in the symbol michael@0: // table. michael@0: michael@0: else { michael@0: // Prepare to backup if necessary michael@0: chars.getPos(backup); michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: michael@0: if (c == 0x5B /*'['*/ && !literal) { michael@0: if (mode == 1) { michael@0: chars.setPos(backup); // backup michael@0: setMode = 1; michael@0: } else { michael@0: // Handle opening '[' delimiter michael@0: mode = 1; michael@0: patLocal.append((UChar) 0x5B /*'['*/); michael@0: chars.getPos(backup); // prepare to backup michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: if (c == 0x5E /*'^'*/ && !literal) { michael@0: invert = TRUE; michael@0: patLocal.append((UChar) 0x5E /*'^'*/); michael@0: chars.getPos(backup); // prepare to backup michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: } michael@0: // Fall through to handle special leading '-'; michael@0: // otherwise restart loop for nested [], \p{}, etc. michael@0: if (c == HYPHEN /*'-'*/) { michael@0: literal = TRUE; michael@0: // Fall through to handle literal '-' below michael@0: } else { michael@0: chars.setPos(backup); // backup michael@0: continue; michael@0: } michael@0: } michael@0: } else if (symbols != 0) { michael@0: const UnicodeFunctor *m = symbols->lookupMatcher(c); michael@0: if (m != 0) { michael@0: const UnicodeSet *ms = dynamic_cast(m); michael@0: if (ms == NULL) { michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: // casting away const, but `nested' won't be modified michael@0: // (important not to modify stored set) michael@0: nested = const_cast(ms); michael@0: setMode = 3; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // -------- Handle a nested set. This either is inline in michael@0: // the pattern or represented by a stand-in that has michael@0: // previously been parsed and was looked up in the symbol michael@0: // table. michael@0: michael@0: if (setMode != 0) { michael@0: if (lastItem == 1) { michael@0: if (op != 0) { michael@0: // syntaxError(chars, "Char expected after operator"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: add(lastChar, lastChar); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: lastItem = 0; michael@0: op = 0; michael@0: } michael@0: michael@0: if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { michael@0: patLocal.append(op); michael@0: } michael@0: michael@0: if (nested == 0) { michael@0: // lazy allocation michael@0: if (!scratch.allocate()) { michael@0: ec = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: nested = scratch.pointer(); michael@0: } michael@0: switch (setMode) { michael@0: case 1: michael@0: nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); michael@0: break; michael@0: case 2: michael@0: chars.skipIgnored(opts); michael@0: nested->applyPropertyPattern(chars, patLocal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: break; michael@0: case 3: // `nested' already parsed michael@0: nested->_toPattern(patLocal, FALSE); michael@0: break; michael@0: } michael@0: michael@0: usePat = TRUE; michael@0: michael@0: if (mode == 0) { michael@0: // Entire pattern is a category; leave parse loop michael@0: *this = *nested; michael@0: mode = 2; michael@0: break; michael@0: } michael@0: michael@0: switch (op) { michael@0: case HYPHEN: /*'-'*/ michael@0: removeAll(*nested); michael@0: break; michael@0: case INTERSECTION: /*'&'*/ michael@0: retainAll(*nested); michael@0: break; michael@0: case 0: michael@0: addAll(*nested); michael@0: break; michael@0: } michael@0: michael@0: op = 0; michael@0: lastItem = 2; michael@0: michael@0: continue; michael@0: } michael@0: michael@0: if (mode == 0) { michael@0: // syntaxError(chars, "Missing '['"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: michael@0: // -------- Parse special (syntax) characters. If the michael@0: // current character is not special, or if it is escaped, michael@0: // then fall through and handle it below. michael@0: michael@0: if (!literal) { michael@0: switch (c) { michael@0: case 0x5D /*']'*/: michael@0: if (lastItem == 1) { michael@0: add(lastChar, lastChar); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: } michael@0: // Treat final trailing '-' as a literal michael@0: if (op == HYPHEN /*'-'*/) { michael@0: add(op, op); michael@0: patLocal.append(op); michael@0: } else if (op == INTERSECTION /*'&'*/) { michael@0: // syntaxError(chars, "Trailing '&'"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: patLocal.append((UChar) 0x5D /*']'*/); michael@0: mode = 2; michael@0: continue; michael@0: case HYPHEN /*'-'*/: michael@0: if (op == 0) { michael@0: if (lastItem != 0) { michael@0: op = (UChar) c; michael@0: continue; michael@0: } else { michael@0: // Treat final trailing '-' as a literal michael@0: add(c, c); michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: if (c == 0x5D /*']'*/ && !literal) { michael@0: patLocal.append(HYPHEN_RIGHT_BRACE, 2); michael@0: mode = 2; michael@0: continue; michael@0: } michael@0: } michael@0: } michael@0: // syntaxError(chars, "'-' not after char or set"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: case INTERSECTION /*'&'*/: michael@0: if (lastItem == 2 && op == 0) { michael@0: op = (UChar) c; michael@0: continue; michael@0: } michael@0: // syntaxError(chars, "'&' not after set"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: case 0x5E /*'^'*/: michael@0: // syntaxError(chars, "'^' not after '['"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: case 0x7B /*'{'*/: michael@0: if (op != 0) { michael@0: // syntaxError(chars, "Missing operand after operator"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: if (lastItem == 1) { michael@0: add(lastChar, lastChar); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: } michael@0: lastItem = 0; michael@0: buf.truncate(0); michael@0: { michael@0: UBool ok = FALSE; michael@0: while (!chars.atEnd()) { michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: if (c == 0x7D /*'}'*/ && !literal) { michael@0: ok = TRUE; michael@0: break; michael@0: } michael@0: buf.append(c); michael@0: } michael@0: if (buf.length() < 1 || !ok) { michael@0: // syntaxError(chars, "Invalid multicharacter string"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: } michael@0: // We have new string. Add it to set and continue; michael@0: // we don't need to drop through to the further michael@0: // processing michael@0: add(buf); michael@0: patLocal.append((UChar) 0x7B /*'{'*/); michael@0: _appendToPat(patLocal, buf, FALSE); michael@0: patLocal.append((UChar) 0x7D /*'}'*/); michael@0: continue; michael@0: case SymbolTable::SYMBOL_REF: michael@0: // symbols nosymbols michael@0: // [a-$] error error (ambiguous) michael@0: // [a$] anchor anchor michael@0: // [a-$x] var "x"* literal '$' michael@0: // [a-$.] error literal '$' michael@0: // *We won't get here in the case of var "x" michael@0: { michael@0: chars.getPos(backup); michael@0: c = chars.next(opts, literal, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: UBool anchor = (c == 0x5D /*']'*/ && !literal); michael@0: if (symbols == 0 && !anchor) { michael@0: c = SymbolTable::SYMBOL_REF; michael@0: chars.setPos(backup); michael@0: break; // literal '$' michael@0: } michael@0: if (anchor && op == 0) { michael@0: if (lastItem == 1) { michael@0: add(lastChar, lastChar); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: } michael@0: add(U_ETHER); michael@0: usePat = TRUE; michael@0: patLocal.append((UChar) SymbolTable::SYMBOL_REF); michael@0: patLocal.append((UChar) 0x5D /*']'*/); michael@0: mode = 2; michael@0: continue; michael@0: } michael@0: // syntaxError(chars, "Unquoted '$'"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // -------- Parse literal characters. This includes both michael@0: // escaped chars ("\u4E01") and non-syntax characters michael@0: // ("a"). michael@0: michael@0: switch (lastItem) { michael@0: case 0: michael@0: lastItem = 1; michael@0: lastChar = c; michael@0: break; michael@0: case 1: michael@0: if (op == HYPHEN /*'-'*/) { michael@0: if (lastChar >= c) { michael@0: // Don't allow redundant (a-a) or empty (b-a) ranges; michael@0: // these are most likely typos. michael@0: // syntaxError(chars, "Invalid range"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: add(lastChar, c); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: patLocal.append(op); michael@0: _appendToPat(patLocal, c, FALSE); michael@0: lastItem = 0; michael@0: op = 0; michael@0: } else { michael@0: add(lastChar, lastChar); michael@0: _appendToPat(patLocal, lastChar, FALSE); michael@0: lastChar = c; michael@0: } michael@0: break; michael@0: case 2: michael@0: if (op != 0) { michael@0: // syntaxError(chars, "Set expected after operator"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: lastChar = c; michael@0: lastItem = 1; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (mode != 2) { michael@0: // syntaxError(chars, "Missing ']'"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: michael@0: chars.skipIgnored(opts); michael@0: michael@0: /** michael@0: * Handle global flags (invert, case insensitivity). If this michael@0: * pattern should be compiled case-insensitive, then we need michael@0: * to close over case BEFORE COMPLEMENTING. This makes michael@0: * patterns like /[^abc]/i work. michael@0: */ michael@0: if ((options & USET_CASE_INSENSITIVE) != 0) { michael@0: (this->*caseClosure)(USET_CASE_INSENSITIVE); michael@0: } michael@0: else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { michael@0: (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); michael@0: } michael@0: if (invert) { michael@0: complement(); michael@0: } michael@0: michael@0: // Use the rebuilt pattern (patLocal) only if necessary. Prefer the michael@0: // generated pattern. michael@0: if (usePat) { michael@0: rebuiltPat.append(patLocal); michael@0: } else { michael@0: _generatePattern(rebuiltPat, FALSE); michael@0: } michael@0: if (isBogus() && U_SUCCESS(ec)) { michael@0: // We likely ran out of memory. AHHH! michael@0: ec = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Property set implementation michael@0: //---------------------------------------------------------------- michael@0: michael@0: static UBool numericValueFilter(UChar32 ch, void* context) { michael@0: return u_getNumericValue(ch) == *(double*)context; michael@0: } michael@0: michael@0: static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { michael@0: int32_t value = *(int32_t*)context; michael@0: return (U_GET_GC_MASK((UChar32) ch) & value) != 0; michael@0: } michael@0: michael@0: static UBool versionFilter(UChar32 ch, void* context) { michael@0: static const UVersionInfo none = { 0, 0, 0, 0 }; michael@0: UVersionInfo v; michael@0: u_charAge(ch, v); michael@0: UVersionInfo* version = (UVersionInfo*)context; michael@0: return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; michael@0: } michael@0: michael@0: typedef struct { michael@0: UProperty prop; michael@0: int32_t value; michael@0: } IntPropertyContext; michael@0: michael@0: static UBool intPropertyFilter(UChar32 ch, void* context) { michael@0: IntPropertyContext* c = (IntPropertyContext*)context; michael@0: return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; michael@0: } michael@0: michael@0: static UBool scriptExtensionsFilter(UChar32 ch, void* context) { michael@0: return uscript_hasScript(ch, *(UScriptCode*)context); michael@0: } michael@0: michael@0: /** michael@0: * Generic filter-based scanning code for UCD property UnicodeSets. michael@0: */ michael@0: void UnicodeSet::applyFilter(UnicodeSet::Filter filter, michael@0: void* context, michael@0: int32_t src, michael@0: UErrorCode &status) { michael@0: if (U_FAILURE(status)) return; michael@0: michael@0: // Logically, walk through all Unicode characters, noting the start michael@0: // and end of each range for which filter.contain(c) is michael@0: // true. Add each range to a set. michael@0: // michael@0: // To improve performance, use an inclusions set which michael@0: // encodes information about character ranges that are known michael@0: // to have identical properties. michael@0: // getInclusions(src) contains exactly the first characters of michael@0: // same-value ranges for the given properties "source". michael@0: const UnicodeSet* inclusions = getInclusions(src, status); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: clear(); michael@0: michael@0: UChar32 startHasProperty = -1; michael@0: int32_t limitRange = inclusions->getRangeCount(); michael@0: michael@0: for (int j=0; jgetRangeStart(j); michael@0: UChar32 end = inclusions->getRangeEnd(j); michael@0: michael@0: // for all the code points in the range, process michael@0: for (UChar32 ch = start; ch <= end; ++ch) { michael@0: // only add to this UnicodeSet on inflection points -- michael@0: // where the hasProperty value changes to false michael@0: if ((*filter)(ch, context)) { michael@0: if (startHasProperty < 0) { michael@0: startHasProperty = ch; michael@0: } michael@0: } else if (startHasProperty >= 0) { michael@0: add(startHasProperty, ch-1); michael@0: startHasProperty = -1; michael@0: } michael@0: } michael@0: } michael@0: if (startHasProperty >= 0) { michael@0: add((UChar32)startHasProperty, (UChar32)0x10FFFF); michael@0: } michael@0: if (isBogus() && U_SUCCESS(status)) { michael@0: // We likely ran out of memory. AHHH! michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { michael@0: /* Note: we use ' ' in compiler code page */ michael@0: int32_t j = 0; michael@0: char ch; michael@0: --dstCapacity; /* make room for term. zero */ michael@0: while ((ch = *src++) != 0) { michael@0: if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { michael@0: continue; michael@0: } michael@0: if (j >= dstCapacity) return FALSE; michael@0: dst[j++] = ch; michael@0: } michael@0: if (j > 0 && dst[j-1] == ' ') --j; michael@0: dst[j] = 0; michael@0: return TRUE; michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Property set API michael@0: //---------------------------------------------------------------- michael@0: michael@0: #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} michael@0: michael@0: UnicodeSet& michael@0: UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { michael@0: if (U_FAILURE(ec) || isFrozen()) return *this; michael@0: michael@0: if (prop == UCHAR_GENERAL_CATEGORY_MASK) { michael@0: applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); michael@0: } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { michael@0: UScriptCode script = (UScriptCode)value; michael@0: applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); michael@0: } else { michael@0: IntPropertyContext c = {prop, value}; michael@0: applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: UnicodeSet& michael@0: UnicodeSet::applyPropertyAlias(const UnicodeString& prop, michael@0: const UnicodeString& value, michael@0: UErrorCode& ec) { michael@0: if (U_FAILURE(ec) || isFrozen()) return *this; michael@0: michael@0: // prop and value used to be converted to char * using the default michael@0: // converter instead of the invariant conversion. michael@0: // This should not be necessary because all Unicode property and value michael@0: // names use only invariant characters. michael@0: // If there are any variant characters, then we won't find them anyway. michael@0: // Checking first avoids assertion failures in the conversion. michael@0: if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || michael@0: !uprv_isInvariantUString(value.getBuffer(), value.length()) michael@0: ) { michael@0: FAIL(ec); michael@0: } michael@0: CharString pname, vname; michael@0: pname.appendInvariantChars(prop, ec); michael@0: vname.appendInvariantChars(value, ec); michael@0: if (U_FAILURE(ec)) return *this; michael@0: michael@0: UProperty p; michael@0: int32_t v; michael@0: UBool mustNotBeEmpty = FALSE, invert = FALSE; michael@0: michael@0: if (value.length() > 0) { michael@0: p = u_getPropertyEnum(pname.data()); michael@0: if (p == UCHAR_INVALID_CODE) FAIL(ec); michael@0: michael@0: // Treat gc as gcm michael@0: if (p == UCHAR_GENERAL_CATEGORY) { michael@0: p = UCHAR_GENERAL_CATEGORY_MASK; michael@0: } michael@0: michael@0: if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || michael@0: (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || michael@0: (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { michael@0: v = u_getPropertyValueEnum(p, vname.data()); michael@0: if (v == UCHAR_INVALID_CODE) { michael@0: // Handle numeric CCC michael@0: if (p == UCHAR_CANONICAL_COMBINING_CLASS || michael@0: p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || michael@0: p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { michael@0: char* end; michael@0: double value = uprv_strtod(vname.data(), &end); michael@0: v = (int32_t) value; michael@0: if (v != value || v < 0 || *end != 0) { michael@0: // non-integral or negative value, or trailing junk michael@0: FAIL(ec); michael@0: } michael@0: // If the resultant set is empty then the numeric value michael@0: // was invalid. michael@0: mustNotBeEmpty = TRUE; michael@0: } else { michael@0: FAIL(ec); michael@0: } michael@0: } michael@0: } michael@0: michael@0: else { michael@0: michael@0: switch (p) { michael@0: case UCHAR_NUMERIC_VALUE: michael@0: { michael@0: char* end; michael@0: double value = uprv_strtod(vname.data(), &end); michael@0: if (*end != 0) { michael@0: FAIL(ec); michael@0: } michael@0: applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); michael@0: return *this; michael@0: } michael@0: case UCHAR_NAME: michael@0: { michael@0: // Must munge name, since u_charFromName() does not do michael@0: // 'loose' matching. michael@0: char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength michael@0: if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); michael@0: UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); michael@0: if (U_SUCCESS(ec)) { michael@0: clear(); michael@0: add(ch); michael@0: return *this; michael@0: } else { michael@0: FAIL(ec); michael@0: } michael@0: } michael@0: case UCHAR_UNICODE_1_NAME: michael@0: // ICU 49 deprecates the Unicode_1_Name property APIs. michael@0: FAIL(ec); michael@0: case UCHAR_AGE: michael@0: { michael@0: // Must munge name, since u_versionFromString() does not do michael@0: // 'loose' matching. michael@0: char buf[128]; michael@0: if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); michael@0: UVersionInfo version; michael@0: u_versionFromString(version, buf); michael@0: applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); michael@0: return *this; michael@0: } michael@0: case UCHAR_SCRIPT_EXTENSIONS: michael@0: v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); michael@0: if (v == UCHAR_INVALID_CODE) { michael@0: FAIL(ec); michael@0: } michael@0: // fall through to calling applyIntPropertyValue() michael@0: break; michael@0: default: michael@0: // p is a non-binary, non-enumerated property that we michael@0: // don't support (yet). michael@0: FAIL(ec); michael@0: } michael@0: } michael@0: } michael@0: michael@0: else { michael@0: // value is empty. Interpret as General Category, Script, or michael@0: // Binary property. michael@0: p = UCHAR_GENERAL_CATEGORY_MASK; michael@0: v = u_getPropertyValueEnum(p, pname.data()); michael@0: if (v == UCHAR_INVALID_CODE) { michael@0: p = UCHAR_SCRIPT; michael@0: v = u_getPropertyValueEnum(p, pname.data()); michael@0: if (v == UCHAR_INVALID_CODE) { michael@0: p = u_getPropertyEnum(pname.data()); michael@0: if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { michael@0: v = 1; michael@0: } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { michael@0: set(MIN_VALUE, MAX_VALUE); michael@0: return *this; michael@0: } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { michael@0: set(0, 0x7F); michael@0: return *this; michael@0: } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { michael@0: // [:Assigned:]=[:^Cn:] michael@0: p = UCHAR_GENERAL_CATEGORY_MASK; michael@0: v = U_GC_CN_MASK; michael@0: invert = TRUE; michael@0: } else { michael@0: FAIL(ec); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: applyIntPropertyValue(p, v, ec); michael@0: if(invert) { michael@0: complement(); michael@0: } michael@0: michael@0: if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { michael@0: // mustNotBeEmpty is set to true if an empty set indicates michael@0: // invalid input. michael@0: ec = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: michael@0: if (isBogus() && U_SUCCESS(ec)) { michael@0: // We likely ran out of memory. AHHH! michael@0: ec = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Property set patterns michael@0: //---------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Return true if the given position, in the given pattern, appears michael@0: * to be the start of a property set pattern. michael@0: */ michael@0: UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, michael@0: int32_t pos) { michael@0: // Patterns are at least 5 characters long michael@0: if ((pos+5) > pattern.length()) { michael@0: return FALSE; michael@0: } michael@0: michael@0: // Look for an opening [:, [:^, \p, or \P michael@0: return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); michael@0: } michael@0: michael@0: /** michael@0: * Return true if the given iterator appears to point at a michael@0: * property pattern. Regardless of the result, return with the michael@0: * iterator unchanged. michael@0: * @param chars iterator over the pattern characters. Upon return michael@0: * it will be unchanged. michael@0: * @param iterOpts RuleCharacterIterator options michael@0: */ michael@0: UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, michael@0: int32_t iterOpts) { michael@0: // NOTE: literal will always be FALSE, because we don't parse escapes. michael@0: UBool result = FALSE, literal; michael@0: UErrorCode ec = U_ZERO_ERROR; michael@0: iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; michael@0: RuleCharacterIterator::Pos pos; michael@0: chars.getPos(pos); michael@0: UChar32 c = chars.next(iterOpts, literal, ec); michael@0: if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { michael@0: UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, michael@0: literal, ec); michael@0: result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : michael@0: (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); michael@0: } michael@0: chars.setPos(pos); michael@0: return result && U_SUCCESS(ec); michael@0: } michael@0: michael@0: /** michael@0: * Parse the given property pattern at the given parse position. michael@0: */ michael@0: UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, michael@0: ParsePosition& ppos, michael@0: UErrorCode &ec) { michael@0: int32_t pos = ppos.getIndex(); michael@0: michael@0: UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} michael@0: UBool isName = FALSE; // true for \N{pat}, o/w false michael@0: UBool invert = FALSE; michael@0: michael@0: if (U_FAILURE(ec)) return *this; michael@0: michael@0: // Minimum length is 5 characters, e.g. \p{L} michael@0: if ((pos+5) > pattern.length()) { michael@0: FAIL(ec); michael@0: } michael@0: michael@0: // On entry, ppos should point to one of the following locations: michael@0: // Look for an opening [:, [:^, \p, or \P michael@0: if (isPOSIXOpen(pattern, pos)) { michael@0: posix = TRUE; michael@0: pos += 2; michael@0: pos = ICU_Utility::skipWhitespace(pattern, pos); michael@0: if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { michael@0: ++pos; michael@0: invert = TRUE; michael@0: } michael@0: } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { michael@0: UChar c = pattern.charAt(pos+1); michael@0: invert = (c == UPPER_P); michael@0: isName = (c == UPPER_N); michael@0: pos += 2; michael@0: pos = ICU_Utility::skipWhitespace(pattern, pos); michael@0: if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { michael@0: // Syntax error; "\p" or "\P" not followed by "{" michael@0: FAIL(ec); michael@0: } michael@0: } else { michael@0: // Open delimiter not seen michael@0: FAIL(ec); michael@0: } michael@0: michael@0: // Look for the matching close delimiter, either :] or } michael@0: int32_t close; michael@0: if (posix) { michael@0: close = pattern.indexOf(POSIX_CLOSE, 2, pos); michael@0: } else { michael@0: close = pattern.indexOf(CLOSE_BRACE, pos); michael@0: } michael@0: if (close < 0) { michael@0: // Syntax error; close delimiter missing michael@0: FAIL(ec); michael@0: } michael@0: michael@0: // Look for an '=' sign. If this is present, we will parse a michael@0: // medium \p{gc=Cf} or long \p{GeneralCategory=Format} michael@0: // pattern. michael@0: int32_t equals = pattern.indexOf(EQUALS, pos); michael@0: UnicodeString propName, valueName; michael@0: if (equals >= 0 && equals < close && !isName) { michael@0: // Equals seen; parse medium/long pattern michael@0: pattern.extractBetween(pos, equals, propName); michael@0: pattern.extractBetween(equals+1, close, valueName); michael@0: } michael@0: michael@0: else { michael@0: // Handle case where no '=' is seen, and \N{} michael@0: pattern.extractBetween(pos, close, propName); michael@0: michael@0: // Handle \N{name} michael@0: if (isName) { michael@0: // This is a little inefficient since it means we have to michael@0: // parse NAME_PROP back to UCHAR_NAME even though we already michael@0: // know it's UCHAR_NAME. If we refactor the API to michael@0: // support args of (UProperty, char*) then we can remove michael@0: // NAME_PROP and make this a little more efficient. michael@0: valueName = propName; michael@0: propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); michael@0: } michael@0: } michael@0: michael@0: applyPropertyAlias(propName, valueName, ec); michael@0: michael@0: if (U_SUCCESS(ec)) { michael@0: if (invert) { michael@0: complement(); michael@0: } michael@0: michael@0: // Move to the limit position after the close delimiter if the michael@0: // parse succeeded. michael@0: ppos.setIndex(close + (posix ? 2 : 1)); michael@0: } michael@0: michael@0: return *this; michael@0: } michael@0: michael@0: /** michael@0: * Parse a property pattern. michael@0: * @param chars iterator over the pattern characters. Upon return michael@0: * it will be advanced to the first character after the parsed michael@0: * pattern, or the end of the iteration if all characters are michael@0: * parsed. michael@0: * @param rebuiltPat the pattern that was parsed, rebuilt or michael@0: * copied from the input pattern, as appropriate. michael@0: */ michael@0: void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, michael@0: UnicodeString& rebuiltPat, michael@0: UErrorCode& ec) { michael@0: if (U_FAILURE(ec)) return; michael@0: UnicodeString pattern; michael@0: chars.lookahead(pattern); michael@0: ParsePosition pos(0); michael@0: applyPropertyPattern(pattern, pos, ec); michael@0: if (U_FAILURE(ec)) return; michael@0: if (pos.getIndex() == 0) { michael@0: // syntaxError(chars, "Invalid property pattern"); michael@0: ec = U_MALFORMED_SET; michael@0: return; michael@0: } michael@0: chars.jumpahead(pos.getIndex()); michael@0: rebuiltPat.append(pattern, 0, pos.getIndex()); michael@0: } michael@0: michael@0: U_NAMESPACE_END