The Tor Browser: intl/icu/source/common/uniset

     1 /*

     2 *******************************************************************************

3 *

     4 *   Copyright (C) 1999-2013, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 *******************************************************************************

     8 *   file name:  uniset_props.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2004aug25

    14 *   created by: Markus W. Scherer

    15 *

    16 *   Character property dependent functions moved here from uniset.cpp

    17 */

    19 #include "unicode/utypes.h"

    20 #include "unicode/uniset.h"

    21 #include "unicode/parsepos.h"

    22 #include "unicode/uchar.h"

    23 #include "unicode/uscript.h"

    24 #include "unicode/symtable.h"

    25 #include "unicode/uset.h"

    26 #include "unicode/locid.h"

    27 #include "unicode/brkiter.h"

    28 #include "uset_imp.h"

    29 #include "ruleiter.h"

    30 #include "cmemory.h"

    31 #include "ucln_cmn.h"

    32 #include "util.h"

    33 #include "uvector.h"

    34 #include "uprops.h"

    35 #include "propname.h"

    36 #include "normalizer2impl.h"

    37 #include "ucase.h"

    38 #include "ubidi_props.h"

    39 #include "uinvchar.h"

    40 #include "uprops.h"

    41 #include "charstr.h"

    42 #include "cstring.h"

    43 #include "mutex.h"

    44 #include "umutex.h"

    45 #include "uassert.h"

    46 #include "hash.h"

    48 U_NAMESPACE_USE

    // Element count of a statically sized array, cast to int32_t.
    // Only meaningful for true arrays: for a pointer the sizeof division
    // does not yield an element count.
    50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

    52 // initial storage. Must be >= 0

    53 // *** same as in uniset.cpp ! ***

    // Used as the initial `capacity` by the pattern constructor in this file.
    54 #define START_EXTRA 16

    56 // Define UChar constants using hex for EBCDIC compatibility

    57 // Used #define to reduce private static exports and memory access time.

    // Note: a few constants below are written in decimal rather than hex
    // (123 == 0x7B, 125 == 0x7D, 78 == 0x4E); the values are still code points.
    58 #define SET_OPEN        ((UChar)0x005B) /*[*/

    59 #define SET_CLOSE       ((UChar)0x005D) /*]*/

    60 #define HYPHEN          ((UChar)0x002D) /*-*/

    61 #define COMPLEMENT      ((UChar)0x005E) /*^*/

    62 #define COLON           ((UChar)0x003A) /*:*/

    63 #define BACKSLASH       ((UChar)0x005C) /*\*/

    64 #define INTERSECTION    ((UChar)0x0026) /*&*/

    65 #define UPPER_U         ((UChar)0x0055) /*U*/

    66 #define LOWER_U         ((UChar)0x0075) /*u*/

    67 #define OPEN_BRACE      ((UChar)123)    /*{*/

    68 #define CLOSE_BRACE     ((UChar)125)    /*}*/

    69 #define UPPER_P         ((UChar)0x0050) /*P*/

    70 #define LOWER_P         ((UChar)0x0070) /*p*/

    71 #define UPPER_N         ((UChar)78)     /*N*/

    72 #define EQUALS          ((UChar)0x003D) /*=*/

    // NUL-terminated UChar strings for multi-character delimiters.  The
    // commented-out variants are kept for reference only; the isPerlOpen(),
    // isNameOpen() and isPOSIXOpen() helpers further down test the opening
    // delimiters character by character instead.
    74 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"

    75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"

    76 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"

    77 //static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"

    78 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"

    // Appended (first two UChars) to the rebuilt pattern when a set ends in "-]".
    79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/

    81 // Special property set IDs

    82 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]

    83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]

    84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

    86 // Unicode name property alias

    87 #define NAME_PROP "na"

    // Number of characters in NAME_PROP, not counting the terminating NUL.
    88 #define NAME_PROP_LENGTH 2

    90 /**

    91  * Delimiter string used in patterns to close a category reference:

    92  * ":]".  Example: "[:Lu:]".

    93  */

    94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */

    96 // Cached sets ------------------------------------------------------------- ***

    98 U_CDECL_BEGIN

    // Forward declaration: the definition follows the USetAdder callbacks below.
    // It is registered with ucln_common_registerCleanup() by the lazy
    // initializers in this file.
    99 static UBool U_CALLCONV uset_cleanup();

   // One cache slot per property data source (indexed by a UPROPS_SRC_* value).
   101 struct Inclusion {

   // Set filled in by UnicodeSet_initInclusion(); NULL until then, and reset
   // to NULL by uset_cleanup() or when initialization fails.
   102     UnicodeSet  *fSet;

   // Init-once guard passed to umtx_initOnce() in UnicodeSet::getInclusions().
   103     UInitOnce    fInitOnce;

   104 };

   105 static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()

   // Set created from the pattern "[:age=3.2:]" by createUni32Set() and handed
   // out by uniset_getUnicode32Instance(); deleted by uset_cleanup().
   107 static UnicodeSet *uni32Singleton;

   // Init-once guard for uni32Singleton.
   108 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;

   110 //----------------------------------------------------------------

   111 // Inclusions list

   112 //----------------------------------------------------------------

   114 // USetAdder implementation

   115 // Does not use uset.h to reduce code dependencies

   116 static void U_CALLCONV

   117 _set_add(USet *set, UChar32 c) {

   118     ((UnicodeSet *)set)->add(c);

   119 }

   121 static void U_CALLCONV

   122 _set_addRange(USet *set, UChar32 start, UChar32 end) {

   123     ((UnicodeSet *)set)->add(start, end);

   124 }

   126 static void U_CALLCONV

   127 _set_addString(USet *set, const UChar *str, int32_t length) {

   128     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));

   129 }

   131 /**

   132  * Cleanup function for UnicodeSet

   133  */

   134 static UBool U_CALLCONV uset_cleanup(void) {

   135     for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {

   136         Inclusion &in = gInclusions[i];

   137         delete in.fSet;

   138         in.fSet = NULL;

   139         in.fInitOnce.reset();

   140     }

   142     delete uni32Singleton;

   143     uni32Singleton = NULL;

   144     uni32InitOnce.reset();

   145     return TRUE;

   146 }

   148 U_CDECL_END

   150 U_NAMESPACE_BEGIN

   152 /*

   153 Reduce excessive reallocation, and make it easier to detect initialization problems.

   154 Usually you don't see smaller sets than this for Unicode 5.0.

   155 */

   156 #define DEFAULT_INCLUSION_CAPACITY 3072

   158 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {

   159     // This function is invoked only via umtx_initOnce().

   160     // This function is a friend of class UnicodeSet.

   162     U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);

   163     UnicodeSet * &incl = gInclusions[src].fSet;

   164     U_ASSERT(incl == NULL);

   166     incl = new UnicodeSet();

   167     if (incl == NULL) {

   168         status = U_MEMORY_ALLOCATION_ERROR;

   169         return;

   170     }

   171     USetAdder sa = {

   172         (USet *)incl,

   173         _set_add,

   174         _set_addRange,

   175         _set_addString,

   176         NULL, // don't need remove()

   177         NULL // don't need removeRange()

   178     };

   180     incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);

   181     switch(src) {

   182     case UPROPS_SRC_CHAR:

   183         uchar_addPropertyStarts(&sa, &status);

   184         break;

   185     case UPROPS_SRC_PROPSVEC:

   186         upropsvec_addPropertyStarts(&sa, &status);

   187         break;

   188     case UPROPS_SRC_CHAR_AND_PROPSVEC:

   189         uchar_addPropertyStarts(&sa, &status);

   190         upropsvec_addPropertyStarts(&sa, &status);

   191         break;

   192 #if !UCONFIG_NO_NORMALIZATION

   193     case UPROPS_SRC_CASE_AND_NORM: {

   194         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

   195         if(U_SUCCESS(status)) {

   196             impl->addPropertyStarts(&sa, status);

   197         }

   198         ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

   199         break;

   200     }

   201     case UPROPS_SRC_NFC: {

   202         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

   203         if(U_SUCCESS(status)) {

   204             impl->addPropertyStarts(&sa, status);

   205         }

   206         break;

   207     }

   208     case UPROPS_SRC_NFKC: {

   209         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);

   210         if(U_SUCCESS(status)) {

   211             impl->addPropertyStarts(&sa, status);

   212         }

   213         break;

   214     }

   215     case UPROPS_SRC_NFKC_CF: {

   216         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);

   217         if(U_SUCCESS(status)) {

   218             impl->addPropertyStarts(&sa, status);

   219         }

   220         break;

   221     }

   222     case UPROPS_SRC_NFC_CANON_ITER: {

   223         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);

   224         if(U_SUCCESS(status)) {

   225             impl->addCanonIterPropertyStarts(&sa, status);

   226         }

   227         break;

   228     }

   229 #endif

   230     case UPROPS_SRC_CASE:

   231         ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);

   232         break;

   233     case UPROPS_SRC_BIDI:

   234         ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);

   235         break;

   236     default:

   237         status = U_INTERNAL_PROGRAM_ERROR;

   238         break;

   239     }

   241     if (U_FAILURE(status)) {

   242         delete incl;

   243         incl = NULL;

   244         return;

   245     }

   246     // Compact for caching

   247     incl->compact();

   248     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);

   249 }

   253 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {

   254     U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);

   255     Inclusion &i = gInclusions[src];

   256     umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status);

   257     return i.fSet;

   258 }

   261 // Cache some sets for other services -------------------------------------- ***

   262 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {

   263     U_ASSERT(uni32Singleton == NULL);

   264     uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);

   265     if(uni32Singleton==NULL) {

   266         errorCode=U_MEMORY_ALLOCATION_ERROR;

   267     } else {

   268         uni32Singleton->freeze();

   269     }

   270     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);

   271 }

   274 U_CFUNC UnicodeSet *

   275 uniset_getUnicode32Instance(UErrorCode &errorCode) {

   276     umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);

   277     return uni32Singleton;

   278 }

   280 // helper functions for matching of pattern syntax pieces ------------------ ***

   281 // these functions are parallel to the PERL_OPEN etc. strings above

   283 // using these functions is not only faster than UnicodeString::compare() and

   284 // caseCompare(), but they also make UnicodeSet work for simple patterns when

   285 // no Unicode properties data is available - when caseCompare() fails

   287 static inline UBool

   288 isPerlOpen(const UnicodeString &pattern, int32_t pos) {

   289     UChar c;

   290     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);

   291 }

   293 /*static inline UBool

   294 isPerlClose(const UnicodeString &pattern, int32_t pos) {

   295     return pattern.charAt(pos)==CLOSE_BRACE;

   296 }*/

   298 static inline UBool

   299 isNameOpen(const UnicodeString &pattern, int32_t pos) {

   300     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;

   301 }

   303 static inline UBool

   304 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {

   305     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;

   306 }

   308 /*static inline UBool

   309 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {

   310     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;

   311 }*/

   313 // TODO memory debugging provided inside uniset.cpp

   314 // could be made available here but probably obsolete with use of modern

   315 // memory leak checker tools

   // Expands to nothing in this file, so the _dbgct(this) call in the
   // pattern constructor below has no effect.
   316 #define _dbgct(me)

   318 //----------------------------------------------------------------

   319 // Constructors &c

   320 //----------------------------------------------------------------

   322 /**

   323  * Constructs a set from the given pattern, optionally ignoring

   324  * white space.  See the class description for the syntax of the

   325  * pattern language.

   326  * @param pattern a string specifying what characters are in the set

   327  */

   328 UnicodeSet::UnicodeSet(const UnicodeString& pattern,

   329                        UErrorCode& status) :

   330     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),

   331     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),

   332     fFlags(0)

   333 {

   334     if(U_SUCCESS(status)){

   335         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);

   336         /* test for NULL */

   337         if(list == NULL) {

   338             status = U_MEMORY_ALLOCATION_ERROR;

   339         }else{

   340             allocateStrings(status);

   341             applyPattern(pattern, status);

   342         }

   343     }

   344     _dbgct(this);

   345 }

   347 //----------------------------------------------------------------

   348 // Public API

   349 //----------------------------------------------------------------

   351 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,

   352                                      UErrorCode& status) {

   353     // Equivalent to

   354     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);

   355     // but without dependency on closeOver().

   356     ParsePosition pos(0);

   357     applyPatternIgnoreSpace(pattern, pos, NULL, status);

   358     if (U_FAILURE(status)) return *this;

   360     int32_t i = pos.getIndex();

   361     // Skip over trailing whitespace

   362     ICU_Utility::skipWhitespace(pattern, i, TRUE);

   363     if (i != pattern.length()) {

   364         status = U_ILLEGAL_ARGUMENT_ERROR;

   365     }

   366     return *this;

   367 }

   369 void

   370 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,

   371                                     ParsePosition& pos,

   372                                     const SymbolTable* symbols,

   373                                     UErrorCode& status) {

   374     if (U_FAILURE(status)) {

   375         return;

   376     }

   377     if (isFrozen()) {

   378         status = U_NO_WRITE_PERMISSION;

   379         return;

   380     }

   381     // Need to build the pattern in a temporary string because

   382     // _applyPattern calls add() etc., which set pat to empty.

   383     UnicodeString rebuiltPat;

   384     RuleCharacterIterator chars(pattern, symbols, pos);

   385     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);

   386     if (U_FAILURE(status)) return;

   387     if (chars.inVariable()) {

   388         // syntaxError(chars, "Extra chars in variable value");

   389         status = U_MALFORMED_SET;

   390         return;

   391     }

   392     setPattern(rebuiltPat);

   393 }

   395 /**

   396  * Return true if the given position, in the given pattern, appears

   397  * to be the start of a UnicodeSet pattern.

   398  */

   399 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {

   400     return ((pos+1) < pattern.length() &&

   401             pattern.charAt(pos) == (UChar)91/*[*/) ||

   402         resemblesPropertyPattern(pattern, pos);

   403 }

   405 //----------------------------------------------------------------

   406 // Implementation: Pattern parsing

   407 //----------------------------------------------------------------

   409 /**

   410  * A small all-inline class to manage a UnicodeSet pointer.  Add

   411  * operator->() etc. as needed.

   412  */

   413 class UnicodeSetPointer {

   414     UnicodeSet* p;

   415 public:

   416     inline UnicodeSetPointer() : p(0) {}

   417     inline ~UnicodeSetPointer() { delete p; }

   418     inline UnicodeSet* pointer() { return p; }

   419     inline UBool allocate() {

   420         if (p == 0) {

   421             p = new UnicodeSet();

   422         }

   423         return p != 0;

   424     }

   425 };

   427 /**

   428  * Parse the pattern from the given RuleCharacterIterator.  The

   429  * iterator is advanced over the parsed pattern.

   430  * @param chars iterator over the pattern characters.  Upon return

   431  * it will be advanced to the first character after the parsed

   432  * pattern, or the end of the iteration if all characters are

   433  * parsed.

   434  * @param symbols symbol table to use to parse and dereference

   435  * variables, or null if none.

   436  * @param rebuiltPat the pattern that was parsed, rebuilt or

   437  * copied from the input pattern, as appropriate.

   438  * @param options a bit mask of zero or more of the following:

   439  * IGNORE_SPACE, CASE.

        * @param caseClosure member function invoked on this set, after parsing
        * and before complementing, when options contains USET_CASE_INSENSITIVE
        * or USET_ADD_CASE_MAPPINGS.  It is called without a null check, so it
        * must be non-null whenever one of those option bits is set.
        * @param ec in/out error code.  Nothing is done if it already indicates
        * a failure on entry.  Syntax errors are reported as U_MALFORMED_SET;
        * U_MEMORY_ALLOCATION_ERROR is set if the scratch set cannot be
        * allocated or if this set is bogus at the end of parsing.

   440  */

   441 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,

   442                               const SymbolTable* symbols,

   443                               UnicodeString& rebuiltPat,

   444                               uint32_t options,

   445                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),

   446                               UErrorCode& ec) {

   447     if (U_FAILURE(ec)) return;

   449     // Syntax characters: [ ] ^ - & { }

   451     // Recognized special forms for chars, sets: c-c s-s s&s

   453     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |

   454                    RuleCharacterIterator::PARSE_ESCAPES;

   455     if ((options & USET_IGNORE_SPACE) != 0) {

   456         opts |= RuleCharacterIterator::SKIP_WHITESPACE;

   457     }

           // patLocal collects the pattern text as it is parsed; buf holds the
           // contents of a {multi-character string}.  usePat records whether
           // patLocal must be used for rebuiltPat instead of a generated pattern.
   459     UnicodeString patLocal, buf;

   460     UBool usePat = FALSE;

   461     UnicodeSetPointer scratch;

   462     RuleCharacterIterator::Pos backup;

   464     // mode: 0=before [, 1=between [...], 2=after ]

   465     // lastItem: 0=none, 1=char, 2=set

   466     int8_t lastItem = 0, mode = 0;

   467     UChar32 lastChar = 0;

   468     UChar op = 0;

   470     UBool invert = FALSE;

           // Start from an empty set; everything parsed below is added to it.
   472     clear();

   474     while (mode != 2 && !chars.atEnd()) {

   475         U_ASSERT((lastItem == 0 && op == 0) ||

   476                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||

   477                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||

   478                                     op == INTERSECTION /*'&'*/)));

   480         UChar32 c = 0;

   481         UBool literal = FALSE;

   482         UnicodeSet* nested = 0; // alias - do not delete

   484         // -------- Check for property pattern

   486         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed

   487         int8_t setMode = 0;

   488         if (resemblesPropertyPattern(chars, opts)) {

   489             setMode = 2;

   490         }

   492         // -------- Parse '[' of opening delimiter OR nested set.

   493         // If there is a nested set, use `setMode' to define how

   494         // the set should be parsed.  If the '[' is part of the

   495         // opening delimiter for this pattern, parse special

   496         // strings "[", "[^", "[-", and "[^-".  Check for stand-in

   497         // characters representing a nested set in the symbol

   498         // table.

   500         else {

   501             // Prepare to backup if necessary

   502             chars.getPos(backup);

   503             c = chars.next(opts, literal, ec);

   504             if (U_FAILURE(ec)) return;

   506             if (c == 0x5B /*'['*/ && !literal) {

   507                 if (mode == 1) {

   508                     chars.setPos(backup); // backup

   509                     setMode = 1;

   510                 } else {

   511                     // Handle opening '[' delimiter

   512                     mode = 1;

   513                     patLocal.append((UChar) 0x5B /*'['*/);

   514                     chars.getPos(backup); // prepare to backup

   515                     c = chars.next(opts, literal, ec);

   516                     if (U_FAILURE(ec)) return;

   517                     if (c == 0x5E /*'^'*/ && !literal) {

   518                         invert = TRUE;

   519                         patLocal.append((UChar) 0x5E /*'^'*/);

   520                         chars.getPos(backup); // prepare to backup

   521                         c = chars.next(opts, literal, ec);

   522                         if (U_FAILURE(ec)) return;

   523                     }

   524                     // Fall through to handle special leading '-';

   525                     // otherwise restart loop for nested [], \p{}, etc.

   526                     if (c == HYPHEN /*'-'*/) {

   527                         literal = TRUE;

   528                         // Fall through to handle literal '-' below

   529                     } else {

   530                         chars.setPos(backup); // backup

   531                         continue;

   532                     }

   533                 }

   534             } else if (symbols != 0) {

   535                 const UnicodeFunctor *m = symbols->lookupMatcher(c);

   536                 if (m != 0) {

   537                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);

   538                     if (ms == NULL) {

   539                         ec = U_MALFORMED_SET;

   540                         return;

   541                     }

   542                     // casting away const, but `nested' won't be modified

   543                     // (important not to modify stored set)

   544                     nested = const_cast<UnicodeSet*>(ms);

   545                     setMode = 3;

   546                 }

   547             }

   548         }

   550         // -------- Handle a nested set.  This either is inline in

   551         // the pattern or represented by a stand-in that has

   552         // previously been parsed and was looked up in the symbol

   553         // table.

   555         if (setMode != 0) {

                   // A pending single character is flushed into the set before
                   // the nested set is handled; "char operator set" is an error.
   556             if (lastItem == 1) {

   557                 if (op != 0) {

   558                     // syntaxError(chars, "Char expected after operator");

   559                     ec = U_MALFORMED_SET;

   560                     return;

   561                 }

   562                 add(lastChar, lastChar);

   563                 _appendToPat(patLocal, lastChar, FALSE);

   564                 lastItem = 0;

   565                 op = 0;

   566             }

   568             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {

   569                 patLocal.append(op);

   570             }

   572             if (nested == 0) {

   573                 // lazy allocation

   574                 if (!scratch.allocate()) {

   575                     ec = U_MEMORY_ALLOCATION_ERROR;

   576                     return;

   577                 }

   578                 nested = scratch.pointer();

   579             }

   580             switch (setMode) {

   581             case 1:

                       // Recursive parse of an inline nested set; its text is
                       // appended directly to patLocal.
                       // NOTE(review): unlike case 2, ec is not tested right
                       // after this call -- confirm that relying on the later
                       // U_FAILURE checks is intended.
   582                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);

   583                 break;

   584             case 2:

   585                 chars.skipIgnored(opts);

   586                 nested->applyPropertyPattern(chars, patLocal, ec);

   587                 if (U_FAILURE(ec)) return;

   588                 break;

   589             case 3: // `nested' already parsed

   590                 nested->_toPattern(patLocal, FALSE);

   591                 break;

   592             }

   594             usePat = TRUE;

   596             if (mode == 0) {

   597                 // Entire pattern is a category; leave parse loop

   598                 *this = *nested;

   599                 mode = 2;

   600                 break;

   601             }

                   // Combine the nested set with what has been built so far,
                   // according to the pending operator (none means union).
   603             switch (op) {

   604             case HYPHEN: /*'-'*/

   605                 removeAll(*nested);

   606                 break;

   607             case INTERSECTION: /*'&'*/

   608                 retainAll(*nested);

   609                 break;

   610             case 0:

   611                 addAll(*nested);

   612                 break;

   613             }

   615             op = 0;

   616             lastItem = 2;

   618             continue;

   619         }

   621         if (mode == 0) {

   622             // syntaxError(chars, "Missing '['");

   623             ec = U_MALFORMED_SET;

   624             return;

   625         }

   627         // -------- Parse special (syntax) characters.  If the

   628         // current character is not special, or if it is escaped,

   629         // then fall through and handle it below.

   631         if (!literal) {

   632             switch (c) {

   633             case 0x5D /*']'*/:

   634                 if (lastItem == 1) {

   635                     add(lastChar, lastChar);

   636                     _appendToPat(patLocal, lastChar, FALSE);

   637                 }

   638                 // Treat final trailing '-' as a literal

   639                 if (op == HYPHEN /*'-'*/) {

   640                     add(op, op);

   641                     patLocal.append(op);

   642                 } else if (op == INTERSECTION /*'&'*/) {

   643                     // syntaxError(chars, "Trailing '&'");

   644                     ec = U_MALFORMED_SET;

   645                     return;

   646                 }

   647                 patLocal.append((UChar) 0x5D /*']'*/);

   648                 mode = 2;

   649                 continue;

   650             case HYPHEN /*'-'*/:

   651                 if (op == 0) {

   652                     if (lastItem != 0) {

   653                         op = (UChar) c;

   654                         continue;

   655                     } else {

   656                         // Treat final trailing '-' as a literal

                               // (accepted only if the next character is the
                               // closing ']'; anything else falls out to the
                               // error below)
   657                         add(c, c);

   658                         c = chars.next(opts, literal, ec);

   659                         if (U_FAILURE(ec)) return;

   660                         if (c == 0x5D /*']'*/ && !literal) {

   661                             patLocal.append(HYPHEN_RIGHT_BRACE, 2);

   662                             mode = 2;

   663                             continue;

   664                         }

   665                     }

   666                 }

   667                 // syntaxError(chars, "'-' not after char or set");

   668                 ec = U_MALFORMED_SET;

   669                 return;

   670             case INTERSECTION /*'&'*/:

   671                 if (lastItem == 2 && op == 0) {

   672                     op = (UChar) c;

   673                     continue;

   674                 }

   675                 // syntaxError(chars, "'&' not after set");

   676                 ec = U_MALFORMED_SET;

   677                 return;

   678             case 0x5E /*'^'*/:

   679                 // syntaxError(chars, "'^' not after '['");

   680                 ec = U_MALFORMED_SET;

   681                 return;

   682             case 0x7B /*'{'*/:

   683                 if (op != 0) {

   684                     // syntaxError(chars, "Missing operand after operator");

   685                     ec = U_MALFORMED_SET;

   686                     return;

   687                 }

   688                 if (lastItem == 1) {

   689                     add(lastChar, lastChar);

   690                     _appendToPat(patLocal, lastChar, FALSE);

   691                 }

   692                 lastItem = 0;

   693                 buf.truncate(0);

   694                 {

                           // Collect everything up to the matching unescaped
                           // '}'; an empty or unterminated string is an error.
   695                     UBool ok = FALSE;

   696                     while (!chars.atEnd()) {

   697                         c = chars.next(opts, literal, ec);

   698                         if (U_FAILURE(ec)) return;

   699                         if (c == 0x7D /*'}'*/ && !literal) {

   700                             ok = TRUE;

   701                             break;

   702                         }

   703                         buf.append(c);

   704                     }

   705                     if (buf.length() < 1 || !ok) {

   706                         // syntaxError(chars, "Invalid multicharacter string");

   707                         ec = U_MALFORMED_SET;

   708                         return;

   709                     }

   710                 }

   711                 // We have new string. Add it to set and continue;

   712                 // we don't need to drop through to the further

   713                 // processing

   714                 add(buf);

   715                 patLocal.append((UChar) 0x7B /*'{'*/);

   716                 _appendToPat(patLocal, buf, FALSE);

   717                 patLocal.append((UChar) 0x7D /*'}'*/);

   718                 continue;

   719             case SymbolTable::SYMBOL_REF:

   720                 //         symbols  nosymbols

   721                 // [a-$]   error    error (ambiguous)

   722                 // [a$]    anchor   anchor

   723                 // [a-$x]  var "x"* literal '$'

   724                 // [a-$.]  error    literal '$'

   725                 // *We won't get here in the case of var "x"

   726                 {

   727                     chars.getPos(backup);

   728                     c = chars.next(opts, literal, ec);

   729                     if (U_FAILURE(ec)) return;

   730                     UBool anchor = (c == 0x5D /*']'*/ && !literal);

   731                     if (symbols == 0 && !anchor) {

   732                         c = SymbolTable::SYMBOL_REF;

   733                         chars.setPos(backup);

   734                         break; // literal '$'

   735                     }

   736                     if (anchor && op == 0) {

   737                         if (lastItem == 1) {

   738                             add(lastChar, lastChar);

   739                             _appendToPat(patLocal, lastChar, FALSE);

   740                         }

   741                         add(U_ETHER);

   742                         usePat = TRUE;

   743                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);

   744                         patLocal.append((UChar) 0x5D /*']'*/);

   745                         mode = 2;

   746                         continue;

   747                     }

   748                     // syntaxError(chars, "Unquoted '$'");

   749                     ec = U_MALFORMED_SET;

   750                     return;

   751                 }

   752             default:

   753                 break;

   754             }

   755         }

   757         // -------- Parse literal characters.  This includes both

   758         // escaped chars ("\u4E01") and non-syntax characters

   759         // ("a").

   761         switch (lastItem) {

   762         case 0:

   763             lastItem = 1;

   764             lastChar = c;

   765             break;

   766         case 1:

   767             if (op == HYPHEN /*'-'*/) {

   768                 if (lastChar >= c) {

   769                     // Don't allow redundant (a-a) or empty (b-a) ranges;

   770                     // these are most likely typos.

   771                     // syntaxError(chars, "Invalid range");

   772                     ec = U_MALFORMED_SET;

   773                     return;

   774                 }

   775                 add(lastChar, c);

   776                 _appendToPat(patLocal, lastChar, FALSE);

   777                 patLocal.append(op);

   778                 _appendToPat(patLocal, c, FALSE);

   779                 lastItem = 0;

   780                 op = 0;

   781             } else {

   782                 add(lastChar, lastChar);

   783                 _appendToPat(patLocal, lastChar, FALSE);

   784                 lastChar = c;

   785             }

   786             break;

   787         case 2:

   788             if (op != 0) {

   789                 // syntaxError(chars, "Set expected after operator");

   790                 ec = U_MALFORMED_SET;

   791                 return;

   792             }

   793             lastChar = c;

   794             lastItem = 1;

   795             break;

   796         }

   797     }

   799     if (mode != 2) {

   800         // syntaxError(chars, "Missing ']'");

   801         ec = U_MALFORMED_SET;

   802         return;

   803     }

   805     chars.skipIgnored(opts);

   807     /**

   808      * Handle global flags (invert, case insensitivity).  If this

   809      * pattern should be compiled case-insensitive, then we need

   810      * to close over case BEFORE COMPLEMENTING.  This makes

   811      * patterns like /[^abc]/i work.

   812      */

           // NOTE: caseClosure is dereferenced here without a null check; the
           // applyPatternIgnoreSpace() caller passes NULL together with options
           // that contain neither case bit.
   813     if ((options & USET_CASE_INSENSITIVE) != 0) {

   814         (this->*caseClosure)(USET_CASE_INSENSITIVE);

   815     }

   816     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {

   817         (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);

   818     }

   819     if (invert) {

   820         complement();

   821     }

   823     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the

   824     // generated pattern.

   825     if (usePat) {

   826         rebuiltPat.append(patLocal);

   827     } else {

   828         _generatePattern(rebuiltPat, FALSE);

   829     }

   830     if (isBogus() && U_SUCCESS(ec)) {

   831         // We likely ran out of memory. AHHH!

   832         ec = U_MEMORY_ALLOCATION_ERROR;

   833     }

   834 }

   836 //----------------------------------------------------------------

   837 // Property set implementation

   838 //----------------------------------------------------------------

   840 static UBool numericValueFilter(UChar32 ch, void* context) {

   841     return u_getNumericValue(ch) == *(double*)context;

   842 }

   844 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {

   845     int32_t value = *(int32_t*)context;

   846     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;

   847 }

   849 static UBool versionFilter(UChar32 ch, void* context) {

   850     static const UVersionInfo none = { 0, 0, 0, 0 };

   851     UVersionInfo v;

   852     u_charAge(ch, v);

   853     UVersionInfo* version = (UVersionInfo*)context;

   854     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;

   855 }

   857 typedef struct {

   858     UProperty prop;

   859     int32_t value;

   860 } IntPropertyContext;

   862 static UBool intPropertyFilter(UChar32 ch, void* context) {

   863     IntPropertyContext* c = (IntPropertyContext*)context;

   864     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;

   865 }

   867 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {

   868     return uscript_hasScript(ch, *(UScriptCode*)context);

   869 }

   871 /**

   872  * Generic filter-based scanning code for UCD property UnicodeSets.

   873  */

   874 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,

   875                              void* context,

   876                              int32_t src,

   877                              UErrorCode &status) {

   878     if (U_FAILURE(status)) return;

   880     // Logically, walk through all Unicode characters, noting the start

   881     // and end of each range for which filter.contain(c) is

   882     // true.  Add each range to a set.

   883     //

   884     // To improve performance, use an inclusions set which

   885     // encodes information about character ranges that are known

   886     // to have identical properties.

   887     // getInclusions(src) contains exactly the first characters of

   888     // same-value ranges for the given properties "source".

   889     const UnicodeSet* inclusions = getInclusions(src, status);

   890     if (U_FAILURE(status)) {

   891         return;

   892     }

   894     clear();

   896     UChar32 startHasProperty = -1;

   897     int32_t limitRange = inclusions->getRangeCount();

   899     for (int j=0; j<limitRange; ++j) {

   900         // get current range

   901         UChar32 start = inclusions->getRangeStart(j);

   902         UChar32 end = inclusions->getRangeEnd(j);

   904         // for all the code points in the range, process

   905         for (UChar32 ch = start; ch <= end; ++ch) {

   906             // only add to this UnicodeSet on inflection points --

   907             // where the hasProperty value changes to false

   908             if ((*filter)(ch, context)) {

   909                 if (startHasProperty < 0) {

   910                     startHasProperty = ch;

   911                 }

   912             } else if (startHasProperty >= 0) {

   913                 add(startHasProperty, ch-1);

   914                 startHasProperty = -1;

   915             }

   916         }

   917     }

   918     if (startHasProperty >= 0) {

   919         add((UChar32)startHasProperty, (UChar32)0x10FFFF);

   920     }

   921     if (isBogus() && U_SUCCESS(status)) {

   922         // We likely ran out of memory. AHHH!

   923         status = U_MEMORY_ALLOCATION_ERROR;

   924     }

   925 }

   927 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {

   928     /* Note: we use ' ' in compiler code page */

   929     int32_t j = 0;

   930     char ch;

   931     --dstCapacity; /* make room for term. zero */

   932     while ((ch = *src++) != 0) {

   933         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {

   934             continue;

   935         }

   936         if (j >= dstCapacity) return FALSE;

   937         dst[j++] = ch;

   938     }

   939     if (j > 0 && dst[j-1] == ' ') --j;

   940     dst[j] = 0;

   941     return TRUE;

   942 }

   944 //----------------------------------------------------------------

   945 // Property set API

   946 //----------------------------------------------------------------

   // Error exit for UnicodeSet member functions that return UnicodeSet&:
   // stores U_ILLEGAL_ARGUMENT_ERROR in the given error code and returns *this.
   #define FAIL(ec) { (ec) = U_ILLEGAL_ARGUMENT_ERROR; return *this; }

   950 UnicodeSet&

   951 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {

   952     if (U_FAILURE(ec) || isFrozen()) return *this;

   954     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {

   955         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);

   956     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {

   957         UScriptCode script = (UScriptCode)value;

   958         applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);

   959     } else {

   960         IntPropertyContext c = {prop, value};

   961         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);

   962     }

   963     return *this;

   964 }

   966 UnicodeSet&

   967 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,

   968                                const UnicodeString& value,

   969                                UErrorCode& ec) {

   970     if (U_FAILURE(ec) || isFrozen()) return *this;

   972     // prop and value used to be converted to char * using the default

   973     // converter instead of the invariant conversion.

   974     // This should not be necessary because all Unicode property and value

   975     // names use only invariant characters.

   976     // If there are any variant characters, then we won't find them anyway.

   977     // Checking first avoids assertion failures in the conversion.

   978     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||

   979         !uprv_isInvariantUString(value.getBuffer(), value.length())

   980     ) {

   981         FAIL(ec);

   982     }

   983     CharString pname, vname;

   984     pname.appendInvariantChars(prop, ec);

   985     vname.appendInvariantChars(value, ec);

   986     if (U_FAILURE(ec)) return *this;

   988     UProperty p;

   989     int32_t v;

   990     UBool mustNotBeEmpty = FALSE, invert = FALSE;

   992     if (value.length() > 0) {

   993         p = u_getPropertyEnum(pname.data());

   994         if (p == UCHAR_INVALID_CODE) FAIL(ec);

   996         // Treat gc as gcm

   997         if (p == UCHAR_GENERAL_CATEGORY) {

   998             p = UCHAR_GENERAL_CATEGORY_MASK;

   999         }

  1001         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||

  1002             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||

  1003             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {

  1004             v = u_getPropertyValueEnum(p, vname.data());

  1005             if (v == UCHAR_INVALID_CODE) {

  1006                 // Handle numeric CCC

  1007                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||

  1008                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||

  1009                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {

  1010                     char* end;

  1011                     double value = uprv_strtod(vname.data(), &end);

  1012                     v = (int32_t) value;

  1013                     if (v != value || v < 0 || *end != 0) {

  1014                         // non-integral or negative value, or trailing junk

  1015                         FAIL(ec);

  1016                     }

  1017                     // If the resultant set is empty then the numeric value

  1018                     // was invalid.

  1019                     mustNotBeEmpty = TRUE;

  1020                 } else {

  1021                     FAIL(ec);

  1022                 }

  1023             }

  1024         }

  1026         else {

  1028             switch (p) {

  1029             case UCHAR_NUMERIC_VALUE:

  1030                 {

  1031                     char* end;

  1032                     double value = uprv_strtod(vname.data(), &end);

  1033                     if (*end != 0) {

  1034                         FAIL(ec);

  1035                     }

  1036                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);

  1037                     return *this;

  1038                 }

  1039             case UCHAR_NAME:

  1040                 {

  1041                     // Must munge name, since u_charFromName() does not do

  1042                     // 'loose' matching.

  1043                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength

  1044                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);

  1045                     UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);

  1046                     if (U_SUCCESS(ec)) {

  1047                         clear();

  1048                         add(ch);

  1049                         return *this;

  1050                     } else {

  1051                         FAIL(ec);

  1052                     }

  1053                 }

  1054             case UCHAR_UNICODE_1_NAME:

  1055                 // ICU 49 deprecates the Unicode_1_Name property APIs.

  1056                 FAIL(ec);

  1057             case UCHAR_AGE:

  1058                 {

  1059                     // Must munge name, since u_versionFromString() does not do

  1060                     // 'loose' matching.

  1061                     char buf[128];

  1062                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);

  1063                     UVersionInfo version;

  1064                     u_versionFromString(version, buf);

  1065                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);

  1066                     return *this;

  1067                 }

  1068             case UCHAR_SCRIPT_EXTENSIONS:

  1069                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());

  1070                 if (v == UCHAR_INVALID_CODE) {

  1071                     FAIL(ec);

  1072                 }

  1073                 // fall through to calling applyIntPropertyValue()

  1074                 break;

  1075             default:

  1076                 // p is a non-binary, non-enumerated property that we

  1077                 // don't support (yet).

  1078                 FAIL(ec);

  1079             }

  1080         }

  1081     }

  1083     else {

  1084         // value is empty.  Interpret as General Category, Script, or

  1085         // Binary property.

  1086         p = UCHAR_GENERAL_CATEGORY_MASK;

  1087         v = u_getPropertyValueEnum(p, pname.data());

  1088         if (v == UCHAR_INVALID_CODE) {

  1089             p = UCHAR_SCRIPT;

  1090             v = u_getPropertyValueEnum(p, pname.data());

  1091             if (v == UCHAR_INVALID_CODE) {

  1092                 p = u_getPropertyEnum(pname.data());

  1093                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {

  1094                     v = 1;

  1095                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {

  1096                     set(MIN_VALUE, MAX_VALUE);

  1097                     return *this;

  1098                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {

  1099                     set(0, 0x7F);

  1100                     return *this;

  1101                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {

  1102                     // [:Assigned:]=[:^Cn:]

  1103                     p = UCHAR_GENERAL_CATEGORY_MASK;

  1104                     v = U_GC_CN_MASK;

  1105                     invert = TRUE;

  1106                 } else {

  1107                     FAIL(ec);

  1108                 }

  1109             }

  1110         }

  1111     }

  1113     applyIntPropertyValue(p, v, ec);

  1114     if(invert) {

  1115         complement();

  1116     }

  1118     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {

  1119         // mustNotBeEmpty is set to true if an empty set indicates

  1120         // invalid input.

  1121         ec = U_ILLEGAL_ARGUMENT_ERROR;

  1122     }

  1124     if (isBogus() && U_SUCCESS(ec)) {

  1125         // We likely ran out of memory. AHHH!

  1126         ec = U_MEMORY_ALLOCATION_ERROR;

  1127     }

  1128     return *this;

  1129 }

  1131 //----------------------------------------------------------------

  1132 // Property set patterns

  1133 //----------------------------------------------------------------

  1135 /**

  1136  * Return true if the given position, in the given pattern, appears

  1137  * to be the start of a property set pattern.

  1138  */

  1139 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,

  1140                                            int32_t pos) {

  1141     // Patterns are at least 5 characters long

  1142     if ((pos+5) > pattern.length()) {

  1143         return FALSE;

  1144     }

  1146     // Look for an opening [:, [:^, \p, or \P

  1147     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);

  1148 }

  1150 /**

  1151  * Return true if the given iterator appears to point at a

  1152  * property pattern.  Regardless of the result, return with the

  1153  * iterator unchanged.

  1154  * @param chars iterator over the pattern characters.  Upon return

  1155  * it will be unchanged.

  1156  * @param iterOpts RuleCharacterIterator options

  1157  */

  1158 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,

  1159                                            int32_t iterOpts) {

  1160     // NOTE: literal will always be FALSE, because we don't parse escapes.

  1161     UBool result = FALSE, literal;

  1162     UErrorCode ec = U_ZERO_ERROR;

  1163     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;

  1164     RuleCharacterIterator::Pos pos;

  1165     chars.getPos(pos);

  1166     UChar32 c = chars.next(iterOpts, literal, ec);

  1167     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {

  1168         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,

  1169                                literal, ec);

  1170         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :

  1171                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);

  1172     }

  1173     chars.setPos(pos);

  1174     return result && U_SUCCESS(ec);

  1175 }

  1177 /**

  1178  * Parse the given property pattern at the given parse position.

  1179  */

  1180 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,

  1181                                              ParsePosition& ppos,

  1182                                              UErrorCode &ec) {

  1183     int32_t pos = ppos.getIndex();

  1185     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}

  1186     UBool isName = FALSE; // true for \N{pat}, o/w false

  1187     UBool invert = FALSE;

  1189     if (U_FAILURE(ec)) return *this;

  1191     // Minimum length is 5 characters, e.g. \p{L}

  1192     if ((pos+5) > pattern.length()) {

  1193         FAIL(ec);

  1194     }

  1196     // On entry, ppos should point to one of the following locations:

  1197     // Look for an opening [:, [:^, \p, or \P

  1198     if (isPOSIXOpen(pattern, pos)) {

  1199         posix = TRUE;

  1200         pos += 2;

  1201         pos = ICU_Utility::skipWhitespace(pattern, pos);

  1202         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {

  1203             ++pos;

  1204             invert = TRUE;

  1205         }

  1206     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {

  1207         UChar c = pattern.charAt(pos+1);

  1208         invert = (c == UPPER_P);

  1209         isName = (c == UPPER_N);

  1210         pos += 2;

  1211         pos = ICU_Utility::skipWhitespace(pattern, pos);

  1212         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {

  1213             // Syntax error; "\p" or "\P" not followed by "{"

  1214             FAIL(ec);

  1215         }

  1216     } else {

  1217         // Open delimiter not seen

  1218         FAIL(ec);

  1219     }

  1221     // Look for the matching close delimiter, either :] or }

  1222     int32_t close;

  1223     if (posix) {

  1224       close = pattern.indexOf(POSIX_CLOSE, 2, pos);

  1225     } else {

  1226       close = pattern.indexOf(CLOSE_BRACE, pos);

  1227     }

  1228     if (close < 0) {

  1229         // Syntax error; close delimiter missing

  1230         FAIL(ec);

  1231     }

  1233     // Look for an '=' sign.  If this is present, we will parse a

  1234     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}

  1235     // pattern.

  1236     int32_t equals = pattern.indexOf(EQUALS, pos);

  1237     UnicodeString propName, valueName;

  1238     if (equals >= 0 && equals < close && !isName) {

  1239         // Equals seen; parse medium/long pattern

  1240         pattern.extractBetween(pos, equals, propName);

  1241         pattern.extractBetween(equals+1, close, valueName);

  1242     }

  1244     else {

  1245         // Handle case where no '=' is seen, and \N{}

  1246         pattern.extractBetween(pos, close, propName);

  1248         // Handle \N{name}

  1249         if (isName) {

  1250             // This is a little inefficient since it means we have to

  1251             // parse NAME_PROP back to UCHAR_NAME even though we already

  1252             // know it's UCHAR_NAME.  If we refactor the API to

  1253             // support args of (UProperty, char*) then we can remove

  1254             // NAME_PROP and make this a little more efficient.

  1255             valueName = propName;

  1256             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);

  1257         }

  1258     }

  1260     applyPropertyAlias(propName, valueName, ec);

  1262     if (U_SUCCESS(ec)) {

  1263         if (invert) {

  1264             complement();

  1265         }

  1267         // Move to the limit position after the close delimiter if the

  1268         // parse succeeded.

  1269         ppos.setIndex(close + (posix ? 2 : 1));

  1270     }

  1272     return *this;

  1273 }

  1275 /**

  1276  * Parse a property pattern.

  1277  * @param chars iterator over the pattern characters.  Upon return

  1278  * it will be advanced to the first character after the parsed

  1279  * pattern, or the end of the iteration if all characters are

  1280  * parsed.

  1281  * @param rebuiltPat the pattern that was parsed, rebuilt or

  1282  * copied from the input pattern, as appropriate.

  1283  */

  1284 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,

  1285                                       UnicodeString& rebuiltPat,

  1286                                       UErrorCode& ec) {

  1287     if (U_FAILURE(ec)) return;

  1288     UnicodeString pattern;

  1289     chars.lookahead(pattern);

  1290     ParsePosition pos(0);

  1291     applyPropertyPattern(pattern, pos, ec);

  1292     if (U_FAILURE(ec)) return;

  1293     if (pos.getIndex() == 0) {

  1294         // syntaxError(chars, "Invalid property pattern");

  1295         ec = U_MALFORMED_SET;

  1296         return;

  1297     }

  1298     chars.jumpahead(pos.getIndex());

  1299     rebuiltPat.append(pattern, 0, pos.getIndex());

  1300 }

  1302 U_NAMESPACE_END

The Tor Browser / file revision

intl/icu/source/common/uniset_props.cpp@fc2d59ddac77

intl/icu/source/common/uniset_props.cpp