|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1999-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: uniset_props.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2004aug25 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Character property dependent functions moved here from uniset.cpp |
|
17 */ |
|
18 |
|
19 #include "unicode/utypes.h" |
|
20 #include "unicode/uniset.h" |
|
21 #include "unicode/parsepos.h" |
|
22 #include "unicode/uchar.h" |
|
23 #include "unicode/uscript.h" |
|
24 #include "unicode/symtable.h" |
|
25 #include "unicode/uset.h" |
|
26 #include "unicode/locid.h" |
|
27 #include "unicode/brkiter.h" |
|
28 #include "uset_imp.h" |
|
29 #include "ruleiter.h" |
|
30 #include "cmemory.h" |
|
31 #include "ucln_cmn.h" |
|
32 #include "util.h" |
|
33 #include "uvector.h" |
|
34 #include "uprops.h" |
|
35 #include "propname.h" |
|
36 #include "normalizer2impl.h" |
|
37 #include "ucase.h" |
|
38 #include "ubidi_props.h" |
|
39 #include "uinvchar.h" |
|
40 #include "uprops.h" |
|
41 #include "charstr.h" |
|
42 #include "cstring.h" |
|
43 #include "mutex.h" |
|
44 #include "umutex.h" |
|
45 #include "uassert.h" |
|
46 #include "hash.h" |
|
47 |
|
48 U_NAMESPACE_USE |
|
49 |
|
50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
|
51 |
|
52 // initial storage. Must be >= 0 |
|
53 // *** same as in uniset.cpp ! *** |
|
54 #define START_EXTRA 16 |
|
55 |
|
56 // Define UChar constants using hex for EBCDIC compatibility |
|
57 // Used #define to reduce private static exports and memory access time. |
|
// Pattern syntax characters, all spelled as four-digit hex code units.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
|
73 |
|
74 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" |
|
75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" |
|
76 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" |
|
77 //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" |
|
78 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" |
|
79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ |
|
80 |
|
81 // Special property set IDs |
|
82 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] |
|
83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F] |
|
84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:] |
|
85 |
|
86 // Unicode name property alias |
|
87 #define NAME_PROP "na" |
|
88 #define NAME_PROP_LENGTH 2 |
|
89 |
|
90 /** |
|
91 * Delimiter string used in patterns to close a category reference: |
|
92 * ":]". Example: "[:Lu:]". |
|
93 */ |
|
94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ |
|
95 |
|
96 // Cached sets ------------------------------------------------------------- *** |
|
97 |
|
98 U_CDECL_BEGIN |
|
99 static UBool U_CALLCONV uset_cleanup(); |
|
100 |
|
101 struct Inclusion { |
|
102 UnicodeSet *fSet; |
|
103 UInitOnce fInitOnce; |
|
104 }; |
|
105 static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() |
|
106 |
|
107 static UnicodeSet *uni32Singleton; |
|
108 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; |
|
109 |
|
110 //---------------------------------------------------------------- |
|
111 // Inclusions list |
|
112 //---------------------------------------------------------------- |
|
113 |
|
114 // USetAdder implementation |
|
115 // Does not use uset.h to reduce code dependencies |
|
116 static void U_CALLCONV |
|
117 _set_add(USet *set, UChar32 c) { |
|
118 ((UnicodeSet *)set)->add(c); |
|
119 } |
|
120 |
|
121 static void U_CALLCONV |
|
122 _set_addRange(USet *set, UChar32 start, UChar32 end) { |
|
123 ((UnicodeSet *)set)->add(start, end); |
|
124 } |
|
125 |
|
126 static void U_CALLCONV |
|
127 _set_addString(USet *set, const UChar *str, int32_t length) { |
|
128 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); |
|
129 } |
|
130 |
|
131 /** |
|
132 * Cleanup function for UnicodeSet |
|
133 */ |
|
134 static UBool U_CALLCONV uset_cleanup(void) { |
|
135 for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { |
|
136 Inclusion &in = gInclusions[i]; |
|
137 delete in.fSet; |
|
138 in.fSet = NULL; |
|
139 in.fInitOnce.reset(); |
|
140 } |
|
141 |
|
142 delete uni32Singleton; |
|
143 uni32Singleton = NULL; |
|
144 uni32InitOnce.reset(); |
|
145 return TRUE; |
|
146 } |
|
147 |
|
148 U_CDECL_END |
|
149 |
|
150 U_NAMESPACE_BEGIN |
|
151 |
|
152 /* |
|
153 Reduce excessive reallocation, and make it easier to detect initialization problems. |
|
154 Usually you don't see smaller sets than this for Unicode 5.0. |
|
155 */ |
|
156 #define DEFAULT_INCLUSION_CAPACITY 3072 |
|
157 |
|
158 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { |
|
159 // This function is invoked only via umtx_initOnce(). |
|
160 // This function is a friend of class UnicodeSet. |
|
161 |
|
162 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); |
|
163 UnicodeSet * &incl = gInclusions[src].fSet; |
|
164 U_ASSERT(incl == NULL); |
|
165 |
|
166 incl = new UnicodeSet(); |
|
167 if (incl == NULL) { |
|
168 status = U_MEMORY_ALLOCATION_ERROR; |
|
169 return; |
|
170 } |
|
171 USetAdder sa = { |
|
172 (USet *)incl, |
|
173 _set_add, |
|
174 _set_addRange, |
|
175 _set_addString, |
|
176 NULL, // don't need remove() |
|
177 NULL // don't need removeRange() |
|
178 }; |
|
179 |
|
180 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); |
|
181 switch(src) { |
|
182 case UPROPS_SRC_CHAR: |
|
183 uchar_addPropertyStarts(&sa, &status); |
|
184 break; |
|
185 case UPROPS_SRC_PROPSVEC: |
|
186 upropsvec_addPropertyStarts(&sa, &status); |
|
187 break; |
|
188 case UPROPS_SRC_CHAR_AND_PROPSVEC: |
|
189 uchar_addPropertyStarts(&sa, &status); |
|
190 upropsvec_addPropertyStarts(&sa, &status); |
|
191 break; |
|
192 #if !UCONFIG_NO_NORMALIZATION |
|
193 case UPROPS_SRC_CASE_AND_NORM: { |
|
194 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
|
195 if(U_SUCCESS(status)) { |
|
196 impl->addPropertyStarts(&sa, status); |
|
197 } |
|
198 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
|
199 break; |
|
200 } |
|
201 case UPROPS_SRC_NFC: { |
|
202 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
|
203 if(U_SUCCESS(status)) { |
|
204 impl->addPropertyStarts(&sa, status); |
|
205 } |
|
206 break; |
|
207 } |
|
208 case UPROPS_SRC_NFKC: { |
|
209 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); |
|
210 if(U_SUCCESS(status)) { |
|
211 impl->addPropertyStarts(&sa, status); |
|
212 } |
|
213 break; |
|
214 } |
|
215 case UPROPS_SRC_NFKC_CF: { |
|
216 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); |
|
217 if(U_SUCCESS(status)) { |
|
218 impl->addPropertyStarts(&sa, status); |
|
219 } |
|
220 break; |
|
221 } |
|
222 case UPROPS_SRC_NFC_CANON_ITER: { |
|
223 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); |
|
224 if(U_SUCCESS(status)) { |
|
225 impl->addCanonIterPropertyStarts(&sa, status); |
|
226 } |
|
227 break; |
|
228 } |
|
229 #endif |
|
230 case UPROPS_SRC_CASE: |
|
231 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); |
|
232 break; |
|
233 case UPROPS_SRC_BIDI: |
|
234 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); |
|
235 break; |
|
236 default: |
|
237 status = U_INTERNAL_PROGRAM_ERROR; |
|
238 break; |
|
239 } |
|
240 |
|
241 if (U_FAILURE(status)) { |
|
242 delete incl; |
|
243 incl = NULL; |
|
244 return; |
|
245 } |
|
246 // Compact for caching |
|
247 incl->compact(); |
|
248 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
|
249 } |
|
250 |
|
251 |
|
252 |
|
253 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { |
|
254 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); |
|
255 Inclusion &i = gInclusions[src]; |
|
256 umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status); |
|
257 return i.fSet; |
|
258 } |
|
259 |
|
260 |
|
261 // Cache some sets for other services -------------------------------------- *** |
|
262 void U_CALLCONV createUni32Set(UErrorCode &errorCode) { |
|
263 U_ASSERT(uni32Singleton == NULL); |
|
264 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); |
|
265 if(uni32Singleton==NULL) { |
|
266 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
267 } else { |
|
268 uni32Singleton->freeze(); |
|
269 } |
|
270 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
|
271 } |
|
272 |
|
273 |
|
274 U_CFUNC UnicodeSet * |
|
275 uniset_getUnicode32Instance(UErrorCode &errorCode) { |
|
276 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); |
|
277 return uni32Singleton; |
|
278 } |
|
279 |
|
280 // helper functions for matching of pattern syntax pieces ------------------ *** |
|
281 // these functions are parallel to the PERL_OPEN etc. strings above |
|
282 |
|
283 // using these functions is not only faster than UnicodeString::compare() and |
|
284 // caseCompare(), but they also make UnicodeSet work for simple patterns when |
|
285 // no Unicode properties data is available - when caseCompare() fails |
|
286 |
|
287 static inline UBool |
|
288 isPerlOpen(const UnicodeString &pattern, int32_t pos) { |
|
289 UChar c; |
|
290 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); |
|
291 } |
|
292 |
|
293 /*static inline UBool |
|
294 isPerlClose(const UnicodeString &pattern, int32_t pos) { |
|
295 return pattern.charAt(pos)==CLOSE_BRACE; |
|
296 }*/ |
|
297 |
|
298 static inline UBool |
|
299 isNameOpen(const UnicodeString &pattern, int32_t pos) { |
|
300 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; |
|
301 } |
|
302 |
|
303 static inline UBool |
|
304 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { |
|
305 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; |
|
306 } |
|
307 |
|
308 /*static inline UBool |
|
309 isPOSIXClose(const UnicodeString &pattern, int32_t pos) { |
|
310 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; |
|
311 }*/ |
|
312 |
|
313 // TODO memory debugging provided inside uniset.cpp |
|
314 // could be made available here but probably obsolete with use of modern |
|
315 // memory leak checker tools |
|
316 #define _dbgct(me) |
|
317 |
|
318 //---------------------------------------------------------------- |
|
319 // Constructors &c |
|
320 //---------------------------------------------------------------- |
|
321 |
|
322 /** |
|
323 * Constructs a set from the given pattern, optionally ignoring |
|
324 * white space. See the class description for the syntax of the |
|
325 * pattern language. |
|
326 * @param pattern a string specifying what characters are in the set |
|
327 */ |
|
328 UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
|
329 UErrorCode& status) : |
|
330 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
|
331 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
|
332 fFlags(0) |
|
333 { |
|
334 if(U_SUCCESS(status)){ |
|
335 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
|
336 /* test for NULL */ |
|
337 if(list == NULL) { |
|
338 status = U_MEMORY_ALLOCATION_ERROR; |
|
339 }else{ |
|
340 allocateStrings(status); |
|
341 applyPattern(pattern, status); |
|
342 } |
|
343 } |
|
344 _dbgct(this); |
|
345 } |
|
346 |
|
347 //---------------------------------------------------------------- |
|
348 // Public API |
|
349 //---------------------------------------------------------------- |
|
350 |
|
351 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
|
352 UErrorCode& status) { |
|
353 // Equivalent to |
|
354 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); |
|
355 // but without dependency on closeOver(). |
|
356 ParsePosition pos(0); |
|
357 applyPatternIgnoreSpace(pattern, pos, NULL, status); |
|
358 if (U_FAILURE(status)) return *this; |
|
359 |
|
360 int32_t i = pos.getIndex(); |
|
361 // Skip over trailing whitespace |
|
362 ICU_Utility::skipWhitespace(pattern, i, TRUE); |
|
363 if (i != pattern.length()) { |
|
364 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
365 } |
|
366 return *this; |
|
367 } |
|
368 |
|
369 void |
|
370 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, |
|
371 ParsePosition& pos, |
|
372 const SymbolTable* symbols, |
|
373 UErrorCode& status) { |
|
374 if (U_FAILURE(status)) { |
|
375 return; |
|
376 } |
|
377 if (isFrozen()) { |
|
378 status = U_NO_WRITE_PERMISSION; |
|
379 return; |
|
380 } |
|
381 // Need to build the pattern in a temporary string because |
|
382 // _applyPattern calls add() etc., which set pat to empty. |
|
383 UnicodeString rebuiltPat; |
|
384 RuleCharacterIterator chars(pattern, symbols, pos); |
|
385 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); |
|
386 if (U_FAILURE(status)) return; |
|
387 if (chars.inVariable()) { |
|
388 // syntaxError(chars, "Extra chars in variable value"); |
|
389 status = U_MALFORMED_SET; |
|
390 return; |
|
391 } |
|
392 setPattern(rebuiltPat); |
|
393 } |
|
394 |
|
395 /** |
|
396 * Return true if the given position, in the given pattern, appears |
|
397 * to be the start of a UnicodeSet pattern. |
|
398 */ |
|
399 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { |
|
400 return ((pos+1) < pattern.length() && |
|
401 pattern.charAt(pos) == (UChar)91/*[*/) || |
|
402 resemblesPropertyPattern(pattern, pos); |
|
403 } |
|
404 |
|
405 //---------------------------------------------------------------- |
|
406 // Implementation: Pattern parsing |
|
407 //---------------------------------------------------------------- |
|
408 |
|
409 /** |
|
410 * A small all-inline class to manage a UnicodeSet pointer. Add |
|
411 * operator->() etc. as needed. |
|
412 */ |
|
413 class UnicodeSetPointer { |
|
414 UnicodeSet* p; |
|
415 public: |
|
416 inline UnicodeSetPointer() : p(0) {} |
|
417 inline ~UnicodeSetPointer() { delete p; } |
|
418 inline UnicodeSet* pointer() { return p; } |
|
419 inline UBool allocate() { |
|
420 if (p == 0) { |
|
421 p = new UnicodeSet(); |
|
422 } |
|
423 return p != 0; |
|
424 } |
|
425 }; |
|
426 |
|
427 /** |
|
428 * Parse the pattern from the given RuleCharacterIterator. The |
|
429 * iterator is advanced over the parsed pattern. |
|
430 * @param chars iterator over the pattern characters. Upon return |
|
431 * it will be advanced to the first character after the parsed |
|
432 * pattern, or the end of the iteration if all characters are |
|
433 * parsed. |
|
434 * @param symbols symbol table to use to parse and dereference |
|
435 * variables, or null if none. |
|
436 * @param rebuiltPat the pattern that was parsed, rebuilt or |
|
437 * copied from the input pattern, as appropriate. |
|
438 * @param options a bit mask of zero or more of the following: |
|
439 * IGNORE_SPACE, CASE. |
|
440 */ |
|
441 void UnicodeSet::applyPattern(RuleCharacterIterator& chars, |
|
442 const SymbolTable* symbols, |
|
443 UnicodeString& rebuiltPat, |
|
444 uint32_t options, |
|
445 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), |
|
446 UErrorCode& ec) { |
|
447 if (U_FAILURE(ec)) return; |
|
448 |
|
449 // Syntax characters: [ ] ^ - & { } |
|
450 |
|
451 // Recognized special forms for chars, sets: c-c s-s s&s |
|
452 |
|
453 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | |
|
454 RuleCharacterIterator::PARSE_ESCAPES; |
|
455 if ((options & USET_IGNORE_SPACE) != 0) { |
|
456 opts |= RuleCharacterIterator::SKIP_WHITESPACE; |
|
457 } |
|
458 |
|
459 UnicodeString patLocal, buf; |
|
460 UBool usePat = FALSE; |
|
461 UnicodeSetPointer scratch; |
|
462 RuleCharacterIterator::Pos backup; |
|
463 |
|
464 // mode: 0=before [, 1=between [...], 2=after ] |
|
465 // lastItem: 0=none, 1=char, 2=set |
|
466 int8_t lastItem = 0, mode = 0; |
|
467 UChar32 lastChar = 0; |
|
468 UChar op = 0; |
|
469 |
|
470 UBool invert = FALSE; |
|
471 |
|
472 clear(); |
|
473 |
|
474 while (mode != 2 && !chars.atEnd()) { |
|
475 U_ASSERT((lastItem == 0 && op == 0) || |
|
476 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || |
|
477 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || |
|
478 op == INTERSECTION /*'&'*/))); |
|
479 |
|
480 UChar32 c = 0; |
|
481 UBool literal = FALSE; |
|
482 UnicodeSet* nested = 0; // alias - do not delete |
|
483 |
|
484 // -------- Check for property pattern |
|
485 |
|
486 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed |
|
487 int8_t setMode = 0; |
|
488 if (resemblesPropertyPattern(chars, opts)) { |
|
489 setMode = 2; |
|
490 } |
|
491 |
|
492 // -------- Parse '[' of opening delimiter OR nested set. |
|
493 // If there is a nested set, use `setMode' to define how |
|
494 // the set should be parsed. If the '[' is part of the |
|
495 // opening delimiter for this pattern, parse special |
|
496 // strings "[", "[^", "[-", and "[^-". Check for stand-in |
|
497 // characters representing a nested set in the symbol |
|
498 // table. |
|
499 |
|
500 else { |
|
501 // Prepare to backup if necessary |
|
502 chars.getPos(backup); |
|
503 c = chars.next(opts, literal, ec); |
|
504 if (U_FAILURE(ec)) return; |
|
505 |
|
506 if (c == 0x5B /*'['*/ && !literal) { |
|
507 if (mode == 1) { |
|
508 chars.setPos(backup); // backup |
|
509 setMode = 1; |
|
510 } else { |
|
511 // Handle opening '[' delimiter |
|
512 mode = 1; |
|
513 patLocal.append((UChar) 0x5B /*'['*/); |
|
514 chars.getPos(backup); // prepare to backup |
|
515 c = chars.next(opts, literal, ec); |
|
516 if (U_FAILURE(ec)) return; |
|
517 if (c == 0x5E /*'^'*/ && !literal) { |
|
518 invert = TRUE; |
|
519 patLocal.append((UChar) 0x5E /*'^'*/); |
|
520 chars.getPos(backup); // prepare to backup |
|
521 c = chars.next(opts, literal, ec); |
|
522 if (U_FAILURE(ec)) return; |
|
523 } |
|
524 // Fall through to handle special leading '-'; |
|
525 // otherwise restart loop for nested [], \p{}, etc. |
|
526 if (c == HYPHEN /*'-'*/) { |
|
527 literal = TRUE; |
|
528 // Fall through to handle literal '-' below |
|
529 } else { |
|
530 chars.setPos(backup); // backup |
|
531 continue; |
|
532 } |
|
533 } |
|
534 } else if (symbols != 0) { |
|
535 const UnicodeFunctor *m = symbols->lookupMatcher(c); |
|
536 if (m != 0) { |
|
537 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); |
|
538 if (ms == NULL) { |
|
539 ec = U_MALFORMED_SET; |
|
540 return; |
|
541 } |
|
542 // casting away const, but `nested' won't be modified |
|
543 // (important not to modify stored set) |
|
544 nested = const_cast<UnicodeSet*>(ms); |
|
545 setMode = 3; |
|
546 } |
|
547 } |
|
548 } |
|
549 |
|
550 // -------- Handle a nested set. This either is inline in |
|
551 // the pattern or represented by a stand-in that has |
|
552 // previously been parsed and was looked up in the symbol |
|
553 // table. |
|
554 |
|
555 if (setMode != 0) { |
|
556 if (lastItem == 1) { |
|
557 if (op != 0) { |
|
558 // syntaxError(chars, "Char expected after operator"); |
|
559 ec = U_MALFORMED_SET; |
|
560 return; |
|
561 } |
|
562 add(lastChar, lastChar); |
|
563 _appendToPat(patLocal, lastChar, FALSE); |
|
564 lastItem = 0; |
|
565 op = 0; |
|
566 } |
|
567 |
|
568 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { |
|
569 patLocal.append(op); |
|
570 } |
|
571 |
|
572 if (nested == 0) { |
|
573 // lazy allocation |
|
574 if (!scratch.allocate()) { |
|
575 ec = U_MEMORY_ALLOCATION_ERROR; |
|
576 return; |
|
577 } |
|
578 nested = scratch.pointer(); |
|
579 } |
|
580 switch (setMode) { |
|
581 case 1: |
|
582 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); |
|
583 break; |
|
584 case 2: |
|
585 chars.skipIgnored(opts); |
|
586 nested->applyPropertyPattern(chars, patLocal, ec); |
|
587 if (U_FAILURE(ec)) return; |
|
588 break; |
|
589 case 3: // `nested' already parsed |
|
590 nested->_toPattern(patLocal, FALSE); |
|
591 break; |
|
592 } |
|
593 |
|
594 usePat = TRUE; |
|
595 |
|
596 if (mode == 0) { |
|
597 // Entire pattern is a category; leave parse loop |
|
598 *this = *nested; |
|
599 mode = 2; |
|
600 break; |
|
601 } |
|
602 |
|
603 switch (op) { |
|
604 case HYPHEN: /*'-'*/ |
|
605 removeAll(*nested); |
|
606 break; |
|
607 case INTERSECTION: /*'&'*/ |
|
608 retainAll(*nested); |
|
609 break; |
|
610 case 0: |
|
611 addAll(*nested); |
|
612 break; |
|
613 } |
|
614 |
|
615 op = 0; |
|
616 lastItem = 2; |
|
617 |
|
618 continue; |
|
619 } |
|
620 |
|
621 if (mode == 0) { |
|
622 // syntaxError(chars, "Missing '['"); |
|
623 ec = U_MALFORMED_SET; |
|
624 return; |
|
625 } |
|
626 |
|
627 // -------- Parse special (syntax) characters. If the |
|
628 // current character is not special, or if it is escaped, |
|
629 // then fall through and handle it below. |
|
630 |
|
631 if (!literal) { |
|
632 switch (c) { |
|
633 case 0x5D /*']'*/: |
|
634 if (lastItem == 1) { |
|
635 add(lastChar, lastChar); |
|
636 _appendToPat(patLocal, lastChar, FALSE); |
|
637 } |
|
638 // Treat final trailing '-' as a literal |
|
639 if (op == HYPHEN /*'-'*/) { |
|
640 add(op, op); |
|
641 patLocal.append(op); |
|
642 } else if (op == INTERSECTION /*'&'*/) { |
|
643 // syntaxError(chars, "Trailing '&'"); |
|
644 ec = U_MALFORMED_SET; |
|
645 return; |
|
646 } |
|
647 patLocal.append((UChar) 0x5D /*']'*/); |
|
648 mode = 2; |
|
649 continue; |
|
650 case HYPHEN /*'-'*/: |
|
651 if (op == 0) { |
|
652 if (lastItem != 0) { |
|
653 op = (UChar) c; |
|
654 continue; |
|
655 } else { |
|
656 // Treat final trailing '-' as a literal |
|
657 add(c, c); |
|
658 c = chars.next(opts, literal, ec); |
|
659 if (U_FAILURE(ec)) return; |
|
660 if (c == 0x5D /*']'*/ && !literal) { |
|
661 patLocal.append(HYPHEN_RIGHT_BRACE, 2); |
|
662 mode = 2; |
|
663 continue; |
|
664 } |
|
665 } |
|
666 } |
|
667 // syntaxError(chars, "'-' not after char or set"); |
|
668 ec = U_MALFORMED_SET; |
|
669 return; |
|
670 case INTERSECTION /*'&'*/: |
|
671 if (lastItem == 2 && op == 0) { |
|
672 op = (UChar) c; |
|
673 continue; |
|
674 } |
|
675 // syntaxError(chars, "'&' not after set"); |
|
676 ec = U_MALFORMED_SET; |
|
677 return; |
|
678 case 0x5E /*'^'*/: |
|
679 // syntaxError(chars, "'^' not after '['"); |
|
680 ec = U_MALFORMED_SET; |
|
681 return; |
|
682 case 0x7B /*'{'*/: |
|
683 if (op != 0) { |
|
684 // syntaxError(chars, "Missing operand after operator"); |
|
685 ec = U_MALFORMED_SET; |
|
686 return; |
|
687 } |
|
688 if (lastItem == 1) { |
|
689 add(lastChar, lastChar); |
|
690 _appendToPat(patLocal, lastChar, FALSE); |
|
691 } |
|
692 lastItem = 0; |
|
693 buf.truncate(0); |
|
694 { |
|
695 UBool ok = FALSE; |
|
696 while (!chars.atEnd()) { |
|
697 c = chars.next(opts, literal, ec); |
|
698 if (U_FAILURE(ec)) return; |
|
699 if (c == 0x7D /*'}'*/ && !literal) { |
|
700 ok = TRUE; |
|
701 break; |
|
702 } |
|
703 buf.append(c); |
|
704 } |
|
705 if (buf.length() < 1 || !ok) { |
|
706 // syntaxError(chars, "Invalid multicharacter string"); |
|
707 ec = U_MALFORMED_SET; |
|
708 return; |
|
709 } |
|
710 } |
|
711 // We have new string. Add it to set and continue; |
|
712 // we don't need to drop through to the further |
|
713 // processing |
|
714 add(buf); |
|
715 patLocal.append((UChar) 0x7B /*'{'*/); |
|
716 _appendToPat(patLocal, buf, FALSE); |
|
717 patLocal.append((UChar) 0x7D /*'}'*/); |
|
718 continue; |
|
719 case SymbolTable::SYMBOL_REF: |
|
720 // symbols nosymbols |
|
721 // [a-$] error error (ambiguous) |
|
722 // [a$] anchor anchor |
|
723 // [a-$x] var "x"* literal '$' |
|
724 // [a-$.] error literal '$' |
|
725 // *We won't get here in the case of var "x" |
|
726 { |
|
727 chars.getPos(backup); |
|
728 c = chars.next(opts, literal, ec); |
|
729 if (U_FAILURE(ec)) return; |
|
730 UBool anchor = (c == 0x5D /*']'*/ && !literal); |
|
731 if (symbols == 0 && !anchor) { |
|
732 c = SymbolTable::SYMBOL_REF; |
|
733 chars.setPos(backup); |
|
734 break; // literal '$' |
|
735 } |
|
736 if (anchor && op == 0) { |
|
737 if (lastItem == 1) { |
|
738 add(lastChar, lastChar); |
|
739 _appendToPat(patLocal, lastChar, FALSE); |
|
740 } |
|
741 add(U_ETHER); |
|
742 usePat = TRUE; |
|
743 patLocal.append((UChar) SymbolTable::SYMBOL_REF); |
|
744 patLocal.append((UChar) 0x5D /*']'*/); |
|
745 mode = 2; |
|
746 continue; |
|
747 } |
|
748 // syntaxError(chars, "Unquoted '$'"); |
|
749 ec = U_MALFORMED_SET; |
|
750 return; |
|
751 } |
|
752 default: |
|
753 break; |
|
754 } |
|
755 } |
|
756 |
|
757 // -------- Parse literal characters. This includes both |
|
758 // escaped chars ("\u4E01") and non-syntax characters |
|
759 // ("a"). |
|
760 |
|
761 switch (lastItem) { |
|
762 case 0: |
|
763 lastItem = 1; |
|
764 lastChar = c; |
|
765 break; |
|
766 case 1: |
|
767 if (op == HYPHEN /*'-'*/) { |
|
768 if (lastChar >= c) { |
|
769 // Don't allow redundant (a-a) or empty (b-a) ranges; |
|
770 // these are most likely typos. |
|
771 // syntaxError(chars, "Invalid range"); |
|
772 ec = U_MALFORMED_SET; |
|
773 return; |
|
774 } |
|
775 add(lastChar, c); |
|
776 _appendToPat(patLocal, lastChar, FALSE); |
|
777 patLocal.append(op); |
|
778 _appendToPat(patLocal, c, FALSE); |
|
779 lastItem = 0; |
|
780 op = 0; |
|
781 } else { |
|
782 add(lastChar, lastChar); |
|
783 _appendToPat(patLocal, lastChar, FALSE); |
|
784 lastChar = c; |
|
785 } |
|
786 break; |
|
787 case 2: |
|
788 if (op != 0) { |
|
789 // syntaxError(chars, "Set expected after operator"); |
|
790 ec = U_MALFORMED_SET; |
|
791 return; |
|
792 } |
|
793 lastChar = c; |
|
794 lastItem = 1; |
|
795 break; |
|
796 } |
|
797 } |
|
798 |
|
799 if (mode != 2) { |
|
800 // syntaxError(chars, "Missing ']'"); |
|
801 ec = U_MALFORMED_SET; |
|
802 return; |
|
803 } |
|
804 |
|
805 chars.skipIgnored(opts); |
|
806 |
|
807 /** |
|
808 * Handle global flags (invert, case insensitivity). If this |
|
809 * pattern should be compiled case-insensitive, then we need |
|
810 * to close over case BEFORE COMPLEMENTING. This makes |
|
811 * patterns like /[^abc]/i work. |
|
812 */ |
|
813 if ((options & USET_CASE_INSENSITIVE) != 0) { |
|
814 (this->*caseClosure)(USET_CASE_INSENSITIVE); |
|
815 } |
|
816 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { |
|
817 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); |
|
818 } |
|
819 if (invert) { |
|
820 complement(); |
|
821 } |
|
822 |
|
823 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the |
|
824 // generated pattern. |
|
825 if (usePat) { |
|
826 rebuiltPat.append(patLocal); |
|
827 } else { |
|
828 _generatePattern(rebuiltPat, FALSE); |
|
829 } |
|
830 if (isBogus() && U_SUCCESS(ec)) { |
|
831 // We likely ran out of memory. AHHH! |
|
832 ec = U_MEMORY_ALLOCATION_ERROR; |
|
833 } |
|
834 } |
|
835 |
|
836 //---------------------------------------------------------------- |
|
837 // Property set implementation |
|
838 //---------------------------------------------------------------- |
|
839 |
|
840 static UBool numericValueFilter(UChar32 ch, void* context) { |
|
841 return u_getNumericValue(ch) == *(double*)context; |
|
842 } |
|
843 |
|
844 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { |
|
845 int32_t value = *(int32_t*)context; |
|
846 return (U_GET_GC_MASK((UChar32) ch) & value) != 0; |
|
847 } |
|
848 |
|
849 static UBool versionFilter(UChar32 ch, void* context) { |
|
850 static const UVersionInfo none = { 0, 0, 0, 0 }; |
|
851 UVersionInfo v; |
|
852 u_charAge(ch, v); |
|
853 UVersionInfo* version = (UVersionInfo*)context; |
|
854 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; |
|
855 } |
|
856 |
|
857 typedef struct { |
|
858 UProperty prop; |
|
859 int32_t value; |
|
860 } IntPropertyContext; |
|
861 |
|
862 static UBool intPropertyFilter(UChar32 ch, void* context) { |
|
863 IntPropertyContext* c = (IntPropertyContext*)context; |
|
864 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; |
|
865 } |
|
866 |
|
867 static UBool scriptExtensionsFilter(UChar32 ch, void* context) { |
|
868 return uscript_hasScript(ch, *(UScriptCode*)context); |
|
869 } |
|
870 |
|
871 /** |
|
872 * Generic filter-based scanning code for UCD property UnicodeSets. |
|
873 */ |
|
874 void UnicodeSet::applyFilter(UnicodeSet::Filter filter, |
|
875 void* context, |
|
876 int32_t src, |
|
877 UErrorCode &status) { |
|
878 if (U_FAILURE(status)) return; |
|
879 |
|
880 // Logically, walk through all Unicode characters, noting the start |
|
881 // and end of each range for which filter.contain(c) is |
|
882 // true. Add each range to a set. |
|
883 // |
|
884 // To improve performance, use an inclusions set which |
|
885 // encodes information about character ranges that are known |
|
886 // to have identical properties. |
|
887 // getInclusions(src) contains exactly the first characters of |
|
888 // same-value ranges for the given properties "source". |
|
889 const UnicodeSet* inclusions = getInclusions(src, status); |
|
890 if (U_FAILURE(status)) { |
|
891 return; |
|
892 } |
|
893 |
|
894 clear(); |
|
895 |
|
896 UChar32 startHasProperty = -1; |
|
897 int32_t limitRange = inclusions->getRangeCount(); |
|
898 |
|
899 for (int j=0; j<limitRange; ++j) { |
|
900 // get current range |
|
901 UChar32 start = inclusions->getRangeStart(j); |
|
902 UChar32 end = inclusions->getRangeEnd(j); |
|
903 |
|
904 // for all the code points in the range, process |
|
905 for (UChar32 ch = start; ch <= end; ++ch) { |
|
906 // only add to this UnicodeSet on inflection points -- |
|
907 // where the hasProperty value changes to false |
|
908 if ((*filter)(ch, context)) { |
|
909 if (startHasProperty < 0) { |
|
910 startHasProperty = ch; |
|
911 } |
|
912 } else if (startHasProperty >= 0) { |
|
913 add(startHasProperty, ch-1); |
|
914 startHasProperty = -1; |
|
915 } |
|
916 } |
|
917 } |
|
918 if (startHasProperty >= 0) { |
|
919 add((UChar32)startHasProperty, (UChar32)0x10FFFF); |
|
920 } |
|
921 if (isBogus() && U_SUCCESS(status)) { |
|
922 // We likely ran out of memory. AHHH! |
|
923 status = U_MEMORY_ALLOCATION_ERROR; |
|
924 } |
|
925 } |
|
926 |
|
927 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { |
|
928 /* Note: we use ' ' in compiler code page */ |
|
929 int32_t j = 0; |
|
930 char ch; |
|
931 --dstCapacity; /* make room for term. zero */ |
|
932 while ((ch = *src++) != 0) { |
|
933 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { |
|
934 continue; |
|
935 } |
|
936 if (j >= dstCapacity) return FALSE; |
|
937 dst[j++] = ch; |
|
938 } |
|
939 if (j > 0 && dst[j-1] == ' ') --j; |
|
940 dst[j] = 0; |
|
941 return TRUE; |
|
942 } |
|
943 |
|
944 //---------------------------------------------------------------- |
|
945 // Property set API |
|
946 //---------------------------------------------------------------- |
|
947 |
|
948 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} |
|
949 |
|
950 UnicodeSet& |
|
951 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { |
|
952 if (U_FAILURE(ec) || isFrozen()) return *this; |
|
953 |
|
954 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { |
|
955 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); |
|
956 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { |
|
957 UScriptCode script = (UScriptCode)value; |
|
958 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); |
|
959 } else { |
|
960 IntPropertyContext c = {prop, value}; |
|
961 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); |
|
962 } |
|
963 return *this; |
|
964 } |
|
965 |
|
966 UnicodeSet& |
|
967 UnicodeSet::applyPropertyAlias(const UnicodeString& prop, |
|
968 const UnicodeString& value, |
|
969 UErrorCode& ec) { |
|
970 if (U_FAILURE(ec) || isFrozen()) return *this; |
|
971 |
|
972 // prop and value used to be converted to char * using the default |
|
973 // converter instead of the invariant conversion. |
|
974 // This should not be necessary because all Unicode property and value |
|
975 // names use only invariant characters. |
|
976 // If there are any variant characters, then we won't find them anyway. |
|
977 // Checking first avoids assertion failures in the conversion. |
|
978 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || |
|
979 !uprv_isInvariantUString(value.getBuffer(), value.length()) |
|
980 ) { |
|
981 FAIL(ec); |
|
982 } |
|
983 CharString pname, vname; |
|
984 pname.appendInvariantChars(prop, ec); |
|
985 vname.appendInvariantChars(value, ec); |
|
986 if (U_FAILURE(ec)) return *this; |
|
987 |
|
988 UProperty p; |
|
989 int32_t v; |
|
990 UBool mustNotBeEmpty = FALSE, invert = FALSE; |
|
991 |
|
992 if (value.length() > 0) { |
|
993 p = u_getPropertyEnum(pname.data()); |
|
994 if (p == UCHAR_INVALID_CODE) FAIL(ec); |
|
995 |
|
996 // Treat gc as gcm |
|
997 if (p == UCHAR_GENERAL_CATEGORY) { |
|
998 p = UCHAR_GENERAL_CATEGORY_MASK; |
|
999 } |
|
1000 |
|
1001 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || |
|
1002 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || |
|
1003 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { |
|
1004 v = u_getPropertyValueEnum(p, vname.data()); |
|
1005 if (v == UCHAR_INVALID_CODE) { |
|
1006 // Handle numeric CCC |
|
1007 if (p == UCHAR_CANONICAL_COMBINING_CLASS || |
|
1008 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || |
|
1009 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { |
|
1010 char* end; |
|
1011 double value = uprv_strtod(vname.data(), &end); |
|
1012 v = (int32_t) value; |
|
1013 if (v != value || v < 0 || *end != 0) { |
|
1014 // non-integral or negative value, or trailing junk |
|
1015 FAIL(ec); |
|
1016 } |
|
1017 // If the resultant set is empty then the numeric value |
|
1018 // was invalid. |
|
1019 mustNotBeEmpty = TRUE; |
|
1020 } else { |
|
1021 FAIL(ec); |
|
1022 } |
|
1023 } |
|
1024 } |
|
1025 |
|
1026 else { |
|
1027 |
|
1028 switch (p) { |
|
1029 case UCHAR_NUMERIC_VALUE: |
|
1030 { |
|
1031 char* end; |
|
1032 double value = uprv_strtod(vname.data(), &end); |
|
1033 if (*end != 0) { |
|
1034 FAIL(ec); |
|
1035 } |
|
1036 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); |
|
1037 return *this; |
|
1038 } |
|
1039 case UCHAR_NAME: |
|
1040 { |
|
1041 // Must munge name, since u_charFromName() does not do |
|
1042 // 'loose' matching. |
|
1043 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength |
|
1044 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); |
|
1045 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); |
|
1046 if (U_SUCCESS(ec)) { |
|
1047 clear(); |
|
1048 add(ch); |
|
1049 return *this; |
|
1050 } else { |
|
1051 FAIL(ec); |
|
1052 } |
|
1053 } |
|
1054 case UCHAR_UNICODE_1_NAME: |
|
1055 // ICU 49 deprecates the Unicode_1_Name property APIs. |
|
1056 FAIL(ec); |
|
1057 case UCHAR_AGE: |
|
1058 { |
|
1059 // Must munge name, since u_versionFromString() does not do |
|
1060 // 'loose' matching. |
|
1061 char buf[128]; |
|
1062 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); |
|
1063 UVersionInfo version; |
|
1064 u_versionFromString(version, buf); |
|
1065 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); |
|
1066 return *this; |
|
1067 } |
|
1068 case UCHAR_SCRIPT_EXTENSIONS: |
|
1069 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); |
|
1070 if (v == UCHAR_INVALID_CODE) { |
|
1071 FAIL(ec); |
|
1072 } |
|
1073 // fall through to calling applyIntPropertyValue() |
|
1074 break; |
|
1075 default: |
|
1076 // p is a non-binary, non-enumerated property that we |
|
1077 // don't support (yet). |
|
1078 FAIL(ec); |
|
1079 } |
|
1080 } |
|
1081 } |
|
1082 |
|
1083 else { |
|
1084 // value is empty. Interpret as General Category, Script, or |
|
1085 // Binary property. |
|
1086 p = UCHAR_GENERAL_CATEGORY_MASK; |
|
1087 v = u_getPropertyValueEnum(p, pname.data()); |
|
1088 if (v == UCHAR_INVALID_CODE) { |
|
1089 p = UCHAR_SCRIPT; |
|
1090 v = u_getPropertyValueEnum(p, pname.data()); |
|
1091 if (v == UCHAR_INVALID_CODE) { |
|
1092 p = u_getPropertyEnum(pname.data()); |
|
1093 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { |
|
1094 v = 1; |
|
1095 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { |
|
1096 set(MIN_VALUE, MAX_VALUE); |
|
1097 return *this; |
|
1098 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { |
|
1099 set(0, 0x7F); |
|
1100 return *this; |
|
1101 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { |
|
1102 // [:Assigned:]=[:^Cn:] |
|
1103 p = UCHAR_GENERAL_CATEGORY_MASK; |
|
1104 v = U_GC_CN_MASK; |
|
1105 invert = TRUE; |
|
1106 } else { |
|
1107 FAIL(ec); |
|
1108 } |
|
1109 } |
|
1110 } |
|
1111 } |
|
1112 |
|
1113 applyIntPropertyValue(p, v, ec); |
|
1114 if(invert) { |
|
1115 complement(); |
|
1116 } |
|
1117 |
|
1118 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { |
|
1119 // mustNotBeEmpty is set to true if an empty set indicates |
|
1120 // invalid input. |
|
1121 ec = U_ILLEGAL_ARGUMENT_ERROR; |
|
1122 } |
|
1123 |
|
1124 if (isBogus() && U_SUCCESS(ec)) { |
|
1125 // We likely ran out of memory. AHHH! |
|
1126 ec = U_MEMORY_ALLOCATION_ERROR; |
|
1127 } |
|
1128 return *this; |
|
1129 } |
|
1130 |
|
1131 //---------------------------------------------------------------- |
|
1132 // Property set patterns |
|
1133 //---------------------------------------------------------------- |
|
1134 |
|
1135 /** |
|
1136 * Return true if the given position, in the given pattern, appears |
|
1137 * to be the start of a property set pattern. |
|
1138 */ |
|
1139 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, |
|
1140 int32_t pos) { |
|
1141 // Patterns are at least 5 characters long |
|
1142 if ((pos+5) > pattern.length()) { |
|
1143 return FALSE; |
|
1144 } |
|
1145 |
|
1146 // Look for an opening [:, [:^, \p, or \P |
|
1147 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); |
|
1148 } |
|
1149 |
|
1150 /** |
|
1151 * Return true if the given iterator appears to point at a |
|
1152 * property pattern. Regardless of the result, return with the |
|
1153 * iterator unchanged. |
|
1154 * @param chars iterator over the pattern characters. Upon return |
|
1155 * it will be unchanged. |
|
1156 * @param iterOpts RuleCharacterIterator options |
|
1157 */ |
|
1158 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, |
|
1159 int32_t iterOpts) { |
|
1160 // NOTE: literal will always be FALSE, because we don't parse escapes. |
|
1161 UBool result = FALSE, literal; |
|
1162 UErrorCode ec = U_ZERO_ERROR; |
|
1163 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; |
|
1164 RuleCharacterIterator::Pos pos; |
|
1165 chars.getPos(pos); |
|
1166 UChar32 c = chars.next(iterOpts, literal, ec); |
|
1167 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { |
|
1168 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, |
|
1169 literal, ec); |
|
1170 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : |
|
1171 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); |
|
1172 } |
|
1173 chars.setPos(pos); |
|
1174 return result && U_SUCCESS(ec); |
|
1175 } |
|
1176 |
|
1177 /** |
|
1178 * Parse the given property pattern at the given parse position. |
|
1179 */ |
|
1180 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, |
|
1181 ParsePosition& ppos, |
|
1182 UErrorCode &ec) { |
|
1183 int32_t pos = ppos.getIndex(); |
|
1184 |
|
1185 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} |
|
1186 UBool isName = FALSE; // true for \N{pat}, o/w false |
|
1187 UBool invert = FALSE; |
|
1188 |
|
1189 if (U_FAILURE(ec)) return *this; |
|
1190 |
|
1191 // Minimum length is 5 characters, e.g. \p{L} |
|
1192 if ((pos+5) > pattern.length()) { |
|
1193 FAIL(ec); |
|
1194 } |
|
1195 |
|
1196 // On entry, ppos should point to one of the following locations: |
|
1197 // Look for an opening [:, [:^, \p, or \P |
|
1198 if (isPOSIXOpen(pattern, pos)) { |
|
1199 posix = TRUE; |
|
1200 pos += 2; |
|
1201 pos = ICU_Utility::skipWhitespace(pattern, pos); |
|
1202 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { |
|
1203 ++pos; |
|
1204 invert = TRUE; |
|
1205 } |
|
1206 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { |
|
1207 UChar c = pattern.charAt(pos+1); |
|
1208 invert = (c == UPPER_P); |
|
1209 isName = (c == UPPER_N); |
|
1210 pos += 2; |
|
1211 pos = ICU_Utility::skipWhitespace(pattern, pos); |
|
1212 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { |
|
1213 // Syntax error; "\p" or "\P" not followed by "{" |
|
1214 FAIL(ec); |
|
1215 } |
|
1216 } else { |
|
1217 // Open delimiter not seen |
|
1218 FAIL(ec); |
|
1219 } |
|
1220 |
|
1221 // Look for the matching close delimiter, either :] or } |
|
1222 int32_t close; |
|
1223 if (posix) { |
|
1224 close = pattern.indexOf(POSIX_CLOSE, 2, pos); |
|
1225 } else { |
|
1226 close = pattern.indexOf(CLOSE_BRACE, pos); |
|
1227 } |
|
1228 if (close < 0) { |
|
1229 // Syntax error; close delimiter missing |
|
1230 FAIL(ec); |
|
1231 } |
|
1232 |
|
1233 // Look for an '=' sign. If this is present, we will parse a |
|
1234 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} |
|
1235 // pattern. |
|
1236 int32_t equals = pattern.indexOf(EQUALS, pos); |
|
1237 UnicodeString propName, valueName; |
|
1238 if (equals >= 0 && equals < close && !isName) { |
|
1239 // Equals seen; parse medium/long pattern |
|
1240 pattern.extractBetween(pos, equals, propName); |
|
1241 pattern.extractBetween(equals+1, close, valueName); |
|
1242 } |
|
1243 |
|
1244 else { |
|
1245 // Handle case where no '=' is seen, and \N{} |
|
1246 pattern.extractBetween(pos, close, propName); |
|
1247 |
|
1248 // Handle \N{name} |
|
1249 if (isName) { |
|
1250 // This is a little inefficient since it means we have to |
|
1251 // parse NAME_PROP back to UCHAR_NAME even though we already |
|
1252 // know it's UCHAR_NAME. If we refactor the API to |
|
1253 // support args of (UProperty, char*) then we can remove |
|
1254 // NAME_PROP and make this a little more efficient. |
|
1255 valueName = propName; |
|
1256 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); |
|
1257 } |
|
1258 } |
|
1259 |
|
1260 applyPropertyAlias(propName, valueName, ec); |
|
1261 |
|
1262 if (U_SUCCESS(ec)) { |
|
1263 if (invert) { |
|
1264 complement(); |
|
1265 } |
|
1266 |
|
1267 // Move to the limit position after the close delimiter if the |
|
1268 // parse succeeded. |
|
1269 ppos.setIndex(close + (posix ? 2 : 1)); |
|
1270 } |
|
1271 |
|
1272 return *this; |
|
1273 } |
|
1274 |
|
1275 /** |
|
1276 * Parse a property pattern. |
|
1277 * @param chars iterator over the pattern characters. Upon return |
|
1278 * it will be advanced to the first character after the parsed |
|
1279 * pattern, or the end of the iteration if all characters are |
|
1280 * parsed. |
|
1281 * @param rebuiltPat the pattern that was parsed, rebuilt or |
|
1282 * copied from the input pattern, as appropriate. |
|
1283 */ |
|
1284 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, |
|
1285 UnicodeString& rebuiltPat, |
|
1286 UErrorCode& ec) { |
|
1287 if (U_FAILURE(ec)) return; |
|
1288 UnicodeString pattern; |
|
1289 chars.lookahead(pattern); |
|
1290 ParsePosition pos(0); |
|
1291 applyPropertyPattern(pattern, pos, ec); |
|
1292 if (U_FAILURE(ec)) return; |
|
1293 if (pos.getIndex() == 0) { |
|
1294 // syntaxError(chars, "Invalid property pattern"); |
|
1295 ec = U_MALFORMED_SET; |
|
1296 return; |
|
1297 } |
|
1298 chars.jumpahead(pos.getIndex()); |
|
1299 rebuiltPat.append(pattern, 0, pos.getIndex()); |
|
1300 } |
|
1301 |
|
1302 U_NAMESPACE_END |