|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: uniset_closure.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2011may30 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp |
|
17 * to simplify dependencies. |
|
18 * In particular, this depends on the BreakIterator, but the BreakIterator |
|
19 * code also builds UnicodeSets from patterns and needs uniset_props. |
|
20 */ |
|
21 |
|
22 #include "unicode/brkiter.h" |
|
23 #include "unicode/locid.h" |
|
24 #include "unicode/parsepos.h" |
|
25 #include "unicode/uniset.h" |
|
26 #include "cmemory.h" |
|
27 #include "ruleiter.h" |
|
28 #include "ucase.h" |
|
29 #include "util.h" |
|
30 #include "uvector.h" |
|
31 |
|
32 // initial storage. Must be >= 0 |
|
33 // *** same as in uniset.cpp ! *** |
|
34 #define START_EXTRA 16 |
|
35 |
|
36 U_NAMESPACE_BEGIN |
|
37 |
|
38 // TODO memory debugging provided inside uniset.cpp |
|
39 // could be made available here but probably obsolete with use of modern |
|
40 // memory leak checker tools |
|
41 #define _dbgct(me) |
|
42 |
|
43 //---------------------------------------------------------------- |
|
44 // Constructors &c |
|
45 //---------------------------------------------------------------- |
|
46 |
|
47 UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
|
48 uint32_t options, |
|
49 const SymbolTable* symbols, |
|
50 UErrorCode& status) : |
|
51 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
|
52 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
|
53 fFlags(0) |
|
54 { |
|
55 if(U_SUCCESS(status)){ |
|
56 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
|
57 /* test for NULL */ |
|
58 if(list == NULL) { |
|
59 status = U_MEMORY_ALLOCATION_ERROR; |
|
60 }else{ |
|
61 allocateStrings(status); |
|
62 applyPattern(pattern, options, symbols, status); |
|
63 } |
|
64 } |
|
65 _dbgct(this); |
|
66 } |
|
67 |
|
68 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, |
|
69 uint32_t options, |
|
70 const SymbolTable* symbols, |
|
71 UErrorCode& status) : |
|
72 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
|
73 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
|
74 fFlags(0) |
|
75 { |
|
76 if(U_SUCCESS(status)){ |
|
77 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
|
78 /* test for NULL */ |
|
79 if(list == NULL) { |
|
80 status = U_MEMORY_ALLOCATION_ERROR; |
|
81 }else{ |
|
82 allocateStrings(status); |
|
83 applyPattern(pattern, pos, options, symbols, status); |
|
84 } |
|
85 } |
|
86 _dbgct(this); |
|
87 } |
|
88 |
|
89 //---------------------------------------------------------------- |
|
90 // Public API |
|
91 //---------------------------------------------------------------- |
|
92 |
|
93 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
|
94 uint32_t options, |
|
95 const SymbolTable* symbols, |
|
96 UErrorCode& status) { |
|
97 ParsePosition pos(0); |
|
98 applyPattern(pattern, pos, options, symbols, status); |
|
99 if (U_FAILURE(status)) return *this; |
|
100 |
|
101 int32_t i = pos.getIndex(); |
|
102 |
|
103 if (options & USET_IGNORE_SPACE) { |
|
104 // Skip over trailing whitespace |
|
105 ICU_Utility::skipWhitespace(pattern, i, TRUE); |
|
106 } |
|
107 |
|
108 if (i != pattern.length()) { |
|
109 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
110 } |
|
111 return *this; |
|
112 } |
|
113 |
|
114 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
|
115 ParsePosition& pos, |
|
116 uint32_t options, |
|
117 const SymbolTable* symbols, |
|
118 UErrorCode& status) { |
|
119 if (U_FAILURE(status)) { |
|
120 return *this; |
|
121 } |
|
122 if (isFrozen()) { |
|
123 status = U_NO_WRITE_PERMISSION; |
|
124 return *this; |
|
125 } |
|
126 // Need to build the pattern in a temporary string because |
|
127 // _applyPattern calls add() etc., which set pat to empty. |
|
128 UnicodeString rebuiltPat; |
|
129 RuleCharacterIterator chars(pattern, symbols, pos); |
|
130 applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); |
|
131 if (U_FAILURE(status)) return *this; |
|
132 if (chars.inVariable()) { |
|
133 // syntaxError(chars, "Extra chars in variable value"); |
|
134 status = U_MALFORMED_SET; |
|
135 return *this; |
|
136 } |
|
137 setPattern(rebuiltPat); |
|
138 return *this; |
|
139 } |
|
140 |
|
141 // USetAdder implementation |
|
142 // Does not use uset.h to reduce code dependencies |
|
143 static void U_CALLCONV |
|
144 _set_add(USet *set, UChar32 c) { |
|
145 ((UnicodeSet *)set)->add(c); |
|
146 } |
|
147 |
|
148 static void U_CALLCONV |
|
149 _set_addRange(USet *set, UChar32 start, UChar32 end) { |
|
150 ((UnicodeSet *)set)->add(start, end); |
|
151 } |
|
152 |
|
153 static void U_CALLCONV |
|
154 _set_addString(USet *set, const UChar *str, int32_t length) { |
|
155 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); |
|
156 } |
|
157 |
|
158 //---------------------------------------------------------------- |
|
159 // Case folding API |
|
160 //---------------------------------------------------------------- |
|
161 |
|
162 // add the result of a full case mapping to the set |
|
163 // use str as a temporary string to avoid constructing one |
|
164 static inline void |
|
165 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { |
|
166 if(result >= 0) { |
|
167 if(result > UCASE_MAX_STRING_LENGTH) { |
|
168 // add a single-code point case mapping |
|
169 set.add(result); |
|
170 } else { |
|
171 // add a string case mapping from full with length result |
|
172 str.setTo((UBool)FALSE, full, result); |
|
173 set.add(str); |
|
174 } |
|
175 } |
|
176 // result < 0: the code point mapped to itself, no need to add it |
|
177 // see ucase.h |
|
178 } |
|
179 |
|
180 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { |
|
181 if (isFrozen() || isBogus()) { |
|
182 return *this; |
|
183 } |
|
184 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { |
|
185 const UCaseProps *csp = ucase_getSingleton(); |
|
186 { |
|
187 UnicodeSet foldSet(*this); |
|
188 UnicodeString str; |
|
189 USetAdder sa = { |
|
190 foldSet.toUSet(), |
|
191 _set_add, |
|
192 _set_addRange, |
|
193 _set_addString, |
|
194 NULL, // don't need remove() |
|
195 NULL // don't need removeRange() |
|
196 }; |
|
197 |
|
198 // start with input set to guarantee inclusion |
|
199 // USET_CASE: remove strings because the strings will actually be reduced (folded); |
|
200 // therefore, start with no strings and add only those needed |
|
201 if (attribute & USET_CASE_INSENSITIVE) { |
|
202 foldSet.strings->removeAllElements(); |
|
203 } |
|
204 |
|
205 int32_t n = getRangeCount(); |
|
206 UChar32 result; |
|
207 const UChar *full; |
|
208 int32_t locCache = 0; |
|
209 |
|
210 for (int32_t i=0; i<n; ++i) { |
|
211 UChar32 start = getRangeStart(i); |
|
212 UChar32 end = getRangeEnd(i); |
|
213 |
|
214 if (attribute & USET_CASE_INSENSITIVE) { |
|
215 // full case closure |
|
216 for (UChar32 cp=start; cp<=end; ++cp) { |
|
217 ucase_addCaseClosure(csp, cp, &sa); |
|
218 } |
|
219 } else { |
|
220 // add case mappings |
|
221 // (does not add long s for regular s, or Kelvin for k, for example) |
|
222 for (UChar32 cp=start; cp<=end; ++cp) { |
|
223 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); |
|
224 addCaseMapping(foldSet, result, full, str); |
|
225 |
|
226 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); |
|
227 addCaseMapping(foldSet, result, full, str); |
|
228 |
|
229 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); |
|
230 addCaseMapping(foldSet, result, full, str); |
|
231 |
|
232 result = ucase_toFullFolding(csp, cp, &full, 0); |
|
233 addCaseMapping(foldSet, result, full, str); |
|
234 } |
|
235 } |
|
236 } |
|
237 if (strings != NULL && strings->size() > 0) { |
|
238 if (attribute & USET_CASE_INSENSITIVE) { |
|
239 for (int32_t j=0; j<strings->size(); ++j) { |
|
240 str = *(const UnicodeString *) strings->elementAt(j); |
|
241 str.foldCase(); |
|
242 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { |
|
243 foldSet.add(str); // does not map to code points: add the folded string itself |
|
244 } |
|
245 } |
|
246 } else { |
|
247 Locale root(""); |
|
248 #if !UCONFIG_NO_BREAK_ITERATION |
|
249 UErrorCode status = U_ZERO_ERROR; |
|
250 BreakIterator *bi = BreakIterator::createWordInstance(root, status); |
|
251 if (U_SUCCESS(status)) { |
|
252 #endif |
|
253 const UnicodeString *pStr; |
|
254 |
|
255 for (int32_t j=0; j<strings->size(); ++j) { |
|
256 pStr = (const UnicodeString *) strings->elementAt(j); |
|
257 (str = *pStr).toLower(root); |
|
258 foldSet.add(str); |
|
259 #if !UCONFIG_NO_BREAK_ITERATION |
|
260 (str = *pStr).toTitle(bi, root); |
|
261 foldSet.add(str); |
|
262 #endif |
|
263 (str = *pStr).toUpper(root); |
|
264 foldSet.add(str); |
|
265 (str = *pStr).foldCase(); |
|
266 foldSet.add(str); |
|
267 } |
|
268 #if !UCONFIG_NO_BREAK_ITERATION |
|
269 } |
|
270 delete bi; |
|
271 #endif |
|
272 } |
|
273 } |
|
274 *this = foldSet; |
|
275 } |
|
276 } |
|
277 return *this; |
|
278 } |
|
279 |
|
280 U_NAMESPACE_END |