|
1 /* |
|
2 ************************************************************************* |
|
3 * COPYRIGHT: |
|
4 * Copyright (c) 1996-2012, International Business Machines Corporation and |
|
5 * others. All Rights Reserved. |
|
6 ************************************************************************* |
|
7 */ |
|
8 |
|
9 #include "unicode/utypes.h" |
|
10 |
|
11 #if !UCONFIG_NO_NORMALIZATION |
|
12 |
|
13 #include "unicode/uniset.h" |
|
14 #include "unicode/unistr.h" |
|
15 #include "unicode/chariter.h" |
|
16 #include "unicode/schriter.h" |
|
17 #include "unicode/uchriter.h" |
|
18 #include "unicode/normlzr.h" |
|
19 #include "unicode/utf16.h" |
|
20 #include "cmemory.h" |
|
21 #include "normalizer2impl.h" |
|
22 #include "uprops.h" // for uniset_getUnicode32Instance() |
|
23 |
|
24 U_NAMESPACE_BEGIN |
|
25 |
|
26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
|
27 |
|
28 //------------------------------------------------------------------------- |
|
29 // Constructors and other boilerplate |
|
30 //------------------------------------------------------------------------- |
|
31 |
|
32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : |
|
33 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
|
34 text(new StringCharacterIterator(str)), |
|
35 currentIndex(0), nextIndex(0), |
|
36 buffer(), bufferPos(0) |
|
37 { |
|
38 init(); |
|
39 } |
|
40 |
|
41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : |
|
42 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
|
43 text(new UCharCharacterIterator(str, length)), |
|
44 currentIndex(0), nextIndex(0), |
|
45 buffer(), bufferPos(0) |
|
46 { |
|
47 init(); |
|
48 } |
|
49 |
|
50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : |
|
51 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
|
52 text(iter.clone()), |
|
53 currentIndex(0), nextIndex(0), |
|
54 buffer(), bufferPos(0) |
|
55 { |
|
56 init(); |
|
57 } |
|
58 |
|
59 Normalizer::Normalizer(const Normalizer ©) : |
|
60 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), |
|
61 text(copy.text->clone()), |
|
62 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
|
63 buffer(copy.buffer), bufferPos(copy.bufferPos) |
|
64 { |
|
65 init(); |
|
66 } |
|
67 |
|
68 void |
|
69 Normalizer::init() { |
|
70 UErrorCode errorCode=U_ZERO_ERROR; |
|
71 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
|
72 if(fOptions&UNORM_UNICODE_3_2) { |
|
73 delete fFilteredNorm2; |
|
74 fNorm2=fFilteredNorm2= |
|
75 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); |
|
76 } |
|
77 if(U_FAILURE(errorCode)) { |
|
78 errorCode=U_ZERO_ERROR; |
|
79 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); |
|
80 } |
|
81 } |
|
82 |
|
83 Normalizer::~Normalizer() |
|
84 { |
|
85 delete fFilteredNorm2; |
|
86 delete text; |
|
87 } |
|
88 |
|
89 Normalizer* |
|
90 Normalizer::clone() const |
|
91 { |
|
92 return new Normalizer(*this); |
|
93 } |
|
94 |
|
95 /** |
|
96 * Generates a hash code for this iterator. |
|
97 */ |
|
98 int32_t Normalizer::hashCode() const |
|
99 { |
|
100 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; |
|
101 } |
|
102 |
|
103 UBool Normalizer::operator==(const Normalizer& that) const |
|
104 { |
|
105 return |
|
106 this==&that || |
|
107 (fUMode==that.fUMode && |
|
108 fOptions==that.fOptions && |
|
109 *text==*that.text && |
|
110 buffer==that.buffer && |
|
111 bufferPos==that.bufferPos && |
|
112 nextIndex==that.nextIndex); |
|
113 } |
|
114 |
|
115 //------------------------------------------------------------------------- |
|
116 // Static utility methods |
|
117 //------------------------------------------------------------------------- |
|
118 |
|
119 void U_EXPORT2 |
|
120 Normalizer::normalize(const UnicodeString& source, |
|
121 UNormalizationMode mode, int32_t options, |
|
122 UnicodeString& result, |
|
123 UErrorCode &status) { |
|
124 if(source.isBogus() || U_FAILURE(status)) { |
|
125 result.setToBogus(); |
|
126 if(U_SUCCESS(status)) { |
|
127 status=U_ILLEGAL_ARGUMENT_ERROR; |
|
128 } |
|
129 } else { |
|
130 UnicodeString localDest; |
|
131 UnicodeString *dest; |
|
132 |
|
133 if(&source!=&result) { |
|
134 dest=&result; |
|
135 } else { |
|
136 // the source and result strings are the same object, use a temporary one |
|
137 dest=&localDest; |
|
138 } |
|
139 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
|
140 if(U_SUCCESS(status)) { |
|
141 if(options&UNORM_UNICODE_3_2) { |
|
142 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
|
143 normalize(source, *dest, status); |
|
144 } else { |
|
145 n2->normalize(source, *dest, status); |
|
146 } |
|
147 } |
|
148 if(dest==&localDest && U_SUCCESS(status)) { |
|
149 result=*dest; |
|
150 } |
|
151 } |
|
152 } |
|
153 |
|
154 void U_EXPORT2 |
|
155 Normalizer::compose(const UnicodeString& source, |
|
156 UBool compat, int32_t options, |
|
157 UnicodeString& result, |
|
158 UErrorCode &status) { |
|
159 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); |
|
160 } |
|
161 |
|
162 void U_EXPORT2 |
|
163 Normalizer::decompose(const UnicodeString& source, |
|
164 UBool compat, int32_t options, |
|
165 UnicodeString& result, |
|
166 UErrorCode &status) { |
|
167 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); |
|
168 } |
|
169 |
|
170 UNormalizationCheckResult |
|
171 Normalizer::quickCheck(const UnicodeString& source, |
|
172 UNormalizationMode mode, int32_t options, |
|
173 UErrorCode &status) { |
|
174 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
|
175 if(U_SUCCESS(status)) { |
|
176 if(options&UNORM_UNICODE_3_2) { |
|
177 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
|
178 quickCheck(source, status); |
|
179 } else { |
|
180 return n2->quickCheck(source, status); |
|
181 } |
|
182 } else { |
|
183 return UNORM_MAYBE; |
|
184 } |
|
185 } |
|
186 |
|
187 UBool |
|
188 Normalizer::isNormalized(const UnicodeString& source, |
|
189 UNormalizationMode mode, int32_t options, |
|
190 UErrorCode &status) { |
|
191 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
|
192 if(U_SUCCESS(status)) { |
|
193 if(options&UNORM_UNICODE_3_2) { |
|
194 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
|
195 isNormalized(source, status); |
|
196 } else { |
|
197 return n2->isNormalized(source, status); |
|
198 } |
|
199 } else { |
|
200 return FALSE; |
|
201 } |
|
202 } |
|
203 |
|
204 UnicodeString & U_EXPORT2 |
|
205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, |
|
206 UnicodeString &result, |
|
207 UNormalizationMode mode, int32_t options, |
|
208 UErrorCode &errorCode) { |
|
209 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { |
|
210 result.setToBogus(); |
|
211 if(U_SUCCESS(errorCode)) { |
|
212 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
213 } |
|
214 } else { |
|
215 UnicodeString localDest; |
|
216 UnicodeString *dest; |
|
217 |
|
218 if(&right!=&result) { |
|
219 dest=&result; |
|
220 } else { |
|
221 // the right and result strings are the same object, use a temporary one |
|
222 dest=&localDest; |
|
223 } |
|
224 *dest=left; |
|
225 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); |
|
226 if(U_SUCCESS(errorCode)) { |
|
227 if(options&UNORM_UNICODE_3_2) { |
|
228 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). |
|
229 append(*dest, right, errorCode); |
|
230 } else { |
|
231 n2->append(*dest, right, errorCode); |
|
232 } |
|
233 } |
|
234 if(dest==&localDest && U_SUCCESS(errorCode)) { |
|
235 result=*dest; |
|
236 } |
|
237 } |
|
238 return result; |
|
239 } |
|
240 |
|
241 //------------------------------------------------------------------------- |
|
242 // Iteration API |
|
243 //------------------------------------------------------------------------- |
|
244 |
|
245 /** |
|
246 * Return the current character in the normalized text. |
|
247 */ |
|
248 UChar32 Normalizer::current() { |
|
249 if(bufferPos<buffer.length() || nextNormalize()) { |
|
250 return buffer.char32At(bufferPos); |
|
251 } else { |
|
252 return DONE; |
|
253 } |
|
254 } |
|
255 |
|
256 /** |
|
257 * Return the next character in the normalized text and advance |
|
258 * the iteration position by one. If the end |
|
259 * of the text has already been reached, {@link #DONE} is returned. |
|
260 */ |
|
261 UChar32 Normalizer::next() { |
|
262 if(bufferPos<buffer.length() || nextNormalize()) { |
|
263 UChar32 c=buffer.char32At(bufferPos); |
|
264 bufferPos+=U16_LENGTH(c); |
|
265 return c; |
|
266 } else { |
|
267 return DONE; |
|
268 } |
|
269 } |
|
270 |
|
271 /** |
|
272 * Return the previous character in the normalized text and decrement |
|
273 * the iteration position by one. If the beginning |
|
274 * of the text has already been reached, {@link #DONE} is returned. |
|
275 */ |
|
276 UChar32 Normalizer::previous() { |
|
277 if(bufferPos>0 || previousNormalize()) { |
|
278 UChar32 c=buffer.char32At(bufferPos-1); |
|
279 bufferPos-=U16_LENGTH(c); |
|
280 return c; |
|
281 } else { |
|
282 return DONE; |
|
283 } |
|
284 } |
|
285 |
|
286 void Normalizer::reset() { |
|
287 currentIndex=nextIndex=text->setToStart(); |
|
288 clearBuffer(); |
|
289 } |
|
290 |
|
291 void |
|
292 Normalizer::setIndexOnly(int32_t index) { |
|
293 text->setIndex(index); // pins index |
|
294 currentIndex=nextIndex=text->getIndex(); |
|
295 clearBuffer(); |
|
296 } |
|
297 |
|
298 /** |
|
299 * Return the first character in the normalized text. This resets |
|
300 * the <tt>Normalizer's</tt> position to the beginning of the text. |
|
301 */ |
|
302 UChar32 Normalizer::first() { |
|
303 reset(); |
|
304 return next(); |
|
305 } |
|
306 |
|
307 /** |
|
308 * Return the last character in the normalized text. This resets |
|
309 * the <tt>Normalizer's</tt> position to be just before the |
|
310 * the input text corresponding to that normalized character. |
|
311 */ |
|
312 UChar32 Normalizer::last() { |
|
313 currentIndex=nextIndex=text->setToEnd(); |
|
314 clearBuffer(); |
|
315 return previous(); |
|
316 } |
|
317 |
|
318 /** |
|
319 * Retrieve the current iteration position in the input text that is |
|
320 * being normalized. This method is useful in applications such as |
|
321 * searching, where you need to be able to determine the position in |
|
322 * the input text that corresponds to a given normalized output character. |
|
323 * <p> |
|
324 * <b>Note:</b> This method sets the position in the <em>input</em>, while |
|
325 * {@link #next} and {@link #previous} iterate through characters in the |
|
326 * <em>output</em>. This means that there is not necessarily a one-to-one |
|
327 * correspondence between characters returned by <tt>next</tt> and |
|
328 * <tt>previous</tt> and the indices passed to and returned from |
|
329 * <tt>setIndex</tt> and {@link #getIndex}. |
|
330 * |
|
331 */ |
|
332 int32_t Normalizer::getIndex() const { |
|
333 if(bufferPos<buffer.length()) { |
|
334 return currentIndex; |
|
335 } else { |
|
336 return nextIndex; |
|
337 } |
|
338 } |
|
339 |
|
340 /** |
|
341 * Retrieve the index of the start of the input text. This is the begin index |
|
342 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> |
|
343 * over which this <tt>Normalizer</tt> is iterating |
|
344 */ |
|
345 int32_t Normalizer::startIndex() const { |
|
346 return text->startIndex(); |
|
347 } |
|
348 |
|
349 /** |
|
350 * Retrieve the index of the end of the input text. This is the end index |
|
351 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
|
352 * over which this <tt>Normalizer</tt> is iterating |
|
353 */ |
|
354 int32_t Normalizer::endIndex() const { |
|
355 return text->endIndex(); |
|
356 } |
|
357 |
|
358 //------------------------------------------------------------------------- |
|
359 // Property access methods |
|
360 //------------------------------------------------------------------------- |
|
361 |
|
362 void |
|
363 Normalizer::setMode(UNormalizationMode newMode) |
|
364 { |
|
365 fUMode = newMode; |
|
366 init(); |
|
367 } |
|
368 |
|
369 UNormalizationMode |
|
370 Normalizer::getUMode() const |
|
371 { |
|
372 return fUMode; |
|
373 } |
|
374 |
|
375 void |
|
376 Normalizer::setOption(int32_t option, |
|
377 UBool value) |
|
378 { |
|
379 if (value) { |
|
380 fOptions |= option; |
|
381 } else { |
|
382 fOptions &= (~option); |
|
383 } |
|
384 init(); |
|
385 } |
|
386 |
|
387 UBool |
|
388 Normalizer::getOption(int32_t option) const |
|
389 { |
|
390 return (fOptions & option) != 0; |
|
391 } |
|
392 |
|
393 /** |
|
394 * Set the input text over which this <tt>Normalizer</tt> will iterate. |
|
395 * The iteration position is set to the beginning of the input text. |
|
396 */ |
|
397 void |
|
398 Normalizer::setText(const UnicodeString& newText, |
|
399 UErrorCode &status) |
|
400 { |
|
401 if (U_FAILURE(status)) { |
|
402 return; |
|
403 } |
|
404 CharacterIterator *newIter = new StringCharacterIterator(newText); |
|
405 if (newIter == NULL) { |
|
406 status = U_MEMORY_ALLOCATION_ERROR; |
|
407 return; |
|
408 } |
|
409 delete text; |
|
410 text = newIter; |
|
411 reset(); |
|
412 } |
|
413 |
|
414 /** |
|
415 * Set the input text over which this <tt>Normalizer</tt> will iterate. |
|
416 * The iteration position is set to the beginning of the string. |
|
417 */ |
|
418 void |
|
419 Normalizer::setText(const CharacterIterator& newText, |
|
420 UErrorCode &status) |
|
421 { |
|
422 if (U_FAILURE(status)) { |
|
423 return; |
|
424 } |
|
425 CharacterIterator *newIter = newText.clone(); |
|
426 if (newIter == NULL) { |
|
427 status = U_MEMORY_ALLOCATION_ERROR; |
|
428 return; |
|
429 } |
|
430 delete text; |
|
431 text = newIter; |
|
432 reset(); |
|
433 } |
|
434 |
|
435 void |
|
436 Normalizer::setText(const UChar* newText, |
|
437 int32_t length, |
|
438 UErrorCode &status) |
|
439 { |
|
440 if (U_FAILURE(status)) { |
|
441 return; |
|
442 } |
|
443 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); |
|
444 if (newIter == NULL) { |
|
445 status = U_MEMORY_ALLOCATION_ERROR; |
|
446 return; |
|
447 } |
|
448 delete text; |
|
449 text = newIter; |
|
450 reset(); |
|
451 } |
|
452 |
|
453 /** |
|
454 * Copies the text under iteration into the UnicodeString referred to by "result". |
|
455 * @param result Receives a copy of the text under iteration. |
|
456 */ |
|
457 void |
|
458 Normalizer::getText(UnicodeString& result) |
|
459 { |
|
460 text->getText(result); |
|
461 } |
|
462 |
|
463 //------------------------------------------------------------------------- |
|
464 // Private utility methods |
|
465 //------------------------------------------------------------------------- |
|
466 |
|
467 void Normalizer::clearBuffer() { |
|
468 buffer.remove(); |
|
469 bufferPos=0; |
|
470 } |
|
471 |
|
472 UBool |
|
473 Normalizer::nextNormalize() { |
|
474 clearBuffer(); |
|
475 currentIndex=nextIndex; |
|
476 text->setIndex(nextIndex); |
|
477 if(!text->hasNext()) { |
|
478 return FALSE; |
|
479 } |
|
480 // Skip at least one character so we make progress. |
|
481 UnicodeString segment(text->next32PostInc()); |
|
482 while(text->hasNext()) { |
|
483 UChar32 c; |
|
484 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { |
|
485 text->move32(-1, CharacterIterator::kCurrent); |
|
486 break; |
|
487 } |
|
488 segment.append(c); |
|
489 } |
|
490 nextIndex=text->getIndex(); |
|
491 UErrorCode errorCode=U_ZERO_ERROR; |
|
492 fNorm2->normalize(segment, buffer, errorCode); |
|
493 return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
|
494 } |
|
495 |
|
496 UBool |
|
497 Normalizer::previousNormalize() { |
|
498 clearBuffer(); |
|
499 nextIndex=currentIndex; |
|
500 text->setIndex(currentIndex); |
|
501 if(!text->hasPrevious()) { |
|
502 return FALSE; |
|
503 } |
|
504 UnicodeString segment; |
|
505 while(text->hasPrevious()) { |
|
506 UChar32 c=text->previous32(); |
|
507 segment.insert(0, c); |
|
508 if(fNorm2->hasBoundaryBefore(c)) { |
|
509 break; |
|
510 } |
|
511 } |
|
512 currentIndex=text->getIndex(); |
|
513 UErrorCode errorCode=U_ZERO_ERROR; |
|
514 fNorm2->normalize(segment, buffer, errorCode); |
|
515 bufferPos=buffer.length(); |
|
516 return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
|
517 } |
|
518 |
|
519 U_NAMESPACE_END |
|
520 |
|
521 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |