|
/*
*******************************************************************************
*
*   Copyright (C) 2009-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/
|
16 |
|
17 #ifndef __NORMALIZER2_H__ |
|
18 #define __NORMALIZER2_H__ |
|
19 |
|
/**
 * \file
 * \brief C++ API: New API for Unicode Normalization.
 */
|
24 |
|
25 #include "unicode/utypes.h" |
|
26 |
|
27 #if !UCONFIG_NO_NORMALIZATION |
|
28 |
|
29 #include "unicode/uniset.h" |
|
30 #include "unicode/unistr.h" |
|
31 #include "unicode/unorm2.h" |
|
32 |
|
33 U_NAMESPACE_BEGIN |
|
34 |
|
35 /** |
|
36 * Unicode normalization functionality for standard Unicode normalization or |
|
37 * for using custom mapping tables. |
|
38 * All instances of this class are unmodifiable/immutable. |
|
39 * Instances returned by getInstance() are singletons that must not be deleted by the caller. |
|
40 * The Normalizer2 class is not intended for public subclassing. |
|
41 * |
|
42 * The primary functions are to produce a normalized string and to detect whether |
|
43 * a string is already normalized. |
|
44 * The most commonly used normalization forms are those defined in |
|
45 * http://www.unicode.org/unicode/reports/tr15/ |
|
46 * However, this API supports additional normalization forms for specialized purposes. |
|
47 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) |
|
48 * and can be used in implementations of UTS #46. |
|
49 * |
|
50 * Not only are the standard compose and decompose modes supplied, |
|
51 * but additional modes are provided as documented in the Mode enum. |
|
52 * |
|
53 * Some of the functions in this class identify normalization boundaries. |
|
54 * At a normalization boundary, the portions of the string |
|
55 * before it and starting from it do not interact and can be handled independently. |
|
56 * |
|
57 * The spanQuickCheckYes() stops at a normalization boundary. |
|
58 * When the goal is a normalized string, then the text before the boundary |
|
59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). |
|
60 * |
|
61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether |
|
62 * a character is guaranteed to be at a normalization boundary, |
|
63 * regardless of context. |
|
64 * This is used for moving from one normalization boundary to the next |
|
65 * or preceding boundary, and for performing iterative normalization. |
|
66 * |
|
67 * Iterative normalization is useful when only a small portion of a |
|
68 * longer string needs to be processed. |
|
69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator |
|
70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() |
|
71 * (to process only the substring for which sort key bytes are computed). |
|
72 * |
|
73 * The set of normalization boundaries returned by these functions may not be |
|
74 * complete: There may be more boundaries that could be returned. |
|
75 * Different functions may return different boundaries. |
|
76 * @stable ICU 4.4 |
|
77 */ |
|
78 class U_COMMON_API Normalizer2 : public UObject { |
|
79 public: |
|
80 /** |
|
81 * Destructor. |
|
82 * @stable ICU 4.4 |
|
83 */ |
|
84 ~Normalizer2(); |
|
85 |
|
86 /** |
|
87 * Returns a Normalizer2 instance for Unicode NFC normalization. |
|
88 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). |
|
89 * Returns an unmodifiable singleton instance. Do not delete it. |
|
90 * @param errorCode Standard ICU error code. Its input value must |
|
91 * pass the U_SUCCESS() test, or else the function returns |
|
92 * immediately. Check for U_FAILURE() on output or use with |
|
93 * function chaining. (See User Guide for details.) |
|
94 * @return the requested Normalizer2, if successful |
|
95 * @stable ICU 49 |
|
96 */ |
|
97 static const Normalizer2 * |
|
98 getNFCInstance(UErrorCode &errorCode); |
|
99 |
|
100 /** |
|
101 * Returns a Normalizer2 instance for Unicode NFD normalization. |
|
102 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). |
|
103 * Returns an unmodifiable singleton instance. Do not delete it. |
|
104 * @param errorCode Standard ICU error code. Its input value must |
|
105 * pass the U_SUCCESS() test, or else the function returns |
|
106 * immediately. Check for U_FAILURE() on output or use with |
|
107 * function chaining. (See User Guide for details.) |
|
108 * @return the requested Normalizer2, if successful |
|
109 * @stable ICU 49 |
|
110 */ |
|
111 static const Normalizer2 * |
|
112 getNFDInstance(UErrorCode &errorCode); |
|
113 |
|
114 /** |
|
115 * Returns a Normalizer2 instance for Unicode NFKC normalization. |
|
116 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). |
|
117 * Returns an unmodifiable singleton instance. Do not delete it. |
|
118 * @param errorCode Standard ICU error code. Its input value must |
|
119 * pass the U_SUCCESS() test, or else the function returns |
|
120 * immediately. Check for U_FAILURE() on output or use with |
|
121 * function chaining. (See User Guide for details.) |
|
122 * @return the requested Normalizer2, if successful |
|
123 * @stable ICU 49 |
|
124 */ |
|
125 static const Normalizer2 * |
|
126 getNFKCInstance(UErrorCode &errorCode); |
|
127 |
|
128 /** |
|
129 * Returns a Normalizer2 instance for Unicode NFKD normalization. |
|
130 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). |
|
131 * Returns an unmodifiable singleton instance. Do not delete it. |
|
132 * @param errorCode Standard ICU error code. Its input value must |
|
133 * pass the U_SUCCESS() test, or else the function returns |
|
134 * immediately. Check for U_FAILURE() on output or use with |
|
135 * function chaining. (See User Guide for details.) |
|
136 * @return the requested Normalizer2, if successful |
|
137 * @stable ICU 49 |
|
138 */ |
|
139 static const Normalizer2 * |
|
140 getNFKDInstance(UErrorCode &errorCode); |
|
141 |
|
142 /** |
|
143 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. |
|
144 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). |
|
145 * Returns an unmodifiable singleton instance. Do not delete it. |
|
146 * @param errorCode Standard ICU error code. Its input value must |
|
147 * pass the U_SUCCESS() test, or else the function returns |
|
148 * immediately. Check for U_FAILURE() on output or use with |
|
149 * function chaining. (See User Guide for details.) |
|
150 * @return the requested Normalizer2, if successful |
|
151 * @stable ICU 49 |
|
152 */ |
|
153 static const Normalizer2 * |
|
154 getNFKCCasefoldInstance(UErrorCode &errorCode); |
|
155 |
|
156 /** |
|
157 * Returns a Normalizer2 instance which uses the specified data file |
|
158 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) |
|
159 * and which composes or decomposes text according to the specified mode. |
|
160 * Returns an unmodifiable singleton instance. Do not delete it. |
|
161 * |
|
162 * Use packageName=NULL for data files that are part of ICU's own data. |
|
163 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. |
|
164 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. |
|
165 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. |
|
166 * |
|
167 * @param packageName NULL for ICU built-in data, otherwise application data package name |
|
168 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file |
|
169 * @param mode normalization mode (compose or decompose etc.) |
|
170 * @param errorCode Standard ICU error code. Its input value must |
|
171 * pass the U_SUCCESS() test, or else the function returns |
|
172 * immediately. Check for U_FAILURE() on output or use with |
|
173 * function chaining. (See User Guide for details.) |
|
174 * @return the requested Normalizer2, if successful |
|
175 * @stable ICU 4.4 |
|
176 */ |
|
177 static const Normalizer2 * |
|
178 getInstance(const char *packageName, |
|
179 const char *name, |
|
180 UNormalization2Mode mode, |
|
181 UErrorCode &errorCode); |
|
182 |
|
183 /** |
|
184 * Returns the normalized form of the source string. |
|
185 * @param src source string |
|
186 * @param errorCode Standard ICU error code. Its input value must |
|
187 * pass the U_SUCCESS() test, or else the function returns |
|
188 * immediately. Check for U_FAILURE() on output or use with |
|
189 * function chaining. (See User Guide for details.) |
|
190 * @return normalized src |
|
191 * @stable ICU 4.4 |
|
192 */ |
|
193 UnicodeString |
|
194 normalize(const UnicodeString &src, UErrorCode &errorCode) const { |
|
195 UnicodeString result; |
|
196 normalize(src, result, errorCode); |
|
197 return result; |
|
198 } |
|
199 /** |
|
200 * Writes the normalized form of the source string to the destination string |
|
201 * (replacing its contents) and returns the destination string. |
|
202 * The source and destination strings must be different objects. |
|
203 * @param src source string |
|
204 * @param dest destination string; its contents is replaced with normalized src |
|
205 * @param errorCode Standard ICU error code. Its input value must |
|
206 * pass the U_SUCCESS() test, or else the function returns |
|
207 * immediately. Check for U_FAILURE() on output or use with |
|
208 * function chaining. (See User Guide for details.) |
|
209 * @return dest |
|
210 * @stable ICU 4.4 |
|
211 */ |
|
212 virtual UnicodeString & |
|
213 normalize(const UnicodeString &src, |
|
214 UnicodeString &dest, |
|
215 UErrorCode &errorCode) const = 0; |
|
216 /** |
|
217 * Appends the normalized form of the second string to the first string |
|
218 * (merging them at the boundary) and returns the first string. |
|
219 * The result is normalized if the first string was normalized. |
|
220 * The first and second strings must be different objects. |
|
221 * @param first string, should be normalized |
|
222 * @param second string, will be normalized |
|
223 * @param errorCode Standard ICU error code. Its input value must |
|
224 * pass the U_SUCCESS() test, or else the function returns |
|
225 * immediately. Check for U_FAILURE() on output or use with |
|
226 * function chaining. (See User Guide for details.) |
|
227 * @return first |
|
228 * @stable ICU 4.4 |
|
229 */ |
|
230 virtual UnicodeString & |
|
231 normalizeSecondAndAppend(UnicodeString &first, |
|
232 const UnicodeString &second, |
|
233 UErrorCode &errorCode) const = 0; |
|
234 /** |
|
235 * Appends the second string to the first string |
|
236 * (merging them at the boundary) and returns the first string. |
|
237 * The result is normalized if both the strings were normalized. |
|
238 * The first and second strings must be different objects. |
|
239 * @param first string, should be normalized |
|
240 * @param second string, should be normalized |
|
241 * @param errorCode Standard ICU error code. Its input value must |
|
242 * pass the U_SUCCESS() test, or else the function returns |
|
243 * immediately. Check for U_FAILURE() on output or use with |
|
244 * function chaining. (See User Guide for details.) |
|
245 * @return first |
|
246 * @stable ICU 4.4 |
|
247 */ |
|
248 virtual UnicodeString & |
|
249 append(UnicodeString &first, |
|
250 const UnicodeString &second, |
|
251 UErrorCode &errorCode) const = 0; |
|
252 |
|
253 /** |
|
254 * Gets the decomposition mapping of c. |
|
255 * Roughly equivalent to normalizing the String form of c |
|
256 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function |
|
257 * returns FALSE and does not write a string |
|
258 * if c does not have a decomposition mapping in this instance's data. |
|
259 * This function is independent of the mode of the Normalizer2. |
|
260 * @param c code point |
|
261 * @param decomposition String object which will be set to c's |
|
262 * decomposition mapping, if there is one. |
|
263 * @return TRUE if c has a decomposition, otherwise FALSE |
|
264 * @stable ICU 4.6 |
|
265 */ |
|
266 virtual UBool |
|
267 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; |
|
268 |
|
269 /** |
|
270 * Gets the raw decomposition mapping of c. |
|
271 * |
|
272 * This is similar to the getDecomposition() method but returns the |
|
273 * raw decomposition mapping as specified in UnicodeData.txt or |
|
274 * (for custom data) in the mapping files processed by the gennorm2 tool. |
|
275 * By contrast, getDecomposition() returns the processed, |
|
276 * recursively-decomposed version of this mapping. |
|
277 * |
|
278 * When used on a standard NFKC Normalizer2 instance, |
|
279 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. |
|
280 * |
|
281 * When used on a standard NFC Normalizer2 instance, |
|
282 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); |
|
283 * in this case, the result contains either one or two code points (=1..4 UChars). |
|
284 * |
|
285 * This function is independent of the mode of the Normalizer2. |
|
286 * The default implementation returns FALSE. |
|
287 * @param c code point |
|
288 * @param decomposition String object which will be set to c's |
|
289 * raw decomposition mapping, if there is one. |
|
290 * @return TRUE if c has a decomposition, otherwise FALSE |
|
291 * @stable ICU 49 |
|
292 */ |
|
293 virtual UBool |
|
294 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
|
295 |
|
296 /** |
|
297 * Performs pairwise composition of a & b and returns the composite if there is one. |
|
298 * |
|
299 * Returns a composite code point c only if c has a two-way mapping to a+b. |
|
300 * In standard Unicode normalization, this means that |
|
301 * c has a canonical decomposition to a+b |
|
302 * and c does not have the Full_Composition_Exclusion property. |
|
303 * |
|
304 * This function is independent of the mode of the Normalizer2. |
|
305 * The default implementation returns a negative value. |
|
306 * @param a A (normalization starter) code point. |
|
307 * @param b Another code point. |
|
308 * @return The non-negative composite code point if there is one; otherwise a negative value. |
|
309 * @stable ICU 49 |
|
310 */ |
|
311 virtual UChar32 |
|
312 composePair(UChar32 a, UChar32 b) const; |
|
313 |
|
314 /** |
|
315 * Gets the combining class of c. |
|
316 * The default implementation returns 0 |
|
317 * but all standard implementations return the Unicode Canonical_Combining_Class value. |
|
318 * @param c code point |
|
319 * @return c's combining class |
|
320 * @stable ICU 49 |
|
321 */ |
|
322 virtual uint8_t |
|
323 getCombiningClass(UChar32 c) const; |
|
324 |
|
325 /** |
|
326 * Tests if the string is normalized. |
|
327 * Internally, in cases where the quickCheck() method would return "maybe" |
|
328 * (which is only possible for the two COMPOSE modes) this method |
|
329 * resolves to "yes" or "no" to provide a definitive result, |
|
330 * at the cost of doing more work in those cases. |
|
331 * @param s input string |
|
332 * @param errorCode Standard ICU error code. Its input value must |
|
333 * pass the U_SUCCESS() test, or else the function returns |
|
334 * immediately. Check for U_FAILURE() on output or use with |
|
335 * function chaining. (See User Guide for details.) |
|
336 * @return TRUE if s is normalized |
|
337 * @stable ICU 4.4 |
|
338 */ |
|
339 virtual UBool |
|
340 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
|
341 |
|
342 /** |
|
343 * Tests if the string is normalized. |
|
344 * For the two COMPOSE modes, the result could be "maybe" in cases that |
|
345 * would take a little more work to resolve definitively. |
|
346 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster |
|
347 * combination of quick check + normalization, to avoid |
|
348 * re-checking the "yes" prefix. |
|
349 * @param s input string |
|
350 * @param errorCode Standard ICU error code. Its input value must |
|
351 * pass the U_SUCCESS() test, or else the function returns |
|
352 * immediately. Check for U_FAILURE() on output or use with |
|
353 * function chaining. (See User Guide for details.) |
|
354 * @return UNormalizationCheckResult |
|
355 * @stable ICU 4.4 |
|
356 */ |
|
357 virtual UNormalizationCheckResult |
|
358 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
|
359 |
|
360 /** |
|
361 * Returns the end of the normalized substring of the input string. |
|
362 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> |
|
363 * the substring <code>UnicodeString(s, 0, end)</code> |
|
364 * will pass the quick check with a "yes" result. |
|
365 * |
|
366 * The returned end index is usually one or more characters before the |
|
367 * "no" or "maybe" character: The end index is at a normalization boundary. |
|
368 * (See the class documentation for more about normalization boundaries.) |
|
369 * |
|
370 * When the goal is a normalized string and most input strings are expected |
|
371 * to be normalized already, then call this method, |
|
372 * and if it returns a prefix shorter than the input string, |
|
373 * copy that prefix and use normalizeSecondAndAppend() for the remainder. |
|
374 * @param s input string |
|
375 * @param errorCode Standard ICU error code. Its input value must |
|
376 * pass the U_SUCCESS() test, or else the function returns |
|
377 * immediately. Check for U_FAILURE() on output or use with |
|
378 * function chaining. (See User Guide for details.) |
|
379 * @return "yes" span end index |
|
380 * @stable ICU 4.4 |
|
381 */ |
|
382 virtual int32_t |
|
383 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
|
384 |
|
385 /** |
|
386 * Tests if the character always has a normalization boundary before it, |
|
387 * regardless of context. |
|
388 * If true, then the character does not normalization-interact with |
|
389 * preceding characters. |
|
390 * In other words, a string containing this character can be normalized |
|
391 * by processing portions before this character and starting from this |
|
392 * character independently. |
|
393 * This is used for iterative normalization. See the class documentation for details. |
|
394 * @param c character to test |
|
395 * @return TRUE if c has a normalization boundary before it |
|
396 * @stable ICU 4.4 |
|
397 */ |
|
398 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; |
|
399 |
|
400 /** |
|
401 * Tests if the character always has a normalization boundary after it, |
|
402 * regardless of context. |
|
403 * If true, then the character does not normalization-interact with |
|
404 * following characters. |
|
405 * In other words, a string containing this character can be normalized |
|
406 * by processing portions up to this character and after this |
|
407 * character independently. |
|
408 * This is used for iterative normalization. See the class documentation for details. |
|
409 * Note that this operation may be significantly slower than hasBoundaryBefore(). |
|
410 * @param c character to test |
|
411 * @return TRUE if c has a normalization boundary after it |
|
412 * @stable ICU 4.4 |
|
413 */ |
|
414 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; |
|
415 |
|
416 /** |
|
417 * Tests if the character is normalization-inert. |
|
418 * If true, then the character does not change, nor normalization-interact with |
|
419 * preceding or following characters. |
|
420 * In other words, a string containing this character can be normalized |
|
421 * by processing portions before this character and after this |
|
422 * character independently. |
|
423 * This is used for iterative normalization. See the class documentation for details. |
|
424 * Note that this operation may be significantly slower than hasBoundaryBefore(). |
|
425 * @param c character to test |
|
426 * @return TRUE if c is normalization-inert |
|
427 * @stable ICU 4.4 |
|
428 */ |
|
429 virtual UBool isInert(UChar32 c) const = 0; |
|
430 }; |
|
431 |
|
432 /** |
|
433 * Normalization filtered by a UnicodeSet. |
|
434 * Normalizes portions of the text contained in the filter set and leaves |
|
435 * portions not contained in the filter set unchanged. |
|
436 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). |
|
437 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". |
|
438 * This class implements all of (and only) the Normalizer2 API. |
|
439 * An instance of this class is unmodifiable/immutable but is constructed and |
|
440 * must be destructed by the owner. |
|
441 * @stable ICU 4.4 |
|
442 */ |
|
443 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { |
|
444 public: |
|
445 /** |
|
446 * Constructs a filtered normalizer wrapping any Normalizer2 instance |
|
447 * and a filter set. |
|
448 * Both are aliased and must not be modified or deleted while this object |
|
449 * is used. |
|
450 * The filter set should be frozen; otherwise the performance will suffer greatly. |
|
451 * @param n2 wrapped Normalizer2 instance |
|
452 * @param filterSet UnicodeSet which determines the characters to be normalized |
|
453 * @stable ICU 4.4 |
|
454 */ |
|
455 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : |
|
456 norm2(n2), set(filterSet) {} |
|
457 |
|
458 /** |
|
459 * Destructor. |
|
460 * @stable ICU 4.4 |
|
461 */ |
|
462 ~FilteredNormalizer2(); |
|
463 |
|
464 /** |
|
465 * Writes the normalized form of the source string to the destination string |
|
466 * (replacing its contents) and returns the destination string. |
|
467 * The source and destination strings must be different objects. |
|
468 * @param src source string |
|
469 * @param dest destination string; its contents is replaced with normalized src |
|
470 * @param errorCode Standard ICU error code. Its input value must |
|
471 * pass the U_SUCCESS() test, or else the function returns |
|
472 * immediately. Check for U_FAILURE() on output or use with |
|
473 * function chaining. (See User Guide for details.) |
|
474 * @return dest |
|
475 * @stable ICU 4.4 |
|
476 */ |
|
477 virtual UnicodeString & |
|
478 normalize(const UnicodeString &src, |
|
479 UnicodeString &dest, |
|
480 UErrorCode &errorCode) const; |
|
481 /** |
|
482 * Appends the normalized form of the second string to the first string |
|
483 * (merging them at the boundary) and returns the first string. |
|
484 * The result is normalized if the first string was normalized. |
|
485 * The first and second strings must be different objects. |
|
486 * @param first string, should be normalized |
|
487 * @param second string, will be normalized |
|
488 * @param errorCode Standard ICU error code. Its input value must |
|
489 * pass the U_SUCCESS() test, or else the function returns |
|
490 * immediately. Check for U_FAILURE() on output or use with |
|
491 * function chaining. (See User Guide for details.) |
|
492 * @return first |
|
493 * @stable ICU 4.4 |
|
494 */ |
|
495 virtual UnicodeString & |
|
496 normalizeSecondAndAppend(UnicodeString &first, |
|
497 const UnicodeString &second, |
|
498 UErrorCode &errorCode) const; |
|
499 /** |
|
500 * Appends the second string to the first string |
|
501 * (merging them at the boundary) and returns the first string. |
|
502 * The result is normalized if both the strings were normalized. |
|
503 * The first and second strings must be different objects. |
|
504 * @param first string, should be normalized |
|
505 * @param second string, should be normalized |
|
506 * @param errorCode Standard ICU error code. Its input value must |
|
507 * pass the U_SUCCESS() test, or else the function returns |
|
508 * immediately. Check for U_FAILURE() on output or use with |
|
509 * function chaining. (See User Guide for details.) |
|
510 * @return first |
|
511 * @stable ICU 4.4 |
|
512 */ |
|
513 virtual UnicodeString & |
|
514 append(UnicodeString &first, |
|
515 const UnicodeString &second, |
|
516 UErrorCode &errorCode) const; |
|
517 |
|
518 /** |
|
519 * Gets the decomposition mapping of c. |
|
520 * For details see the base class documentation. |
|
521 * |
|
522 * This function is independent of the mode of the Normalizer2. |
|
523 * @param c code point |
|
524 * @param decomposition String object which will be set to c's |
|
525 * decomposition mapping, if there is one. |
|
526 * @return TRUE if c has a decomposition, otherwise FALSE |
|
527 * @stable ICU 4.6 |
|
528 */ |
|
529 virtual UBool |
|
530 getDecomposition(UChar32 c, UnicodeString &decomposition) const; |
|
531 |
|
532 /** |
|
533 * Gets the raw decomposition mapping of c. |
|
534 * For details see the base class documentation. |
|
535 * |
|
536 * This function is independent of the mode of the Normalizer2. |
|
537 * @param c code point |
|
538 * @param decomposition String object which will be set to c's |
|
539 * raw decomposition mapping, if there is one. |
|
540 * @return TRUE if c has a decomposition, otherwise FALSE |
|
541 * @stable ICU 49 |
|
542 */ |
|
543 virtual UBool |
|
544 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
|
545 |
|
546 /** |
|
547 * Performs pairwise composition of a & b and returns the composite if there is one. |
|
548 * For details see the base class documentation. |
|
549 * |
|
550 * This function is independent of the mode of the Normalizer2. |
|
551 * @param a A (normalization starter) code point. |
|
552 * @param b Another code point. |
|
553 * @return The non-negative composite code point if there is one; otherwise a negative value. |
|
554 * @stable ICU 49 |
|
555 */ |
|
556 virtual UChar32 |
|
557 composePair(UChar32 a, UChar32 b) const; |
|
558 |
|
559 /** |
|
560 * Gets the combining class of c. |
|
561 * The default implementation returns 0 |
|
562 * but all standard implementations return the Unicode Canonical_Combining_Class value. |
|
563 * @param c code point |
|
564 * @return c's combining class |
|
565 * @stable ICU 49 |
|
566 */ |
|
567 virtual uint8_t |
|
568 getCombiningClass(UChar32 c) const; |
|
569 |
|
570 /** |
|
571 * Tests if the string is normalized. |
|
572 * For details see the Normalizer2 base class documentation. |
|
573 * @param s input string |
|
574 * @param errorCode Standard ICU error code. Its input value must |
|
575 * pass the U_SUCCESS() test, or else the function returns |
|
576 * immediately. Check for U_FAILURE() on output or use with |
|
577 * function chaining. (See User Guide for details.) |
|
578 * @return TRUE if s is normalized |
|
579 * @stable ICU 4.4 |
|
580 */ |
|
581 virtual UBool |
|
582 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const; |
|
583 /** |
|
584 * Tests if the string is normalized. |
|
585 * For details see the Normalizer2 base class documentation. |
|
586 * @param s input string |
|
587 * @param errorCode Standard ICU error code. Its input value must |
|
588 * pass the U_SUCCESS() test, or else the function returns |
|
589 * immediately. Check for U_FAILURE() on output or use with |
|
590 * function chaining. (See User Guide for details.) |
|
591 * @return UNormalizationCheckResult |
|
592 * @stable ICU 4.4 |
|
593 */ |
|
594 virtual UNormalizationCheckResult |
|
595 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const; |
|
596 /** |
|
597 * Returns the end of the normalized substring of the input string. |
|
598 * For details see the Normalizer2 base class documentation. |
|
599 * @param s input string |
|
600 * @param errorCode Standard ICU error code. Its input value must |
|
601 * pass the U_SUCCESS() test, or else the function returns |
|
602 * immediately. Check for U_FAILURE() on output or use with |
|
603 * function chaining. (See User Guide for details.) |
|
604 * @return "yes" span end index |
|
605 * @stable ICU 4.4 |
|
606 */ |
|
607 virtual int32_t |
|
608 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const; |
|
609 |
|
610 /** |
|
611 * Tests if the character always has a normalization boundary before it, |
|
612 * regardless of context. |
|
613 * For details see the Normalizer2 base class documentation. |
|
614 * @param c character to test |
|
615 * @return TRUE if c has a normalization boundary before it |
|
616 * @stable ICU 4.4 |
|
617 */ |
|
618 virtual UBool hasBoundaryBefore(UChar32 c) const; |
|
619 |
|
620 /** |
|
621 * Tests if the character always has a normalization boundary after it, |
|
622 * regardless of context. |
|
623 * For details see the Normalizer2 base class documentation. |
|
624 * @param c character to test |
|
625 * @return TRUE if c has a normalization boundary after it |
|
626 * @stable ICU 4.4 |
|
627 */ |
|
628 virtual UBool hasBoundaryAfter(UChar32 c) const; |
|
629 |
|
630 /** |
|
631 * Tests if the character is normalization-inert. |
|
632 * For details see the Normalizer2 base class documentation. |
|
633 * @param c character to test |
|
634 * @return TRUE if c is normalization-inert |
|
635 * @stable ICU 4.4 |
|
636 */ |
|
637 virtual UBool isInert(UChar32 c) const; |
|
638 private: |
|
639 UnicodeString & |
|
640 normalize(const UnicodeString &src, |
|
641 UnicodeString &dest, |
|
642 USetSpanCondition spanCondition, |
|
643 UErrorCode &errorCode) const; |
|
644 |
|
645 UnicodeString & |
|
646 normalizeSecondAndAppend(UnicodeString &first, |
|
647 const UnicodeString &second, |
|
648 UBool doNormalize, |
|
649 UErrorCode &errorCode) const; |
|
650 |
|
651 const Normalizer2 &norm2; |
|
652 const UnicodeSet &set; |
|
653 }; |
|
654 |
|
655 U_NAMESPACE_END |
|
656 |
|
657 #endif // !UCONFIG_NO_NORMALIZATION |
|
658 #endif // __NORMALIZER2_H__ |