|
1 /* |
|
2 ****************************************************************************** |
|
3 * Copyright (c) 1996-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ****************************************************************************** |
|
6 * File unorm.cpp |
|
7 * |
|
8 * Created by: Vladimir Weinstein 12052000 |
|
9 * |
|
10 * Modification history : |
|
11 * |
|
12 * Date Name Description |
|
13 * 02/01/01 synwee Added normalization quickcheck enum and method. |
|
14 * 02/12/01 synwee Commented out quickcheck util api has been approved |
|
15 * Added private method for doing FCD checks |
|
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through |
|
17 * string for codepoints < 0x300 for the normalization |
|
18 * mode NFC. |
|
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here |
|
20 * instead of just wrappers around normlzr.cpp, |
|
21 * load unorm.dat, support Unicode 3.1 with |
|
22 * supplementary code points, etc. |
|
23 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code |
|
24 */ |
|
25 |
|
26 #include "unicode/utypes.h" |
|
27 |
|
28 #if !UCONFIG_NO_NORMALIZATION |
|
29 |
|
30 #include "unicode/udata.h" |
|
31 #include "unicode/ustring.h" |
|
32 #include "unicode/uiter.h" |
|
33 #include "unicode/unorm.h" |
|
34 #include "unicode/unorm2.h" |
|
35 #include "normalizer2impl.h" |
|
36 #include "unormimp.h" |
|
37 #include "uprops.h" |
|
38 #include "ustr_imp.h" |
|
39 |
|
40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
|
41 |
|
42 U_NAMESPACE_USE |
|
43 |
|
44 /* quick check functions ---------------------------------------------------- */ |
|
45 |
|
46 U_CAPI UNormalizationCheckResult U_EXPORT2 |
|
47 unorm_quickCheck(const UChar *src, |
|
48 int32_t srcLength, |
|
49 UNormalizationMode mode, |
|
50 UErrorCode *pErrorCode) { |
|
51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
|
53 } |
|
54 |
|
55 U_CAPI UNormalizationCheckResult U_EXPORT2 |
|
56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, |
|
57 UNormalizationMode mode, int32_t options, |
|
58 UErrorCode *pErrorCode) { |
|
59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
60 if(options&UNORM_UNICODE_3_2) { |
|
61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
|
62 return unorm2_quickCheck( |
|
63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
|
64 src, srcLength, pErrorCode); |
|
65 } else { |
|
66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
|
67 } |
|
68 } |
|
69 |
|
70 U_CAPI UBool U_EXPORT2 |
|
71 unorm_isNormalized(const UChar *src, int32_t srcLength, |
|
72 UNormalizationMode mode, |
|
73 UErrorCode *pErrorCode) { |
|
74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
|
76 } |
|
77 |
|
78 U_CAPI UBool U_EXPORT2 |
|
79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, |
|
80 UNormalizationMode mode, int32_t options, |
|
81 UErrorCode *pErrorCode) { |
|
82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
83 if(options&UNORM_UNICODE_3_2) { |
|
84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
|
85 return unorm2_isNormalized( |
|
86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
|
87 src, srcLength, pErrorCode); |
|
88 } else { |
|
89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
|
90 } |
|
91 } |
|
92 |
|
93 /* normalize() API ---------------------------------------------------------- */ |
|
94 |
|
95 /** Public API for normalizing. */ |
|
96 U_CAPI int32_t U_EXPORT2 |
|
97 unorm_normalize(const UChar *src, int32_t srcLength, |
|
98 UNormalizationMode mode, int32_t options, |
|
99 UChar *dest, int32_t destCapacity, |
|
100 UErrorCode *pErrorCode) { |
|
101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
102 if(options&UNORM_UNICODE_3_2) { |
|
103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
|
104 return unorm2_normalize( |
|
105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
|
106 src, srcLength, dest, destCapacity, pErrorCode); |
|
107 } else { |
|
108 return unorm2_normalize((const UNormalizer2 *)n2, |
|
109 src, srcLength, dest, destCapacity, pErrorCode); |
|
110 } |
|
111 } |
|
112 |
|
113 |
|
114 /* iteration functions ------------------------------------------------------ */ |
|
115 |
|
116 static int32_t |
|
117 _iterate(UCharIterator *src, UBool forward, |
|
118 UChar *dest, int32_t destCapacity, |
|
119 const Normalizer2 *n2, |
|
120 UBool doNormalize, UBool *pNeededToNormalize, |
|
121 UErrorCode *pErrorCode) { |
|
122 if(U_FAILURE(*pErrorCode)) { |
|
123 return 0; |
|
124 } |
|
125 if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { |
|
126 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
127 return 0; |
|
128 } |
|
129 |
|
130 if(pNeededToNormalize!=NULL) { |
|
131 *pNeededToNormalize=FALSE; |
|
132 } |
|
133 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { |
|
134 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
|
135 } |
|
136 |
|
137 UnicodeString buffer; |
|
138 UChar32 c; |
|
139 if(forward) { |
|
140 /* get one character and ignore its properties */ |
|
141 buffer.append(uiter_next32(src)); |
|
142 /* get all following characters until we see a boundary */ |
|
143 while((c=uiter_next32(src))>=0) { |
|
144 if(n2->hasBoundaryBefore(c)) { |
|
145 /* back out the latest movement to stop at the boundary */ |
|
146 src->move(src, -U16_LENGTH(c), UITER_CURRENT); |
|
147 break; |
|
148 } else { |
|
149 buffer.append(c); |
|
150 } |
|
151 } |
|
152 } else { |
|
153 while((c=uiter_previous32(src))>=0) { |
|
154 /* always write this character to the front of the buffer */ |
|
155 buffer.insert(0, c); |
|
156 /* stop if this just-copied character is a boundary */ |
|
157 if(n2->hasBoundaryBefore(c)) { |
|
158 break; |
|
159 } |
|
160 } |
|
161 } |
|
162 |
|
163 UnicodeString destString(dest, 0, destCapacity); |
|
164 if(buffer.length()>0 && doNormalize) { |
|
165 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); |
|
166 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { |
|
167 *pNeededToNormalize= destString!=buffer; |
|
168 } |
|
169 return destString.length(); |
|
170 } else { |
|
171 /* just copy the source characters */ |
|
172 return buffer.extract(dest, destCapacity, *pErrorCode); |
|
173 } |
|
174 } |
|
175 |
|
176 static int32_t |
|
177 unorm_iterate(UCharIterator *src, UBool forward, |
|
178 UChar *dest, int32_t destCapacity, |
|
179 UNormalizationMode mode, int32_t options, |
|
180 UBool doNormalize, UBool *pNeededToNormalize, |
|
181 UErrorCode *pErrorCode) { |
|
182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
183 if(options&UNORM_UNICODE_3_2) { |
|
184 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
|
185 if(U_FAILURE(*pErrorCode)) { |
|
186 return 0; |
|
187 } |
|
188 FilteredNormalizer2 fn2(*n2, *uni32); |
|
189 return _iterate(src, forward, dest, destCapacity, |
|
190 &fn2, doNormalize, pNeededToNormalize, pErrorCode); |
|
191 } |
|
192 return _iterate(src, forward, dest, destCapacity, |
|
193 n2, doNormalize, pNeededToNormalize, pErrorCode); |
|
194 } |
|
195 |
|
196 U_CAPI int32_t U_EXPORT2 |
|
197 unorm_previous(UCharIterator *src, |
|
198 UChar *dest, int32_t destCapacity, |
|
199 UNormalizationMode mode, int32_t options, |
|
200 UBool doNormalize, UBool *pNeededToNormalize, |
|
201 UErrorCode *pErrorCode) { |
|
202 return unorm_iterate(src, FALSE, |
|
203 dest, destCapacity, |
|
204 mode, options, |
|
205 doNormalize, pNeededToNormalize, |
|
206 pErrorCode); |
|
207 } |
|
208 |
|
209 U_CAPI int32_t U_EXPORT2 |
|
210 unorm_next(UCharIterator *src, |
|
211 UChar *dest, int32_t destCapacity, |
|
212 UNormalizationMode mode, int32_t options, |
|
213 UBool doNormalize, UBool *pNeededToNormalize, |
|
214 UErrorCode *pErrorCode) { |
|
215 return unorm_iterate(src, TRUE, |
|
216 dest, destCapacity, |
|
217 mode, options, |
|
218 doNormalize, pNeededToNormalize, |
|
219 pErrorCode); |
|
220 } |
|
221 |
|
222 /* Concatenation of normalized strings -------------------------------------- */ |
|
223 |
|
224 static int32_t |
|
225 _concatenate(const UChar *left, int32_t leftLength, |
|
226 const UChar *right, int32_t rightLength, |
|
227 UChar *dest, int32_t destCapacity, |
|
228 const Normalizer2 *n2, |
|
229 UErrorCode *pErrorCode) { |
|
230 if(U_FAILURE(*pErrorCode)) { |
|
231 return 0; |
|
232 } |
|
233 if(destCapacity<0 || (dest==NULL && destCapacity>0) || |
|
234 left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { |
|
235 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
236 return 0; |
|
237 } |
|
238 |
|
239 /* check for overlapping right and destination */ |
|
240 if( dest!=NULL && |
|
241 ((right>=dest && right<(dest+destCapacity)) || |
|
242 (rightLength>0 && dest>=right && dest<(right+rightLength))) |
|
243 ) { |
|
244 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
245 return 0; |
|
246 } |
|
247 |
|
248 /* allow left==dest */ |
|
249 UnicodeString destString; |
|
250 if(left==dest) { |
|
251 destString.setTo(dest, leftLength, destCapacity); |
|
252 } else { |
|
253 destString.setTo(dest, 0, destCapacity); |
|
254 destString.append(left, leftLength); |
|
255 } |
|
256 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). |
|
257 extract(dest, destCapacity, *pErrorCode); |
|
258 } |
|
259 |
|
260 U_CAPI int32_t U_EXPORT2 |
|
261 unorm_concatenate(const UChar *left, int32_t leftLength, |
|
262 const UChar *right, int32_t rightLength, |
|
263 UChar *dest, int32_t destCapacity, |
|
264 UNormalizationMode mode, int32_t options, |
|
265 UErrorCode *pErrorCode) { |
|
266 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
|
267 if(options&UNORM_UNICODE_3_2) { |
|
268 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
|
269 if(U_FAILURE(*pErrorCode)) { |
|
270 return 0; |
|
271 } |
|
272 FilteredNormalizer2 fn2(*n2, *uni32); |
|
273 return _concatenate(left, leftLength, right, rightLength, |
|
274 dest, destCapacity, &fn2, pErrorCode); |
|
275 } |
|
276 return _concatenate(left, leftLength, right, rightLength, |
|
277 dest, destCapacity, n2, pErrorCode); |
|
278 } |
|
279 |
|
280 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |