|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1999-2004, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: utf.h |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 1999sep09 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |
|
18 #define BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |
|
19 |
|
20 #include "base/basictypes.h" |
|
21 |
|
22 namespace base_icu { |
|
23 |
|
24 typedef uint32 UChar32; |
|
25 typedef int8 UBool; |
|
26 |
|
27 // General --------------------------------------------------------------------- |
|
28 // from utf.h |
|
29 |
|
/**
 * Sentinel for APIs that take or return a single code point (UChar32).
 * The value lies outside the Unicode code point range 0..0x10ffff, so a new
 * API can use it as its "done" or "error" marker without clashing with text.
 *
 * ICU APIs designed before ICU 2.4 usually define their own service-specific
 * "done" values, mostly 0xffff. Such a value has to be told apart from a real
 * U+ffff in the text by calling functions like
 * CharacterIterator::hasNext() or UnicodeString::length().
 *
 * @return -1
 * @see UChar32
 * @stable ICU 2.4
 */
#define CBU_SENTINEL (-1)
|
49 |
|
/**
 * Tests whether a code point is a Unicode noncharacter:
 * U+fdd0..U+fdef, or a value whose low 16 bits are fffe/ffff,
 * in both cases no greater than U+10ffff.
 * The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_UNICODE_NONCHAR(c) \
    (0xfdd0 <= (c) && \
     ((uint32)(c) <= 0xfdef || 0xfffe == ((c) & 0xfffe)) && \
     (uint32)(c) <= 0x10ffff)
|
60 |
|
/**
 * Tests whether c is a Unicode code point value (0..U+10ffff)
 * that can be assigned a character.
 *
 * Code points that are not characters include:
 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - anything above the highest Unicode code point value U+10ffff
 *
 * Every code point below U+d800 is therefore a character code point, which
 * is why that boundary is checked first: it short-circuits the rest.
 * The argument is evaluated more than once.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_UNICODE_CHAR(c) \
    ((uint32)(c) < 0xd800 || \
     (0xdfff < (uint32)(c) && \
      (uint32)(c) <= 0x10ffff && \
      !CBU_IS_UNICODE_NONCHAR(c)))
|
83 |
|
/**
 * Tests whether a code point is a surrogate (U+d800..U+dfff).
 * All bits above the low 11 must match those of 0xd800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE(c) (0xd800 == ((c) & 0xfffff800))
|
91 |
|
/**
 * Given that c is already known to be a surrogate code point
 * (CBU_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 0x400 is inspected: it is clear for U+d800..U+dbff and set for
 * U+dc00..U+dfff, so the result is meaningless for non-surrogates.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE_LEAD(c) (0 == ((c) & 0x400))
|
100 |
|
101 |
|
102 // UTF-8 macros ---------------------------------------------------------------- |
|
103 // from utf8.h |
|
104 |
|
105 extern const uint8 utf8_countTrailBytes[256]; |
|
106 |
|
107 /** |
|
108 * Count the trail bytes for a UTF-8 lead byte. |
|
109 * @internal |
|
110 */ |
|
111 #define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)leadByte]) |
|
112 |
|
/**
 * Mask a UTF-8 lead byte in place, keeping only the low
 * (6 - countTrailBytes) bits, i.e. the bits that are part of the code point
 * value. leadByte must be a modifiable lvalue; the expression yields the
 * masked value.
 * @param leadByte        lead byte variable, modified by the macro
 * @param countTrailBytes number of trail bytes that follow the lead byte
 * @internal
 */
#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte) &= ((1 << (6 - (countTrailBytes))) - 1))
|
118 |
|
/**
 * Tests whether a code unit (byte) encodes a code point all by itself
 * (US-ASCII 0..0x7f), i.e. whether its top bit is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_SINGLE(c) (0 == ((c) & 0x80))
|
126 |
|
/**
 * Tests whether a code unit (byte) is a UTF-8 lead byte.
 * Subtracting 0xc0 and truncating to uint8 turns the range check
 * 0xc0..0xfd into a single unsigned comparison.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_LEAD(c) (0x3e > (uint8)((c) - 0xc0))
|
134 |
|
/**
 * Tests whether a code unit (byte) is a UTF-8 trail byte,
 * i.e. whether its two top bits are 10 (0x80..0xbf).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_TRAIL(c) (0x80 == ((c) & 0xc0))
|
142 |
|
/**
 * Number of code units (bytes) used by the UTF-8 encoding of a Unicode
 * code point. The ranges are tested in ascending order; surrogates
 * (U+d800..U+dfff) and values above U+10ffff yield 0.
 * The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define CBU8_LENGTH(c) \
    ((uint32)(c) <= 0x7f ? 1 : \
     (uint32)(c) <= 0x7ff ? 2 : \
     (uint32)(c) <= 0xd7ff ? 3 : \
     ((uint32)(c) <= 0xdfff || (uint32)(c) > 0x10ffff) ? 0 : \
     (uint32)(c) <= 0xffff ? 3 : 4)
|
160 |
|
/**
 * Upper bound on the number of UTF-8 code units (bytes) needed for one
 * Unicode code point (U+0000..U+10ffff).
 * @return 4
 * @stable ICU 2.4
 */
#define CBU8_MAX_LENGTH 4
|
167 |
|
168 /** |
|
169 * Function for handling "next code point" with error-checking. |
|
170 * @internal |
|
171 */ |
|
172 UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict); |
|
173 |
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * A byte below 0x80 is returned as is. If the offset points to the lead
 * byte of a multi-byte sequence, the rest of the sequence is handed to
 * base_icu::utf8_nextCharSafeBody(), whose result becomes c. If the offset
 * points to any other byte (for example a trail byte), c is set to
 * CBU_SENTINEL.
 *
 * Every macro argument is parenthesized where it is used, so expression
 * arguments such as `cond ? a : b` for s are cast and passed as a whole.
 * The arguments are evaluated more than once; i and c must be lvalues.
 *
 * @param s const uint8 * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output code point variable; receives CBU_SENTINEL on error
 *          (a c<0 test only works when c has a signed type)
 * @see CBU8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define CBU8_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(((uint8)(c))>=0x80) { \
        if(CBU8_IS_LEAD(c)) { \
            (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)(s), &(i), (int32)(length), (c), -1); \
        } else { \
            (c)=CBU_SENTINEL; \
        } \
    } \
}
|
202 |
|
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced past the bytes written (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the
 * string. Otherwise, the result is undefined.
 *
 * One branch per encoded length: up to U+7f one byte, up to U+7ff two,
 * up to U+ffff three, everything else four. Each branch writes the lead
 * byte followed by its trail bytes, six payload bits per trail byte.
 * The arguments are evaluated more than once.
 *
 * @param s uint8 * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see CBU8_APPEND
 * @stable ICU 2.4
 */
#define CBU8_APPEND_UNSAFE(s, i, c) { \
    if((uint32)(c) <= 0x7f) { \
        (s)[(i)++] = (uint8)(c); \
    } else if((uint32)(c) <= 0x7ff) { \
        (s)[(i)++] = (uint8)(((c) >> 6) | 0xc0); \
        (s)[(i)++] = (uint8)(((c) & 0x3f) | 0x80); \
    } else if((uint32)(c) <= 0xffff) { \
        (s)[(i)++] = (uint8)(((c) >> 12) | 0xe0); \
        (s)[(i)++] = (uint8)((((c) >> 6) & 0x3f) | 0x80); \
        (s)[(i)++] = (uint8)(((c) & 0x3f) | 0x80); \
    } else { \
        (s)[(i)++] = (uint8)(((c) >> 18) | 0xf0); \
        (s)[(i)++] = (uint8)((((c) >> 12) & 0x3f) | 0x80); \
        (s)[(i)++] = (uint8)((((c) >> 6) & 0x3f) | 0x80); \
        (s)[(i)++] = (uint8)(((c) & 0x3f) | 0x80); \
    } \
}
|
234 |
|
235 // UTF-16 macros --------------------------------------------------------------- |
|
236 // from utf16.h |
|
237 |
|
/**
 * Tests whether a code unit encodes a code point on its own
 * (BMP, not a surrogate). This is simply the negation of CBU_IS_SURROGATE.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SINGLE(c) (!CBU_IS_SURROGATE(c))
|
245 |
|
/**
 * Tests whether a code unit is a lead surrogate (U+d800..U+dbff).
 * All bits above the low 10 must match those of 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_LEAD(c) (0xd800 == ((c) & 0xfffffc00))
|
253 |
|
/**
 * Tests whether a code unit is a trail surrogate (U+dc00..U+dfff).
 * All bits above the low 10 must match those of 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_TRAIL(c) (0xdc00 == ((c) & 0xfffffc00))
|
261 |
|
/**
 * Tests whether a code unit is a surrogate (U+d800..U+dfff).
 * Forwards to CBU_IS_SURROGATE.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE(c) (CBU_IS_SURROGATE(c))
|
269 |
|
/**
 * Given that c is already known to be a surrogate code unit
 * (CBU16_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 0x400 is inspected, so the result is meaningless for
 * non-surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE_LEAD(c) (0 == ((c) & 0x400))
|
278 |
|
/**
 * Helper constant for CBU16_GET_SUPPLEMENTARY: the amount that
 * (lead<<10)+trail exceeds the code point by, i.e. the shifted lead base
 * 0xd800 plus the trail base 0xdc00 minus the first supplementary code
 * point 0x10000.
 * @internal
 */
#define CBU16_SURROGATE_OFFSET ((0xd800 << 10UL) + 0xdc00 - 0x10000)
|
284 |
|
/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) they encode.
 * Both inputs are converted to base_icu::UChar32 before the arithmetic.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
    (((base_icu::UChar32)(lead) << 10UL) + \
     (base_icu::UChar32)(trail) - \
     CBU16_SURROGATE_OFFSET)
|
298 |
|
299 |
|
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The result is cast to uint16, the same 16-bit code unit type that
 * CBU16_APPEND_UNSAFE stores, so the macro does not rely on a UChar type
 * being visible at the point of use.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_LEAD(supplementary) ((uint16)(((supplementary)>>10)+0xd7c0))
|
308 |
|
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The result is cast to uint16, the same 16-bit code unit type that
 * CBU16_APPEND_UNSAFE stores, so the macro does not rely on a UChar type
 * being visible at the point of use.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_TRAIL(supplementary) ((uint16)(((supplementary)&0x3ff)|0xdc00))
|
317 |
|
/**
 * Number of 16-bit code units used to encode a Unicode code point:
 * 2 for anything above U+ffff, otherwise 1.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define CBU16_LENGTH(c) (0xffff < (uint32)(c) ? 2 : 1)
|
326 |
|
/**
 * Upper bound on the number of 16-bit code units needed for one
 * Unicode code point (U+0000..U+10ffff).
 * @return 2
 * @stable ICU 2.4
 */
#define CBU16_MAX_LENGTH 2
|
333 |
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, provided it lies before length.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * The scratch variable holding the candidate trail unit has a name without
 * a double underscore, because such identifiers are reserved for the
 * implementation. The arguments are evaluated more than once; i and c must
 * be lvalues.
 *
 * @param s const UChar * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable
 * @stable ICU 2.4
 */
#define CBU16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(CBU16_IS_LEAD(c)) { \
        uint16 cbu16_next_trail_; \
        if((i)<(length) && CBU16_IS_TRAIL(cbu16_next_trail_=(s)[(i)])) { \
            ++(i); \
            (c)=CBU16_GET_SUPPLEMENTARY((c), cbu16_next_trail_); \
        } \
    } \
}
|
363 |
|
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced past the units written (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the
 * string. Otherwise, the result is undefined.
 *
 * A code point above U+ffff is written as a lead surrogate followed by a
 * trail surrogate; anything else is stored as a single unit.
 * The arguments are evaluated more than once.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see CBU16_APPEND
 * @stable ICU 2.4
 */
#define CBU16_APPEND_UNSAFE(s, i, c) { \
    if((uint32)(c) > 0xffff) { \
        (s)[(i)++] = (uint16)(((c) >> 10) + 0xd7c0); \
        (s)[(i)++] = (uint16)(((c) & 0x3ff) | 0xdc00); \
    } else { \
        (s)[(i)++] = (uint16)(c); \
    } \
}
|
385 |
|
}  // namespace base_icu
|
387 |
|
388 #endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |