Wed, 31 Dec 2014 07:16:47 +0100
Revert simplistic fix pending revisit of Mozilla integration attempt.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 1999-2004, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: utf.h |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 1999sep09 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |
michael@0 | 18 | #define BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |
michael@0 | 19 | |
michael@0 | 20 | #include "base/basictypes.h" |
michael@0 | 21 | |
michael@0 | 22 | namespace base_icu { |
michael@0 | 23 | |
michael@0 | 24 | typedef uint32 UChar32; |
michael@0 | 25 | typedef int8 UBool; |
michael@0 | 26 | |
michael@0 | 27 | // General --------------------------------------------------------------------- |
michael@0 | 28 | // from utf.h |
michael@0 | 29 | |
michael@0 | 30 | /** |
michael@0 | 31 | * This value is intended for sentinel values for APIs that |
michael@0 | 32 | * (take or) return single code points (UChar32). |
michael@0 | 33 | * It is outside of the Unicode code point range 0..0x10ffff. |
michael@0 | 34 | * |
michael@0 | 35 | * For example, a "done" or "error" value in a new API |
michael@0 | 36 | * could be indicated with CBU_SENTINEL. |
michael@0 | 37 | * |
michael@0 | 38 | * ICU APIs designed before ICU 2.4 usually define service-specific "done" |
michael@0 | 39 | * values, mostly 0xffff. |
michael@0 | 40 | * Those may need to be distinguished from |
michael@0 | 41 | * actual U+ffff text contents by calling functions like |
michael@0 | 42 | * CharacterIterator::hasNext() or UnicodeString::length(). |
michael@0 | 43 | * |
michael@0 | 44 | * @return -1 |
michael@0 | 45 | * @see UChar32 |
michael@0 | 46 | * @stable ICU 2.4 |
michael@0 | 47 | */ |
michael@0 | 48 | #define CBU_SENTINEL (-1) |
michael@0 | 49 | |
michael@0 | 50 | /** |
michael@0 | 51 | * Is this code point a Unicode noncharacter? |
michael@0 | 52 | * @param c 32-bit code point |
michael@0 | 53 | * @return TRUE or FALSE |
michael@0 | 54 | * @stable ICU 2.4 |
michael@0 | 55 | */ |
michael@0 | 56 | #define CBU_IS_UNICODE_NONCHAR(c) \ |
michael@0 | 57 | ((c)>=0xfdd0 && \ |
michael@0 | 58 | ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ |
michael@0 | 59 | (uint32)(c)<=0x10ffff) |
michael@0 | 60 | |
michael@0 | 61 | /** |
michael@0 | 62 | * Is c a Unicode code point value (0..U+10ffff) |
michael@0 | 63 | * that can be assigned a character? |
michael@0 | 64 | * |
michael@0 | 65 | * Code points that are not characters include: |
michael@0 | 66 | * - single surrogate code points (U+d800..U+dfff, 2048 code points) |
michael@0 | 67 | * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) |
michael@0 | 68 | * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) |
michael@0 | 69 | * - the highest Unicode code point value is U+10ffff |
michael@0 | 70 | * |
michael@0 | 71 | * This means that all code points below U+d800 are character code points, |
michael@0 | 72 | * and that boundary is tested first for performance. |
michael@0 | 73 | * |
michael@0 | 74 | * @param c 32-bit code point |
michael@0 | 75 | * @return TRUE or FALSE |
michael@0 | 76 | * @stable ICU 2.4 |
michael@0 | 77 | */ |
michael@0 | 78 | #define CBU_IS_UNICODE_CHAR(c) \ |
michael@0 | 79 | ((uint32)(c)<0xd800 || \ |
michael@0 | 80 | ((uint32)(c)>0xdfff && \ |
michael@0 | 81 | (uint32)(c)<=0x10ffff && \ |
michael@0 | 82 | !CBU_IS_UNICODE_NONCHAR(c))) |
michael@0 | 83 | |
michael@0 | 84 | /** |
michael@0 | 85 | * Is this code point a surrogate (U+d800..U+dfff)? |
michael@0 | 86 | * @param c 32-bit code point |
michael@0 | 87 | * @return TRUE or FALSE |
michael@0 | 88 | * @stable ICU 2.4 |
michael@0 | 89 | */ |
michael@0 | 90 | #define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) |
michael@0 | 91 | |
michael@0 | 92 | /** |
michael@0 | 93 | * Assuming c is a surrogate code point (CBU_IS_SURROGATE(c)), |
michael@0 | 94 | * is it a lead surrogate? |
michael@0 | 95 | * @param c 32-bit code point |
michael@0 | 96 | * @return TRUE or FALSE |
michael@0 | 97 | * @stable ICU 2.4 |
michael@0 | 98 | */ |
michael@0 | 99 | #define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) |
michael@0 | 100 | |
michael@0 | 101 | |
michael@0 | 102 | // UTF-8 macros ---------------------------------------------------------------- |
michael@0 | 103 | // from utf8.h |
michael@0 | 104 | |
michael@0 | 105 | extern const uint8 utf8_countTrailBytes[256]; |
michael@0 | 106 | |
michael@0 | 107 | /** |
michael@0 | 108 | * Count the trail bytes for a UTF-8 lead byte (lookup in utf8_countTrailBytes, indexed by the byte value). |
michael@0 | 109 | * @internal |
michael@0 | 110 | */ |
michael@0 | 111 | #define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)(leadByte)]) |
michael@0 | 112 | |
michael@0 | 113 | /** |
michael@0 | 114 | * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. |
michael@0 | 115 | * @internal |
michael@0 | 116 | */ |
michael@0 | 117 | #define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
michael@0 | 118 | |
michael@0 | 119 | /** |
michael@0 | 120 | * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? |
michael@0 | 121 | * @param c 8-bit code unit (byte) |
michael@0 | 122 | * @return TRUE or FALSE |
michael@0 | 123 | * @stable ICU 2.4 |
michael@0 | 124 | */ |
michael@0 | 125 | #define CBU8_IS_SINGLE(c) (((c)&0x80)==0) |
michael@0 | 126 | |
michael@0 | 127 | /** |
michael@0 | 128 | * Is this code unit (byte) a UTF-8 lead byte? |
michael@0 | 129 | * @param c 8-bit code unit (byte) |
michael@0 | 130 | * @return TRUE or FALSE |
michael@0 | 131 | * @stable ICU 2.4 |
michael@0 | 132 | */ |
michael@0 | 133 | #define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e) |
michael@0 | 134 | |
michael@0 | 135 | /** |
michael@0 | 136 | * Is this code unit (byte) a UTF-8 trail byte? |
michael@0 | 137 | * @param c 8-bit code unit (byte) |
michael@0 | 138 | * @return TRUE or FALSE |
michael@0 | 139 | * @stable ICU 2.4 |
michael@0 | 140 | */ |
michael@0 | 141 | #define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80) |
michael@0 | 142 | |
michael@0 | 143 | /** |
michael@0 | 144 | * How many code units (bytes) are used for the UTF-8 encoding |
michael@0 | 145 | * of this Unicode code point? |
michael@0 | 146 | * @param c 32-bit code point |
michael@0 | 147 | * @return 1..4, or 0 if c is a surrogate or not a Unicode code point |
michael@0 | 148 | * @stable ICU 2.4 |
michael@0 | 149 | */ |
michael@0 | 150 | #define CBU8_LENGTH(c) \ |
michael@0 | 151 | ((uint32)(c)<=0x7f ? 1 : \ |
michael@0 | 152 | ((uint32)(c)<=0x7ff ? 2 : \ |
michael@0 | 153 | ((uint32)(c)<=0xd7ff ? 3 : \ |
michael@0 | 154 | ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \ |
michael@0 | 155 | ((uint32)(c)<=0xffff ? 3 : 4)\ |
michael@0 | 156 | ) \ |
michael@0 | 157 | ) \ |
michael@0 | 158 | ) \ |
michael@0 | 159 | ) |
michael@0 | 160 | |
michael@0 | 161 | /** |
michael@0 | 162 | * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). |
michael@0 | 163 | * @return 4 |
michael@0 | 164 | * @stable ICU 2.4 |
michael@0 | 165 | */ |
michael@0 | 166 | #define CBU8_MAX_LENGTH 4 |
michael@0 | 167 | |
michael@0 | 168 | /** |
michael@0 | 169 | * Function for handling "next code point" with error-checking. |
michael@0 | 170 | * @internal |
michael@0 | 171 | */ |
michael@0 | 172 | UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict); |
michael@0 | 173 | |
michael@0 | 174 | /** |
michael@0 | 175 | * Get a code point from a string at a code point boundary offset, |
michael@0 | 176 | * and advance the offset to the next code point boundary. |
michael@0 | 177 | * (Post-incrementing forward iteration.) |
michael@0 | 178 | * "Safe" macro, checks for illegal sequences and for string boundaries. |
michael@0 | 179 | * |
michael@0 | 180 | * The offset may point to the lead byte of a multi-byte sequence, |
michael@0 | 181 | * in which case the macro will read the whole sequence. |
michael@0 | 182 | * If the offset points to a byte >=0x80 that is not a lead byte, then c is set |
michael@0 | 183 | * to CBU_SENTINEL; lead bytes are handed to utf8_nextCharSafeBody() for validation. |
michael@0 | 184 | * |
michael@0 | 185 | * @param s const uint8 * string |
michael@0 | 186 | * @param i string offset, i<length |
michael@0 | 187 | * @param length string length |
michael@0 | 188 | * @param c output UChar32 variable; NOTE(review): base_icu::UChar32 is an unsigned typedef, so a c<0 test on a UChar32 never fires - TODO confirm how callers detect errors |
michael@0 | 189 | * @see CBU8_NEXT_UNSAFE |
michael@0 | 190 | * @stable ICU 2.4 |
michael@0 | 191 | */ |
michael@0 | 192 | #define CBU8_NEXT(s, i, length, c) { \ |
michael@0 | 193 | (c)=(s)[(i)++]; \ |
michael@0 | 194 | if(((uint8)(c))>=0x80) { \ |
michael@0 | 195 | if(CBU8_IS_LEAD(c)) { \ |
michael@0 | 196 | (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)(s), &(i), (int32)(length), (c), -1); \ |
michael@0 | 197 | } else { \ |
michael@0 | 198 | (c)=CBU_SENTINEL; /* 0x80..0xbf, 0xfe, 0xff: not a lead byte */ \ |
michael@0 | 199 | } \ |
michael@0 | 200 | } \ |
michael@0 | 201 | } |
michael@0 | 202 | |
michael@0 | 203 | /** |
michael@0 | 204 | * Append a code point to a string, overwriting 1 to 4 bytes. |
michael@0 | 205 | * The offset points to the current end of the string contents |
michael@0 | 206 | * and is advanced (post-increment). |
michael@0 | 207 | * "Unsafe" macro, assumes a valid code point and sufficient space in the string. |
michael@0 | 208 | * Otherwise, the result is undefined. |
michael@0 | 209 | * |
michael@0 | 210 | * @param s uint8 * string buffer (written to, so not const) |
michael@0 | 211 | * @param i string offset |
michael@0 | 212 | * @param c code point to append |
michael@0 | 213 | * @see CBU8_APPEND |
michael@0 | 214 | * @stable ICU 2.4 |
michael@0 | 215 | */ |
michael@0 | 216 | #define CBU8_APPEND_UNSAFE(s, i, c) { \ |
michael@0 | 217 | if((uint32)(c)<=0x7f) { \ |
michael@0 | 218 | (s)[(i)++]=(uint8)(c); \ |
michael@0 | 219 | } else { \ |
michael@0 | 220 | if((uint32)(c)<=0x7ff) { \ |
michael@0 | 221 | (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \ |
michael@0 | 222 | } else { \ |
michael@0 | 223 | if((uint32)(c)<=0xffff) { \ |
michael@0 | 224 | (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \ |
michael@0 | 225 | } else { \ |
michael@0 | 226 | (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \ |
michael@0 | 227 | (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \ |
michael@0 | 228 | } \ |
michael@0 | 229 | (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \ |
michael@0 | 230 | } \ |
michael@0 | 231 | (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \ |
michael@0 | 232 | } \ |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | // UTF-16 macros --------------------------------------------------------------- |
michael@0 | 236 | // from utf16.h |
michael@0 | 237 | |
michael@0 | 238 | /** |
michael@0 | 239 | * Does this code unit alone encode a code point (BMP, not a surrogate)? |
michael@0 | 240 | * @param c 16-bit code unit |
michael@0 | 241 | * @return TRUE or FALSE |
michael@0 | 242 | * @stable ICU 2.4 |
michael@0 | 243 | */ |
michael@0 | 244 | #define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c) |
michael@0 | 245 | |
michael@0 | 246 | /** |
michael@0 | 247 | * Is this code unit a lead surrogate (U+d800..U+dbff)? |
michael@0 | 248 | * @param c 16-bit code unit |
michael@0 | 249 | * @return TRUE or FALSE |
michael@0 | 250 | * @stable ICU 2.4 |
michael@0 | 251 | */ |
michael@0 | 252 | #define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) |
michael@0 | 253 | |
michael@0 | 254 | /** |
michael@0 | 255 | * Is this code unit a trail surrogate (U+dc00..U+dfff)? |
michael@0 | 256 | * @param c 16-bit code unit |
michael@0 | 257 | * @return TRUE or FALSE |
michael@0 | 258 | * @stable ICU 2.4 |
michael@0 | 259 | */ |
michael@0 | 260 | #define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) |
michael@0 | 261 | |
michael@0 | 262 | /** |
michael@0 | 263 | * Is this code unit a surrogate (U+d800..U+dfff)? |
michael@0 | 264 | * @param c 16-bit code unit |
michael@0 | 265 | * @return TRUE or FALSE |
michael@0 | 266 | * @stable ICU 2.4 |
michael@0 | 267 | */ |
michael@0 | 268 | #define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c) |
michael@0 | 269 | |
michael@0 | 270 | /** |
michael@0 | 271 | * Assuming c is a surrogate code point (CBU16_IS_SURROGATE(c)), |
michael@0 | 272 | * is it a lead surrogate? |
michael@0 | 273 | * @param c 16-bit code unit |
michael@0 | 274 | * @return TRUE or FALSE |
michael@0 | 275 | * @stable ICU 2.4 |
michael@0 | 276 | */ |
michael@0 | 277 | #define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) |
michael@0 | 278 | |
michael@0 | 279 | /** |
michael@0 | 280 | * Helper constant for CBU16_GET_SUPPLEMENTARY. |
michael@0 | 281 | * @internal |
michael@0 | 282 | */ |
michael@0 | 283 | #define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) |
michael@0 | 284 | |
michael@0 | 285 | /** |
michael@0 | 286 | * Get a supplementary code point value (U+10000..U+10ffff) |
michael@0 | 287 | * from its lead and trail surrogates. |
michael@0 | 288 | * The result is undefined if the input values are not |
michael@0 | 289 | * lead and trail surrogates. |
michael@0 | 290 | * |
michael@0 | 291 | * @param lead lead surrogate (U+d800..U+dbff) |
michael@0 | 292 | * @param trail trail surrogate (U+dc00..U+dfff) |
michael@0 | 293 | * @return supplementary code point (U+10000..U+10ffff) |
michael@0 | 294 | * @stable ICU 2.4 |
michael@0 | 295 | */ |
michael@0 | 296 | #define CBU16_GET_SUPPLEMENTARY(lead, trail) \ |
michael@0 | 297 | (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET) |
michael@0 | 298 | |
michael@0 | 299 | |
michael@0 | 300 | /** |
michael@0 | 301 | * Get the lead surrogate (0xd800..0xdbff) for a |
michael@0 | 302 | * supplementary code point (0x10000..0x10ffff). |
michael@0 | 303 | * @param supplementary 32-bit code point (U+10000..U+10ffff) |
michael@0 | 304 | * @return lead surrogate (U+d800..U+dbff) for supplementary, as a uint16 code unit |
michael@0 | 305 | * @stable ICU 2.4 |
michael@0 | 306 | */ |
michael@0 | 307 | #define CBU16_LEAD(supplementary) ((uint16)(((supplementary)>>10)+0xd7c0)) |
michael@0 | 308 | |
michael@0 | 309 | /** |
michael@0 | 310 | * Get the trail surrogate (0xdc00..0xdfff) for a |
michael@0 | 311 | * supplementary code point (0x10000..0x10ffff). |
michael@0 | 312 | * @param supplementary 32-bit code point (U+10000..U+10ffff) |
michael@0 | 313 | * @return trail surrogate (U+dc00..U+dfff) for supplementary, as a uint16 code unit |
michael@0 | 314 | * @stable ICU 2.4 |
michael@0 | 315 | */ |
michael@0 | 316 | #define CBU16_TRAIL(supplementary) ((uint16)(((supplementary)&0x3ff)|0xdc00)) |
michael@0 | 317 | |
michael@0 | 318 | /** |
michael@0 | 319 | * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) |
michael@0 | 320 | * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). |
michael@0 | 321 | * @param c 32-bit code point |
michael@0 | 322 | * @return 1 or 2 |
michael@0 | 323 | * @stable ICU 2.4 |
michael@0 | 324 | */ |
michael@0 | 325 | #define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2) |
michael@0 | 326 | |
michael@0 | 327 | /** |
michael@0 | 328 | * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). |
michael@0 | 329 | * @return 2 |
michael@0 | 330 | * @stable ICU 2.4 |
michael@0 | 331 | */ |
michael@0 | 332 | #define CBU16_MAX_LENGTH 2 |
michael@0 | 333 | |
michael@0 | 334 | /** |
michael@0 | 335 | * Get a code point from a string at a code point boundary offset, |
michael@0 | 336 | * and advance the offset to the next code point boundary. |
michael@0 | 337 | * (Post-incrementing forward iteration.) |
michael@0 | 338 | * "Safe" macro, handles unpaired surrogates and checks for string boundaries. |
michael@0 | 339 | * |
michael@0 | 340 | * The offset may point to the lead surrogate unit |
michael@0 | 341 | * for a supplementary code point, in which case the macro will read |
michael@0 | 342 | * the following trail surrogate as well. |
michael@0 | 343 | * If the offset points to a trail surrogate or |
michael@0 | 344 | * to a single, unpaired lead surrogate, then that itself |
michael@0 | 345 | * will be returned as the code point. |
michael@0 | 346 | * |
michael@0 | 347 | * @param s const UChar * string |
michael@0 | 348 | * @param i string offset, i<length |
michael@0 | 349 | * @param length string length |
michael@0 | 350 | * @param c output UChar32 variable |
michael@0 | 351 | * @stable ICU 2.4 |
michael@0 | 352 | */ |
michael@0 | 353 | #define CBU16_NEXT(s, i, length, c) { \ |
michael@0 | 354 | (c)=(s)[(i)++]; \ |
michael@0 | 355 | if(CBU16_IS_LEAD(c)) { \ |
michael@0 | 356 | uint16 __c2; \ |
michael@0 | 357 | if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \ |
michael@0 | 358 | ++(i); \ |
michael@0 | 359 | (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ |
michael@0 | 360 | } \ |
michael@0 | 361 | } \ |
michael@0 | 362 | } |
michael@0 | 363 | |
michael@0 | 364 | /** |
michael@0 | 365 | * Append a code point to a string, overwriting 1 or 2 code units. |
michael@0 | 366 | * The offset points to the current end of the string contents |
michael@0 | 367 | * and is advanced (post-increment). |
michael@0 | 368 | * "Unsafe" macro, assumes a valid code point and sufficient space in the string. |
michael@0 | 369 | * Otherwise, the result is undefined. |
michael@0 | 370 | * |
michael@0 | 371 | * @param s UChar * string buffer (written to, so not const) |
michael@0 | 372 | * @param i string offset |
michael@0 | 373 | * @param c code point to append |
michael@0 | 374 | * @see CBU16_APPEND |
michael@0 | 375 | * @stable ICU 2.4 |
michael@0 | 376 | */ |
michael@0 | 377 | #define CBU16_APPEND_UNSAFE(s, i, c) { \ |
michael@0 | 378 | if((uint32)(c)<=0xffff) { \ |
michael@0 | 379 | (s)[(i)++]=(uint16)(c); \ |
michael@0 | 380 | } else { \ |
michael@0 | 381 | (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \ |
michael@0 | 382 | (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \ |
michael@0 | 383 | } \ |
michael@0 | 384 | } |
michael@0 | 385 | |
michael@0 | 386 | } // namespace base_icu |
michael@0 | 387 | |
michael@0 | 388 | #endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_ |