1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/security/sandbox/chromium/base/third_party/icu/icu_utf.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,388 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1999-2004, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: utf.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 1999sep09 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_ 1.21 +#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_ 1.22 + 1.23 +#include "base/basictypes.h" 1.24 + 1.25 +namespace base_icu { 1.26 + 1.27 +typedef uint32 UChar32; 1.28 +typedef int8 UBool; 1.29 + 1.30 +// General --------------------------------------------------------------------- 1.31 +// from utf.h 1.32 + 1.33 +/** 1.34 + * This value is intended for sentinel values for APIs that 1.35 + * (take or) return single code points (UChar32). 1.36 + * It is outside of the Unicode code point range 0..0x10ffff. 1.37 + * 1.38 + * For example, a "done" or "error" value in a new API 1.39 + * could be indicated with CBU_SENTINEL. 1.40 + * 1.41 + * ICU APIs designed before ICU 2.4 usually define service-specific "done" 1.42 + * values, mostly 0xffff. 1.43 + * Those may need to be distinguished from 1.44 + * actual U+ffff text contents by calling functions like 1.45 + * CharacterIterator::hasNext() or UnicodeString::length(). 1.46 + * 1.47 + * @return -1 1.48 + * @see UChar32 1.49 + * @stable ICU 2.4 1.50 + */ 1.51 +#define CBU_SENTINEL (-1) 1.52 + 1.53 +/** 1.54 + * Is this code point a Unicode noncharacter? 1.55 + * @param c 32-bit code point 1.56 + * @return TRUE or FALSE 1.57 + * @stable ICU 2.4 1.58 + */ 1.59 +#define CBU_IS_UNICODE_NONCHAR(c) \ 1.60 + ((c)>=0xfdd0 && \ 1.61 + ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ 1.62 + (uint32)(c)<=0x10ffff) 1.63 + 1.64 +/** 1.65 + * Is c a Unicode code point value (0..U+10ffff) 1.66 + * that can be assigned a character? 1.67 + * 1.68 + * Code points that are not characters include: 1.69 + * - single surrogate code points (U+d800..U+dfff, 2048 code points) 1.70 + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) 1.71 + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) 1.72 + * - the highest Unicode code point value is U+10ffff 1.73 + * 1.74 + * This means that all code points below U+d800 are character code points, 1.75 + * and that boundary is tested first for performance. 1.76 + * 1.77 + * @param c 32-bit code point 1.78 + * @return TRUE or FALSE 1.79 + * @stable ICU 2.4 1.80 + */ 1.81 +#define CBU_IS_UNICODE_CHAR(c) \ 1.82 + ((uint32)(c)<0xd800 || \ 1.83 + ((uint32)(c)>0xdfff && \ 1.84 + (uint32)(c)<=0x10ffff && \ 1.85 + !CBU_IS_UNICODE_NONCHAR(c))) 1.86 + 1.87 +/** 1.88 + * Is this code point a surrogate (U+d800..U+dfff)? 1.89 + * @param c 32-bit code point 1.90 + * @return TRUE or FALSE 1.91 + * @stable ICU 2.4 1.92 + */ 1.93 +#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) 1.94 + 1.95 +/** 1.96 + * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), 1.97 + * is it a lead surrogate? 1.98 + * @param c 32-bit code point 1.99 + * @return TRUE or FALSE 1.100 + * @stable ICU 2.4 1.101 + */ 1.102 +#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) 1.103 + 1.104 + 1.105 +// UTF-8 macros ---------------------------------------------------------------- 1.106 +// from utf8.h 1.107 + 1.108 +extern const uint8 utf8_countTrailBytes[256]; 1.109 + 1.110 +/** 1.111 + * Count the trail bytes for a UTF-8 lead byte. 1.112 + * @internal 1.113 + */ 1.114 +#define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)leadByte]) 1.115 + 1.116 +/** 1.117 + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 1.118 + * @internal 1.119 + */ 1.120 +#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 1.121 + 1.122 +/** 1.123 + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 1.124 + * @param c 8-bit code unit (byte) 1.125 + * @return TRUE or FALSE 1.126 + * @stable ICU 2.4 1.127 + */ 1.128 +#define CBU8_IS_SINGLE(c) (((c)&0x80)==0) 1.129 + 1.130 +/** 1.131 + * Is this code unit (byte) a UTF-8 lead byte? 1.132 + * @param c 8-bit code unit (byte) 1.133 + * @return TRUE or FALSE 1.134 + * @stable ICU 2.4 1.135 + */ 1.136 +#define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e) 1.137 + 1.138 +/** 1.139 + * Is this code unit (byte) a UTF-8 trail byte? 1.140 + * @param c 8-bit code unit (byte) 1.141 + * @return TRUE or FALSE 1.142 + * @stable ICU 2.4 1.143 + */ 1.144 +#define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80) 1.145 + 1.146 +/** 1.147 + * How many code units (bytes) are used for the UTF-8 encoding 1.148 + * of this Unicode code point? 1.149 + * @param c 32-bit code point 1.150 + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 1.151 + * @stable ICU 2.4 1.152 + */ 1.153 +#define CBU8_LENGTH(c) \ 1.154 + ((uint32)(c)<=0x7f ? 1 : \ 1.155 + ((uint32)(c)<=0x7ff ? 2 : \ 1.156 + ((uint32)(c)<=0xd7ff ? 3 : \ 1.157 + ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \ 1.158 + ((uint32)(c)<=0xffff ? 3 : 4)\ 1.159 + ) \ 1.160 + ) \ 1.161 + ) \ 1.162 + ) 1.163 + 1.164 +/** 1.165 + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 1.166 + * @return 4 1.167 + * @stable ICU 2.4 1.168 + */ 1.169 +#define CBU8_MAX_LENGTH 4 1.170 + 1.171 +/** 1.172 + * Function for handling "next code point" with error-checking. 1.173 + * @internal 1.174 + */ 1.175 +UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict); 1.176 + 1.177 +/** 1.178 + * Get a code point from a string at a code point boundary offset, 1.179 + * and advance the offset to the next code point boundary. 1.180 + * (Post-incrementing forward iteration.) 1.181 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.182 + * 1.183 + * The offset may point to the lead byte of a multi-byte sequence, 1.184 + * in which case the macro will read the whole sequence. 1.185 + * If the offset points to a trail byte or an illegal UTF-8 sequence, then 1.186 + * c is set to a negative value. 1.187 + * 1.188 + * @param s const uint8 * string 1.189 + * @param i string offset, i<length 1.190 + * @param length string length 1.191 + * @param c output UChar32 variable, set to <0 in case of an error 1.192 + * @see CBU8_NEXT_UNSAFE 1.193 + * @stable ICU 2.4 1.194 + */ 1.195 +#define CBU8_NEXT(s, i, length, c) { \ 1.196 + (c)=(s)[(i)++]; \ 1.197 + if(((uint8)(c))>=0x80) { \ 1.198 + if(CBU8_IS_LEAD(c)) { \ 1.199 + (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)s, &(i), (int32)(length), c, -1); \ 1.200 + } else { \ 1.201 + (c)=CBU_SENTINEL; \ 1.202 + } \ 1.203 + } \ 1.204 +} 1.205 + 1.206 +/** 1.207 + * Append a code point to a string, overwriting 1 to 4 bytes. 1.208 + * The offset points to the current end of the string contents 1.209 + * and is advanced (post-increment). 1.210 + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 1.211 + * Otherwise, the result is undefined. 1.212 + * 1.213 + * @param s const uint8 * string buffer 1.214 + * @param i string offset 1.215 + * @param c code point to append 1.216 + * @see CBU8_APPEND 1.217 + * @stable ICU 2.4 1.218 + */ 1.219 +#define CBU8_APPEND_UNSAFE(s, i, c) { \ 1.220 + if((uint32)(c)<=0x7f) { \ 1.221 + (s)[(i)++]=(uint8)(c); \ 1.222 + } else { \ 1.223 + if((uint32)(c)<=0x7ff) { \ 1.224 + (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \ 1.225 + } else { \ 1.226 + if((uint32)(c)<=0xffff) { \ 1.227 + (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \ 1.228 + } else { \ 1.229 + (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \ 1.230 + (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \ 1.231 + } \ 1.232 + (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \ 1.233 + } \ 1.234 + (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \ 1.235 + } \ 1.236 +} 1.237 + 1.238 +// UTF-16 macros --------------------------------------------------------------- 1.239 +// from utf16.h 1.240 + 1.241 +/** 1.242 + * Does this code unit alone encode a code point (BMP, not a surrogate)? 1.243 + * @param c 16-bit code unit 1.244 + * @return TRUE or FALSE 1.245 + * @stable ICU 2.4 1.246 + */ 1.247 +#define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c) 1.248 + 1.249 +/** 1.250 + * Is this code unit a lead surrogate (U+d800..U+dbff)? 1.251 + * @param c 16-bit code unit 1.252 + * @return TRUE or FALSE 1.253 + * @stable ICU 2.4 1.254 + */ 1.255 +#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) 1.256 + 1.257 +/** 1.258 + * Is this code unit a trail surrogate (U+dc00..U+dfff)? 1.259 + * @param c 16-bit code unit 1.260 + * @return TRUE or FALSE 1.261 + * @stable ICU 2.4 1.262 + */ 1.263 +#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) 1.264 + 1.265 +/** 1.266 + * Is this code unit a surrogate (U+d800..U+dfff)? 1.267 + * @param c 16-bit code unit 1.268 + * @return TRUE or FALSE 1.269 + * @stable ICU 2.4 1.270 + */ 1.271 +#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c) 1.272 + 1.273 +/** 1.274 + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), 1.275 + * is it a lead surrogate? 1.276 + * @param c 16-bit code unit 1.277 + * @return TRUE or FALSE 1.278 + * @stable ICU 2.4 1.279 + */ 1.280 +#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) 1.281 + 1.282 +/** 1.283 + * Helper constant for CBU16_GET_SUPPLEMENTARY. 1.284 + * @internal 1.285 + */ 1.286 +#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) 1.287 + 1.288 +/** 1.289 + * Get a supplementary code point value (U+10000..U+10ffff) 1.290 + * from its lead and trail surrogates. 1.291 + * The result is undefined if the input values are not 1.292 + * lead and trail surrogates. 1.293 + * 1.294 + * @param lead lead surrogate (U+d800..U+dbff) 1.295 + * @param trail trail surrogate (U+dc00..U+dfff) 1.296 + * @return supplementary code point (U+10000..U+10ffff) 1.297 + * @stable ICU 2.4 1.298 + */ 1.299 +#define CBU16_GET_SUPPLEMENTARY(lead, trail) \ 1.300 + (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET) 1.301 + 1.302 + 1.303 +/** 1.304 + * Get the lead surrogate (0xd800..0xdbff) for a 1.305 + * supplementary code point (0x10000..0x10ffff). 1.306 + * @param supplementary 32-bit code point (U+10000..U+10ffff) 1.307 + * @return lead surrogate (U+d800..U+dbff) for supplementary 1.308 + * @stable ICU 2.4 1.309 + */ 1.310 +#define CBU16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) 1.311 + 1.312 +/** 1.313 + * Get the trail surrogate (0xdc00..0xdfff) for a 1.314 + * supplementary code point (0x10000..0x10ffff). 1.315 + * @param supplementary 32-bit code point (U+10000..U+10ffff) 1.316 + * @return trail surrogate (U+dc00..U+dfff) for supplementary 1.317 + * @stable ICU 2.4 1.318 + */ 1.319 +#define CBU16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) 1.320 + 1.321 +/** 1.322 + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) 1.323 + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). 1.324 + * @param c 32-bit code point 1.325 + * @return 1 or 2 1.326 + * @stable ICU 2.4 1.327 + */ 1.328 +#define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2) 1.329 + 1.330 +/** 1.331 + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). 1.332 + * @return 2 1.333 + * @stable ICU 2.4 1.334 + */ 1.335 +#define CBU16_MAX_LENGTH 2 1.336 + 1.337 +/** 1.338 + * Get a code point from a string at a code point boundary offset, 1.339 + * and advance the offset to the next code point boundary. 1.340 + * (Post-incrementing forward iteration.) 1.341 + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 1.342 + * 1.343 + * The offset may point to the lead surrogate unit 1.344 + * for a supplementary code point, in which case the macro will read 1.345 + * the following trail surrogate as well. 1.346 + * If the offset points to a trail surrogate or 1.347 + * to a single, unpaired lead surrogate, then that itself 1.348 + * will be returned as the code point. 1.349 + * 1.350 + * @param s const UChar * string 1.351 + * @param i string offset, i<length 1.352 + * @param length string length 1.353 + * @param c output UChar32 variable 1.354 + * @stable ICU 2.4 1.355 + */ 1.356 +#define CBU16_NEXT(s, i, length, c) { \ 1.357 + (c)=(s)[(i)++]; \ 1.358 + if(CBU16_IS_LEAD(c)) { \ 1.359 + uint16 __c2; \ 1.360 + if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \ 1.361 + ++(i); \ 1.362 + (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ 1.363 + } \ 1.364 + } \ 1.365 +} 1.366 + 1.367 +/** 1.368 + * Append a code point to a string, overwriting 1 or 2 code units. 1.369 + * The offset points to the current end of the string contents 1.370 + * and is advanced (post-increment). 1.371 + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 1.372 + * Otherwise, the result is undefined. 1.373 + * 1.374 + * @param s const UChar * string buffer 1.375 + * @param i string offset 1.376 + * @param c code point to append 1.377 + * @see CBU16_APPEND 1.378 + * @stable ICU 2.4 1.379 + */ 1.380 +#define CBU16_APPEND_UNSAFE(s, i, c) { \ 1.381 + if((uint32)(c)<=0xffff) { \ 1.382 + (s)[(i)++]=(uint16)(c); \ 1.383 + } else { \ 1.384 + (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \ 1.385 + (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \ 1.386 + } \ 1.387 +} 1.388 + 1.389 +} // namesapce base_icu 1.390 + 1.391 +#endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_