security/sandbox/chromium/base/third_party/icu/icu_utf.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/sandbox/chromium/base/third_party/icu/icu_utf.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,388 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1999-2004, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  utf.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 1999sep09
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
    1.21 +#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
    1.22 +
    1.23 +#include "base/basictypes.h"
    1.24 +
    1.25 +namespace base_icu {
    1.26 +
    1.27 +typedef uint32 UChar32;  // One code point. NOTE(review): uint32 is presumably unsigned, so a stored CBU_SENTINEL (-1) would never compare <0 -- confirm.
    1.28 +typedef int8 UBool;  // Boolean type of the ICU-derived declarations (e.g. the strict parameter of utf8_nextCharSafeBody).
    1.29 +
    1.30 +// General ---------------------------------------------------------------------
    1.31 +// from utf.h
    1.32 +
    1.33 +/**
    1.34 + * Sentinel value for APIs that (take or) return
    1.35 + * single code points (UChar32).
    1.36 + * It is outside of the Unicode code point range 0..0x10ffff.
    1.37 + *
    1.38 + * For example, a "done" or "error" value in a new API
    1.39 + * could be indicated with CBU_SENTINEL.
    1.40 + *
    1.41 + * ICU APIs designed before ICU 2.4 usually define service-specific "done"
    1.42 + * values, mostly 0xffff.
    1.43 + * Those may need to be distinguished from
    1.44 + * actual U+ffff text contents by calling functions like
    1.45 + * CharacterIterator::hasNext() or UnicodeString::length().
    1.46 + *
    1.47 + * @return -1 (a plain int constant, not cast to UChar32)
    1.48 + * @see UChar32
    1.49 + * @stable ICU 2.4
    1.50 + */
    1.51 +#define CBU_SENTINEL (-1)
    1.52 +
    1.53 +/**
    1.54 + * Is this code point a Unicode noncharacter (U+fdd0..U+fdef, or U+__fffe / U+__ffff up to U+10ffff)?
    1.55 + * @param c 32-bit code point (expanded several times; pass a side-effect-free expression)
    1.56 + * @return TRUE or FALSE
    1.57 + * @stable ICU 2.4
    1.58 + */
    1.59 +#define CBU_IS_UNICODE_NONCHAR(c) \
    1.60 +    ((c)>=0xfdd0 && \
    1.61 +     ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
    1.62 +     (uint32)(c)<=0x10ffff)
    1.63 +
    1.64 +/**
    1.65 + * Is c a Unicode code point value (0..U+10ffff)
    1.66 + * that can be assigned a character?
    1.67 + *
    1.68 + * Values that are rejected include:
    1.69 + * - single surrogate code points (U+d800..U+dfff, 2048 code points)
    1.70 + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
    1.71 + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
    1.72 + * - any value above U+10ffff, the highest Unicode code point
    1.73 + *
    1.74 + * This means that all code points below U+d800 are character code points,
    1.75 + * and that boundary is tested first for performance.
    1.76 + *
    1.77 + * @param c 32-bit code point (expanded several times; pass a side-effect-free expression)
    1.78 + * @return TRUE or FALSE
    1.79 + * @stable ICU 2.4
    1.80 + */
    1.81 +#define CBU_IS_UNICODE_CHAR(c) \
    1.82 +    ((uint32)(c)<0xd800 || \
    1.83 +        ((uint32)(c)>0xdfff && \
    1.84 +         (uint32)(c)<=0x10ffff && \
    1.85 +         !CBU_IS_UNICODE_NONCHAR(c)))
    1.86 +
    1.87 +/**
    1.88 + * Is this code point a surrogate (U+d800..U+dfff)?
    1.89 + * @param c 32-bit code point
    1.90 + * @return TRUE or FALSE (compares bits 11..31 of c with 0xd800)
    1.91 + * @stable ICU 2.4
    1.92 + */
    1.93 +#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
    1.94 +
    1.95 +/**
    1.96 + * Assuming c is a surrogate code point (CBU_IS_SURROGATE(c)),
    1.97 + * is it a lead surrogate?
    1.98 + * @param c 32-bit code point
    1.99 + * @return TRUE or FALSE (only bit 10, 0x400, of c is inspected)
   1.100 + * @stable ICU 2.4
   1.101 + */
   1.102 +#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
   1.103 +
   1.104 +
   1.105 +// UTF-8 macros ----------------------------------------------------------------
   1.106 +// from utf8.h
   1.107 +
   1.108 +extern const uint8 utf8_countTrailBytes[256];  // Declaration only; indexed by lead byte through CBU8_COUNT_TRAIL_BYTES.
   1.109 +
   1.110 +/**
   1.111 + * Count the trail bytes for a UTF-8 lead byte (table lookup; the whole argument is cast to uint8 first).
   1.112 + * @internal
   1.113 + */
   1.114 +#define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)(leadByte)])
   1.115 +
   1.116 +/**
   1.117 + * Mask a UTF-8 lead byte in place (leadByte is assigned with &=), leaving only the lower bits that form part of the code point value.
   1.118 + * @internal
   1.119 + */
   1.120 +#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
   1.121 +
   1.122 +/**
   1.123 + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
   1.124 + * @param c 8-bit code unit (byte)
   1.125 + * @return TRUE if bit 7 (0x80) of c is clear, otherwise FALSE
   1.126 + * @stable ICU 2.4
   1.127 + */
   1.128 +#define CBU8_IS_SINGLE(c) (((c)&0x80)==0)
   1.129 +
   1.130 +/**
   1.131 + * Is this code unit (byte) a UTF-8 lead byte (byte values 0xc0..0xfd as tested here)?
   1.132 + * @param c 8-bit code unit (byte)
   1.133 + * @return TRUE or FALSE
   1.134 + * @stable ICU 2.4
   1.135 + */
   1.136 +#define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e)
   1.137 +
   1.138 +/**
   1.139 + * Is this code unit (byte) a UTF-8 trail byte (top two bits 10, i.e. 0x80..0xbf)?
   1.140 + * @param c 8-bit code unit (byte)
   1.141 + * @return TRUE or FALSE
   1.142 + * @stable ICU 2.4
   1.143 + */
   1.144 +#define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80)
   1.145 +
   1.146 +/**
   1.147 + * How many code units (bytes) are used for the UTF-8 encoding
   1.148 + * of this Unicode code point?
   1.149 + * @param c 32-bit code point (expanded several times; pass a side-effect-free expression)
   1.150 + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
   1.151 + * @stable ICU 2.4
   1.152 + */
   1.153 +#define CBU8_LENGTH(c) \
   1.154 +    ((uint32)(c)<=0x7f ? 1 : \
   1.155 +        ((uint32)(c)<=0x7ff ? 2 : \
   1.156 +            ((uint32)(c)<=0xd7ff ? 3 : \
   1.157 +                ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \
   1.158 +                    ((uint32)(c)<=0xffff ? 3 : 4)\
   1.159 +                ) \
   1.160 +            ) \
   1.161 +        ) \
   1.162 +    )
   1.163 +
   1.164 +/**
   1.165 + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
   1.166 + * @return 4 (the largest value CBU8_LENGTH yields)
   1.167 + * @stable ICU 2.4
   1.168 + */
   1.169 +#define CBU8_MAX_LENGTH 4
   1.170 +
   1.171 +/**
   1.172 + * Function for handling "next code point" with error-checking; CBU8_NEXT calls it after reading a lead byte.
   1.173 + * @internal
   1.174 + */
   1.175 +UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict);
   1.176 +
   1.177 +/**
   1.178 + * Get a code point from a string at a code point boundary offset,
   1.179 + * and advance the offset to the next code point boundary.
   1.180 + * (Post-incrementing forward iteration.)
   1.181 + * "Safe" macro, checks for illegal sequences and for string boundaries.
   1.182 + *
   1.183 + * The offset may point to the lead byte of a multi-byte sequence,
   1.184 + * in which case the macro will read the whole sequence.
   1.185 + * If the offset points to a trail byte or an illegal UTF-8 sequence, then
   1.186 + * c is set to a negative value.
   1.187 + *
   1.188 + * @param s const uint8 * string (any expression; it is parenthesized before being cast)
   1.189 + * @param i string offset, i<length (modifiable lvalue; its address is passed to the helper)
   1.190 + * @param length string length
   1.191 + * @param c output UChar32 variable, set to <0 in case of an error (CBU_SENTINEL when the byte is >=0x80 but not a lead byte)
   1.192 + * @see CBU8_NEXT_UNSAFE
   1.193 + * @stable ICU 2.4
   1.194 + */
   1.195 +#define CBU8_NEXT(s, i, length, c) { \
   1.196 +    (c)=(s)[(i)++]; \
   1.197 +    if(((uint8)(c))>=0x80) { \
   1.198 +        if(CBU8_IS_LEAD(c)) { \
   1.199 +            (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)(s), &(i), (int32)(length), (c), -1); \
   1.200 +        } else { \
   1.201 +            (c)=CBU_SENTINEL; \
   1.202 +        } \
   1.203 +    } \
   1.204 +}
   1.205 +
   1.206 +/**
   1.207 + * Append a code point to a string, overwriting 1 to 4 bytes.
   1.208 + * The offset points to the current end of the string contents
   1.209 + * and is advanced (post-increment).
   1.210 + * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
   1.211 + * Otherwise, the result is undefined.
   1.212 + *
   1.213 + * @param s uint8 * string buffer (written to, so it must not be const)
   1.214 + * @param i string offset (modifiable lvalue; post-incremented once per byte written)
   1.215 + * @param c code point to append (expanded several times; pass a side-effect-free expression)
   1.216 + * @see CBU8_APPEND
   1.217 + * @stable ICU 2.4
   1.218 + */
   1.219 +#define CBU8_APPEND_UNSAFE(s, i, c) { \
   1.220 +    if((uint32)(c)<=0x7f) { \
   1.221 +        (s)[(i)++]=(uint8)(c); \
   1.222 +    } else { \
   1.223 +        if((uint32)(c)<=0x7ff) { \
   1.224 +            (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \
   1.225 +        } else { \
   1.226 +            if((uint32)(c)<=0xffff) { \
   1.227 +                (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \
   1.228 +            } else { \
   1.229 +                (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \
   1.230 +                (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \
   1.231 +            } \
   1.232 +            (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \
   1.233 +        } \
   1.234 +        (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \
   1.235 +    } \
   1.236 +}
   1.237 +
   1.238 +// UTF-16 macros ---------------------------------------------------------------
   1.239 +// from utf16.h
   1.240 +
   1.241 +/**
   1.242 + * Does this code unit alone encode a code point (BMP, not a surrogate)?
   1.243 + * @param c 16-bit code unit
   1.244 + * @return TRUE or FALSE (the negation of CBU_IS_SURROGATE(c))
   1.245 + * @stable ICU 2.4
   1.246 + */
   1.247 +#define CBU16_IS_SINGLE(c) (!CBU_IS_SURROGATE(c))
   1.248 +
   1.249 +/**
   1.250 + * Is this code unit a lead surrogate (U+d800..U+dbff)?
   1.251 + * @param c 16-bit code unit
   1.252 + * @return TRUE or FALSE (compares bits 10..31 of c with 0xd800)
   1.253 + * @stable ICU 2.4
   1.254 + */
   1.255 +#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
   1.256 +
   1.257 +/**
   1.258 + * Is this code unit a trail surrogate (U+dc00..U+dfff)?
   1.259 + * @param c 16-bit code unit
   1.260 + * @return TRUE or FALSE (compares bits 10..31 of c with 0xdc00)
   1.261 + * @stable ICU 2.4
   1.262 + */
   1.263 +#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
   1.264 +
   1.265 +/**
   1.266 + * Is this code unit a surrogate (U+d800..U+dfff)?
   1.267 + * @param c 16-bit code unit
   1.268 + * @return TRUE or FALSE (expands to CBU_IS_SURROGATE(c))
   1.269 + * @stable ICU 2.4
   1.270 + */
   1.271 +#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
   1.272 +
   1.273 +/**
   1.274 + * Assuming c is a surrogate code point (CBU16_IS_SURROGATE(c)),
   1.275 + * is it a lead surrogate?
   1.276 + * @param c 16-bit code unit
   1.277 + * @return TRUE or FALSE (only bit 10, 0x400, of c is inspected)
   1.278 + * @stable ICU 2.4
   1.279 + */
   1.280 +#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
   1.281 +
   1.282 +/**
   1.283 + * Helper constant for CBU16_GET_SUPPLEMENTARY: (0xd800<<10)+0xdc00-0x10000, subtracted there in one step.
   1.284 + * @internal
   1.285 + */
   1.286 +#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
   1.287 +
   1.288 +/**
   1.289 + * Get a supplementary code point value (U+10000..U+10ffff)
   1.290 + * from its lead and trail surrogates.
   1.291 + * The result is undefined if the input values are not
   1.292 + * lead and trail surrogates.
   1.293 + *
   1.294 + * @param lead lead surrogate (U+d800..U+dbff); cast to base_icu::UChar32 before it is shifted left by 10
   1.295 + * @param trail trail surrogate (U+dc00..U+dfff); cast to base_icu::UChar32 before it is added
   1.296 + * @return supplementary code point (U+10000..U+10ffff)
   1.297 + * @stable ICU 2.4
   1.298 + */
   1.299 +#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
   1.300 +    (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET)
   1.301 +
   1.302 +
   1.303 +/**
   1.304 + * Get the lead surrogate (0xd800..0xdbff) for a
   1.305 + * supplementary code point (0x10000..0x10ffff).
   1.306 + * @param supplementary 32-bit code point (U+10000..U+10ffff)
   1.307 + * @return lead surrogate (U+d800..U+dbff) for supplementary, as a uint16 (same cast as CBU16_APPEND_UNSAFE)
   1.308 + * @stable ICU 2.4
   1.309 + */
   1.310 +#define CBU16_LEAD(supplementary) ((uint16)(((supplementary)>>10)+0xd7c0))
   1.311 +
   1.312 +/**
   1.313 + * Get the trail surrogate (0xdc00..0xdfff) for a
   1.314 + * supplementary code point (0x10000..0x10ffff).
   1.315 + * @param supplementary 32-bit code point (U+10000..U+10ffff)
   1.316 + * @return trail surrogate (U+dc00..U+dfff) for supplementary, as a uint16 (same cast as CBU16_APPEND_UNSAFE)
   1.317 + * @stable ICU 2.4
   1.318 + */
   1.319 +#define CBU16_TRAIL(supplementary) ((uint16)(((supplementary)&0x3ff)|0xdc00))
   1.320 +
   1.321 +/**
   1.322 + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
   1.323 + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
   1.324 + * @param c 32-bit code point
   1.325 + * @return 1 or 2 (2 whenever c, cast to uint32, is above 0xffff)
   1.326 + * @stable ICU 2.4
   1.327 + */
   1.328 +#define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2)
   1.329 +
   1.330 +/**
   1.331 + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
   1.332 + * @return 2 (the larger of the two values CBU16_LENGTH yields)
   1.333 + * @stable ICU 2.4
   1.334 + */
   1.335 +#define CBU16_MAX_LENGTH 2
   1.336 +
   1.337 +/**
   1.338 + * Get a code point from a string at a code point boundary offset,
   1.339 + * and advance the offset to the next code point boundary.
   1.340 + * (Post-incrementing forward iteration.)
   1.341 + * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   1.342 + *
   1.343 + * The offset may point to the lead surrogate unit
   1.344 + * for a supplementary code point, in which case the macro will read
   1.345 + * the following trail surrogate as well.
   1.346 + * If the offset points to a trail surrogate or
   1.347 + * to a single, unpaired lead surrogate, then that itself
   1.348 + * will be returned as the code point.
   1.349 + *
   1.350 + * @param s const UChar * string
   1.351 + * @param i string offset, i<length (modifiable lvalue; advanced by one or two code units)
   1.352 + * @param length string length (only consulted before reading a trail surrogate)
   1.353 + * @param c output UChar32 variable (assigned a second time when a surrogate pair is combined)
   1.354 + * @stable ICU 2.4
   1.355 + */
   1.356 +#define CBU16_NEXT(s, i, length, c) { \
   1.357 +    (c)=(s)[(i)++]; \
   1.358 +    if(CBU16_IS_LEAD(c)) { \
   1.359 +        uint16 __c2; \
   1.360 +        if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \
   1.361 +            ++(i); \
   1.362 +            (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
   1.363 +        } \
   1.364 +    } \
   1.365 +}
   1.366 +
   1.367 +/**
   1.368 + * Append a code point to a string, overwriting 1 or 2 code units.
   1.369 + * The offset points to the current end of the string contents
   1.370 + * and is advanced (post-increment).
   1.371 + * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
   1.372 + * Otherwise, the result is undefined.
   1.373 + *
   1.374 + * @param s 16-bit code unit string buffer (written to, so it must not be const)
   1.375 + * @param i string offset (modifiable lvalue; post-incremented once per code unit written)
   1.376 + * @param c code point to append (expanded several times; pass a side-effect-free expression)
   1.377 + * @see CBU16_APPEND
   1.378 + * @stable ICU 2.4
   1.379 + */
   1.380 +#define CBU16_APPEND_UNSAFE(s, i, c) { \
   1.381 +    if((uint32)(c)<=0xffff) { \
   1.382 +        (s)[(i)++]=(uint16)(c); \
   1.383 +    } else { \
   1.384 +        (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \
   1.385 +        (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \
   1.386 +    } \
   1.387 +}
   1.388 +
   1.389 +}  // namespace base_icu
   1.390 +
   1.391 +#endif  // BASE_THIRD_PARTY_ICU_ICU_UTF_H_

mercurial