1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unicode/utf8.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,830 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1999-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: utf8.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 1999sep13 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +/** 1.21 + * \file 1.22 + * \brief C API: 8-bit Unicode handling macros 1.23 + * 1.24 + * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. 1.25 + * 1.26 + * For more information see utf.h and the ICU User Guide Strings chapter 1.27 + * (http://userguide.icu-project.org/strings). 1.28 + * 1.29 + * <em>Usage:</em> 1.30 + * ICU coding guidelines for if() statements should be followed when using these macros. 1.31 + * Compound statements (curly braces {}) must be used for if-else-while... 1.32 + * bodies and all macro statements should be terminated with semicolon. 1.33 + */ 1.34 + 1.35 +#ifndef __UTF8_H__ 1.36 +#define __UTF8_H__ 1.37 + 1.38 +#include "unicode/umachine.h" 1.39 +#ifndef __UTF_H__ 1.40 +# include "unicode/utf.h" 1.41 +#endif 1.42 + 1.43 +/* internal definitions ----------------------------------------------------- */ 1.44 + 1.45 +/** 1.46 + * \var utf8_countTrailBytes 1.47 + * Internal array with numbers of trail bytes for any given byte used in 1.48 + * lead byte position. 1.49 + * 1.50 + * This is internal since it is not meant to be called directly by external clients; 1.51 + * however it is called by public macros in this file and thus must remain stable, 1.52 + * and should not be hidden when other internal functions are hidden (otherwise 1.53 + * public macros would fail to compile). 1.54 + * @internal 1.55 + */ 1.56 +#ifdef U_UTF8_IMPL 1.57 +U_EXPORT const uint8_t 1.58 +#elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) 1.59 +U_CFUNC const uint8_t 1.60 +#else 1.61 +U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 1.62 +#endif 1.63 +utf8_countTrailBytes[256]; 1.64 + 1.65 +/** 1.66 + * Counts the trail bytes for a UTF-8 lead byte. 1.67 + * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. 1.68 + * 1.69 + * This is internal since it is not meant to be called directly by external clients; 1.70 + * however it is called by public macros in this file and thus must remain stable. 1.71 + * 1.72 + * Note: Beginning with ICU 50, the implementation uses a multi-condition expression 1.73 + * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. 1.74 + * leadByte is evaluated multiple times. 1.75 + * 1.76 + * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: 1.77 + * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) 1.78 + * leadByte was evaluated exactly once. 1.79 + * 1.80 + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 1.81 + * @internal 1.82 + */ 1.83 +#define U8_COUNT_TRAIL_BYTES(leadByte) \ 1.84 + ((leadByte)<0xf0 ? \ 1.85 + ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \ 1.86 + (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0) 1.87 + 1.88 +/** 1.89 + * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. 1.90 + * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. 1.91 + * leadByte might be evaluated multiple times. 1.92 + * 1.93 + * This is internal since it is not meant to be called directly by external clients; 1.94 + * however it is called by public macros in this file and thus must remain stable. 1.95 + * 1.96 + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 1.97 + * @internal 1.98 + */ 1.99 +#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ 1.100 + (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) 1.101 + 1.102 +/** 1.103 + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 1.104 + * 1.105 + * This is internal since it is not meant to be called directly by external clients; 1.106 + * however it is called by public macros in this file and thus must remain stable. 1.107 + * @internal 1.108 + */ 1.109 +#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 1.110 + 1.111 +/** 1.112 + * Function for handling "next code point" with error-checking. 1.113 + * 1.114 + * This is internal since it is not meant to be called directly by external clients; 1.115 + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 1.116 + * file and thus must remain stable, and should not be hidden when other internal 1.117 + * functions are hidden (otherwise public macros would fail to compile). 1.118 + * @internal 1.119 + */ 1.120 +U_STABLE UChar32 U_EXPORT2 1.121 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); 1.122 + 1.123 +/** 1.124 + * Function for handling "append code point" with error-checking. 1.125 + * 1.126 + * This is internal since it is not meant to be called directly by external clients; 1.127 + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 1.128 + * file and thus must remain stable, and should not be hidden when other internal 1.129 + * functions are hidden (otherwise public macros would fail to compile). 1.130 + * @internal 1.131 + */ 1.132 +U_STABLE int32_t U_EXPORT2 1.133 +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 1.134 + 1.135 +/** 1.136 + * Function for handling "previous code point" with error-checking. 1.137 + * 1.138 + * This is internal since it is not meant to be called directly by external clients; 1.139 + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 1.140 + * file and thus must remain stable, and should not be hidden when other internal 1.141 + * functions are hidden (otherwise public macros would fail to compile). 1.142 + * @internal 1.143 + */ 1.144 +U_STABLE UChar32 U_EXPORT2 1.145 +utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); 1.146 + 1.147 +/** 1.148 + * Function for handling "skip backward one code point" with error-checking. 1.149 + * 1.150 + * This is internal since it is not meant to be called directly by external clients; 1.151 + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 1.152 + * file and thus must remain stable, and should not be hidden when other internal 1.153 + * functions are hidden (otherwise public macros would fail to compile). 1.154 + * @internal 1.155 + */ 1.156 +U_STABLE int32_t U_EXPORT2 1.157 +utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 1.158 + 1.159 +/* single-code point definitions -------------------------------------------- */ 1.160 + 1.161 +/** 1.162 + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 1.163 + * @param c 8-bit code unit (byte) 1.164 + * @return TRUE or FALSE 1.165 + * @stable ICU 2.4 1.166 + */ 1.167 +#define U8_IS_SINGLE(c) (((c)&0x80)==0) 1.168 + 1.169 +/** 1.170 + * Is this code unit (byte) a UTF-8 lead byte? 1.171 + * @param c 8-bit code unit (byte) 1.172 + * @return TRUE or FALSE 1.173 + * @stable ICU 2.4 1.174 + */ 1.175 +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) 1.176 + 1.177 +/** 1.178 + * Is this code unit (byte) a UTF-8 trail byte? 1.179 + * @param c 8-bit code unit (byte) 1.180 + * @return TRUE or FALSE 1.181 + * @stable ICU 2.4 1.182 + */ 1.183 +#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) 1.184 + 1.185 +/** 1.186 + * How many code units (bytes) are used for the UTF-8 encoding 1.187 + * of this Unicode code point? 1.188 + * @param c 32-bit code point 1.189 + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 1.190 + * @stable ICU 2.4 1.191 + */ 1.192 +#define U8_LENGTH(c) \ 1.193 + ((uint32_t)(c)<=0x7f ? 1 : \ 1.194 + ((uint32_t)(c)<=0x7ff ? 2 : \ 1.195 + ((uint32_t)(c)<=0xd7ff ? 3 : \ 1.196 + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 1.197 + ((uint32_t)(c)<=0xffff ? 3 : 4)\ 1.198 + ) \ 1.199 + ) \ 1.200 + ) \ 1.201 + ) 1.202 + 1.203 +/** 1.204 + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 1.205 + * @return 4 1.206 + * @stable ICU 2.4 1.207 + */ 1.208 +#define U8_MAX_LENGTH 4 1.209 + 1.210 +/** 1.211 + * Get a code point from a string at a random-access offset, 1.212 + * without changing the offset. 1.213 + * The offset may point to either the lead byte or one of the trail bytes 1.214 + * for a code point, in which case the macro will read all of the bytes 1.215 + * for the code point. 1.216 + * The result is undefined if the offset points to an illegal UTF-8 1.217 + * byte sequence. 1.218 + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 1.219 + * 1.220 + * @param s const uint8_t * string 1.221 + * @param i string offset 1.222 + * @param c output UChar32 variable 1.223 + * @see U8_GET 1.224 + * @stable ICU 2.4 1.225 + */ 1.226 +#define U8_GET_UNSAFE(s, i, c) { \ 1.227 + int32_t _u8_get_unsafe_index=(int32_t)(i); \ 1.228 + U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ 1.229 + U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ 1.230 +} 1.231 + 1.232 +/** 1.233 + * Get a code point from a string at a random-access offset, 1.234 + * without changing the offset. 1.235 + * The offset may point to either the lead byte or one of the trail bytes 1.236 + * for a code point, in which case the macro will read all of the bytes 1.237 + * for the code point. 1.238 + * 1.239 + * The length can be negative for a NUL-terminated string. 1.240 + * 1.241 + * If the offset points to an illegal UTF-8 byte sequence, then 1.242 + * c is set to a negative value. 1.243 + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 1.244 + * 1.245 + * @param s const uint8_t * string 1.246 + * @param start int32_t starting string offset 1.247 + * @param i int32_t string offset, must be start<=i<length 1.248 + * @param length int32_t string length 1.249 + * @param c output UChar32 variable, set to <0 in case of an error 1.250 + * @see U8_GET_UNSAFE 1.251 + * @stable ICU 2.4 1.252 + */ 1.253 +#define U8_GET(s, start, i, length, c) { \ 1.254 + int32_t _u8_get_index=(i); \ 1.255 + U8_SET_CP_START(s, start, _u8_get_index); \ 1.256 + U8_NEXT(s, _u8_get_index, length, c); \ 1.257 +} 1.258 + 1.259 +#ifndef U_HIDE_DRAFT_API 1.260 +/** 1.261 + * Get a code point from a string at a random-access offset, 1.262 + * without changing the offset. 1.263 + * The offset may point to either the lead byte or one of the trail bytes 1.264 + * for a code point, in which case the macro will read all of the bytes 1.265 + * for the code point. 1.266 + * 1.267 + * The length can be negative for a NUL-terminated string. 1.268 + * 1.269 + * If the offset points to an illegal UTF-8 byte sequence, then 1.270 + * c is set to U+FFFD. 1.271 + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. 1.272 + * 1.273 + * This macro does not distinguish between a real U+FFFD in the text 1.274 + * and U+FFFD returned for an ill-formed sequence. 1.275 + * Use U8_GET() if that distinction is important. 1.276 + * 1.277 + * @param s const uint8_t * string 1.278 + * @param start int32_t starting string offset 1.279 + * @param i int32_t string offset, must be start<=i<length 1.280 + * @param length int32_t string length 1.281 + * @param c output UChar32 variable, set to U+FFFD in case of an error 1.282 + * @see U8_GET 1.283 + * @draft ICU 51 1.284 + */ 1.285 +#define U8_GET_OR_FFFD(s, start, i, length, c) { \ 1.286 + int32_t _u8_get_index=(i); \ 1.287 + U8_SET_CP_START(s, start, _u8_get_index); \ 1.288 + U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ 1.289 +} 1.290 +#endif /* U_HIDE_DRAFT_API */ 1.291 + 1.292 +/* definitions with forward iteration --------------------------------------- */ 1.293 + 1.294 +/** 1.295 + * Get a code point from a string at a code point boundary offset, 1.296 + * and advance the offset to the next code point boundary. 1.297 + * (Post-incrementing forward iteration.) 1.298 + * "Unsafe" macro, assumes well-formed UTF-8. 1.299 + * 1.300 + * The offset may point to the lead byte of a multi-byte sequence, 1.301 + * in which case the macro will read the whole sequence. 1.302 + * The result is undefined if the offset points to a trail byte 1.303 + * or an illegal UTF-8 sequence. 1.304 + * 1.305 + * @param s const uint8_t * string 1.306 + * @param i string offset 1.307 + * @param c output UChar32 variable 1.308 + * @see U8_NEXT 1.309 + * @stable ICU 2.4 1.310 + */ 1.311 +#define U8_NEXT_UNSAFE(s, i, c) { \ 1.312 + (c)=(uint8_t)(s)[(i)++]; \ 1.313 + if((c)>=0x80) { \ 1.314 + if((c)<0xe0) { \ 1.315 + (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ 1.316 + } else if((c)<0xf0) { \ 1.317 + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 1.318 + (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ 1.319 + (i)+=2; \ 1.320 + } else { \ 1.321 + (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ 1.322 + (i)+=3; \ 1.323 + } \ 1.324 + } \ 1.325 +} 1.326 + 1.327 +/** 1.328 + * Get a code point from a string at a code point boundary offset, 1.329 + * and advance the offset to the next code point boundary. 1.330 + * (Post-incrementing forward iteration.) 1.331 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.332 + * 1.333 + * The length can be negative for a NUL-terminated string. 1.334 + * 1.335 + * The offset may point to the lead byte of a multi-byte sequence, 1.336 + * in which case the macro will read the whole sequence. 1.337 + * If the offset points to a trail byte or an illegal UTF-8 sequence, then 1.338 + * c is set to a negative value. 1.339 + * 1.340 + * @param s const uint8_t * string 1.341 + * @param i int32_t string offset, must be i<length 1.342 + * @param length int32_t string length 1.343 + * @param c output UChar32 variable, set to <0 in case of an error 1.344 + * @see U8_NEXT_UNSAFE 1.345 + * @stable ICU 2.4 1.346 + */ 1.347 +#define U8_NEXT(s, i, length, c) { \ 1.348 + (c)=(uint8_t)(s)[(i)++]; \ 1.349 + if((c)>=0x80) { \ 1.350 + uint8_t __t1, __t2; \ 1.351 + if( /* handle U+1000..U+CFFF inline */ \ 1.352 + (0xe0<(c) && (c)<=0xec) && \ 1.353 + (((i)+1)<(length) || (length)<0) && \ 1.354 + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ 1.355 + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ 1.356 + ) { \ 1.357 + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 1.358 + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ 1.359 + (i)+=2; \ 1.360 + } else if( /* handle U+0080..U+07FF inline */ \ 1.361 + ((c)<0xe0 && (c)>=0xc2) && \ 1.362 + ((i)!=(length)) && \ 1.363 + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ 1.364 + ) { \ 1.365 + (c)=(((c)&0x1f)<<6)|__t1; \ 1.366 + ++(i); \ 1.367 + } else { \ 1.368 + /* function call for "complicated" and error cases */ \ 1.369 + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \ 1.370 + } \ 1.371 + } \ 1.372 +} 1.373 + 1.374 +#ifndef U_HIDE_DRAFT_API 1.375 +/** 1.376 + * Get a code point from a string at a code point boundary offset, 1.377 + * and advance the offset to the next code point boundary. 1.378 + * (Post-incrementing forward iteration.) 1.379 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.380 + * 1.381 + * The length can be negative for a NUL-terminated string. 1.382 + * 1.383 + * The offset may point to the lead byte of a multi-byte sequence, 1.384 + * in which case the macro will read the whole sequence. 1.385 + * If the offset points to a trail byte or an illegal UTF-8 sequence, then 1.386 + * c is set to U+FFFD. 1.387 + * 1.388 + * This macro does not distinguish between a real U+FFFD in the text 1.389 + * and U+FFFD returned for an ill-formed sequence. 1.390 + * Use U8_NEXT() if that distinction is important. 1.391 + * 1.392 + * @param s const uint8_t * string 1.393 + * @param i int32_t string offset, must be i<length 1.394 + * @param length int32_t string length 1.395 + * @param c output UChar32 variable, set to U+FFFD in case of an error 1.396 + * @see U8_NEXT 1.397 + * @draft ICU 51 1.398 + */ 1.399 +#define U8_NEXT_OR_FFFD(s, i, length, c) { \ 1.400 + (c)=(uint8_t)(s)[(i)++]; \ 1.401 + if((c)>=0x80) { \ 1.402 + uint8_t __t1, __t2; \ 1.403 + if( /* handle U+1000..U+CFFF inline */ \ 1.404 + (0xe0<(c) && (c)<=0xec) && \ 1.405 + (((i)+1)<(length) || (length)<0) && \ 1.406 + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ 1.407 + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ 1.408 + ) { \ 1.409 + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 1.410 + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ 1.411 + (i)+=2; \ 1.412 + } else if( /* handle U+0080..U+07FF inline */ \ 1.413 + ((c)<0xe0 && (c)>=0xc2) && \ 1.414 + ((i)!=(length)) && \ 1.415 + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ 1.416 + ) { \ 1.417 + (c)=(((c)&0x1f)<<6)|__t1; \ 1.418 + ++(i); \ 1.419 + } else { \ 1.420 + /* function call for "complicated" and error cases */ \ 1.421 + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \ 1.422 + } \ 1.423 + } \ 1.424 +} 1.425 +#endif /* U_HIDE_DRAFT_API */ 1.426 + 1.427 +/** 1.428 + * Append a code point to a string, overwriting 1 to 4 bytes. 1.429 + * The offset points to the current end of the string contents 1.430 + * and is advanced (post-increment). 1.431 + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 1.432 + * Otherwise, the result is undefined. 1.433 + * 1.434 + * @param s const uint8_t * string buffer 1.435 + * @param i string offset 1.436 + * @param c code point to append 1.437 + * @see U8_APPEND 1.438 + * @stable ICU 2.4 1.439 + */ 1.440 +#define U8_APPEND_UNSAFE(s, i, c) { \ 1.441 + if((uint32_t)(c)<=0x7f) { \ 1.442 + (s)[(i)++]=(uint8_t)(c); \ 1.443 + } else { \ 1.444 + if((uint32_t)(c)<=0x7ff) { \ 1.445 + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 1.446 + } else { \ 1.447 + if((uint32_t)(c)<=0xffff) { \ 1.448 + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 1.449 + } else { \ 1.450 + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ 1.451 + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ 1.452 + } \ 1.453 + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 1.454 + } \ 1.455 + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 1.456 + } \ 1.457 +} 1.458 + 1.459 +/** 1.460 + * Append a code point to a string, overwriting 1 to 4 bytes. 1.461 + * The offset points to the current end of the string contents 1.462 + * and is advanced (post-increment). 1.463 + * "Safe" macro, checks for a valid code point. 1.464 + * If a non-ASCII code point is written, checks for sufficient space in the string. 1.465 + * If the code point is not valid or trail bytes do not fit, 1.466 + * then isError is set to TRUE. 1.467 + * 1.468 + * @param s const uint8_t * string buffer 1.469 + * @param i int32_t string offset, must be i<capacity 1.470 + * @param capacity int32_t size of the string buffer 1.471 + * @param c UChar32 code point to append 1.472 + * @param isError output UBool set to TRUE if an error occurs, otherwise not modified 1.473 + * @see U8_APPEND_UNSAFE 1.474 + * @stable ICU 2.4 1.475 + */ 1.476 +#define U8_APPEND(s, i, capacity, c, isError) { \ 1.477 + if((uint32_t)(c)<=0x7f) { \ 1.478 + (s)[(i)++]=(uint8_t)(c); \ 1.479 + } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \ 1.480 + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 1.481 + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 1.482 + } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \ 1.483 + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 1.484 + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 1.485 + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 1.486 + } else { \ 1.487 + (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \ 1.488 + } \ 1.489 +} 1.490 + 1.491 +/** 1.492 + * Advance the string offset from one code point boundary to the next. 1.493 + * (Post-incrementing iteration.) 1.494 + * "Unsafe" macro, assumes well-formed UTF-8. 1.495 + * 1.496 + * @param s const uint8_t * string 1.497 + * @param i string offset 1.498 + * @see U8_FWD_1 1.499 + * @stable ICU 2.4 1.500 + */ 1.501 +#define U8_FWD_1_UNSAFE(s, i) { \ 1.502 + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ 1.503 +} 1.504 + 1.505 +/** 1.506 + * Advance the string offset from one code point boundary to the next. 1.507 + * (Post-incrementing iteration.) 1.508 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.509 + * 1.510 + * The length can be negative for a NUL-terminated string. 1.511 + * 1.512 + * @param s const uint8_t * string 1.513 + * @param i int32_t string offset, must be i<length 1.514 + * @param length int32_t string length 1.515 + * @see U8_FWD_1_UNSAFE 1.516 + * @stable ICU 2.4 1.517 + */ 1.518 +#define U8_FWD_1(s, i, length) { \ 1.519 + uint8_t __b=(uint8_t)(s)[(i)++]; \ 1.520 + if(U8_IS_LEAD(__b)) { \ 1.521 + uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ 1.522 + if((i)+__count>(length) && (length)>=0) { \ 1.523 + __count=(uint8_t)((length)-(i)); \ 1.524 + } \ 1.525 + while(__count>0 && U8_IS_TRAIL((s)[i])) { \ 1.526 + ++(i); \ 1.527 + --__count; \ 1.528 + } \ 1.529 + } \ 1.530 +} 1.531 + 1.532 +/** 1.533 + * Advance the string offset from one code point boundary to the n-th next one, 1.534 + * i.e., move forward by n code points. 1.535 + * (Post-incrementing iteration.) 1.536 + * "Unsafe" macro, assumes well-formed UTF-8. 1.537 + * 1.538 + * @param s const uint8_t * string 1.539 + * @param i string offset 1.540 + * @param n number of code points to skip 1.541 + * @see U8_FWD_N 1.542 + * @stable ICU 2.4 1.543 + */ 1.544 +#define U8_FWD_N_UNSAFE(s, i, n) { \ 1.545 + int32_t __N=(n); \ 1.546 + while(__N>0) { \ 1.547 + U8_FWD_1_UNSAFE(s, i); \ 1.548 + --__N; \ 1.549 + } \ 1.550 +} 1.551 + 1.552 +/** 1.553 + * Advance the string offset from one code point boundary to the n-th next one, 1.554 + * i.e., move forward by n code points. 1.555 + * (Post-incrementing iteration.) 1.556 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.557 + * 1.558 + * The length can be negative for a NUL-terminated string. 1.559 + * 1.560 + * @param s const uint8_t * string 1.561 + * @param i int32_t string offset, must be i<length 1.562 + * @param length int32_t string length 1.563 + * @param n number of code points to skip 1.564 + * @see U8_FWD_N_UNSAFE 1.565 + * @stable ICU 2.4 1.566 + */ 1.567 +#define U8_FWD_N(s, i, length, n) { \ 1.568 + int32_t __N=(n); \ 1.569 + while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 1.570 + U8_FWD_1(s, i, length); \ 1.571 + --__N; \ 1.572 + } \ 1.573 +} 1.574 + 1.575 +/** 1.576 + * Adjust a random-access offset to a code point boundary 1.577 + * at the start of a code point. 1.578 + * If the offset points to a UTF-8 trail byte, 1.579 + * then the offset is moved backward to the corresponding lead byte. 1.580 + * Otherwise, it is not modified. 1.581 + * "Unsafe" macro, assumes well-formed UTF-8. 1.582 + * 1.583 + * @param s const uint8_t * string 1.584 + * @param i string offset 1.585 + * @see U8_SET_CP_START 1.586 + * @stable ICU 2.4 1.587 + */ 1.588 +#define U8_SET_CP_START_UNSAFE(s, i) { \ 1.589 + while(U8_IS_TRAIL((s)[i])) { --(i); } \ 1.590 +} 1.591 + 1.592 +/** 1.593 + * Adjust a random-access offset to a code point boundary 1.594 + * at the start of a code point. 1.595 + * If the offset points to a UTF-8 trail byte, 1.596 + * then the offset is moved backward to the corresponding lead byte. 1.597 + * Otherwise, it is not modified. 1.598 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.599 + * 1.600 + * @param s const uint8_t * string 1.601 + * @param start int32_t starting string offset (usually 0) 1.602 + * @param i int32_t string offset, must be start<=i 1.603 + * @see U8_SET_CP_START_UNSAFE 1.604 + * @stable ICU 2.4 1.605 + */ 1.606 +#define U8_SET_CP_START(s, start, i) { \ 1.607 + if(U8_IS_TRAIL((s)[(i)])) { \ 1.608 + (i)=utf8_back1SafeBody(s, start, (i)); \ 1.609 + } \ 1.610 +} 1.611 + 1.612 +/* definitions with backward iteration -------------------------------------- */ 1.613 + 1.614 +/** 1.615 + * Move the string offset from one code point boundary to the previous one 1.616 + * and get the code point between them. 1.617 + * (Pre-decrementing backward iteration.) 1.618 + * "Unsafe" macro, assumes well-formed UTF-8. 1.619 + * 1.620 + * The input offset may be the same as the string length. 1.621 + * If the offset is behind a multi-byte sequence, then the macro will read 1.622 + * the whole sequence. 1.623 + * If the offset is behind a lead byte, then that itself 1.624 + * will be returned as the code point. 1.625 + * The result is undefined if the offset is behind an illegal UTF-8 sequence. 1.626 + * 1.627 + * @param s const uint8_t * string 1.628 + * @param i string offset 1.629 + * @param c output UChar32 variable 1.630 + * @see U8_PREV 1.631 + * @stable ICU 2.4 1.632 + */ 1.633 +#define U8_PREV_UNSAFE(s, i, c) { \ 1.634 + (c)=(uint8_t)(s)[--(i)]; \ 1.635 + if(U8_IS_TRAIL(c)) { \ 1.636 + uint8_t __b, __count=1, __shift=6; \ 1.637 +\ 1.638 + /* c is a trail byte */ \ 1.639 + (c)&=0x3f; \ 1.640 + for(;;) { \ 1.641 + __b=(uint8_t)(s)[--(i)]; \ 1.642 + if(__b>=0xc0) { \ 1.643 + U8_MASK_LEAD_BYTE(__b, __count); \ 1.644 + (c)|=(UChar32)__b<<__shift; \ 1.645 + break; \ 1.646 + } else { \ 1.647 + (c)|=(UChar32)(__b&0x3f)<<__shift; \ 1.648 + ++__count; \ 1.649 + __shift+=6; \ 1.650 + } \ 1.651 + } \ 1.652 + } \ 1.653 +} 1.654 + 1.655 +/** 1.656 + * Move the string offset from one code point boundary to the previous one 1.657 + * and get the code point between them. 1.658 + * (Pre-decrementing backward iteration.) 1.659 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.660 + * 1.661 + * The input offset may be the same as the string length. 1.662 + * If the offset is behind a multi-byte sequence, then the macro will read 1.663 + * the whole sequence. 1.664 + * If the offset is behind a lead byte, then that itself 1.665 + * will be returned as the code point. 1.666 + * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. 1.667 + * 1.668 + * @param s const uint8_t * string 1.669 + * @param start int32_t starting string offset (usually 0) 1.670 + * @param i int32_t string offset, must be start<i 1.671 + * @param c output UChar32 variable, set to <0 in case of an error 1.672 + * @see U8_PREV_UNSAFE 1.673 + * @stable ICU 2.4 1.674 + */ 1.675 +#define U8_PREV(s, start, i, c) { \ 1.676 + (c)=(uint8_t)(s)[--(i)]; \ 1.677 + if((c)>=0x80) { \ 1.678 + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ 1.679 + } \ 1.680 +} 1.681 + 1.682 +#ifndef U_HIDE_DRAFT_API 1.683 +/** 1.684 + * Move the string offset from one code point boundary to the previous one 1.685 + * and get the code point between them. 1.686 + * (Pre-decrementing backward iteration.) 1.687 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.688 + * 1.689 + * The input offset may be the same as the string length. 1.690 + * If the offset is behind a multi-byte sequence, then the macro will read 1.691 + * the whole sequence. 1.692 + * If the offset is behind a lead byte, then that itself 1.693 + * will be returned as the code point. 1.694 + * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. 1.695 + * 1.696 + * This macro does not distinguish between a real U+FFFD in the text 1.697 + * and U+FFFD returned for an ill-formed sequence. 1.698 + * Use U8_PREV() if that distinction is important. 1.699 + * 1.700 + * @param s const uint8_t * string 1.701 + * @param start int32_t starting string offset (usually 0) 1.702 + * @param i int32_t string offset, must be start<i 1.703 + * @param c output UChar32 variable, set to U+FFFD in case of an error 1.704 + * @see U8_PREV 1.705 + * @draft ICU 51 1.706 + */ 1.707 +#define U8_PREV_OR_FFFD(s, start, i, c) { \ 1.708 + (c)=(uint8_t)(s)[--(i)]; \ 1.709 + if((c)>=0x80) { \ 1.710 + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ 1.711 + } \ 1.712 +} 1.713 +#endif /* U_HIDE_DRAFT_API */ 1.714 + 1.715 +/** 1.716 + * Move the string offset from one code point boundary to the previous one. 1.717 + * (Pre-decrementing backward iteration.) 1.718 + * The input offset may be the same as the string length. 1.719 + * "Unsafe" macro, assumes well-formed UTF-8. 1.720 + * 1.721 + * @param s const uint8_t * string 1.722 + * @param i string offset 1.723 + * @see U8_BACK_1 1.724 + * @stable ICU 2.4 1.725 + */ 1.726 +#define U8_BACK_1_UNSAFE(s, i) { \ 1.727 + while(U8_IS_TRAIL((s)[--(i)])) {} \ 1.728 +} 1.729 + 1.730 +/** 1.731 + * Move the string offset from one code point boundary to the previous one. 1.732 + * (Pre-decrementing backward iteration.) 1.733 + * The input offset may be the same as the string length. 1.734 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.735 + * 1.736 + * @param s const uint8_t * string 1.737 + * @param start int32_t starting string offset (usually 0) 1.738 + * @param i int32_t string offset, must be start<i 1.739 + * @see U8_BACK_1_UNSAFE 1.740 + * @stable ICU 2.4 1.741 + */ 1.742 +#define U8_BACK_1(s, start, i) { \ 1.743 + if(U8_IS_TRAIL((s)[--(i)])) { \ 1.744 + (i)=utf8_back1SafeBody(s, start, (i)); \ 1.745 + } \ 1.746 +} 1.747 + 1.748 +/** 1.749 + * Move the string offset from one code point boundary to the n-th one before it, 1.750 + * i.e., move backward by n code points. 1.751 + * (Pre-decrementing backward iteration.) 1.752 + * The input offset may be the same as the string length. 1.753 + * "Unsafe" macro, assumes well-formed UTF-8. 1.754 + * 1.755 + * @param s const uint8_t * string 1.756 + * @param i string offset 1.757 + * @param n number of code points to skip 1.758 + * @see U8_BACK_N 1.759 + * @stable ICU 2.4 1.760 + */ 1.761 +#define U8_BACK_N_UNSAFE(s, i, n) { \ 1.762 + int32_t __N=(n); \ 1.763 + while(__N>0) { \ 1.764 + U8_BACK_1_UNSAFE(s, i); \ 1.765 + --__N; \ 1.766 + } \ 1.767 +} 1.768 + 1.769 +/** 1.770 + * Move the string offset from one code point boundary to the n-th one before it, 1.771 + * i.e., move backward by n code points. 1.772 + * (Pre-decrementing backward iteration.) 1.773 + * The input offset may be the same as the string length. 1.774 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.775 + * 1.776 + * @param s const uint8_t * string 1.777 + * @param start int32_t index of the start of the string 1.778 + * @param i int32_t string offset, must be start<i 1.779 + * @param n number of code points to skip 1.780 + * @see U8_BACK_N_UNSAFE 1.781 + * @stable ICU 2.4 1.782 + */ 1.783 +#define U8_BACK_N(s, start, i, n) { \ 1.784 + int32_t __N=(n); \ 1.785 + while(__N>0 && (i)>(start)) { \ 1.786 + U8_BACK_1(s, start, i); \ 1.787 + --__N; \ 1.788 + } \ 1.789 +} 1.790 + 1.791 +/** 1.792 + * Adjust a random-access offset to a code point boundary after a code point. 1.793 + * If the offset is behind a partial multi-byte sequence, 1.794 + * then the offset is incremented to behind the whole sequence. 1.795 + * Otherwise, it is not modified. 1.796 + * The input offset may be the same as the string length. 1.797 + * "Unsafe" macro, assumes well-formed UTF-8. 1.798 + * 1.799 + * @param s const uint8_t * string 1.800 + * @param i string offset 1.801 + * @see U8_SET_CP_LIMIT 1.802 + * @stable ICU 2.4 1.803 + */ 1.804 +#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ 1.805 + U8_BACK_1_UNSAFE(s, i); \ 1.806 + U8_FWD_1_UNSAFE(s, i); \ 1.807 +} 1.808 + 1.809 +/** 1.810 + * Adjust a random-access offset to a code point boundary after a code point. 1.811 + * If the offset is behind a partial multi-byte sequence, 1.812 + * then the offset is incremented to behind the whole sequence. 1.813 + * Otherwise, it is not modified. 1.814 + * The input offset may be the same as the string length. 1.815 + * "Safe" macro, checks for illegal sequences and for string boundaries. 1.816 + * 1.817 + * The length can be negative for a NUL-terminated string. 1.818 + * 1.819 + * @param s const uint8_t * string 1.820 + * @param start int32_t starting string offset (usually 0) 1.821 + * @param i int32_t string offset, must be start<=i<=length 1.822 + * @param length int32_t string length 1.823 + * @see U8_SET_CP_LIMIT_UNSAFE 1.824 + * @stable ICU 2.4 1.825 + */ 1.826 +#define U8_SET_CP_LIMIT(s, start, i, length) { \ 1.827 + if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 1.828 + U8_BACK_1(s, start, i); \ 1.829 + U8_FWD_1(s, i, length); \ 1.830 + } \ 1.831 +} 1.832 + 1.833 +#endif