1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnv_u8.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1086 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2002-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: ucnv_u8.c 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2002jul01 1.15 +* created by: Markus W. Scherer 1.16 +* 1.17 +* UTF-8 converter implementation. Used to be in ucnv_utf.c. 1.18 +* 1.19 +* Also, CESU-8 implementation, see UTR 26. 1.20 +* The CESU-8 converter uses all the same functions as the 1.21 +* UTF-8 converter, with a branch for converting supplementary code points. 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 + 1.26 +#if !UCONFIG_NO_CONVERSION 1.27 + 1.28 +#include "unicode/ucnv.h" 1.29 +#include "unicode/utf.h" 1.30 +#include "unicode/utf8.h" 1.31 +#include "unicode/utf16.h" 1.32 +#include "ucnv_bld.h" 1.33 +#include "ucnv_cnv.h" 1.34 +#include "cmemory.h" 1.35 + 1.36 +/* Prototypes --------------------------------------------------------------- */ 1.37 + 1.38 +/* Keep these here to make finicky compilers happy */ 1.39 + 1.40 +U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, 1.41 + UErrorCode *err); 1.42 +U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, 1.43 + UErrorCode *err); 1.44 + 1.45 + 1.46 +/* UTF-8 -------------------------------------------------------------------- */ 1.47 + 1.48 +/* UTF-8 Conversion DATA 1.49 + * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 1.50 + */ 1.51 +/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ 1.52 +#define MAXIMUM_UCS2 0x0000FFFF 1.53 +#define MAXIMUM_UTF 0x0010FFFF 1.54 +#define MAXIMUM_UCS4 0x7FFFFFFF 1.55 +#define HALF_SHIFT 10 1.56 +#define HALF_BASE 0x0010000 1.57 +#define HALF_MASK 0x3FF 1.58 +#define SURROGATE_HIGH_START 0xD800 1.59 +#define SURROGATE_HIGH_END 0xDBFF 1.60 +#define SURROGATE_LOW_START 0xDC00 1.61 +#define SURROGATE_LOW_END 0xDFFF 1.62 + 1.63 +/* -SURROGATE_LOW_START + HALF_BASE */ 1.64 +#define SURROGATE_LOW_BASE 9216 1.65 + 1.66 +static const uint32_t offsetsFromUTF8[7] = {0, 1.67 + (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, 1.68 + (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 1.69 +}; 1.70 + 1.71 +/* END OF UTF-8 Conversion DATA */ 1.72 + 1.73 +static const int8_t bytesFromUTF8[256] = { 1.74 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.75 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.76 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.77 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.78 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.79 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.80 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1.81 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1.82 +}; 1.83 + 1.84 +/* 1.85 + * Starting with Unicode 3.0.1: 1.86 + * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; 1.87 + * byte sequences with more than 4 bytes are illegal in UTF-8, 1.88 + * which is tested with impossible values for them 1.89 + */ 1.90 +static const uint32_t 1.91 +utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; 1.92 + 1.93 +static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, 1.94 + UErrorCode * err) 1.95 +{ 1.96 + UConverter *cnv = args->converter; 1.97 + const unsigned char *mySource = (unsigned char *) args->source; 1.98 + UChar *myTarget = args->target; 1.99 + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 1.100 + const UChar *targetLimit = args->targetLimit; 1.101 + unsigned char *toUBytes = cnv->toUBytes; 1.102 + UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); 1.103 + uint32_t ch, ch2 = 0; 1.104 + int32_t i, inBytes; 1.105 + 1.106 + /* Restore size of current sequence */ 1.107 + if (cnv->toUnicodeStatus && myTarget < targetLimit) 1.108 + { 1.109 + inBytes = cnv->mode; /* restore # of bytes to consume */ 1.110 + i = cnv->toULength; /* restore # of bytes consumed */ 1.111 + cnv->toULength = 0; 1.112 + 1.113 + ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 1.114 + cnv->toUnicodeStatus = 0; 1.115 + goto morebytes; 1.116 + } 1.117 + 1.118 + 1.119 + while (mySource < sourceLimit && myTarget < targetLimit) 1.120 + { 1.121 + ch = *(mySource++); 1.122 + if (ch < 0x80) /* Simple case */ 1.123 + { 1.124 + *(myTarget++) = (UChar) ch; 1.125 + } 1.126 + else 1.127 + { 1.128 + /* store the first char */ 1.129 + toUBytes[0] = (char)ch; 1.130 + inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ 1.131 + i = 1; 1.132 + 1.133 +morebytes: 1.134 + while (i < inBytes) 1.135 + { 1.136 + if (mySource < sourceLimit) 1.137 + { 1.138 + toUBytes[i] = (char) (ch2 = *mySource); 1.139 + if (!U8_IS_TRAIL(ch2)) 1.140 + { 1.141 + break; /* i < inBytes */ 1.142 + } 1.143 + ch = (ch << 6) + ch2; 1.144 + ++mySource; 1.145 + i++; 1.146 + } 1.147 + else 1.148 + { 1.149 + /* stores a partially calculated target*/ 1.150 + cnv->toUnicodeStatus = ch; 1.151 + cnv->mode = inBytes; 1.152 + cnv->toULength = (int8_t) i; 1.153 + goto donefornow; 1.154 + } 1.155 + } 1.156 + 1.157 + /* Remove the accumulated high bits */ 1.158 + ch -= offsetsFromUTF8[inBytes]; 1.159 + 1.160 + /* 1.161 + * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 1.162 + * - use only trail bytes after a lead byte (checked above) 1.163 + * - use the right number of trail bytes for a given lead byte 1.164 + * - encode a code point <= U+10ffff 1.165 + * - use the fewest possible number of bytes for their code points 1.166 + * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 1.167 + * 1.168 + * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 1.169 + * There are no irregular sequences any more. 1.170 + * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 1.171 + */ 1.172 + if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 1.173 + (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) 1.174 + { 1.175 + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 1.176 + if (ch <= MAXIMUM_UCS2) 1.177 + { 1.178 + /* fits in 16 bits */ 1.179 + *(myTarget++) = (UChar) ch; 1.180 + } 1.181 + else 1.182 + { 1.183 + /* write out the surrogates */ 1.184 + ch -= HALF_BASE; 1.185 + *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 1.186 + ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 1.187 + if (myTarget < targetLimit) 1.188 + { 1.189 + *(myTarget++) = (UChar)ch; 1.190 + } 1.191 + else 1.192 + { 1.193 + /* Put in overflow buffer (not handled here) */ 1.194 + cnv->UCharErrorBuffer[0] = (UChar) ch; 1.195 + cnv->UCharErrorBufferLength = 1; 1.196 + *err = U_BUFFER_OVERFLOW_ERROR; 1.197 + break; 1.198 + } 1.199 + } 1.200 + } 1.201 + else 1.202 + { 1.203 + cnv->toULength = (int8_t)i; 1.204 + *err = U_ILLEGAL_CHAR_FOUND; 1.205 + break; 1.206 + } 1.207 + } 1.208 + } 1.209 + 1.210 +donefornow: 1.211 + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 1.212 + { 1.213 + /* End of target buffer */ 1.214 + *err = U_BUFFER_OVERFLOW_ERROR; 1.215 + } 1.216 + 1.217 + args->target = myTarget; 1.218 + args->source = (const char *) mySource; 1.219 +} 1.220 + 1.221 +static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, 1.222 + UErrorCode * err) 1.223 +{ 1.224 + UConverter *cnv = args->converter; 1.225 + const unsigned char *mySource = (unsigned char *) args->source; 1.226 + UChar *myTarget = args->target; 1.227 + int32_t *myOffsets = args->offsets; 1.228 + int32_t offsetNum = 0; 1.229 + const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 1.230 + const UChar *targetLimit = args->targetLimit; 1.231 + unsigned char *toUBytes = cnv->toUBytes; 1.232 + UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); 1.233 + uint32_t ch, ch2 = 0; 1.234 + int32_t i, inBytes; 1.235 + 1.236 + /* Restore size of current sequence */ 1.237 + if (cnv->toUnicodeStatus && myTarget < targetLimit) 1.238 + { 1.239 + inBytes = cnv->mode; /* restore # of bytes to consume */ 1.240 + i = cnv->toULength; /* restore # of bytes consumed */ 1.241 + cnv->toULength = 0; 1.242 + 1.243 + ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 1.244 + cnv->toUnicodeStatus = 0; 1.245 + goto morebytes; 1.246 + } 1.247 + 1.248 + while (mySource < sourceLimit && myTarget < targetLimit) 1.249 + { 1.250 + ch = *(mySource++); 1.251 + if (ch < 0x80) /* Simple case */ 1.252 + { 1.253 + *(myTarget++) = (UChar) ch; 1.254 + *(myOffsets++) = offsetNum++; 1.255 + } 1.256 + else 1.257 + { 1.258 + toUBytes[0] = (char)ch; 1.259 + inBytes = bytesFromUTF8[ch]; 1.260 + i = 1; 1.261 + 1.262 +morebytes: 1.263 + while (i < inBytes) 1.264 + { 1.265 + if (mySource < sourceLimit) 1.266 + { 1.267 + toUBytes[i] = (char) (ch2 = *mySource); 1.268 + if (!U8_IS_TRAIL(ch2)) 1.269 + { 1.270 + break; /* i < inBytes */ 1.271 + } 1.272 + ch = (ch << 6) + ch2; 1.273 + ++mySource; 1.274 + i++; 1.275 + } 1.276 + else 1.277 + { 1.278 + cnv->toUnicodeStatus = ch; 1.279 + cnv->mode = inBytes; 1.280 + cnv->toULength = (int8_t)i; 1.281 + goto donefornow; 1.282 + } 1.283 + } 1.284 + 1.285 + /* Remove the accumulated high bits */ 1.286 + ch -= offsetsFromUTF8[inBytes]; 1.287 + 1.288 + /* 1.289 + * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 1.290 + * - use only trail bytes after a lead byte (checked above) 1.291 + * - use the right number of trail bytes for a given lead byte 1.292 + * - encode a code point <= U+10ffff 1.293 + * - use the fewest possible number of bytes for their code points 1.294 + * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 1.295 + * 1.296 + * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 1.297 + * There are no irregular sequences any more. 1.298 + * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 1.299 + */ 1.300 + if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 1.301 + (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) 1.302 + { 1.303 + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 1.304 + if (ch <= MAXIMUM_UCS2) 1.305 + { 1.306 + /* fits in 16 bits */ 1.307 + *(myTarget++) = (UChar) ch; 1.308 + *(myOffsets++) = offsetNum; 1.309 + } 1.310 + else 1.311 + { 1.312 + /* write out the surrogates */ 1.313 + ch -= HALF_BASE; 1.314 + *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 1.315 + *(myOffsets++) = offsetNum; 1.316 + ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 1.317 + if (myTarget < targetLimit) 1.318 + { 1.319 + *(myTarget++) = (UChar)ch; 1.320 + *(myOffsets++) = offsetNum; 1.321 + } 1.322 + else 1.323 + { 1.324 + cnv->UCharErrorBuffer[0] = (UChar) ch; 1.325 + cnv->UCharErrorBufferLength = 1; 1.326 + *err = U_BUFFER_OVERFLOW_ERROR; 1.327 + } 1.328 + } 1.329 + offsetNum += i; 1.330 + } 1.331 + else 1.332 + { 1.333 + cnv->toULength = (int8_t)i; 1.334 + *err = U_ILLEGAL_CHAR_FOUND; 1.335 + break; 1.336 + } 1.337 + } 1.338 + } 1.339 + 1.340 +donefornow: 1.341 + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 1.342 + { /* End of target buffer */ 1.343 + *err = U_BUFFER_OVERFLOW_ERROR; 1.344 + } 1.345 + 1.346 + args->target = myTarget; 1.347 + args->source = (const char *) mySource; 1.348 + args->offsets = myOffsets; 1.349 +} 1.350 + 1.351 +U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, 1.352 + UErrorCode * err) 1.353 +{ 1.354 + UConverter *cnv = args->converter; 1.355 + const UChar *mySource = args->source; 1.356 + const UChar *sourceLimit = args->sourceLimit; 1.357 + uint8_t *myTarget = (uint8_t *) args->target; 1.358 + const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 1.359 + uint8_t *tempPtr; 1.360 + UChar32 ch; 1.361 + uint8_t tempBuf[4]; 1.362 + int32_t indexToWrite; 1.363 + UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); 1.364 + 1.365 + if (cnv->fromUChar32 && myTarget < targetLimit) 1.366 + { 1.367 + ch = cnv->fromUChar32; 1.368 + cnv->fromUChar32 = 0; 1.369 + goto lowsurrogate; 1.370 + } 1.371 + 1.372 + while (mySource < sourceLimit && myTarget < targetLimit) 1.373 + { 1.374 + ch = *(mySource++); 1.375 + 1.376 + if (ch < 0x80) /* Single byte */ 1.377 + { 1.378 + *(myTarget++) = (uint8_t) ch; 1.379 + } 1.380 + else if (ch < 0x800) /* Double byte */ 1.381 + { 1.382 + *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 1.383 + if (myTarget < targetLimit) 1.384 + { 1.385 + *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 1.386 + } 1.387 + else 1.388 + { 1.389 + cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 1.390 + cnv->charErrorBufferLength = 1; 1.391 + *err = U_BUFFER_OVERFLOW_ERROR; 1.392 + } 1.393 + } 1.394 + else { 1.395 + /* Check for surrogates */ 1.396 + if(U16_IS_SURROGATE(ch) && isNotCESU8) { 1.397 +lowsurrogate: 1.398 + if (mySource < sourceLimit) { 1.399 + /* test both code units */ 1.400 + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 1.401 + /* convert and consume this supplementary code point */ 1.402 + ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 1.403 + ++mySource; 1.404 + /* exit this condition tree */ 1.405 + } 1.406 + else { 1.407 + /* this is an unpaired trail or lead code unit */ 1.408 + /* callback(illegal) */ 1.409 + cnv->fromUChar32 = ch; 1.410 + *err = U_ILLEGAL_CHAR_FOUND; 1.411 + break; 1.412 + } 1.413 + } 1.414 + else { 1.415 + /* no more input */ 1.416 + cnv->fromUChar32 = ch; 1.417 + break; 1.418 + } 1.419 + } 1.420 + 1.421 + /* Do we write the buffer directly for speed, 1.422 + or do we have to be careful about target buffer space? */ 1.423 + tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 1.424 + 1.425 + if (ch <= MAXIMUM_UCS2) { 1.426 + indexToWrite = 2; 1.427 + tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 1.428 + } 1.429 + else { 1.430 + indexToWrite = 3; 1.431 + tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 1.432 + tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 1.433 + } 1.434 + tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 1.435 + tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 1.436 + 1.437 + if (tempPtr == myTarget) { 1.438 + /* There was enough space to write the codepoint directly. */ 1.439 + myTarget += (indexToWrite + 1); 1.440 + } 1.441 + else { 1.442 + /* We might run out of room soon. Write it slowly. */ 1.443 + for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 1.444 + if (myTarget < targetLimit) { 1.445 + *(myTarget++) = *tempPtr; 1.446 + } 1.447 + else { 1.448 + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 1.449 + *err = U_BUFFER_OVERFLOW_ERROR; 1.450 + } 1.451 + } 1.452 + } 1.453 + } 1.454 + } 1.455 + 1.456 + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 1.457 + { 1.458 + *err = U_BUFFER_OVERFLOW_ERROR; 1.459 + } 1.460 + 1.461 + args->target = (char *) myTarget; 1.462 + args->source = mySource; 1.463 +} 1.464 + 1.465 +U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, 1.466 + UErrorCode * err) 1.467 +{ 1.468 + UConverter *cnv = args->converter; 1.469 + const UChar *mySource = args->source; 1.470 + int32_t *myOffsets = args->offsets; 1.471 + const UChar *sourceLimit = args->sourceLimit; 1.472 + uint8_t *myTarget = (uint8_t *) args->target; 1.473 + const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 1.474 + uint8_t *tempPtr; 1.475 + UChar32 ch; 1.476 + int32_t offsetNum, nextSourceIndex; 1.477 + int32_t indexToWrite; 1.478 + uint8_t tempBuf[4]; 1.479 + UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); 1.480 + 1.481 + if (cnv->fromUChar32 && myTarget < targetLimit) 1.482 + { 1.483 + ch = cnv->fromUChar32; 1.484 + cnv->fromUChar32 = 0; 1.485 + offsetNum = -1; 1.486 + nextSourceIndex = 0; 1.487 + goto lowsurrogate; 1.488 + } else { 1.489 + offsetNum = 0; 1.490 + } 1.491 + 1.492 + while (mySource < sourceLimit && myTarget < targetLimit) 1.493 + { 1.494 + ch = *(mySource++); 1.495 + 1.496 + if (ch < 0x80) /* Single byte */ 1.497 + { 1.498 + *(myOffsets++) = offsetNum++; 1.499 + *(myTarget++) = (char) ch; 1.500 + } 1.501 + else if (ch < 0x800) /* Double byte */ 1.502 + { 1.503 + *(myOffsets++) = offsetNum; 1.504 + *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 1.505 + if (myTarget < targetLimit) 1.506 + { 1.507 + *(myOffsets++) = offsetNum++; 1.508 + *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 1.509 + } 1.510 + else 1.511 + { 1.512 + cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 1.513 + cnv->charErrorBufferLength = 1; 1.514 + *err = U_BUFFER_OVERFLOW_ERROR; 1.515 + } 1.516 + } 1.517 + else 1.518 + /* Check for surrogates */ 1.519 + { 1.520 + nextSourceIndex = offsetNum + 1; 1.521 + 1.522 + if(U16_IS_SURROGATE(ch) && isNotCESU8) { 1.523 +lowsurrogate: 1.524 + if (mySource < sourceLimit) { 1.525 + /* test both code units */ 1.526 + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 1.527 + /* convert and consume this supplementary code point */ 1.528 + ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 1.529 + ++mySource; 1.530 + ++nextSourceIndex; 1.531 + /* exit this condition tree */ 1.532 + } 1.533 + else { 1.534 + /* this is an unpaired trail or lead code unit */ 1.535 + /* callback(illegal) */ 1.536 + cnv->fromUChar32 = ch; 1.537 + *err = U_ILLEGAL_CHAR_FOUND; 1.538 + break; 1.539 + } 1.540 + } 1.541 + else { 1.542 + /* no more input */ 1.543 + cnv->fromUChar32 = ch; 1.544 + break; 1.545 + } 1.546 + } 1.547 + 1.548 + /* Do we write the buffer directly for speed, 1.549 + or do we have to be careful about target buffer space? */ 1.550 + tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 1.551 + 1.552 + if (ch <= MAXIMUM_UCS2) { 1.553 + indexToWrite = 2; 1.554 + tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 1.555 + } 1.556 + else { 1.557 + indexToWrite = 3; 1.558 + tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 1.559 + tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 1.560 + } 1.561 + tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 1.562 + tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 1.563 + 1.564 + if (tempPtr == myTarget) { 1.565 + /* There was enough space to write the codepoint directly. */ 1.566 + myTarget += (indexToWrite + 1); 1.567 + myOffsets[0] = offsetNum; 1.568 + myOffsets[1] = offsetNum; 1.569 + myOffsets[2] = offsetNum; 1.570 + if (indexToWrite >= 3) { 1.571 + myOffsets[3] = offsetNum; 1.572 + } 1.573 + myOffsets += (indexToWrite + 1); 1.574 + } 1.575 + else { 1.576 + /* We might run out of room soon. Write it slowly. */ 1.577 + for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 1.578 + if (myTarget < targetLimit) 1.579 + { 1.580 + *(myOffsets++) = offsetNum; 1.581 + *(myTarget++) = *tempPtr; 1.582 + } 1.583 + else 1.584 + { 1.585 + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 1.586 + *err = U_BUFFER_OVERFLOW_ERROR; 1.587 + } 1.588 + } 1.589 + } 1.590 + offsetNum = nextSourceIndex; 1.591 + } 1.592 + } 1.593 + 1.594 + if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 1.595 + { 1.596 + *err = U_BUFFER_OVERFLOW_ERROR; 1.597 + } 1.598 + 1.599 + args->target = (char *) myTarget; 1.600 + args->source = mySource; 1.601 + args->offsets = myOffsets; 1.602 +} 1.603 + 1.604 +static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, 1.605 + UErrorCode *err) { 1.606 + UConverter *cnv; 1.607 + const uint8_t *sourceInitial; 1.608 + const uint8_t *source; 1.609 + uint16_t extraBytesToWrite; 1.610 + uint8_t myByte; 1.611 + UChar32 ch; 1.612 + int8_t i, isLegalSequence; 1.613 + 1.614 + /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ 1.615 + 1.616 + cnv = args->converter; 1.617 + sourceInitial = source = (const uint8_t *)args->source; 1.618 + if (source >= (const uint8_t *)args->sourceLimit) 1.619 + { 1.620 + /* no input */ 1.621 + *err = U_INDEX_OUTOFBOUNDS_ERROR; 1.622 + return 0xffff; 1.623 + } 1.624 + 1.625 + myByte = (uint8_t)*(source++); 1.626 + if (myByte < 0x80) 1.627 + { 1.628 + args->source = (const char *)source; 1.629 + return (UChar32)myByte; 1.630 + } 1.631 + 1.632 + extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; 1.633 + if (extraBytesToWrite == 0) { 1.634 + cnv->toUBytes[0] = myByte; 1.635 + cnv->toULength = 1; 1.636 + *err = U_ILLEGAL_CHAR_FOUND; 1.637 + args->source = (const char *)source; 1.638 + return 0xffff; 1.639 + } 1.640 + 1.641 + /*The byte sequence is longer than the buffer area passed*/ 1.642 + if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) 1.643 + { 1.644 + /* check if all of the remaining bytes are trail bytes */ 1.645 + cnv->toUBytes[0] = myByte; 1.646 + i = 1; 1.647 + *err = U_TRUNCATED_CHAR_FOUND; 1.648 + while(source < (const uint8_t *)args->sourceLimit) { 1.649 + if(U8_IS_TRAIL(myByte = *source)) { 1.650 + cnv->toUBytes[i++] = myByte; 1.651 + ++source; 1.652 + } else { 1.653 + /* error even before we run out of input */ 1.654 + *err = U_ILLEGAL_CHAR_FOUND; 1.655 + break; 1.656 + } 1.657 + } 1.658 + cnv->toULength = i; 1.659 + args->source = (const char *)source; 1.660 + return 0xffff; 1.661 + } 1.662 + 1.663 + isLegalSequence = 1; 1.664 + ch = myByte << 6; 1.665 + switch(extraBytesToWrite) 1.666 + { 1.667 + /* note: code falls through cases! (sic)*/ 1.668 + case 6: 1.669 + ch += (myByte = *source); 1.670 + ch <<= 6; 1.671 + if (!U8_IS_TRAIL(myByte)) 1.672 + { 1.673 + isLegalSequence = 0; 1.674 + break; 1.675 + } 1.676 + ++source; 1.677 + case 5: /*fall through*/ 1.678 + ch += (myByte = *source); 1.679 + ch <<= 6; 1.680 + if (!U8_IS_TRAIL(myByte)) 1.681 + { 1.682 + isLegalSequence = 0; 1.683 + break; 1.684 + } 1.685 + ++source; 1.686 + case 4: /*fall through*/ 1.687 + ch += (myByte = *source); 1.688 + ch <<= 6; 1.689 + if (!U8_IS_TRAIL(myByte)) 1.690 + { 1.691 + isLegalSequence = 0; 1.692 + break; 1.693 + } 1.694 + ++source; 1.695 + case 3: /*fall through*/ 1.696 + ch += (myByte = *source); 1.697 + ch <<= 6; 1.698 + if (!U8_IS_TRAIL(myByte)) 1.699 + { 1.700 + isLegalSequence = 0; 1.701 + break; 1.702 + } 1.703 + ++source; 1.704 + case 2: /*fall through*/ 1.705 + ch += (myByte = *source); 1.706 + if (!U8_IS_TRAIL(myByte)) 1.707 + { 1.708 + isLegalSequence = 0; 1.709 + break; 1.710 + } 1.711 + ++source; 1.712 + }; 1.713 + ch -= offsetsFromUTF8[extraBytesToWrite]; 1.714 + args->source = (const char *)source; 1.715 + 1.716 + /* 1.717 + * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 1.718 + * - use only trail bytes after a lead byte (checked above) 1.719 + * - use the right number of trail bytes for a given lead byte 1.720 + * - encode a code point <= U+10ffff 1.721 + * - use the fewest possible number of bytes for their code points 1.722 + * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 1.723 + * 1.724 + * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 1.725 + * There are no irregular sequences any more. 1.726 + */ 1.727 + if (isLegalSequence && 1.728 + (uint32_t)ch <= MAXIMUM_UTF && 1.729 + (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && 1.730 + !U_IS_SURROGATE(ch) 1.731 + ) { 1.732 + return ch; /* return the code point */ 1.733 + } 1.734 + 1.735 + for(i = 0; sourceInitial < source; ++i) { 1.736 + cnv->toUBytes[i] = *sourceInitial++; 1.737 + } 1.738 + cnv->toULength = i; 1.739 + *err = U_ILLEGAL_CHAR_FOUND; 1.740 + return 0xffff; 1.741 +} 1.742 + 1.743 +/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ 1.744 + 1.745 +/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 1.746 +static const UChar32 1.747 +utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 1.748 + 1.749 +/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ 1.750 +static const UChar32 1.751 +utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 1.752 + 1.753 +/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ 1.754 +static void 1.755 +ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 1.756 + UConverterToUnicodeArgs *pToUArgs, 1.757 + UErrorCode *pErrorCode) { 1.758 + UConverter *utf8; 1.759 + const uint8_t *source, *sourceLimit; 1.760 + uint8_t *target; 1.761 + int32_t targetCapacity; 1.762 + int32_t count; 1.763 + 1.764 + int8_t oldToULength, toULength, toULimit; 1.765 + 1.766 + UChar32 c; 1.767 + uint8_t b, t1, t2; 1.768 + 1.769 + /* set up the local pointers */ 1.770 + utf8=pToUArgs->converter; 1.771 + source=(uint8_t *)pToUArgs->source; 1.772 + sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 1.773 + target=(uint8_t *)pFromUArgs->target; 1.774 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 1.775 + 1.776 + /* get the converter state from the UTF-8 UConverter */ 1.777 + c=(UChar32)utf8->toUnicodeStatus; 1.778 + if(c!=0) { 1.779 + toULength=oldToULength=utf8->toULength; 1.780 + toULimit=(int8_t)utf8->mode; 1.781 + } else { 1.782 + toULength=oldToULength=toULimit=0; 1.783 + } 1.784 + 1.785 + count=(int32_t)(sourceLimit-source)+oldToULength; 1.786 + if(count<toULimit) { 1.787 + /* 1.788 + * Not enough input to complete the partial character. 1.789 + * Jump to moreBytes below - it will not output to target. 1.790 + */ 1.791 + } else if(targetCapacity<toULimit) { 1.792 + /* 1.793 + * Not enough target capacity to output the partial character. 1.794 + * Let the standard converter handle this. 1.795 + */ 1.796 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.797 + return; 1.798 + } else { 1.799 + /* 1.800 + * Use a single counter for source and target, counting the minimum of 1.801 + * the source length and the target capacity. 1.802 + * As a result, the source length is checked only once per multi-byte 1.803 + * character instead of twice. 1.804 + * 1.805 + * Make sure that the last byte sequence is complete, or else 1.806 + * stop just before it. 1.807 + * (The longest legal byte sequence has 3 trail bytes.) 1.808 + * Count oldToULength (number of source bytes from a previous buffer) 1.809 + * into the source length but reduce the source index by toULimit 1.810 + * while going back over trail bytes in order to not go back into 1.811 + * the bytes that will be read for finishing a partial 1.812 + * sequence from the previous buffer. 1.813 + * Let the standard converter handle edge cases. 1.814 + */ 1.815 + int32_t i; 1.816 + 1.817 + if(count>targetCapacity) { 1.818 + count=targetCapacity; 1.819 + } 1.820 + 1.821 + i=0; 1.822 + while(i<3 && i<(count-toULimit)) { 1.823 + b=source[count-oldToULength-i-1]; 1.824 + if(U8_IS_TRAIL(b)) { 1.825 + ++i; 1.826 + } else { 1.827 + if(i<U8_COUNT_TRAIL_BYTES(b)) { 1.828 + /* stop converting before the lead byte if there are not enough trail bytes for it */ 1.829 + count-=i+1; 1.830 + } 1.831 + break; 1.832 + } 1.833 + } 1.834 + } 1.835 + 1.836 + if(c!=0) { 1.837 + utf8->toUnicodeStatus=0; 1.838 + utf8->toULength=0; 1.839 + goto moreBytes; 1.840 + /* See note in ucnv_SBCSFromUTF8() about this goto. */ 1.841 + } 1.842 + 1.843 + /* conversion loop */ 1.844 + while(count>0) { 1.845 + b=*source++; 1.846 + if((int8_t)b>=0) { 1.847 + /* convert ASCII */ 1.848 + *target++=b; 1.849 + --count; 1.850 + continue; 1.851 + } else { 1.852 + if(b>0xe0) { 1.853 + if( /* handle U+1000..U+D7FF inline */ 1.854 + (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || 1.855 + (b==0xed && (t1 <= 0x9f))) && 1.856 + (t2=source[1]) >= 0x80 && t2 <= 0xbf 1.857 + ) { 1.858 + source+=2; 1.859 + *target++=b; 1.860 + *target++=t1; 1.861 + *target++=t2; 1.862 + count-=3; 1.863 + continue; 1.864 + } 1.865 + } else if(b<0xe0) { 1.866 + if( /* handle U+0080..U+07FF inline */ 1.867 + b>=0xc2 && 1.868 + (t1=*source) >= 0x80 && t1 <= 0xbf 1.869 + ) { 1.870 + ++source; 1.871 + *target++=b; 1.872 + *target++=t1; 1.873 + count-=2; 1.874 + continue; 1.875 + } 1.876 + } else if(b==0xe0) { 1.877 + if( /* handle U+0800..U+0FFF inline */ 1.878 + (t1=source[0]) >= 0xa0 && t1 <= 0xbf && 1.879 + (t2=source[1]) >= 0x80 && t2 <= 0xbf 1.880 + ) { 1.881 + source+=2; 1.882 + *target++=b; 1.883 + *target++=t1; 1.884 + *target++=t2; 1.885 + count-=3; 1.886 + continue; 1.887 + } 1.888 + } 1.889 + 1.890 + /* handle "complicated" and error cases, and continuing partial characters */ 1.891 + oldToULength=0; 1.892 + toULength=1; 1.893 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.894 + c=b; 1.895 +moreBytes: 1.896 + while(toULength<toULimit) { 1.897 + if(source<sourceLimit) { 1.898 + b=*source; 1.899 + if(U8_IS_TRAIL(b)) { 1.900 + ++source; 1.901 + ++toULength; 1.902 + c=(c<<6)+b; 1.903 + } else { 1.904 + break; /* sequence too short, stop with toULength<toULimit */ 1.905 + } 1.906 + } else { 1.907 + /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 1.908 + source-=(toULength-oldToULength); 1.909 + while(oldToULength<toULength) { 1.910 + utf8->toUBytes[oldToULength++]=*source++; 1.911 + } 1.912 + utf8->toUnicodeStatus=c; 1.913 + utf8->toULength=toULength; 1.914 + utf8->mode=toULimit; 1.915 + pToUArgs->source=(char *)source; 1.916 + pFromUArgs->target=(char *)target; 1.917 + return; 1.918 + } 1.919 + } 1.920 + 1.921 + if( toULength==toULimit && /* consumed all trail bytes */ 1.922 + (toULength==3 || toULength==2) && /* BMP */ 1.923 + (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 1.924 + (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 1.925 + ) { 1.926 + /* legal byte sequence for BMP code point */ 1.927 + } else if( 1.928 + toULength==toULimit && toULength==4 && 1.929 + (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 1.930 + ) { 1.931 + /* legal byte sequence for supplementary code point */ 1.932 + } else { 1.933 + /* error handling: illegal UTF-8 byte sequence */ 1.934 + source-=(toULength-oldToULength); 1.935 + while(oldToULength<toULength) { 1.936 + utf8->toUBytes[oldToULength++]=*source++; 1.937 + } 1.938 + utf8->toULength=toULength; 1.939 + pToUArgs->source=(char *)source; 1.940 + pFromUArgs->target=(char *)target; 1.941 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.942 + return; 1.943 + } 1.944 + 1.945 + /* copy the legal byte sequence to the target */ 1.946 + { 1.947 + int8_t i; 1.948 + 1.949 + for(i=0; i<oldToULength; ++i) { 1.950 + *target++=utf8->toUBytes[i]; 1.951 + } 1.952 + source-=(toULength-oldToULength); 1.953 + for(; i<toULength; ++i) { 1.954 + *target++=*source++; 1.955 + } 1.956 + count-=toULength; 1.957 + } 1.958 + } 1.959 + } 1.960 + 1.961 + if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { 1.962 + if(target==(const uint8_t *)pFromUArgs->targetLimit) { 1.963 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.964 + } else { 1.965 + b=*source; 1.966 + toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 1.967 + if(toULimit>(sourceLimit-source)) { 1.968 + /* collect a truncated byte sequence */ 1.969 + toULength=0; 1.970 + c=b; 1.971 + for(;;) { 1.972 + utf8->toUBytes[toULength++]=b; 1.973 + if(++source==sourceLimit) { 1.974 + /* partial byte sequence at end of source */ 1.975 + utf8->toUnicodeStatus=c; 1.976 + utf8->toULength=toULength; 1.977 + utf8->mode=toULimit; 1.978 + break; 1.979 + } else if(!U8_IS_TRAIL(b=*source)) { 1.980 + /* lead byte in trail byte position */ 1.981 + utf8->toULength=toULength; 1.982 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.983 + break; 1.984 + } 1.985 + c=(c<<6)+b; 1.986 + } 1.987 + } else { 1.988 + /* partial-sequence target overflow: fall back to the pivoting implementation */ 1.989 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.990 + } 1.991 + } 1.992 + } 1.993 + 1.994 + /* write back the updated pointers */ 1.995 + pToUArgs->source=(char *)source; 1.996 + pFromUArgs->target=(char *)target; 1.997 +} 1.998 + 1.999 +/* UTF-8 converter data ----------------------------------------------------- */ 1.1000 + 1.1001 +static const UConverterImpl _UTF8Impl={ 1.1002 + UCNV_UTF8, 1.1003 + 1.1004 + NULL, 1.1005 + NULL, 1.1006 + 1.1007 + NULL, 1.1008 + NULL, 1.1009 + NULL, 1.1010 + 1.1011 + ucnv_toUnicode_UTF8, 1.1012 + ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1.1013 + ucnv_fromUnicode_UTF8, 1.1014 + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1.1015 + ucnv_getNextUChar_UTF8, 1.1016 + 1.1017 + NULL, 1.1018 + NULL, 1.1019 + NULL, 1.1020 + NULL, 1.1021 + ucnv_getNonSurrogateUnicodeSet, 1.1022 + 1.1023 + ucnv_UTF8FromUTF8, 1.1024 + ucnv_UTF8FromUTF8 1.1025 +}; 1.1026 + 1.1027 +/* The 1208 CCSID refers to any version of Unicode of UTF-8 */ 1.1028 +static const UConverterStaticData _UTF8StaticData={ 1.1029 + sizeof(UConverterStaticData), 1.1030 + "UTF-8", 1.1031 + 1208, UCNV_IBM, UCNV_UTF8, 1.1032 + 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 1.1033 + { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1.1034 + 0, 1.1035 + 0, 1.1036 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1037 +}; 1.1038 + 1.1039 + 1.1040 +const UConverterSharedData _UTF8Data={ 1.1041 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.1042 + NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, 1.1043 + 0 1.1044 +}; 1.1045 + 1.1046 +/* CESU-8 converter data ---------------------------------------------------- */ 1.1047 + 1.1048 +static const UConverterImpl _CESU8Impl={ 1.1049 + UCNV_CESU8, 1.1050 + 1.1051 + NULL, 1.1052 + NULL, 1.1053 + 1.1054 + NULL, 1.1055 + NULL, 1.1056 + NULL, 1.1057 + 1.1058 + ucnv_toUnicode_UTF8, 1.1059 + ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1.1060 + ucnv_fromUnicode_UTF8, 1.1061 + ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1.1062 + NULL, 1.1063 + 1.1064 + NULL, 1.1065 + NULL, 1.1066 + NULL, 1.1067 + NULL, 1.1068 + ucnv_getCompleteUnicodeSet 1.1069 +}; 1.1070 + 1.1071 +static const UConverterStaticData _CESU8StaticData={ 1.1072 + sizeof(UConverterStaticData), 1.1073 + "CESU-8", 1.1074 + 9400, /* CCSID for CESU-8 */ 1.1075 + UCNV_UNKNOWN, UCNV_CESU8, 1, 3, 1.1076 + { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1.1077 + 0, 1.1078 + 0, 1.1079 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1080 +}; 1.1081 + 1.1082 + 1.1083 +const UConverterSharedData _CESU8Data={ 1.1084 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.1085 + NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, 1.1086 + 0 1.1087 +}; 1.1088 + 1.1089 +#endif