1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/toolkit/crashreporter/google-breakpad/src/common/convert_UTF.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,533 @@ 1.4 +/* 1.5 + * Copyright 2001-2004 Unicode, Inc. 1.6 + * 1.7 + * Disclaimer 1.8 + * 1.9 + * This source code is provided as is by Unicode, Inc. No claims are 1.10 + * made as to fitness for any particular purpose. No warranties of any 1.11 + * kind are expressed or implied. The recipient agrees to determine 1.12 + * applicability of information provided. If this file has been 1.13 + * purchased on magnetic or optical media from Unicode, Inc., the 1.14 + * sole remedy for any claim will be exchange of defective media 1.15 + * within 90 days of receipt. 1.16 + * 1.17 + * Limitations on Rights to Redistribute This Code 1.18 + * 1.19 + * Unicode, Inc. hereby grants the right to freely use the information 1.20 + * supplied in this file in the creation of products supporting the 1.21 + * Unicode Standard, and to make copies of this file in any form 1.22 + * for internal or external distribution as long as this notice 1.23 + * remains attached. 1.24 + */ 1.25 + 1.26 +/* --------------------------------------------------------------------- 1.27 + 1.28 +Conversions between UTF32, UTF-16, and UTF-8. Source code file. 1.29 +Author: Mark E. Davis, 1994. 1.30 +Rev History: Rick McGowan, fixes & updates May 2001. 1.31 +Sept 2001: fixed const & error conditions per 1.32 +mods suggested by S. Parent & A. Lillich. 1.33 +June 2002: Tim Dodd added detection and handling of incomplete 1.34 +source sequences, enhanced error detection, added casts 1.35 +to eliminate compiler warnings. 1.36 +July 2003: slight mods to back out aggressive FFFE detection. 1.37 +Jan 2004: updated switches in from-UTF8 conversions. 1.38 +Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 1.39 + 1.40 +See the header file "ConvertUTF.h" for complete documentation. 1.41 + 1.42 +------------------------------------------------------------------------ */ 1.43 + 1.44 + 1.45 +#include "convert_UTF.h" 1.46 +#ifdef CVTUTF_DEBUG 1.47 +#include <stdio.h> 1.48 +#endif 1.49 + 1.50 +static const int halfShift = 10; /* used for shifting by 10 bits */ 1.51 + 1.52 +static const UTF32 halfBase = 0x0010000UL; 1.53 +static const UTF32 halfMask = 0x3FFUL; 1.54 + 1.55 +#define UNI_SUR_HIGH_START (UTF32)0xD800 1.56 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF 1.57 +#define UNI_SUR_LOW_START (UTF32)0xDC00 1.58 +#define UNI_SUR_LOW_END (UTF32)0xDFFF 1.59 +#define false 0 1.60 +#define true 1 1.61 + 1.62 +/* --------------------------------------------------------------------- */ 1.63 + 1.64 +ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd, 1.65 + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 1.66 + ConversionResult result = conversionOK; 1.67 + const UTF32* source = *sourceStart; 1.68 + UTF16* target = *targetStart; 1.69 + while (source < sourceEnd) { 1.70 + UTF32 ch; 1.71 + if (target >= targetEnd) { 1.72 + result = targetExhausted; break; 1.73 + } 1.74 + ch = *source++; 1.75 + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 1.76 + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 1.77 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.78 + if (flags == strictConversion) { 1.79 + --source; /* return to the illegal value itself */ 1.80 + result = sourceIllegal; 1.81 + break; 1.82 + } else { 1.83 + *target++ = UNI_REPLACEMENT_CHAR; 1.84 + } 1.85 + } else { 1.86 + *target++ = (UTF16)ch; /* normal case */ 1.87 + } 1.88 + } else if (ch > UNI_MAX_LEGAL_UTF32) { 1.89 + if (flags == strictConversion) { 1.90 + result = sourceIllegal; 1.91 + } else { 1.92 + *target++ = UNI_REPLACEMENT_CHAR; 1.93 + } 1.94 + } else { 1.95 + /* target is a character in range 0xFFFF - 0x10FFFF. */ 1.96 + if (target + 1 >= targetEnd) { 1.97 + --source; /* Back up source pointer! */ 1.98 + result = targetExhausted; break; 1.99 + } 1.100 + ch -= halfBase; 1.101 + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 1.102 + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 1.103 + } 1.104 + } 1.105 +*sourceStart = source; 1.106 +*targetStart = target; 1.107 +return result; 1.108 +} 1.109 + 1.110 +/* --------------------------------------------------------------------- */ 1.111 + 1.112 +ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd, 1.113 + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 1.114 + ConversionResult result = conversionOK; 1.115 + const UTF16* source = *sourceStart; 1.116 + UTF32* target = *targetStart; 1.117 + UTF32 ch, ch2; 1.118 + while (source < sourceEnd) { 1.119 + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 1.120 + ch = *source++; 1.121 + /* If we have a surrogate pair, convert to UTF32 first. */ 1.122 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1.123 + /* If the 16 bits following the high surrogate are in the source buffer... */ 1.124 + if (source < sourceEnd) { 1.125 + ch2 = *source; 1.126 + /* If it's a low surrogate, convert to UTF32. */ 1.127 + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 1.128 + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1.129 + + (ch2 - UNI_SUR_LOW_START) + halfBase; 1.130 + ++source; 1.131 + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 1.132 + --source; /* return to the illegal value itself */ 1.133 + result = sourceIllegal; 1.134 + break; 1.135 + } 1.136 + } else { /* We don't have the 16 bits following the high surrogate. */ 1.137 + --source; /* return to the high surrogate */ 1.138 + result = sourceExhausted; 1.139 + break; 1.140 + } 1.141 + } else if (flags == strictConversion) { 1.142 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.143 + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 1.144 + --source; /* return to the illegal value itself */ 1.145 + result = sourceIllegal; 1.146 + break; 1.147 + } 1.148 + } 1.149 + if (target >= targetEnd) { 1.150 + source = oldSource; /* Back up source pointer! */ 1.151 + result = targetExhausted; break; 1.152 + } 1.153 + *target++ = ch; 1.154 + } 1.155 + *sourceStart = source; 1.156 + *targetStart = target; 1.157 +#ifdef CVTUTF_DEBUG 1.158 + if (result == sourceIllegal) { 1.159 + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 1.160 + fflush(stderr); 1.161 + } 1.162 +#endif 1.163 + return result; 1.164 +} 1.165 + 1.166 +/* --------------------------------------------------------------------- */ 1.167 + 1.168 +/* 1.169 + * Index into the table below with the first byte of a UTF-8 sequence to 1.170 + * get the number of trailing bytes that are supposed to follow it. 1.171 + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 1.172 + * left as-is for anyone who may want to do such conversion, which was 1.173 + * allowed in earlier algorithms. 1.174 + */ 1.175 +static const char trailingBytesForUTF8[256] = { 1.176 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.177 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.178 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.179 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.180 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.181 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1.182 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1.183 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 1.184 +}; 1.185 + 1.186 +/* 1.187 + * Magic values subtracted from a buffer value during UTF8 conversion. 1.188 + * This table contains as many values as there might be trailing bytes 1.189 + * in a UTF-8 sequence. 1.190 + */ 1.191 +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 1.192 + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 1.193 + 1.194 +/* 1.195 + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 1.196 + * into the first byte, depending on how many bytes follow. There are 1.197 + * as many entries in this table as there are UTF-8 sequence types. 1.198 + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 1.199 + * for *legal* UTF-8 will be 4 or fewer bytes total. 1.200 + */ 1.201 +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 1.202 + 1.203 +/* --------------------------------------------------------------------- */ 1.204 + 1.205 +/* The interface converts a whole buffer to avoid function-call overhead. 1.206 +* Constants have been gathered. Loops & conditionals have been removed as 1.207 +* much as possible for efficiency, in favor of drop-through switches. 1.208 +* (See "Note A" at the bottom of the file for equivalent code.) 1.209 +* If your compiler supports it, the "isLegalUTF8" call can be turned 1.210 +* into an inline function. 1.211 +*/ 1.212 + 1.213 +/* --------------------------------------------------------------------- */ 1.214 + 1.215 +ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd, 1.216 + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 1.217 + ConversionResult result = conversionOK; 1.218 + const UTF16* source = *sourceStart; 1.219 + UTF8* target = *targetStart; 1.220 + while (source < sourceEnd) { 1.221 + UTF32 ch; 1.222 + unsigned short bytesToWrite = 0; 1.223 + const UTF32 byteMask = 0xBF; 1.224 + const UTF32 byteMark = 0x80; 1.225 + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 1.226 + ch = *source++; 1.227 + /* If we have a surrogate pair, convert to UTF32 first. */ 1.228 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 1.229 + /* If the 16 bits following the high surrogate are in the source buffer... */ 1.230 + if (source < sourceEnd) { 1.231 + UTF32 ch2 = *source; 1.232 + /* If it's a low surrogate, convert to UTF32. */ 1.233 + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 1.234 + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 1.235 + + (ch2 - UNI_SUR_LOW_START) + halfBase; 1.236 + ++source; 1.237 + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 1.238 + --source; /* return to the illegal value itself */ 1.239 + result = sourceIllegal; 1.240 + break; 1.241 + } 1.242 + } else { /* We don't have the 16 bits following the high surrogate. */ 1.243 + --source; /* return to the high surrogate */ 1.244 + result = sourceExhausted; 1.245 + break; 1.246 + } 1.247 + } else if (flags == strictConversion) { 1.248 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.249 + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 1.250 + --source; /* return to the illegal value itself */ 1.251 + result = sourceIllegal; 1.252 + break; 1.253 + } 1.254 + } 1.255 + /* Figure out how many bytes the result will require */ 1.256 + if (ch < (UTF32)0x80) { bytesToWrite = 1; 1.257 + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 1.258 + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 1.259 + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 1.260 + } else { bytesToWrite = 3; 1.261 + ch = UNI_REPLACEMENT_CHAR; 1.262 + } 1.263 + 1.264 + target += bytesToWrite; 1.265 + if (target > targetEnd) { 1.266 + source = oldSource; /* Back up source pointer! */ 1.267 + target -= bytesToWrite; result = targetExhausted; break; 1.268 + } 1.269 + switch (bytesToWrite) { /* note: everything falls through. */ 1.270 + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.271 + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.272 + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.273 + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 1.274 + } 1.275 + target += bytesToWrite; 1.276 + } 1.277 +*sourceStart = source; 1.278 +*targetStart = target; 1.279 +return result; 1.280 +} 1.281 + 1.282 +/* --------------------------------------------------------------------- */ 1.283 + 1.284 +/* 1.285 + * Utility routine to tell whether a sequence of bytes is legal UTF-8. 1.286 + * This must be called with the length pre-determined by the first byte. 1.287 + * If not calling this from ConvertUTF8to*, then the length can be set by: 1.288 + * length = trailingBytesForUTF8[*source]+1; 1.289 + * and the sequence is illegal right away if there aren't that many bytes 1.290 + * available. 1.291 + * If presented with a length > 4, this returns false. The Unicode 1.292 + * definition of UTF-8 goes up to 4-byte sequences. 1.293 + */ 1.294 + 1.295 +static Boolean isLegalUTF8(const UTF8 *source, int length) { 1.296 + UTF8 a; 1.297 + const UTF8 *srcptr = source+length; 1.298 + switch (length) { 1.299 + default: return false; 1.300 + /* Everything else falls through when "true"... */ 1.301 + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1.302 + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1.303 + case 2: if ((a = (*--srcptr)) > 0xBF) return false; 1.304 + 1.305 + switch (*source) { 1.306 + /* no fall-through in this inner switch */ 1.307 + case 0xE0: if (a < 0xA0) return false; break; 1.308 + case 0xED: if (a > 0x9F) return false; break; 1.309 + case 0xF0: if (a < 0x90) return false; break; 1.310 + case 0xF4: if (a > 0x8F) return false; break; 1.311 + default: if (a < 0x80) return false; 1.312 + } 1.313 + 1.314 + case 1: if (*source >= 0x80 && *source < 0xC2) return false; 1.315 + } 1.316 + if (*source > 0xF4) return false; 1.317 + return true; 1.318 +} 1.319 + 1.320 +/* --------------------------------------------------------------------- */ 1.321 + 1.322 +/* 1.323 + * Exported function to return whether a UTF-8 sequence is legal or not. 1.324 + * This is not used here; it's just exported. 1.325 + */ 1.326 +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 1.327 + int length = trailingBytesForUTF8[*source]+1; 1.328 + if (source+length > sourceEnd) { 1.329 + return false; 1.330 + } 1.331 + return isLegalUTF8(source, length); 1.332 +} 1.333 + 1.334 +/* --------------------------------------------------------------------- */ 1.335 + 1.336 +ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, 1.337 + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 1.338 + ConversionResult result = conversionOK; 1.339 + const UTF8* source = *sourceStart; 1.340 + UTF16* target = *targetStart; 1.341 + while (source < sourceEnd) { 1.342 + UTF32 ch = 0; 1.343 + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 1.344 + if (source + extraBytesToRead >= sourceEnd) { 1.345 + result = sourceExhausted; break; 1.346 + } 1.347 + /* Do this check whether lenient or strict */ 1.348 + if (! isLegalUTF8(source, extraBytesToRead+1)) { 1.349 + result = sourceIllegal; 1.350 + break; 1.351 + } 1.352 + /* 1.353 + * The cases all fall through. See "Note A" below. 1.354 + */ 1.355 + switch (extraBytesToRead) { 1.356 + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 1.357 + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 1.358 + case 3: ch += *source++; ch <<= 6; 1.359 + case 2: ch += *source++; ch <<= 6; 1.360 + case 1: ch += *source++; ch <<= 6; 1.361 + case 0: ch += *source++; 1.362 + } 1.363 + ch -= offsetsFromUTF8[extraBytesToRead]; 1.364 + 1.365 + if (target >= targetEnd) { 1.366 + source -= (extraBytesToRead+1); /* Back up source pointer! */ 1.367 + result = targetExhausted; break; 1.368 + } 1.369 + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 1.370 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.371 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.372 + if (flags == strictConversion) { 1.373 + source -= (extraBytesToRead+1); /* return to the illegal value itself */ 1.374 + result = sourceIllegal; 1.375 + break; 1.376 + } else { 1.377 + *target++ = UNI_REPLACEMENT_CHAR; 1.378 + } 1.379 + } else { 1.380 + *target++ = (UTF16)ch; /* normal case */ 1.381 + } 1.382 + } else if (ch > UNI_MAX_UTF16) { 1.383 + if (flags == strictConversion) { 1.384 + result = sourceIllegal; 1.385 + source -= (extraBytesToRead+1); /* return to the start */ 1.386 + break; /* Bail out; shouldn't continue */ 1.387 + } else { 1.388 + *target++ = UNI_REPLACEMENT_CHAR; 1.389 + } 1.390 + } else { 1.391 + /* target is a character in range 0xFFFF - 0x10FFFF. */ 1.392 + if (target + 1 >= targetEnd) { 1.393 + source -= (extraBytesToRead+1); /* Back up source pointer! */ 1.394 + result = targetExhausted; break; 1.395 + } 1.396 + ch -= halfBase; 1.397 + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 1.398 + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 1.399 + } 1.400 + } 1.401 +*sourceStart = source; 1.402 +*targetStart = target; 1.403 +return result; 1.404 +} 1.405 + 1.406 +/* --------------------------------------------------------------------- */ 1.407 + 1.408 +ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd, 1.409 + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 1.410 + ConversionResult result = conversionOK; 1.411 + const UTF32* source = *sourceStart; 1.412 + UTF8* target = *targetStart; 1.413 + while (source < sourceEnd) { 1.414 + UTF32 ch; 1.415 + unsigned short bytesToWrite = 0; 1.416 + const UTF32 byteMask = 0xBF; 1.417 + const UTF32 byteMark = 0x80; 1.418 + ch = *source++; 1.419 + if (flags == strictConversion ) { 1.420 + /* UTF-16 surrogate values are illegal in UTF-32 */ 1.421 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.422 + --source; /* return to the illegal value itself */ 1.423 + result = sourceIllegal; 1.424 + break; 1.425 + } 1.426 + } 1.427 + /* 1.428 + * Figure out how many bytes the result will require. Turn any 1.429 + * illegally large UTF32 things (> Plane 17) into replacement chars. 1.430 + */ 1.431 + if (ch < (UTF32)0x80) { bytesToWrite = 1; 1.432 + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 1.433 + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 1.434 + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 1.435 + } else { bytesToWrite = 3; 1.436 + ch = UNI_REPLACEMENT_CHAR; 1.437 + result = sourceIllegal; 1.438 + } 1.439 + 1.440 + target += bytesToWrite; 1.441 + if (target > targetEnd) { 1.442 + --source; /* Back up source pointer! */ 1.443 + target -= bytesToWrite; result = targetExhausted; break; 1.444 + } 1.445 + switch (bytesToWrite) { /* note: everything falls through. */ 1.446 + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.447 + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.448 + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 1.449 + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 1.450 + } 1.451 + target += bytesToWrite; 1.452 + } 1.453 +*sourceStart = source; 1.454 +*targetStart = target; 1.455 +return result; 1.456 +} 1.457 + 1.458 +/* --------------------------------------------------------------------- */ 1.459 + 1.460 +ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd, 1.461 + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 1.462 + ConversionResult result = conversionOK; 1.463 + const UTF8* source = *sourceStart; 1.464 + UTF32* target = *targetStart; 1.465 + while (source < sourceEnd) { 1.466 + UTF32 ch = 0; 1.467 + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 1.468 + if (source + extraBytesToRead >= sourceEnd) { 1.469 + result = sourceExhausted; break; 1.470 + } 1.471 + /* Do this check whether lenient or strict */ 1.472 + if (! isLegalUTF8(source, extraBytesToRead+1)) { 1.473 + result = sourceIllegal; 1.474 + break; 1.475 + } 1.476 + /* 1.477 + * The cases all fall through. See "Note A" below. 1.478 + */ 1.479 + switch (extraBytesToRead) { 1.480 + case 5: ch += *source++; ch <<= 6; 1.481 + case 4: ch += *source++; ch <<= 6; 1.482 + case 3: ch += *source++; ch <<= 6; 1.483 + case 2: ch += *source++; ch <<= 6; 1.484 + case 1: ch += *source++; ch <<= 6; 1.485 + case 0: ch += *source++; 1.486 + } 1.487 + ch -= offsetsFromUTF8[extraBytesToRead]; 1.488 + 1.489 + if (target >= targetEnd) { 1.490 + source -= (extraBytesToRead+1); /* Back up the source pointer! */ 1.491 + result = targetExhausted; break; 1.492 + } 1.493 + if (ch <= UNI_MAX_LEGAL_UTF32) { 1.494 + /* 1.495 + * UTF-16 surrogate values are illegal in UTF-32, and anything 1.496 + * over Plane 17 (> 0x10FFFF) is illegal. 1.497 + */ 1.498 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 1.499 + if (flags == strictConversion) { 1.500 + source -= (extraBytesToRead+1); /* return to the illegal value itself */ 1.501 + result = sourceIllegal; 1.502 + break; 1.503 + } else { 1.504 + *target++ = UNI_REPLACEMENT_CHAR; 1.505 + } 1.506 + } else { 1.507 + *target++ = ch; 1.508 + } 1.509 + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 1.510 + result = sourceIllegal; 1.511 + *target++ = UNI_REPLACEMENT_CHAR; 1.512 + } 1.513 + } 1.514 + *sourceStart = source; 1.515 + *targetStart = target; 1.516 + return result; 1.517 +} 1.518 + 1.519 +/* --------------------------------------------------------------------- 1.520 + 1.521 +Note A. 1.522 +The fall-through switches in UTF-8 reading code save a 1.523 +temp variable, some decrements & conditionals. The switches 1.524 +are equivalent to the following loop: 1.525 +{ 1.526 + int tmpBytesToRead = extraBytesToRead+1; 1.527 + do { 1.528 + ch += *source++; 1.529 + --tmpBytesToRead; 1.530 + if (tmpBytesToRead) ch <<= 6; 1.531 + } while (tmpBytesToRead > 0); 1.532 +} 1.533 +In UTF-8 writing code, the switches on "bytesToWrite" are 1.534 +similarly unrolled loops. 1.535 + 1.536 +--------------------------------------------------------------------- */