Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2002-2012, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: ucnv_u8.c |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2002jul01 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | * |
michael@0 | 14 | * UTF-8 converter implementation. Used to be in ucnv_utf.c. |
michael@0 | 15 | * |
michael@0 | 16 | * Also, CESU-8 implementation, see UTR 26. |
michael@0 | 17 | * The CESU-8 converter uses all the same functions as the |
michael@0 | 18 | * UTF-8 converter, with a branch for converting supplementary code points. |
michael@0 | 19 | */ |
michael@0 | 20 | |
michael@0 | 21 | #include "unicode/utypes.h" |
michael@0 | 22 | |
michael@0 | 23 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 24 | |
michael@0 | 25 | #include "unicode/ucnv.h" |
michael@0 | 26 | #include "unicode/utf.h" |
michael@0 | 27 | #include "unicode/utf8.h" |
michael@0 | 28 | #include "unicode/utf16.h" |
michael@0 | 29 | #include "ucnv_bld.h" |
michael@0 | 30 | #include "ucnv_cnv.h" |
michael@0 | 31 | #include "cmemory.h" |
michael@0 | 32 | |
michael@0 | 33 | /* Prototypes --------------------------------------------------------------- */ |
michael@0 | 34 | |
michael@0 | 35 | /* Keep these here to make finicky compilers happy */ |
michael@0 | 36 | |
/*
 * Forward declarations of the two UTF-16 -> UTF-8/CESU-8 entry points that are
 * defined further down in this file. Both are declared U_CFUNC rather than
 * static, unlike the to-Unicode functions below.
 */
michael@0 | 37 | U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, |
michael@0 | 38 | UErrorCode *err); |
michael@0 | 39 | U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, |
michael@0 | 40 | UErrorCode *err); |
michael@0 | 41 | |
michael@0 | 42 | |
michael@0 | 43 | /* UTF-8 -------------------------------------------------------------------- */ |
michael@0 | 44 | |
michael@0 | 45 | /* UTF-8 Conversion DATA |
michael@0 | 46 | * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 |
michael@0 | 47 | */ |
michael@0 | 48 | /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ |
/* Largest value written as a single UTF-16 code unit; larger values are split into a surrogate pair. */
michael@0 | 49 | #define MAXIMUM_UCS2 0x0000FFFF |
/* Largest code point accepted by the validity checks in this file (U+10FFFF). */
michael@0 | 50 | #define MAXIMUM_UTF 0x0010FFFF |
/* Not referenced in the visible part of this file. */
michael@0 | 51 | #define MAXIMUM_UCS4 0x7FFFFFFF |
/*
 * Surrogate-pair arithmetic: a supplementary code point has HALF_BASE subtracted,
 * its upper bits (>> HALF_SHIFT) are added to SURROGATE_HIGH_START and its low
 * HALF_MASK bits are added to SURROGATE_LOW_START.
 */
michael@0 | 52 | #define HALF_SHIFT 10 |
michael@0 | 53 | #define HALF_BASE 0x0010000 |
michael@0 | 54 | #define HALF_MASK 0x3FF |
/* UTF-16 surrogate ranges; the *_END values are not referenced in the visible part of this file. */
michael@0 | 55 | #define SURROGATE_HIGH_START 0xD800 |
michael@0 | 56 | #define SURROGATE_HIGH_END 0xDBFF |
michael@0 | 57 | #define SURROGATE_LOW_START 0xDC00 |
michael@0 | 58 | #define SURROGATE_LOW_END 0xDFFF |
michael@0 | 59 | |
/* 0x10000 - 0xDC00 = 0x2400 = 9216; not referenced in the visible part of this file. */
michael@0 | 60 | /* -SURROGATE_LOW_START + HALF_BASE */ |
michael@0 | 61 | #define SURROGATE_LOW_BASE 9216 |
michael@0 | 62 | |
/*
 * Indexed by the sequence length (1..6; index 0 is a placeholder).
 * The decoders accumulate ((lead << 6) + trail) << 6 + trail ... without masking
 * off the lead/trail marker bits, then subtract the entry for that length to
 * remove them. E.g. for 2 bytes: (0xC0 << 6) + 0x80 = 0x3080.
 * The 5- and 6-byte entries rely on unsigned 32-bit wrap-around.
 */
michael@0 | 63 | static const uint32_t offsetsFromUTF8[7] = {0, |
michael@0 | 64 | (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, |
michael@0 | 65 | (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 |
michael@0 | 66 | }; |
michael@0 | 67 | |
michael@0 | 68 | /* END OF UTF-8 Conversion DATA */ |
michael@0 | 69 | |
/*
 * Sequence length implied by a first byte, indexed by that byte's value:
 *   0x00..0x7F -> 1, 0x80..0xBF -> 0 (trail bytes cannot start a sequence),
 *   0xC0..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF7 -> 4,
 *   0xF8..0xFB -> 5, 0xFC..0xFD -> 6, 0xFE..0xFF -> 0.
 * A value of 0 makes the decoders report the byte as illegal. Lengths 5 and 6
 * are only used to consume the bytes; such sequences are rejected later through
 * utf8_minChar32[].
 */
michael@0 | 70 | static const int8_t bytesFromUTF8[256] = { |
michael@0 | 71 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
michael@0 | 72 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
michael@0 | 73 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
michael@0 | 74 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
michael@0 | 75 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
michael@0 | 76 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
michael@0 | 77 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
michael@0 | 78 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
michael@0 | 79 | }; |
michael@0 | 80 | |
michael@0 | 81 | /* |
michael@0 | 82 | * Starting with Unicode 3.0.1: |
michael@0 | 83 | * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; |
michael@0 | 84 | * byte sequences with more than 4 bytes are illegal in UTF-8, |
michael@0 | 85 | * which is tested with impossible values for them |
michael@0 | 86 | */ |
/*
 * Indexed by sequence length 0..6. The decoders require both
 * ch <= MAXIMUM_UTF and ch >= utf8_minChar32[N]; with 0xffffffff in slots 5 and 6
 * the two conditions can never hold together, so 5- and 6-byte forms always fail.
 * Overlong encodings (e.g. lead bytes 0xC0/0xC1) fail the same lower-bound test.
 */
michael@0 | 87 | static const uint32_t |
michael@0 | 88 | utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; |
michael@0 | 89 | |
/*
 * Streaming UTF-8 / CESU-8 -> UTF-16 conversion (variant that writes no offsets).
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit; both args->source and args->target are
 * updated on return. CESU-8 mode is selected when the converter's sharedData is
 * &_CESU8Data.
 *
 * State kept in the converter between calls:
 *  - toUnicodeStatus: partially accumulated value of an incomplete sequence
 *    (non-zero means "resume"), mode: expected sequence length,
 *    toULength: number of bytes already consumed, toUBytes[]: those bytes.
 *  - UCharErrorBuffer / UCharErrorBufferLength: a trail surrogate that did not
 *    fit into the target.
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_CHAR_FOUND: malformed, overlong, out-of-range or (non-CESU-8)
 *    surrogate sequence; toULength holds the number of bytes stored in toUBytes.
 *  - U_BUFFER_OVERFLOW_ERROR: target is full while input remains, or a trail
 *    surrogate had to be placed in UCharErrorBuffer.
 */
michael@0 | 90 | static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, |
michael@0 | 91 | UErrorCode * err) |
michael@0 | 92 | { |
michael@0 | 93 | UConverter *cnv = args->converter; |
michael@0 | 94 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 95 | UChar *myTarget = args->target; |
michael@0 | 96 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 97 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 98 | unsigned char *toUBytes = cnv->toUBytes; |
michael@0 | 99 | UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); |
michael@0 | 100 | uint32_t ch, ch2 = 0; |
michael@0 | 101 | int32_t i, inBytes; |
michael@0 | 102 | |
/* A non-zero toUnicodeStatus means the previous call ended in the middle of a
 * multi-byte sequence; reload the saved state and jump into the trail-byte loop. */
michael@0 | 103 | /* Restore size of current sequence */ |
michael@0 | 104 | if (cnv->toUnicodeStatus && myTarget < targetLimit) |
michael@0 | 105 | { |
michael@0 | 106 | inBytes = cnv->mode; /* restore # of bytes to consume */ |
michael@0 | 107 | i = cnv->toULength; /* restore # of bytes consumed */ |
michael@0 | 108 | cnv->toULength = 0; |
michael@0 | 109 | |
michael@0 | 110 | ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ |
michael@0 | 111 | cnv->toUnicodeStatus = 0; |
michael@0 | 112 | goto morebytes; |
michael@0 | 113 | } |
michael@0 | 114 | |
michael@0 | 115 | |
michael@0 | 116 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 117 | { |
michael@0 | 118 | ch = *(mySource++); |
michael@0 | 119 | if (ch < 0x80) /* Simple case */ |
michael@0 | 120 | { |
michael@0 | 121 | *(myTarget++) = (UChar) ch; |
michael@0 | 122 | } |
michael@0 | 123 | else |
michael@0 | 124 | { |
michael@0 | 125 | /* store the first char */ |
michael@0 | 126 | toUBytes[0] = (char)ch; |
/* inBytes is 0 for bytes that cannot start a sequence; the loop below is then
 * skipped and the i == inBytes test fails, reporting the byte as illegal. */
michael@0 | 127 | inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ |
michael@0 | 128 | i = 1; |
michael@0 | 129 | |
michael@0 | 130 | morebytes: |
michael@0 | 131 | while (i < inBytes) |
michael@0 | 132 | { |
michael@0 | 133 | if (mySource < sourceLimit) |
michael@0 | 134 | { |
/* The byte is recorded in toUBytes before it is validated; a non-trail byte is
 * not consumed (mySource is not advanced) and is not counted in i. */
michael@0 | 135 | toUBytes[i] = (char) (ch2 = *mySource); |
michael@0 | 136 | if (!U8_IS_TRAIL(ch2)) |
michael@0 | 137 | { |
michael@0 | 138 | break; /* i < inBytes */ |
michael@0 | 139 | } |
michael@0 | 140 | ch = (ch << 6) + ch2; |
michael@0 | 141 | ++mySource; |
michael@0 | 142 | i++; |
michael@0 | 143 | } |
michael@0 | 144 | else |
michael@0 | 145 | { |
michael@0 | 146 | /* stores a partially calculated target*/ |
michael@0 | 147 | cnv->toUnicodeStatus = ch; |
michael@0 | 148 | cnv->mode = inBytes; |
michael@0 | 149 | cnv->toULength = (int8_t) i; |
michael@0 | 150 | goto donefornow; |
michael@0 | 151 | } |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | /* Remove the accumulated high bits */ |
michael@0 | 155 | ch -= offsetsFromUTF8[inBytes]; |
michael@0 | 156 | |
michael@0 | 157 | /* |
michael@0 | 158 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
michael@0 | 159 | * - use only trail bytes after a lead byte (checked above) |
michael@0 | 160 | * - use the right number of trail bytes for a given lead byte |
michael@0 | 161 | * - encode a code point <= U+10ffff |
michael@0 | 162 | * - use the fewest possible number of bytes for their code points |
michael@0 | 163 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
michael@0 | 164 | * |
michael@0 | 165 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
michael@0 | 166 | * There are no irregular sequences any more. |
michael@0 | 167 | * In CESU-8, only surrogates, not supplementary code points, are encoded directly. |
michael@0 | 168 | */ |
/* Last clause: CESU-8 accepts surrogates but rejects sequences longer than 3
 * bytes; plain UTF-8 accepts 4-byte sequences but rejects surrogates. */
michael@0 | 169 | if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && |
michael@0 | 170 | (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) |
michael@0 | 171 | { |
michael@0 | 172 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 173 | if (ch <= MAXIMUM_UCS2) |
michael@0 | 174 | { |
michael@0 | 175 | /* fits in 16 bits */ |
michael@0 | 176 | *(myTarget++) = (UChar) ch; |
michael@0 | 177 | } |
michael@0 | 178 | else |
michael@0 | 179 | { |
michael@0 | 180 | /* write out the surrogates */ |
michael@0 | 181 | ch -= HALF_BASE; |
michael@0 | 182 | *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); |
michael@0 | 183 | ch = (ch & HALF_MASK) + SURROGATE_LOW_START; |
michael@0 | 184 | if (myTarget < targetLimit) |
michael@0 | 185 | { |
michael@0 | 186 | *(myTarget++) = (UChar)ch; |
michael@0 | 187 | } |
michael@0 | 188 | else |
michael@0 | 189 | { |
michael@0 | 190 | /* Put in overflow buffer (not handled here) */ |
michael@0 | 191 | cnv->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 192 | cnv->UCharErrorBufferLength = 1; |
michael@0 | 193 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 194 | break; |
michael@0 | 195 | } |
michael@0 | 196 | } |
michael@0 | 197 | } |
michael@0 | 198 | else |
michael@0 | 199 | { |
/* toULength tells the caller how many bytes of the bad sequence are in toUBytes. */
michael@0 | 200 | cnv->toULength = (int8_t)i; |
michael@0 | 201 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 202 | break; |
michael@0 | 203 | } |
michael@0 | 204 | } |
michael@0 | 205 | } |
michael@0 | 206 | |
michael@0 | 207 | donefornow: |
/* Only report overflow when no other error has been set and input remains. */
michael@0 | 208 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 209 | { |
michael@0 | 210 | /* End of target buffer */ |
michael@0 | 211 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 212 | } |
michael@0 | 213 | |
michael@0 | 214 | args->target = myTarget; |
michael@0 | 215 | args->source = (const char *) mySource; |
michael@0 | 216 | } |
michael@0 | 217 | |
/*
 * Streaming UTF-8 / CESU-8 -> UTF-16 conversion that also fills args->offsets.
 *
 * Same decoding and validation as ucnv_toUnicode_UTF8, plus: for every UChar
 * written, one int32_t is written through args->offsets holding offsetNum, the
 * running count of source bytes of the sequences converted so far in this call
 * (i.e. the index of the sequence's first byte). Both halves of a surrogate pair
 * get the same offset. args->target, args->source and args->offsets are updated
 * on return.
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_CHAR_FOUND with cnv->toULength bytes stored in cnv->toUBytes.
 *  - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains, or a
 *    trail surrogate had to be placed in cnv->UCharErrorBuffer.
 *
 * NOTE(review): when resuming a sequence saved by a previous call, offsetNum
 * starts at 0 and is later advanced by i, which includes the bytes consumed in
 * the previous call - confirm that the resulting offsets are what callers expect.
 */
michael@0 | 218 | static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, |
michael@0 | 219 | UErrorCode * err) |
michael@0 | 220 | { |
michael@0 | 221 | UConverter *cnv = args->converter; |
michael@0 | 222 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 223 | UChar *myTarget = args->target; |
michael@0 | 224 | int32_t *myOffsets = args->offsets; |
michael@0 | 225 | int32_t offsetNum = 0; |
michael@0 | 226 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 227 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 228 | unsigned char *toUBytes = cnv->toUBytes; |
michael@0 | 229 | UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); |
michael@0 | 230 | uint32_t ch, ch2 = 0; |
michael@0 | 231 | int32_t i, inBytes; |
michael@0 | 232 | |
/* A non-zero toUnicodeStatus means the previous call ended in the middle of a
 * multi-byte sequence; reload the saved state and jump into the trail-byte loop. */
michael@0 | 233 | /* Restore size of current sequence */ |
michael@0 | 234 | if (cnv->toUnicodeStatus && myTarget < targetLimit) |
michael@0 | 235 | { |
michael@0 | 236 | inBytes = cnv->mode; /* restore # of bytes to consume */ |
michael@0 | 237 | i = cnv->toULength; /* restore # of bytes consumed */ |
michael@0 | 238 | cnv->toULength = 0; |
michael@0 | 239 | |
michael@0 | 240 | ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ |
michael@0 | 241 | cnv->toUnicodeStatus = 0; |
michael@0 | 242 | goto morebytes; |
michael@0 | 243 | } |
michael@0 | 244 | |
michael@0 | 245 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 246 | { |
michael@0 | 247 | ch = *(mySource++); |
michael@0 | 248 | if (ch < 0x80) /* Simple case */ |
michael@0 | 249 | { |
michael@0 | 250 | *(myTarget++) = (UChar) ch; |
michael@0 | 251 | *(myOffsets++) = offsetNum++; |
michael@0 | 252 | } |
michael@0 | 253 | else |
michael@0 | 254 | { |
/* Remember the lead byte and look up the sequence length it implies
 * (0 for bytes that cannot start a sequence). */
michael@0 | 255 | toUBytes[0] = (char)ch; |
michael@0 | 256 | inBytes = bytesFromUTF8[ch]; |
michael@0 | 257 | i = 1; |
michael@0 | 258 | |
michael@0 | 259 | morebytes: |
michael@0 | 260 | while (i < inBytes) |
michael@0 | 261 | { |
michael@0 | 262 | if (mySource < sourceLimit) |
michael@0 | 263 | { |
/* The byte is recorded in toUBytes before it is validated; a non-trail byte is
 * not consumed (mySource is not advanced) and is not counted in i. */
michael@0 | 264 | toUBytes[i] = (char) (ch2 = *mySource); |
michael@0 | 265 | if (!U8_IS_TRAIL(ch2)) |
michael@0 | 266 | { |
michael@0 | 267 | break; /* i < inBytes */ |
michael@0 | 268 | } |
michael@0 | 269 | ch = (ch << 6) + ch2; |
michael@0 | 270 | ++mySource; |
michael@0 | 271 | i++; |
michael@0 | 272 | } |
michael@0 | 273 | else |
michael@0 | 274 | { |
/* Input exhausted mid-sequence: save the partial state for the next call. */
michael@0 | 275 | cnv->toUnicodeStatus = ch; |
michael@0 | 276 | cnv->mode = inBytes; |
michael@0 | 277 | cnv->toULength = (int8_t)i; |
michael@0 | 278 | goto donefornow; |
michael@0 | 279 | } |
michael@0 | 280 | } |
michael@0 | 281 | |
michael@0 | 282 | /* Remove the accumulated high bits */ |
michael@0 | 283 | ch -= offsetsFromUTF8[inBytes]; |
michael@0 | 284 | |
michael@0 | 285 | /* |
michael@0 | 286 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
michael@0 | 287 | * - use only trail bytes after a lead byte (checked above) |
michael@0 | 288 | * - use the right number of trail bytes for a given lead byte |
michael@0 | 289 | * - encode a code point <= U+10ffff |
michael@0 | 290 | * - use the fewest possible number of bytes for their code points |
michael@0 | 291 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
michael@0 | 292 | * |
michael@0 | 293 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
michael@0 | 294 | * There are no irregular sequences any more. |
michael@0 | 295 | * In CESU-8, only surrogates, not supplementary code points, are encoded directly. |
michael@0 | 296 | */ |
/* Last clause: CESU-8 accepts surrogates but rejects sequences longer than 3
 * bytes; plain UTF-8 accepts 4-byte sequences but rejects surrogates. */
michael@0 | 297 | if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && |
michael@0 | 298 | (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) |
michael@0 | 299 | { |
michael@0 | 300 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 301 | if (ch <= MAXIMUM_UCS2) |
michael@0 | 302 | { |
michael@0 | 303 | /* fits in 16 bits */ |
michael@0 | 304 | *(myTarget++) = (UChar) ch; |
michael@0 | 305 | *(myOffsets++) = offsetNum; |
michael@0 | 306 | } |
michael@0 | 307 | else |
michael@0 | 308 | { |
michael@0 | 309 | /* write out the surrogates */ |
michael@0 | 310 | ch -= HALF_BASE; |
michael@0 | 311 | *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); |
michael@0 | 312 | *(myOffsets++) = offsetNum; |
michael@0 | 313 | ch = (ch & HALF_MASK) + SURROGATE_LOW_START; |
michael@0 | 314 | if (myTarget < targetLimit) |
michael@0 | 315 | { |
michael@0 | 316 | *(myTarget++) = (UChar)ch; |
michael@0 | 317 | *(myOffsets++) = offsetNum; |
michael@0 | 318 | } |
michael@0 | 319 | else |
michael@0 | 320 | { |
/* No break here (unlike the non-offsets variant): the target is full at this
 * point, so the enclosing while condition ends the loop on its next test. */
michael@0 | 321 | cnv->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 322 | cnv->UCharErrorBufferLength = 1; |
michael@0 | 323 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 324 | } |
michael@0 | 325 | } |
/* Advance the running source index past the sequence just converted. */
michael@0 | 326 | offsetNum += i; |
michael@0 | 327 | } |
michael@0 | 328 | else |
michael@0 | 329 | { |
michael@0 | 330 | cnv->toULength = (int8_t)i; |
michael@0 | 331 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 332 | break; |
michael@0 | 333 | } |
michael@0 | 334 | } |
michael@0 | 335 | } |
michael@0 | 336 | |
michael@0 | 337 | donefornow: |
michael@0 | 338 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 339 | { /* End of target buffer */ |
michael@0 | 340 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 341 | } |
michael@0 | 342 | |
michael@0 | 343 | args->target = myTarget; |
michael@0 | 344 | args->source = (const char *) mySource; |
michael@0 | 345 | args->offsets = myOffsets; |
michael@0 | 346 | } |
michael@0 | 347 | |
/*
 * Streaming UTF-16 -> UTF-8 / CESU-8 conversion (variant that writes no offsets).
 *
 * Reads UChars from args->source up to args->sourceLimit and writes bytes to
 * args->target up to args->targetLimit; both pointers are updated on return.
 *
 * Surrogates: unless the converter's sharedData is &_CESU8Data, a lead surrogate
 * followed by a trail surrogate is combined into one supplementary code point
 * and written as 4 bytes. In CESU-8 mode the pairing branch is skipped, so each
 * surrogate code unit is written on its own as a 3-byte sequence.
 *
 * State kept in the converter between calls:
 *  - fromUChar32: a surrogate whose partner has not been seen because the input
 *    ended (non-zero means "resume"), or the offending unit on an error.
 *  - charErrorBuffer / charErrorBufferLength: output bytes that did not fit.
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_CHAR_FOUND: unpaired lead or trail surrogate (non-CESU-8 only).
 *  - U_BUFFER_OVERFLOW_ERROR: target full while input remains, or bytes spilled
 *    into charErrorBuffer.
 */
michael@0 | 348 | U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, |
michael@0 | 349 | UErrorCode * err) |
michael@0 | 350 | { |
michael@0 | 351 | UConverter *cnv = args->converter; |
michael@0 | 352 | const UChar *mySource = args->source; |
michael@0 | 353 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 354 | uint8_t *myTarget = (uint8_t *) args->target; |
michael@0 | 355 | const uint8_t *targetLimit = (uint8_t *) args->targetLimit; |
michael@0 | 356 | uint8_t *tempPtr; |
michael@0 | 357 | UChar32 ch; |
michael@0 | 358 | uint8_t tempBuf[4]; |
michael@0 | 359 | int32_t indexToWrite; |
michael@0 | 360 | UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); |
michael@0 | 361 | |
/* Resume with the surrogate saved by a previous call and try to pair it with
 * the first unit of the new input. */
michael@0 | 362 | if (cnv->fromUChar32 && myTarget < targetLimit) |
michael@0 | 363 | { |
michael@0 | 364 | ch = cnv->fromUChar32; |
michael@0 | 365 | cnv->fromUChar32 = 0; |
michael@0 | 366 | goto lowsurrogate; |
michael@0 | 367 | } |
michael@0 | 368 | |
michael@0 | 369 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 370 | { |
michael@0 | 371 | ch = *(mySource++); |
michael@0 | 372 | |
michael@0 | 373 | if (ch < 0x80) /* Single byte */ |
michael@0 | 374 | { |
michael@0 | 375 | *(myTarget++) = (uint8_t) ch; |
michael@0 | 376 | } |
michael@0 | 377 | else if (ch < 0x800) /* Double byte */ |
michael@0 | 378 | { |
michael@0 | 379 | *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
michael@0 | 380 | if (myTarget < targetLimit) |
michael@0 | 381 | { |
michael@0 | 382 | *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 383 | } |
michael@0 | 384 | else |
michael@0 | 385 | { |
/* Second byte does not fit: park it in the overflow buffer. The loop ends on
 * its next condition test because the target is now full. */
michael@0 | 386 | cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 387 | cnv->charErrorBufferLength = 1; |
michael@0 | 388 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 389 | } |
michael@0 | 390 | } |
michael@0 | 391 | else { |
michael@0 | 392 | /* Check for surrogates */ |
michael@0 | 393 | if(U16_IS_SURROGATE(ch) && isNotCESU8) { |
michael@0 | 394 | lowsurrogate: |
michael@0 | 395 | if (mySource < sourceLimit) { |
michael@0 | 396 | /* test both code units */ |
michael@0 | 397 | if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { |
michael@0 | 398 | /* convert and consume this supplementary code point */ |
michael@0 | 399 | ch=U16_GET_SUPPLEMENTARY(ch, *mySource); |
michael@0 | 400 | ++mySource; |
michael@0 | 401 | /* exit this condition tree */ |
michael@0 | 402 | } |
michael@0 | 403 | else { |
michael@0 | 404 | /* this is an unpaired trail or lead code unit */ |
michael@0 | 405 | /* callback(illegal) */ |
michael@0 | 406 | cnv->fromUChar32 = ch; |
michael@0 | 407 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 408 | break; |
michael@0 | 409 | } |
michael@0 | 410 | } |
michael@0 | 411 | else { |
michael@0 | 412 | /* no more input */ |
michael@0 | 413 | cnv->fromUChar32 = ch; |
michael@0 | 414 | break; |
michael@0 | 415 | } |
michael@0 | 416 | } |
michael@0 | 417 | |
michael@0 | 418 | /* Do we write the buffer directly for speed, |
michael@0 | 419 | or do we have to be careful about target buffer space? */ |
michael@0 | 420 | tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); |
michael@0 | 421 | |
/* indexToWrite is the index of the last byte of the sequence: 2 for a 3-byte
 * form, 3 for a 4-byte form. The final two trail bytes are written by the
 * shared statements after this if/else. */
michael@0 | 422 | if (ch <= MAXIMUM_UCS2) { |
michael@0 | 423 | indexToWrite = 2; |
michael@0 | 424 | tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
michael@0 | 425 | } |
michael@0 | 426 | else { |
michael@0 | 427 | indexToWrite = 3; |
michael@0 | 428 | tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
michael@0 | 429 | tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); |
michael@0 | 430 | } |
michael@0 | 431 | tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
michael@0 | 432 | tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 433 | |
michael@0 | 434 | if (tempPtr == myTarget) { |
michael@0 | 435 | /* There was enough space to write the codepoint directly. */ |
michael@0 | 436 | myTarget += (indexToWrite + 1); |
michael@0 | 437 | } |
michael@0 | 438 | else { |
michael@0 | 439 | /* We might run out of room soon. Write it slowly. */ |
/* Bytes that do not fit are appended at charErrorBufferLength; presumably the
 * caller has emptied charErrorBuffer before this call - verify against caller. */
michael@0 | 440 | for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { |
michael@0 | 441 | if (myTarget < targetLimit) { |
michael@0 | 442 | *(myTarget++) = *tempPtr; |
michael@0 | 443 | } |
michael@0 | 444 | else { |
michael@0 | 445 | cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; |
michael@0 | 446 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 447 | } |
michael@0 | 448 | } |
michael@0 | 449 | } |
michael@0 | 450 | } |
michael@0 | 451 | } |
michael@0 | 452 | |
/* Only report overflow when no other error has been set and input remains. */
michael@0 | 453 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 454 | { |
michael@0 | 455 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 456 | } |
michael@0 | 457 | |
michael@0 | 458 | args->target = (char *) myTarget; |
michael@0 | 459 | args->source = mySource; |
michael@0 | 460 | } |
michael@0 | 461 | |
/*
 * Streaming UTF-16 -> UTF-8 / CESU-8 conversion that also fills args->offsets.
 *
 * Same encoding logic as ucnv_fromUnicode_UTF8, plus: for every byte written to
 * the target, one int32_t is written through args->offsets holding offsetNum,
 * the index (within this call's input) of the UChar that produced the byte.
 * Bytes produced from a surrogate saved by a previous call get offset -1.
 * Bytes diverted to cnv->charErrorBuffer get no offset entry.
 * args->target, args->source and args->offsets are updated on return.
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_CHAR_FOUND: unpaired lead or trail surrogate (non-CESU-8 only);
 *    the unit is left in cnv->fromUChar32.
 *  - U_BUFFER_OVERFLOW_ERROR: target full while input remains, or bytes spilled
 *    into cnv->charErrorBuffer.
 */
michael@0 | 462 | U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, |
michael@0 | 463 | UErrorCode * err) |
michael@0 | 464 | { |
michael@0 | 465 | UConverter *cnv = args->converter; |
michael@0 | 466 | const UChar *mySource = args->source; |
michael@0 | 467 | int32_t *myOffsets = args->offsets; |
michael@0 | 468 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 469 | uint8_t *myTarget = (uint8_t *) args->target; |
michael@0 | 470 | const uint8_t *targetLimit = (uint8_t *) args->targetLimit; |
michael@0 | 471 | uint8_t *tempPtr; |
michael@0 | 472 | UChar32 ch; |
michael@0 | 473 | int32_t offsetNum, nextSourceIndex; |
michael@0 | 474 | int32_t indexToWrite; |
michael@0 | 475 | uint8_t tempBuf[4]; |
michael@0 | 476 | UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); |
michael@0 | 477 | |
/* Resume with the surrogate saved by a previous call. It has no index in the
 * current input, hence offset -1; the next unit of this input is index 0. */
michael@0 | 478 | if (cnv->fromUChar32 && myTarget < targetLimit) |
michael@0 | 479 | { |
michael@0 | 480 | ch = cnv->fromUChar32; |
michael@0 | 481 | cnv->fromUChar32 = 0; |
michael@0 | 482 | offsetNum = -1; |
michael@0 | 483 | nextSourceIndex = 0; |
michael@0 | 484 | goto lowsurrogate; |
michael@0 | 485 | } else { |
michael@0 | 486 | offsetNum = 0; |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 490 | { |
michael@0 | 491 | ch = *(mySource++); |
michael@0 | 492 | |
michael@0 | 493 | if (ch < 0x80) /* Single byte */ |
michael@0 | 494 | { |
michael@0 | 495 | *(myOffsets++) = offsetNum++; |
michael@0 | 496 | *(myTarget++) = (char) ch; |
michael@0 | 497 | } |
michael@0 | 498 | else if (ch < 0x800) /* Double byte */ |
michael@0 | 499 | { |
michael@0 | 500 | *(myOffsets++) = offsetNum; |
michael@0 | 501 | *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
michael@0 | 502 | if (myTarget < targetLimit) |
michael@0 | 503 | { |
michael@0 | 504 | *(myOffsets++) = offsetNum++; |
michael@0 | 505 | *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 506 | } |
michael@0 | 507 | else |
michael@0 | 508 | { |
/* Second byte does not fit: park it in the overflow buffer. offsetNum is not
 * advanced here; the loop ends on its next condition test (target is full). */
michael@0 | 509 | cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 510 | cnv->charErrorBufferLength = 1; |
michael@0 | 511 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 512 | } |
michael@0 | 513 | } |
michael@0 | 514 | else |
michael@0 | 515 | /* Check for surrogates */ |
michael@0 | 516 | { |
/* nextSourceIndex becomes offsetNum once this code point has been written;
 * it is bumped once more below when a trail surrogate is also consumed. */
michael@0 | 517 | nextSourceIndex = offsetNum + 1; |
michael@0 | 518 | |
michael@0 | 519 | if(U16_IS_SURROGATE(ch) && isNotCESU8) { |
michael@0 | 520 | lowsurrogate: |
michael@0 | 521 | if (mySource < sourceLimit) { |
michael@0 | 522 | /* test both code units */ |
michael@0 | 523 | if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { |
michael@0 | 524 | /* convert and consume this supplementary code point */ |
michael@0 | 525 | ch=U16_GET_SUPPLEMENTARY(ch, *mySource); |
michael@0 | 526 | ++mySource; |
michael@0 | 527 | ++nextSourceIndex; |
michael@0 | 528 | /* exit this condition tree */ |
michael@0 | 529 | } |
michael@0 | 530 | else { |
michael@0 | 531 | /* this is an unpaired trail or lead code unit */ |
michael@0 | 532 | /* callback(illegal) */ |
michael@0 | 533 | cnv->fromUChar32 = ch; |
michael@0 | 534 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 535 | break; |
michael@0 | 536 | } |
michael@0 | 537 | } |
michael@0 | 538 | else { |
michael@0 | 539 | /* no more input */ |
michael@0 | 540 | cnv->fromUChar32 = ch; |
michael@0 | 541 | break; |
michael@0 | 542 | } |
michael@0 | 543 | } |
michael@0 | 544 | |
michael@0 | 545 | /* Do we write the buffer directly for speed, |
michael@0 | 546 | or do we have to be careful about target buffer space? */ |
michael@0 | 547 | tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); |
michael@0 | 548 | |
/* indexToWrite is the index of the last byte of the sequence: 2 for a 3-byte
 * form, 3 for a 4-byte form. The final two trail bytes are written by the
 * shared statements after this if/else. */
michael@0 | 549 | if (ch <= MAXIMUM_UCS2) { |
michael@0 | 550 | indexToWrite = 2; |
michael@0 | 551 | tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
michael@0 | 552 | } |
michael@0 | 553 | else { |
michael@0 | 554 | indexToWrite = 3; |
michael@0 | 555 | tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
michael@0 | 556 | tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); |
michael@0 | 557 | } |
michael@0 | 558 | tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
michael@0 | 559 | tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); |
michael@0 | 560 | |
michael@0 | 561 | if (tempPtr == myTarget) { |
michael@0 | 562 | /* There was enough space to write the codepoint directly. */ |
michael@0 | 563 | myTarget += (indexToWrite + 1); |
/* One offset per output byte: three always, a fourth for 4-byte sequences. */
michael@0 | 564 | myOffsets[0] = offsetNum; |
michael@0 | 565 | myOffsets[1] = offsetNum; |
michael@0 | 566 | myOffsets[2] = offsetNum; |
michael@0 | 567 | if (indexToWrite >= 3) { |
michael@0 | 568 | myOffsets[3] = offsetNum; |
michael@0 | 569 | } |
michael@0 | 570 | myOffsets += (indexToWrite + 1); |
michael@0 | 571 | } |
michael@0 | 572 | else { |
michael@0 | 573 | /* We might run out of room soon. Write it slowly. */ |
/* Bytes that do not fit are appended at charErrorBufferLength; presumably the
 * caller has emptied charErrorBuffer before this call - verify against caller. */
michael@0 | 574 | for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { |
michael@0 | 575 | if (myTarget < targetLimit) |
michael@0 | 576 | { |
michael@0 | 577 | *(myOffsets++) = offsetNum; |
michael@0 | 578 | *(myTarget++) = *tempPtr; |
michael@0 | 579 | } |
michael@0 | 580 | else |
michael@0 | 581 | { |
michael@0 | 582 | cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; |
michael@0 | 583 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 584 | } |
michael@0 | 585 | } |
michael@0 | 586 | } |
michael@0 | 587 | offsetNum = nextSourceIndex; |
michael@0 | 588 | } |
michael@0 | 589 | } |
michael@0 | 590 | |
/* Only report overflow when no other error has been set and input remains. */
michael@0 | 591 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 592 | { |
michael@0 | 593 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 594 | } |
michael@0 | 595 | |
michael@0 | 596 | args->target = (char *) myTarget; |
michael@0 | 597 | args->source = mySource; |
michael@0 | 598 | args->offsets = myOffsets; |
michael@0 | 599 | } |
michael@0 | 600 | |
michael@0 | 601 | static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, |
michael@0 | 602 | UErrorCode *err) { |
michael@0 | 603 | UConverter *cnv; |
michael@0 | 604 | const uint8_t *sourceInitial; |
michael@0 | 605 | const uint8_t *source; |
michael@0 | 606 | uint16_t extraBytesToWrite; |
michael@0 | 607 | uint8_t myByte; |
michael@0 | 608 | UChar32 ch; |
michael@0 | 609 | int8_t i, isLegalSequence; |
michael@0 | 610 | |
michael@0 | 611 | /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ |
michael@0 | 612 | |
michael@0 | 613 | cnv = args->converter; |
michael@0 | 614 | sourceInitial = source = (const uint8_t *)args->source; |
michael@0 | 615 | if (source >= (const uint8_t *)args->sourceLimit) |
michael@0 | 616 | { |
michael@0 | 617 | /* no input */ |
michael@0 | 618 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 619 | return 0xffff; |
michael@0 | 620 | } |
michael@0 | 621 | |
michael@0 | 622 | myByte = (uint8_t)*(source++); |
michael@0 | 623 | if (myByte < 0x80) |
michael@0 | 624 | { |
michael@0 | 625 | args->source = (const char *)source; |
michael@0 | 626 | return (UChar32)myByte; |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; |
michael@0 | 630 | if (extraBytesToWrite == 0) { |
michael@0 | 631 | cnv->toUBytes[0] = myByte; |
michael@0 | 632 | cnv->toULength = 1; |
michael@0 | 633 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 634 | args->source = (const char *)source; |
michael@0 | 635 | return 0xffff; |
michael@0 | 636 | } |
michael@0 | 637 | |
michael@0 | 638 | /*The byte sequence is longer than the buffer area passed*/ |
michael@0 | 639 | if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) |
michael@0 | 640 | { |
michael@0 | 641 | /* check if all of the remaining bytes are trail bytes */ |
michael@0 | 642 | cnv->toUBytes[0] = myByte; |
michael@0 | 643 | i = 1; |
michael@0 | 644 | *err = U_TRUNCATED_CHAR_FOUND; |
michael@0 | 645 | while(source < (const uint8_t *)args->sourceLimit) { |
michael@0 | 646 | if(U8_IS_TRAIL(myByte = *source)) { |
michael@0 | 647 | cnv->toUBytes[i++] = myByte; |
michael@0 | 648 | ++source; |
michael@0 | 649 | } else { |
michael@0 | 650 | /* error even before we run out of input */ |
michael@0 | 651 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 652 | break; |
michael@0 | 653 | } |
michael@0 | 654 | } |
michael@0 | 655 | cnv->toULength = i; |
michael@0 | 656 | args->source = (const char *)source; |
michael@0 | 657 | return 0xffff; |
michael@0 | 658 | } |
michael@0 | 659 | |
michael@0 | 660 | isLegalSequence = 1; |
michael@0 | 661 | ch = myByte << 6; |
michael@0 | 662 | switch(extraBytesToWrite) |
michael@0 | 663 | { |
michael@0 | 664 | /* note: code falls through cases! (sic)*/ |
michael@0 | 665 | case 6: |
michael@0 | 666 | ch += (myByte = *source); |
michael@0 | 667 | ch <<= 6; |
michael@0 | 668 | if (!U8_IS_TRAIL(myByte)) |
michael@0 | 669 | { |
michael@0 | 670 | isLegalSequence = 0; |
michael@0 | 671 | break; |
michael@0 | 672 | } |
michael@0 | 673 | ++source; |
michael@0 | 674 | case 5: /*fall through*/ |
michael@0 | 675 | ch += (myByte = *source); |
michael@0 | 676 | ch <<= 6; |
michael@0 | 677 | if (!U8_IS_TRAIL(myByte)) |
michael@0 | 678 | { |
michael@0 | 679 | isLegalSequence = 0; |
michael@0 | 680 | break; |
michael@0 | 681 | } |
michael@0 | 682 | ++source; |
michael@0 | 683 | case 4: /*fall through*/ |
michael@0 | 684 | ch += (myByte = *source); |
michael@0 | 685 | ch <<= 6; |
michael@0 | 686 | if (!U8_IS_TRAIL(myByte)) |
michael@0 | 687 | { |
michael@0 | 688 | isLegalSequence = 0; |
michael@0 | 689 | break; |
michael@0 | 690 | } |
michael@0 | 691 | ++source; |
michael@0 | 692 | case 3: /*fall through*/ |
michael@0 | 693 | ch += (myByte = *source); |
michael@0 | 694 | ch <<= 6; |
michael@0 | 695 | if (!U8_IS_TRAIL(myByte)) |
michael@0 | 696 | { |
michael@0 | 697 | isLegalSequence = 0; |
michael@0 | 698 | break; |
michael@0 | 699 | } |
michael@0 | 700 | ++source; |
michael@0 | 701 | case 2: /*fall through*/ |
michael@0 | 702 | ch += (myByte = *source); |
michael@0 | 703 | if (!U8_IS_TRAIL(myByte)) |
michael@0 | 704 | { |
michael@0 | 705 | isLegalSequence = 0; |
michael@0 | 706 | break; |
michael@0 | 707 | } |
michael@0 | 708 | ++source; |
michael@0 | 709 | }; |
michael@0 | 710 | ch -= offsetsFromUTF8[extraBytesToWrite]; |
michael@0 | 711 | args->source = (const char *)source; |
michael@0 | 712 | |
michael@0 | 713 | /* |
michael@0 | 714 | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
michael@0 | 715 | * - use only trail bytes after a lead byte (checked above) |
michael@0 | 716 | * - use the right number of trail bytes for a given lead byte |
michael@0 | 717 | * - encode a code point <= U+10ffff |
michael@0 | 718 | * - use the fewest possible number of bytes for their code points |
michael@0 | 719 | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
michael@0 | 720 | * |
michael@0 | 721 | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
michael@0 | 722 | * There are no irregular sequences any more. |
michael@0 | 723 | */ |
michael@0 | 724 | if (isLegalSequence && |
michael@0 | 725 | (uint32_t)ch <= MAXIMUM_UTF && |
michael@0 | 726 | (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && |
michael@0 | 727 | !U_IS_SURROGATE(ch) |
michael@0 | 728 | ) { |
michael@0 | 729 | return ch; /* return the code point */ |
michael@0 | 730 | } |
michael@0 | 731 | |
michael@0 | 732 | for(i = 0; sourceInitial < source; ++i) { |
michael@0 | 733 | cnv->toUBytes[i] = *sourceInitial++; |
michael@0 | 734 | } |
michael@0 | 735 | cnv->toULength = i; |
michael@0 | 736 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 737 | return 0xffff; |
michael@0 | 738 | } |
michael@0 | 739 | |
michael@0 | 740 | /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ |
michael@0 | 741 | |
michael@0 | 742 | /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ |
/*
 * The index is the total sequence length in bytes (lead byte plus trail bytes).
 * A decoded value below utf8_minLegal[n] means that the n-byte sequence is
 * an overlong ("non-shortest") form.
 * In the code visible in this file section the table is read with n==2 and n==3;
 * the 4-byte check compares against the literal 0x10000 instead.
 */
michael@0 | 743 | static const UChar32 |
michael@0 | 744 | utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; |
michael@0 | 745 | |
michael@0 | 746 | /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ |
/*
 * Read the formula above as repeated accumulation: c=lead; then c=(c<<6)+trail
 * for each trail byte. That sum still contains the marker bits of the lead byte
 * and of every trail byte; subtracting utf8_offsets[n] removes them:
 *   n==2: 0x3080    == (0xC0<<6)+0x80
 *   n==3: 0xE2080   == (0xE0<<12)+(0x80<<6)+0x80
 *   n==4: 0x3C82080 == (0xF0<<18)+(0x80<<12)+(0x80<<6)+0x80
 * The index is the total sequence length in bytes.
 * The array is declared with 7 elements but only 5 are initialized;
 * elements 5 and 6 are implicitly zero. The code visible in this file section
 * reads indexes 2, 3 and 4 only.
 */
michael@0 | 747 | static const UChar32 |
michael@0 | 748 | utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; |
michael@0 | 749 | |
michael@0 | 750 | /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ |
/*
 * Validates UTF-8 input and copies it byte for byte to the target.
 *
 * pFromUArgs  Output side. target and targetLimit are read; target is written
 *             back with the advanced output position before returning
 *             (except on the early U_USING_DEFAULT_WARNING return).
 * pToUArgs    Input side. converter, source and sourceLimit are read; source is
 *             written back with the advanced input position.
 *             State for a sequence that is split across calls lives on
 *             pToUArgs->converter: toUnicodeStatus (accumulated value, 0 == none),
 *             toULength (bytes collected so far), mode (expected total length)
 *             and toUBytes (the collected bytes).
 * pErrorCode  Set by this function to
 *             - U_ILLEGAL_CHAR_FOUND: illegal sequence; its bytes are stored in
 *               toUBytes with their count in toULength.
 *             - U_BUFFER_OVERFLOW_ERROR: input remains and the target is full.
 *             - U_USING_DEFAULT_WARNING: this function declines to handle a
 *               target-capacity edge case (see the comments at both places).
 *
 * Sequences copied: ASCII bytes, 2- and 3-byte sequences for U+0080..U+FFFF
 * excluding U+D800..U+DFFF, and 4-byte sequences for U+10000..U+10FFFF.
 * Overlong forms and all other byte sequences take the error path.
 */
michael@0 | 751 | static void |
michael@0 | 752 | ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
michael@0 | 753 | UConverterToUnicodeArgs *pToUArgs, |
michael@0 | 754 | UErrorCode *pErrorCode) { |
michael@0 | 755 | UConverter *utf8; |
michael@0 | 756 | const uint8_t *source, *sourceLimit; |
michael@0 | 757 | uint8_t *target; |
michael@0 | 758 | int32_t targetCapacity; |
michael@0 | 759 | int32_t count; |
michael@0 | 760 | |
/*
 * oldToULength: bytes of the current sequence that came from a previous call
 *               (they are in utf8->toUBytes, not in this source buffer)
 * toULength:    bytes of the current sequence collected so far
 * toULimit:     total number of bytes that the lead byte announces
 */
michael@0 | 761 | int8_t oldToULength, toULength, toULimit; |
michael@0 | 762 | |
michael@0 | 763 | UChar32 c; |
michael@0 | 764 | uint8_t b, t1, t2; |
michael@0 | 765 | |
michael@0 | 766 | /* set up the local pointers */ |
michael@0 | 767 | utf8=pToUArgs->converter; |
michael@0 | 768 | source=(uint8_t *)pToUArgs->source; |
michael@0 | 769 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
michael@0 | 770 | target=(uint8_t *)pFromUArgs->target; |
michael@0 | 771 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
michael@0 | 772 | |
michael@0 | 773 | /* get the converter state from the UTF-8 UConverter */ |
/* a non-zero toUnicodeStatus is what the partial-sequence paths below store */
michael@0 | 774 | c=(UChar32)utf8->toUnicodeStatus; |
michael@0 | 775 | if(c!=0) { |
michael@0 | 776 | toULength=oldToULength=utf8->toULength; |
michael@0 | 777 | toULimit=(int8_t)utf8->mode; |
michael@0 | 778 | } else { |
michael@0 | 779 | toULength=oldToULength=toULimit=0; |
michael@0 | 780 | } |
michael@0 | 781 | |
/*
 * count is the number of bytes that the conversion loop may process:
 * the source length plus the carried-over bytes, capped below by targetCapacity.
 */
michael@0 | 782 | count=(int32_t)(sourceLimit-source)+oldToULength; |
michael@0 | 783 | if(count<toULimit) { |
michael@0 | 784 | /* |
michael@0 | 785 | * Not enough input to complete the partial character. |
michael@0 | 786 | * Jump to moreBytes below - it will not output to target. |
michael@0 | 787 | */ |
michael@0 | 788 | } else if(targetCapacity<toULimit) { |
michael@0 | 789 | /* |
michael@0 | 790 | * Not enough target capacity to output the partial character. |
michael@0 | 791 | * Let the standard converter handle this. |
michael@0 | 792 | */ |
/* note: returns without writing back source/target and without touching the saved state */
michael@0 | 793 | *pErrorCode=U_USING_DEFAULT_WARNING; |
michael@0 | 794 | return; |
michael@0 | 795 | } else { |
michael@0 | 796 | /* |
michael@0 | 797 | * Use a single counter for source and target, counting the minimum of |
michael@0 | 798 | * the source length and the target capacity. |
michael@0 | 799 | * As a result, the source length is checked only once per multi-byte |
michael@0 | 800 | * character instead of twice. |
michael@0 | 801 | * |
michael@0 | 802 | * Make sure that the last byte sequence is complete, or else |
michael@0 | 803 | * stop just before it. |
michael@0 | 804 | * (The longest legal byte sequence has 3 trail bytes.) |
michael@0 | 805 | * Count oldToULength (number of source bytes from a previous buffer) |
michael@0 | 806 | * into the source length but reduce the source index by toULimit |
michael@0 | 807 | * while going back over trail bytes in order to not go back into |
michael@0 | 808 | * the bytes that will be read for finishing a partial |
michael@0 | 809 | * sequence from the previous buffer. |
michael@0 | 810 | * Let the standard converter handle edge cases. |
michael@0 | 811 | */ |
michael@0 | 812 | int32_t i; |
michael@0 | 813 | |
michael@0 | 814 | if(count>targetCapacity) { |
michael@0 | 815 | count=targetCapacity; |
michael@0 | 816 | } |
michael@0 | 817 | |
/*
 * Walk backwards from the last counted byte over at most 3 trail bytes.
 * i is the number of trail bytes seen so far. count includes oldToULength
 * bytes that are not in this buffer, hence the subtraction in the index.
 * This trimming is what allows the inline cases in the conversion loop to
 * read source[0] and source[1] without a separate limit check.
 */
michael@0 | 818 | i=0; |
michael@0 | 819 | while(i<3 && i<(count-toULimit)) { |
michael@0 | 820 | b=source[count-oldToULength-i-1]; |
michael@0 | 821 | if(U8_IS_TRAIL(b)) { |
michael@0 | 822 | ++i; |
michael@0 | 823 | } else { |
michael@0 | 824 | if(i<U8_COUNT_TRAIL_BYTES(b)) { |
michael@0 | 825 | /* stop converting before the lead byte if there are not enough trail bytes for it */ |
michael@0 | 826 | count-=i+1; |
michael@0 | 827 | } |
michael@0 | 828 | break; |
michael@0 | 829 | } |
michael@0 | 830 | } |
michael@0 | 831 | } |
michael@0 | 832 | |
/*
 * Continue a sequence from a previous call: clear the saved state and jump
 * into the middle of the conversion loop, regardless of the value of count.
 * c, toULength, oldToULength and toULimit were loaded from the converter above.
 */
michael@0 | 833 | if(c!=0) { |
michael@0 | 834 | utf8->toUnicodeStatus=0; |
michael@0 | 835 | utf8->toULength=0; |
michael@0 | 836 | goto moreBytes; |
michael@0 | 837 | /* See note in ucnv_SBCSFromUTF8() about this goto. */ |
michael@0 | 838 | } |
michael@0 | 839 | |
michael@0 | 840 | /* conversion loop */ |
michael@0 | 841 | while(count>0) { |
michael@0 | 842 | b=*source++; |
/* (int8_t)b>=0 is true for bytes 0x00..0x7f */
michael@0 | 843 | if((int8_t)b>=0) { |
michael@0 | 844 | /* convert ASCII */ |
michael@0 | 845 | *target++=b; |
michael@0 | 846 | --count; |
michael@0 | 847 | continue; |
michael@0 | 848 | } else { |
/*
 * Lead bytes 0xe1..0xec accept any first trail byte 0x80..0xbf.
 * Lead byte 0xed accepts only 0x80..0x9f, which keeps U+D800..U+DFFF out.
 * Lead bytes 0xee and above fail both tests and drop to the general path below.
 */
michael@0 | 849 | if(b>0xe0) { |
michael@0 | 850 | if( /* handle U+1000..U+D7FF inline */ |
michael@0 | 851 | (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || |
michael@0 | 852 | (b==0xed && (t1 <= 0x9f))) && |
michael@0 | 853 | (t2=source[1]) >= 0x80 && t2 <= 0xbf |
michael@0 | 854 | ) { |
michael@0 | 855 | source+=2; |
michael@0 | 856 | *target++=b; |
michael@0 | 857 | *target++=t1; |
michael@0 | 858 | *target++=t2; |
michael@0 | 859 | count-=3; |
michael@0 | 860 | continue; |
michael@0 | 861 | } |
/* b is 0x80..0xdf here; b>=0xc2 rejects stray trail bytes and the overlong leads 0xc0/0xc1 */
michael@0 | 862 | } else if(b<0xe0) { |
michael@0 | 863 | if( /* handle U+0080..U+07FF inline */ |
michael@0 | 864 | b>=0xc2 && |
michael@0 | 865 | (t1=*source) >= 0x80 && t1 <= 0xbf |
michael@0 | 866 | ) { |
michael@0 | 867 | ++source; |
michael@0 | 868 | *target++=b; |
michael@0 | 869 | *target++=t1; |
michael@0 | 870 | count-=2; |
michael@0 | 871 | continue; |
michael@0 | 872 | } |
/* lead byte 0xe0: the first trail byte must be at least 0xa0, otherwise the form is overlong */
michael@0 | 873 | } else if(b==0xe0) { |
michael@0 | 874 | if( /* handle U+0800..U+0FFF inline */ |
michael@0 | 875 | (t1=source[0]) >= 0xa0 && t1 <= 0xbf && |
michael@0 | 876 | (t2=source[1]) >= 0x80 && t2 <= 0xbf |
michael@0 | 877 | ) { |
michael@0 | 878 | source+=2; |
michael@0 | 879 | *target++=b; |
michael@0 | 880 | *target++=t1; |
michael@0 | 881 | *target++=t2; |
michael@0 | 882 | count-=3; |
michael@0 | 883 | continue; |
michael@0 | 884 | } |
michael@0 | 885 | } |
michael@0 | 886 | |
michael@0 | 887 | /* handle "complicated" and error cases, and continuing partial characters */ |
/* start a fresh sequence: the lead byte b has already been consumed from source */
michael@0 | 888 | oldToULength=0; |
michael@0 | 889 | toULength=1; |
michael@0 | 890 | toULimit=U8_COUNT_TRAIL_BYTES(b)+1; |
michael@0 | 891 | c=b; |
michael@0 | 892 | moreBytes: |
michael@0 | 893 | while(toULength<toULimit) { |
michael@0 | 894 | if(source<sourceLimit) { |
michael@0 | 895 | b=*source; |
michael@0 | 896 | if(U8_IS_TRAIL(b)) { |
michael@0 | 897 | ++source; |
michael@0 | 898 | ++toULength; |
/*
 * NOTE(review): c is a UChar32. If that is a signed 32-bit type and
 * U8_COUNT_TRAIL_BYTES() can return more than 3, this shift can overflow
 * for such lead bytes - confirm against the macro and typedef.
 */
michael@0 | 899 | c=(c<<6)+b; |
michael@0 | 900 | } else { |
michael@0 | 901 | break; /* sequence too short, stop with toULength<toULimit */ |
michael@0 | 902 | } |
michael@0 | 903 | } else { |
michael@0 | 904 | /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ |
/*
 * Rewind to the first byte of this sequence that was read from this buffer,
 * then append those bytes to toUBytes after the bytes carried over earlier.
 * source ends up back at sourceLimit.
 */
michael@0 | 905 | source-=(toULength-oldToULength); |
michael@0 | 906 | while(oldToULength<toULength) { |
michael@0 | 907 | utf8->toUBytes[oldToULength++]=*source++; |
michael@0 | 908 | } |
michael@0 | 909 | utf8->toUnicodeStatus=c; |
michael@0 | 910 | utf8->toULength=toULength; |
michael@0 | 911 | utf8->mode=toULimit; |
michael@0 | 912 | pToUArgs->source=(char *)source; |
michael@0 | 913 | pFromUArgs->target=(char *)target; |
michael@0 | 914 | return; |
michael@0 | 915 | } |
michael@0 | 916 | } |
michael@0 | 917 | |
/*
 * Validate the collected sequence.
 * Caution: c is modified inside the conditions. Because of short-circuit
 * evaluation the subtraction in the first condition runs only for
 * complete 2- and 3-byte sequences, and the one in the second condition
 * only for complete 4-byte sequences, so c is adjusted at most once.
 */
michael@0 | 918 | if( toULength==toULimit && /* consumed all trail bytes */ |
michael@0 | 919 | (toULength==3 || toULength==2) && /* BMP */ |
michael@0 | 920 | (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && |
michael@0 | 921 | (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ |
michael@0 | 922 | ) { |
michael@0 | 923 | /* legal byte sequence for BMP code point */ |
michael@0 | 924 | } else if( |
michael@0 | 925 | toULength==toULimit && toULength==4 && |
michael@0 | 926 | (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) |
michael@0 | 927 | ) { |
michael@0 | 928 | /* legal byte sequence for supplementary code point */ |
michael@0 | 929 | } else { |
michael@0 | 930 | /* error handling: illegal UTF-8 byte sequence */ |
/* hand the offending bytes to the caller via toUBytes/toULength; nothing is written to target */
michael@0 | 931 | source-=(toULength-oldToULength); |
michael@0 | 932 | while(oldToULength<toULength) { |
michael@0 | 933 | utf8->toUBytes[oldToULength++]=*source++; |
michael@0 | 934 | } |
michael@0 | 935 | utf8->toULength=toULength; |
michael@0 | 936 | pToUArgs->source=(char *)source; |
michael@0 | 937 | pFromUArgs->target=(char *)target; |
michael@0 | 938 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 939 | return; |
michael@0 | 940 | } |
michael@0 | 941 | |
michael@0 | 942 | /* copy the legal byte sequence to the target */ |
/*
 * The first oldToULength bytes come from toUBytes (saved by a previous call),
 * the remaining bytes are re-read from the current source buffer.
 */
michael@0 | 943 | { |
michael@0 | 944 | int8_t i; |
michael@0 | 945 | |
michael@0 | 946 | for(i=0; i<oldToULength; ++i) { |
michael@0 | 947 | *target++=utf8->toUBytes[i]; |
michael@0 | 948 | } |
michael@0 | 949 | source-=(toULength-oldToULength); |
michael@0 | 950 | for(; i<toULength; ++i) { |
michael@0 | 951 | *target++=*source++; |
michael@0 | 952 | } |
michael@0 | 953 | count-=toULength; |
michael@0 | 954 | } |
michael@0 | 955 | } |
michael@0 | 956 | } |
michael@0 | 957 | |
/*
 * The loop has used up count but input remains: either the target is full,
 * or count was trimmed above to stop before an incomplete trailing sequence,
 * or count was capped by the target capacity in the middle of a sequence.
 */
michael@0 | 958 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { |
michael@0 | 959 | if(target==(const uint8_t *)pFromUArgs->targetLimit) { |
michael@0 | 960 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 961 | } else { |
michael@0 | 962 | b=*source; |
michael@0 | 963 | toULimit=U8_COUNT_TRAIL_BYTES(b)+1; |
/* fewer input bytes remain than the lead byte announces */
michael@0 | 964 | if(toULimit>(sourceLimit-source)) { |
michael@0 | 965 | /* collect a truncated byte sequence */ |
michael@0 | 966 | toULength=0; |
michael@0 | 967 | c=b; |
/*
 * Each iteration stores one byte. The loop ends either at the end of the
 * input (state is saved for the next call) or at a non-trail byte, which
 * is left unconsumed in source.
 */
michael@0 | 968 | for(;;) { |
michael@0 | 969 | utf8->toUBytes[toULength++]=b; |
michael@0 | 970 | if(++source==sourceLimit) { |
michael@0 | 971 | /* partial byte sequence at end of source */ |
michael@0 | 972 | utf8->toUnicodeStatus=c; |
michael@0 | 973 | utf8->toULength=toULength; |
michael@0 | 974 | utf8->mode=toULimit; |
michael@0 | 975 | break; |
michael@0 | 976 | } else if(!U8_IS_TRAIL(b=*source)) { |
michael@0 | 977 | /* lead byte in trail byte position */ |
michael@0 | 978 | utf8->toULength=toULength; |
michael@0 | 979 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 980 | break; |
michael@0 | 981 | } |
michael@0 | 982 | c=(c<<6)+b; |
michael@0 | 983 | } |
michael@0 | 984 | } else { |
michael@0 | 985 | /* partial-sequence target overflow: fall back to the pivoting implementation */ |
michael@0 | 986 | *pErrorCode=U_USING_DEFAULT_WARNING; |
michael@0 | 987 | } |
michael@0 | 988 | } |
michael@0 | 989 | } |
michael@0 | 990 | |
michael@0 | 991 | /* write back the updated pointers */ |
michael@0 | 992 | pToUArgs->source=(char *)source; |
michael@0 | 993 | pFromUArgs->target=(char *)target; |
michael@0 | 994 | } |
michael@0 | 995 | |
michael@0 | 996 | /* UTF-8 converter data ----------------------------------------------------- */ |
michael@0 | 997 | |
/*
 * Implementation table for the UTF-8 converter.
 * The initializers are positional, so their order must match the member order
 * of UConverterImpl, which is declared outside this file section
 * (presumably in ucnv_cnv.h - confirm there which member each slot fills).
 * NULL leaves a slot without a converter-specific function.
 */
michael@0 | 998 | static const UConverterImpl _UTF8Impl={ |
michael@0 | 999 | UCNV_UTF8, |
michael@0 | 1000 | |
michael@0 | 1001 | NULL, |
michael@0 | 1002 | NULL, |
michael@0 | 1003 | |
michael@0 | 1004 | NULL, |
michael@0 | 1005 | NULL, |
michael@0 | 1006 | NULL, |
michael@0 | 1007 | |
/* the four buffer conversion functions, then the single-code point reader */
michael@0 | 1008 | ucnv_toUnicode_UTF8, |
michael@0 | 1009 | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
michael@0 | 1010 | ucnv_fromUnicode_UTF8, |
michael@0 | 1011 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
michael@0 | 1012 | ucnv_getNextUChar_UTF8, |
michael@0 | 1013 | |
michael@0 | 1014 | NULL, |
michael@0 | 1015 | NULL, |
michael@0 | 1016 | NULL, |
michael@0 | 1017 | NULL, |
michael@0 | 1018 | ucnv_getNonSurrogateUnicodeSet, |
michael@0 | 1019 | |
/* the same validate-and-copy function fills both of the last two slots */
michael@0 | 1020 | ucnv_UTF8FromUTF8, |
michael@0 | 1021 | ucnv_UTF8FromUTF8 |
michael@0 | 1022 | }; |
michael@0 | 1023 | |
michael@0 | 1024 | /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ |
/*
 * Static description of the UTF-8 converter.
 * The initializers are positional; UConverterStaticData is declared outside
 * this file section, so the member meanings noted here are assumptions to
 * confirm against that declaration.
 */
michael@0 | 1025 | static const UConverterStaticData _UTF8StaticData={ |
michael@0 | 1026 | sizeof(UConverterStaticData), |
michael@0 | 1027 | "UTF-8", |
michael@0 | 1028 | 1208, UCNV_IBM, UCNV_UTF8, |
michael@0 | 1029 | 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
/* ef bf bd is the UTF-8 encoding of U+FFFD; presumably the substitution bytes followed by their length (3) - confirm */
michael@0 | 1030 | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
michael@0 | 1031 | 0, |
michael@0 | 1032 | 0, |
michael@0 | 1033 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1034 | }; |
michael@0 | 1035 | |
michael@0 | 1036 | |
/*
 * Shared-data record for the UTF-8 converter. It is not static, so it is
 * visible to other translation units. It ties together _UTF8StaticData and
 * _UTF8Impl. The initializers are positional; UConverterSharedData is declared
 * outside this file section.
 * NOTE(review): the all-ones second value presumably marks the record as
 * statically allocated (never reference-counted down) - confirm.
 */
michael@0 | 1037 | const UConverterSharedData _UTF8Data={ |
michael@0 | 1038 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1039 | NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, |
michael@0 | 1040 | 0 |
michael@0 | 1041 | }; |
michael@0 | 1042 | |
michael@0 | 1043 | /* CESU-8 converter data ---------------------------------------------------- */ |
michael@0 | 1044 | |
/*
 * Implementation table for the CESU-8 converter.
 * It uses the same four buffer conversion functions as _UTF8Impl.
 * Differences from _UTF8Impl as written in this file:
 * - the slot that holds ucnv_getNextUChar_UTF8 there is NULL here
 * - ucnv_getCompleteUnicodeSet is used instead of ucnv_getNonSurrogateUnicodeSet
 * - the two trailing ucnv_UTF8FromUTF8 entries are absent; members without an
 *   initializer are implicitly zero/NULL
 * The initializers are positional; UConverterImpl is declared outside this
 * file section.
 */
michael@0 | 1045 | static const UConverterImpl _CESU8Impl={ |
michael@0 | 1046 | UCNV_CESU8, |
michael@0 | 1047 | |
michael@0 | 1048 | NULL, |
michael@0 | 1049 | NULL, |
michael@0 | 1050 | |
michael@0 | 1051 | NULL, |
michael@0 | 1052 | NULL, |
michael@0 | 1053 | NULL, |
michael@0 | 1054 | |
michael@0 | 1055 | ucnv_toUnicode_UTF8, |
michael@0 | 1056 | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
michael@0 | 1057 | ucnv_fromUnicode_UTF8, |
michael@0 | 1058 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
michael@0 | 1059 | NULL, |
michael@0 | 1060 | |
michael@0 | 1061 | NULL, |
michael@0 | 1062 | NULL, |
michael@0 | 1063 | NULL, |
michael@0 | 1064 | NULL, |
michael@0 | 1065 | ucnv_getCompleteUnicodeSet |
michael@0 | 1066 | }; |
michael@0 | 1067 | |
/*
 * Static description of the CESU-8 converter.
 * It mirrors _UTF8StaticData except for the name, the CCSID (9400),
 * the platform value (UCNV_UNKNOWN) and the converter type (UCNV_CESU8).
 * The initializers are positional; UConverterStaticData is declared outside
 * this file section.
 */
michael@0 | 1068 | static const UConverterStaticData _CESU8StaticData={ |
michael@0 | 1069 | sizeof(UConverterStaticData), |
michael@0 | 1070 | "CESU-8", |
michael@0 | 1071 | 9400, /* CCSID for CESU-8 */ |
michael@0 | 1072 | UCNV_UNKNOWN, UCNV_CESU8, 1, 3, |
/* ef bf bd is the UTF-8 encoding of U+FFFD; presumably the substitution bytes followed by their length (3) - confirm */
michael@0 | 1073 | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
michael@0 | 1074 | 0, |
michael@0 | 1075 | 0, |
michael@0 | 1076 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1077 | }; |
michael@0 | 1078 | |
michael@0 | 1079 | |
/*
 * Shared-data record for the CESU-8 converter. It is not static, so it is
 * visible to other translation units. It ties together _CESU8StaticData and
 * _CESU8Impl. The initializers are positional; UConverterSharedData is declared
 * outside this file section.
 * NOTE(review): the all-ones second value presumably marks the record as
 * statically allocated (never reference-counted down) - confirm.
 */
michael@0 | 1080 | const UConverterSharedData _CESU8Data={ |
michael@0 | 1081 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1082 | NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, |
michael@0 | 1083 | 0 |
michael@0 | 1084 | }; |
michael@0 | 1085 | |
michael@0 | 1086 | #endif |