intl/icu/source/common/ucnv_u32.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2002-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucnv_u32.c
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2002jul01
michael@0 12 * created by: Markus W. Scherer
michael@0 13 *
michael@0 14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18
michael@0 19 #if !UCONFIG_NO_CONVERSION
michael@0 20
michael@0 21 #include "unicode/ucnv.h"
michael@0 22 #include "unicode/utf.h"
michael@0 23 #include "ucnv_bld.h"
michael@0 24 #include "ucnv_cnv.h"
michael@0 25 #include "cmemory.h"
michael@0 26
/* Largest value that fits in a single 16-bit UTF-16 code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Upper bound accepted for a decoded UTF-32 value. */
#define MAXIMUM_UTF             0x0010FFFF
/* Shift applied to the lead surrogate's payload when combining a pair. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
/* Ten low bits set (0x3FF). */
#define HALF_MASK               0x3FF
/* Value subtracted from a lead surrogate before it is shifted. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Constant added when combining a surrogate pair:
 * -SURROGATE_LOW_START + HALF_BASE, i.e. 0x2400 (9216).
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus that makes the encoders emit a BOM first. */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
michael@0 41
michael@0 42 /* UTF-32BE ----------------------------------------------------------------- */
michael@0 43
michael@0 44 static void
michael@0 45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
michael@0 46 UErrorCode * err)
michael@0 47 {
michael@0 48 const unsigned char *mySource = (unsigned char *) args->source;
michael@0 49 UChar *myTarget = args->target;
michael@0 50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
michael@0 51 const UChar *targetLimit = args->targetLimit;
michael@0 52 unsigned char *toUBytes = args->converter->toUBytes;
michael@0 53 uint32_t ch, i;
michael@0 54
michael@0 55 /* Restore state of current sequence */
michael@0 56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
michael@0 57 i = args->converter->toULength; /* restore # of bytes consumed */
michael@0 58 args->converter->toULength = 0;
michael@0 59
michael@0 60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
michael@0 61 args->converter->toUnicodeStatus = 0;
michael@0 62 goto morebytes;
michael@0 63 }
michael@0 64
michael@0 65 while (mySource < sourceLimit && myTarget < targetLimit) {
michael@0 66 i = 0;
michael@0 67 ch = 0;
michael@0 68 morebytes:
michael@0 69 while (i < sizeof(uint32_t)) {
michael@0 70 if (mySource < sourceLimit) {
michael@0 71 ch = (ch << 8) | (uint8_t)(*mySource);
michael@0 72 toUBytes[i++] = (char) *(mySource++);
michael@0 73 }
michael@0 74 else {
michael@0 75 /* stores a partially calculated target*/
michael@0 76 /* + 1 to make 0 a valid character */
michael@0 77 args->converter->toUnicodeStatus = ch + 1;
michael@0 78 args->converter->toULength = (int8_t) i;
michael@0 79 goto donefornow;
michael@0 80 }
michael@0 81 }
michael@0 82
michael@0 83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
michael@0 84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
michael@0 85 if (ch <= MAXIMUM_UCS2)
michael@0 86 {
michael@0 87 /* fits in 16 bits */
michael@0 88 *(myTarget++) = (UChar) ch;
michael@0 89 }
michael@0 90 else {
michael@0 91 /* write out the surrogates */
michael@0 92 *(myTarget++) = U16_LEAD(ch);
michael@0 93 ch = U16_TRAIL(ch);
michael@0 94 if (myTarget < targetLimit) {
michael@0 95 *(myTarget++) = (UChar)ch;
michael@0 96 }
michael@0 97 else {
michael@0 98 /* Put in overflow buffer (not handled here) */
michael@0 99 args->converter->UCharErrorBuffer[0] = (UChar) ch;
michael@0 100 args->converter->UCharErrorBufferLength = 1;
michael@0 101 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 102 break;
michael@0 103 }
michael@0 104 }
michael@0 105 }
michael@0 106 else {
michael@0 107 args->converter->toULength = (int8_t)i;
michael@0 108 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 109 break;
michael@0 110 }
michael@0 111 }
michael@0 112
michael@0 113 donefornow:
michael@0 114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
michael@0 115 /* End of target buffer */
michael@0 116 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 117 }
michael@0 118
michael@0 119 args->target = myTarget;
michael@0 120 args->source = (const char *) mySource;
michael@0 121 }
michael@0 122
michael@0 123 static void
michael@0 124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
michael@0 125 UErrorCode * err)
michael@0 126 {
michael@0 127 const unsigned char *mySource = (unsigned char *) args->source;
michael@0 128 UChar *myTarget = args->target;
michael@0 129 int32_t *myOffsets = args->offsets;
michael@0 130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
michael@0 131 const UChar *targetLimit = args->targetLimit;
michael@0 132 unsigned char *toUBytes = args->converter->toUBytes;
michael@0 133 uint32_t ch, i;
michael@0 134 int32_t offsetNum = 0;
michael@0 135
michael@0 136 /* Restore state of current sequence */
michael@0 137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
michael@0 138 i = args->converter->toULength; /* restore # of bytes consumed */
michael@0 139 args->converter->toULength = 0;
michael@0 140
michael@0 141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
michael@0 142 args->converter->toUnicodeStatus = 0;
michael@0 143 goto morebytes;
michael@0 144 }
michael@0 145
michael@0 146 while (mySource < sourceLimit && myTarget < targetLimit) {
michael@0 147 i = 0;
michael@0 148 ch = 0;
michael@0 149 morebytes:
michael@0 150 while (i < sizeof(uint32_t)) {
michael@0 151 if (mySource < sourceLimit) {
michael@0 152 ch = (ch << 8) | (uint8_t)(*mySource);
michael@0 153 toUBytes[i++] = (char) *(mySource++);
michael@0 154 }
michael@0 155 else {
michael@0 156 /* stores a partially calculated target*/
michael@0 157 /* + 1 to make 0 a valid character */
michael@0 158 args->converter->toUnicodeStatus = ch + 1;
michael@0 159 args->converter->toULength = (int8_t) i;
michael@0 160 goto donefornow;
michael@0 161 }
michael@0 162 }
michael@0 163
michael@0 164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
michael@0 165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
michael@0 166 if (ch <= MAXIMUM_UCS2) {
michael@0 167 /* fits in 16 bits */
michael@0 168 *(myTarget++) = (UChar) ch;
michael@0 169 *(myOffsets++) = offsetNum;
michael@0 170 }
michael@0 171 else {
michael@0 172 /* write out the surrogates */
michael@0 173 *(myTarget++) = U16_LEAD(ch);
michael@0 174 *myOffsets++ = offsetNum;
michael@0 175 ch = U16_TRAIL(ch);
michael@0 176 if (myTarget < targetLimit)
michael@0 177 {
michael@0 178 *(myTarget++) = (UChar)ch;
michael@0 179 *(myOffsets++) = offsetNum;
michael@0 180 }
michael@0 181 else {
michael@0 182 /* Put in overflow buffer (not handled here) */
michael@0 183 args->converter->UCharErrorBuffer[0] = (UChar) ch;
michael@0 184 args->converter->UCharErrorBufferLength = 1;
michael@0 185 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 186 break;
michael@0 187 }
michael@0 188 }
michael@0 189 }
michael@0 190 else {
michael@0 191 args->converter->toULength = (int8_t)i;
michael@0 192 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 193 break;
michael@0 194 }
michael@0 195 offsetNum += i;
michael@0 196 }
michael@0 197
michael@0 198 donefornow:
michael@0 199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
michael@0 200 {
michael@0 201 /* End of target buffer */
michael@0 202 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 203 }
michael@0 204
michael@0 205 args->target = myTarget;
michael@0 206 args->source = (const char *) mySource;
michael@0 207 args->offsets = myOffsets;
michael@0 208 }
michael@0 209
michael@0 210 static void
michael@0 211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
michael@0 212 UErrorCode * err)
michael@0 213 {
michael@0 214 const UChar *mySource = args->source;
michael@0 215 unsigned char *myTarget;
michael@0 216 const UChar *sourceLimit = args->sourceLimit;
michael@0 217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
michael@0 218 UChar32 ch, ch2;
michael@0 219 unsigned int indexToWrite;
michael@0 220 unsigned char temp[sizeof(uint32_t)];
michael@0 221
michael@0 222 if(mySource >= sourceLimit) {
michael@0 223 /* no input, nothing to do */
michael@0 224 return;
michael@0 225 }
michael@0 226
michael@0 227 /* write the BOM if necessary */
michael@0 228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
michael@0 230 ucnv_fromUWriteBytes(args->converter,
michael@0 231 bom, 4,
michael@0 232 &args->target, args->targetLimit,
michael@0 233 &args->offsets, -1,
michael@0 234 err);
michael@0 235 args->converter->fromUnicodeStatus=0;
michael@0 236 }
michael@0 237
michael@0 238 myTarget = (unsigned char *) args->target;
michael@0 239 temp[0] = 0;
michael@0 240
michael@0 241 if (args->converter->fromUChar32) {
michael@0 242 ch = args->converter->fromUChar32;
michael@0 243 args->converter->fromUChar32 = 0;
michael@0 244 goto lowsurogate;
michael@0 245 }
michael@0 246
michael@0 247 while (mySource < sourceLimit && myTarget < targetLimit) {
michael@0 248 ch = *(mySource++);
michael@0 249
michael@0 250 if (U_IS_SURROGATE(ch)) {
michael@0 251 if (U_IS_LEAD(ch)) {
michael@0 252 lowsurogate:
michael@0 253 if (mySource < sourceLimit) {
michael@0 254 ch2 = *mySource;
michael@0 255 if (U_IS_TRAIL(ch2)) {
michael@0 256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
michael@0 257 mySource++;
michael@0 258 }
michael@0 259 else {
michael@0 260 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 261 /* callback(illegal) */
michael@0 262 args->converter->fromUChar32 = ch;
michael@0 263 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 264 break;
michael@0 265 }
michael@0 266 }
michael@0 267 else {
michael@0 268 /* ran out of source */
michael@0 269 args->converter->fromUChar32 = ch;
michael@0 270 if (args->flush) {
michael@0 271 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 272 /* callback(illegal) */
michael@0 273 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 274 }
michael@0 275 break;
michael@0 276 }
michael@0 277 }
michael@0 278 else {
michael@0 279 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 280 /* callback(illegal) */
michael@0 281 args->converter->fromUChar32 = ch;
michael@0 282 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 283 break;
michael@0 284 }
michael@0 285 }
michael@0 286
michael@0 287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
michael@0 288 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
michael@0 289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
michael@0 290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
michael@0 291
michael@0 292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
michael@0 293 if (myTarget < targetLimit) {
michael@0 294 *(myTarget++) = temp[indexToWrite];
michael@0 295 }
michael@0 296 else {
michael@0 297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
michael@0 298 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 299 }
michael@0 300 }
michael@0 301 }
michael@0 302
michael@0 303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
michael@0 304 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 305 }
michael@0 306
michael@0 307 args->target = (char *) myTarget;
michael@0 308 args->source = mySource;
michael@0 309 }
michael@0 310
michael@0 311 static void
michael@0 312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
michael@0 313 UErrorCode * err)
michael@0 314 {
michael@0 315 const UChar *mySource = args->source;
michael@0 316 unsigned char *myTarget;
michael@0 317 int32_t *myOffsets;
michael@0 318 const UChar *sourceLimit = args->sourceLimit;
michael@0 319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
michael@0 320 UChar32 ch, ch2;
michael@0 321 int32_t offsetNum = 0;
michael@0 322 unsigned int indexToWrite;
michael@0 323 unsigned char temp[sizeof(uint32_t)];
michael@0 324
michael@0 325 if(mySource >= sourceLimit) {
michael@0 326 /* no input, nothing to do */
michael@0 327 return;
michael@0 328 }
michael@0 329
michael@0 330 /* write the BOM if necessary */
michael@0 331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
michael@0 333 ucnv_fromUWriteBytes(args->converter,
michael@0 334 bom, 4,
michael@0 335 &args->target, args->targetLimit,
michael@0 336 &args->offsets, -1,
michael@0 337 err);
michael@0 338 args->converter->fromUnicodeStatus=0;
michael@0 339 }
michael@0 340
michael@0 341 myTarget = (unsigned char *) args->target;
michael@0 342 myOffsets = args->offsets;
michael@0 343 temp[0] = 0;
michael@0 344
michael@0 345 if (args->converter->fromUChar32) {
michael@0 346 ch = args->converter->fromUChar32;
michael@0 347 args->converter->fromUChar32 = 0;
michael@0 348 goto lowsurogate;
michael@0 349 }
michael@0 350
michael@0 351 while (mySource < sourceLimit && myTarget < targetLimit) {
michael@0 352 ch = *(mySource++);
michael@0 353
michael@0 354 if (U_IS_SURROGATE(ch)) {
michael@0 355 if (U_IS_LEAD(ch)) {
michael@0 356 lowsurogate:
michael@0 357 if (mySource < sourceLimit) {
michael@0 358 ch2 = *mySource;
michael@0 359 if (U_IS_TRAIL(ch2)) {
michael@0 360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
michael@0 361 mySource++;
michael@0 362 }
michael@0 363 else {
michael@0 364 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 365 /* callback(illegal) */
michael@0 366 args->converter->fromUChar32 = ch;
michael@0 367 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 368 break;
michael@0 369 }
michael@0 370 }
michael@0 371 else {
michael@0 372 /* ran out of source */
michael@0 373 args->converter->fromUChar32 = ch;
michael@0 374 if (args->flush) {
michael@0 375 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 376 /* callback(illegal) */
michael@0 377 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 378 }
michael@0 379 break;
michael@0 380 }
michael@0 381 }
michael@0 382 else {
michael@0 383 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 384 /* callback(illegal) */
michael@0 385 args->converter->fromUChar32 = ch;
michael@0 386 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 387 break;
michael@0 388 }
michael@0 389 }
michael@0 390
michael@0 391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
michael@0 392 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
michael@0 393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
michael@0 394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
michael@0 395
michael@0 396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
michael@0 397 if (myTarget < targetLimit) {
michael@0 398 *(myTarget++) = temp[indexToWrite];
michael@0 399 *(myOffsets++) = offsetNum;
michael@0 400 }
michael@0 401 else {
michael@0 402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
michael@0 403 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 404 }
michael@0 405 }
michael@0 406 offsetNum = offsetNum + 1 + (temp[1] != 0);
michael@0 407 }
michael@0 408
michael@0 409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
michael@0 410 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 411 }
michael@0 412
michael@0 413 args->target = (char *) myTarget;
michael@0 414 args->source = mySource;
michael@0 415 args->offsets = myOffsets;
michael@0 416 }
michael@0 417
michael@0 418 static UChar32
michael@0 419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
michael@0 420 UErrorCode* err)
michael@0 421 {
michael@0 422 const uint8_t *mySource;
michael@0 423 UChar32 myUChar;
michael@0 424 int32_t length;
michael@0 425
michael@0 426 mySource = (const uint8_t *)args->source;
michael@0 427 if (mySource >= (const uint8_t *)args->sourceLimit)
michael@0 428 {
michael@0 429 /* no input */
michael@0 430 *err = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 431 return 0xffff;
michael@0 432 }
michael@0 433
michael@0 434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
michael@0 435 if (length < 4)
michael@0 436 {
michael@0 437 /* got a partial character */
michael@0 438 uprv_memcpy(args->converter->toUBytes, mySource, length);
michael@0 439 args->converter->toULength = (int8_t)length;
michael@0 440 args->source = (const char *)(mySource + length);
michael@0 441 *err = U_TRUNCATED_CHAR_FOUND;
michael@0 442 return 0xffff;
michael@0 443 }
michael@0 444
michael@0 445 /* Don't even try to do a direct cast because the value may be on an odd address. */
michael@0 446 myUChar = ((UChar32)mySource[0] << 24)
michael@0 447 | ((UChar32)mySource[1] << 16)
michael@0 448 | ((UChar32)mySource[2] << 8)
michael@0 449 | ((UChar32)mySource[3]);
michael@0 450
michael@0 451 args->source = (const char *)(mySource + 4);
michael@0 452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
michael@0 453 return myUChar;
michael@0 454 }
michael@0 455
michael@0 456 uprv_memcpy(args->converter->toUBytes, mySource, 4);
michael@0 457 args->converter->toULength = 4;
michael@0 458
michael@0 459 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 460 return 0xffff;
michael@0 461 }
michael@0 462
michael@0 463 static const UConverterImpl _UTF32BEImpl = {
michael@0 464 UCNV_UTF32_BigEndian,
michael@0 465
michael@0 466 NULL,
michael@0 467 NULL,
michael@0 468
michael@0 469 NULL,
michael@0 470 NULL,
michael@0 471 NULL,
michael@0 472
michael@0 473 T_UConverter_toUnicode_UTF32_BE,
michael@0 474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
michael@0 475 T_UConverter_fromUnicode_UTF32_BE,
michael@0 476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
michael@0 477 T_UConverter_getNextUChar_UTF32_BE,
michael@0 478
michael@0 479 NULL,
michael@0 480 NULL,
michael@0 481 NULL,
michael@0 482 NULL,
michael@0 483 ucnv_getNonSurrogateUnicodeSet
michael@0 484 };
michael@0 485
michael@0 486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
michael@0 487 static const UConverterStaticData _UTF32BEStaticData = {
michael@0 488 sizeof(UConverterStaticData),
michael@0 489 "UTF-32BE",
michael@0 490 1232,
michael@0 491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
michael@0 492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
michael@0 493 0,
michael@0 494 0,
michael@0 495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 496 };
michael@0 497
michael@0 498 const UConverterSharedData _UTF32BEData = {
michael@0 499 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 500 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
michael@0 501 0
michael@0 502 };
michael@0 503
michael@0 504 /* UTF-32LE ---------------------------------------------------------- */
michael@0 505
michael@0 506 static void
michael@0 507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
michael@0 508 UErrorCode * err)
michael@0 509 {
michael@0 510 const unsigned char *mySource = (unsigned char *) args->source;
michael@0 511 UChar *myTarget = args->target;
michael@0 512 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
michael@0 513 const UChar *targetLimit = args->targetLimit;
michael@0 514 unsigned char *toUBytes = args->converter->toUBytes;
michael@0 515 uint32_t ch, i;
michael@0 516
michael@0 517 /* Restore state of current sequence */
michael@0 518 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
michael@0 519 {
michael@0 520 i = args->converter->toULength; /* restore # of bytes consumed */
michael@0 521 args->converter->toULength = 0;
michael@0 522
michael@0 523 /* Stores the previously calculated ch from a previous call*/
michael@0 524 ch = args->converter->toUnicodeStatus - 1;
michael@0 525 args->converter->toUnicodeStatus = 0;
michael@0 526 goto morebytes;
michael@0 527 }
michael@0 528
michael@0 529 while (mySource < sourceLimit && myTarget < targetLimit)
michael@0 530 {
michael@0 531 i = 0;
michael@0 532 ch = 0;
michael@0 533 morebytes:
michael@0 534 while (i < sizeof(uint32_t))
michael@0 535 {
michael@0 536 if (mySource < sourceLimit)
michael@0 537 {
michael@0 538 ch |= ((uint8_t)(*mySource)) << (i * 8);
michael@0 539 toUBytes[i++] = (char) *(mySource++);
michael@0 540 }
michael@0 541 else
michael@0 542 {
michael@0 543 /* stores a partially calculated target*/
michael@0 544 /* + 1 to make 0 a valid character */
michael@0 545 args->converter->toUnicodeStatus = ch + 1;
michael@0 546 args->converter->toULength = (int8_t) i;
michael@0 547 goto donefornow;
michael@0 548 }
michael@0 549 }
michael@0 550
michael@0 551 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
michael@0 552 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
michael@0 553 if (ch <= MAXIMUM_UCS2) {
michael@0 554 /* fits in 16 bits */
michael@0 555 *(myTarget++) = (UChar) ch;
michael@0 556 }
michael@0 557 else {
michael@0 558 /* write out the surrogates */
michael@0 559 *(myTarget++) = U16_LEAD(ch);
michael@0 560 ch = U16_TRAIL(ch);
michael@0 561 if (myTarget < targetLimit) {
michael@0 562 *(myTarget++) = (UChar)ch;
michael@0 563 }
michael@0 564 else {
michael@0 565 /* Put in overflow buffer (not handled here) */
michael@0 566 args->converter->UCharErrorBuffer[0] = (UChar) ch;
michael@0 567 args->converter->UCharErrorBufferLength = 1;
michael@0 568 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 569 break;
michael@0 570 }
michael@0 571 }
michael@0 572 }
michael@0 573 else {
michael@0 574 args->converter->toULength = (int8_t)i;
michael@0 575 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 576 break;
michael@0 577 }
michael@0 578 }
michael@0 579
michael@0 580 donefornow:
michael@0 581 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
michael@0 582 {
michael@0 583 /* End of target buffer */
michael@0 584 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 585 }
michael@0 586
michael@0 587 args->target = myTarget;
michael@0 588 args->source = (const char *) mySource;
michael@0 589 }
michael@0 590
michael@0 591 static void
michael@0 592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
michael@0 593 UErrorCode * err)
michael@0 594 {
michael@0 595 const unsigned char *mySource = (unsigned char *) args->source;
michael@0 596 UChar *myTarget = args->target;
michael@0 597 int32_t *myOffsets = args->offsets;
michael@0 598 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
michael@0 599 const UChar *targetLimit = args->targetLimit;
michael@0 600 unsigned char *toUBytes = args->converter->toUBytes;
michael@0 601 uint32_t ch, i;
michael@0 602 int32_t offsetNum = 0;
michael@0 603
michael@0 604 /* Restore state of current sequence */
michael@0 605 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
michael@0 606 {
michael@0 607 i = args->converter->toULength; /* restore # of bytes consumed */
michael@0 608 args->converter->toULength = 0;
michael@0 609
michael@0 610 /* Stores the previously calculated ch from a previous call*/
michael@0 611 ch = args->converter->toUnicodeStatus - 1;
michael@0 612 args->converter->toUnicodeStatus = 0;
michael@0 613 goto morebytes;
michael@0 614 }
michael@0 615
michael@0 616 while (mySource < sourceLimit && myTarget < targetLimit)
michael@0 617 {
michael@0 618 i = 0;
michael@0 619 ch = 0;
michael@0 620 morebytes:
michael@0 621 while (i < sizeof(uint32_t))
michael@0 622 {
michael@0 623 if (mySource < sourceLimit)
michael@0 624 {
michael@0 625 ch |= ((uint8_t)(*mySource)) << (i * 8);
michael@0 626 toUBytes[i++] = (char) *(mySource++);
michael@0 627 }
michael@0 628 else
michael@0 629 {
michael@0 630 /* stores a partially calculated target*/
michael@0 631 /* + 1 to make 0 a valid character */
michael@0 632 args->converter->toUnicodeStatus = ch + 1;
michael@0 633 args->converter->toULength = (int8_t) i;
michael@0 634 goto donefornow;
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
michael@0 639 {
michael@0 640 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
michael@0 641 if (ch <= MAXIMUM_UCS2)
michael@0 642 {
michael@0 643 /* fits in 16 bits */
michael@0 644 *(myTarget++) = (UChar) ch;
michael@0 645 *(myOffsets++) = offsetNum;
michael@0 646 }
michael@0 647 else {
michael@0 648 /* write out the surrogates */
michael@0 649 *(myTarget++) = U16_LEAD(ch);
michael@0 650 *(myOffsets++) = offsetNum;
michael@0 651 ch = U16_TRAIL(ch);
michael@0 652 if (myTarget < targetLimit)
michael@0 653 {
michael@0 654 *(myTarget++) = (UChar)ch;
michael@0 655 *(myOffsets++) = offsetNum;
michael@0 656 }
michael@0 657 else
michael@0 658 {
michael@0 659 /* Put in overflow buffer (not handled here) */
michael@0 660 args->converter->UCharErrorBuffer[0] = (UChar) ch;
michael@0 661 args->converter->UCharErrorBufferLength = 1;
michael@0 662 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 663 break;
michael@0 664 }
michael@0 665 }
michael@0 666 }
michael@0 667 else
michael@0 668 {
michael@0 669 args->converter->toULength = (int8_t)i;
michael@0 670 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 671 break;
michael@0 672 }
michael@0 673 offsetNum += i;
michael@0 674 }
michael@0 675
michael@0 676 donefornow:
michael@0 677 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
michael@0 678 {
michael@0 679 /* End of target buffer */
michael@0 680 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 681 }
michael@0 682
michael@0 683 args->target = myTarget;
michael@0 684 args->source = (const char *) mySource;
michael@0 685 args->offsets = myOffsets;
michael@0 686 }
michael@0 687
michael@0 688 static void
michael@0 689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
michael@0 690 UErrorCode * err)
michael@0 691 {
michael@0 692 const UChar *mySource = args->source;
michael@0 693 unsigned char *myTarget;
michael@0 694 const UChar *sourceLimit = args->sourceLimit;
michael@0 695 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
michael@0 696 UChar32 ch, ch2;
michael@0 697 unsigned int indexToWrite;
michael@0 698 unsigned char temp[sizeof(uint32_t)];
michael@0 699
michael@0 700 if(mySource >= sourceLimit) {
michael@0 701 /* no input, nothing to do */
michael@0 702 return;
michael@0 703 }
michael@0 704
michael@0 705 /* write the BOM if necessary */
michael@0 706 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 707 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
michael@0 708 ucnv_fromUWriteBytes(args->converter,
michael@0 709 bom, 4,
michael@0 710 &args->target, args->targetLimit,
michael@0 711 &args->offsets, -1,
michael@0 712 err);
michael@0 713 args->converter->fromUnicodeStatus=0;
michael@0 714 }
michael@0 715
michael@0 716 myTarget = (unsigned char *) args->target;
michael@0 717 temp[3] = 0;
michael@0 718
michael@0 719 if (args->converter->fromUChar32)
michael@0 720 {
michael@0 721 ch = args->converter->fromUChar32;
michael@0 722 args->converter->fromUChar32 = 0;
michael@0 723 goto lowsurogate;
michael@0 724 }
michael@0 725
michael@0 726 while (mySource < sourceLimit && myTarget < targetLimit)
michael@0 727 {
michael@0 728 ch = *(mySource++);
michael@0 729
michael@0 730 if (U16_IS_SURROGATE(ch)) {
michael@0 731 if (U16_IS_LEAD(ch))
michael@0 732 {
michael@0 733 lowsurogate:
michael@0 734 if (mySource < sourceLimit)
michael@0 735 {
michael@0 736 ch2 = *mySource;
michael@0 737 if (U16_IS_TRAIL(ch2)) {
michael@0 738 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
michael@0 739 mySource++;
michael@0 740 }
michael@0 741 else {
michael@0 742 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 743 /* callback(illegal) */
michael@0 744 args->converter->fromUChar32 = ch;
michael@0 745 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 746 break;
michael@0 747 }
michael@0 748 }
michael@0 749 else {
michael@0 750 /* ran out of source */
michael@0 751 args->converter->fromUChar32 = ch;
michael@0 752 if (args->flush) {
michael@0 753 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 754 /* callback(illegal) */
michael@0 755 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 756 }
michael@0 757 break;
michael@0 758 }
michael@0 759 }
michael@0 760 else {
michael@0 761 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 762 /* callback(illegal) */
michael@0 763 args->converter->fromUChar32 = ch;
michael@0 764 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 765 break;
michael@0 766 }
michael@0 767 }
michael@0 768
michael@0 769 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
michael@0 770 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
michael@0 771 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
michael@0 772 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
michael@0 773
michael@0 774 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
michael@0 775 {
michael@0 776 if (myTarget < targetLimit)
michael@0 777 {
michael@0 778 *(myTarget++) = temp[indexToWrite];
michael@0 779 }
michael@0 780 else
michael@0 781 {
michael@0 782 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
michael@0 783 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 784 }
michael@0 785 }
michael@0 786 }
michael@0 787
michael@0 788 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
michael@0 789 {
michael@0 790 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 791 }
michael@0 792
michael@0 793 args->target = (char *) myTarget;
michael@0 794 args->source = mySource;
michael@0 795 }
michael@0 796
michael@0 797 static void
michael@0 798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
michael@0 799 UErrorCode * err)
michael@0 800 {
michael@0 801 const UChar *mySource = args->source;
michael@0 802 unsigned char *myTarget;
michael@0 803 int32_t *myOffsets;
michael@0 804 const UChar *sourceLimit = args->sourceLimit;
michael@0 805 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
michael@0 806 UChar32 ch, ch2;
michael@0 807 unsigned int indexToWrite;
michael@0 808 unsigned char temp[sizeof(uint32_t)];
michael@0 809 int32_t offsetNum = 0;
michael@0 810
michael@0 811 if(mySource >= sourceLimit) {
michael@0 812 /* no input, nothing to do */
michael@0 813 return;
michael@0 814 }
michael@0 815
michael@0 816 /* write the BOM if necessary */
michael@0 817 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 818 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
michael@0 819 ucnv_fromUWriteBytes(args->converter,
michael@0 820 bom, 4,
michael@0 821 &args->target, args->targetLimit,
michael@0 822 &args->offsets, -1,
michael@0 823 err);
michael@0 824 args->converter->fromUnicodeStatus=0;
michael@0 825 }
michael@0 826
michael@0 827 myTarget = (unsigned char *) args->target;
michael@0 828 myOffsets = args->offsets;
michael@0 829 temp[3] = 0;
michael@0 830
michael@0 831 if (args->converter->fromUChar32)
michael@0 832 {
michael@0 833 ch = args->converter->fromUChar32;
michael@0 834 args->converter->fromUChar32 = 0;
michael@0 835 goto lowsurogate;
michael@0 836 }
michael@0 837
michael@0 838 while (mySource < sourceLimit && myTarget < targetLimit)
michael@0 839 {
michael@0 840 ch = *(mySource++);
michael@0 841
michael@0 842 if (U16_IS_SURROGATE(ch)) {
michael@0 843 if (U16_IS_LEAD(ch))
michael@0 844 {
michael@0 845 lowsurogate:
michael@0 846 if (mySource < sourceLimit)
michael@0 847 {
michael@0 848 ch2 = *mySource;
michael@0 849 if (U16_IS_TRAIL(ch2))
michael@0 850 {
michael@0 851 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
michael@0 852 mySource++;
michael@0 853 }
michael@0 854 else {
michael@0 855 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 856 /* callback(illegal) */
michael@0 857 args->converter->fromUChar32 = ch;
michael@0 858 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 859 break;
michael@0 860 }
michael@0 861 }
michael@0 862 else {
michael@0 863 /* ran out of source */
michael@0 864 args->converter->fromUChar32 = ch;
michael@0 865 if (args->flush) {
michael@0 866 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 867 /* callback(illegal) */
michael@0 868 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 869 }
michael@0 870 break;
michael@0 871 }
michael@0 872 }
michael@0 873 else {
michael@0 874 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 875 /* callback(illegal) */
michael@0 876 args->converter->fromUChar32 = ch;
michael@0 877 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 878 break;
michael@0 879 }
michael@0 880 }
michael@0 881
michael@0 882 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
michael@0 883 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
michael@0 884 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
michael@0 885 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
michael@0 886
michael@0 887 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
michael@0 888 {
michael@0 889 if (myTarget < targetLimit)
michael@0 890 {
michael@0 891 *(myTarget++) = temp[indexToWrite];
michael@0 892 *(myOffsets++) = offsetNum;
michael@0 893 }
michael@0 894 else
michael@0 895 {
michael@0 896 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
michael@0 897 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 898 }
michael@0 899 }
michael@0 900 offsetNum = offsetNum + 1 + (temp[2] != 0);
michael@0 901 }
michael@0 902
michael@0 903 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
michael@0 904 {
michael@0 905 *err = U_BUFFER_OVERFLOW_ERROR;
michael@0 906 }
michael@0 907
michael@0 908 args->target = (char *) myTarget;
michael@0 909 args->source = mySource;
michael@0 910 args->offsets = myOffsets;
michael@0 911 }
michael@0 912
michael@0 913 static UChar32
michael@0 914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
michael@0 915 UErrorCode* err)
michael@0 916 {
michael@0 917 const uint8_t *mySource;
michael@0 918 UChar32 myUChar;
michael@0 919 int32_t length;
michael@0 920
michael@0 921 mySource = (const uint8_t *)args->source;
michael@0 922 if (mySource >= (const uint8_t *)args->sourceLimit)
michael@0 923 {
michael@0 924 /* no input */
michael@0 925 *err = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 926 return 0xffff;
michael@0 927 }
michael@0 928
michael@0 929 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
michael@0 930 if (length < 4)
michael@0 931 {
michael@0 932 /* got a partial character */
michael@0 933 uprv_memcpy(args->converter->toUBytes, mySource, length);
michael@0 934 args->converter->toULength = (int8_t)length;
michael@0 935 args->source = (const char *)(mySource + length);
michael@0 936 *err = U_TRUNCATED_CHAR_FOUND;
michael@0 937 return 0xffff;
michael@0 938 }
michael@0 939
michael@0 940 /* Don't even try to do a direct cast because the value may be on an odd address. */
michael@0 941 myUChar = ((UChar32)mySource[3] << 24)
michael@0 942 | ((UChar32)mySource[2] << 16)
michael@0 943 | ((UChar32)mySource[1] << 8)
michael@0 944 | ((UChar32)mySource[0]);
michael@0 945
michael@0 946 args->source = (const char *)(mySource + 4);
michael@0 947 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
michael@0 948 return myUChar;
michael@0 949 }
michael@0 950
michael@0 951 uprv_memcpy(args->converter->toUBytes, mySource, 4);
michael@0 952 args->converter->toULength = 4;
michael@0 953
michael@0 954 *err = U_ILLEGAL_CHAR_FOUND;
michael@0 955 return 0xffff;
michael@0 956 }
michael@0 957
michael@0 958 static const UConverterImpl _UTF32LEImpl = {
michael@0 959 UCNV_UTF32_LittleEndian,
michael@0 960
michael@0 961 NULL,
michael@0 962 NULL,
michael@0 963
michael@0 964 NULL,
michael@0 965 NULL,
michael@0 966 NULL,
michael@0 967
michael@0 968 T_UConverter_toUnicode_UTF32_LE,
michael@0 969 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
michael@0 970 T_UConverter_fromUnicode_UTF32_LE,
michael@0 971 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
michael@0 972 T_UConverter_getNextUChar_UTF32_LE,
michael@0 973
michael@0 974 NULL,
michael@0 975 NULL,
michael@0 976 NULL,
michael@0 977 NULL,
michael@0 978 ucnv_getNonSurrogateUnicodeSet
michael@0 979 };
michael@0 980
michael@0 981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
michael@0 982 static const UConverterStaticData _UTF32LEStaticData = {
michael@0 983 sizeof(UConverterStaticData),
michael@0 984 "UTF-32LE",
michael@0 985 1234,
michael@0 986 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
michael@0 987 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
michael@0 988 0,
michael@0 989 0,
michael@0 990 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 991 };
michael@0 992
michael@0 993
michael@0 994 const UConverterSharedData _UTF32LEData = {
michael@0 995 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 996 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
michael@0 997 0
michael@0 998 };
michael@0 999
michael@0 1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
michael@0 1001
michael@0 1002 /*
michael@0 1003 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
michael@0 1004 * accordingly.
michael@0 1005 *
michael@0 1006 * State values:
michael@0 1007 * 0 initial state
michael@0 1008 * 1 saw 00
michael@0 1009 * 2 saw 00 00
michael@0 1010 * 3 saw 00 00 FE
michael@0 1011 * 4 -
michael@0 1012 * 5 saw FF
michael@0 1013 * 6 saw FF FE
michael@0 1014 * 7 saw FF FE 00
michael@0 1015 * 8 UTF-32BE mode
michael@0 1016 * 9 UTF-32LE mode
michael@0 1017 *
michael@0 1018 * During detection: state&3==number of matching bytes so far.
michael@0 1019 *
michael@0 1020 * On output, emit U+FEFF as the first code point.
michael@0 1021 */
michael@0 1022
michael@0 1023 static void
michael@0 1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
michael@0 1025 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 1026 /* reset toUnicode: state=0 */
michael@0 1027 cnv->mode=0;
michael@0 1028 }
michael@0 1029 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 1030 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
michael@0 1031 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
michael@0 1032 }
michael@0 1033 }
michael@0 1034
michael@0 1035 static void
michael@0 1036 _UTF32Open(UConverter *cnv,
michael@0 1037 UConverterLoadArgs *pArgs,
michael@0 1038 UErrorCode *pErrorCode) {
michael@0 1039 _UTF32Reset(cnv, UCNV_RESET_BOTH);
michael@0 1040 }
michael@0 1041
/*
 * Both UTF-32 byte order marks back to back, indexed directly by the
 * detection state: bytes 0..3 are the big-endian BOM, bytes 4..7 the
 * little-endian BOM.
 */
static const char utf32BOM[8] = {
    0, 0, (char)0xfe, (char)0xff,   /* UTF-32BE: 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* UTF-32LE: FF FE 00 00 */
};
michael@0 1043
michael@0 1044 static void
michael@0 1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 1046 UErrorCode *pErrorCode) {
michael@0 1047 UConverter *cnv=pArgs->converter;
michael@0 1048 const char *source=pArgs->source;
michael@0 1049 const char *sourceLimit=pArgs->sourceLimit;
michael@0 1050 int32_t *offsets=pArgs->offsets;
michael@0 1051
michael@0 1052 int32_t state, offsetDelta;
michael@0 1053 char b;
michael@0 1054
michael@0 1055 state=cnv->mode;
michael@0 1056
michael@0 1057 /*
michael@0 1058 * If we detect a BOM in this buffer, then we must add the BOM size to the
michael@0 1059 * offsets because the actual converter function will not see and count the BOM.
michael@0 1060 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
michael@0 1061 */
michael@0 1062 offsetDelta=0;
michael@0 1063
michael@0 1064 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
michael@0 1065 switch(state) {
michael@0 1066 case 0:
michael@0 1067 b=*source;
michael@0 1068 if(b==0) {
michael@0 1069 state=1; /* could be 00 00 FE FF */
michael@0 1070 } else if(b==(char)0xff) {
michael@0 1071 state=5; /* could be FF FE 00 00 */
michael@0 1072 } else {
michael@0 1073 state=8; /* default to UTF-32BE */
michael@0 1074 continue;
michael@0 1075 }
michael@0 1076 ++source;
michael@0 1077 break;
michael@0 1078 case 1:
michael@0 1079 case 2:
michael@0 1080 case 3:
michael@0 1081 case 5:
michael@0 1082 case 6:
michael@0 1083 case 7:
michael@0 1084 if(*source==utf32BOM[state]) {
michael@0 1085 ++state;
michael@0 1086 ++source;
michael@0 1087 if(state==4) {
michael@0 1088 state=8; /* detect UTF-32BE */
michael@0 1089 offsetDelta=(int32_t)(source-pArgs->source);
michael@0 1090 } else if(state==8) {
michael@0 1091 state=9; /* detect UTF-32LE */
michael@0 1092 offsetDelta=(int32_t)(source-pArgs->source);
michael@0 1093 }
michael@0 1094 } else {
michael@0 1095 /* switch to UTF-32BE and pass the previous bytes */
michael@0 1096 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
michael@0 1097
michael@0 1098 /* reset the source */
michael@0 1099 source=pArgs->source;
michael@0 1100
michael@0 1101 if(count==(state&3)) {
michael@0 1102 /* simple: all in the same buffer, just reset source */
michael@0 1103 } else {
michael@0 1104 UBool oldFlush=pArgs->flush;
michael@0 1105
michael@0 1106 /* some of the bytes are from a previous buffer, replay those first */
michael@0 1107 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
michael@0 1108 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
michael@0 1109 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
michael@0 1110
michael@0 1111 /* no offsets: bytes from previous buffer, and not enough for output */
michael@0 1112 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
michael@0 1113
michael@0 1114 /* restore real pointers; pArgs->source will be set in case 8/9 */
michael@0 1115 pArgs->sourceLimit=sourceLimit;
michael@0 1116 pArgs->flush=oldFlush;
michael@0 1117 }
michael@0 1118 state=8;
michael@0 1119 continue;
michael@0 1120 }
michael@0 1121 break;
michael@0 1122 case 8:
michael@0 1123 /* call UTF-32BE */
michael@0 1124 pArgs->source=source;
michael@0 1125 if(offsets==NULL) {
michael@0 1126 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
michael@0 1127 } else {
michael@0 1128 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
michael@0 1129 }
michael@0 1130 source=pArgs->source;
michael@0 1131 break;
michael@0 1132 case 9:
michael@0 1133 /* call UTF-32LE */
michael@0 1134 pArgs->source=source;
michael@0 1135 if(offsets==NULL) {
michael@0 1136 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
michael@0 1137 } else {
michael@0 1138 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
michael@0 1139 }
michael@0 1140 source=pArgs->source;
michael@0 1141 break;
michael@0 1142 default:
michael@0 1143 break; /* does not occur */
michael@0 1144 }
michael@0 1145 }
michael@0 1146
michael@0 1147 /* add BOM size to offsets - see comment at offsetDelta declaration */
michael@0 1148 if(offsets!=NULL && offsetDelta!=0) {
michael@0 1149 int32_t *offsetsLimit=pArgs->offsets;
michael@0 1150 while(offsets<offsetsLimit) {
michael@0 1151 *offsets++ += offsetDelta;
michael@0 1152 }
michael@0 1153 }
michael@0 1154
michael@0 1155 pArgs->source=source;
michael@0 1156
michael@0 1157 if(source==sourceLimit && pArgs->flush) {
michael@0 1158 /* handle truncated input */
michael@0 1159 switch(state) {
michael@0 1160 case 0:
michael@0 1161 break; /* no input at all, nothing to do */
michael@0 1162 case 8:
michael@0 1163 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
michael@0 1164 break;
michael@0 1165 case 9:
michael@0 1166 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
michael@0 1167 break;
michael@0 1168 default:
michael@0 1169 /* handle 0<state<8: call UTF-32BE with too-short input */
michael@0 1170 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
michael@0 1171 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
michael@0 1172
michael@0 1173 /* no offsets: not enough for output */
michael@0 1174 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
michael@0 1175 pArgs->source=source;
michael@0 1176 pArgs->sourceLimit=sourceLimit;
michael@0 1177 state=8;
michael@0 1178 break;
michael@0 1179 }
michael@0 1180 }
michael@0 1181
michael@0 1182 cnv->mode=state;
michael@0 1183 }
michael@0 1184
michael@0 1185 static UChar32
michael@0 1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
michael@0 1187 UErrorCode *pErrorCode) {
michael@0 1188 switch(pArgs->converter->mode) {
michael@0 1189 case 8:
michael@0 1190 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
michael@0 1191 case 9:
michael@0 1192 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
michael@0 1193 default:
michael@0 1194 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
michael@0 1195 }
michael@0 1196 }
michael@0 1197
michael@0 1198 static const UConverterImpl _UTF32Impl = {
michael@0 1199 UCNV_UTF32,
michael@0 1200
michael@0 1201 NULL,
michael@0 1202 NULL,
michael@0 1203
michael@0 1204 _UTF32Open,
michael@0 1205 NULL,
michael@0 1206 _UTF32Reset,
michael@0 1207
michael@0 1208 _UTF32ToUnicodeWithOffsets,
michael@0 1209 _UTF32ToUnicodeWithOffsets,
michael@0 1210 #if U_IS_BIG_ENDIAN
michael@0 1211 T_UConverter_fromUnicode_UTF32_BE,
michael@0 1212 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
michael@0 1213 #else
michael@0 1214 T_UConverter_fromUnicode_UTF32_LE,
michael@0 1215 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
michael@0 1216 #endif
michael@0 1217 _UTF32GetNextUChar,
michael@0 1218
michael@0 1219 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
michael@0 1220 NULL,
michael@0 1221 NULL,
michael@0 1222 NULL,
michael@0 1223 ucnv_getNonSurrogateUnicodeSet
michael@0 1224 };
michael@0 1225
michael@0 1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
michael@0 1227 static const UConverterStaticData _UTF32StaticData = {
michael@0 1228 sizeof(UConverterStaticData),
michael@0 1229 "UTF-32",
michael@0 1230 1236,
michael@0 1231 UCNV_IBM, UCNV_UTF32, 4, 4,
michael@0 1232 #if U_IS_BIG_ENDIAN
michael@0 1233 { 0, 0, 0xff, 0xfd }, 4,
michael@0 1234 #else
michael@0 1235 { 0xfd, 0xff, 0, 0 }, 4,
michael@0 1236 #endif
michael@0 1237 FALSE, FALSE,
michael@0 1238 0,
michael@0 1239 0,
michael@0 1240 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 1241 };
michael@0 1242
michael@0 1243 const UConverterSharedData _UTF32Data = {
michael@0 1244 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 1245 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
michael@0 1246 0
michael@0 1247 };
michael@0 1248
michael@0 1249 #endif

mercurial