Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double-key logic according to private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
Open question: how to convince FindCookie callers to test the condition
and pass a nullptr when double-key logic is disabled.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2002-2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: ucnv_u32.c |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2002jul01 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | * |
michael@0 | 14 | * UTF-32 converter implementation. Used to be in ucnv_utf.c. |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | |
michael@0 | 19 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 20 | |
michael@0 | 21 | #include "unicode/ucnv.h" |
michael@0 | 22 | #include "unicode/utf.h" |
michael@0 | 23 | #include "ucnv_bld.h" |
michael@0 | 24 | #include "ucnv_cnv.h" |
michael@0 | 25 | #include "cmemory.h" |
michael@0 | 26 | |
/* Largest code point that fits in a single 16-bit unit, and largest valid code point. */
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF

/* Constants used when combining a UTF-16 surrogate pair into one code point. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Bias added after ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail:
 * it removes SURROGATE_LOW_START from the trail unit and adds HALF_BASE.
 * Derived from its parts instead of the literal 9216 so it cannot drift.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus that requests a BOM before the first output. */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
michael@0 | 41 | |
michael@0 | 42 | /* UTF-32BE ----------------------------------------------------------------- */ |
michael@0 | 43 | |
michael@0 | 44 | static void |
michael@0 | 45 | T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, |
michael@0 | 46 | UErrorCode * err) |
michael@0 | 47 | { |
michael@0 | 48 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 49 | UChar *myTarget = args->target; |
michael@0 | 50 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 51 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 52 | unsigned char *toUBytes = args->converter->toUBytes; |
michael@0 | 53 | uint32_t ch, i; |
michael@0 | 54 | |
michael@0 | 55 | /* Restore state of current sequence */ |
michael@0 | 56 | if (args->converter->toUnicodeStatus && myTarget < targetLimit) { |
michael@0 | 57 | i = args->converter->toULength; /* restore # of bytes consumed */ |
michael@0 | 58 | args->converter->toULength = 0; |
michael@0 | 59 | |
michael@0 | 60 | ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ |
michael@0 | 61 | args->converter->toUnicodeStatus = 0; |
michael@0 | 62 | goto morebytes; |
michael@0 | 63 | } |
michael@0 | 64 | |
michael@0 | 65 | while (mySource < sourceLimit && myTarget < targetLimit) { |
michael@0 | 66 | i = 0; |
michael@0 | 67 | ch = 0; |
michael@0 | 68 | morebytes: |
michael@0 | 69 | while (i < sizeof(uint32_t)) { |
michael@0 | 70 | if (mySource < sourceLimit) { |
michael@0 | 71 | ch = (ch << 8) | (uint8_t)(*mySource); |
michael@0 | 72 | toUBytes[i++] = (char) *(mySource++); |
michael@0 | 73 | } |
michael@0 | 74 | else { |
michael@0 | 75 | /* stores a partially calculated target*/ |
michael@0 | 76 | /* + 1 to make 0 a valid character */ |
michael@0 | 77 | args->converter->toUnicodeStatus = ch + 1; |
michael@0 | 78 | args->converter->toULength = (int8_t) i; |
michael@0 | 79 | goto donefornow; |
michael@0 | 80 | } |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
michael@0 | 84 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 85 | if (ch <= MAXIMUM_UCS2) |
michael@0 | 86 | { |
michael@0 | 87 | /* fits in 16 bits */ |
michael@0 | 88 | *(myTarget++) = (UChar) ch; |
michael@0 | 89 | } |
michael@0 | 90 | else { |
michael@0 | 91 | /* write out the surrogates */ |
michael@0 | 92 | *(myTarget++) = U16_LEAD(ch); |
michael@0 | 93 | ch = U16_TRAIL(ch); |
michael@0 | 94 | if (myTarget < targetLimit) { |
michael@0 | 95 | *(myTarget++) = (UChar)ch; |
michael@0 | 96 | } |
michael@0 | 97 | else { |
michael@0 | 98 | /* Put in overflow buffer (not handled here) */ |
michael@0 | 99 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 100 | args->converter->UCharErrorBufferLength = 1; |
michael@0 | 101 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 102 | break; |
michael@0 | 103 | } |
michael@0 | 104 | } |
michael@0 | 105 | } |
michael@0 | 106 | else { |
michael@0 | 107 | args->converter->toULength = (int8_t)i; |
michael@0 | 108 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 109 | break; |
michael@0 | 110 | } |
michael@0 | 111 | } |
michael@0 | 112 | |
michael@0 | 113 | donefornow: |
michael@0 | 114 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
michael@0 | 115 | /* End of target buffer */ |
michael@0 | 116 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | args->target = myTarget; |
michael@0 | 120 | args->source = (const char *) mySource; |
michael@0 | 121 | } |
michael@0 | 122 | |
michael@0 | 123 | static void |
michael@0 | 124 | T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
michael@0 | 125 | UErrorCode * err) |
michael@0 | 126 | { |
michael@0 | 127 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 128 | UChar *myTarget = args->target; |
michael@0 | 129 | int32_t *myOffsets = args->offsets; |
michael@0 | 130 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 131 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 132 | unsigned char *toUBytes = args->converter->toUBytes; |
michael@0 | 133 | uint32_t ch, i; |
michael@0 | 134 | int32_t offsetNum = 0; |
michael@0 | 135 | |
michael@0 | 136 | /* Restore state of current sequence */ |
michael@0 | 137 | if (args->converter->toUnicodeStatus && myTarget < targetLimit) { |
michael@0 | 138 | i = args->converter->toULength; /* restore # of bytes consumed */ |
michael@0 | 139 | args->converter->toULength = 0; |
michael@0 | 140 | |
michael@0 | 141 | ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ |
michael@0 | 142 | args->converter->toUnicodeStatus = 0; |
michael@0 | 143 | goto morebytes; |
michael@0 | 144 | } |
michael@0 | 145 | |
michael@0 | 146 | while (mySource < sourceLimit && myTarget < targetLimit) { |
michael@0 | 147 | i = 0; |
michael@0 | 148 | ch = 0; |
michael@0 | 149 | morebytes: |
michael@0 | 150 | while (i < sizeof(uint32_t)) { |
michael@0 | 151 | if (mySource < sourceLimit) { |
michael@0 | 152 | ch = (ch << 8) | (uint8_t)(*mySource); |
michael@0 | 153 | toUBytes[i++] = (char) *(mySource++); |
michael@0 | 154 | } |
michael@0 | 155 | else { |
michael@0 | 156 | /* stores a partially calculated target*/ |
michael@0 | 157 | /* + 1 to make 0 a valid character */ |
michael@0 | 158 | args->converter->toUnicodeStatus = ch + 1; |
michael@0 | 159 | args->converter->toULength = (int8_t) i; |
michael@0 | 160 | goto donefornow; |
michael@0 | 161 | } |
michael@0 | 162 | } |
michael@0 | 163 | |
michael@0 | 164 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
michael@0 | 165 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 166 | if (ch <= MAXIMUM_UCS2) { |
michael@0 | 167 | /* fits in 16 bits */ |
michael@0 | 168 | *(myTarget++) = (UChar) ch; |
michael@0 | 169 | *(myOffsets++) = offsetNum; |
michael@0 | 170 | } |
michael@0 | 171 | else { |
michael@0 | 172 | /* write out the surrogates */ |
michael@0 | 173 | *(myTarget++) = U16_LEAD(ch); |
michael@0 | 174 | *myOffsets++ = offsetNum; |
michael@0 | 175 | ch = U16_TRAIL(ch); |
michael@0 | 176 | if (myTarget < targetLimit) |
michael@0 | 177 | { |
michael@0 | 178 | *(myTarget++) = (UChar)ch; |
michael@0 | 179 | *(myOffsets++) = offsetNum; |
michael@0 | 180 | } |
michael@0 | 181 | else { |
michael@0 | 182 | /* Put in overflow buffer (not handled here) */ |
michael@0 | 183 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 184 | args->converter->UCharErrorBufferLength = 1; |
michael@0 | 185 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 186 | break; |
michael@0 | 187 | } |
michael@0 | 188 | } |
michael@0 | 189 | } |
michael@0 | 190 | else { |
michael@0 | 191 | args->converter->toULength = (int8_t)i; |
michael@0 | 192 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 193 | break; |
michael@0 | 194 | } |
michael@0 | 195 | offsetNum += i; |
michael@0 | 196 | } |
michael@0 | 197 | |
michael@0 | 198 | donefornow: |
michael@0 | 199 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 200 | { |
michael@0 | 201 | /* End of target buffer */ |
michael@0 | 202 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | args->target = myTarget; |
michael@0 | 206 | args->source = (const char *) mySource; |
michael@0 | 207 | args->offsets = myOffsets; |
michael@0 | 208 | } |
michael@0 | 209 | |
michael@0 | 210 | static void |
michael@0 | 211 | T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, |
michael@0 | 212 | UErrorCode * err) |
michael@0 | 213 | { |
michael@0 | 214 | const UChar *mySource = args->source; |
michael@0 | 215 | unsigned char *myTarget; |
michael@0 | 216 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 217 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
michael@0 | 218 | UChar32 ch, ch2; |
michael@0 | 219 | unsigned int indexToWrite; |
michael@0 | 220 | unsigned char temp[sizeof(uint32_t)]; |
michael@0 | 221 | |
michael@0 | 222 | if(mySource >= sourceLimit) { |
michael@0 | 223 | /* no input, nothing to do */ |
michael@0 | 224 | return; |
michael@0 | 225 | } |
michael@0 | 226 | |
michael@0 | 227 | /* write the BOM if necessary */ |
michael@0 | 228 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 229 | static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; |
michael@0 | 230 | ucnv_fromUWriteBytes(args->converter, |
michael@0 | 231 | bom, 4, |
michael@0 | 232 | &args->target, args->targetLimit, |
michael@0 | 233 | &args->offsets, -1, |
michael@0 | 234 | err); |
michael@0 | 235 | args->converter->fromUnicodeStatus=0; |
michael@0 | 236 | } |
michael@0 | 237 | |
michael@0 | 238 | myTarget = (unsigned char *) args->target; |
michael@0 | 239 | temp[0] = 0; |
michael@0 | 240 | |
michael@0 | 241 | if (args->converter->fromUChar32) { |
michael@0 | 242 | ch = args->converter->fromUChar32; |
michael@0 | 243 | args->converter->fromUChar32 = 0; |
michael@0 | 244 | goto lowsurogate; |
michael@0 | 245 | } |
michael@0 | 246 | |
michael@0 | 247 | while (mySource < sourceLimit && myTarget < targetLimit) { |
michael@0 | 248 | ch = *(mySource++); |
michael@0 | 249 | |
michael@0 | 250 | if (U_IS_SURROGATE(ch)) { |
michael@0 | 251 | if (U_IS_LEAD(ch)) { |
michael@0 | 252 | lowsurogate: |
michael@0 | 253 | if (mySource < sourceLimit) { |
michael@0 | 254 | ch2 = *mySource; |
michael@0 | 255 | if (U_IS_TRAIL(ch2)) { |
michael@0 | 256 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
michael@0 | 257 | mySource++; |
michael@0 | 258 | } |
michael@0 | 259 | else { |
michael@0 | 260 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 261 | /* callback(illegal) */ |
michael@0 | 262 | args->converter->fromUChar32 = ch; |
michael@0 | 263 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 264 | break; |
michael@0 | 265 | } |
michael@0 | 266 | } |
michael@0 | 267 | else { |
michael@0 | 268 | /* ran out of source */ |
michael@0 | 269 | args->converter->fromUChar32 = ch; |
michael@0 | 270 | if (args->flush) { |
michael@0 | 271 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 272 | /* callback(illegal) */ |
michael@0 | 273 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 274 | } |
michael@0 | 275 | break; |
michael@0 | 276 | } |
michael@0 | 277 | } |
michael@0 | 278 | else { |
michael@0 | 279 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 280 | /* callback(illegal) */ |
michael@0 | 281 | args->converter->fromUChar32 = ch; |
michael@0 | 282 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 283 | break; |
michael@0 | 284 | } |
michael@0 | 285 | } |
michael@0 | 286 | |
michael@0 | 287 | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
michael@0 | 288 | temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
michael@0 | 289 | temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 290 | temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 291 | |
michael@0 | 292 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
michael@0 | 293 | if (myTarget < targetLimit) { |
michael@0 | 294 | *(myTarget++) = temp[indexToWrite]; |
michael@0 | 295 | } |
michael@0 | 296 | else { |
michael@0 | 297 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
michael@0 | 298 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 299 | } |
michael@0 | 300 | } |
michael@0 | 301 | } |
michael@0 | 302 | |
michael@0 | 303 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
michael@0 | 304 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 305 | } |
michael@0 | 306 | |
michael@0 | 307 | args->target = (char *) myTarget; |
michael@0 | 308 | args->source = mySource; |
michael@0 | 309 | } |
michael@0 | 310 | |
michael@0 | 311 | static void |
michael@0 | 312 | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
michael@0 | 313 | UErrorCode * err) |
michael@0 | 314 | { |
michael@0 | 315 | const UChar *mySource = args->source; |
michael@0 | 316 | unsigned char *myTarget; |
michael@0 | 317 | int32_t *myOffsets; |
michael@0 | 318 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 319 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
michael@0 | 320 | UChar32 ch, ch2; |
michael@0 | 321 | int32_t offsetNum = 0; |
michael@0 | 322 | unsigned int indexToWrite; |
michael@0 | 323 | unsigned char temp[sizeof(uint32_t)]; |
michael@0 | 324 | |
michael@0 | 325 | if(mySource >= sourceLimit) { |
michael@0 | 326 | /* no input, nothing to do */ |
michael@0 | 327 | return; |
michael@0 | 328 | } |
michael@0 | 329 | |
michael@0 | 330 | /* write the BOM if necessary */ |
michael@0 | 331 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 332 | static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; |
michael@0 | 333 | ucnv_fromUWriteBytes(args->converter, |
michael@0 | 334 | bom, 4, |
michael@0 | 335 | &args->target, args->targetLimit, |
michael@0 | 336 | &args->offsets, -1, |
michael@0 | 337 | err); |
michael@0 | 338 | args->converter->fromUnicodeStatus=0; |
michael@0 | 339 | } |
michael@0 | 340 | |
michael@0 | 341 | myTarget = (unsigned char *) args->target; |
michael@0 | 342 | myOffsets = args->offsets; |
michael@0 | 343 | temp[0] = 0; |
michael@0 | 344 | |
michael@0 | 345 | if (args->converter->fromUChar32) { |
michael@0 | 346 | ch = args->converter->fromUChar32; |
michael@0 | 347 | args->converter->fromUChar32 = 0; |
michael@0 | 348 | goto lowsurogate; |
michael@0 | 349 | } |
michael@0 | 350 | |
michael@0 | 351 | while (mySource < sourceLimit && myTarget < targetLimit) { |
michael@0 | 352 | ch = *(mySource++); |
michael@0 | 353 | |
michael@0 | 354 | if (U_IS_SURROGATE(ch)) { |
michael@0 | 355 | if (U_IS_LEAD(ch)) { |
michael@0 | 356 | lowsurogate: |
michael@0 | 357 | if (mySource < sourceLimit) { |
michael@0 | 358 | ch2 = *mySource; |
michael@0 | 359 | if (U_IS_TRAIL(ch2)) { |
michael@0 | 360 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
michael@0 | 361 | mySource++; |
michael@0 | 362 | } |
michael@0 | 363 | else { |
michael@0 | 364 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 365 | /* callback(illegal) */ |
michael@0 | 366 | args->converter->fromUChar32 = ch; |
michael@0 | 367 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 368 | break; |
michael@0 | 369 | } |
michael@0 | 370 | } |
michael@0 | 371 | else { |
michael@0 | 372 | /* ran out of source */ |
michael@0 | 373 | args->converter->fromUChar32 = ch; |
michael@0 | 374 | if (args->flush) { |
michael@0 | 375 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 376 | /* callback(illegal) */ |
michael@0 | 377 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 378 | } |
michael@0 | 379 | break; |
michael@0 | 380 | } |
michael@0 | 381 | } |
michael@0 | 382 | else { |
michael@0 | 383 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 384 | /* callback(illegal) */ |
michael@0 | 385 | args->converter->fromUChar32 = ch; |
michael@0 | 386 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 387 | break; |
michael@0 | 388 | } |
michael@0 | 389 | } |
michael@0 | 390 | |
michael@0 | 391 | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
michael@0 | 392 | temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
michael@0 | 393 | temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 394 | temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 395 | |
michael@0 | 396 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
michael@0 | 397 | if (myTarget < targetLimit) { |
michael@0 | 398 | *(myTarget++) = temp[indexToWrite]; |
michael@0 | 399 | *(myOffsets++) = offsetNum; |
michael@0 | 400 | } |
michael@0 | 401 | else { |
michael@0 | 402 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
michael@0 | 403 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 404 | } |
michael@0 | 405 | } |
michael@0 | 406 | offsetNum = offsetNum + 1 + (temp[1] != 0); |
michael@0 | 407 | } |
michael@0 | 408 | |
michael@0 | 409 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
michael@0 | 410 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 411 | } |
michael@0 | 412 | |
michael@0 | 413 | args->target = (char *) myTarget; |
michael@0 | 414 | args->source = mySource; |
michael@0 | 415 | args->offsets = myOffsets; |
michael@0 | 416 | } |
michael@0 | 417 | |
michael@0 | 418 | static UChar32 |
michael@0 | 419 | T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, |
michael@0 | 420 | UErrorCode* err) |
michael@0 | 421 | { |
michael@0 | 422 | const uint8_t *mySource; |
michael@0 | 423 | UChar32 myUChar; |
michael@0 | 424 | int32_t length; |
michael@0 | 425 | |
michael@0 | 426 | mySource = (const uint8_t *)args->source; |
michael@0 | 427 | if (mySource >= (const uint8_t *)args->sourceLimit) |
michael@0 | 428 | { |
michael@0 | 429 | /* no input */ |
michael@0 | 430 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 431 | return 0xffff; |
michael@0 | 432 | } |
michael@0 | 433 | |
michael@0 | 434 | length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
michael@0 | 435 | if (length < 4) |
michael@0 | 436 | { |
michael@0 | 437 | /* got a partial character */ |
michael@0 | 438 | uprv_memcpy(args->converter->toUBytes, mySource, length); |
michael@0 | 439 | args->converter->toULength = (int8_t)length; |
michael@0 | 440 | args->source = (const char *)(mySource + length); |
michael@0 | 441 | *err = U_TRUNCATED_CHAR_FOUND; |
michael@0 | 442 | return 0xffff; |
michael@0 | 443 | } |
michael@0 | 444 | |
michael@0 | 445 | /* Don't even try to do a direct cast because the value may be on an odd address. */ |
michael@0 | 446 | myUChar = ((UChar32)mySource[0] << 24) |
michael@0 | 447 | | ((UChar32)mySource[1] << 16) |
michael@0 | 448 | | ((UChar32)mySource[2] << 8) |
michael@0 | 449 | | ((UChar32)mySource[3]); |
michael@0 | 450 | |
michael@0 | 451 | args->source = (const char *)(mySource + 4); |
michael@0 | 452 | if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
michael@0 | 453 | return myUChar; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | uprv_memcpy(args->converter->toUBytes, mySource, 4); |
michael@0 | 457 | args->converter->toULength = 4; |
michael@0 | 458 | |
michael@0 | 459 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 460 | return 0xffff; |
michael@0 | 461 | } |
michael@0 | 462 | |
michael@0 | 463 | static const UConverterImpl _UTF32BEImpl = { |
michael@0 | 464 | UCNV_UTF32_BigEndian, |
michael@0 | 465 | |
michael@0 | 466 | NULL, |
michael@0 | 467 | NULL, |
michael@0 | 468 | |
michael@0 | 469 | NULL, |
michael@0 | 470 | NULL, |
michael@0 | 471 | NULL, |
michael@0 | 472 | |
michael@0 | 473 | T_UConverter_toUnicode_UTF32_BE, |
michael@0 | 474 | T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, |
michael@0 | 475 | T_UConverter_fromUnicode_UTF32_BE, |
michael@0 | 476 | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
michael@0 | 477 | T_UConverter_getNextUChar_UTF32_BE, |
michael@0 | 478 | |
michael@0 | 479 | NULL, |
michael@0 | 480 | NULL, |
michael@0 | 481 | NULL, |
michael@0 | 482 | NULL, |
michael@0 | 483 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 484 | }; |
michael@0 | 485 | |
michael@0 | 486 | /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ |
michael@0 | 487 | static const UConverterStaticData _UTF32BEStaticData = { |
michael@0 | 488 | sizeof(UConverterStaticData), |
michael@0 | 489 | "UTF-32BE", |
michael@0 | 490 | 1232, |
michael@0 | 491 | UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, |
michael@0 | 492 | { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, |
michael@0 | 493 | 0, |
michael@0 | 494 | 0, |
michael@0 | 495 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 496 | }; |
michael@0 | 497 | |
michael@0 | 498 | const UConverterSharedData _UTF32BEData = { |
michael@0 | 499 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 500 | NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, |
michael@0 | 501 | 0 |
michael@0 | 502 | }; |
michael@0 | 503 | |
michael@0 | 504 | /* UTF-32LE ---------------------------------------------------------- */ |
michael@0 | 505 | |
michael@0 | 506 | static void |
michael@0 | 507 | T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, |
michael@0 | 508 | UErrorCode * err) |
michael@0 | 509 | { |
michael@0 | 510 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 511 | UChar *myTarget = args->target; |
michael@0 | 512 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 513 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 514 | unsigned char *toUBytes = args->converter->toUBytes; |
michael@0 | 515 | uint32_t ch, i; |
michael@0 | 516 | |
michael@0 | 517 | /* Restore state of current sequence */ |
michael@0 | 518 | if (args->converter->toUnicodeStatus && myTarget < targetLimit) |
michael@0 | 519 | { |
michael@0 | 520 | i = args->converter->toULength; /* restore # of bytes consumed */ |
michael@0 | 521 | args->converter->toULength = 0; |
michael@0 | 522 | |
michael@0 | 523 | /* Stores the previously calculated ch from a previous call*/ |
michael@0 | 524 | ch = args->converter->toUnicodeStatus - 1; |
michael@0 | 525 | args->converter->toUnicodeStatus = 0; |
michael@0 | 526 | goto morebytes; |
michael@0 | 527 | } |
michael@0 | 528 | |
michael@0 | 529 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 530 | { |
michael@0 | 531 | i = 0; |
michael@0 | 532 | ch = 0; |
michael@0 | 533 | morebytes: |
michael@0 | 534 | while (i < sizeof(uint32_t)) |
michael@0 | 535 | { |
michael@0 | 536 | if (mySource < sourceLimit) |
michael@0 | 537 | { |
michael@0 | 538 | ch |= ((uint8_t)(*mySource)) << (i * 8); |
michael@0 | 539 | toUBytes[i++] = (char) *(mySource++); |
michael@0 | 540 | } |
michael@0 | 541 | else |
michael@0 | 542 | { |
michael@0 | 543 | /* stores a partially calculated target*/ |
michael@0 | 544 | /* + 1 to make 0 a valid character */ |
michael@0 | 545 | args->converter->toUnicodeStatus = ch + 1; |
michael@0 | 546 | args->converter->toULength = (int8_t) i; |
michael@0 | 547 | goto donefornow; |
michael@0 | 548 | } |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
michael@0 | 552 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 553 | if (ch <= MAXIMUM_UCS2) { |
michael@0 | 554 | /* fits in 16 bits */ |
michael@0 | 555 | *(myTarget++) = (UChar) ch; |
michael@0 | 556 | } |
michael@0 | 557 | else { |
michael@0 | 558 | /* write out the surrogates */ |
michael@0 | 559 | *(myTarget++) = U16_LEAD(ch); |
michael@0 | 560 | ch = U16_TRAIL(ch); |
michael@0 | 561 | if (myTarget < targetLimit) { |
michael@0 | 562 | *(myTarget++) = (UChar)ch; |
michael@0 | 563 | } |
michael@0 | 564 | else { |
michael@0 | 565 | /* Put in overflow buffer (not handled here) */ |
michael@0 | 566 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 567 | args->converter->UCharErrorBufferLength = 1; |
michael@0 | 568 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 569 | break; |
michael@0 | 570 | } |
michael@0 | 571 | } |
michael@0 | 572 | } |
michael@0 | 573 | else { |
michael@0 | 574 | args->converter->toULength = (int8_t)i; |
michael@0 | 575 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 576 | break; |
michael@0 | 577 | } |
michael@0 | 578 | } |
michael@0 | 579 | |
michael@0 | 580 | donefornow: |
michael@0 | 581 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 582 | { |
michael@0 | 583 | /* End of target buffer */ |
michael@0 | 584 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 585 | } |
michael@0 | 586 | |
michael@0 | 587 | args->target = myTarget; |
michael@0 | 588 | args->source = (const char *) mySource; |
michael@0 | 589 | } |
michael@0 | 590 | |
michael@0 | 591 | static void |
michael@0 | 592 | T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
michael@0 | 593 | UErrorCode * err) |
michael@0 | 594 | { |
michael@0 | 595 | const unsigned char *mySource = (unsigned char *) args->source; |
michael@0 | 596 | UChar *myTarget = args->target; |
michael@0 | 597 | int32_t *myOffsets = args->offsets; |
michael@0 | 598 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
michael@0 | 599 | const UChar *targetLimit = args->targetLimit; |
michael@0 | 600 | unsigned char *toUBytes = args->converter->toUBytes; |
michael@0 | 601 | uint32_t ch, i; |
michael@0 | 602 | int32_t offsetNum = 0; |
michael@0 | 603 | |
michael@0 | 604 | /* Restore state of current sequence */ |
michael@0 | 605 | if (args->converter->toUnicodeStatus && myTarget < targetLimit) |
michael@0 | 606 | { |
michael@0 | 607 | i = args->converter->toULength; /* restore # of bytes consumed */ |
michael@0 | 608 | args->converter->toULength = 0; |
michael@0 | 609 | |
michael@0 | 610 | /* Stores the previously calculated ch from a previous call*/ |
michael@0 | 611 | ch = args->converter->toUnicodeStatus - 1; |
michael@0 | 612 | args->converter->toUnicodeStatus = 0; |
michael@0 | 613 | goto morebytes; |
michael@0 | 614 | } |
michael@0 | 615 | |
michael@0 | 616 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 617 | { |
michael@0 | 618 | i = 0; |
michael@0 | 619 | ch = 0; |
michael@0 | 620 | morebytes: |
michael@0 | 621 | while (i < sizeof(uint32_t)) |
michael@0 | 622 | { |
michael@0 | 623 | if (mySource < sourceLimit) |
michael@0 | 624 | { |
michael@0 | 625 | ch |= ((uint8_t)(*mySource)) << (i * 8); |
michael@0 | 626 | toUBytes[i++] = (char) *(mySource++); |
michael@0 | 627 | } |
michael@0 | 628 | else |
michael@0 | 629 | { |
michael@0 | 630 | /* stores a partially calculated target*/ |
michael@0 | 631 | /* + 1 to make 0 a valid character */ |
michael@0 | 632 | args->converter->toUnicodeStatus = ch + 1; |
michael@0 | 633 | args->converter->toULength = (int8_t) i; |
michael@0 | 634 | goto donefornow; |
michael@0 | 635 | } |
michael@0 | 636 | } |
michael@0 | 637 | |
michael@0 | 638 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) |
michael@0 | 639 | { |
michael@0 | 640 | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
michael@0 | 641 | if (ch <= MAXIMUM_UCS2) |
michael@0 | 642 | { |
michael@0 | 643 | /* fits in 16 bits */ |
michael@0 | 644 | *(myTarget++) = (UChar) ch; |
michael@0 | 645 | *(myOffsets++) = offsetNum; |
michael@0 | 646 | } |
michael@0 | 647 | else { |
michael@0 | 648 | /* write out the surrogates */ |
michael@0 | 649 | *(myTarget++) = U16_LEAD(ch); |
michael@0 | 650 | *(myOffsets++) = offsetNum; |
michael@0 | 651 | ch = U16_TRAIL(ch); |
michael@0 | 652 | if (myTarget < targetLimit) |
michael@0 | 653 | { |
michael@0 | 654 | *(myTarget++) = (UChar)ch; |
michael@0 | 655 | *(myOffsets++) = offsetNum; |
michael@0 | 656 | } |
michael@0 | 657 | else |
michael@0 | 658 | { |
michael@0 | 659 | /* Put in overflow buffer (not handled here) */ |
michael@0 | 660 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
michael@0 | 661 | args->converter->UCharErrorBufferLength = 1; |
michael@0 | 662 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 663 | break; |
michael@0 | 664 | } |
michael@0 | 665 | } |
michael@0 | 666 | } |
michael@0 | 667 | else |
michael@0 | 668 | { |
michael@0 | 669 | args->converter->toULength = (int8_t)i; |
michael@0 | 670 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 671 | break; |
michael@0 | 672 | } |
michael@0 | 673 | offsetNum += i; |
michael@0 | 674 | } |
michael@0 | 675 | |
michael@0 | 676 | donefornow: |
michael@0 | 677 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 678 | { |
michael@0 | 679 | /* End of target buffer */ |
michael@0 | 680 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 681 | } |
michael@0 | 682 | |
michael@0 | 683 | args->target = myTarget; |
michael@0 | 684 | args->source = (const char *) mySource; |
michael@0 | 685 | args->offsets = myOffsets; |
michael@0 | 686 | } |
michael@0 | 687 | |
michael@0 | 688 | static void |
michael@0 | 689 | T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, |
michael@0 | 690 | UErrorCode * err) |
michael@0 | 691 | { |
michael@0 | 692 | const UChar *mySource = args->source; |
michael@0 | 693 | unsigned char *myTarget; |
michael@0 | 694 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 695 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
michael@0 | 696 | UChar32 ch, ch2; |
michael@0 | 697 | unsigned int indexToWrite; |
michael@0 | 698 | unsigned char temp[sizeof(uint32_t)]; |
michael@0 | 699 | |
michael@0 | 700 | if(mySource >= sourceLimit) { |
michael@0 | 701 | /* no input, nothing to do */ |
michael@0 | 702 | return; |
michael@0 | 703 | } |
michael@0 | 704 | |
michael@0 | 705 | /* write the BOM if necessary */ |
michael@0 | 706 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 707 | static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; |
michael@0 | 708 | ucnv_fromUWriteBytes(args->converter, |
michael@0 | 709 | bom, 4, |
michael@0 | 710 | &args->target, args->targetLimit, |
michael@0 | 711 | &args->offsets, -1, |
michael@0 | 712 | err); |
michael@0 | 713 | args->converter->fromUnicodeStatus=0; |
michael@0 | 714 | } |
michael@0 | 715 | |
michael@0 | 716 | myTarget = (unsigned char *) args->target; |
michael@0 | 717 | temp[3] = 0; |
michael@0 | 718 | |
michael@0 | 719 | if (args->converter->fromUChar32) |
michael@0 | 720 | { |
michael@0 | 721 | ch = args->converter->fromUChar32; |
michael@0 | 722 | args->converter->fromUChar32 = 0; |
michael@0 | 723 | goto lowsurogate; |
michael@0 | 724 | } |
michael@0 | 725 | |
michael@0 | 726 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 727 | { |
michael@0 | 728 | ch = *(mySource++); |
michael@0 | 729 | |
michael@0 | 730 | if (U16_IS_SURROGATE(ch)) { |
michael@0 | 731 | if (U16_IS_LEAD(ch)) |
michael@0 | 732 | { |
michael@0 | 733 | lowsurogate: |
michael@0 | 734 | if (mySource < sourceLimit) |
michael@0 | 735 | { |
michael@0 | 736 | ch2 = *mySource; |
michael@0 | 737 | if (U16_IS_TRAIL(ch2)) { |
michael@0 | 738 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
michael@0 | 739 | mySource++; |
michael@0 | 740 | } |
michael@0 | 741 | else { |
michael@0 | 742 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 743 | /* callback(illegal) */ |
michael@0 | 744 | args->converter->fromUChar32 = ch; |
michael@0 | 745 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 746 | break; |
michael@0 | 747 | } |
michael@0 | 748 | } |
michael@0 | 749 | else { |
michael@0 | 750 | /* ran out of source */ |
michael@0 | 751 | args->converter->fromUChar32 = ch; |
michael@0 | 752 | if (args->flush) { |
michael@0 | 753 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 754 | /* callback(illegal) */ |
michael@0 | 755 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 756 | } |
michael@0 | 757 | break; |
michael@0 | 758 | } |
michael@0 | 759 | } |
michael@0 | 760 | else { |
michael@0 | 761 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 762 | /* callback(illegal) */ |
michael@0 | 763 | args->converter->fromUChar32 = ch; |
michael@0 | 764 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 765 | break; |
michael@0 | 766 | } |
michael@0 | 767 | } |
michael@0 | 768 | |
michael@0 | 769 | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
michael@0 | 770 | temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
michael@0 | 771 | temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 772 | temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 773 | |
michael@0 | 774 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
michael@0 | 775 | { |
michael@0 | 776 | if (myTarget < targetLimit) |
michael@0 | 777 | { |
michael@0 | 778 | *(myTarget++) = temp[indexToWrite]; |
michael@0 | 779 | } |
michael@0 | 780 | else |
michael@0 | 781 | { |
michael@0 | 782 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
michael@0 | 783 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 784 | } |
michael@0 | 785 | } |
michael@0 | 786 | } |
michael@0 | 787 | |
michael@0 | 788 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 789 | { |
michael@0 | 790 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 791 | } |
michael@0 | 792 | |
michael@0 | 793 | args->target = (char *) myTarget; |
michael@0 | 794 | args->source = mySource; |
michael@0 | 795 | } |
michael@0 | 796 | |
michael@0 | 797 | static void |
michael@0 | 798 | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
michael@0 | 799 | UErrorCode * err) |
michael@0 | 800 | { |
michael@0 | 801 | const UChar *mySource = args->source; |
michael@0 | 802 | unsigned char *myTarget; |
michael@0 | 803 | int32_t *myOffsets; |
michael@0 | 804 | const UChar *sourceLimit = args->sourceLimit; |
michael@0 | 805 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
michael@0 | 806 | UChar32 ch, ch2; |
michael@0 | 807 | unsigned int indexToWrite; |
michael@0 | 808 | unsigned char temp[sizeof(uint32_t)]; |
michael@0 | 809 | int32_t offsetNum = 0; |
michael@0 | 810 | |
michael@0 | 811 | if(mySource >= sourceLimit) { |
michael@0 | 812 | /* no input, nothing to do */ |
michael@0 | 813 | return; |
michael@0 | 814 | } |
michael@0 | 815 | |
michael@0 | 816 | /* write the BOM if necessary */ |
michael@0 | 817 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 818 | static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; |
michael@0 | 819 | ucnv_fromUWriteBytes(args->converter, |
michael@0 | 820 | bom, 4, |
michael@0 | 821 | &args->target, args->targetLimit, |
michael@0 | 822 | &args->offsets, -1, |
michael@0 | 823 | err); |
michael@0 | 824 | args->converter->fromUnicodeStatus=0; |
michael@0 | 825 | } |
michael@0 | 826 | |
michael@0 | 827 | myTarget = (unsigned char *) args->target; |
michael@0 | 828 | myOffsets = args->offsets; |
michael@0 | 829 | temp[3] = 0; |
michael@0 | 830 | |
michael@0 | 831 | if (args->converter->fromUChar32) |
michael@0 | 832 | { |
michael@0 | 833 | ch = args->converter->fromUChar32; |
michael@0 | 834 | args->converter->fromUChar32 = 0; |
michael@0 | 835 | goto lowsurogate; |
michael@0 | 836 | } |
michael@0 | 837 | |
michael@0 | 838 | while (mySource < sourceLimit && myTarget < targetLimit) |
michael@0 | 839 | { |
michael@0 | 840 | ch = *(mySource++); |
michael@0 | 841 | |
michael@0 | 842 | if (U16_IS_SURROGATE(ch)) { |
michael@0 | 843 | if (U16_IS_LEAD(ch)) |
michael@0 | 844 | { |
michael@0 | 845 | lowsurogate: |
michael@0 | 846 | if (mySource < sourceLimit) |
michael@0 | 847 | { |
michael@0 | 848 | ch2 = *mySource; |
michael@0 | 849 | if (U16_IS_TRAIL(ch2)) |
michael@0 | 850 | { |
michael@0 | 851 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
michael@0 | 852 | mySource++; |
michael@0 | 853 | } |
michael@0 | 854 | else { |
michael@0 | 855 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 856 | /* callback(illegal) */ |
michael@0 | 857 | args->converter->fromUChar32 = ch; |
michael@0 | 858 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 859 | break; |
michael@0 | 860 | } |
michael@0 | 861 | } |
michael@0 | 862 | else { |
michael@0 | 863 | /* ran out of source */ |
michael@0 | 864 | args->converter->fromUChar32 = ch; |
michael@0 | 865 | if (args->flush) { |
michael@0 | 866 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 867 | /* callback(illegal) */ |
michael@0 | 868 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 869 | } |
michael@0 | 870 | break; |
michael@0 | 871 | } |
michael@0 | 872 | } |
michael@0 | 873 | else { |
michael@0 | 874 | /* this is an unmatched trail code unit (2nd surrogate) */ |
michael@0 | 875 | /* callback(illegal) */ |
michael@0 | 876 | args->converter->fromUChar32 = ch; |
michael@0 | 877 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 878 | break; |
michael@0 | 879 | } |
michael@0 | 880 | } |
michael@0 | 881 | |
michael@0 | 882 | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
michael@0 | 883 | temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
michael@0 | 884 | temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 885 | temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
michael@0 | 886 | |
michael@0 | 887 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
michael@0 | 888 | { |
michael@0 | 889 | if (myTarget < targetLimit) |
michael@0 | 890 | { |
michael@0 | 891 | *(myTarget++) = temp[indexToWrite]; |
michael@0 | 892 | *(myOffsets++) = offsetNum; |
michael@0 | 893 | } |
michael@0 | 894 | else |
michael@0 | 895 | { |
michael@0 | 896 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
michael@0 | 897 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 898 | } |
michael@0 | 899 | } |
michael@0 | 900 | offsetNum = offsetNum + 1 + (temp[2] != 0); |
michael@0 | 901 | } |
michael@0 | 902 | |
michael@0 | 903 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
michael@0 | 904 | { |
michael@0 | 905 | *err = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 906 | } |
michael@0 | 907 | |
michael@0 | 908 | args->target = (char *) myTarget; |
michael@0 | 909 | args->source = mySource; |
michael@0 | 910 | args->offsets = myOffsets; |
michael@0 | 911 | } |
michael@0 | 912 | |
michael@0 | 913 | static UChar32 |
michael@0 | 914 | T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, |
michael@0 | 915 | UErrorCode* err) |
michael@0 | 916 | { |
michael@0 | 917 | const uint8_t *mySource; |
michael@0 | 918 | UChar32 myUChar; |
michael@0 | 919 | int32_t length; |
michael@0 | 920 | |
michael@0 | 921 | mySource = (const uint8_t *)args->source; |
michael@0 | 922 | if (mySource >= (const uint8_t *)args->sourceLimit) |
michael@0 | 923 | { |
michael@0 | 924 | /* no input */ |
michael@0 | 925 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 926 | return 0xffff; |
michael@0 | 927 | } |
michael@0 | 928 | |
michael@0 | 929 | length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
michael@0 | 930 | if (length < 4) |
michael@0 | 931 | { |
michael@0 | 932 | /* got a partial character */ |
michael@0 | 933 | uprv_memcpy(args->converter->toUBytes, mySource, length); |
michael@0 | 934 | args->converter->toULength = (int8_t)length; |
michael@0 | 935 | args->source = (const char *)(mySource + length); |
michael@0 | 936 | *err = U_TRUNCATED_CHAR_FOUND; |
michael@0 | 937 | return 0xffff; |
michael@0 | 938 | } |
michael@0 | 939 | |
michael@0 | 940 | /* Don't even try to do a direct cast because the value may be on an odd address. */ |
michael@0 | 941 | myUChar = ((UChar32)mySource[3] << 24) |
michael@0 | 942 | | ((UChar32)mySource[2] << 16) |
michael@0 | 943 | | ((UChar32)mySource[1] << 8) |
michael@0 | 944 | | ((UChar32)mySource[0]); |
michael@0 | 945 | |
michael@0 | 946 | args->source = (const char *)(mySource + 4); |
michael@0 | 947 | if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
michael@0 | 948 | return myUChar; |
michael@0 | 949 | } |
michael@0 | 950 | |
michael@0 | 951 | uprv_memcpy(args->converter->toUBytes, mySource, 4); |
michael@0 | 952 | args->converter->toULength = 4; |
michael@0 | 953 | |
michael@0 | 954 | *err = U_ILLEGAL_CHAR_FOUND; |
michael@0 | 955 | return 0xffff; |
michael@0 | 956 | } |
michael@0 | 957 | |
michael@0 | 958 | static const UConverterImpl _UTF32LEImpl = { |
michael@0 | 959 | UCNV_UTF32_LittleEndian, |
michael@0 | 960 | |
michael@0 | 961 | NULL, |
michael@0 | 962 | NULL, |
michael@0 | 963 | |
michael@0 | 964 | NULL, |
michael@0 | 965 | NULL, |
michael@0 | 966 | NULL, |
michael@0 | 967 | |
michael@0 | 968 | T_UConverter_toUnicode_UTF32_LE, |
michael@0 | 969 | T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, |
michael@0 | 970 | T_UConverter_fromUnicode_UTF32_LE, |
michael@0 | 971 | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
michael@0 | 972 | T_UConverter_getNextUChar_UTF32_LE, |
michael@0 | 973 | |
michael@0 | 974 | NULL, |
michael@0 | 975 | NULL, |
michael@0 | 976 | NULL, |
michael@0 | 977 | NULL, |
michael@0 | 978 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 979 | }; |
michael@0 | 980 | |
michael@0 | 981 | /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ |
michael@0 | 982 | static const UConverterStaticData _UTF32LEStaticData = { |
michael@0 | 983 | sizeof(UConverterStaticData), |
michael@0 | 984 | "UTF-32LE", |
michael@0 | 985 | 1234, |
michael@0 | 986 | UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, |
michael@0 | 987 | { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, |
michael@0 | 988 | 0, |
michael@0 | 989 | 0, |
michael@0 | 990 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 991 | }; |
michael@0 | 992 | |
michael@0 | 993 | |
michael@0 | 994 | const UConverterSharedData _UTF32LEData = { |
michael@0 | 995 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 996 | NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, |
michael@0 | 997 | 0 |
michael@0 | 998 | }; |
michael@0 | 999 | |
michael@0 | 1000 | /* UTF-32 (Detect BOM) ------------------------------------------------------ */ |
michael@0 | 1001 | |
michael@0 | 1002 | /* |
michael@0 | 1003 | * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE |
michael@0 | 1004 | * accordingly. |
michael@0 | 1005 | * |
michael@0 | 1006 | * State values: |
michael@0 | 1007 | * 0 initial state |
michael@0 | 1008 | * 1 saw 00 |
michael@0 | 1009 | * 2 saw 00 00 |
michael@0 | 1010 | * 3 saw 00 00 FE |
michael@0 | 1011 | * 4 - |
michael@0 | 1012 | * 5 saw FF |
michael@0 | 1013 | * 6 saw FF FE |
michael@0 | 1014 | * 7 saw FF FE 00 |
michael@0 | 1015 | * 8 UTF-32BE mode |
michael@0 | 1016 | * 9 UTF-32LE mode |
michael@0 | 1017 | * |
michael@0 | 1018 | * During detection: state&3==number of matching bytes so far. |
michael@0 | 1019 | * |
michael@0 | 1020 | * On output, emit U+FEFF as the first code point. |
michael@0 | 1021 | */ |
michael@0 | 1022 | |
michael@0 | 1023 | static void |
michael@0 | 1024 | _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { |
michael@0 | 1025 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 1026 | /* reset toUnicode: state=0 */ |
michael@0 | 1027 | cnv->mode=0; |
michael@0 | 1028 | } |
michael@0 | 1029 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 1030 | /* reset fromUnicode: prepare to output the UTF-32PE BOM */ |
michael@0 | 1031 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
michael@0 | 1032 | } |
michael@0 | 1033 | } |
michael@0 | 1034 | |
michael@0 | 1035 | static void |
michael@0 | 1036 | _UTF32Open(UConverter *cnv, |
michael@0 | 1037 | UConverterLoadArgs *pArgs, |
michael@0 | 1038 | UErrorCode *pErrorCode) { |
michael@0 | 1039 | _UTF32Reset(cnv, UCNV_RESET_BOTH); |
michael@0 | 1040 | } |
michael@0 | 1041 | |
/*
 * Both byte order marks back to back: bytes 0..3 are the big-endian BOM
 * (00 00 FE FF), bytes 4..7 the little-endian BOM (FF FE 00 00).
 * The detection state doubles as an index into this table.
 */
static const char utf32BOM[8] = {
    0, 0, (char)0xfe, (char)0xff,
    (char)0xff, (char)0xfe, 0, 0
};
michael@0 | 1043 | |
michael@0 | 1044 | static void |
michael@0 | 1045 | _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 1046 | UErrorCode *pErrorCode) { |
michael@0 | 1047 | UConverter *cnv=pArgs->converter; |
michael@0 | 1048 | const char *source=pArgs->source; |
michael@0 | 1049 | const char *sourceLimit=pArgs->sourceLimit; |
michael@0 | 1050 | int32_t *offsets=pArgs->offsets; |
michael@0 | 1051 | |
michael@0 | 1052 | int32_t state, offsetDelta; |
michael@0 | 1053 | char b; |
michael@0 | 1054 | |
michael@0 | 1055 | state=cnv->mode; |
michael@0 | 1056 | |
michael@0 | 1057 | /* |
michael@0 | 1058 | * If we detect a BOM in this buffer, then we must add the BOM size to the |
michael@0 | 1059 | * offsets because the actual converter function will not see and count the BOM. |
michael@0 | 1060 | * offsetDelta will have the number of the BOM bytes that are in the current buffer. |
michael@0 | 1061 | */ |
michael@0 | 1062 | offsetDelta=0; |
michael@0 | 1063 | |
michael@0 | 1064 | while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { |
michael@0 | 1065 | switch(state) { |
michael@0 | 1066 | case 0: |
michael@0 | 1067 | b=*source; |
michael@0 | 1068 | if(b==0) { |
michael@0 | 1069 | state=1; /* could be 00 00 FE FF */ |
michael@0 | 1070 | } else if(b==(char)0xff) { |
michael@0 | 1071 | state=5; /* could be FF FE 00 00 */ |
michael@0 | 1072 | } else { |
michael@0 | 1073 | state=8; /* default to UTF-32BE */ |
michael@0 | 1074 | continue; |
michael@0 | 1075 | } |
michael@0 | 1076 | ++source; |
michael@0 | 1077 | break; |
michael@0 | 1078 | case 1: |
michael@0 | 1079 | case 2: |
michael@0 | 1080 | case 3: |
michael@0 | 1081 | case 5: |
michael@0 | 1082 | case 6: |
michael@0 | 1083 | case 7: |
michael@0 | 1084 | if(*source==utf32BOM[state]) { |
michael@0 | 1085 | ++state; |
michael@0 | 1086 | ++source; |
michael@0 | 1087 | if(state==4) { |
michael@0 | 1088 | state=8; /* detect UTF-32BE */ |
michael@0 | 1089 | offsetDelta=(int32_t)(source-pArgs->source); |
michael@0 | 1090 | } else if(state==8) { |
michael@0 | 1091 | state=9; /* detect UTF-32LE */ |
michael@0 | 1092 | offsetDelta=(int32_t)(source-pArgs->source); |
michael@0 | 1093 | } |
michael@0 | 1094 | } else { |
michael@0 | 1095 | /* switch to UTF-32BE and pass the previous bytes */ |
michael@0 | 1096 | int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ |
michael@0 | 1097 | |
michael@0 | 1098 | /* reset the source */ |
michael@0 | 1099 | source=pArgs->source; |
michael@0 | 1100 | |
michael@0 | 1101 | if(count==(state&3)) { |
michael@0 | 1102 | /* simple: all in the same buffer, just reset source */ |
michael@0 | 1103 | } else { |
michael@0 | 1104 | UBool oldFlush=pArgs->flush; |
michael@0 | 1105 | |
michael@0 | 1106 | /* some of the bytes are from a previous buffer, replay those first */ |
michael@0 | 1107 | pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ |
michael@0 | 1108 | pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ |
michael@0 | 1109 | pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ |
michael@0 | 1110 | |
michael@0 | 1111 | /* no offsets: bytes from previous buffer, and not enough for output */ |
michael@0 | 1112 | T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
michael@0 | 1113 | |
michael@0 | 1114 | /* restore real pointers; pArgs->source will be set in case 8/9 */ |
michael@0 | 1115 | pArgs->sourceLimit=sourceLimit; |
michael@0 | 1116 | pArgs->flush=oldFlush; |
michael@0 | 1117 | } |
michael@0 | 1118 | state=8; |
michael@0 | 1119 | continue; |
michael@0 | 1120 | } |
michael@0 | 1121 | break; |
michael@0 | 1122 | case 8: |
michael@0 | 1123 | /* call UTF-32BE */ |
michael@0 | 1124 | pArgs->source=source; |
michael@0 | 1125 | if(offsets==NULL) { |
michael@0 | 1126 | T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
michael@0 | 1127 | } else { |
michael@0 | 1128 | T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); |
michael@0 | 1129 | } |
michael@0 | 1130 | source=pArgs->source; |
michael@0 | 1131 | break; |
michael@0 | 1132 | case 9: |
michael@0 | 1133 | /* call UTF-32LE */ |
michael@0 | 1134 | pArgs->source=source; |
michael@0 | 1135 | if(offsets==NULL) { |
michael@0 | 1136 | T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); |
michael@0 | 1137 | } else { |
michael@0 | 1138 | T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); |
michael@0 | 1139 | } |
michael@0 | 1140 | source=pArgs->source; |
michael@0 | 1141 | break; |
michael@0 | 1142 | default: |
michael@0 | 1143 | break; /* does not occur */ |
michael@0 | 1144 | } |
michael@0 | 1145 | } |
michael@0 | 1146 | |
michael@0 | 1147 | /* add BOM size to offsets - see comment at offsetDelta declaration */ |
michael@0 | 1148 | if(offsets!=NULL && offsetDelta!=0) { |
michael@0 | 1149 | int32_t *offsetsLimit=pArgs->offsets; |
michael@0 | 1150 | while(offsets<offsetsLimit) { |
michael@0 | 1151 | *offsets++ += offsetDelta; |
michael@0 | 1152 | } |
michael@0 | 1153 | } |
michael@0 | 1154 | |
michael@0 | 1155 | pArgs->source=source; |
michael@0 | 1156 | |
michael@0 | 1157 | if(source==sourceLimit && pArgs->flush) { |
michael@0 | 1158 | /* handle truncated input */ |
michael@0 | 1159 | switch(state) { |
michael@0 | 1160 | case 0: |
michael@0 | 1161 | break; /* no input at all, nothing to do */ |
michael@0 | 1162 | case 8: |
michael@0 | 1163 | T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
michael@0 | 1164 | break; |
michael@0 | 1165 | case 9: |
michael@0 | 1166 | T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); |
michael@0 | 1167 | break; |
michael@0 | 1168 | default: |
michael@0 | 1169 | /* handle 0<state<8: call UTF-32BE with too-short input */ |
michael@0 | 1170 | pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ |
michael@0 | 1171 | pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ |
michael@0 | 1172 | |
michael@0 | 1173 | /* no offsets: not enough for output */ |
michael@0 | 1174 | T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
michael@0 | 1175 | pArgs->source=source; |
michael@0 | 1176 | pArgs->sourceLimit=sourceLimit; |
michael@0 | 1177 | state=8; |
michael@0 | 1178 | break; |
michael@0 | 1179 | } |
michael@0 | 1180 | } |
michael@0 | 1181 | |
michael@0 | 1182 | cnv->mode=state; |
michael@0 | 1183 | } |
michael@0 | 1184 | |
michael@0 | 1185 | static UChar32 |
michael@0 | 1186 | _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, |
michael@0 | 1187 | UErrorCode *pErrorCode) { |
michael@0 | 1188 | switch(pArgs->converter->mode) { |
michael@0 | 1189 | case 8: |
michael@0 | 1190 | return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); |
michael@0 | 1191 | case 9: |
michael@0 | 1192 | return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); |
michael@0 | 1193 | default: |
michael@0 | 1194 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
michael@0 | 1195 | } |
michael@0 | 1196 | } |
michael@0 | 1197 | |
michael@0 | 1198 | static const UConverterImpl _UTF32Impl = { |
michael@0 | 1199 | UCNV_UTF32, |
michael@0 | 1200 | |
michael@0 | 1201 | NULL, |
michael@0 | 1202 | NULL, |
michael@0 | 1203 | |
michael@0 | 1204 | _UTF32Open, |
michael@0 | 1205 | NULL, |
michael@0 | 1206 | _UTF32Reset, |
michael@0 | 1207 | |
michael@0 | 1208 | _UTF32ToUnicodeWithOffsets, |
michael@0 | 1209 | _UTF32ToUnicodeWithOffsets, |
michael@0 | 1210 | #if U_IS_BIG_ENDIAN |
michael@0 | 1211 | T_UConverter_fromUnicode_UTF32_BE, |
michael@0 | 1212 | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
michael@0 | 1213 | #else |
michael@0 | 1214 | T_UConverter_fromUnicode_UTF32_LE, |
michael@0 | 1215 | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
michael@0 | 1216 | #endif |
michael@0 | 1217 | _UTF32GetNextUChar, |
michael@0 | 1218 | |
michael@0 | 1219 | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
michael@0 | 1220 | NULL, |
michael@0 | 1221 | NULL, |
michael@0 | 1222 | NULL, |
michael@0 | 1223 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 1224 | }; |
michael@0 | 1225 | |
michael@0 | 1226 | /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */ |
michael@0 | 1227 | static const UConverterStaticData _UTF32StaticData = { |
michael@0 | 1228 | sizeof(UConverterStaticData), |
michael@0 | 1229 | "UTF-32", |
michael@0 | 1230 | 1236, |
michael@0 | 1231 | UCNV_IBM, UCNV_UTF32, 4, 4, |
michael@0 | 1232 | #if U_IS_BIG_ENDIAN |
michael@0 | 1233 | { 0, 0, 0xff, 0xfd }, 4, |
michael@0 | 1234 | #else |
michael@0 | 1235 | { 0xfd, 0xff, 0, 0 }, 4, |
michael@0 | 1236 | #endif |
michael@0 | 1237 | FALSE, FALSE, |
michael@0 | 1238 | 0, |
michael@0 | 1239 | 0, |
michael@0 | 1240 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1241 | }; |
michael@0 | 1242 | |
michael@0 | 1243 | const UConverterSharedData _UTF32Data = { |
michael@0 | 1244 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1245 | NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, |
michael@0 | 1246 | 0 |
michael@0 | 1247 | }; |
michael@0 | 1248 | |
michael@0 | 1249 | #endif |