intl/icu/source/common/ucnvlat1.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2000-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucnvlat1.c
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2000feb07
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16
michael@0 17 #if !UCONFIG_NO_CONVERSION
michael@0 18
michael@0 19 #include "unicode/ucnv.h"
michael@0 20 #include "unicode/uset.h"
michael@0 21 #include "unicode/utf8.h"
michael@0 22 #include "ucnv_bld.h"
michael@0 23 #include "ucnv_cnv.h"
michael@0 24
michael@0 25 /* control optimizations according to the platform */
/*
 * When non-zero, _Latin1FromUnicodeWithOffsets() compiles in its 16-way
 * unrolled fast path (see the #if LATIN1_UNROLL_FROM_UNICODE section there);
 * when zero only the simple one-unit-at-a-time loop is used.
 */
michael@0 26 #define LATIN1_UNROLL_FROM_UNICODE 1
michael@0 27
michael@0 28 /* ISO 8859-1 --------------------------------------------------------------- */
michael@0 29
michael@0 30 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
michael@0 31 static void
michael@0 32 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 33 UErrorCode *pErrorCode) {
michael@0 34 const uint8_t *source;
michael@0 35 UChar *target;
michael@0 36 int32_t targetCapacity, length;
michael@0 37 int32_t *offsets;
michael@0 38
michael@0 39 int32_t sourceIndex;
michael@0 40
michael@0 41 /* set up the local pointers */
michael@0 42 source=(const uint8_t *)pArgs->source;
michael@0 43 target=pArgs->target;
michael@0 44 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 45 offsets=pArgs->offsets;
michael@0 46
michael@0 47 sourceIndex=0;
michael@0 48
michael@0 49 /*
michael@0 50 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
michael@0 51 * for the minimum of the sourceLength and targetCapacity
michael@0 52 */
michael@0 53 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
michael@0 54 if(length<=targetCapacity) {
michael@0 55 targetCapacity=length;
michael@0 56 } else {
michael@0 57 /* target will be full */
michael@0 58 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 59 length=targetCapacity;
michael@0 60 }
michael@0 61
michael@0 62 if(targetCapacity>=8) {
michael@0 63 /* This loop is unrolled for speed and improved pipelining. */
michael@0 64 int32_t count, loops;
michael@0 65
michael@0 66 loops=count=targetCapacity>>3;
michael@0 67 length=targetCapacity&=0x7;
michael@0 68 do {
michael@0 69 target[0]=source[0];
michael@0 70 target[1]=source[1];
michael@0 71 target[2]=source[2];
michael@0 72 target[3]=source[3];
michael@0 73 target[4]=source[4];
michael@0 74 target[5]=source[5];
michael@0 75 target[6]=source[6];
michael@0 76 target[7]=source[7];
michael@0 77 target+=8;
michael@0 78 source+=8;
michael@0 79 } while(--count>0);
michael@0 80
michael@0 81 if(offsets!=NULL) {
michael@0 82 do {
michael@0 83 offsets[0]=sourceIndex++;
michael@0 84 offsets[1]=sourceIndex++;
michael@0 85 offsets[2]=sourceIndex++;
michael@0 86 offsets[3]=sourceIndex++;
michael@0 87 offsets[4]=sourceIndex++;
michael@0 88 offsets[5]=sourceIndex++;
michael@0 89 offsets[6]=sourceIndex++;
michael@0 90 offsets[7]=sourceIndex++;
michael@0 91 offsets+=8;
michael@0 92 } while(--loops>0);
michael@0 93 }
michael@0 94 }
michael@0 95
michael@0 96 /* conversion loop */
michael@0 97 while(targetCapacity>0) {
michael@0 98 *target++=*source++;
michael@0 99 --targetCapacity;
michael@0 100 }
michael@0 101
michael@0 102 /* write back the updated pointers */
michael@0 103 pArgs->source=(const char *)source;
michael@0 104 pArgs->target=target;
michael@0 105
michael@0 106 /* set offsets */
michael@0 107 if(offsets!=NULL) {
michael@0 108 while(length>0) {
michael@0 109 *offsets++=sourceIndex++;
michael@0 110 --length;
michael@0 111 }
michael@0 112 pArgs->offsets=offsets;
michael@0 113 }
michael@0 114 }
michael@0 115
michael@0 116 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
michael@0 117 static UChar32
michael@0 118 _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
michael@0 119 UErrorCode *pErrorCode) {
michael@0 120 const uint8_t *source=(const uint8_t *)pArgs->source;
michael@0 121 if(source<(const uint8_t *)pArgs->sourceLimit) {
michael@0 122 pArgs->source=(const char *)(source+1);
michael@0 123 return *source;
michael@0 124 }
michael@0 125
michael@0 126 /* no output because of empty input */
michael@0 127 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 128 return 0xffff;
michael@0 129 }
michael@0 130
michael@0 131 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
/*
 * Converts UTF-16 code units to single bytes, with optional offsets.
 * Used for both converters in this file: the highest convertible code unit
 * ("max") is 0xff when the converter's sharedData is _Latin1Data and 0x7f
 * otherwise.
 *
 * pArgs      in/out: source, target and offsets are written back at the end.
 * pErrorCode receives
 *            - U_INVALID_CHAR_FOUND for a non-surrogate code unit above max
 *              and for a complete surrogate pair (the combined code point is
 *              no longer a surrogate),
 *            - U_ILLEGAL_CHAR_FOUND for an unpaired surrogate,
 *            - U_BUFFER_OVERFLOW_ERROR when input remains and the target is full.
 *
 * State: the offending code point is stored in cnv->fromUChar32. A lead
 * surrogate that is the last unit of the input is stored there too, without
 * an error, and is picked up again at getTrail on the next call.
 */
michael@0 132 static void
michael@0 133 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0 134 UErrorCode *pErrorCode) {
michael@0 135 UConverter *cnv;
michael@0 136 const UChar *source, *sourceLimit;
michael@0 137 uint8_t *target, *oldTarget;
michael@0 138 int32_t targetCapacity, length;
michael@0 139 int32_t *offsets;
michael@0 140
michael@0 141 UChar32 cp;
michael@0 142 UChar c, max;
michael@0 143
michael@0 144 int32_t sourceIndex;
michael@0 145
michael@0 146 /* set up the local pointers */
michael@0 147 cnv=pArgs->converter;
michael@0 148 source=pArgs->source;
michael@0 149 sourceLimit=pArgs->sourceLimit;
/* oldTarget marks the first output byte whose offset has not been written yet */
michael@0 150 target=oldTarget=(uint8_t *)pArgs->target;
michael@0 151 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 152 offsets=pArgs->offsets;
michael@0 153
/* this function serves two converters; anything that is not Latin-1 is treated as US-ASCII */
michael@0 154 if(cnv->sharedData==&_Latin1Data) {
michael@0 155 max=0xff; /* Latin-1 */
michael@0 156 } else {
michael@0 157 max=0x7f; /* US-ASCII */
michael@0 158 }
michael@0 159
michael@0 160 /* get the converter state from UConverter */
michael@0 161 cp=cnv->fromUChar32;
michael@0 162
michael@0 163 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 164 sourceIndex= cp==0 ? 0 : -1;
michael@0 165
michael@0 166 /*
michael@0 167 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
michael@0 168 * for the minimum of the sourceLength and targetCapacity
michael@0 169 */
michael@0 170 length=(int32_t)(sourceLimit-source);
michael@0 171 if(length<targetCapacity) {
michael@0 172 targetCapacity=length;
michael@0 173 }
michael@0 174
michael@0 175 /* conversion loop */
/* a code point pending from the previous call is resolved first; this jumps into the error-handling block below */
michael@0 176 if(cp!=0 && targetCapacity>0) {
michael@0 177 goto getTrail;
michael@0 178 }
michael@0 179
michael@0 180 #if LATIN1_UNROLL_FROM_UNICODE
michael@0 181 /* unroll the loop with the most common case */
michael@0 182 if(targetCapacity>=16) {
michael@0 183 int32_t count, loops;
michael@0 184 UChar u, oredChars;
michael@0 185
michael@0 186 loops=count=targetCapacity>>4;
/*
 * Each iteration writes 16 bytes unconditionally and ORs the 16 source units
 * together; a single comparison of the OR against max then tells whether any
 * of them was out of range.
 */
michael@0 187 do {
michael@0 188 oredChars=u=*source++;
michael@0 189 *target++=(uint8_t)u;
michael@0 190 oredChars|=u=*source++;
michael@0 191 *target++=(uint8_t)u;
michael@0 192 oredChars|=u=*source++;
michael@0 193 *target++=(uint8_t)u;
michael@0 194 oredChars|=u=*source++;
michael@0 195 *target++=(uint8_t)u;
michael@0 196 oredChars|=u=*source++;
michael@0 197 *target++=(uint8_t)u;
michael@0 198 oredChars|=u=*source++;
michael@0 199 *target++=(uint8_t)u;
michael@0 200 oredChars|=u=*source++;
michael@0 201 *target++=(uint8_t)u;
michael@0 202 oredChars|=u=*source++;
michael@0 203 *target++=(uint8_t)u;
michael@0 204 oredChars|=u=*source++;
michael@0 205 *target++=(uint8_t)u;
michael@0 206 oredChars|=u=*source++;
michael@0 207 *target++=(uint8_t)u;
michael@0 208 oredChars|=u=*source++;
michael@0 209 *target++=(uint8_t)u;
michael@0 210 oredChars|=u=*source++;
michael@0 211 *target++=(uint8_t)u;
michael@0 212 oredChars|=u=*source++;
michael@0 213 *target++=(uint8_t)u;
michael@0 214 oredChars|=u=*source++;
michael@0 215 *target++=(uint8_t)u;
michael@0 216 oredChars|=u=*source++;
michael@0 217 *target++=(uint8_t)u;
michael@0 218 oredChars|=u=*source++;
michael@0 219 *target++=(uint8_t)u;
michael@0 220
michael@0 221 /* were all 16 entries really valid? */
michael@0 222 if(oredChars>max) {
michael@0 223 /* no, return to the first of these 16 */
/* the 16 bytes just written are overwritten by the simple loop below, which redoes this group */
michael@0 224 source-=16;
michael@0 225 target-=16;
michael@0 226 break;
michael@0 227 }
michael@0 228 } while(--count>0);
/* count becomes the number of groups of 16 that were converted completely */
michael@0 229 count=loops-count;
michael@0 230 targetCapacity-=16*count;
michael@0 231
michael@0 232 if(offsets!=NULL) {
/* skip oldTarget past the bytes whose offsets are written right here */
michael@0 233 oldTarget+=16*count;
michael@0 234 while(count>0) {
michael@0 235 *offsets++=sourceIndex++;
michael@0 236 *offsets++=sourceIndex++;
michael@0 237 *offsets++=sourceIndex++;
michael@0 238 *offsets++=sourceIndex++;
michael@0 239 *offsets++=sourceIndex++;
michael@0 240 *offsets++=sourceIndex++;
michael@0 241 *offsets++=sourceIndex++;
michael@0 242 *offsets++=sourceIndex++;
michael@0 243 *offsets++=sourceIndex++;
michael@0 244 *offsets++=sourceIndex++;
michael@0 245 *offsets++=sourceIndex++;
michael@0 246 *offsets++=sourceIndex++;
michael@0 247 *offsets++=sourceIndex++;
michael@0 248 *offsets++=sourceIndex++;
michael@0 249 *offsets++=sourceIndex++;
michael@0 250 *offsets++=sourceIndex++;
michael@0 251 --count;
michael@0 252 }
michael@0 253 }
michael@0 254 }
michael@0 255 #endif
michael@0 256
michael@0 257 /* conversion loop */
/* c is reset so that "c>max" below is false when this loop does not read anything */
michael@0 258 c=0;
/* note: a unit above max has already been consumed (source points past it) when the loop stops */
michael@0 259 while(targetCapacity>0 && (c=*source++)<=max) {
michael@0 260 /* convert the Unicode code point */
michael@0 261 *target++=(uint8_t)c;
michael@0 262 --targetCapacity;
michael@0 263 }
michael@0 264
michael@0 265 if(c>max) {
michael@0 266 cp=c;
michael@0 267 if(!U_IS_SURROGATE(cp)) {
michael@0 268 /* callback(unassigned) */
michael@0 269 } else if(U_IS_SURROGATE_LEAD(cp)) {
/* also entered via goto at the top of the function, with cp taken from cnv->fromUChar32 */
michael@0 270 getTrail:
michael@0 271 if(source<sourceLimit) {
michael@0 272 /* test the following code unit */
michael@0 273 UChar trail=*source;
michael@0 274 if(U16_IS_TRAIL(trail)) {
michael@0 275 ++source;
michael@0 276 cp=U16_GET_SUPPLEMENTARY(cp, trail);
michael@0 277 /* this codepage does not map supplementary code points */
michael@0 278 /* callback(unassigned) */
michael@0 279 } else {
michael@0 280 /* this is an unmatched lead code unit (1st surrogate) */
michael@0 281 /* callback(illegal) */
michael@0 282 }
michael@0 283 } else {
michael@0 284 /* no more input */
/* remember the lead surrogate for the next call; no error code is set on this path */
michael@0 285 cnv->fromUChar32=cp;
michael@0 286 goto noMoreInput;
michael@0 287 }
michael@0 288 } else {
michael@0 289 /* this is an unmatched trail code unit (2nd surrogate) */
michael@0 290 /* callback(illegal) */
michael@0 291 }
michael@0 292
/* the function does not invoke a callback itself; it reports the error code and the code point */
michael@0 293 *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
michael@0 294 cnv->fromUChar32=cp;
michael@0 295 }
michael@0 296 noMoreInput:
michael@0 297
michael@0 298 /* set offsets since the start */
/* covers only bytes written after oldTarget, i.e. those not already handled by the unrolled part */
michael@0 299 if(offsets!=NULL) {
michael@0 300 size_t count=target-oldTarget;
michael@0 301 while(count>0) {
michael@0 302 *offsets++=sourceIndex++;
michael@0 303 --count;
michael@0 304 }
michael@0 305 }
michael@0 306
/* an error set above takes precedence: overflow is only reported when no failure is pending */
michael@0 307 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
michael@0 308 /* target is full */
michael@0 309 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 310 }
michael@0 311
michael@0 312 /* write back the updated pointers */
michael@0 313 pArgs->source=source;
michael@0 314 pArgs->target=(char *)target;
michael@0 315 pArgs->offsets=offsets;
michael@0 316 }
michael@0 317
michael@0 318 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
/*
 * Direct UTF-8 to Latin-1 conversion that handles only ASCII bytes and the
 * two-byte sequences led by 0xc2 or 0xc3 (U+0080..U+00FF).
 *
 * pFromUArgs in/out: supplies the Latin-1 target; target is written back.
 * pToUArgs   in/out: supplies the UTF-8 source and the UTF-8 converter whose
 *            toUnicodeStatus/toUBytes/toULength/mode carry a truncated
 *            sequence across calls; source is written back.
 * pErrorCode receives
 *            - U_USING_DEFAULT_WARNING when the input is anything this
 *              function does not handle; per the comments below this asks
 *              the caller to fall back to the pivoting implementation,
 *            - U_BUFFER_OVERFLOW_ERROR when the target is full.
 */
michael@0 319 static void
michael@0 320 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
michael@0 321 UConverterToUnicodeArgs *pToUArgs,
michael@0 322 UErrorCode *pErrorCode) {
michael@0 323 UConverter *utf8;
michael@0 324 const uint8_t *source, *sourceLimit;
michael@0 325 uint8_t *target;
michael@0 326 int32_t targetCapacity;
michael@0 327
michael@0 328 UChar32 c;
michael@0 329 uint8_t b, t1;
michael@0 330
michael@0 331 /* set up the local pointers */
michael@0 332 utf8=pToUArgs->converter;
michael@0 333 source=(uint8_t *)pToUArgs->source;
michael@0 334 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
michael@0 335 target=(uint8_t *)pFromUArgs->target;
michael@0 336 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
michael@0 337
michael@0 338 /* get the converter state from the UTF-8 UConverter */
/* non-zero means a lead byte was stored at the end of a previous buffer (see the end of this function) */
michael@0 339 c=(UChar32)utf8->toUnicodeStatus;
michael@0 340 if(c!=0 && source<sourceLimit) {
michael@0 341 if(targetCapacity==0) {
/* nothing was consumed or written, so the argument pointers are left as they are */
michael@0 342 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 343 return;
michael@0 344 } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) {
/* t1<=0x3f means *source is a trail byte 0x80..0xbf; lead 0xc2 gives 0x80|t1, lead 0xc3 gives 0xc0|t1 */
michael@0 345 ++source;
michael@0 346 *target++=(uint8_t)(((c&3)<<6)|t1);
michael@0 347 --targetCapacity;
michael@0 348
michael@0 349 utf8->toUnicodeStatus=0;
michael@0 350 utf8->toULength=0;
michael@0 351 } else {
michael@0 352 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
michael@0 353 *pErrorCode=U_USING_DEFAULT_WARNING;
michael@0 354 return;
michael@0 355 }
michael@0 356 }
michael@0 357
michael@0 358 /*
michael@0 359 * Make sure that the last byte sequence before sourceLimit is complete
michael@0 360 * or runs into a lead byte.
michael@0 361 * In the conversion loop compare source with sourceLimit only once
michael@0 362 * per multi-byte character.
michael@0 363 * For Latin-1, adjust sourceLimit only for 1 trail byte because
michael@0 364 * the conversion loop handles at most 2-byte sequences.
michael@0 365 */
michael@0 366 if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {
michael@0 367 --sourceLimit;
michael@0 368 }
michael@0 369
michael@0 370 /* conversion loop */
michael@0 371 while(source<sourceLimit) {
michael@0 372 if(targetCapacity>0) {
michael@0 373 b=*source++;
/* the cast to int8_t makes bytes 0x80..0xff negative, so this tests for ASCII */
michael@0 374 if((int8_t)b>=0) {
michael@0 375 /* convert ASCII */
michael@0 376 *target++=(uint8_t)b;
michael@0 377 --targetCapacity;
michael@0 378 } else if( /* handle U+0080..U+00FF inline */
michael@0 379 b>=0xc2 && b<=0xc3 &&
michael@0 380 (t1=(uint8_t)(*source-0x80)) <= 0x3f
michael@0 381 ) {
michael@0 382 ++source;
michael@0 383 *target++=(uint8_t)(((b&3)<<6)|t1);
michael@0 384 --targetCapacity;
michael@0 385 } else {
michael@0 386 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
/* source-1 un-reads b so that the fallback sees the offending byte again */
michael@0 387 pToUArgs->source=(char *)(source-1);
michael@0 388 pFromUArgs->target=(char *)target;
michael@0 389 *pErrorCode=U_USING_DEFAULT_WARNING;
michael@0 390 return;
michael@0 391 }
michael@0 392 } else {
michael@0 393 /* target is full */
michael@0 394 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 395 break;
michael@0 396 }
michael@0 397 }
michael@0 398
michael@0 399 /*
michael@0 400 * The sourceLimit may have been adjusted before the conversion loop
michael@0 401 * to stop before a truncated sequence.
michael@0 402 * If so, then collect the truncated sequence now.
michael@0 403 * For Latin-1, there is at most exactly one lead byte because of the
michael@0 404 * smaller sourceLimit adjustment logic.
michael@0 405 */
/* the condition also restores the local sourceLimit to the caller's original limit */
michael@0 406 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
michael@0 407 utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
michael@0 408 utf8->toULength=1;
michael@0 409 utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
michael@0 410 }
michael@0 411
michael@0 412 /* write back the updated pointers */
michael@0 413 pToUArgs->source=(char *)source;
michael@0 414 pFromUArgs->target=(char *)target;
michael@0 415 }
michael@0 416
michael@0 417 static void
michael@0 418 _Latin1GetUnicodeSet(const UConverter *cnv,
michael@0 419 const USetAdder *sa,
michael@0 420 UConverterUnicodeSet which,
michael@0 421 UErrorCode *pErrorCode) {
michael@0 422 sa->addRange(sa->set, 0, 0xff);
michael@0 423 }
michael@0 424
/*
 * Function table of the ISO 8859-1 converter. The initializer is positional;
 * the meaning of each slot is given by the UConverterImpl declaration, which
 * is not part of this file (presumably in ucnv_cnv.h - confirm there).
 * NULL slots leave the corresponding operation unimplemented here.
 * Both to-Unicode slots and both from-Unicode slots use the with-offsets
 * functions, which check for a NULL offsets pointer themselves.
 */
michael@0 425 static const UConverterImpl _Latin1Impl={
michael@0 426 UCNV_LATIN_1,
michael@0 427
michael@0 428 NULL,
michael@0 429 NULL,
michael@0 430
michael@0 431 NULL,
michael@0 432 NULL,
michael@0 433 NULL,
michael@0 434
michael@0 435 _Latin1ToUnicodeWithOffsets,
michael@0 436 _Latin1ToUnicodeWithOffsets,
michael@0 437 _Latin1FromUnicodeWithOffsets,
michael@0 438 _Latin1FromUnicodeWithOffsets,
michael@0 439 _Latin1GetNextUChar,
michael@0 440
michael@0 441 NULL,
michael@0 442 NULL,
michael@0 443 NULL,
michael@0 444 NULL,
michael@0 445 _Latin1GetUnicodeSet,
michael@0 446
michael@0 447 NULL,
michael@0 448 ucnv_Latin1FromUTF8
michael@0 449 };
michael@0 450
/*
 * Static description of the ISO 8859-1 converter: structure size, converter
 * name and further positional fields whose meaning is defined by the
 * UConverterStaticData declaration (not visible in this file).
 * NOTE(review): 819 looks like the IBM code page number, the two 1s like the
 * minimum/maximum bytes per character and { 0x1a, ... } with length 1 like
 * the substitution byte - confirm against the struct declaration.
 */
michael@0 451 static const UConverterStaticData _Latin1StaticData={
michael@0 452 sizeof(UConverterStaticData),
michael@0 453 "ISO-8859-1",
michael@0 454 819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
michael@0 455 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
michael@0 456 0,
michael@0 457 0,
michael@0 458 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 459 };
michael@0 460
/*
 * Shared data object of the ISO 8859-1 converter; it ties together
 * _Latin1StaticData and _Latin1Impl. It has external linkage, and
 * _Latin1FromUnicodeWithOffsets() compares cnv->sharedData against its
 * address to tell Latin-1 from US-ASCII.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the object as permanent - confirm against UConverterSharedData.
 */
michael@0 461 const UConverterSharedData _Latin1Data={
michael@0 462 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 463 NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl,
michael@0 464 0
michael@0 465 };
michael@0 466
michael@0 467 /* US-ASCII ----------------------------------------------------------------- */
michael@0 468
michael@0 469 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
michael@0 470 static void
michael@0 471 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 472 UErrorCode *pErrorCode) {
michael@0 473 const uint8_t *source, *sourceLimit;
michael@0 474 UChar *target, *oldTarget;
michael@0 475 int32_t targetCapacity, length;
michael@0 476 int32_t *offsets;
michael@0 477
michael@0 478 int32_t sourceIndex;
michael@0 479
michael@0 480 uint8_t c;
michael@0 481
michael@0 482 /* set up the local pointers */
michael@0 483 source=(const uint8_t *)pArgs->source;
michael@0 484 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 485 target=oldTarget=pArgs->target;
michael@0 486 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 487 offsets=pArgs->offsets;
michael@0 488
michael@0 489 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 490 sourceIndex=0;
michael@0 491
michael@0 492 /*
michael@0 493 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
michael@0 494 * for the minimum of the sourceLength and targetCapacity
michael@0 495 */
michael@0 496 length=(int32_t)(sourceLimit-source);
michael@0 497 if(length<targetCapacity) {
michael@0 498 targetCapacity=length;
michael@0 499 }
michael@0 500
michael@0 501 if(targetCapacity>=8) {
michael@0 502 /* This loop is unrolled for speed and improved pipelining. */
michael@0 503 int32_t count, loops;
michael@0 504 UChar oredChars;
michael@0 505
michael@0 506 loops=count=targetCapacity>>3;
michael@0 507 do {
michael@0 508 oredChars=target[0]=source[0];
michael@0 509 oredChars|=target[1]=source[1];
michael@0 510 oredChars|=target[2]=source[2];
michael@0 511 oredChars|=target[3]=source[3];
michael@0 512 oredChars|=target[4]=source[4];
michael@0 513 oredChars|=target[5]=source[5];
michael@0 514 oredChars|=target[6]=source[6];
michael@0 515 oredChars|=target[7]=source[7];
michael@0 516
michael@0 517 /* were all 16 entries really valid? */
michael@0 518 if(oredChars>0x7f) {
michael@0 519 /* no, return to the first of these 16 */
michael@0 520 break;
michael@0 521 }
michael@0 522 source+=8;
michael@0 523 target+=8;
michael@0 524 } while(--count>0);
michael@0 525 count=loops-count;
michael@0 526 targetCapacity-=count*8;
michael@0 527
michael@0 528 if(offsets!=NULL) {
michael@0 529 oldTarget+=count*8;
michael@0 530 while(count>0) {
michael@0 531 offsets[0]=sourceIndex++;
michael@0 532 offsets[1]=sourceIndex++;
michael@0 533 offsets[2]=sourceIndex++;
michael@0 534 offsets[3]=sourceIndex++;
michael@0 535 offsets[4]=sourceIndex++;
michael@0 536 offsets[5]=sourceIndex++;
michael@0 537 offsets[6]=sourceIndex++;
michael@0 538 offsets[7]=sourceIndex++;
michael@0 539 offsets+=8;
michael@0 540 --count;
michael@0 541 }
michael@0 542 }
michael@0 543 }
michael@0 544
michael@0 545 /* conversion loop */
michael@0 546 c=0;
michael@0 547 while(targetCapacity>0 && (c=*source++)<=0x7f) {
michael@0 548 *target++=c;
michael@0 549 --targetCapacity;
michael@0 550 }
michael@0 551
michael@0 552 if(c>0x7f) {
michael@0 553 /* callback(illegal); copy the current bytes to toUBytes[] */
michael@0 554 UConverter *cnv=pArgs->converter;
michael@0 555 cnv->toUBytes[0]=c;
michael@0 556 cnv->toULength=1;
michael@0 557 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 558 } else if(source<sourceLimit && target>=pArgs->targetLimit) {
michael@0 559 /* target is full */
michael@0 560 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 561 }
michael@0 562
michael@0 563 /* set offsets since the start */
michael@0 564 if(offsets!=NULL) {
michael@0 565 size_t count=target-oldTarget;
michael@0 566 while(count>0) {
michael@0 567 *offsets++=sourceIndex++;
michael@0 568 --count;
michael@0 569 }
michael@0 570 }
michael@0 571
michael@0 572 /* write back the updated pointers */
michael@0 573 pArgs->source=(const char *)source;
michael@0 574 pArgs->target=target;
michael@0 575 pArgs->offsets=offsets;
michael@0 576 }
michael@0 577
michael@0 578 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
michael@0 579 static UChar32
michael@0 580 _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
michael@0 581 UErrorCode *pErrorCode) {
michael@0 582 const uint8_t *source;
michael@0 583 uint8_t b;
michael@0 584
michael@0 585 source=(const uint8_t *)pArgs->source;
michael@0 586 if(source<(const uint8_t *)pArgs->sourceLimit) {
michael@0 587 b=*source++;
michael@0 588 pArgs->source=(const char *)source;
michael@0 589 if(b<=0x7f) {
michael@0 590 return b;
michael@0 591 } else {
michael@0 592 UConverter *cnv=pArgs->converter;
michael@0 593 cnv->toUBytes[0]=b;
michael@0 594 cnv->toULength=1;
michael@0 595 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 596 return 0xffff;
michael@0 597 }
michael@0 598 }
michael@0 599
michael@0 600 /* no output because of empty input */
michael@0 601 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 602 return 0xffff;
michael@0 603 }
michael@0 604
michael@0 605 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
michael@0 606 static void
michael@0 607 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
michael@0 608 UConverterToUnicodeArgs *pToUArgs,
michael@0 609 UErrorCode *pErrorCode) {
michael@0 610 const uint8_t *source, *sourceLimit;
michael@0 611 uint8_t *target;
michael@0 612 int32_t targetCapacity, length;
michael@0 613
michael@0 614 uint8_t c;
michael@0 615
michael@0 616 if(pToUArgs->converter->toUnicodeStatus!=0) {
michael@0 617 /* no handling of partial UTF-8 characters here, fall back to pivoting */
michael@0 618 *pErrorCode=U_USING_DEFAULT_WARNING;
michael@0 619 return;
michael@0 620 }
michael@0 621
michael@0 622 /* set up the local pointers */
michael@0 623 source=(const uint8_t *)pToUArgs->source;
michael@0 624 sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;
michael@0 625 target=(uint8_t *)pFromUArgs->target;
michael@0 626 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
michael@0 627
michael@0 628 /*
michael@0 629 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
michael@0 630 * for the minimum of the sourceLength and targetCapacity
michael@0 631 */
michael@0 632 length=(int32_t)(sourceLimit-source);
michael@0 633 if(length<targetCapacity) {
michael@0 634 targetCapacity=length;
michael@0 635 }
michael@0 636
michael@0 637 /* unroll the loop with the most common case */
michael@0 638 if(targetCapacity>=16) {
michael@0 639 int32_t count, loops;
michael@0 640 uint8_t oredChars;
michael@0 641
michael@0 642 loops=count=targetCapacity>>4;
michael@0 643 do {
michael@0 644 oredChars=*target++=*source++;
michael@0 645 oredChars|=*target++=*source++;
michael@0 646 oredChars|=*target++=*source++;
michael@0 647 oredChars|=*target++=*source++;
michael@0 648 oredChars|=*target++=*source++;
michael@0 649 oredChars|=*target++=*source++;
michael@0 650 oredChars|=*target++=*source++;
michael@0 651 oredChars|=*target++=*source++;
michael@0 652 oredChars|=*target++=*source++;
michael@0 653 oredChars|=*target++=*source++;
michael@0 654 oredChars|=*target++=*source++;
michael@0 655 oredChars|=*target++=*source++;
michael@0 656 oredChars|=*target++=*source++;
michael@0 657 oredChars|=*target++=*source++;
michael@0 658 oredChars|=*target++=*source++;
michael@0 659 oredChars|=*target++=*source++;
michael@0 660
michael@0 661 /* were all 16 entries really valid? */
michael@0 662 if(oredChars>0x7f) {
michael@0 663 /* no, return to the first of these 16 */
michael@0 664 source-=16;
michael@0 665 target-=16;
michael@0 666 break;
michael@0 667 }
michael@0 668 } while(--count>0);
michael@0 669 count=loops-count;
michael@0 670 targetCapacity-=16*count;
michael@0 671 }
michael@0 672
michael@0 673 /* conversion loop */
michael@0 674 c=0;
michael@0 675 while(targetCapacity>0 && (c=*source)<=0x7f) {
michael@0 676 ++source;
michael@0 677 *target++=c;
michael@0 678 --targetCapacity;
michael@0 679 }
michael@0 680
michael@0 681 if(c>0x7f) {
michael@0 682 /* non-ASCII character, handle in standard converter */
michael@0 683 *pErrorCode=U_USING_DEFAULT_WARNING;
michael@0 684 } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {
michael@0 685 /* target is full */
michael@0 686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 687 }
michael@0 688
michael@0 689 /* write back the updated pointers */
michael@0 690 pToUArgs->source=(const char *)source;
michael@0 691 pFromUArgs->target=(char *)target;
michael@0 692 }
michael@0 693
michael@0 694 static void
michael@0 695 _ASCIIGetUnicodeSet(const UConverter *cnv,
michael@0 696 const USetAdder *sa,
michael@0 697 UConverterUnicodeSet which,
michael@0 698 UErrorCode *pErrorCode) {
michael@0 699 sa->addRange(sa->set, 0, 0x7f);
michael@0 700 }
michael@0 701
/*
 * Function table of the US-ASCII converter. The initializer is positional;
 * the meaning of each slot is given by the UConverterImpl declaration, which
 * is not part of this file (presumably in ucnv_cnv.h - confirm there).
 * The from-Unicode slots reuse _Latin1FromUnicodeWithOffsets(): that function
 * limits output to 0x7f whenever the converter's sharedData is not
 * _Latin1Data.
 */
michael@0 702 static const UConverterImpl _ASCIIImpl={
michael@0 703 UCNV_US_ASCII,
michael@0 704
michael@0 705 NULL,
michael@0 706 NULL,
michael@0 707
michael@0 708 NULL,
michael@0 709 NULL,
michael@0 710 NULL,
michael@0 711
michael@0 712 _ASCIIToUnicodeWithOffsets,
michael@0 713 _ASCIIToUnicodeWithOffsets,
michael@0 714 _Latin1FromUnicodeWithOffsets,
michael@0 715 _Latin1FromUnicodeWithOffsets,
michael@0 716 _ASCIIGetNextUChar,
michael@0 717
michael@0 718 NULL,
michael@0 719 NULL,
michael@0 720 NULL,
michael@0 721 NULL,
michael@0 722 _ASCIIGetUnicodeSet,
michael@0 723
michael@0 724 NULL,
michael@0 725 ucnv_ASCIIFromUTF8
michael@0 726 };
michael@0 727
/*
 * Static description of the US-ASCII converter: structure size, converter
 * name and further positional fields whose meaning is defined by the
 * UConverterStaticData declaration (not visible in this file).
 * NOTE(review): 367 looks like the IBM code page number, the two 1s like the
 * minimum/maximum bytes per character and { 0x1a, ... } with length 1 like
 * the substitution byte - confirm against the struct declaration.
 */
michael@0 728 static const UConverterStaticData _ASCIIStaticData={
michael@0 729 sizeof(UConverterStaticData),
michael@0 730 "US-ASCII",
michael@0 731 367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
michael@0 732 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
michael@0 733 0,
michael@0 734 0,
michael@0 735 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 736 };
michael@0 737
/*
 * Shared data object of the US-ASCII converter; it ties together
 * _ASCIIStaticData and _ASCIIImpl and has external linkage.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the object as permanent - confirm against UConverterSharedData.
 */
michael@0 738 const UConverterSharedData _ASCIIData={
michael@0 739 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 740 NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl,
michael@0 741 0
michael@0 742 };
michael@0 743
michael@0 744 #endif

mercurial