1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnvlat1.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,744 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2000-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: ucnvlat1.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2000feb07 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 + 1.20 +#if !UCONFIG_NO_CONVERSION 1.21 + 1.22 +#include "unicode/ucnv.h" 1.23 +#include "unicode/uset.h" 1.24 +#include "unicode/utf8.h" 1.25 +#include "ucnv_bld.h" 1.26 +#include "ucnv_cnv.h" 1.27 + 1.28 +/* control optimizations according to the platform */ 1.29 +#define LATIN1_UNROLL_FROM_UNICODE 1 1.30 + 1.31 +/* ISO 8859-1 --------------------------------------------------------------- */ 1.32 + 1.33 +/* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ 1.34 +static void 1.35 +_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.36 + UErrorCode *pErrorCode) { 1.37 + const uint8_t *source; 1.38 + UChar *target; 1.39 + int32_t targetCapacity, length; 1.40 + int32_t *offsets; 1.41 + 1.42 + int32_t sourceIndex; 1.43 + 1.44 + /* set up the local pointers */ 1.45 + source=(const uint8_t *)pArgs->source; 1.46 + target=pArgs->target; 1.47 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.48 + offsets=pArgs->offsets; 1.49 + 1.50 + sourceIndex=0; 1.51 + 1.52 + /* 1.53 + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 1.54 + * for the minimum of the sourceLength and targetCapacity 1.55 + */ 1.56 + length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 1.57 + if(length<=targetCapacity) { 1.58 + targetCapacity=length; 1.59 + } else { 1.60 + /* target will be full */ 1.61 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.62 + length=targetCapacity; 1.63 + } 1.64 + 1.65 + if(targetCapacity>=8) { 1.66 + /* This loop is unrolled for speed and improved pipelining. */ 1.67 + int32_t count, loops; 1.68 + 1.69 + loops=count=targetCapacity>>3; 1.70 + length=targetCapacity&=0x7; 1.71 + do { 1.72 + target[0]=source[0]; 1.73 + target[1]=source[1]; 1.74 + target[2]=source[2]; 1.75 + target[3]=source[3]; 1.76 + target[4]=source[4]; 1.77 + target[5]=source[5]; 1.78 + target[6]=source[6]; 1.79 + target[7]=source[7]; 1.80 + target+=8; 1.81 + source+=8; 1.82 + } while(--count>0); 1.83 + 1.84 + if(offsets!=NULL) { 1.85 + do { 1.86 + offsets[0]=sourceIndex++; 1.87 + offsets[1]=sourceIndex++; 1.88 + offsets[2]=sourceIndex++; 1.89 + offsets[3]=sourceIndex++; 1.90 + offsets[4]=sourceIndex++; 1.91 + offsets[5]=sourceIndex++; 1.92 + offsets[6]=sourceIndex++; 1.93 + offsets[7]=sourceIndex++; 1.94 + offsets+=8; 1.95 + } while(--loops>0); 1.96 + } 1.97 + } 1.98 + 1.99 + /* conversion loop */ 1.100 + while(targetCapacity>0) { 1.101 + *target++=*source++; 1.102 + --targetCapacity; 1.103 + } 1.104 + 1.105 + /* write back the updated pointers */ 1.106 + pArgs->source=(const char *)source; 1.107 + pArgs->target=target; 1.108 + 1.109 + /* set offsets */ 1.110 + if(offsets!=NULL) { 1.111 + while(length>0) { 1.112 + *offsets++=sourceIndex++; 1.113 + --length; 1.114 + } 1.115 + pArgs->offsets=offsets; 1.116 + } 1.117 +} 1.118 + 1.119 +/* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */ 1.120 +static UChar32 1.121 +_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, 1.122 + UErrorCode *pErrorCode) { 1.123 + const uint8_t *source=(const uint8_t *)pArgs->source; 1.124 + if(source<(const uint8_t *)pArgs->sourceLimit) { 1.125 + pArgs->source=(const char *)(source+1); 1.126 + return *source; 1.127 + } 1.128 + 1.129 + /* no output because of empty input */ 1.130 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.131 + return 0xffff; 1.132 +} 1.133 + 1.134 +/* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */ 1.135 +static void 1.136 +_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.137 + UErrorCode *pErrorCode) { 1.138 + UConverter *cnv; 1.139 + const UChar *source, *sourceLimit; 1.140 + uint8_t *target, *oldTarget; 1.141 + int32_t targetCapacity, length; 1.142 + int32_t *offsets; 1.143 + 1.144 + UChar32 cp; 1.145 + UChar c, max; 1.146 + 1.147 + int32_t sourceIndex; 1.148 + 1.149 + /* set up the local pointers */ 1.150 + cnv=pArgs->converter; 1.151 + source=pArgs->source; 1.152 + sourceLimit=pArgs->sourceLimit; 1.153 + target=oldTarget=(uint8_t *)pArgs->target; 1.154 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.155 + offsets=pArgs->offsets; 1.156 + 1.157 + if(cnv->sharedData==&_Latin1Data) { 1.158 + max=0xff; /* Latin-1 */ 1.159 + } else { 1.160 + max=0x7f; /* US-ASCII */ 1.161 + } 1.162 + 1.163 + /* get the converter state from UConverter */ 1.164 + cp=cnv->fromUChar32; 1.165 + 1.166 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.167 + sourceIndex= cp==0 ? 0 : -1; 1.168 + 1.169 + /* 1.170 + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 1.171 + * for the minimum of the sourceLength and targetCapacity 1.172 + */ 1.173 + length=(int32_t)(sourceLimit-source); 1.174 + if(length<targetCapacity) { 1.175 + targetCapacity=length; 1.176 + } 1.177 + 1.178 + /* conversion loop */ 1.179 + if(cp!=0 && targetCapacity>0) { 1.180 + goto getTrail; 1.181 + } 1.182 + 1.183 +#if LATIN1_UNROLL_FROM_UNICODE 1.184 + /* unroll the loop with the most common case */ 1.185 + if(targetCapacity>=16) { 1.186 + int32_t count, loops; 1.187 + UChar u, oredChars; 1.188 + 1.189 + loops=count=targetCapacity>>4; 1.190 + do { 1.191 + oredChars=u=*source++; 1.192 + *target++=(uint8_t)u; 1.193 + oredChars|=u=*source++; 1.194 + *target++=(uint8_t)u; 1.195 + oredChars|=u=*source++; 1.196 + *target++=(uint8_t)u; 1.197 + oredChars|=u=*source++; 1.198 + *target++=(uint8_t)u; 1.199 + oredChars|=u=*source++; 1.200 + *target++=(uint8_t)u; 1.201 + oredChars|=u=*source++; 1.202 + *target++=(uint8_t)u; 1.203 + oredChars|=u=*source++; 1.204 + *target++=(uint8_t)u; 1.205 + oredChars|=u=*source++; 1.206 + *target++=(uint8_t)u; 1.207 + oredChars|=u=*source++; 1.208 + *target++=(uint8_t)u; 1.209 + oredChars|=u=*source++; 1.210 + *target++=(uint8_t)u; 1.211 + oredChars|=u=*source++; 1.212 + *target++=(uint8_t)u; 1.213 + oredChars|=u=*source++; 1.214 + *target++=(uint8_t)u; 1.215 + oredChars|=u=*source++; 1.216 + *target++=(uint8_t)u; 1.217 + oredChars|=u=*source++; 1.218 + *target++=(uint8_t)u; 1.219 + oredChars|=u=*source++; 1.220 + *target++=(uint8_t)u; 1.221 + oredChars|=u=*source++; 1.222 + *target++=(uint8_t)u; 1.223 + 1.224 + /* were all 16 entries really valid? */ 1.225 + if(oredChars>max) { 1.226 + /* no, return to the first of these 16 */ 1.227 + source-=16; 1.228 + target-=16; 1.229 + break; 1.230 + } 1.231 + } while(--count>0); 1.232 + count=loops-count; 1.233 + targetCapacity-=16*count; 1.234 + 1.235 + if(offsets!=NULL) { 1.236 + oldTarget+=16*count; 1.237 + while(count>0) { 1.238 + *offsets++=sourceIndex++; 1.239 + *offsets++=sourceIndex++; 1.240 + *offsets++=sourceIndex++; 1.241 + *offsets++=sourceIndex++; 1.242 + *offsets++=sourceIndex++; 1.243 + *offsets++=sourceIndex++; 1.244 + *offsets++=sourceIndex++; 1.245 + *offsets++=sourceIndex++; 1.246 + *offsets++=sourceIndex++; 1.247 + *offsets++=sourceIndex++; 1.248 + *offsets++=sourceIndex++; 1.249 + *offsets++=sourceIndex++; 1.250 + *offsets++=sourceIndex++; 1.251 + *offsets++=sourceIndex++; 1.252 + *offsets++=sourceIndex++; 1.253 + *offsets++=sourceIndex++; 1.254 + --count; 1.255 + } 1.256 + } 1.257 + } 1.258 +#endif 1.259 + 1.260 + /* conversion loop */ 1.261 + c=0; 1.262 + while(targetCapacity>0 && (c=*source++)<=max) { 1.263 + /* convert the Unicode code point */ 1.264 + *target++=(uint8_t)c; 1.265 + --targetCapacity; 1.266 + } 1.267 + 1.268 + if(c>max) { 1.269 + cp=c; 1.270 + if(!U_IS_SURROGATE(cp)) { 1.271 + /* callback(unassigned) */ 1.272 + } else if(U_IS_SURROGATE_LEAD(cp)) { 1.273 +getTrail: 1.274 + if(source<sourceLimit) { 1.275 + /* test the following code unit */ 1.276 + UChar trail=*source; 1.277 + if(U16_IS_TRAIL(trail)) { 1.278 + ++source; 1.279 + cp=U16_GET_SUPPLEMENTARY(cp, trail); 1.280 + /* this codepage does not map supplementary code points */ 1.281 + /* callback(unassigned) */ 1.282 + } else { 1.283 + /* this is an unmatched lead code unit (1st surrogate) */ 1.284 + /* callback(illegal) */ 1.285 + } 1.286 + } else { 1.287 + /* no more input */ 1.288 + cnv->fromUChar32=cp; 1.289 + goto noMoreInput; 1.290 + } 1.291 + } else { 1.292 + /* this is an unmatched trail code unit (2nd surrogate) */ 1.293 + /* callback(illegal) */ 1.294 + } 1.295 + 1.296 + *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; 1.297 + cnv->fromUChar32=cp; 1.298 + } 1.299 +noMoreInput: 1.300 + 1.301 + /* set offsets since the start */ 1.302 + if(offsets!=NULL) { 1.303 + size_t count=target-oldTarget; 1.304 + while(count>0) { 1.305 + *offsets++=sourceIndex++; 1.306 + --count; 1.307 + } 1.308 + } 1.309 + 1.310 + if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 1.311 + /* target is full */ 1.312 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.313 + } 1.314 + 1.315 + /* write back the updated pointers */ 1.316 + pArgs->source=source; 1.317 + pArgs->target=(char *)target; 1.318 + pArgs->offsets=offsets; 1.319 +} 1.320 + 1.321 +/* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */ 1.322 +static void 1.323 +ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 1.324 + UConverterToUnicodeArgs *pToUArgs, 1.325 + UErrorCode *pErrorCode) { 1.326 + UConverter *utf8; 1.327 + const uint8_t *source, *sourceLimit; 1.328 + uint8_t *target; 1.329 + int32_t targetCapacity; 1.330 + 1.331 + UChar32 c; 1.332 + uint8_t b, t1; 1.333 + 1.334 + /* set up the local pointers */ 1.335 + utf8=pToUArgs->converter; 1.336 + source=(uint8_t *)pToUArgs->source; 1.337 + sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 1.338 + target=(uint8_t *)pFromUArgs->target; 1.339 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 1.340 + 1.341 + /* get the converter state from the UTF-8 UConverter */ 1.342 + c=(UChar32)utf8->toUnicodeStatus; 1.343 + if(c!=0 && source<sourceLimit) { 1.344 + if(targetCapacity==0) { 1.345 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.346 + return; 1.347 + } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { 1.348 + ++source; 1.349 + *target++=(uint8_t)(((c&3)<<6)|t1); 1.350 + --targetCapacity; 1.351 + 1.352 + utf8->toUnicodeStatus=0; 1.353 + utf8->toULength=0; 1.354 + } else { 1.355 + /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ 1.356 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.357 + return; 1.358 + } 1.359 + } 1.360 + 1.361 + /* 1.362 + * Make sure that the last byte sequence before sourceLimit is complete 1.363 + * or runs into a lead byte. 1.364 + * In the conversion loop compare source with sourceLimit only once 1.365 + * per multi-byte character. 1.366 + * For Latin-1, adjust sourceLimit only for 1 trail byte because 1.367 + * the conversion loop handles at most 2-byte sequences. 1.368 + */ 1.369 + if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { 1.370 + --sourceLimit; 1.371 + } 1.372 + 1.373 + /* conversion loop */ 1.374 + while(source<sourceLimit) { 1.375 + if(targetCapacity>0) { 1.376 + b=*source++; 1.377 + if((int8_t)b>=0) { 1.378 + /* convert ASCII */ 1.379 + *target++=(uint8_t)b; 1.380 + --targetCapacity; 1.381 + } else if( /* handle U+0080..U+00FF inline */ 1.382 + b>=0xc2 && b<=0xc3 && 1.383 + (t1=(uint8_t)(*source-0x80)) <= 0x3f 1.384 + ) { 1.385 + ++source; 1.386 + *target++=(uint8_t)(((b&3)<<6)|t1); 1.387 + --targetCapacity; 1.388 + } else { 1.389 + /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ 1.390 + pToUArgs->source=(char *)(source-1); 1.391 + pFromUArgs->target=(char *)target; 1.392 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.393 + return; 1.394 + } 1.395 + } else { 1.396 + /* target is full */ 1.397 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.398 + break; 1.399 + } 1.400 + } 1.401 + 1.402 + /* 1.403 + * The sourceLimit may have been adjusted before the conversion loop 1.404 + * to stop before a truncated sequence. 1.405 + * If so, then collect the truncated sequence now. 1.406 + * For Latin-1, there is at most exactly one lead byte because of the 1.407 + * smaller sourceLimit adjustment logic. 1.408 + */ 1.409 + if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 1.410 + utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; 1.411 + utf8->toULength=1; 1.412 + utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; 1.413 + } 1.414 + 1.415 + /* write back the updated pointers */ 1.416 + pToUArgs->source=(char *)source; 1.417 + pFromUArgs->target=(char *)target; 1.418 +} 1.419 + 1.420 +static void 1.421 +_Latin1GetUnicodeSet(const UConverter *cnv, 1.422 + const USetAdder *sa, 1.423 + UConverterUnicodeSet which, 1.424 + UErrorCode *pErrorCode) { 1.425 + sa->addRange(sa->set, 0, 0xff); 1.426 +} 1.427 + 1.428 +static const UConverterImpl _Latin1Impl={ 1.429 + UCNV_LATIN_1, 1.430 + 1.431 + NULL, 1.432 + NULL, 1.433 + 1.434 + NULL, 1.435 + NULL, 1.436 + NULL, 1.437 + 1.438 + _Latin1ToUnicodeWithOffsets, 1.439 + _Latin1ToUnicodeWithOffsets, 1.440 + _Latin1FromUnicodeWithOffsets, 1.441 + _Latin1FromUnicodeWithOffsets, 1.442 + _Latin1GetNextUChar, 1.443 + 1.444 + NULL, 1.445 + NULL, 1.446 + NULL, 1.447 + NULL, 1.448 + _Latin1GetUnicodeSet, 1.449 + 1.450 + NULL, 1.451 + ucnv_Latin1FromUTF8 1.452 +}; 1.453 + 1.454 +static const UConverterStaticData _Latin1StaticData={ 1.455 + sizeof(UConverterStaticData), 1.456 + "ISO-8859-1", 1.457 + 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, 1.458 + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, 1.459 + 0, 1.460 + 0, 1.461 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.462 +}; 1.463 + 1.464 +const UConverterSharedData _Latin1Data={ 1.465 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.466 + NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, 1.467 + 0 1.468 +}; 1.469 + 1.470 +/* US-ASCII ----------------------------------------------------------------- */ 1.471 + 1.472 +/* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ 1.473 +static void 1.474 +_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.475 + UErrorCode *pErrorCode) { 1.476 + const uint8_t *source, *sourceLimit; 1.477 + UChar *target, *oldTarget; 1.478 + int32_t targetCapacity, length; 1.479 + int32_t *offsets; 1.480 + 1.481 + int32_t sourceIndex; 1.482 + 1.483 + uint8_t c; 1.484 + 1.485 + /* set up the local pointers */ 1.486 + source=(const uint8_t *)pArgs->source; 1.487 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.488 + target=oldTarget=pArgs->target; 1.489 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.490 + offsets=pArgs->offsets; 1.491 + 1.492 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.493 + sourceIndex=0; 1.494 + 1.495 + /* 1.496 + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 1.497 + * for the minimum of the sourceLength and targetCapacity 1.498 + */ 1.499 + length=(int32_t)(sourceLimit-source); 1.500 + if(length<targetCapacity) { 1.501 + targetCapacity=length; 1.502 + } 1.503 + 1.504 + if(targetCapacity>=8) { 1.505 + /* This loop is unrolled for speed and improved pipelining. */ 1.506 + int32_t count, loops; 1.507 + UChar oredChars; 1.508 + 1.509 + loops=count=targetCapacity>>3; 1.510 + do { 1.511 + oredChars=target[0]=source[0]; 1.512 + oredChars|=target[1]=source[1]; 1.513 + oredChars|=target[2]=source[2]; 1.514 + oredChars|=target[3]=source[3]; 1.515 + oredChars|=target[4]=source[4]; 1.516 + oredChars|=target[5]=source[5]; 1.517 + oredChars|=target[6]=source[6]; 1.518 + oredChars|=target[7]=source[7]; 1.519 + 1.520 + /* were all 16 entries really valid? */ 1.521 + if(oredChars>0x7f) { 1.522 + /* no, return to the first of these 16 */ 1.523 + break; 1.524 + } 1.525 + source+=8; 1.526 + target+=8; 1.527 + } while(--count>0); 1.528 + count=loops-count; 1.529 + targetCapacity-=count*8; 1.530 + 1.531 + if(offsets!=NULL) { 1.532 + oldTarget+=count*8; 1.533 + while(count>0) { 1.534 + offsets[0]=sourceIndex++; 1.535 + offsets[1]=sourceIndex++; 1.536 + offsets[2]=sourceIndex++; 1.537 + offsets[3]=sourceIndex++; 1.538 + offsets[4]=sourceIndex++; 1.539 + offsets[5]=sourceIndex++; 1.540 + offsets[6]=sourceIndex++; 1.541 + offsets[7]=sourceIndex++; 1.542 + offsets+=8; 1.543 + --count; 1.544 + } 1.545 + } 1.546 + } 1.547 + 1.548 + /* conversion loop */ 1.549 + c=0; 1.550 + while(targetCapacity>0 && (c=*source++)<=0x7f) { 1.551 + *target++=c; 1.552 + --targetCapacity; 1.553 + } 1.554 + 1.555 + if(c>0x7f) { 1.556 + /* callback(illegal); copy the current bytes to toUBytes[] */ 1.557 + UConverter *cnv=pArgs->converter; 1.558 + cnv->toUBytes[0]=c; 1.559 + cnv->toULength=1; 1.560 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.561 + } else if(source<sourceLimit && target>=pArgs->targetLimit) { 1.562 + /* target is full */ 1.563 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.564 + } 1.565 + 1.566 + /* set offsets since the start */ 1.567 + if(offsets!=NULL) { 1.568 + size_t count=target-oldTarget; 1.569 + while(count>0) { 1.570 + *offsets++=sourceIndex++; 1.571 + --count; 1.572 + } 1.573 + } 1.574 + 1.575 + /* write back the updated pointers */ 1.576 + pArgs->source=(const char *)source; 1.577 + pArgs->target=target; 1.578 + pArgs->offsets=offsets; 1.579 +} 1.580 + 1.581 +/* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */ 1.582 +static UChar32 1.583 +_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, 1.584 + UErrorCode *pErrorCode) { 1.585 + const uint8_t *source; 1.586 + uint8_t b; 1.587 + 1.588 + source=(const uint8_t *)pArgs->source; 1.589 + if(source<(const uint8_t *)pArgs->sourceLimit) { 1.590 + b=*source++; 1.591 + pArgs->source=(const char *)source; 1.592 + if(b<=0x7f) { 1.593 + return b; 1.594 + } else { 1.595 + UConverter *cnv=pArgs->converter; 1.596 + cnv->toUBytes[0]=b; 1.597 + cnv->toULength=1; 1.598 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.599 + return 0xffff; 1.600 + } 1.601 + } 1.602 + 1.603 + /* no output because of empty input */ 1.604 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.605 + return 0xffff; 1.606 +} 1.607 + 1.608 +/* "Convert" UTF-8 to US-ASCII: Validate and copy. */ 1.609 +static void 1.610 +ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 1.611 + UConverterToUnicodeArgs *pToUArgs, 1.612 + UErrorCode *pErrorCode) { 1.613 + const uint8_t *source, *sourceLimit; 1.614 + uint8_t *target; 1.615 + int32_t targetCapacity, length; 1.616 + 1.617 + uint8_t c; 1.618 + 1.619 + if(pToUArgs->converter->toUnicodeStatus!=0) { 1.620 + /* no handling of partial UTF-8 characters here, fall back to pivoting */ 1.621 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.622 + return; 1.623 + } 1.624 + 1.625 + /* set up the local pointers */ 1.626 + source=(const uint8_t *)pToUArgs->source; 1.627 + sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; 1.628 + target=(uint8_t *)pFromUArgs->target; 1.629 + targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 1.630 + 1.631 + /* 1.632 + * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter 1.633 + * for the minimum of the sourceLength and targetCapacity 1.634 + */ 1.635 + length=(int32_t)(sourceLimit-source); 1.636 + if(length<targetCapacity) { 1.637 + targetCapacity=length; 1.638 + } 1.639 + 1.640 + /* unroll the loop with the most common case */ 1.641 + if(targetCapacity>=16) { 1.642 + int32_t count, loops; 1.643 + uint8_t oredChars; 1.644 + 1.645 + loops=count=targetCapacity>>4; 1.646 + do { 1.647 + oredChars=*target++=*source++; 1.648 + oredChars|=*target++=*source++; 1.649 + oredChars|=*target++=*source++; 1.650 + oredChars|=*target++=*source++; 1.651 + oredChars|=*target++=*source++; 1.652 + oredChars|=*target++=*source++; 1.653 + oredChars|=*target++=*source++; 1.654 + oredChars|=*target++=*source++; 1.655 + oredChars|=*target++=*source++; 1.656 + oredChars|=*target++=*source++; 1.657 + oredChars|=*target++=*source++; 1.658 + oredChars|=*target++=*source++; 1.659 + oredChars|=*target++=*source++; 1.660 + oredChars|=*target++=*source++; 1.661 + oredChars|=*target++=*source++; 1.662 + oredChars|=*target++=*source++; 1.663 + 1.664 + /* were all 16 entries really valid? */ 1.665 + if(oredChars>0x7f) { 1.666 + /* no, return to the first of these 16 */ 1.667 + source-=16; 1.668 + target-=16; 1.669 + break; 1.670 + } 1.671 + } while(--count>0); 1.672 + count=loops-count; 1.673 + targetCapacity-=16*count; 1.674 + } 1.675 + 1.676 + /* conversion loop */ 1.677 + c=0; 1.678 + while(targetCapacity>0 && (c=*source)<=0x7f) { 1.679 + ++source; 1.680 + *target++=c; 1.681 + --targetCapacity; 1.682 + } 1.683 + 1.684 + if(c>0x7f) { 1.685 + /* non-ASCII character, handle in standard converter */ 1.686 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.687 + } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { 1.688 + /* target is full */ 1.689 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.690 + } 1.691 + 1.692 + /* write back the updated pointers */ 1.693 + pToUArgs->source=(const char *)source; 1.694 + pFromUArgs->target=(char *)target; 1.695 +} 1.696 + 1.697 +static void 1.698 +_ASCIIGetUnicodeSet(const UConverter *cnv, 1.699 + const USetAdder *sa, 1.700 + UConverterUnicodeSet which, 1.701 + UErrorCode *pErrorCode) { 1.702 + sa->addRange(sa->set, 0, 0x7f); 1.703 +} 1.704 + 1.705 +static const UConverterImpl _ASCIIImpl={ 1.706 + UCNV_US_ASCII, 1.707 + 1.708 + NULL, 1.709 + NULL, 1.710 + 1.711 + NULL, 1.712 + NULL, 1.713 + NULL, 1.714 + 1.715 + _ASCIIToUnicodeWithOffsets, 1.716 + _ASCIIToUnicodeWithOffsets, 1.717 + _Latin1FromUnicodeWithOffsets, 1.718 + _Latin1FromUnicodeWithOffsets, 1.719 + _ASCIIGetNextUChar, 1.720 + 1.721 + NULL, 1.722 + NULL, 1.723 + NULL, 1.724 + NULL, 1.725 + _ASCIIGetUnicodeSet, 1.726 + 1.727 + NULL, 1.728 + ucnv_ASCIIFromUTF8 1.729 +}; 1.730 + 1.731 +static const UConverterStaticData _ASCIIStaticData={ 1.732 + sizeof(UConverterStaticData), 1.733 + "US-ASCII", 1.734 + 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, 1.735 + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, 1.736 + 0, 1.737 + 0, 1.738 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.739 +}; 1.740 + 1.741 +const UConverterSharedData _ASCIIData={ 1.742 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.743 + NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, 1.744 + 0 1.745 +}; 1.746 + 1.747 +#endif