intl/icu/source/common/ucnvlat1.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnvlat1.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,744 @@
     1.4 +/* 
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2000-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnvlat1.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2000feb07
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#include "unicode/utypes.h"
    1.19 +
    1.20 +#if !UCONFIG_NO_CONVERSION
    1.21 +
    1.22 +#include "unicode/ucnv.h"
    1.23 +#include "unicode/uset.h"
    1.24 +#include "unicode/utf8.h"
    1.25 +#include "ucnv_bld.h"
    1.26 +#include "ucnv_cnv.h"
    1.27 +
    1.28 +/* control optimizations according to the platform */
    1.29 +#define LATIN1_UNROLL_FROM_UNICODE 1
    1.30 +
    1.31 +/* ISO 8859-1 --------------------------------------------------------------- */
    1.32 +
    1.33 +/* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
    1.34 +static void
    1.35 +_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    1.36 +                            UErrorCode *pErrorCode) {
    1.37 +    const uint8_t *source;
    1.38 +    UChar *target;
    1.39 +    int32_t targetCapacity, length;
    1.40 +    int32_t *offsets;
    1.41 +
    1.42 +    int32_t sourceIndex;
    1.43 +
    1.44 +    /* set up the local pointers */
    1.45 +    source=(const uint8_t *)pArgs->source;
    1.46 +    target=pArgs->target;
    1.47 +    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    1.48 +    offsets=pArgs->offsets;
    1.49 +
    1.50 +    sourceIndex=0;
    1.51 +
    1.52 +    /*
    1.53 +     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
    1.54 +     * for the minimum of the sourceLength and targetCapacity
    1.55 +     */
    1.56 +    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    1.57 +    if(length<=targetCapacity) {
    1.58 +        targetCapacity=length;
    1.59 +    } else {
    1.60 +        /* target will be full */
    1.61 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1.62 +        length=targetCapacity;
    1.63 +    }
    1.64 +
    1.65 +    if(targetCapacity>=8) {
    1.66 +        /* This loop is unrolled for speed and improved pipelining. */
    1.67 +        int32_t count, loops;
    1.68 +
    1.69 +        loops=count=targetCapacity>>3;
    1.70 +        length=targetCapacity&=0x7;
    1.71 +        do {
    1.72 +            target[0]=source[0];
    1.73 +            target[1]=source[1];
    1.74 +            target[2]=source[2];
    1.75 +            target[3]=source[3];
    1.76 +            target[4]=source[4];
    1.77 +            target[5]=source[5];
    1.78 +            target[6]=source[6];
    1.79 +            target[7]=source[7];
    1.80 +            target+=8;
    1.81 +            source+=8;
    1.82 +        } while(--count>0);
    1.83 +
    1.84 +        if(offsets!=NULL) {
    1.85 +            do {
    1.86 +                offsets[0]=sourceIndex++;
    1.87 +                offsets[1]=sourceIndex++;
    1.88 +                offsets[2]=sourceIndex++;
    1.89 +                offsets[3]=sourceIndex++;
    1.90 +                offsets[4]=sourceIndex++;
    1.91 +                offsets[5]=sourceIndex++;
    1.92 +                offsets[6]=sourceIndex++;
    1.93 +                offsets[7]=sourceIndex++;
    1.94 +                offsets+=8;
    1.95 +            } while(--loops>0);
    1.96 +        }
    1.97 +    }
    1.98 +
    1.99 +    /* conversion loop */
   1.100 +    while(targetCapacity>0) {
   1.101 +        *target++=*source++;
   1.102 +        --targetCapacity;
   1.103 +    }
   1.104 +
   1.105 +    /* write back the updated pointers */
   1.106 +    pArgs->source=(const char *)source;
   1.107 +    pArgs->target=target;
   1.108 +
   1.109 +    /* set offsets */
   1.110 +    if(offsets!=NULL) {
   1.111 +        while(length>0) {
   1.112 +            *offsets++=sourceIndex++;
   1.113 +            --length;
   1.114 +        }
   1.115 +        pArgs->offsets=offsets;
   1.116 +    }
   1.117 +}
   1.118 +
   1.119 +/* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
   1.120 +static UChar32
   1.121 +_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1.122 +                    UErrorCode *pErrorCode) {
   1.123 +    const uint8_t *source=(const uint8_t *)pArgs->source;
   1.124 +    if(source<(const uint8_t *)pArgs->sourceLimit) {
   1.125 +        pArgs->source=(const char *)(source+1);
   1.126 +        return *source;
   1.127 +    }
   1.128 +
   1.129 +    /* no output because of empty input */
   1.130 +    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1.131 +    return 0xffff;
   1.132 +}
   1.133 +
   1.134 +/* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
   1.135 +static void
   1.136 +_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   1.137 +                              UErrorCode *pErrorCode) {
   1.138 +    UConverter *cnv;
   1.139 +    const UChar *source, *sourceLimit;
   1.140 +    uint8_t *target, *oldTarget;
   1.141 +    int32_t targetCapacity, length;
   1.142 +    int32_t *offsets;
   1.143 +
   1.144 +    UChar32 cp;
   1.145 +    UChar c, max;
   1.146 +
   1.147 +    int32_t sourceIndex;
   1.148 +
   1.149 +    /* set up the local pointers */
   1.150 +    cnv=pArgs->converter;
   1.151 +    source=pArgs->source;
   1.152 +    sourceLimit=pArgs->sourceLimit;
   1.153 +    target=oldTarget=(uint8_t *)pArgs->target;
   1.154 +    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1.155 +    offsets=pArgs->offsets;
   1.156 +
   1.157 +    if(cnv->sharedData==&_Latin1Data) {
   1.158 +        max=0xff; /* Latin-1 */
   1.159 +    } else {
   1.160 +        max=0x7f; /* US-ASCII */
   1.161 +    }
   1.162 +
   1.163 +    /* get the converter state from UConverter */
   1.164 +    cp=cnv->fromUChar32;
   1.165 +
   1.166 +    /* sourceIndex=-1 if the current character began in the previous buffer */
   1.167 +    sourceIndex= cp==0 ? 0 : -1;
   1.168 +
   1.169 +    /*
   1.170 +     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   1.171 +     * for the minimum of the sourceLength and targetCapacity
   1.172 +     */
   1.173 +    length=(int32_t)(sourceLimit-source);
   1.174 +    if(length<targetCapacity) {
   1.175 +        targetCapacity=length;
   1.176 +    }
   1.177 +
   1.178 +    /* conversion loop */
   1.179 +    if(cp!=0 && targetCapacity>0) {
   1.180 +        goto getTrail;
   1.181 +    }
   1.182 +
   1.183 +#if LATIN1_UNROLL_FROM_UNICODE
   1.184 +    /* unroll the loop with the most common case */
   1.185 +    if(targetCapacity>=16) {
   1.186 +        int32_t count, loops;
   1.187 +        UChar u, oredChars;
   1.188 +
   1.189 +        loops=count=targetCapacity>>4;
   1.190 +        do {
   1.191 +            oredChars=u=*source++;
   1.192 +            *target++=(uint8_t)u;
   1.193 +            oredChars|=u=*source++;
   1.194 +            *target++=(uint8_t)u;
   1.195 +            oredChars|=u=*source++;
   1.196 +            *target++=(uint8_t)u;
   1.197 +            oredChars|=u=*source++;
   1.198 +            *target++=(uint8_t)u;
   1.199 +            oredChars|=u=*source++;
   1.200 +            *target++=(uint8_t)u;
   1.201 +            oredChars|=u=*source++;
   1.202 +            *target++=(uint8_t)u;
   1.203 +            oredChars|=u=*source++;
   1.204 +            *target++=(uint8_t)u;
   1.205 +            oredChars|=u=*source++;
   1.206 +            *target++=(uint8_t)u;
   1.207 +            oredChars|=u=*source++;
   1.208 +            *target++=(uint8_t)u;
   1.209 +            oredChars|=u=*source++;
   1.210 +            *target++=(uint8_t)u;
   1.211 +            oredChars|=u=*source++;
   1.212 +            *target++=(uint8_t)u;
   1.213 +            oredChars|=u=*source++;
   1.214 +            *target++=(uint8_t)u;
   1.215 +            oredChars|=u=*source++;
   1.216 +            *target++=(uint8_t)u;
   1.217 +            oredChars|=u=*source++;
   1.218 +            *target++=(uint8_t)u;
   1.219 +            oredChars|=u=*source++;
   1.220 +            *target++=(uint8_t)u;
   1.221 +            oredChars|=u=*source++;
   1.222 +            *target++=(uint8_t)u;
   1.223 +
   1.224 +            /* were all 16 entries really valid? */
   1.225 +            if(oredChars>max) {
   1.226 +                /* no, return to the first of these 16 */
   1.227 +                source-=16;
   1.228 +                target-=16;
   1.229 +                break;
   1.230 +            }
   1.231 +        } while(--count>0);
   1.232 +        count=loops-count;
   1.233 +        targetCapacity-=16*count;
   1.234 +
   1.235 +        if(offsets!=NULL) {
   1.236 +            oldTarget+=16*count;
   1.237 +            while(count>0) {
   1.238 +                *offsets++=sourceIndex++;
   1.239 +                *offsets++=sourceIndex++;
   1.240 +                *offsets++=sourceIndex++;
   1.241 +                *offsets++=sourceIndex++;
   1.242 +                *offsets++=sourceIndex++;
   1.243 +                *offsets++=sourceIndex++;
   1.244 +                *offsets++=sourceIndex++;
   1.245 +                *offsets++=sourceIndex++;
   1.246 +                *offsets++=sourceIndex++;
   1.247 +                *offsets++=sourceIndex++;
   1.248 +                *offsets++=sourceIndex++;
   1.249 +                *offsets++=sourceIndex++;
   1.250 +                *offsets++=sourceIndex++;
   1.251 +                *offsets++=sourceIndex++;
   1.252 +                *offsets++=sourceIndex++;
   1.253 +                *offsets++=sourceIndex++;
   1.254 +                --count;
   1.255 +            }
   1.256 +        }
   1.257 +    }
   1.258 +#endif
   1.259 +
   1.260 +    /* conversion loop */
   1.261 +    c=0;
   1.262 +    while(targetCapacity>0 && (c=*source++)<=max) {
   1.263 +        /* convert the Unicode code point */
   1.264 +        *target++=(uint8_t)c;
   1.265 +        --targetCapacity;
   1.266 +    }
   1.267 +
   1.268 +    if(c>max) {
   1.269 +        cp=c;
   1.270 +        if(!U_IS_SURROGATE(cp)) {
   1.271 +            /* callback(unassigned) */
   1.272 +        } else if(U_IS_SURROGATE_LEAD(cp)) {
   1.273 +getTrail:
   1.274 +            if(source<sourceLimit) {
   1.275 +                /* test the following code unit */
   1.276 +                UChar trail=*source;
   1.277 +                if(U16_IS_TRAIL(trail)) {
   1.278 +                    ++source;
   1.279 +                    cp=U16_GET_SUPPLEMENTARY(cp, trail);
   1.280 +                    /* this codepage does not map supplementary code points */
   1.281 +                    /* callback(unassigned) */
   1.282 +                } else {
   1.283 +                    /* this is an unmatched lead code unit (1st surrogate) */
   1.284 +                    /* callback(illegal) */
   1.285 +                }
   1.286 +            } else {
   1.287 +                /* no more input */
   1.288 +                cnv->fromUChar32=cp;
   1.289 +                goto noMoreInput;
   1.290 +            }
   1.291 +        } else {
   1.292 +            /* this is an unmatched trail code unit (2nd surrogate) */
   1.293 +            /* callback(illegal) */
   1.294 +        }
   1.295 +
   1.296 +        *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
   1.297 +        cnv->fromUChar32=cp;
   1.298 +    }
   1.299 +noMoreInput:
   1.300 +
   1.301 +    /* set offsets since the start */
   1.302 +    if(offsets!=NULL) {
   1.303 +        size_t count=target-oldTarget;
   1.304 +        while(count>0) {
   1.305 +            *offsets++=sourceIndex++;
   1.306 +            --count;
   1.307 +        }
   1.308 +    }
   1.309 +
   1.310 +    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
   1.311 +        /* target is full */
   1.312 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.313 +    }
   1.314 +
   1.315 +    /* write back the updated pointers */
   1.316 +    pArgs->source=source;
   1.317 +    pArgs->target=(char *)target;
   1.318 +    pArgs->offsets=offsets;
   1.319 +}
   1.320 +
   1.321 +/* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
   1.322 +static void
   1.323 +ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   1.324 +                    UConverterToUnicodeArgs *pToUArgs,
   1.325 +                    UErrorCode *pErrorCode) {
   1.326 +    UConverter *utf8;
   1.327 +    const uint8_t *source, *sourceLimit;
   1.328 +    uint8_t *target;
   1.329 +    int32_t targetCapacity;
   1.330 +
   1.331 +    UChar32 c;
   1.332 +    uint8_t b, t1;
   1.333 +
   1.334 +    /* set up the local pointers */
   1.335 +    utf8=pToUArgs->converter;
   1.336 +    source=(uint8_t *)pToUArgs->source;
   1.337 +    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   1.338 +    target=(uint8_t *)pFromUArgs->target;
   1.339 +    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   1.340 +
   1.341 +    /* get the converter state from the UTF-8 UConverter */
   1.342 +    c=(UChar32)utf8->toUnicodeStatus;
   1.343 +    if(c!=0 && source<sourceLimit) {
   1.344 +        if(targetCapacity==0) {
   1.345 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.346 +            return;
   1.347 +        } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) {
   1.348 +            ++source;
   1.349 +            *target++=(uint8_t)(((c&3)<<6)|t1);
   1.350 +            --targetCapacity;
   1.351 +
   1.352 +            utf8->toUnicodeStatus=0;
   1.353 +            utf8->toULength=0;
   1.354 +        } else {
   1.355 +            /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
   1.356 +            *pErrorCode=U_USING_DEFAULT_WARNING;
   1.357 +            return;
   1.358 +        }
   1.359 +    }
   1.360 +
   1.361 +    /*
   1.362 +     * Make sure that the last byte sequence before sourceLimit is complete
   1.363 +     * or runs into a lead byte.
   1.364 +     * In the conversion loop compare source with sourceLimit only once
   1.365 +     * per multi-byte character.
   1.366 +     * For Latin-1, adjust sourceLimit only for 1 trail byte because
   1.367 +     * the conversion loop handles at most 2-byte sequences.
   1.368 +     */
   1.369 +    if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {
   1.370 +        --sourceLimit;
   1.371 +    }
   1.372 +
   1.373 +    /* conversion loop */
   1.374 +    while(source<sourceLimit) {
   1.375 +        if(targetCapacity>0) {
   1.376 +            b=*source++;
   1.377 +            if((int8_t)b>=0) {
   1.378 +                /* convert ASCII */
   1.379 +                *target++=(uint8_t)b;
   1.380 +                --targetCapacity;
   1.381 +            } else if( /* handle U+0080..U+00FF inline */
   1.382 +                       b>=0xc2 && b<=0xc3 &&
   1.383 +                       (t1=(uint8_t)(*source-0x80)) <= 0x3f
   1.384 +            ) {
   1.385 +                ++source;
   1.386 +                *target++=(uint8_t)(((b&3)<<6)|t1);
   1.387 +                --targetCapacity;
   1.388 +            } else {
   1.389 +                /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
   1.390 +                pToUArgs->source=(char *)(source-1);
   1.391 +                pFromUArgs->target=(char *)target;
   1.392 +                *pErrorCode=U_USING_DEFAULT_WARNING;
   1.393 +                return;
   1.394 +            }
   1.395 +        } else {
   1.396 +            /* target is full */
   1.397 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.398 +            break;
   1.399 +        }
   1.400 +    }
   1.401 +
   1.402 +    /*
   1.403 +     * The sourceLimit may have been adjusted before the conversion loop
   1.404 +     * to stop before a truncated sequence.
   1.405 +     * If so, then collect the truncated sequence now.
   1.406 +     * For Latin-1, there is at most exactly one lead byte because of the
   1.407 +     * smaller sourceLimit adjustment logic.
   1.408 +     */
   1.409 +    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   1.410 +        utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
   1.411 +        utf8->toULength=1;
   1.412 +        utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
   1.413 +    }
   1.414 +
   1.415 +    /* write back the updated pointers */
   1.416 +    pToUArgs->source=(char *)source;
   1.417 +    pFromUArgs->target=(char *)target;
   1.418 +}
   1.419 +
   1.420 +static void
   1.421 +_Latin1GetUnicodeSet(const UConverter *cnv,
   1.422 +                     const USetAdder *sa,
   1.423 +                     UConverterUnicodeSet which,
   1.424 +                     UErrorCode *pErrorCode) {
   1.425 +    sa->addRange(sa->set, 0, 0xff);
   1.426 +}
   1.427 +
    /*
     * Implementation table for the ISO-8859-1 converter.
     * Positional initializer: the UCNV_LATIN_1 type constant followed by
     * function-pointer slots; NULL marks a slot for which this converter
     * supplies no function.
     * One to-Unicode routine fills two adjacent slots and one from-Unicode
     * routine fills the next two (presumably the plain and the with-offsets
     * entry points -- confirm against the UConverterImpl declaration, which is
     * not visible in this file). They are followed by the get-next-UChar
     * routine, the Unicode-set routine and, in the last slot, the direct
     * UTF-8 to Latin-1 routine.
     */
    1.428 +static const UConverterImpl _Latin1Impl={
    1.429 +    UCNV_LATIN_1,
    1.430 +
    1.431 +    NULL,
    1.432 +    NULL,
    1.433 +
    1.434 +    NULL,
    1.435 +    NULL,
    1.436 +    NULL,
    1.437 +
    1.438 +    _Latin1ToUnicodeWithOffsets,
    1.439 +    _Latin1ToUnicodeWithOffsets,
    1.440 +    _Latin1FromUnicodeWithOffsets,
    1.441 +    _Latin1FromUnicodeWithOffsets,
    1.442 +    _Latin1GetNextUChar,
    1.443 +
    1.444 +    NULL,
    1.445 +    NULL,
    1.446 +    NULL,
    1.447 +    NULL,
    1.448 +    _Latin1GetUnicodeSet,
    1.449 +
    1.450 +    NULL,
    1.451 +    ucnv_Latin1FromUTF8
    1.452 +};
   1.453 +
    /*
     * Static description of the ISO-8859-1 converter, given as a positional
     * initializer: structure size, the name "ISO-8859-1", the number 819
     * together with UCNV_IBM (presumably the IBM code page number and its
     * platform -- confirm), the UCNV_LATIN_1 type constant, and further values
     * ending in a zero-filled reserved array.
     * NOTE(review): the meaning of the remaining values (1, 1, the
     * { 0x1a, 0, 0, 0 } array, 1, FALSE, FALSE, 0, 0) depends on the field
     * order of UConverterStaticData, which is declared outside this file.
     */
    1.454 +static const UConverterStaticData _Latin1StaticData={
    1.455 +    sizeof(UConverterStaticData),
    1.456 +    "ISO-8859-1",
    1.457 +    819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
    1.458 +    { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
    1.459 +    0,
    1.460 +    0,
    1.461 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1.462 +};
   1.463 +
    /*
     * Shared-data record for the ISO-8859-1 converter. It is not static, so it
     * is visible to other translation units. It ties together the static
     * description (_Latin1StaticData) and the implementation table
     * (_Latin1Impl).
     * _Latin1FromUnicodeWithOffsets() compares a converter's sharedData pointer
     * with the address of this object to tell Latin-1 from US-ASCII.
     * NOTE(review): the second value is a 32-bit all-ones pattern; which field
     * it initializes (and what the two NULLs, FALSE and the final 0 stand for)
     * depends on UConverterSharedData, declared outside this file.
     */
    1.464 +const UConverterSharedData _Latin1Data={
    1.465 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
    1.466 +    NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, 
    1.467 +    0
    1.468 +};
   1.469 +
   1.470 +/* US-ASCII ----------------------------------------------------------------- */
   1.471 +
   1.472 +/* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
   1.473 +static void
   1.474 +_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1.475 +                           UErrorCode *pErrorCode) {
   1.476 +    const uint8_t *source, *sourceLimit;
   1.477 +    UChar *target, *oldTarget;
   1.478 +    int32_t targetCapacity, length;
   1.479 +    int32_t *offsets;
   1.480 +
   1.481 +    int32_t sourceIndex;
   1.482 +
   1.483 +    uint8_t c;
   1.484 +
   1.485 +    /* set up the local pointers */
   1.486 +    source=(const uint8_t *)pArgs->source;
   1.487 +    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1.488 +    target=oldTarget=pArgs->target;
   1.489 +    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1.490 +    offsets=pArgs->offsets;
   1.491 +
   1.492 +    /* sourceIndex=-1 if the current character began in the previous buffer */
   1.493 +    sourceIndex=0;
   1.494 +
   1.495 +    /*
   1.496 +     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   1.497 +     * for the minimum of the sourceLength and targetCapacity
   1.498 +     */
   1.499 +    length=(int32_t)(sourceLimit-source);
   1.500 +    if(length<targetCapacity) {
   1.501 +        targetCapacity=length;
   1.502 +    }
   1.503 +
   1.504 +    if(targetCapacity>=8) {
   1.505 +        /* This loop is unrolled for speed and improved pipelining. */
   1.506 +        int32_t count, loops;
   1.507 +        UChar oredChars;
   1.508 +
   1.509 +        loops=count=targetCapacity>>3;
   1.510 +        do {
   1.511 +            oredChars=target[0]=source[0];
   1.512 +            oredChars|=target[1]=source[1];
   1.513 +            oredChars|=target[2]=source[2];
   1.514 +            oredChars|=target[3]=source[3];
   1.515 +            oredChars|=target[4]=source[4];
   1.516 +            oredChars|=target[5]=source[5];
   1.517 +            oredChars|=target[6]=source[6];
   1.518 +            oredChars|=target[7]=source[7];
   1.519 +
   1.520 +            /* were all 16 entries really valid? */
   1.521 +            if(oredChars>0x7f) {
   1.522 +                /* no, return to the first of these 16 */
   1.523 +                break;
   1.524 +            }
   1.525 +            source+=8;
   1.526 +            target+=8;
   1.527 +        } while(--count>0);
   1.528 +        count=loops-count;
   1.529 +        targetCapacity-=count*8;
   1.530 +
   1.531 +        if(offsets!=NULL) {
   1.532 +            oldTarget+=count*8;
   1.533 +            while(count>0) {
   1.534 +                offsets[0]=sourceIndex++;
   1.535 +                offsets[1]=sourceIndex++;
   1.536 +                offsets[2]=sourceIndex++;
   1.537 +                offsets[3]=sourceIndex++;
   1.538 +                offsets[4]=sourceIndex++;
   1.539 +                offsets[5]=sourceIndex++;
   1.540 +                offsets[6]=sourceIndex++;
   1.541 +                offsets[7]=sourceIndex++;
   1.542 +                offsets+=8;
   1.543 +                --count;
   1.544 +            }
   1.545 +        }
   1.546 +    }
   1.547 +
   1.548 +    /* conversion loop */
   1.549 +    c=0;
   1.550 +    while(targetCapacity>0 && (c=*source++)<=0x7f) {
   1.551 +        *target++=c;
   1.552 +        --targetCapacity;
   1.553 +    }
   1.554 +
   1.555 +    if(c>0x7f) {
   1.556 +        /* callback(illegal); copy the current bytes to toUBytes[] */
   1.557 +        UConverter *cnv=pArgs->converter;
   1.558 +        cnv->toUBytes[0]=c;
   1.559 +        cnv->toULength=1;
   1.560 +        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.561 +    } else if(source<sourceLimit && target>=pArgs->targetLimit) {
   1.562 +        /* target is full */
   1.563 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.564 +    }
   1.565 +
   1.566 +    /* set offsets since the start */
   1.567 +    if(offsets!=NULL) {
   1.568 +        size_t count=target-oldTarget;
   1.569 +        while(count>0) {
   1.570 +            *offsets++=sourceIndex++;
   1.571 +            --count;
   1.572 +        }
   1.573 +    }
   1.574 +
   1.575 +    /* write back the updated pointers */
   1.576 +    pArgs->source=(const char *)source;
   1.577 +    pArgs->target=target;
   1.578 +    pArgs->offsets=offsets;
   1.579 +}
   1.580 +
   1.581 +/* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
   1.582 +static UChar32
   1.583 +_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
   1.584 +                   UErrorCode *pErrorCode) {
   1.585 +    const uint8_t *source;
   1.586 +    uint8_t b;
   1.587 +
   1.588 +    source=(const uint8_t *)pArgs->source;
   1.589 +    if(source<(const uint8_t *)pArgs->sourceLimit) {
   1.590 +        b=*source++;
   1.591 +        pArgs->source=(const char *)source;
   1.592 +        if(b<=0x7f) {
   1.593 +            return b;
   1.594 +        } else {
   1.595 +            UConverter *cnv=pArgs->converter;
   1.596 +            cnv->toUBytes[0]=b;
   1.597 +            cnv->toULength=1;
   1.598 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.599 +            return 0xffff;
   1.600 +        }
   1.601 +    }
   1.602 +
   1.603 +    /* no output because of empty input */
   1.604 +    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1.605 +    return 0xffff;
   1.606 +}
   1.607 +
   1.608 +/* "Convert" UTF-8 to US-ASCII: Validate and copy. */
   1.609 +static void
   1.610 +ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   1.611 +                   UConverterToUnicodeArgs *pToUArgs,
   1.612 +                   UErrorCode *pErrorCode) {
   1.613 +    const uint8_t *source, *sourceLimit;
   1.614 +    uint8_t *target;
   1.615 +    int32_t targetCapacity, length;
   1.616 +
   1.617 +    uint8_t c;
   1.618 +
   1.619 +    if(pToUArgs->converter->toUnicodeStatus!=0) {
   1.620 +        /* no handling of partial UTF-8 characters here, fall back to pivoting */
   1.621 +        *pErrorCode=U_USING_DEFAULT_WARNING;
   1.622 +        return;
   1.623 +    }
   1.624 +
   1.625 +    /* set up the local pointers */
   1.626 +    source=(const uint8_t *)pToUArgs->source;
   1.627 +    sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;
   1.628 +    target=(uint8_t *)pFromUArgs->target;
   1.629 +    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   1.630 +
   1.631 +    /*
   1.632 +     * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
   1.633 +     * for the minimum of the sourceLength and targetCapacity
   1.634 +     */
   1.635 +    length=(int32_t)(sourceLimit-source);
   1.636 +    if(length<targetCapacity) {
   1.637 +        targetCapacity=length;
   1.638 +    }
   1.639 +
   1.640 +    /* unroll the loop with the most common case */
   1.641 +    if(targetCapacity>=16) {
   1.642 +        int32_t count, loops;
   1.643 +        uint8_t oredChars;
   1.644 +
   1.645 +        loops=count=targetCapacity>>4;
   1.646 +        do {
   1.647 +            oredChars=*target++=*source++;
   1.648 +            oredChars|=*target++=*source++;
   1.649 +            oredChars|=*target++=*source++;
   1.650 +            oredChars|=*target++=*source++;
   1.651 +            oredChars|=*target++=*source++;
   1.652 +            oredChars|=*target++=*source++;
   1.653 +            oredChars|=*target++=*source++;
   1.654 +            oredChars|=*target++=*source++;
   1.655 +            oredChars|=*target++=*source++;
   1.656 +            oredChars|=*target++=*source++;
   1.657 +            oredChars|=*target++=*source++;
   1.658 +            oredChars|=*target++=*source++;
   1.659 +            oredChars|=*target++=*source++;
   1.660 +            oredChars|=*target++=*source++;
   1.661 +            oredChars|=*target++=*source++;
   1.662 +            oredChars|=*target++=*source++;
   1.663 +
   1.664 +            /* were all 16 entries really valid? */
   1.665 +            if(oredChars>0x7f) {
   1.666 +                /* no, return to the first of these 16 */
   1.667 +                source-=16;
   1.668 +                target-=16;
   1.669 +                break;
   1.670 +            }
   1.671 +        } while(--count>0);
   1.672 +        count=loops-count;
   1.673 +        targetCapacity-=16*count;
   1.674 +    }
   1.675 +
   1.676 +    /* conversion loop */
   1.677 +    c=0;
   1.678 +    while(targetCapacity>0 && (c=*source)<=0x7f) {
   1.679 +        ++source;
   1.680 +        *target++=c;
   1.681 +        --targetCapacity;
   1.682 +    }
   1.683 +
   1.684 +    if(c>0x7f) {
   1.685 +        /* non-ASCII character, handle in standard converter */
   1.686 +        *pErrorCode=U_USING_DEFAULT_WARNING;
   1.687 +    } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {
   1.688 +        /* target is full */
   1.689 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.690 +    }
   1.691 +
   1.692 +    /* write back the updated pointers */
   1.693 +    pToUArgs->source=(const char *)source;
   1.694 +    pFromUArgs->target=(char *)target;
   1.695 +}
   1.696 +
   1.697 +static void
   1.698 +_ASCIIGetUnicodeSet(const UConverter *cnv,
   1.699 +                    const USetAdder *sa,
   1.700 +                    UConverterUnicodeSet which,
   1.701 +                    UErrorCode *pErrorCode) {
   1.702 +    sa->addRange(sa->set, 0, 0x7f);
   1.703 +}
   1.704 +
    /*
     * Implementation table for the US-ASCII converter.
     * Positional initializer: the UCNV_US_ASCII type constant followed by
     * function-pointer slots; NULL marks a slot for which this converter
     * supplies no function.
     * The from-Unicode slots reuse _Latin1FromUnicodeWithOffsets(), which limits
     * itself to 0x7f whenever the converter's shared data is not _Latin1Data.
     * The to-Unicode, get-next-UChar, Unicode-set and UTF-8 slots use the
     * ASCII-specific routines of this file.
     * NOTE(review): the exact slot names come from the UConverterImpl
     * declaration, which is not visible in this file.
     */
    1.705 +static const UConverterImpl _ASCIIImpl={
    1.706 +    UCNV_US_ASCII,
    1.707 +
    1.708 +    NULL,
    1.709 +    NULL,
    1.710 +
    1.711 +    NULL,
    1.712 +    NULL,
    1.713 +    NULL,
    1.714 +
    1.715 +    _ASCIIToUnicodeWithOffsets,
    1.716 +    _ASCIIToUnicodeWithOffsets,
    1.717 +    _Latin1FromUnicodeWithOffsets,
    1.718 +    _Latin1FromUnicodeWithOffsets,
    1.719 +    _ASCIIGetNextUChar,
    1.720 +
    1.721 +    NULL,
    1.722 +    NULL,
    1.723 +    NULL,
    1.724 +    NULL,
    1.725 +    _ASCIIGetUnicodeSet,
    1.726 +
    1.727 +    NULL,
    1.728 +    ucnv_ASCIIFromUTF8
    1.729 +};
   1.730 +
    /*
     * Static description of the US-ASCII converter, given as a positional
     * initializer: structure size, the name "US-ASCII", the number 367
     * together with UCNV_IBM (presumably the IBM code page number and its
     * platform -- confirm), the UCNV_US_ASCII type constant, and further values
     * ending in a zero-filled reserved array.
     * NOTE(review): the meaning of the remaining values (1, 1, the
     * { 0x1a, 0, 0, 0 } array, 1, FALSE, FALSE, 0, 0) depends on the field
     * order of UConverterStaticData, which is declared outside this file.
     */
    1.731 +static const UConverterStaticData _ASCIIStaticData={
    1.732 +    sizeof(UConverterStaticData),
    1.733 +    "US-ASCII",
    1.734 +    367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
    1.735 +    { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
    1.736 +    0,
    1.737 +    0,
    1.738 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1.739 +};
   1.740 +
    /*
     * Shared-data record for the US-ASCII converter. It is not static, so it
     * is visible to other translation units. It ties together the static
     * description (_ASCIIStaticData) and the implementation table (_ASCIIImpl).
     * NOTE(review): the second value is a 32-bit all-ones pattern; which field
     * it initializes (and what the two NULLs, FALSE and the final 0 stand for)
     * depends on UConverterSharedData, declared outside this file.
     */
    1.741 +const UConverterSharedData _ASCIIData={
    1.742 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
    1.743 +    NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, 
    1.744 +    0
    1.745 +};
   1.746 +
   1.747 +#endif

mercurial