The Tor Browser: diff intl/icu/source/common/ucnv

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnv_u8.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1086 @@
     1.4 +/*  
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2002-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnv_u8.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2002jul01
    1.15 +*   created by: Markus W. Scherer
    1.16 +*
    1.17 +*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
    1.18 +*
    1.19 +*   Also, CESU-8 implementation, see UTR 26.
    1.20 +*   The CESU-8 converter uses all the same functions as the
    1.21 +*   UTF-8 converter, with a branch for converting supplementary code points.
    1.22 +*/
    1.23 +
    1.24 +#include "unicode/utypes.h"
    1.25 +
    1.26 +#if !UCONFIG_NO_CONVERSION
    1.27 +
    1.28 +#include "unicode/ucnv.h"
    1.29 +#include "unicode/utf.h"
    1.30 +#include "unicode/utf8.h"
    1.31 +#include "unicode/utf16.h"
    1.32 +#include "ucnv_bld.h"
    1.33 +#include "ucnv_cnv.h"
    1.34 +#include "cmemory.h"
    1.35 +
    1.36 +/* Prototypes --------------------------------------------------------------- */
    1.37 +
    1.38 +/* Forward declarations of the two U_CFUNC fromUnicode functions defined further down in this file; kept here to make finicky compilers happy */
    1.39 +
    1.40 +U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
    1.41 +                                           UErrorCode *err);
    1.42 +U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
    1.43 +                                                        UErrorCode *err);
    1.44 +
    1.45 +
    1.46 +/* UTF-8 -------------------------------------------------------------------- */
    1.47 +
    1.48 +/* UTF-8 Conversion DATA
    1.49 + *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
    1.50 + */
    1.51 +/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
    1.52 +#define MAXIMUM_UCS2            0x0000FFFF /* largest code point that fits in a single 16-bit UChar; larger values are written as a surrogate pair / 4 UTF-8 bytes */
    1.53 +#define MAXIMUM_UTF             0x0010FFFF /* largest code point accepted by the decoders below */
    1.54 +#define MAXIMUM_UCS4            0x7FFFFFFF
    1.55 +#define HALF_SHIFT              10 /* shift that extracts the high 10 bits for the lead surrogate */
    1.56 +#define HALF_BASE               0x0010000 /* subtracted from a supplementary code point before splitting it into surrogates */
    1.57 +#define HALF_MASK               0x3FF /* mask that extracts the low 10 bits for the trail surrogate */
    1.58 +#define SURROGATE_HIGH_START    0xD800 /* added to the high 10 bits to form the lead surrogate */
    1.59 +#define SURROGATE_HIGH_END      0xDBFF
    1.60 +#define SURROGATE_LOW_START     0xDC00 /* added to the low 10 bits to form the trail surrogate */
    1.61 +#define SURROGATE_LOW_END       0xDFFF
    1.62 +
    1.63 +/* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 = 0x2400 = 9216 */
    1.64 +#define SURROGATE_LOW_BASE      9216
    1.65 +
    1.66 +static const uint32_t offsetsFromUTF8[7] = {0, /* indexed by sequence length 0..6: the lead/trail marker bits that pile up when a sequence is accumulated as ch=(ch<<6)+trail; subtracting the entry leaves the code point */
    1.67 +  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, /* lengths 1, 2, 3; e.g. (0xC0<<6)+0x80 = 0x3080 */
    1.68 +  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 /* lengths 4, 5, 6; the length-6 value is the sum truncated to 32 bits */
    1.69 +};
    1.70 +
    1.71 +/* END OF UTF-8 Conversion DATA */
    1.72 +
    1.73 +static const int8_t bytesFromUTF8[256] = { /* total sequence length implied by a first byte; 0 marks a byte that cannot start a sequence */
    1.74 +  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00..0x1F */
    1.75 +  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20..0x3F */
    1.76 +  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40..0x5F */
    1.77 +  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60..0x7F */
    1.78 +  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80..0x9F: trail bytes */
    1.79 +  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0..0xBF: trail bytes */
    1.80 +  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0..0xDF */
    1.81 +  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 0xE0..0xEF, 0xF0..0xF7, 0xF8..0xFB, 0xFC..0xFD, 0xFE..0xFF */
    1.82 +};
    1.83 +
    1.84 +/*
    1.85 + * Starting with Unicode 3.0.1:
    1.86 + * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
    1.87 + * byte sequences with more than 4 bytes are illegal in UTF-8,
    1.88 + * which is tested with impossible values for them
    1.89 + */
    1.90 +static const uint32_t /* indexed by sequence length 0..6; decoders reject a sequence whose code point is below its entry */
    1.91 +utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; /* 0xffffffff for lengths 5 and 6 can never be <= 0x10ffff, so those lengths always fail */
    1.92 +
    1.93 +static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, /* Decodes UTF-8 (or CESU-8) bytes from args->source into UChars at args->target; no offsets are written */
    1.94 +                                  UErrorCode * err) /* err: set to U_BUFFER_OVERFLOW_ERROR when the target fills up, U_ILLEGAL_CHAR_FOUND on a rejected sequence */
    1.95 +{
    1.96 +    UConverter *cnv = args->converter;
    1.97 +    const unsigned char *mySource = (unsigned char *) args->source;
    1.98 +    UChar *myTarget = args->target;
    1.99 +    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   1.100 +    const UChar *targetLimit = args->targetLimit;
   1.101 +    unsigned char *toUBytes = cnv->toUBytes; /* receives the raw bytes of the sequence currently being decoded */
   1.102 +    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); /* the same function serves both converters; this flag selects the CESU-8 validity rule below */
   1.103 +    uint32_t ch, ch2 = 0; /* ch: accumulated sequence value; ch2: the byte just read */
   1.104 +    int32_t i, inBytes; /* i: bytes of the sequence seen so far; inBytes: length implied by the lead byte */
   1.105 +  
   1.106 +    /* Resume a sequence that was cut off at the end of the previous call's source buffer */
   1.107 +    if (cnv->toUnicodeStatus && myTarget < targetLimit)
   1.108 +    {
   1.109 +        inBytes = cnv->mode;            /* restore # of bytes to consume */
   1.110 +        i = cnv->toULength;             /* restore # of bytes consumed */
   1.111 +        cnv->toULength = 0;
   1.112 +
   1.113 +        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
   1.114 +        cnv->toUnicodeStatus = 0;
   1.115 +        goto morebytes; /* jumps into the middle of the main loop body */
   1.116 +    }
   1.117 +
   1.118 +
   1.119 +    while (mySource < sourceLimit && myTarget < targetLimit)
   1.120 +    {
   1.121 +        ch = *(mySource++);
   1.122 +        if (ch < 0x80)        /* Simple case */
   1.123 +        {
   1.124 +            *(myTarget++) = (UChar) ch;
   1.125 +        }
   1.126 +        else
   1.127 +        {
   1.128 +            /* store the first char */
   1.129 +            toUBytes[0] = (char)ch;
   1.130 +            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
   1.131 +            i = 1;
   1.132 +
   1.133 +morebytes:
   1.134 +            while (i < inBytes)
   1.135 +            {
   1.136 +                if (mySource < sourceLimit)
   1.137 +                {
   1.138 +                    toUBytes[i] = (char) (ch2 = *mySource); /* the byte is recorded before it is known to be a trail byte; the source pointer is not advanced yet */
   1.139 +                    if (!U8_IS_TRAIL(ch2))
   1.140 +                    {
   1.141 +                        break; /* i < inBytes */
   1.142 +                    }
   1.143 +                    ch = (ch << 6) + ch2; /* marker bits are left in and removed in one step after the loop */
   1.144 +                    ++mySource;
   1.145 +                    i++;
   1.146 +                }
   1.147 +                else
   1.148 +                {
   1.149 +                    /* stores a partially calculated target*/
   1.150 +                    cnv->toUnicodeStatus = ch;
   1.151 +                    cnv->mode = inBytes;
   1.152 +                    cnv->toULength = (int8_t) i;
   1.153 +                    goto donefornow; /* source exhausted mid-sequence; no error is set here */
   1.154 +                }
   1.155 +            }
   1.156 +
   1.157 +            /* Remove the accumulated high bits */
   1.158 +            ch -= offsetsFromUTF8[inBytes];
   1.159 +
   1.160 +            /*
   1.161 +             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   1.162 +             * - use only trail bytes after a lead byte (checked above)
   1.163 +             * - use the right number of trail bytes for a given lead byte
   1.164 +             * - encode a code point <= U+10ffff
   1.165 +             * - use the fewest possible number of bytes for their code points
   1.166 +             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   1.167 +             *
   1.168 +             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   1.169 +             * There are no irregular sequences any more.
   1.170 +             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
   1.171 +             */
   1.172 +            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
   1.173 +                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) /* CESU-8: at most 3 bytes, surrogates allowed; UTF-8: surrogates rejected */
   1.174 +            {
   1.175 +                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   1.176 +                if (ch <= MAXIMUM_UCS2) 
   1.177 +                {
   1.178 +                    /* fits in 16 bits */
   1.179 +                    *(myTarget++) = (UChar) ch;
   1.180 +                }
   1.181 +                else
   1.182 +                {
   1.183 +                    /* write out the surrogates */
   1.184 +                    ch -= HALF_BASE;
   1.185 +                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
   1.186 +                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
   1.187 +                    if (myTarget < targetLimit)
   1.188 +                    {
   1.189 +                        *(myTarget++) = (UChar)ch;
   1.190 +                    }
   1.191 +                    else
   1.192 +                    {
   1.193 +                        /* No room for the trail surrogate: park it in the converter's overflow buffer (not flushed here) */
   1.194 +                        cnv->UCharErrorBuffer[0] = (UChar) ch;
   1.195 +                        cnv->UCharErrorBufferLength = 1;
   1.196 +                        *err = U_BUFFER_OVERFLOW_ERROR;
   1.197 +                        break;
   1.198 +                    }
   1.199 +                }
   1.200 +            }
   1.201 +            else
   1.202 +            {
   1.203 +                cnv->toULength = (int8_t)i; /* number of offending bytes left in cnv->toUBytes */
   1.204 +                *err = U_ILLEGAL_CHAR_FOUND;
   1.205 +                break;
   1.206 +            }
   1.207 +        }
   1.208 +    }
   1.209 +
   1.210 +donefornow:
   1.211 +    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   1.212 +    {
   1.213 +        /* End of target buffer */
   1.214 +        *err = U_BUFFER_OVERFLOW_ERROR;
   1.215 +    }
   1.216 +
   1.217 +    args->target = myTarget; /* report how far output and input advanced */
   1.218 +    args->source = (const char *) mySource;
   1.219 +}
   1.220 +
   1.221 +static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, /* Same decoding as ucnv_toUnicode_UTF8, but also writes one entry to args->offsets per output UChar */
   1.222 +                                                UErrorCode * err) /* err: set to U_BUFFER_OVERFLOW_ERROR when the target fills up, U_ILLEGAL_CHAR_FOUND on a rejected sequence */
   1.223 +{
   1.224 +    UConverter *cnv = args->converter;
   1.225 +    const unsigned char *mySource = (unsigned char *) args->source;
   1.226 +    UChar *myTarget = args->target;
   1.227 +    int32_t *myOffsets = args->offsets;
   1.228 +    int32_t offsetNum = 0; /* running byte count for this call; value written for each output UChar */
   1.229 +    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   1.230 +    const UChar *targetLimit = args->targetLimit;
   1.231 +    unsigned char *toUBytes = cnv->toUBytes; /* receives the raw bytes of the sequence currently being decoded */
   1.232 +    UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); /* selects the CESU-8 validity rule below */
   1.233 +    uint32_t ch, ch2 = 0; /* ch: accumulated sequence value; ch2: the byte just read */
   1.234 +    int32_t i, inBytes; /* i: bytes of the sequence seen so far; inBytes: length implied by the lead byte */
   1.235 +
   1.236 +    /* Resume a sequence that was cut off at the end of the previous call's source buffer */
   1.237 +    if (cnv->toUnicodeStatus && myTarget < targetLimit)
   1.238 +    {
   1.239 +        inBytes = cnv->mode;            /* restore # of bytes to consume */
   1.240 +        i = cnv->toULength;             /* restore # of bytes consumed */
   1.241 +        cnv->toULength = 0;
   1.242 +
   1.243 +        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
   1.244 +        cnv->toUnicodeStatus = 0;
   1.245 +        goto morebytes; /* jumps into the middle of the main loop body */
   1.246 +    }
   1.247 +
   1.248 +    while (mySource < sourceLimit && myTarget < targetLimit)
   1.249 +    {
   1.250 +        ch = *(mySource++);
   1.251 +        if (ch < 0x80)        /* Simple case */
   1.252 +        {
   1.253 +            *(myTarget++) = (UChar) ch;
   1.254 +            *(myOffsets++) = offsetNum++;
   1.255 +        }
   1.256 +        else
   1.257 +        {
   1.258 +            toUBytes[0] = (char)ch; /* store the lead byte */
   1.259 +            inBytes = bytesFromUTF8[ch]; /* sequence length implied by the lead byte; 0 for an invalid first byte */
   1.260 +            i = 1;
   1.261 +
   1.262 +morebytes:
   1.263 +            while (i < inBytes)
   1.264 +            {
   1.265 +                if (mySource < sourceLimit)
   1.266 +                {
   1.267 +                    toUBytes[i] = (char) (ch2 = *mySource); /* recorded before it is known to be a trail byte; the source pointer is not advanced yet */
   1.268 +                    if (!U8_IS_TRAIL(ch2))
   1.269 +                    {
   1.270 +                        break; /* i < inBytes */
   1.271 +                    }
   1.272 +                    ch = (ch << 6) + ch2; /* marker bits are left in and removed in one step after the loop */
   1.273 +                    ++mySource;
   1.274 +                    i++;
   1.275 +                }
   1.276 +                else
   1.277 +                {
   1.278 +                    cnv->toUnicodeStatus = ch; /* save the partial sequence so the next call can resume it */
   1.279 +                    cnv->mode = inBytes;
   1.280 +                    cnv->toULength = (int8_t)i;
   1.281 +                    goto donefornow;
   1.282 +                }
   1.283 +            }
   1.284 +
   1.285 +            /* Remove the accumulated high bits */
   1.286 +            ch -= offsetsFromUTF8[inBytes];
   1.287 +
   1.288 +            /*
   1.289 +             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   1.290 +             * - use only trail bytes after a lead byte (checked above)
   1.291 +             * - use the right number of trail bytes for a given lead byte
   1.292 +             * - encode a code point <= U+10ffff
   1.293 +             * - use the fewest possible number of bytes for their code points
   1.294 +             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   1.295 +             *
   1.296 +             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   1.297 +             * There are no irregular sequences any more.
   1.298 +             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
   1.299 +             */
   1.300 +            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
   1.301 +                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) /* CESU-8: at most 3 bytes, surrogates allowed; UTF-8: surrogates rejected */
   1.302 +            {
   1.303 +                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   1.304 +                if (ch <= MAXIMUM_UCS2) 
   1.305 +                {
   1.306 +                    /* fits in 16 bits */
   1.307 +                    *(myTarget++) = (UChar) ch;
   1.308 +                    *(myOffsets++) = offsetNum;
   1.309 +                }
   1.310 +                else
   1.311 +                {
   1.312 +                    /* write out the surrogates */
   1.313 +                    ch -= HALF_BASE;
   1.314 +                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
   1.315 +                    *(myOffsets++) = offsetNum; /* both surrogates get the same offset */
   1.316 +                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
   1.317 +                    if (myTarget < targetLimit)
   1.318 +                    {
   1.319 +                        *(myTarget++) = (UChar)ch;
   1.320 +                        *(myOffsets++) = offsetNum;
   1.321 +                    }
   1.322 +                    else
   1.323 +                    {
   1.324 +                        cnv->UCharErrorBuffer[0] = (UChar) ch; /* no room for the trail surrogate: park it in the overflow buffer */
   1.325 +                        cnv->UCharErrorBufferLength = 1;
   1.326 +                        *err = U_BUFFER_OVERFLOW_ERROR; /* no break here: the target is full, so the outer loop condition ends the loop */
   1.327 +                    }
   1.328 +                }
   1.329 +                offsetNum += i; /* NOTE(review): after a resume via morebytes, i also counts bytes consumed in the previous call - confirm this is intended */
   1.330 +            }
   1.331 +            else
   1.332 +            {
   1.333 +                cnv->toULength = (int8_t)i; /* number of offending bytes left in cnv->toUBytes */
   1.334 +                *err = U_ILLEGAL_CHAR_FOUND;
   1.335 +                break;
   1.336 +            }
   1.337 +        }
   1.338 +    }
   1.339 +
   1.340 +donefornow:
   1.341 +    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   1.342 +    {   /* End of target buffer */
   1.343 +        *err = U_BUFFER_OVERFLOW_ERROR;
   1.344 +    }
   1.345 +
   1.346 +    args->target = myTarget; /* report how far output, input and offsets advanced */
   1.347 +    args->source = (const char *) mySource;
   1.348 +    args->offsets = myOffsets;
   1.349 +}
   1.350 +
   1.351 +U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, /* Encodes UChars from args->source as UTF-8 (or CESU-8) bytes at args->target; no offsets are written */
   1.352 +                                    UErrorCode * err) /* err: set to U_BUFFER_OVERFLOW_ERROR when the target fills up, U_ILLEGAL_CHAR_FOUND on an unpaired surrogate */
   1.353 +{
   1.354 +    UConverter *cnv = args->converter;
   1.355 +    const UChar *mySource = args->source;
   1.356 +    const UChar *sourceLimit = args->sourceLimit;
   1.357 +    uint8_t *myTarget = (uint8_t *) args->target;
   1.358 +    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
   1.359 +    uint8_t *tempPtr; /* where the 3/4-byte encoding is assembled: the target itself or tempBuf */
   1.360 +    UChar32 ch;
   1.361 +    uint8_t tempBuf[4]; /* staging area used when fewer than 4 target bytes remain */
   1.362 +    int32_t indexToWrite; /* index of the last byte of the encoding (2 for 3 bytes, 3 for 4 bytes) */
   1.363 +    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); /* surrogate pairing is done only when this is true */
   1.364 +
   1.365 +    if (cnv->fromUChar32 && myTarget < targetLimit) /* resume with the code unit saved when the previous call ran out of input */
   1.366 +    {
   1.367 +        ch = cnv->fromUChar32;
   1.368 +        cnv->fromUChar32 = 0;
   1.369 +        goto lowsurrogate; /* jumps into the surrogate branch of the main loop body */
   1.370 +    }
   1.371 +
   1.372 +    while (mySource < sourceLimit && myTarget < targetLimit)
   1.373 +    {
   1.374 +        ch = *(mySource++);
   1.375 +
   1.376 +        if (ch < 0x80)        /* Single byte */
   1.377 +        {
   1.378 +            *(myTarget++) = (uint8_t) ch;
   1.379 +        }
   1.380 +        else if (ch < 0x800)  /* Double byte */
   1.381 +        {
   1.382 +            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
   1.383 +            if (myTarget < targetLimit)
   1.384 +            {
   1.385 +                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
   1.386 +            }
   1.387 +            else
   1.388 +            {
   1.389 +                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); /* second byte does not fit: park it in the overflow buffer */
   1.390 +                cnv->charErrorBufferLength = 1;
   1.391 +                *err = U_BUFFER_OVERFLOW_ERROR;
   1.392 +            }
   1.393 +        }
   1.394 +        else {
   1.395 +            /* Check for surrogates */
   1.396 +            if(U16_IS_SURROGATE(ch) && isNotCESU8) { /* for CESU-8 this branch is skipped and each surrogate is encoded on its own as 3 bytes below */
   1.397 +lowsurrogate:
   1.398 +                if (mySource < sourceLimit) {
   1.399 +                    /* test both code units */
   1.400 +                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
   1.401 +                        /* convert and consume this supplementary code point */
   1.402 +                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
   1.403 +                        ++mySource;
   1.404 +                        /* exit this condition tree */
   1.405 +                    }
   1.406 +                    else {
   1.407 +                        /* this is an unpaired trail or lead code unit */
   1.408 +                        /* callback(illegal) */
   1.409 +                        cnv->fromUChar32 = ch;
   1.410 +                        *err = U_ILLEGAL_CHAR_FOUND;
   1.411 +                        break;
   1.412 +                    }
   1.413 +                }
   1.414 +                else {
   1.415 +                    /* no more input: remember the code unit for the next call; no error is set */
   1.416 +                    cnv->fromUChar32 = ch;
   1.417 +                    break;
   1.418 +                }
   1.419 +            }
   1.420 +
   1.421 +            /* Do we write the buffer directly for speed,
   1.422 +            or do we have to be careful about target buffer space? */
   1.423 +            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
   1.424 +
   1.425 +            if (ch <= MAXIMUM_UCS2) { /* three-byte form */
   1.426 +                indexToWrite = 2;
   1.427 +                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
   1.428 +            }
   1.429 +            else { /* four-byte form */
   1.430 +                indexToWrite = 3;
   1.431 +                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
   1.432 +                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
   1.433 +            }
   1.434 +            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); /* the last two trail bytes are common to both forms */
   1.435 +            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
   1.436 +
   1.437 +            if (tempPtr == myTarget) {
   1.438 +                /* There was enough space to write the codepoint directly. */
   1.439 +                myTarget += (indexToWrite + 1);
   1.440 +            }
   1.441 +            else {
   1.442 +                /* We might run out of room soon. Write it slowly. */
   1.443 +                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
   1.444 +                    if (myTarget < targetLimit) {
   1.445 +                        *(myTarget++) = *tempPtr;
   1.446 +                    }
   1.447 +                    else {
   1.448 +                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; /* bytes that do not fit are appended to the overflow buffer */
   1.449 +                        *err = U_BUFFER_OVERFLOW_ERROR;
   1.450 +                    }
   1.451 +                }
   1.452 +            }
   1.453 +        }
   1.454 +    }
   1.455 +
   1.456 +    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) /* input remains but the target is full */
   1.457 +    {
   1.458 +        *err = U_BUFFER_OVERFLOW_ERROR;
   1.459 +    }
   1.460 +
   1.461 +    args->target = (char *) myTarget; /* report how far output and input advanced */
   1.462 +    args->source = mySource;
   1.463 +}
   1.464 +
   1.465 +U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, /* Same encoding as ucnv_fromUnicode_UTF8, but also writes one entry to args->offsets per output byte */
   1.466 +                                                  UErrorCode * err) /* err: set to U_BUFFER_OVERFLOW_ERROR when the target fills up, U_ILLEGAL_CHAR_FOUND on an unpaired surrogate */
   1.467 +{
   1.468 +    UConverter *cnv = args->converter;
   1.469 +    const UChar *mySource = args->source;
   1.470 +    int32_t *myOffsets = args->offsets;
   1.471 +    const UChar *sourceLimit = args->sourceLimit;
   1.472 +    uint8_t *myTarget = (uint8_t *) args->target;
   1.473 +    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
   1.474 +    uint8_t *tempPtr; /* where the 3/4-byte encoding is assembled: the target itself or tempBuf */
   1.475 +    UChar32 ch;
   1.476 +    int32_t offsetNum, nextSourceIndex; /* offsetNum: value written for the current code point; nextSourceIndex: value it takes afterwards */
   1.477 +    int32_t indexToWrite; /* index of the last byte of the encoding (2 for 3 bytes, 3 for 4 bytes) */
   1.478 +    uint8_t tempBuf[4]; /* staging area used when fewer than 4 target bytes remain */
   1.479 +    UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); /* surrogate pairing is done only when this is true */
   1.480 +
   1.481 +    if (cnv->fromUChar32 && myTarget < targetLimit) /* resume with the code unit saved when the previous call ran out of input */
   1.482 +    {
   1.483 +        ch = cnv->fromUChar32;
   1.484 +        cnv->fromUChar32 = 0;
   1.485 +        offsetNum = -1; /* the saved code unit is not part of this call's source, so its bytes get offset -1 */
   1.486 +        nextSourceIndex = 0;
   1.487 +        goto lowsurrogate; /* jumps into the surrogate branch of the main loop body */
   1.488 +    } else {
   1.489 +        offsetNum = 0;
   1.490 +    }
   1.491 +
   1.492 +    while (mySource < sourceLimit && myTarget < targetLimit)
   1.493 +    {
   1.494 +        ch = *(mySource++);
   1.495 +
   1.496 +        if (ch < 0x80)        /* Single byte */
   1.497 +        {
   1.498 +            *(myOffsets++) = offsetNum++;
   1.499 +            *(myTarget++) = (char) ch;
   1.500 +        }
   1.501 +        else if (ch < 0x800)  /* Double byte */
   1.502 +        {
   1.503 +            *(myOffsets++) = offsetNum;
   1.504 +            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
   1.505 +            if (myTarget < targetLimit)
   1.506 +            {
   1.507 +                *(myOffsets++) = offsetNum++;
   1.508 +                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
   1.509 +            }
   1.510 +            else
   1.511 +            {
   1.512 +                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); /* second byte does not fit: park it in the overflow buffer; offsetNum is not advanced on this path */
   1.513 +                cnv->charErrorBufferLength = 1;
   1.514 +                *err = U_BUFFER_OVERFLOW_ERROR;
   1.515 +            }
   1.516 +        }
   1.517 +        else
   1.518 +        /* Check for surrogates */
   1.519 +        {
   1.520 +            nextSourceIndex = offsetNum + 1;
   1.521 +
   1.522 +            if(U16_IS_SURROGATE(ch) && isNotCESU8) { /* for CESU-8 this branch is skipped and each surrogate is encoded on its own as 3 bytes below */
   1.523 +lowsurrogate:
   1.524 +                if (mySource < sourceLimit) {
   1.525 +                    /* test both code units */
   1.526 +                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
   1.527 +                        /* convert and consume this supplementary code point */
   1.528 +                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
   1.529 +                        ++mySource;
   1.530 +                        ++nextSourceIndex;
   1.531 +                        /* exit this condition tree */
   1.532 +                    }
   1.533 +                    else {
   1.534 +                        /* this is an unpaired trail or lead code unit */
   1.535 +                        /* callback(illegal) */
   1.536 +                        cnv->fromUChar32 = ch;
   1.537 +                        *err = U_ILLEGAL_CHAR_FOUND;
   1.538 +                        break;
   1.539 +                    }
   1.540 +                }
   1.541 +                else {
   1.542 +                    /* no more input: remember the code unit for the next call; no error is set */
   1.543 +                    cnv->fromUChar32 = ch;
   1.544 +                    break;
   1.545 +                }
   1.546 +            }
   1.547 +
   1.548 +            /* Do we write the buffer directly for speed,
   1.549 +            or do we have to be careful about target buffer space? */
   1.550 +            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
   1.551 +
   1.552 +            if (ch <= MAXIMUM_UCS2) { /* three-byte form */
   1.553 +                indexToWrite = 2;
   1.554 +                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
   1.555 +            }
   1.556 +            else { /* four-byte form */
   1.557 +                indexToWrite = 3;
   1.558 +                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
   1.559 +                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
   1.560 +            }
   1.561 +            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); /* the last two trail bytes are common to both forms */
   1.562 +            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
   1.563 +
   1.564 +            if (tempPtr == myTarget) {
   1.565 +                /* There was enough space to write the codepoint directly. */
   1.566 +                myTarget += (indexToWrite + 1);
   1.567 +                myOffsets[0] = offsetNum; /* every byte of the encoding gets the same offset */
   1.568 +                myOffsets[1] = offsetNum;
   1.569 +                myOffsets[2] = offsetNum;
   1.570 +                if (indexToWrite >= 3) { /* fourth entry only for the four-byte form */
   1.571 +                    myOffsets[3] = offsetNum;
   1.572 +                }
   1.573 +                myOffsets += (indexToWrite + 1);
   1.574 +            }
   1.575 +            else {
   1.576 +                /* We might run out of room soon. Write it slowly. */
   1.577 +                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
   1.578 +                    if (myTarget < targetLimit)
   1.579 +                    {
   1.580 +                        *(myOffsets++) = offsetNum;
   1.581 +                        *(myTarget++) = *tempPtr;
   1.582 +                    }
   1.583 +                    else
   1.584 +                    {
   1.585 +                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; /* bytes that do not fit are appended to the overflow buffer; no offset is written for them */
   1.586 +                        *err = U_BUFFER_OVERFLOW_ERROR;
   1.587 +                    }
   1.588 +                }
   1.589 +            }
   1.590 +            offsetNum = nextSourceIndex; /* step past the one or two source code units just encoded */
   1.591 +        }
   1.592 +    }
   1.593 +
   1.594 +    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) /* input remains but the target is full */
   1.595 +    {
   1.596 +        *err = U_BUFFER_OVERFLOW_ERROR;
   1.597 +    }
   1.598 +
   1.599 +    args->target = (char *) myTarget; /* report how far output, input and offsets advanced */
   1.600 +    args->source = mySource;
   1.601 +    args->offsets = myOffsets;
   1.602 +}
   1.603 +
   1.604 +static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, /* Decodes and returns one code point from args->source, advancing args->source */
   1.605 +                                               UErrorCode *err) { /* on failure returns 0xffff with *err set and, except when there is no input, the offending bytes in cnv->toUBytes */
   1.606 +    UConverter *cnv;
   1.607 +    const uint8_t *sourceInitial; /* start of the sequence, used to copy the bytes on error */
   1.608 +    const uint8_t *source;
   1.609 +    uint16_t extraBytesToWrite; /* despite the name, holds the TOTAL sequence length from bytesFromUTF8 */
   1.610 +    uint8_t myByte;
   1.611 +    uint32_t ch; /* unsigned accumulator: 5/6-byte sequences shift past bit 31, which would be undefined behavior on a signed UChar32 */
   1.612 +    int8_t i, isLegalSequence;
   1.613 +
   1.614 +    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
   1.615 +
   1.616 +    cnv = args->converter;
   1.617 +    sourceInitial = source = (const uint8_t *)args->source;
   1.618 +    if (source >= (const uint8_t *)args->sourceLimit)
   1.619 +    {
   1.620 +        /* no input */
   1.621 +        *err = U_INDEX_OUTOFBOUNDS_ERROR;
   1.622 +        return 0xffff;
   1.623 +    }
   1.624 +
   1.625 +    myByte = (uint8_t)*(source++);
   1.626 +    if (myByte < 0x80) /* ASCII fast path */
   1.627 +    {
   1.628 +        args->source = (const char *)source;
   1.629 +        return (UChar32)myByte;
   1.630 +    }
   1.631 +
   1.632 +    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
   1.633 +    if (extraBytesToWrite == 0) { /* a trail byte or 0xFE/0xFF cannot start a sequence */
   1.634 +        cnv->toUBytes[0] = myByte;
   1.635 +        cnv->toULength = 1;
   1.636 +        *err = U_ILLEGAL_CHAR_FOUND;
   1.637 +        args->source = (const char *)source;
   1.638 +        return 0xffff;
   1.639 +    }
   1.640 +
   1.641 +    /*The byte sequence is longer than the buffer area passed*/
   1.642 +    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) /* source already points past the lead byte, hence the -1 */
   1.643 +    {
   1.644 +        /* check if all of the remaining bytes are trail bytes */
   1.645 +        cnv->toUBytes[0] = myByte;
   1.646 +        i = 1;
   1.647 +        *err = U_TRUNCATED_CHAR_FOUND;
   1.648 +        while(source < (const uint8_t *)args->sourceLimit) {
   1.649 +            if(U8_IS_TRAIL(myByte = *source)) {
   1.650 +                cnv->toUBytes[i++] = myByte;
   1.651 +                ++source;
   1.652 +            } else {
   1.653 +                /* error even before we run out of input */
   1.654 +                *err = U_ILLEGAL_CHAR_FOUND;
   1.655 +                break;
   1.656 +            }
   1.657 +        }
   1.658 +        cnv->toULength = i;
   1.659 +        args->source = (const char *)source;
   1.660 +        return 0xffff;
   1.661 +    }
   1.662 +
   1.663 +    isLegalSequence = 1;
   1.664 +    ch = myByte << 6; /* each case adds one trail byte; all but the last shift again; marker bits are removed after the switch */
   1.665 +    switch(extraBytesToWrite)
   1.666 +    {     
   1.667 +      /* note: the cases intentionally fall through, one trail byte per case */ 
   1.668 +    case 6:
   1.669 +        ch += (myByte = *source);
   1.670 +        ch <<= 6;
   1.671 +        if (!U8_IS_TRAIL(myByte))
   1.672 +        {
   1.673 +            isLegalSequence = 0;
   1.674 +            break;
   1.675 +        }
   1.676 +        ++source;
   1.677 +    case 5: /*fall through*/
   1.678 +        ch += (myByte = *source);
   1.679 +        ch <<= 6;
   1.680 +        if (!U8_IS_TRAIL(myByte))
   1.681 +        {
   1.682 +            isLegalSequence = 0;
   1.683 +            break;
   1.684 +        }
   1.685 +        ++source;
   1.686 +    case 4: /*fall through*/
   1.687 +        ch += (myByte = *source);
   1.688 +        ch <<= 6;
   1.689 +        if (!U8_IS_TRAIL(myByte))
   1.690 +        {
   1.691 +            isLegalSequence = 0;
   1.692 +            break;
   1.693 +        }
   1.694 +        ++source;
   1.695 +    case 3: /*fall through*/
   1.696 +        ch += (myByte = *source);
   1.697 +        ch <<= 6;
   1.698 +        if (!U8_IS_TRAIL(myByte))
   1.699 +        {
   1.700 +            isLegalSequence = 0;
   1.701 +            break;
   1.702 +        }
   1.703 +        ++source;
   1.704 +    case 2: /*fall through*/
   1.705 +        ch += (myByte = *source);
   1.706 +        if (!U8_IS_TRAIL(myByte))
   1.707 +        {
   1.708 +            isLegalSequence = 0;
   1.709 +            break;
   1.710 +        }
   1.711 +        ++source;
   1.712 +    };
   1.713 +    ch -= offsetsFromUTF8[extraBytesToWrite]; /* strip the accumulated lead/trail marker bits */
   1.714 +    args->source = (const char *)source; /* a non-trail byte that ended the sequence early is not consumed */
   1.715 +
   1.716 +    /*
   1.717 +     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   1.718 +     * - use only trail bytes after a lead byte (checked above)
   1.719 +     * - use the right number of trail bytes for a given lead byte
   1.720 +     * - encode a code point <= U+10ffff
   1.721 +     * - use the fewest possible number of bytes for their code points
   1.722 +     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   1.723 +     *
   1.724 +     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   1.725 +     * There are no irregular sequences any more.
   1.726 +     */
   1.727 +    if (isLegalSequence &&
   1.728 +        (uint32_t)ch <= MAXIMUM_UTF &&
   1.729 +        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
   1.730 +        !U_IS_SURROGATE(ch)
   1.731 +    ) {
   1.732 +        return (UChar32)ch; /* return the code point; it is <= 0x10ffff here, so the cast is lossless */
   1.733 +    }
   1.734 +
   1.735 +    for(i = 0; sourceInitial < source; ++i) { /* hand the consumed bytes of the bad sequence to the caller via toUBytes */
   1.736 +        cnv->toUBytes[i] = *sourceInitial++;
   1.737 +    }
   1.738 +    cnv->toULength = i;
   1.739 +    *err = U_ILLEGAL_CHAR_FOUND;
   1.740 +    return 0xffff;
   1.741 +} 
   1.742 +
   1.743 +/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
   1.744 +
   1.745 +/* minimum code point values for n-byte UTF-8 sequences, n=0..4; indexed by sequence length, a decoded value below the entry means the sequence is longer than necessary */
   1.746 +static const UChar32
   1.747 +utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
   1.748 +
   1.749 +/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail...; subtracting utf8_offsets[n] from such a sum leaves the code point */
   1.750 +static const UChar32
   1.751 +utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; /* only n=0..4 are listed; entries 5 and 6 are implicitly 0 */
   1.752 +
   1.753 +/* "Convert" UTF-8 to UTF-8: validate the source bytes and copy each well-formed sequence to the target unchanged. Modified from ucnv_DBCSFromUTF8(). */
   1.754 +static void
   1.755 +ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, /* provides target and targetLimit; the advanced target is written back */
   1.756 +                  UConverterToUnicodeArgs *pToUArgs,       /* provides converter, source and sourceLimit; the advanced source is written back */
   1.757 +                  UErrorCode *pErrorCode) {                /* receives U_ILLEGAL_CHAR_FOUND, U_BUFFER_OVERFLOW_ERROR, or U_USING_DEFAULT_WARNING when the standard converter should take over */
   1.758 +    UConverter *utf8; /* pToUArgs->converter; carries the partial-character state between calls */
   1.759 +    const uint8_t *source, *sourceLimit;
   1.760 +    uint8_t *target;
   1.761 +    int32_t targetCapacity;
   1.762 +    int32_t count; /* bytes left for the conversion loop; bounded by both the source length and the target capacity */
   1.763 +
   1.764 +    int8_t oldToULength, toULength, toULimit; /* bytes carried over from a previous buffer; bytes of the current sequence consumed so far; total length the lead byte calls for */
   1.765 +
   1.766 +    UChar32 c; /* accumulates lead and trail bytes as (c<<6)+b; nonzero on entry means a partial character is pending */
   1.767 +    uint8_t b, t1, t2; /* current byte; first and second trail bytes in the inline fast paths */
   1.768 +
   1.769 +    /* set up the local pointers */
   1.770 +    utf8=pToUArgs->converter;
   1.771 +    source=(uint8_t *)pToUArgs->source;
   1.772 +    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   1.773 +    target=(uint8_t *)pFromUArgs->target;
   1.774 +    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   1.775 +
   1.776 +    /* get the converter state from the UTF-8 UConverter */
   1.777 +    c=(UChar32)utf8->toUnicodeStatus;
   1.778 +    if(c!=0) { /* a partial character was stored by an earlier call */
   1.779 +        toULength=oldToULength=utf8->toULength;
   1.780 +        toULimit=(int8_t)utf8->mode; /* mode holds the pending sequence's total length; it is set where the partial character is stored below */
   1.781 +    } else {
   1.782 +        toULength=oldToULength=toULimit=0;
   1.783 +    }
   1.784 +
   1.785 +    count=(int32_t)(sourceLimit-source)+oldToULength; /* include the bytes already consumed for the pending character */
   1.786 +    if(count<toULimit) {
   1.787 +        /*
   1.788 +         * Not enough input to complete the partial character.
   1.789 +         * Jump to moreBytes below - it will not output to target.
   1.790 +         */
   1.791 +    } else if(targetCapacity<toULimit) {
   1.792 +        /*
   1.793 +         * Not enough target capacity to output the partial character.
   1.794 +         * Let the standard converter handle this.
   1.795 +         */
   1.796 +        *pErrorCode=U_USING_DEFAULT_WARNING;
   1.797 +        return;
   1.798 +    } else {
   1.799 +        /*
   1.800 +         * Use a single counter for source and target, counting the minimum of
   1.801 +         * the source length and the target capacity.
   1.802 +         * As a result, the source length is checked only once per multi-byte
   1.803 +         * character instead of twice.
   1.804 +         *
   1.805 +         * Make sure that the last byte sequence is complete, or else
   1.806 +         * stop just before it.
   1.807 +         * (The longest legal byte sequence has 3 trail bytes.)
   1.808 +         * Count oldToULength (number of source bytes from a previous buffer)
   1.809 +         * into the source length but reduce the source index by toULimit
   1.810 +         * while going back over trail bytes in order to not go back into
   1.811 +         * the bytes that will be read for finishing a partial
   1.812 +         * sequence from the previous buffer.
   1.813 +         * Let the standard converter handle edge cases.
   1.814 +         */
   1.815 +        int32_t i; /* number of trail bytes seen while scanning backwards */
   1.816 +
   1.817 +        if(count>targetCapacity) {
   1.818 +            count=targetCapacity;
   1.819 +        }
   1.820 +
   1.821 +        i=0;
   1.822 +        while(i<3 && i<(count-toULimit)) {
   1.823 +            b=source[count-oldToULength-i-1]; /* walk backwards from the last byte the loop would process */
   1.824 +            if(U8_IS_TRAIL(b)) {
   1.825 +                ++i;
   1.826 +            } else {
   1.827 +                if(i<U8_COUNT_TRAIL_BYTES(b)) {
   1.828 +                    /* stop converting before the lead byte if there are not enough trail bytes for it */
   1.829 +                    count-=i+1;
   1.830 +                }
   1.831 +                break;
   1.832 +            }
   1.833 +        }
   1.834 +    }
   1.835 +
   1.836 +    if(c!=0) { /* resume the pending partial character; the stored state is cleared first */
   1.837 +        utf8->toUnicodeStatus=0;
   1.838 +        utf8->toULength=0;
   1.839 +        goto moreBytes;
   1.840 +        /* See note in ucnv_SBCSFromUTF8() about this goto. */
   1.841 +    }
   1.842 +
   1.843 +    /* conversion loop: handles one UTF-8 sequence per iteration */
   1.844 +    while(count>0) {
   1.845 +        b=*source++;
   1.846 +        if((int8_t)b>=0) { /* b<=0x7f */
   1.847 +            /* convert ASCII */
   1.848 +            *target++=b;
   1.849 +            --count;
   1.850 +            continue;
   1.851 +        } else {
   1.852 +            if(b>0xe0) { /* only lead bytes E1..ED can pass the inline test; anything else falls to the general code below */
   1.853 +                if( /* handle U+1000..U+D7FF inline */
   1.854 +                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
   1.855 +                                               (b==0xed && (t1 <= 0x9f))) && /* ED: second byte capped at 9F, keeping the result at or below U+D7FF */
   1.856 +                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
   1.857 +                ) {
   1.858 +                    source+=2;
   1.859 +                    *target++=b;
   1.860 +                    *target++=t1;
   1.861 +                    *target++=t2;
   1.862 +                    count-=3;
   1.863 +                    continue;
   1.864 +                }
   1.865 +            } else if(b<0xe0) { /* b is 80..DF here */
   1.866 +                if( /* handle U+0080..U+07FF inline */
   1.867 +                    b>=0xc2 && /* C0 and C1 would encode values below U+0080 in two bytes */
   1.868 +                    (t1=*source) >= 0x80 && t1 <= 0xbf
   1.869 +                ) {
   1.870 +                    ++source;
   1.871 +                    *target++=b;
   1.872 +                    *target++=t1;
   1.873 +                    count-=2;
   1.874 +                    continue;
   1.875 +                }
   1.876 +            } else if(b==0xe0) {
   1.877 +                if( /* handle U+0800..U+0FFF inline */
   1.878 +                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf && /* below A0 the value would be under U+0800 */
   1.879 +                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
   1.880 +                ) {
   1.881 +                    source+=2;
   1.882 +                    *target++=b;
   1.883 +                    *target++=t1;
   1.884 +                    *target++=t2;
   1.885 +                    count-=3;
   1.886 +                    continue;
   1.887 +                }
   1.888 +            }
   1.889 +
   1.890 +            /* handle "complicated" and error cases, and continuing partial characters */
   1.891 +            oldToULength=0; /* every byte of this sequence comes from the current buffer */
   1.892 +            toULength=1;
   1.893 +            toULimit=U8_COUNT_TRAIL_BYTES(b)+1; /* total sequence length implied by the lead byte */
   1.894 +            c=b;
   1.895 +moreBytes: /* also entered by goto when resuming a character left over from a previous call */
   1.896 +            while(toULength<toULimit) {
   1.897 +                if(source<sourceLimit) {
   1.898 +                    b=*source;
   1.899 +                    if(U8_IS_TRAIL(b)) {
   1.900 +                        ++source;
   1.901 +                        ++toULength;
   1.902 +                        c=(c<<6)+b;
   1.903 +                    } else {
   1.904 +                        break; /* sequence too short, stop with toULength<toULimit */
   1.905 +                    }
   1.906 +                } else {
   1.907 +                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
   1.908 +                    source-=(toULength-oldToULength); /* rewind to the first byte of this sequence that lies in the current buffer */
   1.909 +                    while(oldToULength<toULength) {
   1.910 +                        utf8->toUBytes[oldToULength++]=*source++;
   1.911 +                    }
   1.912 +                    utf8->toUnicodeStatus=c;
   1.913 +                    utf8->toULength=toULength;
   1.914 +                    utf8->mode=toULimit;
   1.915 +                    pToUArgs->source=(char *)source;
   1.916 +                    pFromUArgs->target=(char *)target;
   1.917 +                    return;
   1.918 +                }
   1.919 +            }
   1.920 +
   1.921 +            if( toULength==toULimit &&      /* consumed all trail bytes */
   1.922 +                (toULength==3 || toULength==2) &&             /* BMP */
   1.923 +                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && /* strip the accumulated lead/trail marker bits, then reject non-shortest forms */
   1.924 +                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   1.925 +            ) {
   1.926 +                /* legal byte sequence for BMP code point */
   1.927 +            } else if(
   1.928 +                toULength==toULimit && toULength==4 &&
   1.929 +                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) /* U+10000..U+10FFFF */
   1.930 +            ) {
   1.931 +                /* legal byte sequence for supplementary code point */
   1.932 +            } else {
   1.933 +                /* error handling: illegal UTF-8 byte sequence */
   1.934 +                source-=(toULength-oldToULength); /* rewind, then save the offending bytes in toUBytes */
   1.935 +                while(oldToULength<toULength) {
   1.936 +                    utf8->toUBytes[oldToULength++]=*source++;
   1.937 +                }
   1.938 +                utf8->toULength=toULength;
   1.939 +                pToUArgs->source=(char *)source;
   1.940 +                pFromUArgs->target=(char *)target;
   1.941 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.942 +                return;
   1.943 +            }
   1.944 +
   1.945 +            /* copy the legal byte sequence to the target */
   1.946 +            {
   1.947 +                int8_t i;
   1.948 +
   1.949 +                for(i=0; i<oldToULength; ++i) { /* bytes saved from a previous buffer come first */
   1.950 +                    *target++=utf8->toUBytes[i];
   1.951 +                }
   1.952 +                source-=(toULength-oldToULength);
   1.953 +                for(; i<toULength; ++i) { /* the remaining bytes come from the current buffer */
   1.954 +                    *target++=*source++;
   1.955 +                }
   1.956 +                count-=toULength;
   1.957 +            }
   1.958 +        }
   1.959 +    }
   1.960 +
   1.961 +    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { /* the loop stopped with input left over */
   1.962 +        if(target==(const uint8_t *)pFromUArgs->targetLimit) { /* the target is full */
   1.963 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.964 +        } else {
   1.965 +            b=*source;
   1.966 +            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   1.967 +            if(toULimit>(sourceLimit-source)) {
   1.968 +                /* collect a truncated byte sequence */
   1.969 +                toULength=0;
   1.970 +                c=b;
   1.971 +                for(;;) {
   1.972 +                    utf8->toUBytes[toULength++]=b;
   1.973 +                    if(++source==sourceLimit) {
   1.974 +                        /* partial byte sequence at end of source */
   1.975 +                        utf8->toUnicodeStatus=c;
   1.976 +                        utf8->toULength=toULength;
   1.977 +                        utf8->mode=toULimit;
   1.978 +                        break;
   1.979 +                    } else if(!U8_IS_TRAIL(b=*source)) {
   1.980 +                        /* lead byte in trail byte position */
   1.981 +                        utf8->toULength=toULength;
   1.982 +                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.983 +                        break;
   1.984 +                    }
   1.985 +                    c=(c<<6)+b;
   1.986 +                }
   1.987 +            } else {
   1.988 +                /* partial-sequence target overflow: fall back to the pivoting implementation */
   1.989 +                *pErrorCode=U_USING_DEFAULT_WARNING;
   1.990 +            }
   1.991 +        }
   1.992 +    }
   1.993 +
   1.994 +    /* write back the updated pointers */
   1.995 +    pToUArgs->source=(char *)source;
   1.996 +    pFromUArgs->target=(char *)target;
   1.997 +}
   1.998 +
   1.999 +/* UTF-8 converter data ----------------------------------------------------- */
  1.1000 +
  1.1001 +static const UConverterImpl _UTF8Impl={ /* positional initializer; slot meanings come from the UConverterImpl declaration, which is not in this file (presumably ucnv_cnv.h - confirm) */
  1.1002 +    UCNV_UTF8,
  1.1003 +
  1.1004 +    NULL,
  1.1005 +    NULL,
  1.1006 +
  1.1007 +    NULL,
  1.1008 +    NULL,
  1.1009 +    NULL,
  1.1010 +
  1.1011 +    ucnv_toUnicode_UTF8, /* first of five consecutive conversion function pointers */
  1.1012 +    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
  1.1013 +    ucnv_fromUnicode_UTF8,
  1.1014 +    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  1.1015 +    ucnv_getNextUChar_UTF8, /* _CESU8Impl leaves this slot NULL */
  1.1016 +
  1.1017 +    NULL,
  1.1018 +    NULL,
  1.1019 +    NULL,
  1.1020 +    NULL,
  1.1021 +    ucnv_getNonSurrogateUnicodeSet,
  1.1022 +
  1.1023 +    ucnv_UTF8FromUTF8, /* the UTF-8 validate-and-copy routine fills both of the last two slots */
  1.1024 +    ucnv_UTF8FromUTF8
  1.1025 +};
  1.1026 +
  1.1027 +/* CCSID 1208 refers to UTF-8 for any version of Unicode */
  1.1028 +static const UConverterStaticData _UTF8StaticData={ /* positional initializer; field order comes from the UConverterStaticData declaration, not shown here */
  1.1029 +    sizeof(UConverterStaticData),
  1.1030 +    "UTF-8",
  1.1031 +    1208, UCNV_IBM, UCNV_UTF8,
  1.1032 +    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
  1.1033 +    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; presumably the substitution sequence followed by its length - confirm against UConverterStaticData */
  1.1034 +    0,
  1.1035 +    0,
  1.1036 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1037 +};
  1.1038 +
  1.1039 +
  1.1040 +const UConverterSharedData _UTF8Data={ /* not static, so this object has external linkage */
  1.1041 +    sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (its meaning is defined by UConverterSharedData, not shown here) */
  1.1042 +    NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, /* links the UTF-8 static data and implementation tables defined in this file */
  1.1043 +    0
  1.1044 +};
  1.1045 +
  1.1046 +/* CESU-8 converter data ---------------------------------------------------- */
  1.1047 +
  1.1048 +static const UConverterImpl _CESU8Impl={ /* positional initializer; reuses the UTF-8 conversion functions, in the same slot positions as _UTF8Impl */
  1.1049 +    UCNV_CESU8,
  1.1050 +
  1.1051 +    NULL,
  1.1052 +    NULL,
  1.1053 +
  1.1054 +    NULL,
  1.1055 +    NULL,
  1.1056 +    NULL,
  1.1057 +
  1.1058 +    ucnv_toUnicode_UTF8,
  1.1059 +    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
  1.1060 +    ucnv_fromUnicode_UTF8,
  1.1061 +    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  1.1062 +    NULL, /* _UTF8Impl has ucnv_getNextUChar_UTF8 in this position */
  1.1063 +
  1.1064 +    NULL,
  1.1065 +    NULL,
  1.1066 +    NULL,
  1.1067 +    NULL,
  1.1068 +    ucnv_getCompleteUnicodeSet /* _UTF8Impl uses ucnv_getNonSurrogateUnicodeSet here and lists two more entries; any members past this point are implicitly zero */
  1.1069 +};
  1.1070 +
  1.1071 +static const UConverterStaticData _CESU8StaticData={ /* positional initializer; field order comes from the UConverterStaticData declaration, not shown here */
  1.1072 +    sizeof(UConverterStaticData),
  1.1073 +    "CESU-8",
  1.1074 +    9400, /* CCSID for CESU-8 */
  1.1075 +    UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* the 1 and 3 match the values _UTF8StaticData has in the same positions */
  1.1076 +    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; presumably the substitution sequence followed by its length - confirm against UConverterStaticData */
  1.1077 +    0,
  1.1078 +    0,
  1.1079 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1080 +};
  1.1081 +
  1.1082 +
  1.1083 +const UConverterSharedData _CESU8Data={ /* not static, so this object has external linkage */
  1.1084 +    sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (its meaning is defined by UConverterSharedData, not shown here) */
  1.1085 +    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, /* links the CESU-8 static data and implementation tables defined in this file */
  1.1086 +    0
  1.1087 +};
  1.1088 +
  1.1089 +#endif
The Tor Browser / file diff

diff: intl/icu/source/common/ucnv_u8.c

intl/icu/source/common/ucnv_u8.c