The Tor Browser: intl/icu/source/common/ucnv

Cloned from the upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 2002-2012, International Business Machines

     4 *   Corporation and others.  All Rights Reserved.

     5 **********************************************************************

     6 *   file name:  ucnv_u8.c

     7 *   encoding:   US-ASCII

     8 *   tab size:   8 (not used)

     9 *   indentation:4

    10 *

    11 *   created on: 2002jul01

    12 *   created by: Markus W. Scherer

    13 *

    14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.

    15 *

    16 *   Also, CESU-8 implementation, see UTR 26.

    17 *   The CESU-8 converter uses all the same functions as the

    18 *   UTF-8 converter, with a branch for converting supplementary code points.

    19 */

    21 #include "unicode/utypes.h"

    23 #if !UCONFIG_NO_CONVERSION

    25 #include "unicode/ucnv.h"

    26 #include "unicode/utf.h"

    27 #include "unicode/utf8.h"

    28 #include "unicode/utf16.h"

    29 #include "ucnv_bld.h"

    30 #include "ucnv_cnv.h"

    31 #include "cmemory.h"

    33 /* Prototypes --------------------------------------------------------------- */

    35 /* Keep these here to make finicky compilers happy */

    /*
     * Forward declarations of the two UTF-16 -> UTF-8/CESU-8 entry points that
     * are defined with U_CFUNC linkage further down in this file: the plain
     * variant and the variant that also fills args->offsets.
     */
    37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,

    38                                            UErrorCode *err);

    39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,

    40                                                         UErrorCode *err);

    43 /* UTF-8 -------------------------------------------------------------------- */

    45 /* UTF-8 Conversion DATA

    46  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9

    47  */

    48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/

    /* Largest code point that fits in a single 16-bit code unit. */
    49 #define MAXIMUM_UCS2            0x0000FFFF

    /* Largest code point accepted by the converters below (U+10FFFF). */
    50 #define MAXIMUM_UTF             0x0010FFFF

    /* Largest 31-bit value; not referenced by the functions visible in this part of the file. */
    51 #define MAXIMUM_UCS4            0x7FFFFFFF

    /* Number of payload bits carried by each surrogate code unit. */
    52 #define HALF_SHIFT              10

    /* First supplementary code point; subtracted before splitting into surrogates. */
    53 #define HALF_BASE               0x0010000

    /* Mask selecting the low HALF_SHIFT bits (the trail surrogate payload). */
    54 #define HALF_MASK               0x3FF

    /* Boundaries of the lead (high) and trail (low) surrogate ranges. */
    55 #define SURROGATE_HIGH_START    0xD800

    56 #define SURROGATE_HIGH_END      0xDBFF

    57 #define SURROGATE_LOW_START     0xDC00

    58 #define SURROGATE_LOW_END       0xDFFF

    60 /* -SURROGATE_LOW_START + HALF_BASE */

    /* 0x10000 - 0xDC00 = 0x2400 = 9216, written as a decimal literal. */
    61 #define SURROGATE_LOW_BASE      9216

    /*
     * Indexed by the total byte length of a UTF-8 sequence (1..6; index 0 is 0).
     * The decoders accumulate ch = (ch << 6) + nextByte over the raw bytes,
     * so the lead-byte marker bits and the 0x80 marker of every trail byte end
     * up inside ch. The entry for a given length is the sum of those markers
     * (for example 2 bytes: (0xC0 << 6) + 0x80 = 0x3080) and is subtracted
     * afterwards to leave just the code point. The 5- and 6-byte entries
     * follow the same pattern reduced modulo 2^32.
     */
    63 static const uint32_t offsetsFromUTF8[7] = {0,

    64   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,

    65   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080

    66 };

    68 /* END OF UTF-8 Conversion DATA */

    /*
     * Indexed by the first byte of a sequence; yields the total sequence length
     * in bytes (lead byte included), or 0 for a byte that cannot start a sequence.
     * Each row below covers 32 byte values:
     *   0x00..0x7F -> 1
     *   0x80..0xBF -> 0 (trail bytes)
     *   0xC0..0xDF -> 2
     *   0xE0..0xEF -> 3, 0xF0..0xF7 -> 4, 0xF8..0xFB -> 5, 0xFC..0xFD -> 6,
     *   0xFE..0xFF -> 0
     * Lengths 5 and 6 are kept in the table; the callers reject them later via
     * the utf8_minChar32[] comparison.
     */
    70 static const int8_t bytesFromUTF8[256] = {

    71   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    72   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    73   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    74   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    75   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    76   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    77   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    78   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0

    79 };

    81 /*

    82  * Starting with Unicode 3.0.1:

    83  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];

    84  * byte sequences with more than 4 bytes are illegal in UTF-8,

    85  * which is tested with impossible values for them

    86  */

    /*
     * Indexed by sequence length (0..6): the smallest code point that may be
     * encoded with that many bytes. The entries for 5 and 6 bytes are 0xffffffff
     * so that no decoded value <= MAXIMUM_UTF can satisfy "ch >= utf8_minChar32[n]".
     */
    87 static const uint32_t

    88 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };

    /*
     * Decode UTF-8 (or CESU-8) bytes from args->source into UTF-16 code units
     * at args->target, without writing offsets.
     *
     * args: supplies converter, source/sourceLimit and target/targetLimit;
     *       args->source and args->target are advanced on return.
     * err:  set to U_ILLEGAL_CHAR_FOUND for a malformed sequence and to
     *       U_BUFFER_OVERFLOW_ERROR when the target fills up while input remains.
     *
     * State kept in the converter between calls:
     *   cnv->toUnicodeStatus - partially accumulated value of an incomplete
     *                          sequence (non-zero means "resume")
     *   cnv->mode            - total byte length of that sequence
     *   cnv->toULength       - number of its bytes consumed so far; on an
     *                          illegal sequence, the number of offending bytes
     *   cnv->toUBytes        - raw bytes of the current sequence
     *   cnv->UCharErrorBuffer / UCharErrorBufferLength - a trail surrogate that
     *                          did not fit into the target
     */
    90 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,

    91                                   UErrorCode * err)

    92 {

    93     UConverter *cnv = args->converter;

    94     const unsigned char *mySource = (unsigned char *) args->source;

    95     UChar *myTarget = args->target;

    96     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

    97     const UChar *targetLimit = args->targetLimit;

    98     unsigned char *toUBytes = cnv->toUBytes;

           /* CESU-8 mode is detected by comparing the shared data pointer. */
    99     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);

   100     uint32_t ch, ch2 = 0;

   101     int32_t i, inBytes;

   103     /* Restore size of current sequence */

           /* Only resume when there is room for output; otherwise the saved state is left untouched. */
   104     if (cnv->toUnicodeStatus && myTarget < targetLimit)

   105     {

   106         inBytes = cnv->mode;            /* restore # of bytes to consume */

   107         i = cnv->toULength;             /* restore # of bytes consumed */

   108         cnv->toULength = 0;

   110         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/

   111         cnv->toUnicodeStatus = 0;

               /* Jumps into the middle of the loop body below to continue reading trail bytes. */
   112         goto morebytes;

   113     }

   116     while (mySource < sourceLimit && myTarget < targetLimit)

   117     {

   118         ch = *(mySource++);

   119         if (ch < 0x80)        /* Simple case */

   120         {

   121             *(myTarget++) = (UChar) ch;

   122         }

   123         else

   124         {

   125             /* store the first char */

   126             toUBytes[0] = (char)ch;

   127             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */

   128             i = 1;

   130 morebytes:

                   /* For an invalid lead byte inBytes is 0, so this loop is skipped and the i == inBytes test below fails. */
   131             while (i < inBytes)

   132             {

   133                 if (mySource < sourceLimit)

   134                 {

                           /* The byte is recorded before it is validated; a non-trail byte is neither consumed nor counted in i. */
   135                     toUBytes[i] = (char) (ch2 = *mySource);

   136                     if (!U8_IS_TRAIL(ch2))

   137                     {

   138                         break; /* i < inBytes */

   139                     }

   140                     ch = (ch << 6) + ch2;

   141                     ++mySource;

   142                     i++;

   143                 }

   144                 else

   145                 {

   146                     /* stores a partially calculated target*/

   147                     cnv->toUnicodeStatus = ch;

   148                     cnv->mode = inBytes;

   149                     cnv->toULength = (int8_t) i;

                           /* Input exhausted mid-sequence: leave without setting an error. */
   150                     goto donefornow;

   151                 }

   152             }

   154             /* Remove the accumulated high bits */

   155             ch -= offsetsFromUTF8[inBytes];

   157             /*

   158              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

   159              * - use only trail bytes after a lead byte (checked above)

   160              * - use the right number of trail bytes for a given lead byte

   161              * - encode a code point <= U+10ffff

   162              * - use the fewest possible number of bytes for their code points

   163              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

   164              *

   165              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.

   166              * There are no irregular sequences any more.

   167              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.

   168              */

                   /* CESU-8 accepts any sequence of at most 3 bytes (surrogates included); UTF-8 rejects surrogates. */
   169             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&

   170                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))

   171             {

   172                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

   173                 if (ch <= MAXIMUM_UCS2)

   174                 {

   175                     /* fits in 16 bits */

   176                     *(myTarget++) = (UChar) ch;

   177                 }

   178                 else

   179                 {

   180                     /* write out the surrogates */

   181                     ch -= HALF_BASE;

   182                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);

   183                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;

   184                     if (myTarget < targetLimit)

   185                     {

   186                         *(myTarget++) = (UChar)ch;

   187                     }

   188                     else

   189                     {

   190                         /* Put in overflow buffer (not handled here) */

   191                         cnv->UCharErrorBuffer[0] = (UChar) ch;

   192                         cnv->UCharErrorBufferLength = 1;

   193                         *err = U_BUFFER_OVERFLOW_ERROR;

   194                         break;

   195                     }

   196                 }

   197             }

   198             else

   199             {

                       /* Report how many bytes of toUBytes belong to the malformed sequence. */
   200                 cnv->toULength = (int8_t)i;

   201                 *err = U_ILLEGAL_CHAR_FOUND;

   202                 break;

   203             }

   204         }

   205     }

   207 donefornow:

           /* Input left over with a full target and no earlier error is reported as an overflow. */
   208     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   209     {

   210         /* End of target buffer */

   211         *err = U_BUFFER_OVERFLOW_ERROR;

   212     }

   214     args->target = myTarget;

   215     args->source = (const char *) mySource;

   216 }

   /*
    * Same decoding as ucnv_toUnicode_UTF8, but additionally writes one entry
    * to args->offsets for every UChar written to the target. The entry is
    * offsetNum, the byte index (counted from 0 within this call) at which the
    * source sequence producing that UChar started.
    *
    * args: converter, source/sourceLimit, target/targetLimit and offsets;
    *       source, target and offsets are advanced on return.
    * err:  U_ILLEGAL_CHAR_FOUND for a malformed sequence,
    *       U_BUFFER_OVERFLOW_ERROR when the target is full and input remains.
    *
    * Partial sequences are saved in and resumed from cnv->toUnicodeStatus,
    * cnv->mode and cnv->toULength exactly as in the non-offsets variant.
    */
   218 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,

   219                                                 UErrorCode * err)

   220 {

   221     UConverter *cnv = args->converter;

   222     const unsigned char *mySource = (unsigned char *) args->source;

   223     UChar *myTarget = args->target;

   224     int32_t *myOffsets = args->offsets;

   225     int32_t offsetNum = 0;

   226     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

   227     const UChar *targetLimit = args->targetLimit;

   228     unsigned char *toUBytes = cnv->toUBytes;

   229     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);

   230     uint32_t ch, ch2 = 0;

   231     int32_t i, inBytes;

   233     /* Restore size of current sequence */

   234     if (cnv->toUnicodeStatus && myTarget < targetLimit)

   235     {

   236         inBytes = cnv->mode;            /* restore # of bytes to consume */

   237         i = cnv->toULength;             /* restore # of bytes consumed */

   238         cnv->toULength = 0;

   240         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/

   241         cnv->toUnicodeStatus = 0;

               /*
                * NOTE(review): on this path i already counts bytes consumed by the
                * previous call, and "offsetNum += i" below adds all of them, so later
                * offsets appear shifted by that amount - confirm whether the caller
                * compensates for this.
                */
   242         goto morebytes;

   243     }

   245     while (mySource < sourceLimit && myTarget < targetLimit)

   246     {

   247         ch = *(mySource++);

   248         if (ch < 0x80)        /* Simple case */

   249         {

   250             *(myTarget++) = (UChar) ch;

   251             *(myOffsets++) = offsetNum++;

   252         }

   253         else

   254         {

                   /* Remember the lead byte and look up the total sequence length (0 for an invalid lead byte). */
   255             toUBytes[0] = (char)ch;

   256             inBytes = bytesFromUTF8[ch];

   257             i = 1;

   259 morebytes:

   260             while (i < inBytes)

   261             {

   262                 if (mySource < sourceLimit)

   263                 {

                           /* Recorded before validation; a non-trail byte is left unconsumed for the next sequence. */
   264                     toUBytes[i] = (char) (ch2 = *mySource);

   265                     if (!U8_IS_TRAIL(ch2))

   266                     {

   267                         break; /* i < inBytes */

   268                     }

   269                     ch = (ch << 6) + ch2;

   270                     ++mySource;

   271                     i++;

   272                 }

   273                 else

   274                 {

                           /* Out of input mid-sequence: save the partial state and return without an error. */
   275                     cnv->toUnicodeStatus = ch;

   276                     cnv->mode = inBytes;

   277                     cnv->toULength = (int8_t)i;

   278                     goto donefornow;

   279                 }

   280             }

   282             /* Remove the accumulated high bits */

   283             ch -= offsetsFromUTF8[inBytes];

   285             /*

   286              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

   287              * - use only trail bytes after a lead byte (checked above)

   288              * - use the right number of trail bytes for a given lead byte

   289              * - encode a code point <= U+10ffff

   290              * - use the fewest possible number of bytes for their code points

   291              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

   292              *

   293              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.

   294              * There are no irregular sequences any more.

   295              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.

   296              */

   297             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&

   298                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))

   299             {

   300                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

   301                 if (ch <= MAXIMUM_UCS2)

   302                 {

   303                     /* fits in 16 bits */

   304                     *(myTarget++) = (UChar) ch;

   305                     *(myOffsets++) = offsetNum;

   306                 }

   307                 else

   308                 {

   309                     /* write out the surrogates */

   310                     ch -= HALF_BASE;

   311                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);

   312                     *(myOffsets++) = offsetNum;

   313                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;

   314                     if (myTarget < targetLimit)

   315                     {

   316                         *(myTarget++) = (UChar)ch;

   317                         *(myOffsets++) = offsetNum;

   318                     }

   319                     else

   320                     {

                               /*
                                * Trail surrogate does not fit: park it in the overflow buffer.
                                * There is no break here (unlike ucnv_toUnicode_UTF8); the loop
                                * ends through its own condition because the target is full.
                                */
   321                         cnv->UCharErrorBuffer[0] = (UChar) ch;

   322                         cnv->UCharErrorBufferLength = 1;

   323                         *err = U_BUFFER_OVERFLOW_ERROR;

   324                     }

   325                 }

                       /* Advance the source index past the whole sequence just decoded. */
   326                 offsetNum += i;

   327             }

   328             else

   329             {

   330                 cnv->toULength = (int8_t)i;

   331                 *err = U_ILLEGAL_CHAR_FOUND;

   332                 break;

   333             }

   334         }

   335     }

   337 donefornow:

   338     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   339     {   /* End of target buffer */

   340         *err = U_BUFFER_OVERFLOW_ERROR;

   341     }

   343     args->target = myTarget;

   344     args->source = (const char *) mySource;

   345     args->offsets = myOffsets;

   346 }

   /*
    * Encode UTF-16 code units from args->source as UTF-8 (or CESU-8) bytes at
    * args->target, without writing offsets.
    *
    * args: converter, source/sourceLimit, target/targetLimit; source and target
    *       are advanced on return.
    * err:  U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (UTF-8 mode only),
    *       U_BUFFER_OVERFLOW_ERROR when output did not fit.
    *
    * Converter state used:
    *   cnv->fromUChar32     - a surrogate carried over from the previous call
    *                          (or the offending unit on an illegal-char error)
    *   cnv->charErrorBuffer / charErrorBufferLength - output bytes that did
    *                          not fit into the target
    * In CESU-8 mode (isNotCESU8 false) surrogates skip the pairing logic and
    * are each written as a 3-byte sequence.
    */
   348 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,

   349                                     UErrorCode * err)

   350 {

   351     UConverter *cnv = args->converter;

   352     const UChar *mySource = args->source;

   353     const UChar *sourceLimit = args->sourceLimit;

   354     uint8_t *myTarget = (uint8_t *) args->target;

   355     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;

   356     uint8_t *tempPtr;

   357     UChar32 ch;

   358     uint8_t tempBuf[4];

   359     int32_t indexToWrite;

   360     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);

           /* Resume with a surrogate saved by a previous call, provided there is room for output. */
   362     if (cnv->fromUChar32 && myTarget < targetLimit)

   363     {

   364         ch = cnv->fromUChar32;

   365         cnv->fromUChar32 = 0;

   366         goto lowsurrogate;

   367     }

   369     while (mySource < sourceLimit && myTarget < targetLimit)

   370     {

   371         ch = *(mySource++);

   373         if (ch < 0x80)        /* Single byte */

   374         {

   375             *(myTarget++) = (uint8_t) ch;

   376         }

   377         else if (ch < 0x800)  /* Double byte */

   378         {

   379             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);

   380             if (myTarget < targetLimit)

   381             {

   382                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);

   383             }

   384             else

   385             {

                       /* Second byte does not fit; the loop then ends via its condition since the target is full. */
   386                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);

   387                 cnv->charErrorBufferLength = 1;

   388                 *err = U_BUFFER_OVERFLOW_ERROR;

   389             }

   390         }

   391         else {

   392             /* Check for surrogates */

   393             if(U16_IS_SURROGATE(ch) && isNotCESU8) {

   394 lowsurrogate:

   395                 if (mySource < sourceLimit) {

   396                     /* test both code units */

   397                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {

   398                         /* convert and consume this supplementary code point */

   399                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);

   400                         ++mySource;

   401                         /* exit this condition tree */

   402                     }

   403                     else {

   404                         /* this is an unpaired trail or lead code unit */

   405                         /* callback(illegal) */

   406                         cnv->fromUChar32 = ch;

   407                         *err = U_ILLEGAL_CHAR_FOUND;

   408                         break;

   409                     }

   410                 }

   411                 else {

   412                     /* no more input */

                           /* Any surrogate (lead or trail) at the end of input is saved without raising an error here. */
   413                     cnv->fromUChar32 = ch;

   414                     break;

   415                 }

   416             }

   418             /* Do we write the buffer directly for speed,

   419             or do we have to be careful about target buffer space? */

   420             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

                   /* indexToWrite is the index of the last byte: 2 for a 3-byte sequence, 3 for a 4-byte sequence. */
   422             if (ch <= MAXIMUM_UCS2) {

   423                 indexToWrite = 2;

   424                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);

   425             }

   426             else {

   427                 indexToWrite = 3;

   428                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);

   429                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);

   430             }

   431             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);

   432             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

   434             if (tempPtr == myTarget) {

   435                 /* There was enough space to write the codepoint directly. */

   436                 myTarget += (indexToWrite + 1);

   437             }

   438             else {

   439                 /* We might run out of room soon. Write it slowly. */

   440                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {

   441                     if (myTarget < targetLimit) {

   442                         *(myTarget++) = *tempPtr;

   443                     }

   444                     else {

                               /* NOTE(review): appends at charErrorBufferLength, so this presumably relies on it being 0 on entry - confirm. */
   445                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;

   446                         *err = U_BUFFER_OVERFLOW_ERROR;

   447                     }

   448                 }

   449             }

   450         }

   451     }

           /* Input left over with a full target and no earlier error is reported as an overflow. */
   453     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   454     {

   455         *err = U_BUFFER_OVERFLOW_ERROR;

   456     }

   458     args->target = (char *) myTarget;

   459     args->source = mySource;

   460 }

   /*
    * Same encoding as ucnv_fromUnicode_UTF8, but additionally writes one entry
    * to args->offsets for every byte written to the target. The entry is
    * offsetNum, the index (counted from 0 within this call) of the UTF-16
    * code unit that started the character producing that byte.
    *
    * args: converter, source/sourceLimit, target/targetLimit and offsets;
    *       source, target and offsets are advanced on return.
    * err:  U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (UTF-8 mode only),
    *       U_BUFFER_OVERFLOW_ERROR when output did not fit.
    *
    * Bytes that overflow into cnv->charErrorBuffer get no offsets entry.
    */
   462 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,

   463                                                   UErrorCode * err)

   464 {

   465     UConverter *cnv = args->converter;

   466     const UChar *mySource = args->source;

   467     int32_t *myOffsets = args->offsets;

   468     const UChar *sourceLimit = args->sourceLimit;

   469     uint8_t *myTarget = (uint8_t *) args->target;

   470     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;

   471     uint8_t *tempPtr;

   472     UChar32 ch;

   473     int32_t offsetNum, nextSourceIndex;

   474     int32_t indexToWrite;

   475     uint8_t tempBuf[4];

   476     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);

   478     if (cnv->fromUChar32 && myTarget < targetLimit)

   479     {

   480         ch = cnv->fromUChar32;

   481         cnv->fromUChar32 = 0;

               /* The carried-over surrogate lies before this buffer, hence offset -1; the next unread unit is index 0. */
   482         offsetNum = -1;

   483         nextSourceIndex = 0;

   484         goto lowsurrogate;

   485     } else {

   486         offsetNum = 0;

   487     }

   489     while (mySource < sourceLimit && myTarget < targetLimit)

   490     {

   491         ch = *(mySource++);

   493         if (ch < 0x80)        /* Single byte */

   494         {

   495             *(myOffsets++) = offsetNum++;

   496             *(myTarget++) = (char) ch;

   497         }

   498         else if (ch < 0x800)  /* Double byte */

   499         {

   500             *(myOffsets++) = offsetNum;

   501             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);

   502             if (myTarget < targetLimit)

   503             {

   504                 *(myOffsets++) = offsetNum++;

   505                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);

   506             }

   507             else

   508             {

                       /* Second byte does not fit; the loop then ends via its condition since the target is full. */
   509                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);

   510                 cnv->charErrorBufferLength = 1;

   511                 *err = U_BUFFER_OVERFLOW_ERROR;

   512             }

   513         }

   514         else

   515         /* Check for surrogates */

   516         {

                   /* Source index of the unit following this character; bumped again if a trail surrogate is consumed. */
   517             nextSourceIndex = offsetNum + 1;

   519             if(U16_IS_SURROGATE(ch) && isNotCESU8) {

   520 lowsurrogate:

   521                 if (mySource < sourceLimit) {

   522                     /* test both code units */

   523                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {

   524                         /* convert and consume this supplementary code point */

   525                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);

   526                         ++mySource;

   527                         ++nextSourceIndex;

   528                         /* exit this condition tree */

   529                     }

   530                     else {

   531                         /* this is an unpaired trail or lead code unit */

   532                         /* callback(illegal) */

   533                         cnv->fromUChar32 = ch;

   534                         *err = U_ILLEGAL_CHAR_FOUND;

   535                         break;

   536                     }

   537                 }

   538                 else {

   539                     /* no more input */

   540                     cnv->fromUChar32 = ch;

   541                     break;

   542                 }

   543             }

   545             /* Do we write the buffer directly for speed,

   546             or do we have to be careful about target buffer space? */

   547             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

                   /* indexToWrite is the index of the last byte: 2 for a 3-byte sequence, 3 for a 4-byte sequence. */
   549             if (ch <= MAXIMUM_UCS2) {

   550                 indexToWrite = 2;

   551                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);

   552             }

   553             else {

   554                 indexToWrite = 3;

   555                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);

   556                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);

   557             }

   558             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);

   559             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

   561             if (tempPtr == myTarget) {

   562                 /* There was enough space to write the codepoint directly. */

   563                 myTarget += (indexToWrite + 1);

                       /* Three offsets are always written; the fourth only for a 4-byte sequence. */
   564                 myOffsets[0] = offsetNum;

   565                 myOffsets[1] = offsetNum;

   566                 myOffsets[2] = offsetNum;

   567                 if (indexToWrite >= 3) {

   568                     myOffsets[3] = offsetNum;

   569                 }

   570                 myOffsets += (indexToWrite + 1);

   571             }

   572             else {

   573                 /* We might run out of room soon. Write it slowly. */

   574                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {

   575                     if (myTarget < targetLimit)

   576                     {

   577                         *(myOffsets++) = offsetNum;

   578                         *(myTarget++) = *tempPtr;

   579                     }

   580                     else

   581                     {

                               /* NOTE(review): appends at charErrorBufferLength, so this presumably relies on it being 0 on entry - confirm. */
   582                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;

   583                         *err = U_BUFFER_OVERFLOW_ERROR;

   584                     }

   585                 }

   586             }

   587             offsetNum = nextSourceIndex;

   588         }

   589     }

   591     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   592     {

   593         *err = U_BUFFER_OVERFLOW_ERROR;

   594     }

   596     args->target = (char *) myTarget;

   597     args->source = mySource;

   598     args->offsets = myOffsets;

   599 }

   /*
    * Decode and return the single next code point from args->source.
    *
    * args: converter plus source/sourceLimit; args->source is advanced past the
    *       bytes consumed (on every path except the "no input" one).
    * err:  U_INDEX_OUTOFBOUNDS_ERROR if there is no input,
    *       U_TRUNCATED_CHAR_FOUND if the input ends inside a sequence whose
    *       available bytes are all trail bytes,
    *       U_ILLEGAL_CHAR_FOUND for any malformed, non-shortest, surrogate or
    *       out-of-range sequence.
    * Returns the code point on success and 0xffff on every error path; on the
    * error paths after the first byte was read, the offending bytes are copied
    * to cnv->toUBytes with their count in cnv->toULength.
    */
   601 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,

   602                                                UErrorCode *err) {

   603     UConverter *cnv;

   604     const uint8_t *sourceInitial;

   605     const uint8_t *source;

   606     uint16_t extraBytesToWrite;

   607     uint8_t myByte;

   608     UChar32 ch;

   609     int8_t i, isLegalSequence;

   611     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */

   613     cnv = args->converter;

   614     sourceInitial = source = (const uint8_t *)args->source;

   615     if (source >= (const uint8_t *)args->sourceLimit)

   616     {

   617         /* no input */

   618         *err = U_INDEX_OUTOFBOUNDS_ERROR;

   619         return 0xffff;

   620     }

   622     myByte = (uint8_t)*(source++);

   623     if (myByte < 0x80)

   624     {

   625         args->source = (const char *)source;

   626         return (UChar32)myByte;

   627     }

           /* Despite its name, this holds the total sequence length (lead byte included), or 0 for an invalid lead byte. */
   629     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];

   630     if (extraBytesToWrite == 0) {

   631         cnv->toUBytes[0] = myByte;

   632         cnv->toULength = 1;

   633         *err = U_ILLEGAL_CHAR_FOUND;

   634         args->source = (const char *)source;

   635         return 0xffff;

   636     }

   638     /*The byte sequence is longer than the buffer area passed*/

           /*
            * NOTE(review): the left-hand pointer may be computed beyond sourceLimit
            * before it is compared; comparing remaining length instead would avoid
            * forming an out-of-range pointer - confirm whether this matters here.
            */
   639     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)

   640     {

   641         /* check if all of the remaining bytes are trail bytes */

   642         cnv->toUBytes[0] = myByte;

   643         i = 1;

   644         *err = U_TRUNCATED_CHAR_FOUND;

   645         while(source < (const uint8_t *)args->sourceLimit) {

   646             if(U8_IS_TRAIL(myByte = *source)) {

   647                 cnv->toUBytes[i++] = myByte;

   648                 ++source;

   649             } else {

   650                 /* error even before we run out of input */

   651                 *err = U_ILLEGAL_CHAR_FOUND;

   652                 break;

   653             }

   654         }

   655         cnv->toULength = i;

   656         args->source = (const char *)source;

   657         return 0xffff;

   658     }

   660     isLegalSequence = 1;

           /*
            * Accumulate (lead << 6) + trail, shifting again before each further
            * trail byte. Entering the switch at case N reads N-1 trail bytes.
            * NOTE(review): ch is a UChar32, presumably a signed 32-bit type; for
            * 5- and 6-byte lead bytes the repeated "ch <<= 6" looks like it can
            * exceed that range - confirm.
            */
   661     ch = myByte << 6;

   662     switch(extraBytesToWrite)

   663     {

   664       /* note: code falls through cases! (sic)*/

   665     case 6:

   666         ch += (myByte = *source);

   667         ch <<= 6;

   668         if (!U8_IS_TRAIL(myByte))

   669         {

   670             isLegalSequence = 0;

   671             break;

   672         }

   673         ++source;

   674     case 5: /*fall through*/

   675         ch += (myByte = *source);

   676         ch <<= 6;

   677         if (!U8_IS_TRAIL(myByte))

   678         {

   679             isLegalSequence = 0;

   680             break;

   681         }

   682         ++source;

   683     case 4: /*fall through*/

   684         ch += (myByte = *source);

   685         ch <<= 6;

   686         if (!U8_IS_TRAIL(myByte))

   687         {

   688             isLegalSequence = 0;

   689             break;

   690         }

   691         ++source;

   692     case 3: /*fall through*/

   693         ch += (myByte = *source);

   694         ch <<= 6;

   695         if (!U8_IS_TRAIL(myByte))

   696         {

   697             isLegalSequence = 0;

   698             break;

   699         }

   700         ++source;

   701     case 2: /*fall through*/

               /* Last trail byte: added without a further shift. */
   702         ch += (myByte = *source);

   703         if (!U8_IS_TRAIL(myByte))

   704         {

   705             isLegalSequence = 0;

   706             break;

   707         }

   708         ++source;

   709     };

           /* After an early break, source still points at the non-trail byte, so that byte is not consumed. */
   710     ch -= offsetsFromUTF8[extraBytesToWrite];

   711     args->source = (const char *)source;

   713     /*

   714      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

   715      * - use only trail bytes after a lead byte (checked above)

   716      * - use the right number of trail bytes for a given lead byte

   717      * - encode a code point <= U+10ffff

   718      * - use the fewest possible number of bytes for their code points

   719      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

   720      *

   721      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.

   722      * There are no irregular sequences any more.

   723      */

   724     if (isLegalSequence &&

   725         (uint32_t)ch <= MAXIMUM_UTF &&

   726         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&

   727         !U_IS_SURROGATE(ch)

   728     ) {

   729         return ch; /* return the code point */

   730     }

           /* Illegal sequence: copy every consumed byte (from the lead byte up to source) into toUBytes. */
   732     for(i = 0; sourceInitial < source; ++i) {

   733         cnv->toUBytes[i] = *sourceInitial++;

   734     }

   735     cnv->toULength = i;

   736     *err = U_ILLEGAL_CHAR_FOUND;

   737     return 0xffff;

   738 }

   740 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */

   742 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */

   /*
    * Indexed by sequence length 0..4: the smallest code point that may legally
    * be encoded with that many bytes (same values as the first five entries of
    * utf8_minChar32[]).
    */
   743 static const UChar32

   744 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

   746 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */

   /*
    * Indexed by sequence length. The array is declared with 7 elements but only
    * indices 0..4 are initialized explicitly; per C initialization rules the
    * remaining two entries are zero.
    * NOTE(review): presumably only lengths up to 4 are ever looked up - confirm
    * against the users of this table later in the file.
    */
   747 static const UChar32

   748 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

   750 /*
        * "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8().
        *
        * Copies accepted UTF-8 byte sequences unchanged from pToUArgs->source to
        * pFromUArgs->target and stops at the first sequence that is not accepted
        * (wrong number of trail bytes, value below utf8_minLegal[] for its length,
        * surrogate code point, or value above 0x10ffff).
        *
        * pFromUArgs  supplies target/targetLimit; target is written back advanced
        *             past the bytes that were output.
        * pToUArgs    supplies converter/source/sourceLimit; source is written back
        *             advanced past the bytes that were consumed.
        * pErrorCode  is set to
        *             - U_ILLEGAL_CHAR_FOUND when a sequence is rejected; its bytes are
        *               left in converter->toUBytes and their count in
        *               converter->toULength,
        *             - U_BUFFER_OVERFLOW_ERROR when input remains and target has
        *               reached targetLimit,
        *             - U_USING_DEFAULT_WARNING when this function does not handle a
        *               case itself and leaves it to the standard (pivoting) converter.
        *
        * A sequence that is cut off at sourceLimit is saved in the converter
        * (toUBytes, toULength, toUnicodeStatus = value accumulated so far,
        * mode = expected total length) and is picked up again at the moreBytes
        * label on the next call.
        */

   751 static void

   752 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

   753                   UConverterToUnicodeArgs *pToUArgs,

   754                   UErrorCode *pErrorCode) {

   755     UConverter *utf8;

   756     const uint8_t *source, *sourceLimit;

   757     uint8_t *target;

   758     int32_t targetCapacity;

   759     int32_t count;

           /*
            * oldToULength: bytes of the current sequence that were saved in
            *               utf8->toUBytes by an earlier call
            * toULength:    bytes of the current sequence seen so far
            * toULimit:     total number of bytes expected for the current sequence
            */

   761     int8_t oldToULength, toULength, toULimit;

   763     UChar32 c;

   764     uint8_t b, t1, t2;

   766     /* set up the local pointers */

   767     utf8=pToUArgs->converter;

   768     source=(uint8_t *)pToUArgs->source;

   769     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

   770     target=(uint8_t *)pFromUArgs->target;

   771     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

   773     /* get the converter state from the UTF-8 UConverter */

           /* a non-zero toUnicodeStatus means a partial sequence is pending from an earlier call */

   774     c=(UChar32)utf8->toUnicodeStatus;

   775     if(c!=0) {

   776         toULength=oldToULength=utf8->toULength;

   777         toULimit=(int8_t)utf8->mode;

   778     } else {

   779         toULength=oldToULength=toULimit=0;

   780     }

   782     count=(int32_t)(sourceLimit-source)+oldToULength;

   783     if(count<toULimit) {

   784         /*

   785          * Not enough input to complete the partial character.

   786          * Jump to moreBytes below - it will not output to target.

   787          */

   788     } else if(targetCapacity<toULimit) {

   789         /*

   790          * Not enough target capacity to output the partial character.

   791          * Let the standard converter handle this.

   792          */

   793         *pErrorCode=U_USING_DEFAULT_WARNING;

   794         return;

   795     } else {

   796         /*

   797          * Use a single counter for source and target, counting the minimum of

   798          * the source length and the target capacity.

   799          * As a result, the source length is checked only once per multi-byte

   800          * character instead of twice.

   801          *

   802          * Make sure that the last byte sequence is complete, or else

   803          * stop just before it.

   804          * (The longest legal byte sequence has 3 trail bytes.)

   805          * Count oldToULength (number of source bytes from a previous buffer)

   806          * into the source length but reduce the source index by toULimit

   807          * while going back over trail bytes in order to not go back into

   808          * the bytes that will be read for finishing a partial

   809          * sequence from the previous buffer.

   810          * Let the standard converter handle edge cases.

   811          */

   812         int32_t i;

   814         if(count>targetCapacity) {

   815             count=targetCapacity;

   816         }

   818         i=0;

   819         while(i<3 && i<(count-toULimit)) {

   820             b=source[count-oldToULength-i-1];

   821             if(U8_IS_TRAIL(b)) {

   822                 ++i;

   823             } else {

   824                 if(i<U8_COUNT_TRAIL_BYTES(b)) {

   825                     /* stop converting before the lead byte if there are not enough trail bytes for it */

   826                     count-=i+1;

   827                 }

   828                 break;

   829             }

   830         }

   831     }

   833     if(c!=0) {

               /* clear the saved state before resuming; it is stored again if the input runs out once more */

   834         utf8->toUnicodeStatus=0;

   835         utf8->toULength=0;

   836         goto moreBytes;

   837         /* See note in ucnv_SBCSFromUTF8() about this goto. */

   838     }

   840     /* conversion loop */

   841     while(count>0) {

   842         b=*source++;

               /* a byte below 0x80 stays non-negative when viewed as int8_t */

   843         if((int8_t)b>=0) {

   844             /* convert ASCII */

   845             *target++=b;

   846             --count;

   847             continue;

   848         } else {

                   /*
                    * Fast paths for 2- and 3-byte sequences. Anything not accepted here
                    * falls through to the general path after the if/else chain.
                    * NOTE(review): source[0]/source[1] are read without a comparison
                    * against sourceLimit; this looks like it relies on the count
                    * adjustment made before the loop - confirm.
                    */

   849             if(b>0xe0) {

                       /*
                        * Accepted inline: lead bytes 0xe1..0xec with any trail byte, and
                        * 0xed with a first trail byte up to 0x9f (keeps the result at or
                        * below U+D7FF). Lead bytes above 0xed fail the test below.
                        */

   850                 if( /* handle U+1000..U+D7FF inline */

   851                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||

   852                                                (b==0xed && (t1 <= 0x9f))) &&

   853                     (t2=source[1]) >= 0x80 && t2 <= 0xbf

   854                 ) {

   855                     source+=2;

   856                     *target++=b;

   857                     *target++=t1;

   858                     *target++=t2;

   859                     count-=3;

   860                     continue;

   861                 }

   862             } else if(b<0xe0) {

                       /* bytes 0x80..0xc1 are not accepted inline (b>=0xc2 is required) */

   863                 if( /* handle U+0080..U+07FF inline */

   864                     b>=0xc2 &&

   865                     (t1=*source) >= 0x80 && t1 <= 0xbf

   866                 ) {

   867                     ++source;

   868                     *target++=b;

   869                     *target++=t1;

   870                     count-=2;

   871                     continue;

   872                 }

   873             } else if(b==0xe0) {

                       /* after lead byte 0xe0 the first trail byte must be at least 0xa0 */

   874                 if( /* handle U+0800..U+0FFF inline */

   875                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&

   876                     (t2=source[1]) >= 0x80 && t2 <= 0xbf

   877                 ) {

   878                     source+=2;

   879                     *target++=b;

   880                     *target++=t1;

   881                     *target++=t2;

   882                     count-=3;

   883                     continue;

   884                 }

   885             }

   887             /* handle "complicated" and error cases, and continuing partial characters */

                   /*
                    * c starts as the raw lead byte; trail bytes are shifted in unmasked
                    * below and the surplus is removed later by subtracting utf8_offsets[].
                    */

   888             oldToULength=0;

   889             toULength=1;

   890             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

   891             c=b;

   892 moreBytes:

   893             while(toULength<toULimit) {

   894                 if(source<sourceLimit) {

   895                     b=*source;

   896                     if(U8_IS_TRAIL(b)) {

   897                         ++source;

   898                         ++toULength;

   899                         c=(c<<6)+b;

   900                     } else {

   901                         break; /* sequence too short, stop with toULength<toULimit */

   902                     }

   903                 } else {

   904                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */

                           /* only the bytes read from this buffer are appended after the oldToULength bytes already saved */

   905                     source-=(toULength-oldToULength);

   906                     while(oldToULength<toULength) {

   907                         utf8->toUBytes[oldToULength++]=*source++;

   908                     }

   909                     utf8->toUnicodeStatus=c;

   910                     utf8->toULength=toULength;

   911                     utf8->mode=toULimit;

   912                     pToUArgs->source=(char *)source;

   913                     pFromUArgs->target=(char *)target;

   914                     return;

   915                 }

   916             }

                   /*
                    * Validate the collected sequence. Note that the conditions modify c
                    * as a side effect (c-=utf8_offsets[...]); the subtraction is only
                    * reached after the length tests in front of it have passed.
                    */

   918             if( toULength==toULimit &&      /* consumed all trail bytes */

   919                 (toULength==3 || toULength==2) &&             /* BMP */

   920                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&

   921                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */

   922             ) {

   923                 /* legal byte sequence for BMP code point */

   924             } else if(

   925                 toULength==toULimit && toULength==4 &&

   926                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

   927             ) {

   928                 /* legal byte sequence for supplementary code point */

   929             } else {

   930                 /* error handling: illegal UTF-8 byte sequence */

   931                 source-=(toULength-oldToULength);

   932                 while(oldToULength<toULength) {

   933                     utf8->toUBytes[oldToULength++]=*source++;

   934                 }

   935                 utf8->toULength=toULength;

   936                 pToUArgs->source=(char *)source;

   937                 pFromUArgs->target=(char *)target;

   938                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   939                 return;

   940             }

   942             /* copy the legal byte sequence to the target */

                   /* first the bytes saved from a previous buffer, then the bytes from the current source */

   943             {

   944                 int8_t i;

   946                 for(i=0; i<oldToULength; ++i) {

   947                     *target++=utf8->toUBytes[i];

   948                 }

   949                 source-=(toULength-oldToULength);

   950                 for(; i<toULength; ++i) {

   951                     *target++=*source++;

   952                 }

   953                 count-=toULength;

   954             }

   955         }

   956     }

           /*
            * The loop ended with input left over: either the target is full, or
            * count was limited so that the loop stopped before the remaining bytes.
            */

   958     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {

   959         if(target==(const uint8_t *)pFromUArgs->targetLimit) {

   960             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   961         } else {

   962             b=*source;

   963             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

   964             if(toULimit>(sourceLimit-source)) {

   965                 /* collect a truncated byte sequence */

   966                 toULength=0;

   967                 c=b;

   968                 for(;;) {

   969                     utf8->toUBytes[toULength++]=b;

   970                     if(++source==sourceLimit) {

   971                         /* partial byte sequence at end of source */

   972                         utf8->toUnicodeStatus=c;

   973                         utf8->toULength=toULength;

   974                         utf8->mode=toULimit;

   975                         break;

   976                     } else if(!U8_IS_TRAIL(b=*source)) {

   977                         /* lead byte in trail byte position */

   978                         utf8->toULength=toULength;

   979                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   980                         break;

   981                     }

   982                     c=(c<<6)+b;

   983                 }

   984             } else {

   985                 /* partial-sequence target overflow: fall back to the pivoting implementation */

   986                 *pErrorCode=U_USING_DEFAULT_WARNING;

   987             }

   988         }

   989     }

   991     /* write back the updated pointers */

   992     pToUArgs->source=(char *)source;

   993     pFromUArgs->target=(char *)target;

   994 }

   996 /* UTF-8 converter data ----------------------------------------------------- */

   998 static const UConverterImpl _UTF8Impl={

       /*
        * Function table for the UTF-8 converter.
        * The initializer is positional; the meaning of each slot is defined by
        * UConverterImpl, which is declared outside this file. NULL slots
        * supply no function for this converter.
        */

   999     UCNV_UTF8,

  1001     NULL,

  1002     NULL,

  1004     NULL,

  1005     NULL,

  1006     NULL,

  1008     ucnv_toUnicode_UTF8,

  1009     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,

  1010     ucnv_fromUnicode_UTF8,

  1011     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,

  1012     ucnv_getNextUChar_UTF8,

  1014     NULL,

  1015     NULL,

  1016     NULL,

  1017     NULL,

  1018     ucnv_getNonSurrogateUnicodeSet,

       /*
        * The same validate-and-copy routine fills both trailing slots.
        * NOTE(review): presumably these are the to-UTF-8 and from-UTF-8 direct
        * conversion hooks - confirm against the UConverterImpl declaration.
        */

  1020     ucnv_UTF8FromUTF8,

  1021     ucnv_UTF8FromUTF8

  1022 };

  1024 /*
        * Static description of the UTF-8 converter.
        * CCSID 1208 stands for UTF-8 of any Unicode version.
        * The initializer is positional; field meanings are defined by
        * UConverterStaticData, which is declared outside this file.
        */

  1025 static const UConverterStaticData _UTF8StaticData={

  1026     sizeof(UConverterStaticData),

  1027     "UTF-8",

  1028     1208, UCNV_IBM, UCNV_UTF8,

  1029     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */

       /*
        * 0xef 0xbf 0xbd is the UTF-8 byte sequence for U+FFFD.
        * NOTE(review): presumably the substitution character followed by its
        * length (3) - confirm against UConverterStaticData.
        */

  1030     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,

  1031     0,

  1032     0,

  1033     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1034 };

  1037 const UConverterSharedData _UTF8Data={

       /*
        * Shared data record for UTF-8: ties _UTF8StaticData to _UTF8Impl.
        * Unlike the two tables it points to, this object is not static.
        * The second value is a uint32_t with all bits set.
        * NOTE(review): presumably a reference counter value marking the record as
        * permanent - confirm against UConverterSharedData.
        */

  1038     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1039     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,

  1040     0

  1041 };

  1043 /* CESU-8 converter data ---------------------------------------------------- */

  1045 static const UConverterImpl _CESU8Impl={

       /*
        * Function table for the CESU-8 converter.
        * It uses the same four to/from-Unicode functions as _UTF8Impl, but supplies
        * NULL where _UTF8Impl has ucnv_getNextUChar_UTF8, and uses
        * ucnv_getCompleteUnicodeSet instead of ucnv_getNonSurrogateUnicodeSet.
        * The initializer is positional; slot meanings are defined by UConverterImpl,
        * which is declared outside this file.
        */

  1046     UCNV_CESU8,

  1048     NULL,

  1049     NULL,

  1051     NULL,

  1052     NULL,

  1053     NULL,

  1055     ucnv_toUnicode_UTF8,

  1056     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,

  1057     ucnv_fromUnicode_UTF8,

  1058     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,

  1059     NULL,

  1061     NULL,

  1062     NULL,

  1063     NULL,

  1064     NULL,

       /*
        * Last listed member. _UTF8Impl lists two more entries (ucnv_UTF8FromUTF8)
        * after this position; here they are omitted and therefore zero-initialized.
        */

  1065     ucnv_getCompleteUnicodeSet

  1066 };

  1068 static const UConverterStaticData _CESU8StaticData={

       /*
        * Static description of the CESU-8 converter.
        * The initializer is positional; field meanings are defined by
        * UConverterStaticData, which is declared outside this file.
        * The values after the converter type (1, 3, then the bytes 0xef 0xbf 0xbd
        * and 3) are the same as in _UTF8StaticData.
        */

  1069     sizeof(UConverterStaticData),

  1070     "CESU-8",

  1071     9400, /* CCSID for CESU-8 */

  1072     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,

       /*
        * 0xef 0xbf 0xbd is the UTF-8 byte sequence for U+FFFD.
        * NOTE(review): presumably the substitution character followed by its
        * length (3) - confirm against UConverterStaticData.
        */

  1073     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,

  1074     0,

  1075     0,

  1076     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1077 };

  1080 const UConverterSharedData _CESU8Data={

       /*
        * Shared data record for CESU-8: ties _CESU8StaticData to _CESU8Impl.
        * Unlike the two tables it points to, this object is not static.
        * The second value is a uint32_t with all bits set.
        * NOTE(review): presumably a reference counter value marking the record as
        * permanent - confirm against UConverterSharedData.
        */

  1081     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1082     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,

  1083     0

  1084 };

  1086 #endif

The Tor Browser / file revision

intl/icu/source/common/ucnv_u8.c@6474c204b198

intl/icu/source/common/ucnv_u8.c