intl/icu/source/common/ucnv_u8.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*  
     2 **********************************************************************
     3 *   Copyright (C) 2002-2012, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   file name:  ucnv_u8.c
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2002jul01
    12 *   created by: Markus W. Scherer
    13 *
    14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
    15 *
    16 *   Also, CESU-8 implementation, see UTR 26.
    17 *   The CESU-8 converter uses all the same functions as the
    18 *   UTF-8 converter, with a branch for converting supplementary code points.
    19 */
    21 #include "unicode/utypes.h"
    23 #if !UCONFIG_NO_CONVERSION
    25 #include "unicode/ucnv.h"
    26 #include "unicode/utf.h"
    27 #include "unicode/utf8.h"
    28 #include "unicode/utf16.h"
    29 #include "ucnv_bld.h"
    30 #include "ucnv_cnv.h"
    31 #include "cmemory.h"
    33 /* Prototypes --------------------------------------------------------------- */
    35 /* Keep these here to make finicky compilers happy */
    37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
    38                                            UErrorCode *err);
    39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
    40                                                         UErrorCode *err);
    43 /* UTF-8 -------------------------------------------------------------------- */
    45 /* UTF-8 Conversion DATA
    46  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
    47  */
    48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
    /* Largest value that fits in a single 16-bit code unit. */
    #define MAXIMUM_UCS2            0xFFFF
    /* Largest Unicode code point. */
    #define MAXIMUM_UTF             0x10FFFF
    /* Largest 31-bit UCS-4 value. */
    #define MAXIMUM_UCS4            0x7FFFFFFF

    /* Surrogate-pair arithmetic: 10 payload bits per surrogate, pairs start at U+10000. */
    #define HALF_SHIFT              10
    #define HALF_BASE               0x10000
    #define HALF_MASK               0x3FF

    /* UTF-16 surrogate ranges. */
    #define SURROGATE_HIGH_START    0xD800
    #define SURROGATE_HIGH_END      0xDBFF
    #define SURROGATE_LOW_START     0xDC00
    #define SURROGATE_LOW_END       0xDFFF

    /* HALF_BASE - SURROGATE_LOW_START, i.e. 0x10000 - 0xDC00 */
    #define SURROGATE_LOW_BASE      0x2400
    /*
     * Indexed by sequence length (1..6; slot 0 is unused). Each entry is the sum
     * of the lead-byte and trail-byte marker bits that pile up when a sequence is
     * accumulated as ((lead << 6) + trail) << 6 ...; subtracting it leaves the
     * bare code point.
     */
    static const uint32_t offsetsFromUTF8[7] = {
        0x00000000u,    /* [0] unused */
        0x00000000u,    /* [1] single byte, nothing to strip */
        0x00003080u,    /* [2] */
        0x000E2080u,    /* [3] */
        0x03C82080u,    /* [4] */
        0xFA082080u,    /* [5] */
        0x82082080u     /* [6] */
    };
    68 /* END OF UTF-8 Conversion DATA */
    /*
     * Total sequence length announced by a first byte, indexed by that byte.
     * 0 marks bytes that cannot start a sequence (0x80..0xBF and 0xFE/0xFF).
     * One row per high nibble, sixteen entries per row.
     */
    static const int8_t bytesFromUTF8[256] = {
        /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
    };
    81 /*
    82  * Starting with Unicode 3.0.1:
    83  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
    84  * byte sequences with more than 4 bytes are illegal in UTF-8,
    85  * which is tested with impossible values for them
    86  */
    static const uint32_t utf8_minChar32[7] = {
        0x0u,           /* [0] unused */
        0x0u,           /* [1] */
        0x80u,          /* [2] */
        0x800u,         /* [3] */
        0x10000u,       /* [4] */
        0xFFFFFFFFu,    /* [5] impossible minimum: 5-byte forms are always rejected */
        0xFFFFFFFFu     /* [6] impossible minimum: 6-byte forms are always rejected */
    };
    /*
     * Converts UTF-8 (or CESU-8) bytes from args->source into UTF-16 code units
     * at args->target; no offsets are written.
     *
     * A sequence that is cut off by the end of the source is remembered in the
     * converter (toUnicodeStatus = bits accumulated so far, mode = expected
     * sequence length, toULength = bytes consumed, toUBytes = the raw bytes) and
     * resumed on the next call.
     *
     * On return args->source and args->target are advanced. *err is set to
     * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains, or
     * when the second surrogate of a pair had to be placed in UCharErrorBuffer.
     * *err is set to U_ILLEGAL_CHAR_FOUND for a malformed sequence, with
     * toULength giving the number of bytes of it that were consumed.
     */
    90 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
    91                                   UErrorCode * err)
    92 {
    93     UConverter *cnv = args->converter;
    94     const unsigned char *mySource = (unsigned char *) args->source;
    95     UChar *myTarget = args->target;
    96     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    97     const UChar *targetLimit = args->targetLimit;
    98     unsigned char *toUBytes = cnv->toUBytes;
    99     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
   100     uint32_t ch, ch2 = 0;
   101     int32_t i, inBytes;
   103     /* Restore size of current sequence */
   104     if (cnv->toUnicodeStatus && myTarget < targetLimit)
   105     {
   106         inBytes = cnv->mode;            /* restore # of bytes to consume */
   107         i = cnv->toULength;             /* restore # of bytes consumed */
   108         cnv->toULength = 0;
   110         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
   111         cnv->toUnicodeStatus = 0;
   112         goto morebytes;
   113     }
   116     while (mySource < sourceLimit && myTarget < targetLimit)
   117     {
   118         ch = *(mySource++);
   119         if (ch < 0x80)        /* Simple case */
   120         {
   121             *(myTarget++) = (UChar) ch;
   122         }
   123         else
   124         {
   125             /* store the first char */
   126             toUBytes[0] = (char)ch;
   127             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
   128             i = 1;
               /* Jump target for resuming a sequence saved by a previous call. */
   130 morebytes:
   131             while (i < inBytes)
   132             {
   133                 if (mySource < sourceLimit)
   134                 {
   135                     toUBytes[i] = (char) (ch2 = *mySource);
   136                     if (!U8_IS_TRAIL(ch2))
   137                     {
                               /* mySource is not advanced, so this byte stays unconsumed */
   138                         break; /* i < inBytes */
   139                     }
   140                     ch = (ch << 6) + ch2;
   141                     ++mySource;
   142                     i++;
   143                 }
   144                 else
   145                 {
   146                     /* stores a partially calculated target*/
   147                     cnv->toUnicodeStatus = ch;
   148                     cnv->mode = inBytes;
   149                     cnv->toULength = (int8_t) i;
   150                     goto donefornow;
   151                 }
   152             }
   154             /* Remove the accumulated high bits */
   155             ch -= offsetsFromUTF8[inBytes];
   157             /*
   158              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   159              * - use only trail bytes after a lead byte (checked above)
   160              * - use the right number of trail bytes for a given lead byte
   161              * - encode a code point <= U+10ffff
   162              * - use the fewest possible number of bytes for their code points
   163              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   164              *
   165              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   166              * There are no irregular sequences any more.
   167              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
   168              */
   169             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
   170                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
   171             {
   172                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   173                 if (ch <= MAXIMUM_UCS2) 
   174                 {
   175                     /* fits in 16 bits */
   176                     *(myTarget++) = (UChar) ch;
   177                 }
   178                 else
   179                 {
   180                     /* write out the surrogates */
   181                     ch -= HALF_BASE;
   182                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
   183                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
   184                     if (myTarget < targetLimit)
   185                     {
   186                         *(myTarget++) = (UChar)ch;
   187                     }
   188                     else
   189                     {
   190                         /* Put in overflow buffer (not handled here) */
   191                         cnv->UCharErrorBuffer[0] = (UChar) ch;
   192                         cnv->UCharErrorBufferLength = 1;
   193                         *err = U_BUFFER_OVERFLOW_ERROR;
   194                         break;
   195                     }
   196                 }
   197             }
   198             else
   199             {
                       /* malformed: report how many bytes of the sequence were consumed */
   200                 cnv->toULength = (int8_t)i;
   201                 *err = U_ILLEGAL_CHAR_FOUND;
   202                 break;
   203             }
   204         }
   205     }
   207 donefornow:
   208     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   209     {
   210         /* End of target buffer */
   211         *err = U_BUFFER_OVERFLOW_ERROR;
   212     }
   214     args->target = myTarget;
   215     args->source = (const char *) mySource;
   216 }
   /*
    * Same conversion as ucnv_toUnicode_UTF8, but additionally writes one entry
    * to args->offsets for every UTF-16 code unit written to args->target.
    *
    * offsetNum is a byte counter that starts at 0 on every call. It grows by 1
    * per ASCII byte and by the sequence length i per multi-byte sequence. Each
    * output unit records the value offsetNum had when its sequence began, and
    * both halves of a surrogate pair record the same value.
    *
    * Partial-sequence state and error reporting follow the same converter
    * fields as the non-offsets variant (toUnicodeStatus, mode, toULength,
    * toUBytes, UCharErrorBuffer). On return args->source, args->target and
    * args->offsets are advanced.
    */
   218 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
   219                                                 UErrorCode * err)
   220 {
   221     UConverter *cnv = args->converter;
   222     const unsigned char *mySource = (unsigned char *) args->source;
   223     UChar *myTarget = args->target;
   224     int32_t *myOffsets = args->offsets;
   225     int32_t offsetNum = 0;
   226     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   227     const UChar *targetLimit = args->targetLimit;
   228     unsigned char *toUBytes = cnv->toUBytes;
   229     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
   230     uint32_t ch, ch2 = 0;
   231     int32_t i, inBytes;
   233     /* Restore size of current sequence */
   234     if (cnv->toUnicodeStatus && myTarget < targetLimit)
   235     {
   236         inBytes = cnv->mode;            /* restore # of bytes to consume */
   237         i = cnv->toULength;             /* restore # of bytes consumed */
   238         cnv->toULength = 0;
   240         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
   241         cnv->toUnicodeStatus = 0;
   242         goto morebytes;
   243     }
   245     while (mySource < sourceLimit && myTarget < targetLimit)
   246     {
   247         ch = *(mySource++);
   248         if (ch < 0x80)        /* Simple case */
   249         {
   250             *(myTarget++) = (UChar) ch;
   251             *(myOffsets++) = offsetNum++;
   252         }
   253         else
   254         {
               /* keep the lead byte and look up the announced sequence length */
   255             toUBytes[0] = (char)ch;
   256             inBytes = bytesFromUTF8[ch];
   257             i = 1;
               /* Jump target for resuming a sequence saved by a previous call. */
   259 morebytes:
   260             while (i < inBytes)
   261             {
   262                 if (mySource < sourceLimit)
   263                 {
   264                     toUBytes[i] = (char) (ch2 = *mySource);
   265                     if (!U8_IS_TRAIL(ch2))
   266                     {
   267                         break; /* i < inBytes */
   268                     }
   269                     ch = (ch << 6) + ch2;
   270                     ++mySource;
   271                     i++;
   272                 }
   273                 else
   274                 {
                       /* source exhausted mid-sequence: save state for the next call */
   275                     cnv->toUnicodeStatus = ch;
   276                     cnv->mode = inBytes;
   277                     cnv->toULength = (int8_t)i;
   278                     goto donefornow;
   279                 }
   280             }
   282             /* Remove the accumulated high bits */
   283             ch -= offsetsFromUTF8[inBytes];
   285             /*
   286              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   287              * - use only trail bytes after a lead byte (checked above)
   288              * - use the right number of trail bytes for a given lead byte
   289              * - encode a code point <= U+10ffff
   290              * - use the fewest possible number of bytes for their code points
   291              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   292              *
   293              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   294              * There are no irregular sequences any more.
   295              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
   296              */
   297             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
   298                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
   299             {
   300                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   301                 if (ch <= MAXIMUM_UCS2) 
   302                 {
   303                     /* fits in 16 bits */
   304                     *(myTarget++) = (UChar) ch;
   305                     *(myOffsets++) = offsetNum;
   306                 }
   307                 else
   308                 {
   309                     /* write out the surrogates */
   310                     ch -= HALF_BASE;
   311                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
   312                     *(myOffsets++) = offsetNum;
   313                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
   314                     if (myTarget < targetLimit)
   315                     {
   316                         *(myTarget++) = (UChar)ch;
   317                         *(myOffsets++) = offsetNum;
   318                     }
   319                     else
   320                     {
                           /*
                            * No break here: the target is full, so the enclosing
                            * while condition ends the loop after offsetNum is updated.
                            */
   321                         cnv->UCharErrorBuffer[0] = (UChar) ch;
   322                         cnv->UCharErrorBufferLength = 1;
   323                         *err = U_BUFFER_OVERFLOW_ERROR;
   324                     }
   325                 }
   326                 offsetNum += i;
   327             }
   328             else
   329             {
   330                 cnv->toULength = (int8_t)i;
   331                 *err = U_ILLEGAL_CHAR_FOUND;
   332                 break;
   333             }
   334         }
   335     }
   337 donefornow:
   338     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   339     {   /* End of target buffer */
   340         *err = U_BUFFER_OVERFLOW_ERROR;
   341     }
   343     args->target = myTarget;
   344     args->source = (const char *) mySource;
   345     args->offsets = myOffsets;
   346 }
   /*
    * Converts UTF-16 code units from args->source into UTF-8 bytes at
    * args->target; no offsets are written.
    *
    * For every converter except CESU-8 (sharedData != &_CESU8Data) a lead
    * surrogate followed by a trail surrogate is combined into one supplementary
    * code point and written as four bytes. For CESU-8 the surrogate check is
    * skipped, so each surrogate is written on its own as a three-byte sequence.
    *
    * A surrogate that is the last unit of the input is saved in cnv->fromUChar32
    * and picked up again at the start of the next call. An unpaired surrogate
    * sets *err to U_ILLEGAL_CHAR_FOUND and is left in cnv->fromUChar32.
    * Encoded bytes that do not fit in the target are placed in
    * cnv->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
    * On return args->source and args->target are advanced.
    */
   348 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
   349                                     UErrorCode * err)
   350 {
   351     UConverter *cnv = args->converter;
   352     const UChar *mySource = args->source;
   353     const UChar *sourceLimit = args->sourceLimit;
   354     uint8_t *myTarget = (uint8_t *) args->target;
   355     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
   356     uint8_t *tempPtr;
   357     UChar32 ch;
   358     uint8_t tempBuf[4];
   359     int32_t indexToWrite;
   360     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
       /* Resume with a surrogate saved by a previous call. */
   362     if (cnv->fromUChar32 && myTarget < targetLimit)
   363     {
   364         ch = cnv->fromUChar32;
   365         cnv->fromUChar32 = 0;
   366         goto lowsurrogate;
   367     }
   369     while (mySource < sourceLimit && myTarget < targetLimit)
   370     {
   371         ch = *(mySource++);
   373         if (ch < 0x80)        /* Single byte */
   374         {
   375             *(myTarget++) = (uint8_t) ch;
   376         }
   377         else if (ch < 0x800)  /* Double byte */
   378         {
   379             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
   380             if (myTarget < targetLimit)
   381             {
   382                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
   383             }
   384             else
   385             {
                   /* target full: park the second byte in the overflow buffer */
   386                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
   387                 cnv->charErrorBufferLength = 1;
   388                 *err = U_BUFFER_OVERFLOW_ERROR;
   389             }
   390         }
   391         else {
   392             /* Check for surrogates */
   393             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
   394 lowsurrogate:
   395                 if (mySource < sourceLimit) {
   396                     /* test both code units */
   397                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
   398                         /* convert and consume this supplementary code point */
   399                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
   400                         ++mySource;
   401                         /* exit this condition tree */
   402                     }
   403                     else {
   404                         /* this is an unpaired trail or lead code unit */
   405                         /* callback(illegal) */
   406                         cnv->fromUChar32 = ch;
   407                         *err = U_ILLEGAL_CHAR_FOUND;
   408                         break;
   409                     }
   410                 }
   411                 else {
   412                     /* no more input */
   413                     cnv->fromUChar32 = ch;
   414                     break;
   415                 }
   416             }
   418             /* Do we write the buffer directly for speed,
   419             or do we have to be careful about target buffer space? */
   420             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
               /* indexToWrite is the index of the last byte: 2 for a 3-byte form, 3 for a 4-byte form */
   422             if (ch <= MAXIMUM_UCS2) {
   423                 indexToWrite = 2;
   424                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
   425             }
   426             else {
   427                 indexToWrite = 3;
   428                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
   429                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
   430             }
   431             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
   432             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
   434             if (tempPtr == myTarget) {
   435                 /* There was enough space to write the codepoint directly. */
   436                 myTarget += (indexToWrite + 1);
   437             }
   438             else {
   439                 /* We might run out of room soon. Write it slowly. */
                   /* tempPtr equals tempBuf here; bytes that do not fit go to charErrorBuffer */
   440                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
   441                     if (myTarget < targetLimit) {
   442                         *(myTarget++) = *tempPtr;
   443                     }
   444                     else {
   445                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
   446                         *err = U_BUFFER_OVERFLOW_ERROR;
   447                     }
   448                 }
   449             }
   450         }
   451     }
   453     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   454     {
   455         *err = U_BUFFER_OVERFLOW_ERROR;
   456     }
   458     args->target = (char *) myTarget;
   459     args->source = mySource;
   460 }
   /*
    * Same conversion as ucnv_fromUnicode_UTF8, but additionally writes one entry
    * to args->offsets for every byte written to args->target.
    *
    * offsetNum counts UTF-16 code units consumed in this call, starting at 0.
    * Every byte of an encoded character records the offsetNum of the code unit
    * that started it. When the character starts with a surrogate saved in
    * cnv->fromUChar32 by a previous call, its bytes record -1.
    * nextSourceIndex is the value offsetNum takes after the current character
    * (one more when a surrogate pair consumed two units).
    *
    * Bytes diverted to cnv->charErrorBuffer get no offsets entry.
    * On return args->source, args->target and args->offsets are advanced.
    */
   462 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
   463                                                   UErrorCode * err)
   464 {
   465     UConverter *cnv = args->converter;
   466     const UChar *mySource = args->source;
   467     int32_t *myOffsets = args->offsets;
   468     const UChar *sourceLimit = args->sourceLimit;
   469     uint8_t *myTarget = (uint8_t *) args->target;
   470     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
   471     uint8_t *tempPtr;
   472     UChar32 ch;
   473     int32_t offsetNum, nextSourceIndex;
   474     int32_t indexToWrite;
   475     uint8_t tempBuf[4];
   476     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
       /* Resume with a surrogate saved by a previous call; it has no index in this buffer. */
   478     if (cnv->fromUChar32 && myTarget < targetLimit)
   479     {
   480         ch = cnv->fromUChar32;
   481         cnv->fromUChar32 = 0;
   482         offsetNum = -1;
   483         nextSourceIndex = 0;
   484         goto lowsurrogate;
   485     } else {
   486         offsetNum = 0;
   487     }
   489     while (mySource < sourceLimit && myTarget < targetLimit)
   490     {
   491         ch = *(mySource++);
   493         if (ch < 0x80)        /* Single byte */
   494         {
   495             *(myOffsets++) = offsetNum++;
   496             *(myTarget++) = (char) ch;
   497         }
   498         else if (ch < 0x800)  /* Double byte */
   499         {
   500             *(myOffsets++) = offsetNum;
   501             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
   502             if (myTarget < targetLimit)
   503             {
   504                 *(myOffsets++) = offsetNum++;
   505                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
   506             }
   507             else
   508             {
                   /* target full: park the second byte in the overflow buffer */
   509                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
   510                 cnv->charErrorBufferLength = 1;
   511                 *err = U_BUFFER_OVERFLOW_ERROR;
   512             }
   513         }
   514         else
   515         /* Check for surrogates */
   516         {
   517             nextSourceIndex = offsetNum + 1;
   519             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
   520 lowsurrogate:
   521                 if (mySource < sourceLimit) {
   522                     /* test both code units */
   523                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
   524                         /* convert and consume this supplementary code point */
   525                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
   526                         ++mySource;
   527                         ++nextSourceIndex;
   528                         /* exit this condition tree */
   529                     }
   530                     else {
   531                         /* this is an unpaired trail or lead code unit */
   532                         /* callback(illegal) */
   533                         cnv->fromUChar32 = ch;
   534                         *err = U_ILLEGAL_CHAR_FOUND;
   535                         break;
   536                     }
   537                 }
   538                 else {
   539                     /* no more input */
   540                     cnv->fromUChar32 = ch;
   541                     break;
   542                 }
   543             }
   545             /* Do we write the buffer directly for speed,
   546             or do we have to be careful about target buffer space? */
   547             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
               /* indexToWrite is the index of the last byte: 2 for a 3-byte form, 3 for a 4-byte form */
   549             if (ch <= MAXIMUM_UCS2) {
   550                 indexToWrite = 2;
   551                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
   552             }
   553             else {
   554                 indexToWrite = 3;
   555                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
   556                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
   557             }
   558             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
   559             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
   561             if (tempPtr == myTarget) {
   562                 /* There was enough space to write the codepoint directly. */
   563                 myTarget += (indexToWrite + 1);
   564                 myOffsets[0] = offsetNum;
   565                 myOffsets[1] = offsetNum;
   566                 myOffsets[2] = offsetNum;
   567                 if (indexToWrite >= 3) {
   568                     myOffsets[3] = offsetNum;
   569                 }
   570                 myOffsets += (indexToWrite + 1);
   571             }
   572             else {
   573                 /* We might run out of room soon. Write it slowly. */
                   /* tempPtr equals tempBuf here; bytes that do not fit go to charErrorBuffer */
   574                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
   575                     if (myTarget < targetLimit)
   576                     {
   577                         *(myOffsets++) = offsetNum;
   578                         *(myTarget++) = *tempPtr;
   579                     }
   580                     else
   581                     {
   582                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
   583                         *err = U_BUFFER_OVERFLOW_ERROR;
   584                     }
   585                 }
   586             }
   587             offsetNum = nextSourceIndex;
   588         }
   589     }
   591     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   592     {
   593         *err = U_BUFFER_OVERFLOW_ERROR;
   594     }
   596     args->target = (char *) myTarget;
   597     args->source = mySource;
   598     args->offsets = myOffsets;
   599 }
   601 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
   602                                                UErrorCode *err) {
   603     UConverter *cnv;
   604     const uint8_t *sourceInitial;
   605     const uint8_t *source;
   606     uint16_t extraBytesToWrite;
   607     uint8_t myByte;
   608     UChar32 ch;
   609     int8_t i, isLegalSequence;
   611     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
   613     cnv = args->converter;
   614     sourceInitial = source = (const uint8_t *)args->source;
   615     if (source >= (const uint8_t *)args->sourceLimit)
   616     {
   617         /* no input */
   618         *err = U_INDEX_OUTOFBOUNDS_ERROR;
   619         return 0xffff;
   620     }
   622     myByte = (uint8_t)*(source++);
   623     if (myByte < 0x80)
   624     {
   625         args->source = (const char *)source;
   626         return (UChar32)myByte;
   627     }
   629     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
   630     if (extraBytesToWrite == 0) {
   631         cnv->toUBytes[0] = myByte;
   632         cnv->toULength = 1;
   633         *err = U_ILLEGAL_CHAR_FOUND;
   634         args->source = (const char *)source;
   635         return 0xffff;
   636     }
   638     /*The byte sequence is longer than the buffer area passed*/
   639     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
   640     {
   641         /* check if all of the remaining bytes are trail bytes */
   642         cnv->toUBytes[0] = myByte;
   643         i = 1;
   644         *err = U_TRUNCATED_CHAR_FOUND;
   645         while(source < (const uint8_t *)args->sourceLimit) {
   646             if(U8_IS_TRAIL(myByte = *source)) {
   647                 cnv->toUBytes[i++] = myByte;
   648                 ++source;
   649             } else {
   650                 /* error even before we run out of input */
   651                 *err = U_ILLEGAL_CHAR_FOUND;
   652                 break;
   653             }
   654         }
   655         cnv->toULength = i;
   656         args->source = (const char *)source;
   657         return 0xffff;
   658     }
   660     isLegalSequence = 1;
   661     ch = myByte << 6;
   662     switch(extraBytesToWrite)
   663     {     
   664       /* note: code falls through cases! (sic)*/ 
   665     case 6:
   666         ch += (myByte = *source);
   667         ch <<= 6;
   668         if (!U8_IS_TRAIL(myByte))
   669         {
   670             isLegalSequence = 0;
   671             break;
   672         }
   673         ++source;
   674     case 5: /*fall through*/
   675         ch += (myByte = *source);
   676         ch <<= 6;
   677         if (!U8_IS_TRAIL(myByte))
   678         {
   679             isLegalSequence = 0;
   680             break;
   681         }
   682         ++source;
   683     case 4: /*fall through*/
   684         ch += (myByte = *source);
   685         ch <<= 6;
   686         if (!U8_IS_TRAIL(myByte))
   687         {
   688             isLegalSequence = 0;
   689             break;
   690         }
   691         ++source;
   692     case 3: /*fall through*/
   693         ch += (myByte = *source);
   694         ch <<= 6;
   695         if (!U8_IS_TRAIL(myByte))
   696         {
   697             isLegalSequence = 0;
   698             break;
   699         }
   700         ++source;
   701     case 2: /*fall through*/
   702         ch += (myByte = *source);
   703         if (!U8_IS_TRAIL(myByte))
   704         {
   705             isLegalSequence = 0;
   706             break;
   707         }
   708         ++source;
   709     };
   710     ch -= offsetsFromUTF8[extraBytesToWrite];
   711     args->source = (const char *)source;
   713     /*
   714      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
   715      * - use only trail bytes after a lead byte (checked above)
   716      * - use the right number of trail bytes for a given lead byte
   717      * - encode a code point <= U+10ffff
   718      * - use the fewest possible number of bytes for their code points
   719      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
   720      *
   721      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
   722      * There are no irregular sequences any more.
   723      */
   724     if (isLegalSequence &&
   725         (uint32_t)ch <= MAXIMUM_UTF &&
   726         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
   727         !U_IS_SURROGATE(ch)
   728     ) {
   729         return ch; /* return the code point */
   730     }
   732     for(i = 0; sourceInitial < source; ++i) {
   733         cnv->toUBytes[i] = *sourceInitial++;
   734     }
   735     cnv->toULength = i;
   736     *err = U_ILLEGAL_CHAR_FOUND;
   737     return 0xffff;
   738 } 
   740 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
   742 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
   743 static const UChar32
   744 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
   746 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
   747 static const UChar32
   748 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
   750 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
       /*
        * Copies bytes from pToUArgs->source to pFromUArgs->target one whole UTF-8
        * byte sequence at a time and returns at the first sequence it does not accept.
        *
        * pFromUArgs  supplies target and targetLimit; target is advanced on return.
        * pToUArgs    supplies source, sourceLimit and the converter whose
        *             toUnicodeStatus/toULength/mode/toUBytes fields carry a partial
        *             sequence from one call to the next; source is advanced on return.
        * pErrorCode  is set to
        *             U_USING_DEFAULT_WARNING  when a case is left to the standard
        *                                      (pivoting) converter,
        *             U_ILLEGAL_CHAR_FOUND     when a rejected sequence has been stored
        *                                      in toUBytes/toULength,
        *             U_BUFFER_OVERFLOW_ERROR  when input remains and the target is full.
        */
   751 static void
   752 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   753                   UConverterToUnicodeArgs *pToUArgs,
   754                   UErrorCode *pErrorCode) {
   755     UConverter *utf8;
   756     const uint8_t *source, *sourceLimit;
   757     uint8_t *target;
   758     int32_t targetCapacity;
   759     int32_t count;
           /*
            * oldToULength: bytes of the current sequence that came from a previous buffer
            * toULength:    bytes of the current sequence collected so far
            * toULimit:     total length that the lead byte calls for
            */
   761     int8_t oldToULength, toULength, toULimit;
   763     UChar32 c;
   764     uint8_t b, t1, t2;
   766     /* set up the local pointers */
   767     utf8=pToUArgs->converter;
   768     source=(uint8_t *)pToUArgs->source;
   769     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   770     target=(uint8_t *)pFromUArgs->target;
   771     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   773     /* get the converter state from the UTF-8 UConverter */
           /* a nonzero toUnicodeStatus means that a partial sequence was saved earlier */
   774     c=(UChar32)utf8->toUnicodeStatus;
   775     if(c!=0) {
   776         toULength=oldToULength=utf8->toULength;
   777         toULimit=(int8_t)utf8->mode;
   778     } else {
   779         toULength=oldToULength=toULimit=0;
   780     }
           /* available input, counting the bytes already held in toUBytes */
   782     count=(int32_t)(sourceLimit-source)+oldToULength;
   783     if(count<toULimit) {
   784         /*
   785          * Not enough input to complete the partial character.
   786          * Jump to moreBytes below - it will not output to target.
   787          */
   788     } else if(targetCapacity<toULimit) {
   789         /*
   790          * Not enough target capacity to output the partial character.
   791          * Let the standard converter handle this.
   792          */
   793         *pErrorCode=U_USING_DEFAULT_WARNING;
   794         return;
   795     } else {
   796         /*
   797          * Use a single counter for source and target, counting the minimum of
   798          * the source length and the target capacity.
   799          * As a result, the source length is checked only once per multi-byte
   800          * character instead of twice.
   801          *
   802          * Make sure that the last byte sequence is complete, or else
   803          * stop just before it.
   804          * (The longest legal byte sequence has 3 trail bytes.)
   805          * Count oldToULength (number of source bytes from a previous buffer)
   806          * into the source length but reduce the source index by toULimit
   807          * while going back over trail bytes in order to not go back into
   808          * the bytes that will be read for finishing a partial
   809          * sequence from the previous buffer.
   810          * Let the standard converter handle edge cases.
   811          */
   812         int32_t i;
   814         if(count>targetCapacity) {
   815             count=targetCapacity;
   816         }
               /* walk backwards from the end of the counted range over at most 3 trail bytes */
   818         i=0;
   819         while(i<3 && i<(count-toULimit)) {
   820             b=source[count-oldToULength-i-1];
   821             if(U8_IS_TRAIL(b)) {
   822                 ++i;
   823             } else {
   824                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
   825                     /* stop converting before the lead byte if there are not enough trail bytes for it */
   826                     count-=i+1;
   827                 }
   828                 break;
   829             }
   830         }
   831     }
           /*
            * Resume a saved partial sequence: clear the saved state and enter the
            * loop body at moreBytes with c/toULength/toULimit/oldToULength as loaded above.
            */
   833     if(c!=0) {
   834         utf8->toUnicodeStatus=0;
   835         utf8->toULength=0;
   836         goto moreBytes;
   837         /* See note in ucnv_SBCSFromUTF8() about this goto. */
   838     }
   840     /* conversion loop */
   841     while(count>0) {
   842         b=*source++;
               /* bytes 0x00..0x7f have the top bit clear and so are not negative as int8_t */
   843         if((int8_t)b>=0) {
   844             /* convert ASCII */
   845             *target++=b;
   846             --count;
   847             continue;
   848         } else {
                   /*
                    * The three inline cases below read source[0]/source[1] without testing
                    * sourceLimit; they rely on count having been reduced above so that the
                    * last counted sequence is complete.
                    * A sequence that fails an inline test falls through to the general code.
                    */
   849             if(b>0xe0) {
                       /* for lead byte 0xed, t1 is capped at 0x9f, which keeps the result at or below U+D7FF */
   850                 if( /* handle U+1000..U+D7FF inline */
   851                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
   852                                                (b==0xed && (t1 <= 0x9f))) &&
   853                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
   854                 ) {
   855                     source+=2;
   856                     *target++=b;
   857                     *target++=t1;
   858                     *target++=t2;
   859                     count-=3;
   860                     continue;
   861                 }
   862             } else if(b<0xe0) {
                       /* lead bytes below 0xc2 are excluded here; 0xc2 0x80 is the first pair that yields U+0080 */
   863                 if( /* handle U+0080..U+07FF inline */
   864                     b>=0xc2 &&
   865                     (t1=*source) >= 0x80 && t1 <= 0xbf
   866                 ) {
   867                     ++source;
   868                     *target++=b;
   869                     *target++=t1;
   870                     count-=2;
   871                     continue;
   872                 }
   873             } else if(b==0xe0) {
                       /* with lead byte 0xe0 the first trail byte must be at least 0xa0 to reach U+0800 */
   874                 if( /* handle U+0800..U+0FFF inline */
   875                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
   876                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
   877                 ) {
   878                     source+=2;
   879                     *target++=b;
   880                     *target++=t1;
   881                     *target++=t2;
   882                     count-=3;
   883                     continue;
   884                 }
   885             }
   887             /* handle "complicated" and error cases, and continuing partial characters */
   888             oldToULength=0;
   889             toULength=1;
   890             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   891             c=b;
   892 moreBytes:
                   /* collect trail bytes into c until toULimit bytes are gathered, a non-trail byte appears, or input ends */
   893             while(toULength<toULimit) {
   894                 if(source<sourceLimit) {
   895                     b=*source;
   896                     if(U8_IS_TRAIL(b)) {
   897                         ++source;
   898                         ++toULength;
   899                         c=(c<<6)+b;
   900                     } else {
   901                         break; /* sequence too short, stop with toULength<toULimit */
   902                     }
   903                 } else {
   904                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                           /* rewind to the first byte of this sequence that was read from the current buffer */
   905                     source-=(toULength-oldToULength);
   906                     while(oldToULength<toULength) {
   907                         utf8->toUBytes[oldToULength++]=*source++;
   908                     }
   909                     utf8->toUnicodeStatus=c;
   910                     utf8->toULength=toULength;
   911                     utf8->mode=toULimit;
   912                     pToUArgs->source=(char *)source;
   913                     pFromUArgs->target=(char *)target;
   914                     return;
   915                 }
   916             }
                   /*
                    * Validate the collected sequence. Each accepted branch subtracts the
                    * utf8_offsets[] entry from c as a side effect of its test.
                    */
   918             if( toULength==toULimit &&      /* consumed all trail bytes */
   919                 (toULength==3 || toULength==2) &&             /* BMP */
   920                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   921                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   922             ) {
   923                 /* legal byte sequence for BMP code point */
   924             } else if(
   925                 toULength==toULimit && toULength==4 &&
   926                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   927             ) {
   928                 /* legal byte sequence for supplementary code point */
   929             } else {
   930                 /* error handling: illegal UTF-8 byte sequence */
                       /* append the bytes read from the current buffer to toUBytes, after any bytes already held there */
   931                 source-=(toULength-oldToULength);
   932                 while(oldToULength<toULength) {
   933                     utf8->toUBytes[oldToULength++]=*source++;
   934                 }
   935                 utf8->toULength=toULength;
   936                 pToUArgs->source=(char *)source;
   937                 pFromUArgs->target=(char *)target;
   938                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   939                 return;
   940             }
   942             /* copy the legal byte sequence to the target */
   943             {
   944                 int8_t i;
                       /* first the bytes held over in toUBytes, then the bytes from the current buffer */
   946                 for(i=0; i<oldToULength; ++i) {
   947                     *target++=utf8->toUBytes[i];
   948                 }
   949                 source-=(toULength-oldToULength);
   950                 for(; i<toULength; ++i) {
   951                     *target++=*source++;
   952                 }
   953                 count-=toULength;
   954             }
   955         }
   956     }
           /*
            * The loop ended with input left over: report overflow if the target is
            * full, save or reject a sequence that is cut off by sourceLimit, or
            * otherwise hand the rest to the fallback.
            */
   958     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
   959         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
   960             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   961         } else {
   962             b=*source;
   963             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   964             if(toULimit>(sourceLimit-source)) {
   965                 /* collect a truncated byte sequence */
   966                 toULength=0;
   967                 c=b;
   968                 for(;;) {
   969                     utf8->toUBytes[toULength++]=b;
   970                     if(++source==sourceLimit) {
   971                         /* partial byte sequence at end of source */
   972                         utf8->toUnicodeStatus=c;
   973                         utf8->toULength=toULength;
   974                         utf8->mode=toULimit;
   975                         break;
   976                     } else if(!U8_IS_TRAIL(b=*source)) {
   977                         /* lead byte in trail byte position */
   978                         utf8->toULength=toULength;
   979                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   980                         break;
   981                     }
   982                     c=(c<<6)+b;
   983                 }
   984             } else {
   985                 /* partial-sequence target overflow: fall back to the pivoting implementation */
   986                 *pErrorCode=U_USING_DEFAULT_WARNING;
   987             }
   988         }
   989     }
   991     /* write back the updated pointers */
   992     pToUArgs->source=(char *)source;
   993     pFromUArgs->target=(char *)target;
   994 }
   996 /* UTF-8 converter data ----------------------------------------------------- */
   998 static const UConverterImpl _UTF8Impl={
   999     UCNV_UTF8,
  1001     NULL,
  1002     NULL,
  1004     NULL,
  1005     NULL,
  1006     NULL,
  1008     ucnv_toUnicode_UTF8,
  1009     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
  1010     ucnv_fromUnicode_UTF8,
  1011     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  1012     ucnv_getNextUChar_UTF8,
  1014     NULL,
  1015     NULL,
  1016     NULL,
  1017     NULL,
  1018     ucnv_getNonSurrogateUnicodeSet,
  1020     ucnv_UTF8FromUTF8,
  1021     ucnv_UTF8FromUTF8
  1022 };
  1024 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
  1025 static const UConverterStaticData _UTF8StaticData={
  1026     sizeof(UConverterStaticData),
  1027     "UTF-8",
  1028     1208, UCNV_IBM, UCNV_UTF8,
  1029     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
  1030     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
  1031     0,
  1032     0,
  1033     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1034 };
  1037 const UConverterSharedData _UTF8Data={
  1038     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1039     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
  1041 };
  1043 /* CESU-8 converter data ---------------------------------------------------- */
  1045 static const UConverterImpl _CESU8Impl={
  1046     UCNV_CESU8,
  1048     NULL,
  1049     NULL,
  1051     NULL,
  1052     NULL,
  1053     NULL,
  1055     ucnv_toUnicode_UTF8,
  1056     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
  1057     ucnv_fromUnicode_UTF8,
  1058     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  1059     NULL,
  1061     NULL,
  1062     NULL,
  1063     NULL,
  1064     NULL,
  1065     ucnv_getCompleteUnicodeSet
  1066 };
  1068 static const UConverterStaticData _CESU8StaticData={
  1069     sizeof(UConverterStaticData),
  1070     "CESU-8",
  1071     9400, /* CCSID for CESU-8 */
  1072     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
  1073     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
  1074     0,
  1075     0,
  1076     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1077 };
  1080 const UConverterSharedData _CESU8Data={
  1081     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1082     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
  1084 };
  1086 #endif

mercurial