The Tor Browser: intl/icu/source/common/ucnv

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 2002-2011, International Business Machines

     4 *   Corporation and others.  All Rights Reserved.

     5 **********************************************************************

     6 *   file name:  ucnv_u32.c

     7 *   encoding:   US-ASCII

     8 *   tab size:   8 (not used)

     9 *   indentation:4

    10 *

    11 *   created on: 2002jul01

    12 *   created by: Markus W. Scherer

    13 *

    14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.

    15 */

    17 #include "unicode/utypes.h"

    19 #if !UCONFIG_NO_CONVERSION

    21 #include "unicode/ucnv.h"

    22 #include "unicode/utf.h"

    23 #include "ucnv_bld.h"

    24 #include "ucnv_cnv.h"

    25 #include "cmemory.h"

    /* Largest code point the toUnicode functions write as a single 16-bit unit. */
    27 #define MAXIMUM_UCS2            0x0000FFFF

    /* Largest value the toUnicode/getNextUChar functions accept as a code point. */
    28 #define MAXIMUM_UTF             0x0010FFFF

    /* Shift applied to the lead-surrogate payload when combining a surrogate pair. */
    29 #define HALF_SHIFT              10

    /* 0x10000; appears in the derivation of SURROGATE_LOW_BASE below. */
    30 #define HALF_BASE               0x0010000

    /* 10-bit mask (0x3FF). NOTE(review): no use in the part of the file reviewed. */
    31 #define HALF_MASK               0x3FF

    /* Subtracted from a lead surrogate before it is shifted by HALF_SHIFT. */
    32 #define SURROGATE_HIGH_START    0xD800

    /* 0xDC00; appears in the derivation of SURROGATE_LOW_BASE below. */
    33 #define SURROGATE_LOW_START     0xDC00

    35 /* -SURROGATE_LOW_START + HALF_BASE, i.e. -0xDC00 + 0x10000 = 9216; added when combining a surrogate pair */

    36 #define SURROGATE_LOW_BASE      9216

    /*
     * Value of converter->fromUnicodeStatus that makes the fromUnicode functions
     * in this file emit a byte order mark before any other output; they reset
     * the status to 0 once the BOM has been handed to ucnv_fromUWriteBytes().
     */
    38 enum {

    39     UCNV_NEED_TO_WRITE_BOM=1

    40 };

    42 /* UTF-32BE ----------------------------------------------------------------- */

    44 static void

    45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,

    46                                 UErrorCode * err)

    47 {

    48     const unsigned char *mySource = (unsigned char *) args->source;

    49     UChar *myTarget = args->target;

    50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

    51     const UChar *targetLimit = args->targetLimit;

    52     unsigned char *toUBytes = args->converter->toUBytes;

    53     uint32_t ch, i;

    55     /* Restore state of current sequence */

    56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {

    57         i = args->converter->toULength;       /* restore # of bytes consumed */

    58         args->converter->toULength = 0;

    60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/

    61         args->converter->toUnicodeStatus = 0;

    62         goto morebytes;

    63     }

    65     while (mySource < sourceLimit && myTarget < targetLimit) {

    66         i = 0;

    67         ch = 0;

    68 morebytes:

    69         while (i < sizeof(uint32_t)) {

    70             if (mySource < sourceLimit) {

    71                 ch = (ch << 8) | (uint8_t)(*mySource);

    72                 toUBytes[i++] = (char) *(mySource++);

    73             }

    74             else {

    75                 /* stores a partially calculated target*/

    76                 /* + 1 to make 0 a valid character */

    77                 args->converter->toUnicodeStatus = ch + 1;

    78                 args->converter->toULength = (int8_t) i;

    79                 goto donefornow;

    80             }

    81         }

    83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

    84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

    85             if (ch <= MAXIMUM_UCS2)

    86             {

    87                 /* fits in 16 bits */

    88                 *(myTarget++) = (UChar) ch;

    89             }

    90             else {

    91                 /* write out the surrogates */

    92                 *(myTarget++) = U16_LEAD(ch);

    93                 ch = U16_TRAIL(ch);

    94                 if (myTarget < targetLimit) {

    95                     *(myTarget++) = (UChar)ch;

    96                 }

    97                 else {

    98                     /* Put in overflow buffer (not handled here) */

    99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;

   100                     args->converter->UCharErrorBufferLength = 1;

   101                     *err = U_BUFFER_OVERFLOW_ERROR;

   102                     break;

   103                 }

   104             }

   105         }

   106         else {

   107             args->converter->toULength = (int8_t)i;

   108             *err = U_ILLEGAL_CHAR_FOUND;

   109             break;

   110         }

   111     }

   113 donefornow:

   114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

   115         /* End of target buffer */

   116         *err = U_BUFFER_OVERFLOW_ERROR;

   117     }

   119     args->target = myTarget;

   120     args->source = (const char *) mySource;

   121 }

   123 static void

   124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,

   125                                              UErrorCode * err)

   126 {

   127     const unsigned char *mySource = (unsigned char *) args->source;

   128     UChar *myTarget = args->target;

   129     int32_t *myOffsets = args->offsets;

   130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

   131     const UChar *targetLimit = args->targetLimit;

   132     unsigned char *toUBytes = args->converter->toUBytes;

   133     uint32_t ch, i;

   134     int32_t offsetNum = 0;

   136     /* Restore state of current sequence */

   137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {

   138         i = args->converter->toULength;       /* restore # of bytes consumed */

   139         args->converter->toULength = 0;

   141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/

   142         args->converter->toUnicodeStatus = 0;

   143         goto morebytes;

   144     }

   146     while (mySource < sourceLimit && myTarget < targetLimit) {

   147         i = 0;

   148         ch = 0;

   149 morebytes:

   150         while (i < sizeof(uint32_t)) {

   151             if (mySource < sourceLimit) {

   152                 ch = (ch << 8) | (uint8_t)(*mySource);

   153                 toUBytes[i++] = (char) *(mySource++);

   154             }

   155             else {

   156                 /* stores a partially calculated target*/

   157                 /* + 1 to make 0 a valid character */

   158                 args->converter->toUnicodeStatus = ch + 1;

   159                 args->converter->toULength = (int8_t) i;

   160                 goto donefornow;

   161             }

   162         }

   164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

   165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

   166             if (ch <= MAXIMUM_UCS2) {

   167                 /* fits in 16 bits */

   168                 *(myTarget++) = (UChar) ch;

   169                 *(myOffsets++) = offsetNum;

   170             }

   171             else {

   172                 /* write out the surrogates */

   173                 *(myTarget++) = U16_LEAD(ch);

   174                 *myOffsets++ = offsetNum;

   175                 ch = U16_TRAIL(ch);

   176                 if (myTarget < targetLimit)

   177                 {

   178                     *(myTarget++) = (UChar)ch;

   179                     *(myOffsets++) = offsetNum;

   180                 }

   181                 else {

   182                     /* Put in overflow buffer (not handled here) */

   183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;

   184                     args->converter->UCharErrorBufferLength = 1;

   185                     *err = U_BUFFER_OVERFLOW_ERROR;

   186                     break;

   187                 }

   188             }

   189         }

   190         else {

   191             args->converter->toULength = (int8_t)i;

   192             *err = U_ILLEGAL_CHAR_FOUND;

   193             break;

   194         }

   195         offsetNum += i;

   196     }

   198 donefornow:

   199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   200     {

   201         /* End of target buffer */

   202         *err = U_BUFFER_OVERFLOW_ERROR;

   203     }

   205     args->target = myTarget;

   206     args->source = (const char *) mySource;

   207     args->offsets = myOffsets;

   208 }

   210 static void

   211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,

   212                                   UErrorCode * err)

   213 {

   214     const UChar *mySource = args->source;

   215     unsigned char *myTarget;

   216     const UChar *sourceLimit = args->sourceLimit;

   217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;

   218     UChar32 ch, ch2;

   219     unsigned int indexToWrite;

   220     unsigned char temp[sizeof(uint32_t)];

   222     if(mySource >= sourceLimit) {

   223         /* no input, nothing to do */

   224         return;

   225     }

   227     /* write the BOM if necessary */

   228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

   229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };

   230         ucnv_fromUWriteBytes(args->converter,

   231                              bom, 4,

   232                              &args->target, args->targetLimit,

   233                              &args->offsets, -1,

   234                              err);

   235         args->converter->fromUnicodeStatus=0;

   236     }

   238     myTarget = (unsigned char *) args->target;

   239     temp[0] = 0;

   241     if (args->converter->fromUChar32) {

   242         ch = args->converter->fromUChar32;

   243         args->converter->fromUChar32 = 0;

   244         goto lowsurogate;

   245     }

   247     while (mySource < sourceLimit && myTarget < targetLimit) {

   248         ch = *(mySource++);

   250         if (U_IS_SURROGATE(ch)) {

   251             if (U_IS_LEAD(ch)) {

   252 lowsurogate:

   253                 if (mySource < sourceLimit) {

   254                     ch2 = *mySource;

   255                     if (U_IS_TRAIL(ch2)) {

   256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

   257                         mySource++;

   258                     }

   259                     else {

   260                         /* this is an unmatched trail code unit (2nd surrogate) */

   261                         /* callback(illegal) */

   262                         args->converter->fromUChar32 = ch;

   263                         *err = U_ILLEGAL_CHAR_FOUND;

   264                         break;

   265                     }

   266                 }

   267                 else {

   268                     /* ran out of source */

   269                     args->converter->fromUChar32 = ch;

   270                     if (args->flush) {

   271                         /* this is an unmatched trail code unit (2nd surrogate) */

   272                         /* callback(illegal) */

   273                         *err = U_ILLEGAL_CHAR_FOUND;

   274                     }

   275                     break;

   276                 }

   277             }

   278             else {

   279                 /* this is an unmatched trail code unit (2nd surrogate) */

   280                 /* callback(illegal) */

   281                 args->converter->fromUChar32 = ch;

   282                 *err = U_ILLEGAL_CHAR_FOUND;

   283                 break;

   284             }

   285         }

   287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */

   288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);

   289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */

   290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

   292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {

   293             if (myTarget < targetLimit) {

   294                 *(myTarget++) = temp[indexToWrite];

   295             }

   296             else {

   297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];

   298                 *err = U_BUFFER_OVERFLOW_ERROR;

   299             }

   300         }

   301     }

   303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

   304         *err = U_BUFFER_OVERFLOW_ERROR;

   305     }

   307     args->target = (char *) myTarget;

   308     args->source = mySource;

   309 }

   311 static void

   312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,

   313                                                UErrorCode * err)

   314 {

   315     const UChar *mySource = args->source;

   316     unsigned char *myTarget;

   317     int32_t *myOffsets;

   318     const UChar *sourceLimit = args->sourceLimit;

   319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;

   320     UChar32 ch, ch2;

   321     int32_t offsetNum = 0;

   322     unsigned int indexToWrite;

   323     unsigned char temp[sizeof(uint32_t)];

   325     if(mySource >= sourceLimit) {

   326         /* no input, nothing to do */

   327         return;

   328     }

   330     /* write the BOM if necessary */

   331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

   332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };

   333         ucnv_fromUWriteBytes(args->converter,

   334                              bom, 4,

   335                              &args->target, args->targetLimit,

   336                              &args->offsets, -1,

   337                              err);

   338         args->converter->fromUnicodeStatus=0;

   339     }

   341     myTarget = (unsigned char *) args->target;

   342     myOffsets = args->offsets;

   343     temp[0] = 0;

   345     if (args->converter->fromUChar32) {

   346         ch = args->converter->fromUChar32;

   347         args->converter->fromUChar32 = 0;

   348         goto lowsurogate;

   349     }

   351     while (mySource < sourceLimit && myTarget < targetLimit) {

   352         ch = *(mySource++);

   354         if (U_IS_SURROGATE(ch)) {

   355             if (U_IS_LEAD(ch)) {

   356 lowsurogate:

   357                 if (mySource < sourceLimit) {

   358                     ch2 = *mySource;

   359                     if (U_IS_TRAIL(ch2)) {

   360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

   361                         mySource++;

   362                     }

   363                     else {

   364                         /* this is an unmatched trail code unit (2nd surrogate) */

   365                         /* callback(illegal) */

   366                         args->converter->fromUChar32 = ch;

   367                         *err = U_ILLEGAL_CHAR_FOUND;

   368                         break;

   369                     }

   370                 }

   371                 else {

   372                     /* ran out of source */

   373                     args->converter->fromUChar32 = ch;

   374                     if (args->flush) {

   375                         /* this is an unmatched trail code unit (2nd surrogate) */

   376                         /* callback(illegal) */

   377                         *err = U_ILLEGAL_CHAR_FOUND;

   378                     }

   379                     break;

   380                 }

   381             }

   382             else {

   383                 /* this is an unmatched trail code unit (2nd surrogate) */

   384                 /* callback(illegal) */

   385                 args->converter->fromUChar32 = ch;

   386                 *err = U_ILLEGAL_CHAR_FOUND;

   387                 break;

   388             }

   389         }

   391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */

   392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);

   393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */

   394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

   396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {

   397             if (myTarget < targetLimit) {

   398                 *(myTarget++) = temp[indexToWrite];

   399                 *(myOffsets++) = offsetNum;

   400             }

   401             else {

   402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];

   403                 *err = U_BUFFER_OVERFLOW_ERROR;

   404             }

   405         }

   406         offsetNum = offsetNum + 1 + (temp[1] != 0);

   407     }

   409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

   410         *err = U_BUFFER_OVERFLOW_ERROR;

   411     }

   413     args->target = (char *) myTarget;

   414     args->source = mySource;

   415     args->offsets = myOffsets;

   416 }

   418 static UChar32

   419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,

   420                                    UErrorCode* err)

   421 {

   422     const uint8_t *mySource;

   423     UChar32 myUChar;

   424     int32_t length;

   426     mySource = (const uint8_t *)args->source;

   427     if (mySource >= (const uint8_t *)args->sourceLimit)

   428     {

   429         /* no input */

   430         *err = U_INDEX_OUTOFBOUNDS_ERROR;

   431         return 0xffff;

   432     }

   434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);

   435     if (length < 4)

   436     {

   437         /* got a partial character */

   438         uprv_memcpy(args->converter->toUBytes, mySource, length);

   439         args->converter->toULength = (int8_t)length;

   440         args->source = (const char *)(mySource + length);

   441         *err = U_TRUNCATED_CHAR_FOUND;

   442         return 0xffff;

   443     }

   445     /* Don't even try to do a direct cast because the value may be on an odd address. */

   446     myUChar = ((UChar32)mySource[0] << 24)

   447             | ((UChar32)mySource[1] << 16)

   448             | ((UChar32)mySource[2] << 8)

   449             | ((UChar32)mySource[3]);

   451     args->source = (const char *)(mySource + 4);

   452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {

   453         return myUChar;

   454     }

   456     uprv_memcpy(args->converter->toUBytes, mySource, 4);

   457     args->converter->toULength = 4;

   459     *err = U_ILLEGAL_CHAR_FOUND;

   460     return 0xffff;

   461 }

   /*
    * Implementation table for the UTF-32BE converter.
    *
    * This is a positional initializer: the meaning of each slot is fixed by
    * the declaration of UConverterImpl, which is not part of this file.
    * The five function entries below are the UTF-32BE conversion routines
    * defined earlier in this file; slots set to NULL provide no function here.
    * NOTE(review): slot order must match UConverterImpl -- confirm against its
    * declaration before adding or removing entries.
    */
   463 static const UConverterImpl _UTF32BEImpl = {

   464     UCNV_UTF32_BigEndian,

   466     NULL,

   467     NULL,

   469     NULL,

   470     NULL,

   471     NULL,

   473     T_UConverter_toUnicode_UTF32_BE,

   474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,

   475     T_UConverter_fromUnicode_UTF32_BE,

   476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,

   477     T_UConverter_getNextUChar_UTF32_BE,

   479     NULL,

   480     NULL,

   481     NULL,

   482     NULL,

   483     ucnv_getNonSurrogateUnicodeSet

   484 };

   486 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
   /*
    * Static description of the UTF-32BE converter: its name "UTF-32BE", the
    * CCSID 1232 mentioned above, and the converter type UCNV_UTF32_BigEndian.
    * This is a positional initializer; field meanings come from the
    * declaration of UConverterStaticData, which is not part of this file.
    * NOTE(review): the pair "4, 4" is presumably the minimum/maximum bytes per
    * character, and { 0, 0, 0xff, 0xfd } with length 4 presumably the
    * substitution character (those bytes are U+FFFD in UTF-32BE) -- confirm
    * against the struct declaration.
    */

   487 static const UConverterStaticData _UTF32BEStaticData = {

   488     sizeof(UConverterStaticData),

   489     "UTF-32BE",

   490     1232,

   491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,

   492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,

   493     0,

   494     0,

   495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

   496 };

   /*
    * Shared-data record for the UTF-32BE converter. It is not declared static,
    * so it is visible to other translation units. It ties together the static
    * description (_UTF32BEStaticData) and the implementation table
    * (_UTF32BEImpl) defined above.
    * NOTE(review): the remaining positional fields (~0, NULL, NULL, FALSE, 0)
    * are defined by UConverterSharedData, which is not part of this file --
    * confirm their meaning there.
    */
   498 const UConverterSharedData _UTF32BEData = {

   499     sizeof(UConverterSharedData), ~((uint32_t) 0),

   500     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,

   501     0

   502 };

   504 /* UTF-32LE ---------------------------------------------------------- */

   506 static void

   507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,

   508                                 UErrorCode * err)

   509 {

   510     const unsigned char *mySource = (unsigned char *) args->source;

   511     UChar *myTarget = args->target;

   512     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

   513     const UChar *targetLimit = args->targetLimit;

   514     unsigned char *toUBytes = args->converter->toUBytes;

   515     uint32_t ch, i;

   517     /* Restore state of current sequence */

   518     if (args->converter->toUnicodeStatus && myTarget < targetLimit)

   519     {

   520         i = args->converter->toULength;       /* restore # of bytes consumed */

   521         args->converter->toULength = 0;

   523         /* Stores the previously calculated ch from a previous call*/

   524         ch = args->converter->toUnicodeStatus - 1;

   525         args->converter->toUnicodeStatus = 0;

   526         goto morebytes;

   527     }

   529     while (mySource < sourceLimit && myTarget < targetLimit)

   530     {

   531         i = 0;

   532         ch = 0;

   533 morebytes:

   534         while (i < sizeof(uint32_t))

   535         {

   536             if (mySource < sourceLimit)

   537             {

   538                 ch |= ((uint8_t)(*mySource)) << (i * 8);

   539                 toUBytes[i++] = (char) *(mySource++);

   540             }

   541             else

   542             {

   543                 /* stores a partially calculated target*/

   544                 /* + 1 to make 0 a valid character */

   545                 args->converter->toUnicodeStatus = ch + 1;

   546                 args->converter->toULength = (int8_t) i;

   547                 goto donefornow;

   548             }

   549         }

   551         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

   552             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

   553             if (ch <= MAXIMUM_UCS2) {

   554                 /* fits in 16 bits */

   555                 *(myTarget++) = (UChar) ch;

   556             }

   557             else {

   558                 /* write out the surrogates */

   559                 *(myTarget++) = U16_LEAD(ch);

   560                 ch = U16_TRAIL(ch);

   561                 if (myTarget < targetLimit) {

   562                     *(myTarget++) = (UChar)ch;

   563                 }

   564                 else {

   565                     /* Put in overflow buffer (not handled here) */

   566                     args->converter->UCharErrorBuffer[0] = (UChar) ch;

   567                     args->converter->UCharErrorBufferLength = 1;

   568                     *err = U_BUFFER_OVERFLOW_ERROR;

   569                     break;

   570                 }

   571             }

   572         }

   573         else {

   574             args->converter->toULength = (int8_t)i;

   575             *err = U_ILLEGAL_CHAR_FOUND;

   576             break;

   577         }

   578     }

   580 donefornow:

   581     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   582     {

   583         /* End of target buffer */

   584         *err = U_BUFFER_OVERFLOW_ERROR;

   585     }

   587     args->target = myTarget;

   588     args->source = (const char *) mySource;

   589 }

   591 static void

   592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,

   593                                              UErrorCode * err)

   594 {

   595     const unsigned char *mySource = (unsigned char *) args->source;

   596     UChar *myTarget = args->target;

   597     int32_t *myOffsets = args->offsets;

   598     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;

   599     const UChar *targetLimit = args->targetLimit;

   600     unsigned char *toUBytes = args->converter->toUBytes;

   601     uint32_t ch, i;

   602     int32_t offsetNum = 0;

   604     /* Restore state of current sequence */

   605     if (args->converter->toUnicodeStatus && myTarget < targetLimit)

   606     {

   607         i = args->converter->toULength;       /* restore # of bytes consumed */

   608         args->converter->toULength = 0;

   610         /* Stores the previously calculated ch from a previous call*/

   611         ch = args->converter->toUnicodeStatus - 1;

   612         args->converter->toUnicodeStatus = 0;

   613         goto morebytes;

   614     }

   616     while (mySource < sourceLimit && myTarget < targetLimit)

   617     {

   618         i = 0;

   619         ch = 0;

   620 morebytes:

   621         while (i < sizeof(uint32_t))

   622         {

   623             if (mySource < sourceLimit)

   624             {

   625                 ch |= ((uint8_t)(*mySource)) << (i * 8);

   626                 toUBytes[i++] = (char) *(mySource++);

   627             }

   628             else

   629             {

   630                 /* stores a partially calculated target*/

   631                 /* + 1 to make 0 a valid character */

   632                 args->converter->toUnicodeStatus = ch + 1;

   633                 args->converter->toULength = (int8_t) i;

   634                 goto donefornow;

   635             }

   636         }

   638         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))

   639         {

   640             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

   641             if (ch <= MAXIMUM_UCS2)

   642             {

   643                 /* fits in 16 bits */

   644                 *(myTarget++) = (UChar) ch;

   645                 *(myOffsets++) = offsetNum;

   646             }

   647             else {

   648                 /* write out the surrogates */

   649                 *(myTarget++) = U16_LEAD(ch);

   650                 *(myOffsets++) = offsetNum;

   651                 ch = U16_TRAIL(ch);

   652                 if (myTarget < targetLimit)

   653                 {

   654                     *(myTarget++) = (UChar)ch;

   655                     *(myOffsets++) = offsetNum;

   656                 }

   657                 else

   658                 {

   659                     /* Put in overflow buffer (not handled here) */

   660                     args->converter->UCharErrorBuffer[0] = (UChar) ch;

   661                     args->converter->UCharErrorBufferLength = 1;

   662                     *err = U_BUFFER_OVERFLOW_ERROR;

   663                     break;

   664                 }

   665             }

   666         }

   667         else

   668         {

   669             args->converter->toULength = (int8_t)i;

   670             *err = U_ILLEGAL_CHAR_FOUND;

   671             break;

   672         }

   673         offsetNum += i;

   674     }

   676 donefornow:

   677     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   678     {

   679         /* End of target buffer */

   680         *err = U_BUFFER_OVERFLOW_ERROR;

   681     }

   683     args->target = myTarget;

   684     args->source = (const char *) mySource;

   685     args->offsets = myOffsets;

   686 }

   688 static void

   689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,

   690                                   UErrorCode * err)

   691 {

   692     const UChar *mySource = args->source;

   693     unsigned char *myTarget;

   694     const UChar *sourceLimit = args->sourceLimit;

   695     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;

   696     UChar32 ch, ch2;

   697     unsigned int indexToWrite;

   698     unsigned char temp[sizeof(uint32_t)];

   700     if(mySource >= sourceLimit) {

   701         /* no input, nothing to do */

   702         return;

   703     }

   705     /* write the BOM if necessary */

   706     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

   707         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };

   708         ucnv_fromUWriteBytes(args->converter,

   709                              bom, 4,

   710                              &args->target, args->targetLimit,

   711                              &args->offsets, -1,

   712                              err);

   713         args->converter->fromUnicodeStatus=0;

   714     }

   716     myTarget = (unsigned char *) args->target;

   717     temp[3] = 0;

   719     if (args->converter->fromUChar32)

   720     {

   721         ch = args->converter->fromUChar32;

   722         args->converter->fromUChar32 = 0;

   723         goto lowsurogate;

   724     }

   726     while (mySource < sourceLimit && myTarget < targetLimit)

   727     {

   728         ch = *(mySource++);

   730         if (U16_IS_SURROGATE(ch)) {

   731             if (U16_IS_LEAD(ch))

   732             {

   733 lowsurogate:

   734                 if (mySource < sourceLimit)

   735                 {

   736                     ch2 = *mySource;

   737                     if (U16_IS_TRAIL(ch2)) {

   738                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

   739                         mySource++;

   740                     }

   741                     else {

   742                         /* this is an unmatched trail code unit (2nd surrogate) */

   743                         /* callback(illegal) */

   744                         args->converter->fromUChar32 = ch;

   745                         *err = U_ILLEGAL_CHAR_FOUND;

   746                         break;

   747                     }

   748                 }

   749                 else {

   750                     /* ran out of source */

   751                     args->converter->fromUChar32 = ch;

   752                     if (args->flush) {

   753                         /* this is an unmatched trail code unit (2nd surrogate) */

   754                         /* callback(illegal) */

   755                         *err = U_ILLEGAL_CHAR_FOUND;

   756                     }

   757                     break;

   758                 }

   759             }

   760             else {

   761                 /* this is an unmatched trail code unit (2nd surrogate) */

   762                 /* callback(illegal) */

   763                 args->converter->fromUChar32 = ch;

   764                 *err = U_ILLEGAL_CHAR_FOUND;

   765                 break;

   766             }

   767         }

   769         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */

   770         temp[2] = (uint8_t) (ch >> 16 & 0x1F);

   771         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */

   772         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

   774         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)

   775         {

   776             if (myTarget < targetLimit)

   777             {

   778                 *(myTarget++) = temp[indexToWrite];

   779             }

   780             else

   781             {

   782                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];

   783                 *err = U_BUFFER_OVERFLOW_ERROR;

   784             }

   785         }

   786     }

   788     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   789     {

   790         *err = U_BUFFER_OVERFLOW_ERROR;

   791     }

   793     args->target = (char *) myTarget;

   794     args->source = mySource;

   795 }

   797 static void

   798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,

   799                                                UErrorCode * err)

   800 {

   801     const UChar *mySource = args->source;

   802     unsigned char *myTarget;

   803     int32_t *myOffsets;

   804     const UChar *sourceLimit = args->sourceLimit;

   805     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;

   806     UChar32 ch, ch2;

   807     unsigned int indexToWrite;

   808     unsigned char temp[sizeof(uint32_t)];

   809     int32_t offsetNum = 0;

   811     if(mySource >= sourceLimit) {

   812         /* no input, nothing to do */

   813         return;

   814     }

   816     /* write the BOM if necessary */

   817     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

   818         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };

   819         ucnv_fromUWriteBytes(args->converter,

   820                              bom, 4,

   821                              &args->target, args->targetLimit,

   822                              &args->offsets, -1,

   823                              err);

   824         args->converter->fromUnicodeStatus=0;

   825     }

   827     myTarget = (unsigned char *) args->target;

   828     myOffsets = args->offsets;

   829     temp[3] = 0;

   831     if (args->converter->fromUChar32)

   832     {

   833         ch = args->converter->fromUChar32;

   834         args->converter->fromUChar32 = 0;

   835         goto lowsurogate;

   836     }

   838     while (mySource < sourceLimit && myTarget < targetLimit)

   839     {

   840         ch = *(mySource++);

   842         if (U16_IS_SURROGATE(ch)) {

   843             if (U16_IS_LEAD(ch))

   844             {

   845 lowsurogate:

   846                 if (mySource < sourceLimit)

   847                 {

   848                     ch2 = *mySource;

   849                     if (U16_IS_TRAIL(ch2))

   850                     {

   851                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

   852                         mySource++;

   853                     }

   854                     else {

   855                         /* this is an unmatched trail code unit (2nd surrogate) */

   856                         /* callback(illegal) */

   857                         args->converter->fromUChar32 = ch;

   858                         *err = U_ILLEGAL_CHAR_FOUND;

   859                         break;

   860                     }

   861                 }

   862                 else {

   863                     /* ran out of source */

   864                     args->converter->fromUChar32 = ch;

   865                     if (args->flush) {

   866                         /* this is an unmatched trail code unit (2nd surrogate) */

   867                         /* callback(illegal) */

   868                         *err = U_ILLEGAL_CHAR_FOUND;

   869                     }

   870                     break;

   871                 }

   872             }

   873             else {

   874                 /* this is an unmatched trail code unit (2nd surrogate) */

   875                 /* callback(illegal) */

   876                 args->converter->fromUChar32 = ch;

   877                 *err = U_ILLEGAL_CHAR_FOUND;

   878                 break;

   879             }

   880         }

   882         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */

   883         temp[2] = (uint8_t) (ch >> 16 & 0x1F);

   884         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */

   885         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

   887         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)

   888         {

   889             if (myTarget < targetLimit)

   890             {

   891                 *(myTarget++) = temp[indexToWrite];

   892                 *(myOffsets++) = offsetNum;

   893             }

   894             else

   895             {

   896                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];

   897                 *err = U_BUFFER_OVERFLOW_ERROR;

   898             }

   899         }

   900         offsetNum = offsetNum + 1 + (temp[2] != 0);

   901     }

   903     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

   904     {

   905         *err = U_BUFFER_OVERFLOW_ERROR;

   906     }

   908     args->target = (char *) myTarget;

   909     args->source = mySource;

   910     args->offsets = myOffsets;

   911 }

   913 static UChar32

   914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,

   915                                    UErrorCode* err)

   916 {

   917     const uint8_t *mySource;

   918     UChar32 myUChar;

   919     int32_t length;

   921     mySource = (const uint8_t *)args->source;

   922     if (mySource >= (const uint8_t *)args->sourceLimit)

   923     {

   924         /* no input */

   925         *err = U_INDEX_OUTOFBOUNDS_ERROR;

   926         return 0xffff;

   927     }

   929     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);

   930     if (length < 4)

   931     {

   932         /* got a partial character */

   933         uprv_memcpy(args->converter->toUBytes, mySource, length);

   934         args->converter->toULength = (int8_t)length;

   935         args->source = (const char *)(mySource + length);

   936         *err = U_TRUNCATED_CHAR_FOUND;

   937         return 0xffff;

   938     }

   940     /* Don't even try to do a direct cast because the value may be on an odd address. */

   941     myUChar = ((UChar32)mySource[3] << 24)

   942             | ((UChar32)mySource[2] << 16)

   943             | ((UChar32)mySource[1] << 8)

   944             | ((UChar32)mySource[0]);

   946     args->source = (const char *)(mySource + 4);

   947     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {

   948         return myUChar;

   949     }

   951     uprv_memcpy(args->converter->toUBytes, mySource, 4);

   952     args->converter->toULength = 4;

   954     *err = U_ILLEGAL_CHAR_FOUND;

   955     return 0xffff;

   956 }

   958 static const UConverterImpl _UTF32LEImpl = {

   959     UCNV_UTF32_LittleEndian,

   961     NULL,

   962     NULL,

   964     NULL,

   965     NULL,

   966     NULL,

   968     T_UConverter_toUnicode_UTF32_LE,

   969     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,

   970     T_UConverter_fromUnicode_UTF32_LE,

   971     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,

   972     T_UConverter_getNextUChar_UTF32_LE,

   974     NULL,

   975     NULL,

   976     NULL,

   977     NULL,

   978     ucnv_getNonSurrogateUnicodeSet

   979 };

   981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */

   982 static const UConverterStaticData _UTF32LEStaticData = {

   983     sizeof(UConverterStaticData),

   984     "UTF-32LE",

   985     1234,

   986     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,

   987     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,

   988     0,

   989     0,

   990     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

   991 };

   994 const UConverterSharedData _UTF32LEData = {

   995     sizeof(UConverterSharedData), ~((uint32_t) 0),

   996     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,

   997     0

   998 };

  1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */

  1002 /*

  1003  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE

  1004  * accordingly.

  1005  *

  1006  * State values:

  1007  * 0    initial state

  1008  * 1    saw 00

  1009  * 2    saw 00 00

  1010  * 3    saw 00 00 FE

  1011  * 4    -

  1012  * 5    saw FF

  1013  * 6    saw FF FE

  1014  * 7    saw FF FE 00

  1015  * 8    UTF-32BE mode

  1016  * 9    UTF-32LE mode

  1017  *

  1018  * During detection: state&3==number of matching bytes so far.

  1019  *

  1020  * On output, emit U+FEFF as the first code point.

  1021  */

  1023 static void

  1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {

  1025     if(choice<=UCNV_RESET_TO_UNICODE) {

  1026         /* reset toUnicode: state=0 */

  1027         cnv->mode=0;

  1028     }

  1029     if(choice!=UCNV_RESET_TO_UNICODE) {

  1030         /* reset fromUnicode: prepare to output the UTF-32PE BOM */

  1031         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

  1032     }

  1033 }

  1035 static void

  1036 _UTF32Open(UConverter *cnv,

  1037            UConverterLoadArgs *pArgs,

  1038            UErrorCode *pErrorCode) {

  1039     _UTF32Reset(cnv, UCNV_RESET_BOTH);

  1040 }

  1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };

  1044 static void

  1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

  1046                            UErrorCode *pErrorCode) {

  1047     UConverter *cnv=pArgs->converter;

  1048     const char *source=pArgs->source;

  1049     const char *sourceLimit=pArgs->sourceLimit;

  1050     int32_t *offsets=pArgs->offsets;

  1052     int32_t state, offsetDelta;

  1053     char b;

  1055     state=cnv->mode;

  1057     /*

  1058      * If we detect a BOM in this buffer, then we must add the BOM size to the

  1059      * offsets because the actual converter function will not see and count the BOM.

  1060      * offsetDelta will have the number of the BOM bytes that are in the current buffer.

  1061      */

  1062     offsetDelta=0;

  1064     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {

  1065         switch(state) {

  1066         case 0:

  1067             b=*source;

  1068             if(b==0) {

  1069                 state=1; /* could be 00 00 FE FF */

  1070             } else if(b==(char)0xff) {

  1071                 state=5; /* could be FF FE 00 00 */

  1072             } else {

  1073                 state=8; /* default to UTF-32BE */

  1074                 continue;

  1075             }

  1076             ++source;

  1077             break;

  1078         case 1:

  1079         case 2:

  1080         case 3:

  1081         case 5:

  1082         case 6:

  1083         case 7:

  1084             if(*source==utf32BOM[state]) {

  1085                 ++state;

  1086                 ++source;

  1087                 if(state==4) {

  1088                     state=8; /* detect UTF-32BE */

  1089                     offsetDelta=(int32_t)(source-pArgs->source);

  1090                 } else if(state==8) {

  1091                     state=9; /* detect UTF-32LE */

  1092                     offsetDelta=(int32_t)(source-pArgs->source);

  1093                 }

  1094             } else {

  1095                 /* switch to UTF-32BE and pass the previous bytes */

  1096                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

  1098                 /* reset the source */

  1099                 source=pArgs->source;

  1101                 if(count==(state&3)) {

  1102                     /* simple: all in the same buffer, just reset source */

  1103                 } else {

  1104                     UBool oldFlush=pArgs->flush;

  1106                     /* some of the bytes are from a previous buffer, replay those first */

  1107                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */

  1108                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */

  1109                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

  1111                     /* no offsets: bytes from previous buffer, and not enough for output */

  1112                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

  1114                     /* restore real pointers; pArgs->source will be set in case 8/9 */

  1115                     pArgs->sourceLimit=sourceLimit;

  1116                     pArgs->flush=oldFlush;

  1117                 }

  1118                 state=8;

  1119                 continue;

  1120             }

  1121             break;

  1122         case 8:

  1123             /* call UTF-32BE */

  1124             pArgs->source=source;

  1125             if(offsets==NULL) {

  1126                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

  1127             } else {

  1128                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);

  1129             }

  1130             source=pArgs->source;

  1131             break;

  1132         case 9:

  1133             /* call UTF-32LE */

  1134             pArgs->source=source;

  1135             if(offsets==NULL) {

  1136                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);

  1137             } else {

  1138                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);

  1139             }

  1140             source=pArgs->source;

  1141             break;

  1142         default:

  1143             break; /* does not occur */

  1144         }

  1145     }

  1147     /* add BOM size to offsets - see comment at offsetDelta declaration */

  1148     if(offsets!=NULL && offsetDelta!=0) {

  1149         int32_t *offsetsLimit=pArgs->offsets;

  1150         while(offsets<offsetsLimit) {

  1151             *offsets++ += offsetDelta;

  1152         }

  1153     }

  1155     pArgs->source=source;

  1157     if(source==sourceLimit && pArgs->flush) {

  1158         /* handle truncated input */

  1159         switch(state) {

  1160         case 0:

  1161             break; /* no input at all, nothing to do */

  1162         case 8:

  1163             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

  1164             break;

  1165         case 9:

  1166             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);

  1167             break;

  1168         default:

  1169             /* handle 0<state<8: call UTF-32BE with too-short input */

  1170             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */

  1171             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

  1173             /* no offsets: not enough for output */

  1174             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

  1175             pArgs->source=source;

  1176             pArgs->sourceLimit=sourceLimit;

  1177             state=8;

  1178             break;

  1179         }

  1180     }

  1182     cnv->mode=state;

  1183 }

  1185 static UChar32

  1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,

  1187                    UErrorCode *pErrorCode) {

  1188     switch(pArgs->converter->mode) {

  1189     case 8:

  1190         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);

  1191     case 9:

  1192         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);

  1193     default:

  1194         return UCNV_GET_NEXT_UCHAR_USE_TO_U;

  1195     }

  1196 }

  1198 static const UConverterImpl _UTF32Impl = {

  1199     UCNV_UTF32,

  1201     NULL,

  1202     NULL,

  1204     _UTF32Open,

  1205     NULL,

  1206     _UTF32Reset,

  1208     _UTF32ToUnicodeWithOffsets,

  1209     _UTF32ToUnicodeWithOffsets,

  1210 #if U_IS_BIG_ENDIAN

  1211     T_UConverter_fromUnicode_UTF32_BE,

  1212     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,

  1213 #else

  1214     T_UConverter_fromUnicode_UTF32_LE,

  1215     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,

  1216 #endif

  1217     _UTF32GetNextUChar,

  1219     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

  1220     NULL,

  1221     NULL,

  1222     NULL,

  1223     ucnv_getNonSurrogateUnicodeSet

  1224 };

  1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */

  1227 static const UConverterStaticData _UTF32StaticData = {

  1228     sizeof(UConverterStaticData),

  1229     "UTF-32",

  1230     1236,

  1231     UCNV_IBM, UCNV_UTF32, 4, 4,

  1232 #if U_IS_BIG_ENDIAN

  1233     { 0, 0, 0xff, 0xfd }, 4,

  1234 #else

  1235     { 0xfd, 0xff, 0, 0 }, 4,

  1236 #endif

  1237     FALSE, FALSE,

  1238     0,

  1239     0,

  1240     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1241 };

  1243 const UConverterSharedData _UTF32Data = {

  1244     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1245     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,

  1246     0

  1247 };

  1249 #endif

The Tor Browser / file revision

intl/icu/source/common/ucnv_u32.c@6474c204b198

intl/icu/source/common/ucnv_u32.c