intl/icu/source/common/ucnv_u32.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*  
     2 **********************************************************************
     3 *   Copyright (C) 2002-2011, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   file name:  ucnv_u32.c
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2002jul01
    12 *   created by: Markus W. Scherer
    13 *
    14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
    15 */
    17 #include "unicode/utypes.h"
    19 #if !UCONFIG_NO_CONVERSION
    21 #include "unicode/ucnv.h"
    22 #include "unicode/utf.h"
    23 #include "ucnv_bld.h"
    24 #include "ucnv_cnv.h"
    25 #include "cmemory.h"
    27 #define MAXIMUM_UCS2            0x0000FFFF
    28 #define MAXIMUM_UTF             0x0010FFFF
    29 #define HALF_SHIFT              10
    30 #define HALF_BASE               0x0010000
    31 #define HALF_MASK               0x3FF
    32 #define SURROGATE_HIGH_START    0xD800
    33 #define SURROGATE_LOW_START     0xDC00
    35 /* -SURROGATE_LOW_START + HALF_BASE */
    36 #define SURROGATE_LOW_BASE      9216
    38 enum {
    39     UCNV_NEED_TO_WRITE_BOM=1
    40 };
    42 /* UTF-32BE ----------------------------------------------------------------- */
    44 static void
    45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
    46                                 UErrorCode * err)
    47 {
    48     const unsigned char *mySource = (unsigned char *) args->source;
    49     UChar *myTarget = args->target;
    50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    51     const UChar *targetLimit = args->targetLimit;
    52     unsigned char *toUBytes = args->converter->toUBytes;
    53     uint32_t ch, i;
    55     /* Restore state of current sequence */
    56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
    57         i = args->converter->toULength;       /* restore # of bytes consumed */
    58         args->converter->toULength = 0;
    60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
    61         args->converter->toUnicodeStatus = 0;
    62         goto morebytes;
    63     }
    65     while (mySource < sourceLimit && myTarget < targetLimit) {
    66         i = 0;
    67         ch = 0;
    68 morebytes:
    69         while (i < sizeof(uint32_t)) {
    70             if (mySource < sourceLimit) {
    71                 ch = (ch << 8) | (uint8_t)(*mySource);
    72                 toUBytes[i++] = (char) *(mySource++);
    73             }
    74             else {
    75                 /* stores a partially calculated target*/
    76                 /* + 1 to make 0 a valid character */
    77                 args->converter->toUnicodeStatus = ch + 1;
    78                 args->converter->toULength = (int8_t) i;
    79                 goto donefornow;
    80             }
    81         }
    83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    85             if (ch <= MAXIMUM_UCS2) 
    86             {
    87                 /* fits in 16 bits */
    88                 *(myTarget++) = (UChar) ch;
    89             }
    90             else {
    91                 /* write out the surrogates */
    92                 *(myTarget++) = U16_LEAD(ch);
    93                 ch = U16_TRAIL(ch);
    94                 if (myTarget < targetLimit) {
    95                     *(myTarget++) = (UChar)ch;
    96                 }
    97                 else {
    98                     /* Put in overflow buffer (not handled here) */
    99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
   100                     args->converter->UCharErrorBufferLength = 1;
   101                     *err = U_BUFFER_OVERFLOW_ERROR;
   102                     break;
   103                 }
   104             }
   105         }
   106         else {
   107             args->converter->toULength = (int8_t)i;
   108             *err = U_ILLEGAL_CHAR_FOUND;
   109             break;
   110         }
   111     }
   113 donefornow:
   114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
   115         /* End of target buffer */
   116         *err = U_BUFFER_OVERFLOW_ERROR;
   117     }
   119     args->target = myTarget;
   120     args->source = (const char *) mySource;
   121 }
   123 static void
   124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
   125                                              UErrorCode * err)
   126 {
   127     const unsigned char *mySource = (unsigned char *) args->source;
   128     UChar *myTarget = args->target;
   129     int32_t *myOffsets = args->offsets;
   130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   131     const UChar *targetLimit = args->targetLimit;
   132     unsigned char *toUBytes = args->converter->toUBytes;
   133     uint32_t ch, i;
   134     int32_t offsetNum = 0;
   136     /* Restore state of current sequence */
   137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
   138         i = args->converter->toULength;       /* restore # of bytes consumed */
   139         args->converter->toULength = 0;
   141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
   142         args->converter->toUnicodeStatus = 0;
   143         goto morebytes;
   144     }
   146     while (mySource < sourceLimit && myTarget < targetLimit) {
   147         i = 0;
   148         ch = 0;
   149 morebytes:
   150         while (i < sizeof(uint32_t)) {
   151             if (mySource < sourceLimit) {
   152                 ch = (ch << 8) | (uint8_t)(*mySource);
   153                 toUBytes[i++] = (char) *(mySource++);
   154             }
   155             else {
   156                 /* stores a partially calculated target*/
   157                 /* + 1 to make 0 a valid character */
   158                 args->converter->toUnicodeStatus = ch + 1;
   159                 args->converter->toULength = (int8_t) i;
   160                 goto donefornow;
   161             }
   162         }
   164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
   165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   166             if (ch <= MAXIMUM_UCS2) {
   167                 /* fits in 16 bits */
   168                 *(myTarget++) = (UChar) ch;
   169                 *(myOffsets++) = offsetNum;
   170             }
   171             else {
   172                 /* write out the surrogates */
   173                 *(myTarget++) = U16_LEAD(ch);
   174                 *myOffsets++ = offsetNum;
   175                 ch = U16_TRAIL(ch);
   176                 if (myTarget < targetLimit)
   177                 {
   178                     *(myTarget++) = (UChar)ch;
   179                     *(myOffsets++) = offsetNum;
   180                 }
   181                 else {
   182                     /* Put in overflow buffer (not handled here) */
   183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
   184                     args->converter->UCharErrorBufferLength = 1;
   185                     *err = U_BUFFER_OVERFLOW_ERROR;
   186                     break;
   187                 }
   188             }
   189         }
   190         else {
   191             args->converter->toULength = (int8_t)i;
   192             *err = U_ILLEGAL_CHAR_FOUND;
   193             break;
   194         }
   195         offsetNum += i;
   196     }
   198 donefornow:
   199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   200     {
   201         /* End of target buffer */
   202         *err = U_BUFFER_OVERFLOW_ERROR;
   203     }
   205     args->target = myTarget;
   206     args->source = (const char *) mySource;
   207     args->offsets = myOffsets;
   208 }
   210 static void
   211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
   212                                   UErrorCode * err)
   213 {
   214     const UChar *mySource = args->source;
   215     unsigned char *myTarget;
   216     const UChar *sourceLimit = args->sourceLimit;
   217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   218     UChar32 ch, ch2;
   219     unsigned int indexToWrite;
   220     unsigned char temp[sizeof(uint32_t)];
   222     if(mySource >= sourceLimit) {
   223         /* no input, nothing to do */
   224         return;
   225     }
   227     /* write the BOM if necessary */
   228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
   230         ucnv_fromUWriteBytes(args->converter,
   231                              bom, 4,
   232                              &args->target, args->targetLimit,
   233                              &args->offsets, -1,
   234                              err);
   235         args->converter->fromUnicodeStatus=0;
   236     }
   238     myTarget = (unsigned char *) args->target;
   239     temp[0] = 0;
   241     if (args->converter->fromUChar32) {
   242         ch = args->converter->fromUChar32;
   243         args->converter->fromUChar32 = 0;
   244         goto lowsurogate;
   245     }
   247     while (mySource < sourceLimit && myTarget < targetLimit) {
   248         ch = *(mySource++);
   250         if (U_IS_SURROGATE(ch)) {
   251             if (U_IS_LEAD(ch)) {
   252 lowsurogate:
   253                 if (mySource < sourceLimit) {
   254                     ch2 = *mySource;
   255                     if (U_IS_TRAIL(ch2)) {
   256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
   257                         mySource++;
   258                     }
   259                     else {
   260                         /* this is an unmatched trail code unit (2nd surrogate) */
   261                         /* callback(illegal) */
   262                         args->converter->fromUChar32 = ch;
   263                         *err = U_ILLEGAL_CHAR_FOUND;
   264                         break;
   265                     }
   266                 }
   267                 else {
   268                     /* ran out of source */
   269                     args->converter->fromUChar32 = ch;
   270                     if (args->flush) {
   271                         /* this is an unmatched trail code unit (2nd surrogate) */
   272                         /* callback(illegal) */
   273                         *err = U_ILLEGAL_CHAR_FOUND;
   274                     }
   275                     break;
   276                 }
   277             }
   278             else {
   279                 /* this is an unmatched trail code unit (2nd surrogate) */
   280                 /* callback(illegal) */
   281                 args->converter->fromUChar32 = ch;
   282                 *err = U_ILLEGAL_CHAR_FOUND;
   283                 break;
   284             }
   285         }
   287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
   288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
   289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
   290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
   292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
   293             if (myTarget < targetLimit) {
   294                 *(myTarget++) = temp[indexToWrite];
   295             }
   296             else {
   297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
   298                 *err = U_BUFFER_OVERFLOW_ERROR;
   299             }
   300         }
   301     }
   303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
   304         *err = U_BUFFER_OVERFLOW_ERROR;
   305     }
   307     args->target = (char *) myTarget;
   308     args->source = mySource;
   309 }
   311 static void
   312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
   313                                                UErrorCode * err)
   314 {
   315     const UChar *mySource = args->source;
   316     unsigned char *myTarget;
   317     int32_t *myOffsets;
   318     const UChar *sourceLimit = args->sourceLimit;
   319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   320     UChar32 ch, ch2;
   321     int32_t offsetNum = 0;
   322     unsigned int indexToWrite;
   323     unsigned char temp[sizeof(uint32_t)];
   325     if(mySource >= sourceLimit) {
   326         /* no input, nothing to do */
   327         return;
   328     }
   330     /* write the BOM if necessary */
   331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
   333         ucnv_fromUWriteBytes(args->converter,
   334                              bom, 4,
   335                              &args->target, args->targetLimit,
   336                              &args->offsets, -1,
   337                              err);
   338         args->converter->fromUnicodeStatus=0;
   339     }
   341     myTarget = (unsigned char *) args->target;
   342     myOffsets = args->offsets;
   343     temp[0] = 0;
   345     if (args->converter->fromUChar32) {
   346         ch = args->converter->fromUChar32;
   347         args->converter->fromUChar32 = 0;
   348         goto lowsurogate;
   349     }
   351     while (mySource < sourceLimit && myTarget < targetLimit) {
   352         ch = *(mySource++);
   354         if (U_IS_SURROGATE(ch)) {
   355             if (U_IS_LEAD(ch)) {
   356 lowsurogate:
   357                 if (mySource < sourceLimit) {
   358                     ch2 = *mySource;
   359                     if (U_IS_TRAIL(ch2)) {
   360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
   361                         mySource++;
   362                     }
   363                     else {
   364                         /* this is an unmatched trail code unit (2nd surrogate) */
   365                         /* callback(illegal) */
   366                         args->converter->fromUChar32 = ch;
   367                         *err = U_ILLEGAL_CHAR_FOUND;
   368                         break;
   369                     }
   370                 }
   371                 else {
   372                     /* ran out of source */
   373                     args->converter->fromUChar32 = ch;
   374                     if (args->flush) {
   375                         /* this is an unmatched trail code unit (2nd surrogate) */
   376                         /* callback(illegal) */
   377                         *err = U_ILLEGAL_CHAR_FOUND;
   378                     }
   379                     break;
   380                 }
   381             }
   382             else {
   383                 /* this is an unmatched trail code unit (2nd surrogate) */
   384                 /* callback(illegal) */
   385                 args->converter->fromUChar32 = ch;
   386                 *err = U_ILLEGAL_CHAR_FOUND;
   387                 break;
   388             }
   389         }
   391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
   392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
   393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
   394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
   396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
   397             if (myTarget < targetLimit) {
   398                 *(myTarget++) = temp[indexToWrite];
   399                 *(myOffsets++) = offsetNum;
   400             }
   401             else {
   402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
   403                 *err = U_BUFFER_OVERFLOW_ERROR;
   404             }
   405         }
   406         offsetNum = offsetNum + 1 + (temp[1] != 0);
   407     }
   409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
   410         *err = U_BUFFER_OVERFLOW_ERROR;
   411     }
   413     args->target = (char *) myTarget;
   414     args->source = mySource;
   415     args->offsets = myOffsets;
   416 }
   418 static UChar32
   419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
   420                                    UErrorCode* err)
   421 {
   422     const uint8_t *mySource;
   423     UChar32 myUChar;
   424     int32_t length;
   426     mySource = (const uint8_t *)args->source;
   427     if (mySource >= (const uint8_t *)args->sourceLimit)
   428     {
   429         /* no input */
   430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
   431         return 0xffff;
   432     }
   434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
   435     if (length < 4) 
   436     {
   437         /* got a partial character */
   438         uprv_memcpy(args->converter->toUBytes, mySource, length);
   439         args->converter->toULength = (int8_t)length;
   440         args->source = (const char *)(mySource + length);
   441         *err = U_TRUNCATED_CHAR_FOUND;
   442         return 0xffff;
   443     }
   445     /* Don't even try to do a direct cast because the value may be on an odd address. */
   446     myUChar = ((UChar32)mySource[0] << 24)
   447             | ((UChar32)mySource[1] << 16)
   448             | ((UChar32)mySource[2] << 8)
   449             | ((UChar32)mySource[3]);
   451     args->source = (const char *)(mySource + 4);
   452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
   453         return myUChar;
   454     }
   456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
   457     args->converter->toULength = 4;
   459     *err = U_ILLEGAL_CHAR_FOUND;
   460     return 0xffff;
   461 }
   463 static const UConverterImpl _UTF32BEImpl = {
   464     UCNV_UTF32_BigEndian,
   466     NULL,
   467     NULL,
   469     NULL,
   470     NULL,
   471     NULL,
   473     T_UConverter_toUnicode_UTF32_BE,
   474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
   475     T_UConverter_fromUnicode_UTF32_BE,
   476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
   477     T_UConverter_getNextUChar_UTF32_BE,
   479     NULL,
   480     NULL,
   481     NULL,
   482     NULL,
   483     ucnv_getNonSurrogateUnicodeSet
   484 };
   486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
   487 static const UConverterStaticData _UTF32BEStaticData = {
   488     sizeof(UConverterStaticData),
   489     "UTF-32BE",
   490     1232,
   491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
   492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
   493     0,
   494     0,
   495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   496 };
   498 const UConverterSharedData _UTF32BEData = {
   499     sizeof(UConverterSharedData), ~((uint32_t) 0),
   500     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, 
   501     0
   502 };
   504 /* UTF-32LE ---------------------------------------------------------- */
   506 static void
   507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
   508                                 UErrorCode * err)
   509 {
   510     const unsigned char *mySource = (unsigned char *) args->source;
   511     UChar *myTarget = args->target;
   512     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   513     const UChar *targetLimit = args->targetLimit;
   514     unsigned char *toUBytes = args->converter->toUBytes;
   515     uint32_t ch, i;
   517     /* Restore state of current sequence */
   518     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
   519     {
   520         i = args->converter->toULength;       /* restore # of bytes consumed */
   521         args->converter->toULength = 0;
   523         /* Stores the previously calculated ch from a previous call*/
   524         ch = args->converter->toUnicodeStatus - 1;
   525         args->converter->toUnicodeStatus = 0;
   526         goto morebytes;
   527     }
   529     while (mySource < sourceLimit && myTarget < targetLimit)
   530     {
   531         i = 0;
   532         ch = 0;
   533 morebytes:
   534         while (i < sizeof(uint32_t))
   535         {
   536             if (mySource < sourceLimit)
   537             {
   538                 ch |= ((uint8_t)(*mySource)) << (i * 8);
   539                 toUBytes[i++] = (char) *(mySource++);
   540             }
   541             else
   542             {
   543                 /* stores a partially calculated target*/
   544                 /* + 1 to make 0 a valid character */
   545                 args->converter->toUnicodeStatus = ch + 1;
   546                 args->converter->toULength = (int8_t) i;
   547                 goto donefornow;
   548             }
   549         }
   551         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
   552             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   553             if (ch <= MAXIMUM_UCS2) {
   554                 /* fits in 16 bits */
   555                 *(myTarget++) = (UChar) ch;
   556             }
   557             else {
   558                 /* write out the surrogates */
   559                 *(myTarget++) = U16_LEAD(ch);
   560                 ch = U16_TRAIL(ch);
   561                 if (myTarget < targetLimit) {
   562                     *(myTarget++) = (UChar)ch;
   563                 }
   564                 else {
   565                     /* Put in overflow buffer (not handled here) */
   566                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
   567                     args->converter->UCharErrorBufferLength = 1;
   568                     *err = U_BUFFER_OVERFLOW_ERROR;
   569                     break;
   570                 }
   571             }
   572         }
   573         else {
   574             args->converter->toULength = (int8_t)i;
   575             *err = U_ILLEGAL_CHAR_FOUND;
   576             break;
   577         }
   578     }
   580 donefornow:
   581     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   582     {
   583         /* End of target buffer */
   584         *err = U_BUFFER_OVERFLOW_ERROR;
   585     }
   587     args->target = myTarget;
   588     args->source = (const char *) mySource;
   589 }
   591 static void
   592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
   593                                              UErrorCode * err)
   594 {
   595     const unsigned char *mySource = (unsigned char *) args->source;
   596     UChar *myTarget = args->target;
   597     int32_t *myOffsets = args->offsets;
   598     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   599     const UChar *targetLimit = args->targetLimit;
   600     unsigned char *toUBytes = args->converter->toUBytes;
   601     uint32_t ch, i;
   602     int32_t offsetNum = 0;
   604     /* Restore state of current sequence */
   605     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
   606     {
   607         i = args->converter->toULength;       /* restore # of bytes consumed */
   608         args->converter->toULength = 0;
   610         /* Stores the previously calculated ch from a previous call*/
   611         ch = args->converter->toUnicodeStatus - 1;
   612         args->converter->toUnicodeStatus = 0;
   613         goto morebytes;
   614     }
   616     while (mySource < sourceLimit && myTarget < targetLimit)
   617     {
   618         i = 0;
   619         ch = 0;
   620 morebytes:
   621         while (i < sizeof(uint32_t))
   622         {
   623             if (mySource < sourceLimit)
   624             {
   625                 ch |= ((uint8_t)(*mySource)) << (i * 8);
   626                 toUBytes[i++] = (char) *(mySource++);
   627             }
   628             else
   629             {
   630                 /* stores a partially calculated target*/
   631                 /* + 1 to make 0 a valid character */
   632                 args->converter->toUnicodeStatus = ch + 1;
   633                 args->converter->toULength = (int8_t) i;
   634                 goto donefornow;
   635             }
   636         }
   638         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
   639         {
   640             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
   641             if (ch <= MAXIMUM_UCS2) 
   642             {
   643                 /* fits in 16 bits */
   644                 *(myTarget++) = (UChar) ch;
   645                 *(myOffsets++) = offsetNum;
   646             }
   647             else {
   648                 /* write out the surrogates */
   649                 *(myTarget++) = U16_LEAD(ch);
   650                 *(myOffsets++) = offsetNum;
   651                 ch = U16_TRAIL(ch);
   652                 if (myTarget < targetLimit)
   653                 {
   654                     *(myTarget++) = (UChar)ch;
   655                     *(myOffsets++) = offsetNum;
   656                 }
   657                 else
   658                 {
   659                     /* Put in overflow buffer (not handled here) */
   660                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
   661                     args->converter->UCharErrorBufferLength = 1;
   662                     *err = U_BUFFER_OVERFLOW_ERROR;
   663                     break;
   664                 }
   665             }
   666         }
   667         else
   668         {
   669             args->converter->toULength = (int8_t)i;
   670             *err = U_ILLEGAL_CHAR_FOUND;
   671             break;
   672         }
   673         offsetNum += i;
   674     }
   676 donefornow:
   677     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   678     {
   679         /* End of target buffer */
   680         *err = U_BUFFER_OVERFLOW_ERROR;
   681     }
   683     args->target = myTarget;
   684     args->source = (const char *) mySource;
   685     args->offsets = myOffsets;
   686 }
   688 static void
   689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
   690                                   UErrorCode * err)
   691 {
   692     const UChar *mySource = args->source;
   693     unsigned char *myTarget;
   694     const UChar *sourceLimit = args->sourceLimit;
   695     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   696     UChar32 ch, ch2;
   697     unsigned int indexToWrite;
   698     unsigned char temp[sizeof(uint32_t)];
   700     if(mySource >= sourceLimit) {
   701         /* no input, nothing to do */
   702         return;
   703     }
   705     /* write the BOM if necessary */
   706     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   707         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
   708         ucnv_fromUWriteBytes(args->converter,
   709                              bom, 4,
   710                              &args->target, args->targetLimit,
   711                              &args->offsets, -1,
   712                              err);
   713         args->converter->fromUnicodeStatus=0;
   714     }
   716     myTarget = (unsigned char *) args->target;
   717     temp[3] = 0;
   719     if (args->converter->fromUChar32)
   720     {
   721         ch = args->converter->fromUChar32;
   722         args->converter->fromUChar32 = 0;
   723         goto lowsurogate;
   724     }
   726     while (mySource < sourceLimit && myTarget < targetLimit)
   727     {
   728         ch = *(mySource++);
   730         if (U16_IS_SURROGATE(ch)) {
   731             if (U16_IS_LEAD(ch))
   732             {
   733 lowsurogate:
   734                 if (mySource < sourceLimit)
   735                 {
   736                     ch2 = *mySource;
   737                     if (U16_IS_TRAIL(ch2)) {
   738                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
   739                         mySource++;
   740                     }
   741                     else {
   742                         /* this is an unmatched trail code unit (2nd surrogate) */
   743                         /* callback(illegal) */
   744                         args->converter->fromUChar32 = ch;
   745                         *err = U_ILLEGAL_CHAR_FOUND;
   746                         break;
   747                     }
   748                 }
   749                 else {
   750                     /* ran out of source */
   751                     args->converter->fromUChar32 = ch;
   752                     if (args->flush) {
   753                         /* this is an unmatched trail code unit (2nd surrogate) */
   754                         /* callback(illegal) */
   755                         *err = U_ILLEGAL_CHAR_FOUND;
   756                     }
   757                     break;
   758                 }
   759             }
   760             else {
   761                 /* this is an unmatched trail code unit (2nd surrogate) */
   762                 /* callback(illegal) */
   763                 args->converter->fromUChar32 = ch;
   764                 *err = U_ILLEGAL_CHAR_FOUND;
   765                 break;
   766             }
   767         }
   769         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
   770         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
   771         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
   772         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
   774         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
   775         {
   776             if (myTarget < targetLimit)
   777             {
   778                 *(myTarget++) = temp[indexToWrite];
   779             }
   780             else
   781             {
   782                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
   783                 *err = U_BUFFER_OVERFLOW_ERROR;
   784             }
   785         }
   786     }
   788     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   789     {
   790         *err = U_BUFFER_OVERFLOW_ERROR;
   791     }
   793     args->target = (char *) myTarget;
   794     args->source = mySource;
   795 }
   797 static void
   798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
   799                                                UErrorCode * err)
   800 {
   801     const UChar *mySource = args->source;
   802     unsigned char *myTarget;
   803     int32_t *myOffsets;
   804     const UChar *sourceLimit = args->sourceLimit;
   805     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   806     UChar32 ch, ch2;
   807     unsigned int indexToWrite;
   808     unsigned char temp[sizeof(uint32_t)];
   809     int32_t offsetNum = 0;
   811     if(mySource >= sourceLimit) {
   812         /* no input, nothing to do */
   813         return;
   814     }
   816     /* write the BOM if necessary */
   817     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   818         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
   819         ucnv_fromUWriteBytes(args->converter,
   820                              bom, 4,
   821                              &args->target, args->targetLimit,
   822                              &args->offsets, -1,
   823                              err);
   824         args->converter->fromUnicodeStatus=0;
   825     }
   827     myTarget = (unsigned char *) args->target;
   828     myOffsets = args->offsets;
   829     temp[3] = 0;
   831     if (args->converter->fromUChar32)
   832     {
   833         ch = args->converter->fromUChar32;
   834         args->converter->fromUChar32 = 0;
   835         goto lowsurogate;
   836     }
   838     while (mySource < sourceLimit && myTarget < targetLimit)
   839     {
   840         ch = *(mySource++);
   842         if (U16_IS_SURROGATE(ch)) {
   843             if (U16_IS_LEAD(ch))
   844             {
   845 lowsurogate:
   846                 if (mySource < sourceLimit)
   847                 {
   848                     ch2 = *mySource;
   849                     if (U16_IS_TRAIL(ch2))
   850                     {
   851                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
   852                         mySource++;
   853                     }
   854                     else {
   855                         /* this is an unmatched trail code unit (2nd surrogate) */
   856                         /* callback(illegal) */
   857                         args->converter->fromUChar32 = ch;
   858                         *err = U_ILLEGAL_CHAR_FOUND;
   859                         break;
   860                     }
   861                 }
   862                 else {
   863                     /* ran out of source */
   864                     args->converter->fromUChar32 = ch;
   865                     if (args->flush) {
   866                         /* this is an unmatched trail code unit (2nd surrogate) */
   867                         /* callback(illegal) */
   868                         *err = U_ILLEGAL_CHAR_FOUND;
   869                     }
   870                     break;
   871                 }
   872             }
   873             else {
   874                 /* this is an unmatched trail code unit (2nd surrogate) */
   875                 /* callback(illegal) */
   876                 args->converter->fromUChar32 = ch;
   877                 *err = U_ILLEGAL_CHAR_FOUND;
   878                 break;
   879             }
   880         }
   882         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
   883         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
   884         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
   885         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
   887         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
   888         {
   889             if (myTarget < targetLimit)
   890             {
   891                 *(myTarget++) = temp[indexToWrite];
   892                 *(myOffsets++) = offsetNum;
   893             }
   894             else
   895             {
   896                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
   897                 *err = U_BUFFER_OVERFLOW_ERROR;
   898             }
   899         }
   900         offsetNum = offsetNum + 1 + (temp[2] != 0);
   901     }
   903     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
   904     {
   905         *err = U_BUFFER_OVERFLOW_ERROR;
   906     }
   908     args->target = (char *) myTarget;
   909     args->source = mySource;
   910     args->offsets = myOffsets;
   911 }
   913 static UChar32
   914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
   915                                    UErrorCode* err)
   916 {
   917     const uint8_t *mySource;
   918     UChar32 myUChar;
   919     int32_t length;
   921     mySource = (const uint8_t *)args->source;
   922     if (mySource >= (const uint8_t *)args->sourceLimit)
   923     {
   924         /* no input */
   925         *err = U_INDEX_OUTOFBOUNDS_ERROR;
   926         return 0xffff;
   927     }
   929     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
   930     if (length < 4) 
   931     {
   932         /* got a partial character */
   933         uprv_memcpy(args->converter->toUBytes, mySource, length);
   934         args->converter->toULength = (int8_t)length;
   935         args->source = (const char *)(mySource + length);
   936         *err = U_TRUNCATED_CHAR_FOUND;
   937         return 0xffff;
   938     }
   940     /* Don't even try to do a direct cast because the value may be on an odd address. */
   941     myUChar = ((UChar32)mySource[3] << 24)
   942             | ((UChar32)mySource[2] << 16)
   943             | ((UChar32)mySource[1] << 8)
   944             | ((UChar32)mySource[0]);
   946     args->source = (const char *)(mySource + 4);
   947     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
   948         return myUChar;
   949     }
   951     uprv_memcpy(args->converter->toUBytes, mySource, 4);
   952     args->converter->toULength = 4;
   954     *err = U_ILLEGAL_CHAR_FOUND;
   955     return 0xffff;
   956 }
   958 static const UConverterImpl _UTF32LEImpl = {
   959     UCNV_UTF32_LittleEndian,
   961     NULL,
   962     NULL,
   964     NULL,
   965     NULL,
   966     NULL,
   968     T_UConverter_toUnicode_UTF32_LE,
   969     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
   970     T_UConverter_fromUnicode_UTF32_LE,
   971     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
   972     T_UConverter_getNextUChar_UTF32_LE,
   974     NULL,
   975     NULL,
   976     NULL,
   977     NULL,
   978     ucnv_getNonSurrogateUnicodeSet
   979 };
   981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
   982 static const UConverterStaticData _UTF32LEStaticData = {
   983     sizeof(UConverterStaticData),
   984     "UTF-32LE",
   985     1234,
   986     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
   987     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
   988     0,
   989     0,
   990     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   991 };
   994 const UConverterSharedData _UTF32LEData = {
   995     sizeof(UConverterSharedData), ~((uint32_t) 0),
   996     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, 
   997     0
   998 };
/* UTF-32 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 * accordingly.
 *
 * State values:
 * 0    initial state
 * 1    saw 00
 * 2    saw 00 00
 * 3    saw 00 00 FE
 * 4    - (unused)
 * 5    saw FF
 * 6    saw FF FE
 * 7    saw FF FE 00
 * 8    UTF-32BE mode
 * 9    UTF-32LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */
  1023 static void
  1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
  1025     if(choice<=UCNV_RESET_TO_UNICODE) {
  1026         /* reset toUnicode: state=0 */
  1027         cnv->mode=0;
  1029     if(choice!=UCNV_RESET_TO_UNICODE) {
  1030         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
  1031         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1035 static void
  1036 _UTF32Open(UConverter *cnv,
  1037            UConverterLoadArgs *pArgs,
  1038            UErrorCode *pErrorCode) {
  1039     _UTF32Reset(cnv, UCNV_RESET_BOTH);
  1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
  1044 static void
  1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  1046                            UErrorCode *pErrorCode) {
  1047     UConverter *cnv=pArgs->converter;
  1048     const char *source=pArgs->source;
  1049     const char *sourceLimit=pArgs->sourceLimit;
  1050     int32_t *offsets=pArgs->offsets;
  1052     int32_t state, offsetDelta;
  1053     char b;
  1055     state=cnv->mode;
  1057     /*
  1058      * If we detect a BOM in this buffer, then we must add the BOM size to the
  1059      * offsets because the actual converter function will not see and count the BOM.
  1060      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
  1061      */
  1062     offsetDelta=0;
  1064     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
  1065         switch(state) {
  1066         case 0:
  1067             b=*source;
  1068             if(b==0) {
  1069                 state=1; /* could be 00 00 FE FF */
  1070             } else if(b==(char)0xff) {
  1071                 state=5; /* could be FF FE 00 00 */
  1072             } else {
  1073                 state=8; /* default to UTF-32BE */
  1074                 continue;
  1076             ++source;
  1077             break;
  1078         case 1:
  1079         case 2:
  1080         case 3:
  1081         case 5:
  1082         case 6:
  1083         case 7:
  1084             if(*source==utf32BOM[state]) {
  1085                 ++state;
  1086                 ++source;
  1087                 if(state==4) {
  1088                     state=8; /* detect UTF-32BE */
  1089                     offsetDelta=(int32_t)(source-pArgs->source);
  1090                 } else if(state==8) {
  1091                     state=9; /* detect UTF-32LE */
  1092                     offsetDelta=(int32_t)(source-pArgs->source);
  1094             } else {
  1095                 /* switch to UTF-32BE and pass the previous bytes */
  1096                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
  1098                 /* reset the source */
  1099                 source=pArgs->source;
  1101                 if(count==(state&3)) {
  1102                     /* simple: all in the same buffer, just reset source */
  1103                 } else {
  1104                     UBool oldFlush=pArgs->flush;
  1106                     /* some of the bytes are from a previous buffer, replay those first */
  1107                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
  1108                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
  1109                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
  1111                     /* no offsets: bytes from previous buffer, and not enough for output */
  1112                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
  1114                     /* restore real pointers; pArgs->source will be set in case 8/9 */
  1115                     pArgs->sourceLimit=sourceLimit;
  1116                     pArgs->flush=oldFlush;
  1118                 state=8;
  1119                 continue;
  1121             break;
  1122         case 8:
  1123             /* call UTF-32BE */
  1124             pArgs->source=source;
  1125             if(offsets==NULL) {
  1126                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
  1127             } else {
  1128                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
  1130             source=pArgs->source;
  1131             break;
  1132         case 9:
  1133             /* call UTF-32LE */
  1134             pArgs->source=source;
  1135             if(offsets==NULL) {
  1136                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
  1137             } else {
  1138                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
  1140             source=pArgs->source;
  1141             break;
  1142         default:
  1143             break; /* does not occur */
  1147     /* add BOM size to offsets - see comment at offsetDelta declaration */
  1148     if(offsets!=NULL && offsetDelta!=0) {
  1149         int32_t *offsetsLimit=pArgs->offsets;
  1150         while(offsets<offsetsLimit) {
  1151             *offsets++ += offsetDelta;
  1155     pArgs->source=source;
  1157     if(source==sourceLimit && pArgs->flush) {
  1158         /* handle truncated input */
  1159         switch(state) {
  1160         case 0:
  1161             break; /* no input at all, nothing to do */
  1162         case 8:
  1163             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
  1164             break;
  1165         case 9:
  1166             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
  1167             break;
  1168         default:
  1169             /* handle 0<state<8: call UTF-32BE with too-short input */
  1170             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
  1171             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
  1173             /* no offsets: not enough for output */
  1174             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
  1175             pArgs->source=source;
  1176             pArgs->sourceLimit=sourceLimit;
  1177             state=8;
  1178             break;
  1182     cnv->mode=state;
  1185 static UChar32
  1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
  1187                    UErrorCode *pErrorCode) {
  1188     switch(pArgs->converter->mode) {
  1189     case 8:
  1190         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
  1191     case 9:
  1192         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
  1193     default:
  1194         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1198 static const UConverterImpl _UTF32Impl = {
  1199     UCNV_UTF32,
  1201     NULL,
  1202     NULL,
  1204     _UTF32Open,
  1205     NULL,
  1206     _UTF32Reset,
  1208     _UTF32ToUnicodeWithOffsets,
  1209     _UTF32ToUnicodeWithOffsets,
  1210 #if U_IS_BIG_ENDIAN
  1211     T_UConverter_fromUnicode_UTF32_BE,
  1212     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
  1213 #else
  1214     T_UConverter_fromUnicode_UTF32_LE,
  1215     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
  1216 #endif
  1217     _UTF32GetNextUChar,
  1219     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
  1220     NULL,
  1221     NULL,
  1222     NULL,
  1223     ucnv_getNonSurrogateUnicodeSet
  1224 };
  1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
  1227 static const UConverterStaticData _UTF32StaticData = {
  1228     sizeof(UConverterStaticData),
  1229     "UTF-32",
  1230     1236,
  1231     UCNV_IBM, UCNV_UTF32, 4, 4,
  1232 #if U_IS_BIG_ENDIAN
  1233     { 0, 0, 0xff, 0xfd }, 4,
  1234 #else
  1235     { 0xfd, 0xff, 0, 0 }, 4,
  1236 #endif
  1237     FALSE, FALSE,
  1238     0,
  1239     0,
  1240     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1241 };
  1243 const UConverterSharedData _UTF32Data = {
  1244     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1245     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, 
  1247 };
  1249 #endif

mercurial