The Tor Browser: intl/icu/source/common/ucnvbocu.cpp@fc2d59ddac77

     1 /*

     2 ******************************************************************************

3 *

     4 *   Copyright (C) 2002-2011, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 ******************************************************************************

     8 *   file name:  ucnvbocu.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2002mar27

    14 *   created by: Markus W. Scherer

    15 *

    16 *   This is an implementation of the Binary Ordered Compression for Unicode,

    17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/

    18 */

    20 #include "unicode/utypes.h"

    22 #if !UCONFIG_NO_CONVERSION

    24 #include "unicode/ucnv.h"

    25 #include "unicode/ucnv_cb.h"

    26 #include "unicode/utf16.h"

    27 #include "putilimp.h"

    28 #include "ucnv_bld.h"

    29 #include "ucnv_cnv.h"

    30 #include "uassert.h"

    32 /* BOCU-1 constants and macros ---------------------------------------------- */

    34 /*

    35  * BOCU-1 encodes the code points of a Unicode string as

    36  * a sequence of byte-encoded differences (slope detection),

    37  * preserving lexical order.

    38  *

    39  * Optimize the difference-taking for runs of Unicode text within

    40  * small scripts:

    41  *

    42  * Most small scripts are allocated within aligned 128-blocks of Unicode

    43  * code points. Lexical order is preserved if the "previous code point" state

    44  * is always moved into the middle of such a block.

    45  *

    46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul

    47  * areas into the middle of those areas.

    48  *

    49  * C0 control codes and space are encoded with their US-ASCII bytes.

    50  * "prev" is reset for C0 controls but not for space.

    51  */

    53 /* initial value for "prev": middle of the ASCII range */

    54 #define BOCU1_ASCII_PREV        0x40

    56 /* bounding byte values for differences */

    57 #define BOCU1_MIN               0x21

    58 #define BOCU1_MIDDLE            0x90

    59 #define BOCU1_MAX_LEAD          0xfe

    60 #define BOCU1_MAX_TRAIL         0xff

    61 #define BOCU1_RESET             0xff

    63 /* number of lead bytes */

    64 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

    66 /* adjust trail byte counts for the use of some C0 control byte values */

    67 #define BOCU1_TRAIL_CONTROLS_COUNT  20

    68 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

    70 /* number of trail bytes */

    71 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

    73 /*

    74  * number of positive and negative single-byte codes

    75  * (counting 0==BOCU1_MIDDLE among the positive ones)

    76  */

    77 #define BOCU1_SINGLE            64

    79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */

    80 #define BOCU1_LEAD_2            43

    81 #define BOCU1_LEAD_3            3

    82 #define BOCU1_LEAD_4            1

    84 /* The difference value range for single-byters. */

    85 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)

    86 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

    88 /* The difference value range for double-byters. */

    89 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

    90 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

    92 /* The difference value range for 3-byters. */

    93 #define BOCU1_REACH_POS_3   \

    94     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

    96 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

    98 /* The lead byte start values. */

    99 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)

   100 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)

   101 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)

   102      /* ==BOCU1_MAX_LEAD */

   104 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)

   105 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)

   106 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)

   107      /* ==BOCU1_MIN+1 */

   109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */

   110 #define BOCU1_LENGTH_FROM_LEAD(lead) \

   111     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \

   112      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \

   113      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

   115 /* The length of a byte sequence, according to its packed form. */

   116 #define BOCU1_LENGTH_FROM_PACKED(packed) \

   117     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

   119 /*

   120  * 12 commonly used C0 control codes (and space) are only used to encode

   121  * themselves directly,

   122  * which makes BOCU-1 MIME-usable and reasonably safe for

   123  * ASCII-oriented software.

   124  *

   125  * These controls are

   126  *  0   NUL

   127  *

   128  *  7   BEL

   129  *  8   BS

   130  *

   131  *  9   TAB

   132  *  a   LF

   133  *  b   VT

   134  *  c   FF

   135  *  d   CR

   136  *

   137  *  e   SO

   138  *  f   SI

   139  *

   140  * 1a   SUB

   141  * 1b   ESC

   142  *

   143  * The other 20 C0 controls are also encoded directly (to preserve order)

   144  * but are also used as trail bytes in difference encoding

   145  * (for better compression).

   146  */

   147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

   149 /*

   150  * Byte value map for control codes,

   151  * from external byte values 0x00..0x20

   152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.

   153  * External byte values that are illegal as trail bytes are mapped to -1.

   154  */

   155 static const int8_t

   156 bocu1ByteToTrail[BOCU1_MIN]={

   157 /*  0     1     2     3     4     5     6     7    */

   158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

   160 /*  8     9     a     b     c     d     e     f    */

   161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

   163 /*  10    11    12    13    14    15    16    17   */

   164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

   166 /*  18    19    1a    1b    1c    1d    1e    1f   */

   167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

   169 /*  20   */

   170     -1

   171 };

   173 /*

   174  * Byte value map for control codes,

   175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation

   176  * to external byte values 0x00..0x20.

   177  */

   178 static const int8_t

   179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={

   180 /*  0     1     2     3     4     5     6     7    */

   181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

   183 /*  8     9     a     b     c     d     e     f    */

   184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

   186 /*  10    11    12    13   */

   187     0x1c, 0x1d, 0x1e, 0x1f

   188 };

   190 /**

   191  * Integer division and modulo with negative numerators

   192  * yields negative modulo results and quotients that are one more than

   193  * what we need here.

   194  * This macro adjust the results so that the modulo-value m is always >=0.

   195  *

   196  * For positive n, the if() condition is always FALSE.

   197  *

   198  * @param n Number to be split into quotient and rest.

   199  *          Will be modified to contain the quotient.

   200  * @param d Divisor.

   201  * @param m Output variable for the rest (modulo result).

   202  */

   203 #define NEGDIVMOD(n, d, m) { \

   204     (m)=(n)%(d); \

   205     (n)/=(d); \

   206     if((m)<0) { \

   207         --(n); \

   208         (m)+=(d); \

   209     } \

   210 }

   212 /* Faster versions of packDiff() for single-byte-encoded diff values. */

   214 /** Is a diff value encodable in a single byte? */

   215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

   217 /** Encode a diff value in a single byte. */

   218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

   220 /** Is a diff value encodable in two bytes? */

   221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)

   223 /* BOCU-1 implementation functions ------------------------------------------ */

   225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)

   227 /**

   228  * Compute the next "previous" value for differencing

   229  * from the current code point.

   230  *

   231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)

   232  * @return "previous code point" state value

   233  */

   234 static inline int32_t

   235 bocu1Prev(int32_t c) {

   236     /* compute new prev */

   237     if(/* 0x3040<=c && */ c<=0x309f) {

   238         /* Hiragana is not 128-aligned */

   239         return 0x3070;

   240     } else if(0x4e00<=c && c<=0x9fa5) {

   241         /* CJK Unihan */

   242         return 0x4e00-BOCU1_REACH_NEG_2;

   243     } else if(0xac00<=c /* && c<=0xd7a3 */) {

   244         /* Korean Hangul */

   245         return (0xd7a3+0xac00)/2;

   246     } else {

   247         /* mostly small scripts */

   248         return BOCU1_SIMPLE_PREV(c);

   249     }

   250 }

   252 /** Fast version of bocu1Prev() for most scripts. */

   253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))

/*
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 * The UConverter fields are used as follows:
 *
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 *
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 */

/* BOCU-1-from-Unicode conversion functions --------------------------------- */

   267 /**

   268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes

   269  * and return a packed integer with them.

   270  *

   271  * The encoding favors small absolute differences with short encodings

   272  * to compress runs of same-script characters.

   273  *

   274  * Optimized version with unrolled loops and fewer floating-point operations

   275  * than the standard packDiff().

   276  *

   277  * @param diff difference value -0x10ffff..0x10ffff

   278  * @return

   279  *      0x010000zz for 1-byte sequence zz

   280  *      0x0200yyzz for 2-byte sequence yy zz

   281  *      0x03xxyyzz for 3-byte sequence xx yy zz

   282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)

   283  */

   284 static int32_t

   285 packDiff(int32_t diff) {

   286     int32_t result, m;

   288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */

   289     if(diff>=BOCU1_REACH_NEG_1) {

   290         /* mostly positive differences, and single-byte negative ones */

   291 #if 0   /* single-byte case handled in macros, see below */

   292         if(diff<=BOCU1_REACH_POS_1) {

   293             /* single byte */

   294             return 0x01000000|(BOCU1_MIDDLE+diff);

   295         } else

   296 #endif

   297         if(diff<=BOCU1_REACH_POS_2) {

   298             /* two bytes */

   299             diff-=BOCU1_REACH_POS_1+1;

   300             result=0x02000000;

   302             m=diff%BOCU1_TRAIL_COUNT;

   303             diff/=BOCU1_TRAIL_COUNT;

   304             result|=BOCU1_TRAIL_TO_BYTE(m);

   306             result|=(BOCU1_START_POS_2+diff)<<8;

   307         } else if(diff<=BOCU1_REACH_POS_3) {

   308             /* three bytes */

   309             diff-=BOCU1_REACH_POS_2+1;

   310             result=0x03000000;

   312             m=diff%BOCU1_TRAIL_COUNT;

   313             diff/=BOCU1_TRAIL_COUNT;

   314             result|=BOCU1_TRAIL_TO_BYTE(m);

   316             m=diff%BOCU1_TRAIL_COUNT;

   317             diff/=BOCU1_TRAIL_COUNT;

   318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

   320             result|=(BOCU1_START_POS_3+diff)<<16;

   321         } else {

   322             /* four bytes */

   323             diff-=BOCU1_REACH_POS_3+1;

   325             m=diff%BOCU1_TRAIL_COUNT;

   326             diff/=BOCU1_TRAIL_COUNT;

   327             result=BOCU1_TRAIL_TO_BYTE(m);

   329             m=diff%BOCU1_TRAIL_COUNT;

   330             diff/=BOCU1_TRAIL_COUNT;

   331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

   333             /*

   334              * We know that / and % would deliver quotient 0 and rest=diff.

   335              * Avoid division and modulo for performance.

   336              */

   337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;

   339             result|=((uint32_t)BOCU1_START_POS_4)<<24;

   340         }

   341     } else {

   342         /* two- to four-byte negative differences */

   343         if(diff>=BOCU1_REACH_NEG_2) {

   344             /* two bytes */

   345             diff-=BOCU1_REACH_NEG_1;

   346             result=0x02000000;

   348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   349             result|=BOCU1_TRAIL_TO_BYTE(m);

   351             result|=(BOCU1_START_NEG_2+diff)<<8;

   352         } else if(diff>=BOCU1_REACH_NEG_3) {

   353             /* three bytes */

   354             diff-=BOCU1_REACH_NEG_2;

   355             result=0x03000000;

   357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   358             result|=BOCU1_TRAIL_TO_BYTE(m);

   360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

   363             result|=(BOCU1_START_NEG_3+diff)<<16;

   364         } else {

   365             /* four bytes */

   366             diff-=BOCU1_REACH_NEG_3;

   368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   369             result=BOCU1_TRAIL_TO_BYTE(m);

   371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

   374             /*

   375              * We know that NEGDIVMOD would deliver

   376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.

   377              * Avoid division and modulo for performance.

   378              */

   379             m=diff+BOCU1_TRAIL_COUNT;

   380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;

   382             result|=BOCU1_MIN<<24;

   383         }

   384     }

   385     return result;

   386 }

   389 static void

   390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

   391                              UErrorCode *pErrorCode) {

   392     UConverter *cnv;

   393     const UChar *source, *sourceLimit;

   394     uint8_t *target;

   395     int32_t targetCapacity;

   396     int32_t *offsets;

   398     int32_t prev, c, diff;

   400     int32_t sourceIndex, nextSourceIndex;

   402 U_ALIGN_CODE(16)

   404     /* set up the local pointers */

   405     cnv=pArgs->converter;

   406     source=pArgs->source;

   407     sourceLimit=pArgs->sourceLimit;

   408     target=(uint8_t *)pArgs->target;

   409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

   410     offsets=pArgs->offsets;

   412     /* get the converter state from UConverter */

   413     c=cnv->fromUChar32;

   414     prev=(int32_t)cnv->fromUnicodeStatus;

   415     if(prev==0) {

   416         prev=BOCU1_ASCII_PREV;

   417     }

   419     /* sourceIndex=-1 if the current character began in the previous buffer */

   420     sourceIndex= c==0 ? 0 : -1;

   421     nextSourceIndex=0;

   423     /* conversion loop */

   424     if(c!=0 && targetCapacity>0) {

   425         goto getTrail;

   426     }

   428 fastSingle:

   429     /* fast loop for single-byte differences */

   430     /* use only one loop counter variable, targetCapacity, not also source */

   431     diff=(int32_t)(sourceLimit-source);

   432     if(targetCapacity>diff) {

   433         targetCapacity=diff;

   434     }

   435     while(targetCapacity>0 && (c=*source)<0x3000) {

   436         if(c<=0x20) {

   437             if(c!=0x20) {

   438                 prev=BOCU1_ASCII_PREV;

   439             }

   440             *target++=(uint8_t)c;

   441             *offsets++=nextSourceIndex++;

   442             ++source;

   443             --targetCapacity;

   444         } else {

   445             diff=c-prev;

   446             if(DIFF_IS_SINGLE(diff)) {

   447                 prev=BOCU1_SIMPLE_PREV(c);

   448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

   449                 *offsets++=nextSourceIndex++;

   450                 ++source;

   451                 --targetCapacity;

   452             } else {

   453                 break;

   454             }

   455         }

   456     }

   457     /* restore real values */

   458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

   459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

   461     /* regular loop for all cases */

   462     while(source<sourceLimit) {

   463         if(targetCapacity>0) {

   464             c=*source++;

   465             ++nextSourceIndex;

   467             if(c<=0x20) {

   468                 /*

   469                  * ISO C0 control & space:

   470                  * Encode directly for MIME compatibility,

   471                  * and reset state except for space, to not disrupt compression.

   472                  */

   473                 if(c!=0x20) {

   474                     prev=BOCU1_ASCII_PREV;

   475                 }

   476                 *target++=(uint8_t)c;

   477                 *offsets++=sourceIndex;

   478                 --targetCapacity;

   480                 sourceIndex=nextSourceIndex;

   481                 continue;

   482             }

   484             if(U16_IS_LEAD(c)) {

   485 getTrail:

   486                 if(source<sourceLimit) {

   487                     /* test the following code unit */

   488                     UChar trail=*source;

   489                     if(U16_IS_TRAIL(trail)) {

   490                         ++source;

   491                         ++nextSourceIndex;

   492                         c=U16_GET_SUPPLEMENTARY(c, trail);

   493                     }

   494                 } else {

   495                     /* no more input */

   496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */

   497                     break;

   498                 }

   499             }

   501             /*

   502              * all other Unicode code points c==U+0021..U+10ffff

   503              * are encoded with the difference c-prev

   504              *

   505              * a new prev is computed from c,

   506              * placed in the middle of a 0x80-block (for most small scripts) or

   507              * in the middle of the Unihan and Hangul blocks

   508              * to statistically minimize the following difference

   509              */

   510             diff=c-prev;

   511             prev=BOCU1_PREV(c);

   512             if(DIFF_IS_SINGLE(diff)) {

   513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

   514                 *offsets++=sourceIndex;

   515                 --targetCapacity;

   516                 sourceIndex=nextSourceIndex;

   517                 if(c<0x3000) {

   518                     goto fastSingle;

   519                 }

   520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {

   521                 /* optimize 2-byte case */

   522                 int32_t m;

   524                 if(diff>=0) {

   525                     diff-=BOCU1_REACH_POS_1+1;

   526                     m=diff%BOCU1_TRAIL_COUNT;

   527                     diff/=BOCU1_TRAIL_COUNT;

   528                     diff+=BOCU1_START_POS_2;

   529                 } else {

   530                     diff-=BOCU1_REACH_NEG_1;

   531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   532                     diff+=BOCU1_START_NEG_2;

   533                 }

   534                 *target++=(uint8_t)diff;

   535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);

   536                 *offsets++=sourceIndex;

   537                 *offsets++=sourceIndex;

   538                 targetCapacity-=2;

   539                 sourceIndex=nextSourceIndex;

   540             } else {

   541                 int32_t length; /* will be 2..4 */

   543                 diff=packDiff(diff);

   544                 length=BOCU1_LENGTH_FROM_PACKED(diff);

   546                 /* write the output character bytes from diff and length */

   547                 /* from the first if in the loop we know that targetCapacity>0 */

   548                 if(length<=targetCapacity) {

   549                     switch(length) {

   550                         /* each branch falls through to the next one */

   551                     case 4:

   552                         *target++=(uint8_t)(diff>>24);

   553                         *offsets++=sourceIndex;

   554                     case 3: /*fall through*/

   555                         *target++=(uint8_t)(diff>>16);

   556                         *offsets++=sourceIndex;

   557                     case 2: /*fall through*/

   558                         *target++=(uint8_t)(diff>>8);

   559                         *offsets++=sourceIndex;

   560                     /* case 1: handled above */

   561                         *target++=(uint8_t)diff;

   562                         *offsets++=sourceIndex;

   563                     default:

   564                         /* will never occur */

   565                         break;

   566                     }

   567                     targetCapacity-=length;

   568                     sourceIndex=nextSourceIndex;

   569                 } else {

   570                     uint8_t *charErrorBuffer;

   572                     /*

   573                      * We actually do this backwards here:

   574                      * In order to save an intermediate variable, we output

   575                      * first to the overflow buffer what does not fit into the

   576                      * regular target.

   577                      */

   578                     /* we know that 1<=targetCapacity<length<=4 */

   579                     length-=targetCapacity;

   580                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

   581                     switch(length) {

   582                         /* each branch falls through to the next one */

   583                     case 3:

   584                         *charErrorBuffer++=(uint8_t)(diff>>16);

   585                     case 2: /*fall through*/

   586                         *charErrorBuffer++=(uint8_t)(diff>>8);

   587                     case 1: /*fall through*/

   588                         *charErrorBuffer=(uint8_t)diff;

   589                     default:

   590                         /* will never occur */

   591                         break;

   592                     }

   593                     cnv->charErrorBufferLength=(int8_t)length;

   595                     /* now output what fits into the regular target */

   596                     diff>>=8*length; /* length was reduced by targetCapacity */

   597                     switch(targetCapacity) {

   598                         /* each branch falls through to the next one */

   599                     case 3:

   600                         *target++=(uint8_t)(diff>>16);

   601                         *offsets++=sourceIndex;

   602                     case 2: /*fall through*/

   603                         *target++=(uint8_t)(diff>>8);

   604                         *offsets++=sourceIndex;

   605                     case 1: /*fall through*/

   606                         *target++=(uint8_t)diff;

   607                         *offsets++=sourceIndex;

   608                     default:

   609                         /* will never occur */

   610                         break;

   611                     }

   613                     /* target overflow */

   614                     targetCapacity=0;

   615                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   616                     break;

   617                 }

   618             }

   619         } else {

   620             /* target is full */

   621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   622             break;

   623         }

   624     }

   626     /* set the converter state back into UConverter */

   627     cnv->fromUChar32= c<0 ? -c : 0;

   628     cnv->fromUnicodeStatus=(uint32_t)prev;

   630     /* write back the updated pointers */

   631     pArgs->source=source;

   632     pArgs->target=(char *)target;

   633     pArgs->offsets=offsets;

   634 }

   636 /*

   637  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.

   638  * If a change is made in the original function, then either

   639  * change this function the same way or

   640  * re-copy the original function and remove the variables

   641  * offsets, sourceIndex, and nextSourceIndex.

   642  */

   643 static void

   644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,

   645                   UErrorCode *pErrorCode) {

   646     UConverter *cnv;

   647     const UChar *source, *sourceLimit;

   648     uint8_t *target;

   649     int32_t targetCapacity;

   651     int32_t prev, c, diff;

   653     /* set up the local pointers */

   654     cnv=pArgs->converter;

   655     source=pArgs->source;

   656     sourceLimit=pArgs->sourceLimit;

   657     target=(uint8_t *)pArgs->target;

   658     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

   660     /* get the converter state from UConverter */

   661     c=cnv->fromUChar32;

   662     prev=(int32_t)cnv->fromUnicodeStatus;

   663     if(prev==0) {

   664         prev=BOCU1_ASCII_PREV;

   665     }

   667     /* conversion loop */

   668     if(c!=0 && targetCapacity>0) {

   669         goto getTrail;

   670     }

   672 fastSingle:

   673     /* fast loop for single-byte differences */

   674     /* use only one loop counter variable, targetCapacity, not also source */

   675     diff=(int32_t)(sourceLimit-source);

   676     if(targetCapacity>diff) {

   677         targetCapacity=diff;

   678     }

   679     while(targetCapacity>0 && (c=*source)<0x3000) {

   680         if(c<=0x20) {

   681             if(c!=0x20) {

   682                 prev=BOCU1_ASCII_PREV;

   683             }

   684             *target++=(uint8_t)c;

   685         } else {

   686             diff=c-prev;

   687             if(DIFF_IS_SINGLE(diff)) {

   688                 prev=BOCU1_SIMPLE_PREV(c);

   689                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

   690             } else {

   691                 break;

   692             }

   693         }

   694         ++source;

   695         --targetCapacity;

   696     }

   697     /* restore real values */

   698     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

   700     /* regular loop for all cases */

   701     while(source<sourceLimit) {

   702         if(targetCapacity>0) {

   703             c=*source++;

   705             if(c<=0x20) {

   706                 /*

   707                  * ISO C0 control & space:

   708                  * Encode directly for MIME compatibility,

   709                  * and reset state except for space, to not disrupt compression.

   710                  */

   711                 if(c!=0x20) {

   712                     prev=BOCU1_ASCII_PREV;

   713                 }

   714                 *target++=(uint8_t)c;

   715                 --targetCapacity;

   716                 continue;

   717             }

   719             if(U16_IS_LEAD(c)) {

   720 getTrail:

   721                 if(source<sourceLimit) {

   722                     /* test the following code unit */

   723                     UChar trail=*source;

   724                     if(U16_IS_TRAIL(trail)) {

   725                         ++source;

   726                         c=U16_GET_SUPPLEMENTARY(c, trail);

   727                     }

   728                 } else {

   729                     /* no more input */

   730                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */

   731                     break;

   732                 }

   733             }

   735             /*

   736              * all other Unicode code points c==U+0021..U+10ffff

   737              * are encoded with the difference c-prev

   738              *

   739              * a new prev is computed from c,

   740              * placed in the middle of a 0x80-block (for most small scripts) or

   741              * in the middle of the Unihan and Hangul blocks

   742              * to statistically minimize the following difference

   743              */

   744             diff=c-prev;

   745             prev=BOCU1_PREV(c);

   746             if(DIFF_IS_SINGLE(diff)) {

   747                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

   748                 --targetCapacity;

   749                 if(c<0x3000) {

   750                     goto fastSingle;

   751                 }

   752             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {

   753                 /* optimize 2-byte case */

   754                 int32_t m;

   756                 if(diff>=0) {

   757                     diff-=BOCU1_REACH_POS_1+1;

   758                     m=diff%BOCU1_TRAIL_COUNT;

   759                     diff/=BOCU1_TRAIL_COUNT;

   760                     diff+=BOCU1_START_POS_2;

   761                 } else {

   762                     diff-=BOCU1_REACH_NEG_1;

   763                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

   764                     diff+=BOCU1_START_NEG_2;

   765                 }

   766                 *target++=(uint8_t)diff;

   767                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);

   768                 targetCapacity-=2;

   769             } else {

   770                 int32_t length; /* will be 2..4 */

   772                 diff=packDiff(diff);

   773                 length=BOCU1_LENGTH_FROM_PACKED(diff);

   775                 /* write the output character bytes from diff and length */

   776                 /* from the first if in the loop we know that targetCapacity>0 */

   777                 if(length<=targetCapacity) {

   778                     switch(length) {

   779                         /* each branch falls through to the next one */

   780                     case 4:

   781                         *target++=(uint8_t)(diff>>24);

   782                     case 3: /*fall through*/

   783                         *target++=(uint8_t)(diff>>16);

   784                     /* case 2: handled above */

   785                         *target++=(uint8_t)(diff>>8);

   786                     /* case 1: handled above */

   787                         *target++=(uint8_t)diff;

   788                     default:

   789                         /* will never occur */

   790                         break;

   791                     }

   792                     targetCapacity-=length;

   793                 } else {

   794                     uint8_t *charErrorBuffer;

   796                     /*

   797                      * We actually do this backwards here:

   798                      * In order to save an intermediate variable, we output

   799                      * first to the overflow buffer what does not fit into the

   800                      * regular target.

   801                      */

   802                     /* we know that 1<=targetCapacity<length<=4 */

   803                     length-=targetCapacity;

   804                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

   805                     switch(length) {

   806                         /* each branch falls through to the next one */

   807                     case 3:

   808                         *charErrorBuffer++=(uint8_t)(diff>>16);

   809                     case 2: /*fall through*/

   810                         *charErrorBuffer++=(uint8_t)(diff>>8);

   811                     case 1: /*fall through*/

   812                         *charErrorBuffer=(uint8_t)diff;

   813                     default:

   814                         /* will never occur */

   815                         break;

   816                     }

   817                     cnv->charErrorBufferLength=(int8_t)length;

   819                     /* now output what fits into the regular target */

   820                     diff>>=8*length; /* length was reduced by targetCapacity */

   821                     switch(targetCapacity) {

   822                         /* each branch falls through to the next one */

   823                     case 3:

   824                         *target++=(uint8_t)(diff>>16);

   825                     case 2: /*fall through*/

   826                         *target++=(uint8_t)(diff>>8);

   827                     case 1: /*fall through*/

   828                         *target++=(uint8_t)diff;

   829                     default:

   830                         /* will never occur */

   831                         break;

   832                     }

   834                     /* target overflow */

   835                     targetCapacity=0;

   836                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   837                     break;

   838                 }

   839             }

   840         } else {

   841             /* target is full */

   842             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   843             break;

   844         }

   845     }

   847     /* set the converter state back into UConverter */

   848     cnv->fromUChar32= c<0 ? -c : 0;

   849     cnv->fromUnicodeStatus=(uint32_t)prev;

   851     /* write back the updated pointers */

   852     pArgs->source=source;

   853     pArgs->target=(char *)target;

   854 }

   856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */

   858 /**

   859  * Function for BOCU-1 decoder; handles multi-byte lead bytes.

   860  *

   861  * @param b lead byte;

   862  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD

   863  * @return (diff<<2)|count

   864  */

   865 static inline int32_t

   866 decodeBocu1LeadByte(int32_t b) {

   867     int32_t diff, count;

   869     if(b>=BOCU1_START_NEG_2) {

   870         /* positive difference */

   871         if(b<BOCU1_START_POS_3) {

   872             /* two bytes */

   873             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;

   874             count=1;

   875         } else if(b<BOCU1_START_POS_4) {

   876             /* three bytes */

   877             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;

   878             count=2;

   879         } else {

   880             /* four bytes */

   881             diff=BOCU1_REACH_POS_3+1;

   882             count=3;

   883         }

   884     } else {

   885         /* negative difference */

   886         if(b>=BOCU1_START_NEG_3) {

   887             /* two bytes */

   888             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;

   889             count=1;

   890         } else if(b>BOCU1_MIN) {

   891             /* three bytes */

   892             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;

   893             count=2;

   894         } else {

   895             /* four bytes */

   896             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;

   897             count=3;

   898         }

   899     }

   901     /* return the state for decoding the trail byte(s) */

   902     return (diff<<2)|count;

   903 }

   905 /**

   906  * Function for BOCU-1 decoder; handles multi-byte trail bytes.

   907  *

   908  * @param count number of remaining trail bytes including this one

   909  * @param b trail byte

   910  * @return new delta for diff including b - <0 indicates an error

   911  *

   912  * @see decodeBocu1

   913  */

   914 static inline int32_t

   915 decodeBocu1TrailByte(int32_t count, int32_t b) {

   916     if(b<=0x20) {

   917         /* skip some C0 controls and make the trail byte range contiguous */

   918         b=bocu1ByteToTrail[b];

   919         /* b<0 for an illegal trail byte value will result in return<0 below */

   920 #if BOCU1_MAX_TRAIL<0xff

   921     } else if(b>BOCU1_MAX_TRAIL) {

   922         return -99;

   923 #endif

   924     } else {

   925         b-=BOCU1_TRAIL_BYTE_OFFSET;

   926     }

   928     /* add trail byte into difference and decrement count */

   929     if(count==1) {

   930         return b;

   931     } else if(count==2) {

   932         return b*BOCU1_TRAIL_COUNT;

   933     } else /* count==3 */ {

   934         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);

   935     }

   936 }

   938 static void

   939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

   940                            UErrorCode *pErrorCode) {

   941     UConverter *cnv;

   942     const uint8_t *source, *sourceLimit;

   943     UChar *target;

   944     const UChar *targetLimit;

   945     int32_t *offsets;

   947     int32_t prev, count, diff, c;

   949     int8_t byteIndex;

   950     uint8_t *bytes;

   952     int32_t sourceIndex, nextSourceIndex;

   954     /* set up the local pointers */

   955     cnv=pArgs->converter;

   956     source=(const uint8_t *)pArgs->source;

   957     sourceLimit=(const uint8_t *)pArgs->sourceLimit;

   958     target=pArgs->target;

   959     targetLimit=pArgs->targetLimit;

   960     offsets=pArgs->offsets;

   962     /* get the converter state from UConverter */

   963     prev=(int32_t)cnv->toUnicodeStatus;

   964     if(prev==0) {

   965         prev=BOCU1_ASCII_PREV;

   966     }

   967     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */

   968     count=diff&3;

   969     diff>>=2;

   971     byteIndex=cnv->toULength;

   972     bytes=cnv->toUBytes;

   974     /* sourceIndex=-1 if the current character began in the previous buffer */

   975     sourceIndex=byteIndex==0 ? 0 : -1;

   976     nextSourceIndex=0;

   978     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */

   979     if(count>0 && byteIndex>0 && target<targetLimit) {

   980         goto getTrail;

   981     }

   983 fastSingle:

   984     /* fast loop for single-byte differences */

   985     /* use count as the only loop counter variable */

   986     diff=(int32_t)(sourceLimit-source);

   987     count=(int32_t)(pArgs->targetLimit-target);

   988     if(count>diff) {

   989         count=diff;

   990     }

   991     while(count>0) {

   992         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {

   993             c=prev+(c-BOCU1_MIDDLE);

   994             if(c<0x3000) {

   995                 *target++=(UChar)c;

   996                 *offsets++=nextSourceIndex++;

   997                 prev=BOCU1_SIMPLE_PREV(c);

   998             } else {

   999                 break;

  1000             }

  1001         } else if(c<=0x20) {

  1002             if(c!=0x20) {

  1003                 prev=BOCU1_ASCII_PREV;

  1004             }

  1005             *target++=(UChar)c;

  1006             *offsets++=nextSourceIndex++;

  1007         } else {

  1008             break;

  1009         }

  1010         ++source;

  1011         --count;

  1012     }

  1013     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

  1015     /* decode a sequence of single and lead bytes */

  1016     while(source<sourceLimit) {

  1017         if(target>=targetLimit) {

  1018             /* target is full */

  1019             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1020             break;

  1021         }

  1023         ++nextSourceIndex;

  1024         c=*source++;

  1025         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {

  1026             /* Write a code point directly from a single-byte difference. */

  1027             c=prev+(c-BOCU1_MIDDLE);

  1028             if(c<0x3000) {

  1029                 *target++=(UChar)c;

  1030                 *offsets++=sourceIndex;

  1031                 prev=BOCU1_SIMPLE_PREV(c);

  1032                 sourceIndex=nextSourceIndex;

  1033                 goto fastSingle;

  1034             }

  1035         } else if(c<=0x20) {

  1036             /*

  1037              * Direct-encoded C0 control code or space.

  1038              * Reset prev for C0 control codes but not for space.

  1039              */

  1040             if(c!=0x20) {

  1041                 prev=BOCU1_ASCII_PREV;

  1042             }

  1043             *target++=(UChar)c;

  1044             *offsets++=sourceIndex;

  1045             sourceIndex=nextSourceIndex;

  1046             continue;

  1047         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {

  1048             /* Optimize two-byte case. */

  1049             if(c>=BOCU1_MIDDLE) {

  1050                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;

  1051             } else {

  1052                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;

  1053             }

  1055             /* trail byte */

  1056             ++nextSourceIndex;

  1057             c=decodeBocu1TrailByte(1, *source++);

  1058             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {

  1059                 bytes[0]=source[-2];

  1060                 bytes[1]=source[-1];

  1061                 byteIndex=2;

  1062                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1063                 break;

  1064             }

  1065         } else if(c==BOCU1_RESET) {

  1066             /* only reset the state, no code point */

  1067             prev=BOCU1_ASCII_PREV;

  1068             sourceIndex=nextSourceIndex;

  1069             continue;

  1070         } else {

  1071             /*

  1072              * For multi-byte difference lead bytes, set the decoder state

  1073              * with the partial difference value from the lead byte and

  1074              * with the number of trail bytes.

  1075              */

  1076             bytes[0]=(uint8_t)c;

  1077             byteIndex=1;

  1079             diff=decodeBocu1LeadByte(c);

  1080             count=diff&3;

  1081             diff>>=2;

  1082 getTrail:

  1083             for(;;) {

  1084                 if(source>=sourceLimit) {

  1085                     goto endloop;

  1086                 }

  1087                 ++nextSourceIndex;

  1088                 c=bytes[byteIndex++]=*source++;

  1090                 /* trail byte in any position */

  1091                 c=decodeBocu1TrailByte(count, c);

  1092                 if(c<0) {

  1093                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1094                     goto endloop;

  1095                 }

  1097                 diff+=c;

  1098                 if(--count==0) {

  1099                     /* final trail byte, deliver a code point */

  1100                     byteIndex=0;

  1101                     c=prev+diff;

  1102                     if((uint32_t)c>0x10ffff) {

  1103                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1104                         goto endloop;

  1105                     }

  1106                     break;

  1107                 }

  1108             }

  1109         }

  1111         /* calculate the next prev and output c */

  1112         prev=BOCU1_PREV(c);

  1113         if(c<=0xffff) {

  1114             *target++=(UChar)c;

  1115             *offsets++=sourceIndex;

  1116         } else {

  1117             /* output surrogate pair */

  1118             *target++=U16_LEAD(c);

  1119             if(target<targetLimit) {

  1120                 *target++=U16_TRAIL(c);

  1121                 *offsets++=sourceIndex;

  1122                 *offsets++=sourceIndex;

  1123             } else {

  1124                 /* target overflow */

  1125                 *offsets++=sourceIndex;

  1126                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);

  1127                 cnv->UCharErrorBufferLength=1;

  1128                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1129                 break;

  1130             }

  1131         }

  1132         sourceIndex=nextSourceIndex;

  1133     }

  1134 endloop:

  1136     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {

  1137         /* set the converter state in UConverter to deal with the next character */

  1138         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;

  1139         cnv->mode=0;

  1140     } else {

  1141         /* set the converter state back into UConverter */

  1142         cnv->toUnicodeStatus=(uint32_t)prev;

  1143         cnv->mode=(diff<<2)|count;

  1144     }

  1145     cnv->toULength=byteIndex;

  1147     /* write back the updated pointers */

  1148     pArgs->source=(const char *)source;

  1149     pArgs->target=target;

  1150     pArgs->offsets=offsets;

  1151     return;

  1152 }

  1154 /*

  1155  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.

  1156  * If a change is made in the original function, then either

  1157  * change this function the same way or

  1158  * re-copy the original function and remove the variables

  1159  * offsets, sourceIndex, and nextSourceIndex.

  1160  */

  1161 static void

  1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,

  1163                 UErrorCode *pErrorCode) {

  1164     UConverter *cnv;

  1165     const uint8_t *source, *sourceLimit;

  1166     UChar *target;

  1167     const UChar *targetLimit;

  1169     int32_t prev, count, diff, c;

  1171     int8_t byteIndex;

  1172     uint8_t *bytes;

  1174 U_ALIGN_CODE(16)

  1176     /* set up the local pointers */

  1177     cnv=pArgs->converter;

  1178     source=(const uint8_t *)pArgs->source;

  1179     sourceLimit=(const uint8_t *)pArgs->sourceLimit;

  1180     target=pArgs->target;

  1181     targetLimit=pArgs->targetLimit;

  1183     /* get the converter state from UConverter */

  1184     prev=(int32_t)cnv->toUnicodeStatus;

  1185     if(prev==0) {

  1186         prev=BOCU1_ASCII_PREV;

  1187     }

  1188     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */

  1189     count=diff&3;

  1190     diff>>=2;

  1192     byteIndex=cnv->toULength;

  1193     bytes=cnv->toUBytes;

  1195     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */

  1196     if(count>0 && byteIndex>0 && target<targetLimit) {

  1197         goto getTrail;

  1198     }

  1200 fastSingle:

  1201     /* fast loop for single-byte differences */

  1202     /* use count as the only loop counter variable */

  1203     diff=(int32_t)(sourceLimit-source);

  1204     count=(int32_t)(pArgs->targetLimit-target);

  1205     if(count>diff) {

  1206         count=diff;

  1207     }

  1208     while(count>0) {

  1209         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {

  1210             c=prev+(c-BOCU1_MIDDLE);

  1211             if(c<0x3000) {

  1212                 *target++=(UChar)c;

  1213                 prev=BOCU1_SIMPLE_PREV(c);

  1214             } else {

  1215                 break;

  1216             }

  1217         } else if(c<=0x20) {

  1218             if(c!=0x20) {

  1219                 prev=BOCU1_ASCII_PREV;

  1220             }

  1221             *target++=(UChar)c;

  1222         } else {

  1223             break;

  1224         }

  1225         ++source;

  1226         --count;

  1227     }

  1229     /* decode a sequence of single and lead bytes */

  1230     while(source<sourceLimit) {

  1231         if(target>=targetLimit) {

  1232             /* target is full */

  1233             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1234             break;

  1235         }

  1237         c=*source++;

  1238         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {

  1239             /* Write a code point directly from a single-byte difference. */

  1240             c=prev+(c-BOCU1_MIDDLE);

  1241             if(c<0x3000) {

  1242                 *target++=(UChar)c;

  1243                 prev=BOCU1_SIMPLE_PREV(c);

  1244                 goto fastSingle;

  1245             }

  1246         } else if(c<=0x20) {

  1247             /*

  1248              * Direct-encoded C0 control code or space.

  1249              * Reset prev for C0 control codes but not for space.

  1250              */

  1251             if(c!=0x20) {

  1252                 prev=BOCU1_ASCII_PREV;

  1253             }

  1254             *target++=(UChar)c;

  1255             continue;

  1256         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {

  1257             /* Optimize two-byte case. */

  1258             if(c>=BOCU1_MIDDLE) {

  1259                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;

  1260             } else {

  1261                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;

  1262             }

  1264             /* trail byte */

  1265             c=decodeBocu1TrailByte(1, *source++);

  1266             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {

  1267                 bytes[0]=source[-2];

  1268                 bytes[1]=source[-1];

  1269                 byteIndex=2;

  1270                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1271                 break;

  1272             }

  1273         } else if(c==BOCU1_RESET) {

  1274             /* only reset the state, no code point */

  1275             prev=BOCU1_ASCII_PREV;

  1276             continue;

  1277         } else {

  1278             /*

  1279              * For multi-byte difference lead bytes, set the decoder state

  1280              * with the partial difference value from the lead byte and

  1281              * with the number of trail bytes.

  1282              */

  1283             bytes[0]=(uint8_t)c;

  1284             byteIndex=1;

  1286             diff=decodeBocu1LeadByte(c);

  1287             count=diff&3;

  1288             diff>>=2;

  1289 getTrail:

  1290             for(;;) {

  1291                 if(source>=sourceLimit) {

  1292                     goto endloop;

  1293                 }

  1294                 c=bytes[byteIndex++]=*source++;

  1296                 /* trail byte in any position */

  1297                 c=decodeBocu1TrailByte(count, c);

  1298                 if(c<0) {

  1299                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1300                     goto endloop;

  1301                 }

  1303                 diff+=c;

  1304                 if(--count==0) {

  1305                     /* final trail byte, deliver a code point */

  1306                     byteIndex=0;

  1307                     c=prev+diff;

  1308                     if((uint32_t)c>0x10ffff) {

  1309                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1310                         goto endloop;

  1311                     }

  1312                     break;

  1313                 }

  1314             }

  1315         }

  1317         /* calculate the next prev and output c */

  1318         prev=BOCU1_PREV(c);

  1319         if(c<=0xffff) {

  1320             *target++=(UChar)c;

  1321         } else {

  1322             /* output surrogate pair */

  1323             *target++=U16_LEAD(c);

  1324             if(target<targetLimit) {

  1325                 *target++=U16_TRAIL(c);

  1326             } else {

  1327                 /* target overflow */

  1328                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);

  1329                 cnv->UCharErrorBufferLength=1;

  1330                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1331                 break;

  1332             }

  1333         }

  1334     }

  1335 endloop:

  1337     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {

  1338         /* set the converter state in UConverter to deal with the next character */

  1339         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;

  1340         cnv->mode=0;

  1341     } else {

  1342         /* set the converter state back into UConverter */

  1343         cnv->toUnicodeStatus=(uint32_t)prev;

  1344         cnv->mode=(diff<<2)|count;

  1345     }

  1346     cnv->toULength=byteIndex;

  1348     /* write back the updated pointers */

  1349     pArgs->source=(const char *)source;

  1350     pArgs->target=target;

  1351     return;

  1352 }

  1354 /* miscellaneous ------------------------------------------------------------ */

  1356 static const UConverterImpl _Bocu1Impl={

  1357     UCNV_BOCU1,

  1359     NULL,

  1360     NULL,

  1362     NULL,

  1363     NULL,

  1364     NULL,

  1366     _Bocu1ToUnicode,

  1367     _Bocu1ToUnicodeWithOffsets,

  1368     _Bocu1FromUnicode,

  1369     _Bocu1FromUnicodeWithOffsets,

  1370     NULL,

  1372     NULL,

  1373     NULL,

  1374     NULL,

  1375     NULL,

  1376     ucnv_getCompleteUnicodeSet,

  1378     NULL,

  1379     NULL

  1380 };

  1382 static const UConverterStaticData _Bocu1StaticData={

  1383     sizeof(UConverterStaticData),

  1384     "BOCU-1",

  1385     1214, /* CCSID for BOCU-1 */

  1386     UCNV_IBM, UCNV_BOCU1,

  1387     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */

  1388     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */

  1389     FALSE, FALSE,

  1390     0,

  1391     0,

  1392     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1393 };

  1395 const UConverterSharedData _Bocu1Data={

  1396     sizeof(UConverterSharedData), ~((uint32_t)0),

  1397     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,

  1398     0,

  1399     UCNV_MBCS_TABLE_INITIALIZER

  1400 };

  1402 #endif

The Tor Browser / file revision

intl/icu/source/common/ucnvbocu.cpp@fc2d59ddac77

intl/icu/source/common/ucnvbocu.cpp