intl/icu/source/common/ucnvbocu.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2002-2011, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  ucnvbocu.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2002mar27
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This is an implementation of the Binary Ordered Compression for Unicode,
    17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
    18 */
    20 #include "unicode/utypes.h"
    22 #if !UCONFIG_NO_CONVERSION
    24 #include "unicode/ucnv.h"
    25 #include "unicode/ucnv_cb.h"
    26 #include "unicode/utf16.h"
    27 #include "putilimp.h"
    28 #include "ucnv_bld.h"
    29 #include "ucnv_cnv.h"
    30 #include "uassert.h"
    32 /* BOCU-1 constants and macros ---------------------------------------------- */
    34 /*
    35  * BOCU-1 encodes the code points of a Unicode string as
    36  * a sequence of byte-encoded differences (slope detection),
    37  * preserving lexical order.
    38  *
    39  * Optimize the difference-taking for runs of Unicode text within
    40  * small scripts:
    41  *
    42  * Most small scripts are allocated within aligned 128-blocks of Unicode
    43  * code points. Lexical order is preserved if the "previous code point" state
    44  * is always moved into the middle of such a block.
    45  *
    46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
    47  * areas into the middle of those areas.
    48  *
    49  * C0 control codes and space are encoded with their US-ASCII bytes.
    50  * "prev" is reset for C0 controls but not for space.
    51  */
    53 /* initial value for "prev": middle of the ASCII range */
    54 #define BOCU1_ASCII_PREV        0x40
    56 /* bounding byte values for differences */
    57 #define BOCU1_MIN               0x21
    58 #define BOCU1_MIDDLE            0x90
    59 #define BOCU1_MAX_LEAD          0xfe
    60 #define BOCU1_MAX_TRAIL         0xff
    61 #define BOCU1_RESET             0xff
    63 /* number of lead bytes: 0xfe-0x21+1 = 222 */
    64 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
    66 /*
        * adjust trail byte counts for the use of some C0 control byte values:
        * 20 control byte values serve as additional trail bytes,
        * so the offset from trail value to byte value is 0x21-20 = 13
        */
    67 #define BOCU1_TRAIL_CONTROLS_COUNT  20
    68 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
    70 /* number of trail bytes: (0xff-0x21+1)+20 = 243 */
    71 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
    73 /*
    74  * number of positive and negative single-byte codes
    75  * (counting 0==BOCU1_MIDDLE among the positive ones)
    76  */
    77 #define BOCU1_SINGLE            64
    79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
    80 #define BOCU1_LEAD_2            43
    81 #define BOCU1_LEAD_3            3
    82 #define BOCU1_LEAD_4            1
    84 /* The difference value range for single-byters: -64..63. */
    85 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
    86 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
    88 /* The difference value range for double-byters: -10513..10512 (43*243 values beyond the single-byte range on each side). */
    89 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    90 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    92 /* The difference value range for 3-byters: -187660..187659 (3*243*243 values beyond the double-byte range on each side). */
    93 #define BOCU1_REACH_POS_3   \
    94     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
    96 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
    98 /* The lead byte start values: 0xd0, 0xfb, 0xfe for positive and 0x50, 0x25, 0x22 for negative differences. */
    99 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
   100 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
   101 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
   102      /* ==BOCU1_MAX_LEAD */
   104 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
   105 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
   106 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
   107      /* ==BOCU1_MIN+1 */
   109 /*
        * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
        * Single-byte leads lie in 0x50..0xcf; each longer length adds a band of
        * lead bytes below and above the previous one.
        */
   110 #define BOCU1_LENGTH_FROM_LEAD(lead) \
   111     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
   112      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
   113      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
   115 /*
        * The length of a byte sequence, according to its packed form.
        * Packed 1..3-byte sequences carry their length in the top byte (0x01..0x03).
        * A 4-byte sequence has its lead byte there instead (0x21 or 0xfe),
        * so any value >=0x04000000 means length 4. The comparison is done on
        * uint32_t so that a top byte of 0xfe, which makes an int32_t negative,
        * still compares as large.
        */
   116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
   117     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
   119 /*
   120  * 12 commonly used C0 control codes (and space) are only used to encode
   121  * themselves directly,
   122  * which makes BOCU-1 MIME-usable and reasonably safe for
   123  * ASCII-oriented software.
   124  *
   125  * These controls are
   126  *  0   NUL
   127  *
   128  *  7   BEL
   129  *  8   BS
   130  *
   131  *  9   TAB
   132  *  a   LF
   133  *  b   VT
   134  *  c   FF
   135  *  d   CR
   136  *
   137  *  e   SO
   138  *  f   SI
   139  *
   140  * 1a   SUB
   141  * 1b   ESC
   142  *
   143  * The other 20 C0 controls are also encoded directly (to preserve order)
   144  * but are also used as trail bytes in difference encoding
   145  * (for better compression).
   146  */
   147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
   149 /*
   150  * Byte value map for control codes,
   151  * from external byte values 0x00..0x20
   152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
   153  * External byte values that are illegal as trail bytes are mapped to -1.
   154  */
   155 static const int8_t
   156 bocu1ByteToTrail[BOCU1_MIN]={
   157 /*  0     1     2     3     4     5     6     7    */
   158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
   160 /*  8     9     a     b     c     d     e     f    */
   161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
   163 /*  10    11    12    13    14    15    16    17   */
   164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
   166 /*  18    19    1a    1b    1c    1d    1e    1f   */
   167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
   169 /*  20   */
   170     -1
   171 };
   173 /*
   174  * Byte value map for control codes,
   175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
   176  * to external byte values 0x00..0x20.
   177  */
   178 static const int8_t
   179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
   180 /*  0     1     2     3     4     5     6     7    */
   181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
   183 /*  8     9     a     b     c     d     e     f    */
   184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
   186 /*  10    11    12    13   */
   187     0x1c, 0x1d, 0x1e, 0x1f
   188 };
   /**
    * Integer division and modulo with negative numerators
    * yields negative modulo results and quotients that are one more than
    * what we need here.
    * This macro adjusts the results so that the modulo-value m is always >=0.
    *
    * For non-negative n, the if() condition is always FALSE.
    *
    * The body is wrapped in do { ... } while(0) so that "NEGDIVMOD(n, d, m);"
    * is exactly one statement and can be used in an unbraced if/else.
    * Each argument is evaluated more than once, so pass plain variables
    * and constants without side effects.
    *
    * @param n Number to be split into quotient and rest.
    *          Will be modified to contain the quotient.
    * @param d Divisor.
    * @param m Output variable for the rest (modulo result).
    */
   #define NEGDIVMOD(n, d, m) do { \
       (m)=(n)%(d); \
       (n)/=(d); \
       if((m)<0) { \
           --(n); \
           (m)+=(d); \
       } \
   } while(0)
   212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
   214 /** Is a diff value encodable in a single byte? */
   215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
   217 /** Encode a diff value in a single byte. */
   218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
   220 /** Is a diff value encodable in two bytes? */
   221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
   223 /* BOCU-1 implementation functions ------------------------------------------ */
   225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
   227 /**
   228  * Compute the next "previous" value for differencing
   229  * from the current code point.
   230  *
   231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
   232  * @return "previous code point" state value
   233  */
   234 static inline int32_t
   235 bocu1Prev(int32_t c) {
   236     /* compute new prev */
   237     if(/* 0x3040<=c && */ c<=0x309f) {
   238         /* Hiragana is not 128-aligned */
   239         return 0x3070;
   240     } else if(0x4e00<=c && c<=0x9fa5) {
   241         /* CJK Unihan */
   242         return 0x4e00-BOCU1_REACH_NEG_2;
   243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
   244         /* Korean Hangul */
   245         return (0xd7a3+0xac00)/2;
   246     } else {
   247         /* mostly small scripts */
   248         return BOCU1_SIMPLE_PREV(c);
   249     }
   250 }
   252 /** Fast version of bocu1Prev() for most scripts. */
   253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
   255 /*
   256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
   257  * The UConverter fields are used as follows:
   258  *
   259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
   260  *
   261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
   262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
   263  */
   265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
   267 /**
   268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
   269  * and return a packed integer with them.
   270  *
   271  * The encoding favors small absolute differences with short encodings
   272  * to compress runs of same-script characters.
   273  *
   274  * Optimized version with unrolled loops and fewer floating-point operations
   275  * than the standard packDiff().
   276  *
   277  * @param diff difference value -0x10ffff..0x10ffff
   278  * @return
   279  *      0x010000zz for 1-byte sequence zz
   280  *      0x0200yyzz for 2-byte sequence yy zz
   281  *      0x03xxyyzz for 3-byte sequence xx yy zz
   282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
   283  */
   284 static int32_t
   285 packDiff(int32_t diff) {
   286     int32_t result, m;
   288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
   289     if(diff>=BOCU1_REACH_NEG_1) {
   290         /* mostly positive differences, and single-byte negative ones */
   291 #if 0   /* single-byte case handled in macros, see below */
   292         if(diff<=BOCU1_REACH_POS_1) {
   293             /* single byte */
   294             return 0x01000000|(BOCU1_MIDDLE+diff);
   295         } else
   296 #endif
   297         if(diff<=BOCU1_REACH_POS_2) {
   298             /* two bytes */
   299             diff-=BOCU1_REACH_POS_1+1;
   300             result=0x02000000;
   302             m=diff%BOCU1_TRAIL_COUNT;
   303             diff/=BOCU1_TRAIL_COUNT;
   304             result|=BOCU1_TRAIL_TO_BYTE(m);
   306             result|=(BOCU1_START_POS_2+diff)<<8;
   307         } else if(diff<=BOCU1_REACH_POS_3) {
   308             /* three bytes */
   309             diff-=BOCU1_REACH_POS_2+1;
   310             result=0x03000000;
   312             m=diff%BOCU1_TRAIL_COUNT;
   313             diff/=BOCU1_TRAIL_COUNT;
   314             result|=BOCU1_TRAIL_TO_BYTE(m);
   316             m=diff%BOCU1_TRAIL_COUNT;
   317             diff/=BOCU1_TRAIL_COUNT;
   318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   320             result|=(BOCU1_START_POS_3+diff)<<16;
   321         } else {
   322             /* four bytes */
   323             diff-=BOCU1_REACH_POS_3+1;
   325             m=diff%BOCU1_TRAIL_COUNT;
   326             diff/=BOCU1_TRAIL_COUNT;
   327             result=BOCU1_TRAIL_TO_BYTE(m);
   329             m=diff%BOCU1_TRAIL_COUNT;
   330             diff/=BOCU1_TRAIL_COUNT;
   331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   333             /*
   334              * We know that / and % would deliver quotient 0 and rest=diff.
   335              * Avoid division and modulo for performance.
   336              */
   337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
   339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
   340         }
   341     } else {
   342         /* two- to four-byte negative differences */
   343         if(diff>=BOCU1_REACH_NEG_2) {
   344             /* two bytes */
   345             diff-=BOCU1_REACH_NEG_1;
   346             result=0x02000000;
   348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   349             result|=BOCU1_TRAIL_TO_BYTE(m);
   351             result|=(BOCU1_START_NEG_2+diff)<<8;
   352         } else if(diff>=BOCU1_REACH_NEG_3) {
   353             /* three bytes */
   354             diff-=BOCU1_REACH_NEG_2;
   355             result=0x03000000;
   357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   358             result|=BOCU1_TRAIL_TO_BYTE(m);
   360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   363             result|=(BOCU1_START_NEG_3+diff)<<16;
   364         } else {
   365             /* four bytes */
   366             diff-=BOCU1_REACH_NEG_3;
   368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   369             result=BOCU1_TRAIL_TO_BYTE(m);
   371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   374             /*
   375              * We know that NEGDIVMOD would deliver
   376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
   377              * Avoid division and modulo for performance.
   378              */
   379             m=diff+BOCU1_TRAIL_COUNT;
   380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
   382             result|=BOCU1_MIN<<24;
   383         }
   384     }
   385     return result;
   386 }
   389 static void
   390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   391                              UErrorCode *pErrorCode) {
           /*
            * Convert UTF-16 code units from pArgs->source..pArgs->sourceLimit into
            * BOCU-1 bytes at pArgs->target, and write one source index to
            * pArgs->offsets for every byte written to the target.
            *
            * State carried between calls lives in the converter:
            * fromUChar32 holds a lead surrogate whose trail has not been seen yet,
            * fromUnicodeStatus holds "prev" (0 means BOCU1_ASCII_PREV).
            *
            * When the target fills up, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR;
            * bytes of a partially written sequence that did not fit are stored in
            * cnv->charErrorBuffer.
            *
            * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced
            * past what was consumed and produced.
            *
            * NOTE(review): every output byte stores through *offsets++ without a
            * NULL check, so this variant is presumably only selected when the caller
            * supplied an offsets array - confirm against the dispatching code.
            */
   392     UConverter *cnv;
   393     const UChar *source, *sourceLimit;
   394     uint8_t *target;
   395     int32_t targetCapacity;
   396     int32_t *offsets;
   398     int32_t prev, c, diff;
   400     int32_t sourceIndex, nextSourceIndex;
   402 U_ALIGN_CODE(16)
   404     /* set up the local pointers */
   405     cnv=pArgs->converter;
   406     source=pArgs->source;
   407     sourceLimit=pArgs->sourceLimit;
   408     target=(uint8_t *)pArgs->target;
   409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   410     offsets=pArgs->offsets;
   412     /* get the converter state from UConverter */
   413     c=cnv->fromUChar32;
   414     prev=(int32_t)cnv->fromUnicodeStatus;
   415     if(prev==0) {
   416         prev=BOCU1_ASCII_PREV;
   417     }
   419     /* sourceIndex=-1 if the current character began in the previous buffer */
   420     sourceIndex= c==0 ? 0 : -1;
   421     nextSourceIndex=0;
   423     /* conversion loop */
           /* a pending lead surrogate from the previous call: jump into the regular loop to look for its trail */
   424     if(c!=0 && targetCapacity>0) {
   425         goto getTrail;
   426     }
   428 fastSingle:
   429     /* fast loop for single-byte differences */
   430     /* use only one loop counter variable, targetCapacity, not also source */
           /*
            * diff is used as scratch here: targetCapacity becomes
            * min(remaining target bytes, remaining source units), which is enough
            * because each iteration consumes one unit and writes one byte.
            */
   431     diff=(int32_t)(sourceLimit-source);
   432     if(targetCapacity>diff) {
   433         targetCapacity=diff;
   434     }
   435     while(targetCapacity>0 && (c=*source)<0x3000) {
   436         if(c<=0x20) {
   437             if(c!=0x20) {
   438                 prev=BOCU1_ASCII_PREV;
   439             }
   440             *target++=(uint8_t)c;
   441             *offsets++=nextSourceIndex++;
   442             ++source;
   443             --targetCapacity;
   444         } else {
   445             diff=c-prev;
   446             if(DIFF_IS_SINGLE(diff)) {
   447                 prev=BOCU1_SIMPLE_PREV(c);
   448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   449                 *offsets++=nextSourceIndex++;
   450                 ++source;
   451                 --targetCapacity;
   452             } else {
                       /* needs more than one byte: leave the unit unconsumed for the regular loop */
   453                 break;
   454             }
   455         }
   456     }
   457     /* restore real values */
   458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
   459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   461     /* regular loop for all cases */
   462     while(source<sourceLimit) {
   463         if(targetCapacity>0) {
   464             c=*source++;
   465             ++nextSourceIndex;
   467             if(c<=0x20) {
   468                 /*
   469                  * ISO C0 control & space:
   470                  * Encode directly for MIME compatibility,
   471                  * and reset state except for space, to not disrupt compression.
   472                  */
   473                 if(c!=0x20) {
   474                     prev=BOCU1_ASCII_PREV;
   475                 }
   476                 *target++=(uint8_t)c;
   477                 *offsets++=sourceIndex;
   478                 --targetCapacity;
   480                 sourceIndex=nextSourceIndex;
   481                 continue;
   482             }
   484             if(U16_IS_LEAD(c)) {
   485 getTrail:
   486                 if(source<sourceLimit) {
   487                     /* test the following code unit */
   488                     UChar trail=*source;
   489                     if(U16_IS_TRAIL(trail)) {
   490                         ++source;
   491                         ++nextSourceIndex;
   492                         c=U16_GET_SUPPLEMENTARY(c, trail);
   493                     }
                           /* otherwise c stays the unpaired lead surrogate and is encoded as such below */
   494                 } else {
   495                     /* no more input */
   496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
   497                     break;
   498                 }
   499             }
   501             /*
   502              * all other Unicode code points c==U+0021..U+10ffff
   503              * are encoded with the difference c-prev
   504              *
   505              * a new prev is computed from c,
   506              * placed in the middle of a 0x80-block (for most small scripts) or
   507              * in the middle of the Unihan and Hangul blocks
   508              * to statistically minimize the following difference
   509              */
   510             diff=c-prev;
   511             prev=BOCU1_PREV(c);
   512             if(DIFF_IS_SINGLE(diff)) {
   513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   514                 *offsets++=sourceIndex;
   515                 --targetCapacity;
   516                 sourceIndex=nextSourceIndex;
   517                 if(c<0x3000) {
                           /* back in the range that the fast loop handles */
   518                     goto fastSingle;
   519                 }
   520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
   521                 /* optimize 2-byte case */
   522                 int32_t m;
   524                 if(diff>=0) {
   525                     diff-=BOCU1_REACH_POS_1+1;
   526                     m=diff%BOCU1_TRAIL_COUNT;
   527                     diff/=BOCU1_TRAIL_COUNT;
   528                     diff+=BOCU1_START_POS_2;
   529                 } else {
   530                     diff-=BOCU1_REACH_NEG_1;
   531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   532                     diff+=BOCU1_START_NEG_2;
   533                 }
                       /* diff now holds the lead byte, m the trail value */
   534                 *target++=(uint8_t)diff;
   535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
   536                 *offsets++=sourceIndex;
   537                 *offsets++=sourceIndex;
   538                 targetCapacity-=2;
   539                 sourceIndex=nextSourceIndex;
   540             } else {
   541                 int32_t length; /* will be 2..4 */
                       /* from here on diff holds the packed byte sequence, not the difference */
   543                 diff=packDiff(diff);
   544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
   546                 /* write the output character bytes from diff and length */
   547                 /* from the first if in the loop we know that targetCapacity>0 */
   548                 if(length<=targetCapacity) {
   549                     switch(length) {
   550                         /* each branch falls through to the next one */
   551                     case 4:
   552                         *target++=(uint8_t)(diff>>24);
   553                         *offsets++=sourceIndex;
   554                     case 3: /*fall through*/
   555                         *target++=(uint8_t)(diff>>16);
   556                         *offsets++=sourceIndex;
   557                     case 2: /*fall through*/
   558                         *target++=(uint8_t)(diff>>8);
   559                         *offsets++=sourceIndex;
   560                     /* case 1: handled above */
   561                         *target++=(uint8_t)diff;
   562                         *offsets++=sourceIndex;
   563                     default:
   564                         /* will never occur */
   565                         break;
   566                     }
   567                     targetCapacity-=length;
   568                     sourceIndex=nextSourceIndex;
   569                 } else {
   570                     uint8_t *charErrorBuffer;
   572                     /*
   573                      * We actually do this backwards here:
   574                      * In order to save an intermediate variable, we output
   575                      * first to the overflow buffer what does not fit into the
   576                      * regular target.
   577                      */
   578                     /* we know that 1<=targetCapacity<length<=4 */
                           /* length becomes the number of trailing bytes that overflow (1..3) */
   579                     length-=targetCapacity;
   580                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   581                     switch(length) {
   582                         /* each branch falls through to the next one */
   583                     case 3:
   584                         *charErrorBuffer++=(uint8_t)(diff>>16);
   585                     case 2: /*fall through*/
   586                         *charErrorBuffer++=(uint8_t)(diff>>8);
   587                     case 1: /*fall through*/
   588                         *charErrorBuffer=(uint8_t)diff;
   589                     default:
   590                         /* will never occur */
   591                         break;
   592                     }
   593                     cnv->charErrorBufferLength=(int8_t)length;
   595                     /* now output what fits into the regular target */
   596                     diff>>=8*length; /* length was reduced by targetCapacity */
   597                     switch(targetCapacity) {
   598                         /* each branch falls through to the next one */
   599                     case 3:
   600                         *target++=(uint8_t)(diff>>16);
   601                         *offsets++=sourceIndex;
   602                     case 2: /*fall through*/
   603                         *target++=(uint8_t)(diff>>8);
   604                         *offsets++=sourceIndex;
   605                     case 1: /*fall through*/
   606                         *target++=(uint8_t)diff;
   607                         *offsets++=sourceIndex;
   608                     default:
   609                         /* will never occur */
   610                         break;
   611                     }
   613                     /* target overflow */
   614                     targetCapacity=0;
   615                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   616                     break;
   617                 }
   618             }
   619         } else {
   620             /* target is full */
   621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   622             break;
   623         }
   624     }
   626     /* set the converter state back into UConverter */
           /* c<0 only after the "no more input" break above: store the lead surrogate for the next call */
   627     cnv->fromUChar32= c<0 ? -c : 0;
   628     cnv->fromUnicodeStatus=(uint32_t)prev;
   630     /* write back the updated pointers */
   631     pArgs->source=source;
   632     pArgs->target=(char *)target;
   633     pArgs->offsets=offsets;
   634 }
   636 /*
   637  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
   638  * If a change is made in the original function, then either
   639  * change this function the same way or
   640  * re-copy the original function and remove the variables
   641  * offsets, sourceIndex, and nextSourceIndex.
   642  */
   643 static void
   644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
   645                   UErrorCode *pErrorCode) {
   646     UConverter *cnv;
   647     const UChar *source, *sourceLimit;
   648     uint8_t *target;
   649     int32_t targetCapacity;
   651     int32_t prev, c, diff;
   653     /* set up the local pointers */
   654     cnv=pArgs->converter;
   655     source=pArgs->source;
   656     sourceLimit=pArgs->sourceLimit;
   657     target=(uint8_t *)pArgs->target;
   658     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   660     /* get the converter state from UConverter */
   661     c=cnv->fromUChar32;
   662     prev=(int32_t)cnv->fromUnicodeStatus;
   663     if(prev==0) {
   664         prev=BOCU1_ASCII_PREV;
   665     }
   667     /* conversion loop */
   668     if(c!=0 && targetCapacity>0) {
   669         goto getTrail;
   670     }
   672 fastSingle:
   673     /* fast loop for single-byte differences */
   674     /* use only one loop counter variable, targetCapacity, not also source */
   675     diff=(int32_t)(sourceLimit-source);
   676     if(targetCapacity>diff) {
   677         targetCapacity=diff;
   678     }
   679     while(targetCapacity>0 && (c=*source)<0x3000) {
   680         if(c<=0x20) {
   681             if(c!=0x20) {
   682                 prev=BOCU1_ASCII_PREV;
   683             }
   684             *target++=(uint8_t)c;
   685         } else {
   686             diff=c-prev;
   687             if(DIFF_IS_SINGLE(diff)) {
   688                 prev=BOCU1_SIMPLE_PREV(c);
   689                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   690             } else {
   691                 break;
   692             }
   693         }
   694         ++source;
   695         --targetCapacity;
   696     }
   697     /* restore real values */
   698     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
   700     /* regular loop for all cases */
   701     while(source<sourceLimit) {
   702         if(targetCapacity>0) {
   703             c=*source++;
   705             if(c<=0x20) {
   706                 /*
   707                  * ISO C0 control & space:
   708                  * Encode directly for MIME compatibility,
   709                  * and reset state except for space, to not disrupt compression.
   710                  */
   711                 if(c!=0x20) {
   712                     prev=BOCU1_ASCII_PREV;
   713                 }
   714                 *target++=(uint8_t)c;
   715                 --targetCapacity;
   716                 continue;
   717             }
   719             if(U16_IS_LEAD(c)) {
   720 getTrail:
   721                 if(source<sourceLimit) {
   722                     /* test the following code unit */
   723                     UChar trail=*source;
   724                     if(U16_IS_TRAIL(trail)) {
   725                         ++source;
   726                         c=U16_GET_SUPPLEMENTARY(c, trail);
   727                     }
   728                 } else {
   729                     /* no more input */
   730                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
   731                     break;
   732                 }
   733             }
   735             /*
   736              * all other Unicode code points c==U+0021..U+10ffff
   737              * are encoded with the difference c-prev
   738              *
   739              * a new prev is computed from c,
   740              * placed in the middle of a 0x80-block (for most small scripts) or
   741              * in the middle of the Unihan and Hangul blocks
   742              * to statistically minimize the following difference
   743              */
   744             diff=c-prev;
   745             prev=BOCU1_PREV(c);
   746             if(DIFF_IS_SINGLE(diff)) {
   747                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   748                 --targetCapacity;
   749                 if(c<0x3000) {
   750                     goto fastSingle;
   751                 }
   752             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
   753                 /* optimize 2-byte case */
   754                 int32_t m;
   756                 if(diff>=0) {
   757                     diff-=BOCU1_REACH_POS_1+1;
   758                     m=diff%BOCU1_TRAIL_COUNT;
   759                     diff/=BOCU1_TRAIL_COUNT;
   760                     diff+=BOCU1_START_POS_2;
   761                 } else {
   762                     diff-=BOCU1_REACH_NEG_1;
   763                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   764                     diff+=BOCU1_START_NEG_2;
   765                 }
   766                 *target++=(uint8_t)diff;
   767                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
   768                 targetCapacity-=2;
   769             } else {
   770                 int32_t length; /* will be 2..4 */
   772                 diff=packDiff(diff);
   773                 length=BOCU1_LENGTH_FROM_PACKED(diff);
   775                 /* write the output character bytes from diff and length */
   776                 /* from the first if in the loop we know that targetCapacity>0 */
   777                 if(length<=targetCapacity) {
   778                     switch(length) {
   779                         /* each branch falls through to the next one */
   780                     case 4:
   781                         *target++=(uint8_t)(diff>>24);
   782                     case 3: /*fall through*/
   783                         *target++=(uint8_t)(diff>>16);
   784                     /* case 2: handled above */
   785                         *target++=(uint8_t)(diff>>8);
   786                     /* case 1: handled above */
   787                         *target++=(uint8_t)diff;
   788                     default:
   789                         /* will never occur */
   790                         break;
   791                     }
   792                     targetCapacity-=length;
   793                 } else {
   794                     uint8_t *charErrorBuffer;
   796                     /*
   797                      * We actually do this backwards here:
   798                      * In order to save an intermediate variable, we output
   799                      * first to the overflow buffer what does not fit into the
   800                      * regular target.
   801                      */
   802                     /* we know that 1<=targetCapacity<length<=4 */
   803                     length-=targetCapacity;
   804                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   805                     switch(length) {
   806                         /* each branch falls through to the next one */
   807                     case 3:
   808                         *charErrorBuffer++=(uint8_t)(diff>>16);
   809                     case 2: /*fall through*/
   810                         *charErrorBuffer++=(uint8_t)(diff>>8);
   811                     case 1: /*fall through*/
   812                         *charErrorBuffer=(uint8_t)diff;
   813                     default:
   814                         /* will never occur */
   815                         break;
   816                     }
   817                     cnv->charErrorBufferLength=(int8_t)length;
   819                     /* now output what fits into the regular target */
   820                     diff>>=8*length; /* length was reduced by targetCapacity */
   821                     switch(targetCapacity) {
   822                         /* each branch falls through to the next one */
   823                     case 3:
   824                         *target++=(uint8_t)(diff>>16);
   825                     case 2: /*fall through*/
   826                         *target++=(uint8_t)(diff>>8);
   827                     case 1: /*fall through*/
   828                         *target++=(uint8_t)diff;
   829                     default:
   830                         /* will never occur */
   831                         break;
   832                     }
   834                     /* target overflow */
   835                     targetCapacity=0;
   836                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   837                     break;
   838                 }
   839             }
   840         } else {
   841             /* target is full */
   842             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   843             break;
   844         }
   845     }
   847     /* set the converter state back into UConverter */
   848     cnv->fromUChar32= c<0 ? -c : 0;
   849     cnv->fromUnicodeStatus=(uint32_t)prev;
   851     /* write back the updated pointers */
   852     pArgs->source=source;
   853     pArgs->target=(char *)target;
   854 }
   856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
   858 /**
   859  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
   860  *
   861  * @param b lead byte;
   862  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
   863  * @return (diff<<2)|count
   864  */
   865 static inline int32_t
   866 decodeBocu1LeadByte(int32_t b) {
   867     int32_t diff, count;
   869     if(b>=BOCU1_START_NEG_2) {
   870         /* positive difference */
   871         if(b<BOCU1_START_POS_3) {
   872             /* two bytes */
   873             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   874             count=1;
   875         } else if(b<BOCU1_START_POS_4) {
   876             /* three bytes */
   877             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
   878             count=2;
   879         } else {
   880             /* four bytes */
   881             diff=BOCU1_REACH_POS_3+1;
   882             count=3;
   883         }
   884     } else {
   885         /* negative difference */
   886         if(b>=BOCU1_START_NEG_3) {
   887             /* two bytes */
   888             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   889             count=1;
   890         } else if(b>BOCU1_MIN) {
   891             /* three bytes */
   892             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
   893             count=2;
   894         } else {
   895             /* four bytes */
   896             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
   897             count=3;
   898         }
   899     }
   901     /* return the state for decoding the trail byte(s) */
   902     return (diff<<2)|count;
   903 }
   905 /**
   906  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
   907  *
   908  * @param count number of remaining trail bytes including this one
   909  * @param b trail byte
   910  * @return new delta for diff including b - <0 indicates an error
   911  *
   912  * @see decodeBocu1
   913  */
   914 static inline int32_t
   915 decodeBocu1TrailByte(int32_t count, int32_t b) {
   916     if(b<=0x20) {
   917         /* skip some C0 controls and make the trail byte range contiguous */
   918         b=bocu1ByteToTrail[b];
   919         /* b<0 for an illegal trail byte value will result in return<0 below */
   920 #if BOCU1_MAX_TRAIL<0xff
   921     } else if(b>BOCU1_MAX_TRAIL) {
   922         return -99;
   923 #endif
   924     } else {
   925         b-=BOCU1_TRAIL_BYTE_OFFSET;
   926     }
   928     /* add trail byte into difference and decrement count */
   929     if(count==1) {
   930         return b;
   931     } else if(count==2) {
   932         return b*BOCU1_TRAIL_COUNT;
   933     } else /* count==3 */ {
   934         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
   935     }
   936 }
   938 static void
   939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   940                            UErrorCode *pErrorCode) {
   941     UConverter *cnv;
   942     const uint8_t *source, *sourceLimit;
   943     UChar *target;
   944     const UChar *targetLimit;
   945     int32_t *offsets;
   947     int32_t prev, count, diff, c;
   949     int8_t byteIndex;
   950     uint8_t *bytes;
   952     int32_t sourceIndex, nextSourceIndex;
   954     /* set up the local pointers */
   955     cnv=pArgs->converter;
   956     source=(const uint8_t *)pArgs->source;
   957     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   958     target=pArgs->target;
   959     targetLimit=pArgs->targetLimit;
   960     offsets=pArgs->offsets;
   962     /* get the converter state from UConverter */
   963     prev=(int32_t)cnv->toUnicodeStatus;
   964     if(prev==0) {
   965         prev=BOCU1_ASCII_PREV;
   966     }
   967     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   968     count=diff&3;
   969     diff>>=2;
   971     byteIndex=cnv->toULength;
   972     bytes=cnv->toUBytes;
   974     /* sourceIndex=-1 if the current character began in the previous buffer */
   975     sourceIndex=byteIndex==0 ? 0 : -1;
   976     nextSourceIndex=0;
   978     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
   979     if(count>0 && byteIndex>0 && target<targetLimit) {
   980         goto getTrail;
   981     }
   983 fastSingle:
   984     /* fast loop for single-byte differences */
   985     /* use count as the only loop counter variable */
   986     diff=(int32_t)(sourceLimit-source);
   987     count=(int32_t)(pArgs->targetLimit-target);
   988     if(count>diff) {
   989         count=diff;
   990     }
   991     while(count>0) {
   992         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   993             c=prev+(c-BOCU1_MIDDLE);
   994             if(c<0x3000) {
   995                 *target++=(UChar)c;
   996                 *offsets++=nextSourceIndex++;
   997                 prev=BOCU1_SIMPLE_PREV(c);
   998             } else {
   999                 break;
  1001         } else if(c<=0x20) {
  1002             if(c!=0x20) {
  1003                 prev=BOCU1_ASCII_PREV;
  1005             *target++=(UChar)c;
  1006             *offsets++=nextSourceIndex++;
  1007         } else {
  1008             break;
  1010         ++source;
  1011         --count;
  1013     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
  1015     /* decode a sequence of single and lead bytes */
  1016     while(source<sourceLimit) {
  1017         if(target>=targetLimit) {
  1018             /* target is full */
  1019             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1020             break;
  1023         ++nextSourceIndex;
  1024         c=*source++;
  1025         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1026             /* Write a code point directly from a single-byte difference. */
  1027             c=prev+(c-BOCU1_MIDDLE);
  1028             if(c<0x3000) {
  1029                 *target++=(UChar)c;
  1030                 *offsets++=sourceIndex;
  1031                 prev=BOCU1_SIMPLE_PREV(c);
  1032                 sourceIndex=nextSourceIndex;
  1033                 goto fastSingle;
  1035         } else if(c<=0x20) {
  1036             /*
  1037              * Direct-encoded C0 control code or space.
  1038              * Reset prev for C0 control codes but not for space.
  1039              */
  1040             if(c!=0x20) {
  1041                 prev=BOCU1_ASCII_PREV;
  1043             *target++=(UChar)c;
  1044             *offsets++=sourceIndex;
  1045             sourceIndex=nextSourceIndex;
  1046             continue;
  1047         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1048             /* Optimize two-byte case. */
  1049             if(c>=BOCU1_MIDDLE) {
  1050                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1051             } else {
  1052                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1055             /* trail byte */
  1056             ++nextSourceIndex;
  1057             c=decodeBocu1TrailByte(1, *source++);
  1058             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1059                 bytes[0]=source[-2];
  1060                 bytes[1]=source[-1];
  1061                 byteIndex=2;
  1062                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1063                 break;
  1065         } else if(c==BOCU1_RESET) {
  1066             /* only reset the state, no code point */
  1067             prev=BOCU1_ASCII_PREV;
  1068             sourceIndex=nextSourceIndex;
  1069             continue;
  1070         } else {
  1071             /*
  1072              * For multi-byte difference lead bytes, set the decoder state
  1073              * with the partial difference value from the lead byte and
  1074              * with the number of trail bytes.
  1075              */
  1076             bytes[0]=(uint8_t)c;
  1077             byteIndex=1;
  1079             diff=decodeBocu1LeadByte(c);
  1080             count=diff&3;
  1081             diff>>=2;
  1082 getTrail:
  1083             for(;;) {
  1084                 if(source>=sourceLimit) {
  1085                     goto endloop;
  1087                 ++nextSourceIndex;
  1088                 c=bytes[byteIndex++]=*source++;
  1090                 /* trail byte in any position */
  1091                 c=decodeBocu1TrailByte(count, c);
  1092                 if(c<0) {
  1093                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1094                     goto endloop;
  1097                 diff+=c;
  1098                 if(--count==0) {
  1099                     /* final trail byte, deliver a code point */
  1100                     byteIndex=0;
  1101                     c=prev+diff;
  1102                     if((uint32_t)c>0x10ffff) {
  1103                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1104                         goto endloop;
  1106                     break;
  1111         /* calculate the next prev and output c */
  1112         prev=BOCU1_PREV(c);
  1113         if(c<=0xffff) {
  1114             *target++=(UChar)c;
  1115             *offsets++=sourceIndex;
  1116         } else {
  1117             /* output surrogate pair */
  1118             *target++=U16_LEAD(c);
  1119             if(target<targetLimit) {
  1120                 *target++=U16_TRAIL(c);
  1121                 *offsets++=sourceIndex;
  1122                 *offsets++=sourceIndex;
  1123             } else {
  1124                 /* target overflow */
  1125                 *offsets++=sourceIndex;
  1126                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1127                 cnv->UCharErrorBufferLength=1;
  1128                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1129                 break;
  1132         sourceIndex=nextSourceIndex;
  1134 endloop:
  1136     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1137         /* set the converter state in UConverter to deal with the next character */
  1138         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1139         cnv->mode=0;
  1140     } else {
  1141         /* set the converter state back into UConverter */
  1142         cnv->toUnicodeStatus=(uint32_t)prev;
  1143         cnv->mode=(diff<<2)|count;
  1145     cnv->toULength=byteIndex;
  1147     /* write back the updated pointers */
  1148     pArgs->source=(const char *)source;
  1149     pArgs->target=target;
  1150     pArgs->offsets=offsets;
  1151     return;
  1154 /*
  1155  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
  1156  * If a change is made in the original function, then either
  1157  * change this function the same way or
  1158  * re-copy the original function and remove the variables
  1159  * offsets, sourceIndex, and nextSourceIndex.
  1160  */
  1161 static void
  1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
  1163                 UErrorCode *pErrorCode) {
  1164     UConverter *cnv;
  1165     const uint8_t *source, *sourceLimit;
  1166     UChar *target;
  1167     const UChar *targetLimit;
  1169     int32_t prev, count, diff, c;
  1171     int8_t byteIndex;
  1172     uint8_t *bytes;
  1174 U_ALIGN_CODE(16)
  1176     /* set up the local pointers */
  1177     cnv=pArgs->converter;
  1178     source=(const uint8_t *)pArgs->source;
  1179     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1180     target=pArgs->target;
  1181     targetLimit=pArgs->targetLimit;
  1183     /* get the converter state from UConverter */
  1184     prev=(int32_t)cnv->toUnicodeStatus;
  1185     if(prev==0) {
  1186         prev=BOCU1_ASCII_PREV;
  1188     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  1189     count=diff&3;
  1190     diff>>=2;
  1192     byteIndex=cnv->toULength;
  1193     bytes=cnv->toUBytes;
  1195     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  1196     if(count>0 && byteIndex>0 && target<targetLimit) {
  1197         goto getTrail;
  1200 fastSingle:
  1201     /* fast loop for single-byte differences */
  1202     /* use count as the only loop counter variable */
  1203     diff=(int32_t)(sourceLimit-source);
  1204     count=(int32_t)(pArgs->targetLimit-target);
  1205     if(count>diff) {
  1206         count=diff;
  1208     while(count>0) {
  1209         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  1210             c=prev+(c-BOCU1_MIDDLE);
  1211             if(c<0x3000) {
  1212                 *target++=(UChar)c;
  1213                 prev=BOCU1_SIMPLE_PREV(c);
  1214             } else {
  1215                 break;
  1217         } else if(c<=0x20) {
  1218             if(c!=0x20) {
  1219                 prev=BOCU1_ASCII_PREV;
  1221             *target++=(UChar)c;
  1222         } else {
  1223             break;
  1225         ++source;
  1226         --count;
  1229     /* decode a sequence of single and lead bytes */
  1230     while(source<sourceLimit) {
  1231         if(target>=targetLimit) {
  1232             /* target is full */
  1233             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1234             break;
  1237         c=*source++;
  1238         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1239             /* Write a code point directly from a single-byte difference. */
  1240             c=prev+(c-BOCU1_MIDDLE);
  1241             if(c<0x3000) {
  1242                 *target++=(UChar)c;
  1243                 prev=BOCU1_SIMPLE_PREV(c);
  1244                 goto fastSingle;
  1246         } else if(c<=0x20) {
  1247             /*
  1248              * Direct-encoded C0 control code or space.
  1249              * Reset prev for C0 control codes but not for space.
  1250              */
  1251             if(c!=0x20) {
  1252                 prev=BOCU1_ASCII_PREV;
  1254             *target++=(UChar)c;
  1255             continue;
  1256         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1257             /* Optimize two-byte case. */
  1258             if(c>=BOCU1_MIDDLE) {
  1259                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1260             } else {
  1261                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1264             /* trail byte */
  1265             c=decodeBocu1TrailByte(1, *source++);
  1266             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1267                 bytes[0]=source[-2];
  1268                 bytes[1]=source[-1];
  1269                 byteIndex=2;
  1270                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1271                 break;
  1273         } else if(c==BOCU1_RESET) {
  1274             /* only reset the state, no code point */
  1275             prev=BOCU1_ASCII_PREV;
  1276             continue;
  1277         } else {
  1278             /*
  1279              * For multi-byte difference lead bytes, set the decoder state
  1280              * with the partial difference value from the lead byte and
  1281              * with the number of trail bytes.
  1282              */
  1283             bytes[0]=(uint8_t)c;
  1284             byteIndex=1;
  1286             diff=decodeBocu1LeadByte(c);
  1287             count=diff&3;
  1288             diff>>=2;
  1289 getTrail:
  1290             for(;;) {
  1291                 if(source>=sourceLimit) {
  1292                     goto endloop;
  1294                 c=bytes[byteIndex++]=*source++;
  1296                 /* trail byte in any position */
  1297                 c=decodeBocu1TrailByte(count, c);
  1298                 if(c<0) {
  1299                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1300                     goto endloop;
  1303                 diff+=c;
  1304                 if(--count==0) {
  1305                     /* final trail byte, deliver a code point */
  1306                     byteIndex=0;
  1307                     c=prev+diff;
  1308                     if((uint32_t)c>0x10ffff) {
  1309                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1310                         goto endloop;
  1312                     break;
  1317         /* calculate the next prev and output c */
  1318         prev=BOCU1_PREV(c);
  1319         if(c<=0xffff) {
  1320             *target++=(UChar)c;
  1321         } else {
  1322             /* output surrogate pair */
  1323             *target++=U16_LEAD(c);
  1324             if(target<targetLimit) {
  1325                 *target++=U16_TRAIL(c);
  1326             } else {
  1327                 /* target overflow */
  1328                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1329                 cnv->UCharErrorBufferLength=1;
  1330                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1331                 break;
  1335 endloop:
  1337     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1338         /* set the converter state in UConverter to deal with the next character */
  1339         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1340         cnv->mode=0;
  1341     } else {
  1342         /* set the converter state back into UConverter */
  1343         cnv->toUnicodeStatus=(uint32_t)prev;
  1344         cnv->mode=(diff<<2)|count;
  1346     cnv->toULength=byteIndex;
  1348     /* write back the updated pointers */
  1349     pArgs->source=(const char *)source;
  1350     pArgs->target=target;
  1351     return;
  1354 /* miscellaneous ------------------------------------------------------------ */
  /*
   * Implementation table for the BOCU-1 converter.
   * The initializer is positional: it registers the converter type, the four
   * BOCU-1 conversion functions and ucnv_getCompleteUnicodeSet, and leaves
   * every other slot NULL.
   * NOTE(review): the meaning of each NULL slot depends on the field order of
   * UConverterImpl, which is declared outside this file - confirm there before
   * adding or reordering entries.
   */
  1356 static const UConverterImpl _Bocu1Impl={
  1357     UCNV_BOCU1, /* converter type */
  1359     NULL,
  1360     NULL,
  1362     NULL,
  1363     NULL,
  1364     NULL,
  1366     _Bocu1ToUnicode, /* bytes to Unicode, no offsets */
  1367     _Bocu1ToUnicodeWithOffsets, /* bytes to Unicode, also writes source offsets */
  1368     _Bocu1FromUnicode, /* presumably Unicode to bytes, no offsets - defined earlier in this file */
  1369     _Bocu1FromUnicodeWithOffsets, /* presumably Unicode to bytes with offsets - defined earlier in this file */
  1370     NULL,
  1372     NULL,
  1373     NULL,
  1374     NULL,
  1375     NULL,
  1376     ucnv_getCompleteUnicodeSet, /* presumably reports the set of convertible code points - TODO confirm */
  1378     NULL,
  1379     NULL
  1380 };
  /*
   * Static, read-only description of the BOCU-1 converter.
   * The initializer is positional.
   * NOTE(review): fields without a comment below follow the field order of
   * UConverterStaticData, which is declared outside this file - confirm there.
   */
  1382 static const UConverterStaticData _Bocu1StaticData={
  1383     sizeof(UConverterStaticData), /* size of this structure */
  1384     "BOCU-1", /* converter name */
  1385     1214, /* CCSID for BOCU-1 */
  1386     UCNV_IBM, UCNV_BOCU1,
  1387     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
  1388     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
  1389     FALSE, FALSE,
  1390     0,
  1391     0,
  1392     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1393 };
  /*
   * Shared data record for the BOCU-1 converter; unlike the two tables it
   * points to, this object is not static and so is visible outside this file.
   * It links the static description (_Bocu1StaticData) with the
   * implementation table (_Bocu1Impl).
   * NOTE(review): the remaining positional values follow the field order of
   * UConverterSharedData, which is declared outside this file - confirm there.
   */
  1395 const UConverterSharedData _Bocu1Data={
  1396     sizeof(UConverterSharedData), ~((uint32_t)0),
  1397     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
  1398     0,
  1399     UCNV_MBCS_TABLE_INITIALIZER
  1400 };
  1402 #endif

mercurial