michael@0: /*
michael@0: ******************************************************************************
michael@0: *
michael@0: *   Copyright (C) 2002-2011, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: *
michael@0: ******************************************************************************
michael@0: *   file name:  ucnvbocu.cpp
michael@0: *   encoding:   US-ASCII
michael@0: *   tab size:   8 (not used)
michael@0: *   indentation:4
michael@0: *
michael@0: *   created on: 2002mar27
michael@0: *   created by: Markus W. Scherer
michael@0: *
michael@0: *   This is an implementation of the Binary Ordered Compression for Unicode,
michael@0: *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_CONVERSION
michael@0: 
michael@0: #include "unicode/ucnv.h"
michael@0: #include "unicode/ucnv_cb.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "putilimp.h"
michael@0: #include "ucnv_bld.h"
michael@0: #include "ucnv_cnv.h"
michael@0: #include "uassert.h"
michael@0: 
michael@0: /* BOCU-1 constants and macros ---------------------------------------------- */
michael@0: 
michael@0: /*
michael@0:  * BOCU-1 encodes the code points of a Unicode string as
michael@0:  * a sequence of byte-encoded differences (slope detection),
michael@0:  * preserving lexical order.
michael@0:  *
michael@0:  * Optimize the difference-taking for runs of Unicode text within
michael@0:  * small scripts:
michael@0:  *
michael@0:  * Most small scripts are allocated within aligned 128-blocks of Unicode
michael@0:  * code points. Lexical order is preserved if the "previous code point" state
michael@0:  * is always moved into the middle of such a block.
michael@0:  *
michael@0:  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
michael@0:  * areas into the middle of those areas.
michael@0:  *
michael@0:  * C0 control codes and space are encoded with their US-ASCII bytes.
michael@0:  * "prev" is reset for C0 controls but not for space.
michael@0:  */
michael@0: 
michael@0: /* initial value for "prev": middle of the ASCII range */
michael@0: #define BOCU1_ASCII_PREV        0x40
michael@0: 
michael@0: /* bounding byte values for differences (byte values below BOCU1_MIN are 0x00..0x20) */
michael@0: #define BOCU1_MIN               0x21
michael@0: #define BOCU1_MIDDLE            0x90 /* single byte for diff==0, see PACK_SINGLE_DIFF() */
michael@0: #define BOCU1_MAX_LEAD          0xfe
michael@0: #define BOCU1_MAX_TRAIL         0xff
michael@0: #define BOCU1_RESET             0xff
michael@0: 
michael@0: /* number of lead bytes */
michael@0: #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1) /* ==222 */
michael@0: 
michael@0: /* adjust trail byte counts for the use of some C0 control byte values */
michael@0: #define BOCU1_TRAIL_CONTROLS_COUNT  20
michael@0: #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) /* ==13 */
michael@0: 
michael@0: /* number of trail bytes */
michael@0: #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) /* ==243 */
michael@0: 
michael@0: /*
michael@0:  * number of positive and negative single-byte codes
michael@0:  * (counting 0==BOCU1_MIDDLE among the positive ones)
michael@0:  */
michael@0: #define BOCU1_SINGLE            64
michael@0: 
michael@0: /* number of lead bytes for positive and negative 2/3/4-byte sequences */
michael@0: #define BOCU1_LEAD_2            43
michael@0: #define BOCU1_LEAD_3            3
michael@0: #define BOCU1_LEAD_4            1
michael@0: 
michael@0: /* The difference value range for single-byters. */
michael@0: #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1) /* ==63 */
michael@0: #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE) /* ==-64 */
michael@0: 
michael@0: /* The difference value range for double-byters. */
michael@0: #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) /* ==10512 */
michael@0: #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) /* ==-10513 */
michael@0: 
michael@0: /* The difference value range for 3-byters. */
michael@0: #define BOCU1_REACH_POS_3   \
michael@0:     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) /* ==187659 */
michael@0: 
michael@0: #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) /* ==-187660 */
michael@0: 
michael@0: /* The lead byte start values. */
michael@0: #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) /* ==0xd0 */
michael@0: #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2) /* ==0xfb */
michael@0: #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
michael@0:      /* ==BOCU1_MAX_LEAD */
michael@0: 
michael@0: #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) /* ==0x50 */
michael@0: #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2) /* ==0x25 */
michael@0: #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
michael@0:      /* ==BOCU1_MIN+1 */
michael@0: 
michael@0: /*
michael@0:  * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET):
michael@0:  * 1 for 0x50..0xcf, 2 for 0x25..0x4f and 0xd0..0xfa,
michael@0:  * 3 for 0x22..0x24 and 0xfb..0xfd, 4 for every other value.
michael@0:  */
michael@0: #define BOCU1_LENGTH_FROM_LEAD(lead) \
michael@0:     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
michael@0:      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
michael@0:      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
michael@0: 
michael@0: /*
michael@0:  * The length of a byte sequence, according to its packed form:
michael@0:  * the top byte of a packed value is the length itself when it is below 4;
michael@0:  * any larger top byte is the lead byte of a 4-byte sequence (see packDiff()).
michael@0:  */
michael@0: #define BOCU1_LENGTH_FROM_PACKED(packed) \
michael@0:     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
michael@0: /*
michael@0:  * 12 commonly used C0 control codes (and space) are only used to encode
michael@0:  * themselves directly,
michael@0:  * which makes BOCU-1 MIME-usable and reasonably safe for
michael@0:  * ASCII-oriented software.
michael@0:  *
michael@0:  * These controls are
michael@0:  *  0   NUL
michael@0:  *
michael@0:  *  7   BEL
michael@0:  *  8   BS
michael@0:  *
michael@0:  *  9   TAB
michael@0:  *  a   LF
michael@0:  *  b   VT
michael@0:  *  c   FF
michael@0:  *  d   CR
michael@0:  *
michael@0:  *  e   SO
michael@0:  *  f   SI
michael@0:  *
michael@0:  * 1a   SUB
michael@0:  * 1b   ESC
michael@0:  *
michael@0:  * The other 20 C0 controls are also encoded directly (to preserve order)
michael@0:  * but are also used as trail bytes in difference encoding
michael@0:  * (for better compression).
michael@0:  */
michael@0: /* Map a trail value 0..BOCU1_TRAIL_COUNT-1 to its byte: the first 20 via the control-code table, the rest by offset. */
michael@0: #define BOCU1_TRAIL_TO_BYTE(t) \
michael@0:     ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
michael@0: 
michael@0: /*
michael@0:  * Byte value map for control codes,
michael@0:  * from external byte values 0x00..0x20
michael@0:  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
michael@0:  * External byte values that are illegal as trail bytes are mapped to -1:
michael@0:  * these are NUL, BEL, BS, TAB, LF, VT, FF, CR, SO, SI, SUB, ESC and space.
michael@0:  * The table has BOCU1_MIN (0x21) entries, one per byte value below BOCU1_MIN.
michael@0:  */
michael@0: static const int8_t
michael@0: bocu1ByteToTrail[BOCU1_MIN]={
michael@0: /*  0     1     2     3     4     5     6     7    */
michael@0:     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,   /* 0=NUL and 7=BEL are illegal */
michael@0: 
michael@0: /*  8     9     a     b     c     d     e     f    */
michael@0:     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   /* BS TAB LF VT FF CR SO SI: all illegal */
michael@0: 
michael@0: /*  10    11    12    13    14    15    16    17   */
michael@0:     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
michael@0: 
michael@0: /*  18    19    1a    1b    1c    1d    1e    1f   */
michael@0:     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13, /* 1a=SUB and 1b=ESC are illegal */
michael@0: 
michael@0: /*  20 (space is illegal as a trail byte) */
michael@0:     -1
michael@0: };
michael@0: 
michael@0: /*
michael@0:  * Byte value map for control codes,
michael@0:  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
michael@0:  * to external byte values 0x00..0x20.
michael@0:  * This is the inverse of the non-negative entries of bocu1ByteToTrail[].
michael@0:  * Trail values of BOCU1_TRAIL_CONTROLS_COUNT and above are not in this table;
michael@0:  * BOCU1_TRAIL_TO_BYTE() maps them by adding BOCU1_TRAIL_BYTE_OFFSET.
michael@0:  */
michael@0: static const int8_t
michael@0: bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
michael@0: /*  0     1     2     3     4     5     6     7    */
michael@0:     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, /* skips bytes 0x07..0x0f */
michael@0: 
michael@0: /*  8     9     a     b     c     d     e     f    */
michael@0:     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
michael@0: 
michael@0: /*  10    11    12    13  (skips bytes 0x1a and 0x1b) */
michael@0:     0x1c, 0x1d, 0x1e, 0x1f
michael@0: };
michael@0: 
michael@0: /**
michael@0:  * Integer division and modulo with negative numerators
michael@0:  * yields negative modulo results and quotients that are one more than
michael@0:  * what we need here.
michael@0:  * This macro adjust the results so that the modulo-value m is always >=0.
michael@0:  *
michael@0:  * For positive n, the if() condition is always FALSE.
michael@0:  *
michael@0:  * @param n Number to be split into quotient and rest.
michael@0:  *          Will be modified to contain the quotient.
michael@0:  * @param d Divisor.
michael@0:  * @param m Output variable for the rest (modulo result).
michael@0:  */
michael@0: #define NEGDIVMOD(n, d, m) { \
michael@0:     (m)=(n)%(d); \
michael@0:     (n)/=(d); \
michael@0:     if((m)<0) { \
michael@0:         --(n); \
michael@0:         (m)+=(d); \
michael@0:     } \
michael@0: }
michael@0: 
michael@0: /* Faster versions of packDiff() for single-byte-encoded diff values; packDiff() itself does not handle them. */
michael@0: 
michael@0: /** Is a diff value encodable in a single byte, that is, within -64..63? */
michael@0: #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
michael@0: 
michael@0: /** Encode a diff value in a single byte; only valid when DIFF_IS_SINGLE(diff), giving 0x50..0xcf. */
michael@0: #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
michael@0: 
michael@0: /** Is a diff value encodable in two bytes, that is, within -10513..10512? Also true for single-byte diffs. */
michael@0: #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
michael@0: 
michael@0: /* BOCU-1 implementation functions ------------------------------------------ */
michael@0: 
michael@0: #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) /* offset 0x40 (the middle) of c's aligned 128-block */
michael@0: 
michael@0: /**
michael@0:  * Slow path of BOCU1_PREV(): choose the next "previous" value for a code
michael@0:  * point in the range where the simple 128-block rule is not always applied.
michael@0:  *
michael@0:  * @param c current code point; BOCU1_PREV() only passes 0x3040..0xd7a3,
michael@0:  *          which is why the outer bounds of that range are not re-tested here
michael@0:  * @return "previous code point" state value
michael@0:  */
michael@0: static inline int32_t
michael@0: bocu1Prev(int32_t c) {
michael@0:     if(c<=0x309f) {
michael@0:         /* Hiragana (lower bound 0x3040 is implied): the block is not 128-aligned */
michael@0:         return 0x3070;
michael@0:     }
michael@0:     if(c>=0x4e00 && c<=0x9fa5) {
michael@0:         /*
michael@0:          * CJK Unihan: place prev so that U+4E00 is exactly BOCU1_REACH_NEG_2
michael@0:          * below it, the farthest negative difference that still takes two bytes.
michael@0:          */
michael@0:         return 0x4e00-BOCU1_REACH_NEG_2;
michael@0:     }
michael@0:     if(c>=0xac00) {
michael@0:         /* Korean Hangul (upper bound 0xd7a3 is implied): midpoint of the area */
michael@0:         return (0xd7a3+0xac00)/2;
michael@0:     }
michael@0:     /* everything else: mostly small scripts in aligned 128-blocks */
michael@0:     return BOCU1_SIMPLE_PREV(c);
michael@0: }
michael@0: 
michael@0: /** Fast version of bocu1Prev() for most scripts: only code points 0x3040..0xd7a3 take the function call. */
michael@0: #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
michael@0: 
michael@0: /*
michael@0:  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
michael@0:  * The UConverter fields are used as follows:
michael@0:  *
michael@0:  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
michael@0:  *
michael@0:  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
michael@0:  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
michael@0:  */
michael@0: 
michael@0: /* BOCU-1-from-Unicode conversion functions --------------------------------- */
michael@0: 
michael@0: /**
michael@0:  * Pack a multi-byte BOCU-1 difference into a single integer.
michael@0:  *
michael@0:  * Differences close to zero get the shortest encodings so that runs of
michael@0:  * same-script characters compress well. Single-byte differences are not
michael@0:  * produced here: callers test DIFF_IS_SINGLE() and use PACK_SINGLE_DIFF().
michael@0:  *
michael@0:  * Every length is handled by its own unrolled branch, and the last
michael@0:  * division/modulo step of the four-byte forms is replaced by plain
michael@0:  * arithmetic because its quotient is known in advance.
michael@0:  *
michael@0:  * @param diff difference value -0x10ffff..0x10ffff that is outside of
michael@0:  *             BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1
michael@0:  * @return
michael@0:  *      0x0200yyzz for 2-byte sequence yy zz
michael@0:  *      0x03xxyyzz for 3-byte sequence xx yy zz
michael@0:  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
michael@0:  */
michael@0: static int32_t
michael@0: packDiff(int32_t diff) {
michael@0:     int32_t packed, lead, trail;
michael@0: 
michael@0:     /* the single-byte range, including diff==BOCU1_REACH_NEG_1, belongs to the caller */
michael@0:     U_ASSERT(!DIFF_IS_SINGLE(diff));
michael@0: 
michael@0:     if(diff<BOCU1_REACH_NEG_1) {
michael@0:         /* negative differences, two to four bytes */
michael@0:         if(diff<BOCU1_REACH_NEG_3) {
michael@0:             /* four bytes */
michael@0:             diff-=BOCU1_REACH_NEG_3;
michael@0: 
michael@0:             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
michael@0:             packed=BOCU1_TRAIL_TO_BYTE(trail);
michael@0: 
michael@0:             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;
michael@0: 
michael@0:             /*
michael@0:              * A third NEGDIVMOD would yield quotient -1 and
michael@0:              * rest diff+BOCU1_TRAIL_COUNT, so compute the rest directly.
michael@0:              */
michael@0:             trail=diff+BOCU1_TRAIL_COUNT;
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(trail)<<16;
michael@0: 
michael@0:             /* the only negative four-byte lead byte */
michael@0:             packed|=BOCU1_MIN<<24;
michael@0:         } else if(diff<BOCU1_REACH_NEG_2) {
michael@0:             /* three bytes */
michael@0:             diff-=BOCU1_REACH_NEG_2;
michael@0: 
michael@0:             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
michael@0:             packed=0x03000000|BOCU1_TRAIL_TO_BYTE(trail);
michael@0: 
michael@0:             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;
michael@0: 
michael@0:             lead=BOCU1_START_NEG_3+diff;
michael@0:             packed|=lead<<16;
michael@0:         } else {
michael@0:             /* two bytes */
michael@0:             diff-=BOCU1_REACH_NEG_1;
michael@0: 
michael@0:             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, trail);
michael@0:             lead=BOCU1_START_NEG_2+diff;
michael@0:             packed=0x02000000|(lead<<8)|BOCU1_TRAIL_TO_BYTE(trail);
michael@0:         }
michael@0:     } else {
michael@0:         /* positive differences, two to four bytes */
michael@0:         if(diff>BOCU1_REACH_POS_3) {
michael@0:             /* four bytes */
michael@0:             diff-=BOCU1_REACH_POS_3+1;
michael@0: 
michael@0:             trail=diff%BOCU1_TRAIL_COUNT;
michael@0:             diff/=BOCU1_TRAIL_COUNT;
michael@0:             packed=BOCU1_TRAIL_TO_BYTE(trail);
michael@0: 
michael@0:             trail=diff%BOCU1_TRAIL_COUNT;
michael@0:             diff/=BOCU1_TRAIL_COUNT;
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;
michael@0: 
michael@0:             /*
michael@0:              * One more / and % would yield quotient 0 and rest diff,
michael@0:              * so use diff itself as the last trail value.
michael@0:              */
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
michael@0: 
michael@0:             /* the only positive four-byte lead byte; shifted as unsigned */
michael@0:             packed|=((uint32_t)BOCU1_START_POS_4)<<24;
michael@0:         } else if(diff>BOCU1_REACH_POS_2) {
michael@0:             /* three bytes */
michael@0:             diff-=BOCU1_REACH_POS_2+1;
michael@0: 
michael@0:             trail=diff%BOCU1_TRAIL_COUNT;
michael@0:             diff/=BOCU1_TRAIL_COUNT;
michael@0:             packed=0x03000000|BOCU1_TRAIL_TO_BYTE(trail);
michael@0: 
michael@0:             trail=diff%BOCU1_TRAIL_COUNT;
michael@0:             diff/=BOCU1_TRAIL_COUNT;
michael@0:             packed|=BOCU1_TRAIL_TO_BYTE(trail)<<8;
michael@0: 
michael@0:             lead=BOCU1_START_POS_3+diff;
michael@0:             packed|=lead<<16;
michael@0:         } else {
michael@0:             /* two bytes */
michael@0:             diff-=BOCU1_REACH_POS_1+1;
michael@0: 
michael@0:             trail=diff%BOCU1_TRAIL_COUNT;
michael@0:             diff/=BOCU1_TRAIL_COUNT;
michael@0:             lead=BOCU1_START_POS_2+diff;
michael@0:             packed=0x02000000|(lead<<8)|BOCU1_TRAIL_TO_BYTE(trail);
michael@0:         }
michael@0:     }
michael@0:     return packed;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Convert UTF-16 text to BOCU-1 bytes and record, for every output byte,
michael@0:  * the source index of the code point that produced it.
michael@0:  *
michael@0:  * State kept in the UConverter between calls:
michael@0:  *   fromUChar32        lead surrogate left over from the end of the previous
michael@0:  *                      source buffer, or 0
michael@0:  *   fromUnicodeStatus  encoder's "prev" (0 is read as BOCU1_ASCII_PREV)
michael@0:  *   charErrorBuffer    trailing bytes of a sequence that did not fit into
michael@0:  *                      the target
michael@0:  *
michael@0:  * @param pArgs conversion arguments; source, target and offsets are advanced
michael@0:  *              past what was consumed/written. offsets is written to
michael@0:  *              unconditionally, so it must not be NULL.
michael@0:  * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
michael@0:  *              while input remains or while output bytes had to be diverted
michael@0:  *              to the charErrorBuffer
michael@0:  */
michael@0: static void
michael@0: _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0:                              UErrorCode *pErrorCode) {
michael@0:     UConverter *cnv;
michael@0:     const UChar *source, *sourceLimit;
michael@0:     uint8_t *target;
michael@0:     int32_t targetCapacity;
michael@0:     int32_t *offsets;
michael@0: 
michael@0:     int32_t prev, c, diff;
michael@0: 
michael@0:     int32_t sourceIndex, nextSourceIndex;
michael@0: 
michael@0: U_ALIGN_CODE(16)
michael@0: 
michael@0:     /* set up the local pointers */
michael@0:     cnv=pArgs->converter;
michael@0:     source=pArgs->source;
michael@0:     sourceLimit=pArgs->sourceLimit;
michael@0:     target=(uint8_t *)pArgs->target;
michael@0:     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0:     offsets=pArgs->offsets;
michael@0: 
michael@0:     /* get the converter state from UConverter */
michael@0:     c=cnv->fromUChar32;
michael@0:     prev=(int32_t)cnv->fromUnicodeStatus;
michael@0:     if(prev==0) {
michael@0:         prev=BOCU1_ASCII_PREV;
michael@0:     }
michael@0: 
michael@0:     /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0:     sourceIndex= c==0 ? 0 : -1;
michael@0:     nextSourceIndex=0;
michael@0: 
michael@0:     /* conversion loop */
michael@0:     if(c!=0) {
michael@0:         if(targetCapacity>0) {
michael@0:             /* continue the surrogate pair that was started in the previous buffer */
michael@0:             goto getTrail;
michael@0:         }
michael@0:         /*
michael@0:          * A lead surrogate is pending but nothing can be written.
michael@0:          * Mark it as "incomplete" (negative) so that the write-back at the
michael@0:          * end of this function stores it in fromUChar32 again;
michael@0:          * a positive c would be written back as 0 and the surrogate lost.
michael@0:          * With targetCapacity==0 neither loop below reads or modifies c.
michael@0:          */
michael@0:         c=-c;
michael@0:     }
michael@0: 
michael@0: fastSingle:
michael@0:     /* fast loop for single-byte differences */
michael@0:     /* use only one loop counter variable, targetCapacity, not also source */
michael@0:     /* (targetCapacity is temporarily limited to the number of remaining source units) */
michael@0:     diff=(int32_t)(sourceLimit-source);
michael@0:     if(targetCapacity>diff) {
michael@0:         targetCapacity=diff;
michael@0:     }
michael@0:     /* only code units below U+3000: no surrogates, and BOCU1_SIMPLE_PREV() is sufficient */
michael@0:     while(targetCapacity>0 && (c=*source)<0x3000) {
michael@0:         if(c<=0x20) {
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(uint8_t)c;
michael@0:             *offsets++=nextSourceIndex++;
michael@0:             ++source;
michael@0:             --targetCapacity;
michael@0:         } else {
michael@0:             diff=c-prev;
michael@0:             if(DIFF_IS_SINGLE(diff)) {
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0:                 *offsets++=nextSourceIndex++;
michael@0:                 ++source;
michael@0:                 --targetCapacity;
michael@0:             } else {
michael@0:                 /* needs more than one byte: leave it to the regular loop */
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     /* restore real values */
michael@0:     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
michael@0:     sourceIndex=nextSourceIndex; /* the fast loop advanced nextSourceIndex in step with source */
michael@0: 
michael@0:     /* regular loop for all cases */
michael@0:     while(source<sourceLimit) {
michael@0:         if(targetCapacity>0) {
michael@0:             c=*source++;
michael@0:             ++nextSourceIndex;
michael@0: 
michael@0:             if(c<=0x20) {
michael@0:                 /*
michael@0:                  * ISO C0 control & space:
michael@0:                  * Encode directly for MIME compatibility,
michael@0:                  * and reset state except for space, to not disrupt compression.
michael@0:                  */
michael@0:                 if(c!=0x20) {
michael@0:                     prev=BOCU1_ASCII_PREV;
michael@0:                 }
michael@0:                 *target++=(uint8_t)c;
michael@0:                 *offsets++=sourceIndex;
michael@0:                 --targetCapacity;
michael@0: 
michael@0:                 sourceIndex=nextSourceIndex;
michael@0:                 continue;
michael@0:             }
michael@0: 
michael@0:             if(U16_IS_LEAD(c)) {
michael@0: getTrail:
michael@0:                 /* also entered via goto with a lead surrogate from the previous buffer */
michael@0:                 if(source<sourceLimit) {
michael@0:                     /* test the following code unit */
michael@0:                     UChar trail=*source;
michael@0:                     if(U16_IS_TRAIL(trail)) {
michael@0:                         ++source;
michael@0:                         ++nextSourceIndex;
michael@0:                         c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0:                     }
michael@0:                     /* otherwise the unpaired lead surrogate is encoded as is */
michael@0:                 } else {
michael@0:                     /* no more input */
michael@0:                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             /*
michael@0:              * all other Unicode code points c==U+0021..U+10ffff
michael@0:              * are encoded with the difference c-prev
michael@0:              *
michael@0:              * a new prev is computed from c,
michael@0:              * placed in the middle of a 0x80-block (for most small scripts) or
michael@0:              * in the middle of the Unihan and Hangul blocks
michael@0:              * to statistically minimize the following difference
michael@0:              */
michael@0:             diff=c-prev;
michael@0:             prev=BOCU1_PREV(c);
michael@0:             if(DIFF_IS_SINGLE(diff)) {
michael@0:                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0:                 *offsets++=sourceIndex;
michael@0:                 --targetCapacity;
michael@0:                 sourceIndex=nextSourceIndex;
michael@0:                 if(c<0x3000) {
michael@0:                     /* likely more single-byte differences ahead: back to the fast loop */
michael@0:                     goto fastSingle;
michael@0:                 }
michael@0:             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
michael@0:                 /* optimize 2-byte case */
michael@0:                 int32_t m;
michael@0: 
michael@0:                 if(diff>=0) {
michael@0:                     diff-=BOCU1_REACH_POS_1+1;
michael@0:                     m=diff%BOCU1_TRAIL_COUNT;
michael@0:                     diff/=BOCU1_TRAIL_COUNT;
michael@0:                     diff+=BOCU1_START_POS_2;
michael@0:                 } else {
michael@0:                     diff-=BOCU1_REACH_NEG_1;
michael@0:                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0:                     diff+=BOCU1_START_NEG_2;
michael@0:                 }
michael@0:                 /* diff now holds the lead byte, m the trail value */
michael@0:                 *target++=(uint8_t)diff;
michael@0:                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
michael@0:                 *offsets++=sourceIndex;
michael@0:                 *offsets++=sourceIndex;
michael@0:                 targetCapacity-=2;
michael@0:                 sourceIndex=nextSourceIndex;
michael@0:             } else {
michael@0:                 int32_t length; /* will be 2..4 */
michael@0: 
michael@0:                 /* from here on diff holds the packed byte sequence, not the difference */
michael@0:                 diff=packDiff(diff);
michael@0:                 length=BOCU1_LENGTH_FROM_PACKED(diff);
michael@0: 
michael@0:                 /* write the output character bytes from diff and length */
michael@0:                 /* from the first if in the loop we know that targetCapacity>0 */
michael@0:                 if(length<=targetCapacity) {
michael@0:                     switch(length) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 4:
michael@0:                         *target++=(uint8_t)(diff>>24);
michael@0:                         *offsets++=sourceIndex;
michael@0:                     case 3: /*fall through*/
michael@0:                         *target++=(uint8_t)(diff>>16);
michael@0:                         *offsets++=sourceIndex;
michael@0:                     case 2: /*fall through*/
michael@0:                         *target++=(uint8_t)(diff>>8);
michael@0:                         *offsets++=sourceIndex;
michael@0:                     /* case 1: handled above */
michael@0:                         *target++=(uint8_t)diff;
michael@0:                         *offsets++=sourceIndex;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0:                     targetCapacity-=length;
michael@0:                     sourceIndex=nextSourceIndex;
michael@0:                 } else {
michael@0:                     uint8_t *charErrorBuffer;
michael@0: 
michael@0:                     /*
michael@0:                      * We actually do this backwards here:
michael@0:                      * In order to save an intermediate variable, we output
michael@0:                      * first to the overflow buffer what does not fit into the
michael@0:                      * regular target.
michael@0:                      */
michael@0:                     /* we know that 1<=targetCapacity<length<=4 */
michael@0:                     length-=targetCapacity;
michael@0:                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
michael@0:                     switch(length) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 3:
michael@0:                         *charErrorBuffer++=(uint8_t)(diff>>16);
michael@0:                     case 2: /*fall through*/
michael@0:                         *charErrorBuffer++=(uint8_t)(diff>>8);
michael@0:                     case 1: /*fall through*/
michael@0:                         *charErrorBuffer=(uint8_t)diff;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0:                     cnv->charErrorBufferLength=(int8_t)length;
michael@0: 
michael@0:                     /* now output what fits into the regular target */
michael@0:                     diff>>=8*length; /* length was reduced by targetCapacity */
michael@0:                     switch(targetCapacity) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 3:
michael@0:                         *target++=(uint8_t)(diff>>16);
michael@0:                         *offsets++=sourceIndex;
michael@0:                     case 2: /*fall through*/
michael@0:                         *target++=(uint8_t)(diff>>8);
michael@0:                         *offsets++=sourceIndex;
michael@0:                     case 1: /*fall through*/
michael@0:                         *target++=(uint8_t)diff;
michael@0:                         *offsets++=sourceIndex;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0: 
michael@0:                     /* target overflow */
michael@0:                     targetCapacity=0;
michael@0:                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0:         } else {
michael@0:             /* target is full */
michael@0:             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* set the converter state back into UConverter */
michael@0:     /* (c is negative exactly when a lead surrogate is still waiting for its trail) */
michael@0:     cnv->fromUChar32= c<0 ? -c : 0;
michael@0:     cnv->fromUnicodeStatus=(uint32_t)prev;
michael@0: 
michael@0:     /* write back the updated pointers */
michael@0:     pArgs->source=source;
michael@0:     pArgs->target=(char *)target;
michael@0:     pArgs->offsets=offsets;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
michael@0:  * If a change is made in the original function, then either
michael@0:  * change this function the same way or
michael@0:  * re-copy the original function and remove the variables
michael@0:  * offsets, sourceIndex, and nextSourceIndex.
michael@0:  */
michael@0: static void
michael@0: _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
michael@0:                   UErrorCode *pErrorCode) {
michael@0:     UConverter *cnv;
michael@0:     const UChar *source, *sourceLimit;
michael@0:     uint8_t *target;
michael@0:     int32_t targetCapacity;
michael@0: 
michael@0:     int32_t prev, c, diff;
michael@0: 
michael@0:     /* set up the local pointers */
michael@0:     cnv=pArgs->converter;
michael@0:     source=pArgs->source;
michael@0:     sourceLimit=pArgs->sourceLimit;
michael@0:     target=(uint8_t *)pArgs->target;
michael@0:     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0: 
michael@0:     /* get the converter state from UConverter */
michael@0:     c=cnv->fromUChar32;
michael@0:     prev=(int32_t)cnv->fromUnicodeStatus;
michael@0:     if(prev==0) {
michael@0:         prev=BOCU1_ASCII_PREV;
michael@0:     }
michael@0: 
michael@0:     /* conversion loop */
michael@0:     if(c!=0 && targetCapacity>0) {
michael@0:         goto getTrail;
michael@0:     }
michael@0: 
michael@0: fastSingle:
michael@0:     /* fast loop for single-byte differences */
michael@0:     /* use only one loop counter variable, targetCapacity, not also source */
michael@0:     diff=(int32_t)(sourceLimit-source);
michael@0:     if(targetCapacity>diff) {
michael@0:         targetCapacity=diff;
michael@0:     }
michael@0:     while(targetCapacity>0 && (c=*source)<0x3000) {
michael@0:         if(c<=0x20) {
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(uint8_t)c;
michael@0:         } else {
michael@0:             diff=c-prev;
michael@0:             if(DIFF_IS_SINGLE(diff)) {
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0:             } else {
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         ++source;
michael@0:         --targetCapacity;
michael@0:     }
michael@0:     /* restore real values */
michael@0:     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
michael@0: 
michael@0:     /* regular loop for all cases */
michael@0:     while(source<sourceLimit) {
michael@0:         if(targetCapacity>0) {
michael@0:             c=*source++;
michael@0: 
michael@0:             if(c<=0x20) {
michael@0:                 /*
michael@0:                  * ISO C0 control & space:
michael@0:                  * Encode directly for MIME compatibility,
michael@0:                  * and reset state except for space, to not disrupt compression.
michael@0:                  */
michael@0:                 if(c!=0x20) {
michael@0:                     prev=BOCU1_ASCII_PREV;
michael@0:                 }
michael@0:                 *target++=(uint8_t)c;
michael@0:                 --targetCapacity;
michael@0:                 continue;
michael@0:             }
michael@0: 
michael@0:             if(U16_IS_LEAD(c)) {
michael@0: getTrail:
michael@0:                 if(source<sourceLimit) {
michael@0:                     /* test the following code unit */
michael@0:                     UChar trail=*source;
michael@0:                     if(U16_IS_TRAIL(trail)) {
michael@0:                         ++source;
michael@0:                         c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0:                     }
michael@0:                 } else {
michael@0:                     /* no more input */
michael@0:                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             /*
michael@0:              * all other Unicode code points c==U+0021..U+10ffff
michael@0:              * are encoded with the difference c-prev
michael@0:              *
michael@0:              * a new prev is computed from c,
michael@0:              * placed in the middle of a 0x80-block (for most small scripts) or
michael@0:              * in the middle of the Unihan and Hangul blocks
michael@0:              * to statistically minimize the following difference
michael@0:              */
michael@0:             diff=c-prev;
michael@0:             prev=BOCU1_PREV(c);
michael@0:             if(DIFF_IS_SINGLE(diff)) {
michael@0:                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0:                 --targetCapacity;
michael@0:                 if(c<0x3000) {
michael@0:                     goto fastSingle;
michael@0:                 }
michael@0:             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
michael@0:                 /* optimize 2-byte case */
michael@0:                 int32_t m;
michael@0: 
michael@0:                 if(diff>=0) {
michael@0:                     diff-=BOCU1_REACH_POS_1+1;
michael@0:                     m=diff%BOCU1_TRAIL_COUNT;
michael@0:                     diff/=BOCU1_TRAIL_COUNT;
michael@0:                     diff+=BOCU1_START_POS_2;
michael@0:                 } else {
michael@0:                     diff-=BOCU1_REACH_NEG_1;
michael@0:                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0:                     diff+=BOCU1_START_NEG_2;
michael@0:                 }
michael@0:                 *target++=(uint8_t)diff;
michael@0:                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
michael@0:                 targetCapacity-=2;
michael@0:             } else {
michael@0:                 int32_t length; /* will be 2..4 */
michael@0: 
michael@0:                 diff=packDiff(diff);
michael@0:                 length=BOCU1_LENGTH_FROM_PACKED(diff);
michael@0: 
michael@0:                 /* write the output character bytes from diff and length */
michael@0:                 /* from the first if in the loop we know that targetCapacity>0 */
michael@0:                 if(length<=targetCapacity) {
michael@0:                     switch(length) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 4:
michael@0:                         *target++=(uint8_t)(diff>>24);
michael@0:                     case 3: /*fall through*/
michael@0:                         *target++=(uint8_t)(diff>>16);
michael@0:                     /* case 2: handled above */
michael@0:                         *target++=(uint8_t)(diff>>8);
michael@0:                     /* case 1: handled above */
michael@0:                         *target++=(uint8_t)diff;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0:                     targetCapacity-=length;
michael@0:                 } else {
michael@0:                     uint8_t *charErrorBuffer;
michael@0: 
michael@0:                     /*
michael@0:                      * We actually do this backwards here:
michael@0:                      * In order to save an intermediate variable, we output
michael@0:                      * first to the overflow buffer what does not fit into the
michael@0:                      * regular target.
michael@0:                      */
michael@0:                     /* we know that 1<=targetCapacity<length<=4 */
michael@0:                     length-=targetCapacity;
michael@0:                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
michael@0:                     switch(length) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 3:
michael@0:                         *charErrorBuffer++=(uint8_t)(diff>>16);
michael@0:                     case 2: /*fall through*/
michael@0:                         *charErrorBuffer++=(uint8_t)(diff>>8);
michael@0:                     case 1: /*fall through*/
michael@0:                         *charErrorBuffer=(uint8_t)diff;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0:                     cnv->charErrorBufferLength=(int8_t)length;
michael@0: 
michael@0:                     /* now output what fits into the regular target */
michael@0:                     diff>>=8*length; /* length was reduced by targetCapacity */
michael@0:                     switch(targetCapacity) {
michael@0:                         /* each branch falls through to the next one */
michael@0:                     case 3:
michael@0:                         *target++=(uint8_t)(diff>>16);
michael@0:                     case 2: /*fall through*/
michael@0:                         *target++=(uint8_t)(diff>>8);
michael@0:                     case 1: /*fall through*/
michael@0:                         *target++=(uint8_t)diff;
michael@0:                     default:
michael@0:                         /* will never occur */
michael@0:                         break;
michael@0:                     }
michael@0: 
michael@0:                     /* target overflow */
michael@0:                     targetCapacity=0;
michael@0:                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0:         } else {
michael@0:             /* target is full */
michael@0:             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* set the converter state back into UConverter */
michael@0:     cnv->fromUChar32= c<0 ? -c : 0;
michael@0:     cnv->fromUnicodeStatus=(uint32_t)prev;
michael@0: 
michael@0:     /* write back the updated pointers */
michael@0:     pArgs->source=source;
michael@0:     pArgs->target=(char *)target;
michael@0: }
michael@0: 
michael@0: /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
michael@0: 
michael@0: /**
michael@0:  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
michael@0:  *
michael@0:  * Classifies the lead byte as starting a two-, three- or four-byte sequence
michael@0:  * with a positive or negative difference, and computes the part of the
michael@0:  * difference that is determined by the lead byte alone.
michael@0:  *
michael@0:  * @param b lead byte;
michael@0:  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
michael@0:  * @return (diff<<2)|count, where diff is the partial difference and
michael@0:  *         count (1..3) is the number of trail bytes that must follow;
michael@0:  *         callers unpack it with "&3" and ">>2"
michael@0:  */
michael@0: static inline int32_t
michael@0: decodeBocu1LeadByte(int32_t b) {
michael@0:     int32_t diff, count;
michael@0: 
michael@0:     if(b>=BOCU1_START_NEG_2) {
michael@0:         /* positive difference */
michael@0:         if(b<BOCU1_START_POS_3) {
michael@0:             /* two bytes */
michael@0:             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0:             count=1;
michael@0:         } else if(b<BOCU1_START_POS_4) {
michael@0:             /* three bytes */
michael@0:             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
michael@0:             count=2;
michael@0:         } else {
michael@0:             /* four bytes */
michael@0:             diff=BOCU1_REACH_POS_3+1;
michael@0:             count=3;
michael@0:         }
michael@0:     } else {
michael@0:         /* negative difference */
michael@0:         if(b>=BOCU1_START_NEG_3) {
michael@0:             /* two bytes */
michael@0:             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0:             count=1;
michael@0:         } else if(b>BOCU1_MIN) {
michael@0:             /* three bytes */
michael@0:             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
michael@0:             count=2;
michael@0:         } else {
michael@0:             /* four bytes */
michael@0:             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
michael@0:             count=3;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /*
michael@0:      * return the state for decoding the trail byte(s)
michael@0:      *
michael@0:      * diff is negative for every negative-difference lead byte, and
michael@0:      * left-shifting a negative signed value is undefined behavior;
michael@0:      * pack the two fields in unsigned arithmetic instead.
michael@0:      */
michael@0:     return (int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * BOCU-1 decoder helper: turns one trail byte of a multi-byte sequence
michael@0:  * into the amount that it contributes to the difference value.
michael@0:  *
michael@0:  * @param count number of trail bytes still outstanding, counting this one
michael@0:  * @param b the trail byte value
michael@0:  * @return the value to add to diff for this byte;
michael@0:  *         a negative result signals an illegal trail byte
michael@0:  */
michael@0: static inline int32_t
michael@0: decodeBocu1TrailByte(int32_t count, int32_t b) {
michael@0:     int32_t trail;
michael@0: 
michael@0:     if(b>0x20) {
michael@0: #if BOCU1_MAX_TRAIL<0xff
michael@0:         if(b>BOCU1_MAX_TRAIL) {
michael@0:             return -99;
michael@0:         }
michael@0: #endif
michael@0:         /* regular trail byte: move it into the contiguous trail value range */
michael@0:         trail=b-BOCU1_TRAIL_BYTE_OFFSET;
michael@0:     } else {
michael@0:         /*
michael@0:          * byte in the C0/space range: look it up in the table;
michael@0:          * an illegal byte yields a negative value which stays negative
michael@0:          * through the scaling below
michael@0:          */
michael@0:         trail=bocu1ByteToTrail[b];
michael@0:     }
michael@0: 
michael@0:     /* weight the trail value according to its position in the sequence */
michael@0:     switch(count) {
michael@0:     case 1:
michael@0:         return trail;
michael@0:     case 2:
michael@0:         return trail*BOCU1_TRAIL_COUNT;
michael@0:     default: /* count==3 */
michael@0:         return trail*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
michael@0:     }
michael@0: }
michael@0: 
michael@0: static void
michael@0: _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0:                            UErrorCode *pErrorCode) {
michael@0:     UConverter *cnv;
michael@0:     const uint8_t *source, *sourceLimit;
michael@0:     UChar *target;
michael@0:     const UChar *targetLimit;
michael@0:     int32_t *offsets;
michael@0: 
michael@0:     int32_t prev, count, diff, c;
michael@0: 
michael@0:     int8_t byteIndex;
michael@0:     uint8_t *bytes;
michael@0: 
michael@0:     int32_t sourceIndex, nextSourceIndex;
michael@0: 
michael@0:     /* set up the local pointers */
michael@0:     cnv=pArgs->converter;
michael@0:     source=(const uint8_t *)pArgs->source;
michael@0:     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0:     target=pArgs->target;
michael@0:     targetLimit=pArgs->targetLimit;
michael@0:     offsets=pArgs->offsets;
michael@0: 
michael@0:     /* get the converter state from UConverter */
michael@0:     prev=(int32_t)cnv->toUnicodeStatus;
michael@0:     if(prev==0) {
michael@0:         prev=BOCU1_ASCII_PREV;
michael@0:     }
michael@0:     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
michael@0:     count=diff&3;
michael@0:     diff>>=2;
michael@0: 
michael@0:     byteIndex=cnv->toULength;
michael@0:     bytes=cnv->toUBytes;
michael@0: 
michael@0:     /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0:     sourceIndex=byteIndex==0 ? 0 : -1;
michael@0:     nextSourceIndex=0;
michael@0: 
michael@0:     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
michael@0:     if(count>0 && byteIndex>0 && target<targetLimit) {
michael@0:         goto getTrail;
michael@0:     }
michael@0: 
michael@0: fastSingle:
michael@0:     /* fast loop for single-byte differences */
michael@0:     /* use count as the only loop counter variable */
michael@0:     diff=(int32_t)(sourceLimit-source);
michael@0:     count=(int32_t)(pArgs->targetLimit-target);
michael@0:     if(count>diff) {
michael@0:         count=diff;
michael@0:     }
michael@0:     while(count>0) {
michael@0:         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
michael@0:             c=prev+(c-BOCU1_MIDDLE);
michael@0:             if(c<0x3000) {
michael@0:                 *target++=(UChar)c;
michael@0:                 *offsets++=nextSourceIndex++;
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:             } else {
michael@0:                 break;
michael@0:             }
michael@0:         } else if(c<=0x20) {
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(UChar)c;
michael@0:             *offsets++=nextSourceIndex++;
michael@0:         } else {
michael@0:             break;
michael@0:         }
michael@0:         ++source;
michael@0:         --count;
michael@0:     }
michael@0:     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
michael@0: 
michael@0:     /* decode a sequence of single and lead bytes */
michael@0:     while(source<sourceLimit) {
michael@0:         if(target>=targetLimit) {
michael@0:             /* target is full */
michael@0:             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         ++nextSourceIndex;
michael@0:         c=*source++;
michael@0:         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
michael@0:             /* Write a code point directly from a single-byte difference. */
michael@0:             c=prev+(c-BOCU1_MIDDLE);
michael@0:             if(c<0x3000) {
michael@0:                 *target++=(UChar)c;
michael@0:                 *offsets++=sourceIndex;
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:                 sourceIndex=nextSourceIndex;
michael@0:                 goto fastSingle;
michael@0:             }
michael@0:         } else if(c<=0x20) {
michael@0:             /*
michael@0:              * Direct-encoded C0 control code or space.
michael@0:              * Reset prev for C0 control codes but not for space.
michael@0:              */
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(UChar)c;
michael@0:             *offsets++=sourceIndex;
michael@0:             sourceIndex=nextSourceIndex;
michael@0:             continue;
michael@0:         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
michael@0:             /* Optimize two-byte case. */
michael@0:             if(c>=BOCU1_MIDDLE) {
michael@0:                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0:             } else {
michael@0:                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0:             }
michael@0: 
michael@0:             /* trail byte */
michael@0:             ++nextSourceIndex;
michael@0:             c=decodeBocu1TrailByte(1, *source++);
michael@0:             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
michael@0:                 bytes[0]=source[-2];
michael@0:                 bytes[1]=source[-1];
michael@0:                 byteIndex=2;
michael@0:                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                 break;
michael@0:             }
michael@0:         } else if(c==BOCU1_RESET) {
michael@0:             /* only reset the state, no code point */
michael@0:             prev=BOCU1_ASCII_PREV;
michael@0:             sourceIndex=nextSourceIndex;
michael@0:             continue;
michael@0:         } else {
michael@0:             /*
michael@0:              * For multi-byte difference lead bytes, set the decoder state
michael@0:              * with the partial difference value from the lead byte and
michael@0:              * with the number of trail bytes.
michael@0:              */
michael@0:             bytes[0]=(uint8_t)c;
michael@0:             byteIndex=1;
michael@0: 
michael@0:             diff=decodeBocu1LeadByte(c);
michael@0:             count=diff&3;
michael@0:             diff>>=2;
michael@0: getTrail:
michael@0:             for(;;) {
michael@0:                 if(source>=sourceLimit) {
michael@0:                     goto endloop;
michael@0:                 }
michael@0:                 ++nextSourceIndex;
michael@0:                 c=bytes[byteIndex++]=*source++;
michael@0: 
michael@0:                 /* trail byte in any position */
michael@0:                 c=decodeBocu1TrailByte(count, c);
michael@0:                 if(c<0) {
michael@0:                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                     goto endloop;
michael@0:                 }
michael@0: 
michael@0:                 diff+=c;
michael@0:                 if(--count==0) {
michael@0:                     /* final trail byte, deliver a code point */
michael@0:                     byteIndex=0;
michael@0:                     c=prev+diff;
michael@0:                     if((uint32_t)c>0x10ffff) {
michael@0:                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                         goto endloop;
michael@0:                     }
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* calculate the next prev and output c */
michael@0:         prev=BOCU1_PREV(c);
michael@0:         if(c<=0xffff) {
michael@0:             *target++=(UChar)c;
michael@0:             *offsets++=sourceIndex;
michael@0:         } else {
michael@0:             /* output surrogate pair */
michael@0:             *target++=U16_LEAD(c);
michael@0:             if(target<targetLimit) {
michael@0:                 *target++=U16_TRAIL(c);
michael@0:                 *offsets++=sourceIndex;
michael@0:                 *offsets++=sourceIndex;
michael@0:             } else {
michael@0:                 /* target overflow */
michael@0:                 *offsets++=sourceIndex;
michael@0:                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
michael@0:                 cnv->UCharErrorBufferLength=1;
michael@0:                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         sourceIndex=nextSourceIndex;
michael@0:     }
michael@0: endloop:
michael@0: 
michael@0:     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
michael@0:         /* set the converter state in UConverter to deal with the next character */
michael@0:         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
michael@0:         cnv->mode=0;
michael@0:     } else {
michael@0:         /* set the converter state back into UConverter */
michael@0:         cnv->toUnicodeStatus=(uint32_t)prev;
michael@0:         cnv->mode=(diff<<2)|count;
michael@0:     }
michael@0:     cnv->toULength=byteIndex;
michael@0: 
michael@0:     /* write back the updated pointers */
michael@0:     pArgs->source=(const char *)source;
michael@0:     pArgs->target=target;
michael@0:     pArgs->offsets=offsets;
michael@0:     return;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
michael@0:  * If a change is made in the original function, then either
michael@0:  * change this function the same way or
michael@0:  * re-copy the original function and remove the variables
michael@0:  * offsets, sourceIndex, and nextSourceIndex.
michael@0:  *
michael@0:  * Decodes BOCU-1 bytes from pArgs->source (up to pArgs->sourceLimit) into
michael@0:  * UTF-16 at pArgs->target (up to pArgs->targetLimit) and advances both
michael@0:  * pointers on return.
michael@0:  *
michael@0:  * State carried in the UConverter between calls:
michael@0:  * - toUnicodeStatus: the "prev" value (0 is replaced by BOCU1_ASCII_PREV)
michael@0:  * - mode: (partial diff<<2)|number of trail bytes still expected
michael@0:  * - toUBytes/toULength: the bytes of an incomplete or illegal sequence
michael@0:  *
michael@0:  * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full,
michael@0:  * or to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result outside
michael@0:  * U+0000..U+10FFFF (prev and mode are then reset).
michael@0:  */
michael@0: static void
michael@0: _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
michael@0:                 UErrorCode *pErrorCode) {
michael@0:     UConverter *cnv;
michael@0:     const uint8_t *source, *sourceLimit;
michael@0:     UChar *target;
michael@0:     const UChar *targetLimit;
michael@0: 
michael@0:     int32_t prev, count, diff, c;
michael@0: 
michael@0:     int8_t byteIndex;
michael@0:     uint8_t *bytes;
michael@0: 
michael@0: U_ALIGN_CODE(16)
michael@0: 
michael@0:     /* set up the local pointers */
michael@0:     cnv=pArgs->converter;
michael@0:     source=(const uint8_t *)pArgs->source;
michael@0:     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0:     target=pArgs->target;
michael@0:     targetLimit=pArgs->targetLimit;
michael@0: 
michael@0:     /* get the converter state from UConverter */
michael@0:     prev=(int32_t)cnv->toUnicodeStatus;
michael@0:     if(prev==0) {
michael@0:         prev=BOCU1_ASCII_PREV;
michael@0:     }
michael@0:     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
michael@0:     count=diff&3;
michael@0:     diff>>=2;
michael@0: 
michael@0:     byteIndex=cnv->toULength;
michael@0:     bytes=cnv->toUBytes;
michael@0: 
michael@0:     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
michael@0:     if(count>0 && byteIndex>0) {
michael@0:         /* a multi-byte sequence from the previous buffer is still incomplete */
michael@0:         if(target<targetLimit) {
michael@0:             goto getTrail;
michael@0:         }
michael@0:         /*
michael@0:          * No room for the pending code point: leave now, before the fast
michael@0:          * loop below reuses diff and count as scratch variables, so that
michael@0:          * the partial-sequence state is written back unchanged.
michael@0:          */
michael@0:         if(source<sourceLimit) {
michael@0:             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:         }
michael@0:         goto endloop;
michael@0:     }
michael@0: 
michael@0: fastSingle:
michael@0:     /* fast loop for single-byte differences */
michael@0:     /* use count as the only loop counter variable */
michael@0:     /* (diff and count are scratch here; no multi-byte sequence is pending) */
michael@0:     diff=(int32_t)(sourceLimit-source);
michael@0:     count=(int32_t)(pArgs->targetLimit-target);
michael@0:     if(count>diff) {
michael@0:         count=diff;
michael@0:     }
michael@0:     while(count>0) {
michael@0:         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
michael@0:             c=prev+(c-BOCU1_MIDDLE);
michael@0:             if(c<0x3000) {
michael@0:                 *target++=(UChar)c;
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:             } else {
michael@0:                 /* leave the byte unconsumed for the general loop */
michael@0:                 break;
michael@0:             }
michael@0:         } else if(c<=0x20) {
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(UChar)c;
michael@0:         } else {
michael@0:             /* lead byte of a multi-byte sequence, or reset byte */
michael@0:             break;
michael@0:         }
michael@0:         ++source;
michael@0:         --count;
michael@0:     }
michael@0: 
michael@0:     /* decode a sequence of single and lead bytes */
michael@0:     while(source<sourceLimit) {
michael@0:         if(target>=targetLimit) {
michael@0:             /* target is full */
michael@0:             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         c=*source++;
michael@0:         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
michael@0:             /* Write a code point directly from a single-byte difference. */
michael@0:             c=prev+(c-BOCU1_MIDDLE);
michael@0:             if(c<0x3000) {
michael@0:                 *target++=(UChar)c;
michael@0:                 prev=BOCU1_SIMPLE_PREV(c);
michael@0:                 goto fastSingle;
michael@0:             }
michael@0:             /* c>=0x3000: fall out of the if-chain to the general output code */
michael@0:         } else if(c<=0x20) {
michael@0:             /*
michael@0:              * Direct-encoded C0 control code or space.
michael@0:              * Reset prev for C0 control codes but not for space.
michael@0:              */
michael@0:             if(c!=0x20) {
michael@0:                 prev=BOCU1_ASCII_PREV;
michael@0:             }
michael@0:             *target++=(UChar)c;
michael@0:             continue;
michael@0:         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
michael@0:             /* Optimize two-byte case. */
michael@0:             if(c>=BOCU1_MIDDLE) {
michael@0:                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0:             } else {
michael@0:                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0:             }
michael@0: 
michael@0:             /* trail byte */
michael@0:             c=decodeBocu1TrailByte(1, *source++);
michael@0:             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
michael@0:                 /* hand both bytes of the illegal sequence back via toUBytes */
michael@0:                 bytes[0]=source[-2];
michael@0:                 bytes[1]=source[-1];
michael@0:                 byteIndex=2;
michael@0:                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                 break;
michael@0:             }
michael@0:         } else if(c==BOCU1_RESET) {
michael@0:             /* only reset the state, no code point */
michael@0:             prev=BOCU1_ASCII_PREV;
michael@0:             continue;
michael@0:         } else {
michael@0:             /*
michael@0:              * For multi-byte difference lead bytes, set the decoder state
michael@0:              * with the partial difference value from the lead byte and
michael@0:              * with the number of trail bytes.
michael@0:              */
michael@0:             bytes[0]=(uint8_t)c;
michael@0:             byteIndex=1;
michael@0: 
michael@0:             diff=decodeBocu1LeadByte(c);
michael@0:             count=diff&3;
michael@0:             diff>>=2;
michael@0: getTrail:
michael@0:             for(;;) {
michael@0:                 if(source>=sourceLimit) {
michael@0:                     /* input exhausted mid-sequence: diff, count and bytes are saved below */
michael@0:                     goto endloop;
michael@0:                 }
michael@0:                 c=bytes[byteIndex++]=*source++;
michael@0: 
michael@0:                 /* trail byte in any position */
michael@0:                 c=decodeBocu1TrailByte(count, c);
michael@0:                 if(c<0) {
michael@0:                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                     goto endloop;
michael@0:                 }
michael@0: 
michael@0:                 diff+=c;
michael@0:                 if(--count==0) {
michael@0:                     /* final trail byte, deliver a code point */
michael@0:                     byteIndex=0;
michael@0:                     c=prev+diff;
michael@0:                     if((uint32_t)c>0x10ffff) {
michael@0:                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0:                         goto endloop;
michael@0:                     }
michael@0:                     break;
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* calculate the next prev and output c */
michael@0:         prev=BOCU1_PREV(c);
michael@0:         if(c<=0xffff) {
michael@0:             *target++=(UChar)c;
michael@0:         } else {
michael@0:             /* output surrogate pair */
michael@0:             *target++=U16_LEAD(c);
michael@0:             if(target<targetLimit) {
michael@0:                 *target++=U16_TRAIL(c);
michael@0:             } else {
michael@0:                 /* target overflow */
michael@0:                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
michael@0:                 cnv->UCharErrorBufferLength=1;
michael@0:                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: endloop:
michael@0: 
michael@0:     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
michael@0:         /* set the converter state in UConverter to deal with the next character */
michael@0:         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
michael@0:         cnv->mode=0;
michael@0:     } else {
michael@0:         /* set the converter state back into UConverter */
michael@0:         cnv->toUnicodeStatus=(uint32_t)prev;
michael@0:         /* diff may be negative: pack in unsigned arithmetic, a signed left shift would be undefined */
michael@0:         cnv->mode=(int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
michael@0:     }
michael@0:     cnv->toULength=byteIndex;
michael@0: 
michael@0:     /* write back the updated pointers */
michael@0:     pArgs->source=(const char *)source;
michael@0:     pArgs->target=target;
michael@0:     return;
michael@0: }
michael@0: 
michael@0: /* miscellaneous ------------------------------------------------------------ */
michael@0: 
michael@0: static const UConverterImpl _Bocu1Impl={ /* implementation table for the BOCU-1 converter; entries are positional, slot meanings come from the UConverterImpl declaration (not in this file -- confirm there) */
michael@0:     UCNV_BOCU1, /* presumably the converter type tag */
michael@0: 
michael@0:     NULL, /* NULL slots: no BOCU-1-specific function is supplied */
michael@0:     NULL,
michael@0: 
michael@0:     NULL,
michael@0:     NULL,
michael@0:     NULL,
michael@0: 
michael@0:     _Bocu1ToUnicode, /* BOCU-1 decoder without offset handling */
michael@0:     _Bocu1ToUnicodeWithOffsets, /* BOCU-1 decoder that also writes pArgs->offsets */
michael@0:     _Bocu1FromUnicode, /* BOCU-1 encoder */
michael@0:     _Bocu1FromUnicodeWithOffsets, /* BOCU-1 encoder, offsets variant (judging by its name) */
michael@0:     NULL,
michael@0: 
michael@0:     NULL,
michael@0:     NULL,
michael@0:     NULL,
michael@0:     NULL,
michael@0:     ucnv_getCompleteUnicodeSet, /* presumably reports all of Unicode as convertible -- verify against its declaration */
michael@0: 
michael@0:     NULL,
michael@0:     NULL
michael@0: };
michael@0: 
michael@0: static const UConverterStaticData _Bocu1StaticData={ /* static description of the BOCU-1 converter; fields are positional per UConverterStaticData (declared elsewhere) */
michael@0:     sizeof(UConverterStaticData), /* size of this structure */
michael@0:     "BOCU-1", /* presumably the converter name */
michael@0:     1214, /* CCSID for BOCU-1 */
michael@0:     UCNV_IBM, UCNV_BOCU1, /* presumably platform and converter type -- confirm against the struct declaration */
michael@0:     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
michael@0:     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
michael@0:     FALSE, FALSE,
michael@0:     0,
michael@0:     0,
michael@0:     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0: };
michael@0: 
michael@0: const UConverterSharedData _Bocu1Data={ /* shared data object for BOCU-1; fields are positional per UConverterSharedData (declared elsewhere) */
michael@0:     sizeof(UConverterSharedData), ~((uint32_t)0), /* structure size, then an all-ones 32-bit value (field meaning not visible here) */
michael@0:     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, /* ties together the static data and the implementation table defined above */
michael@0:     0,
michael@0:     UCNV_MBCS_TABLE_INITIALIZER
michael@0: };
michael@0: 
michael@0: #endif