The Tor Browser: intl/icu/source/common/ucnvbocu.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/ucnvbocu.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/ucnvbocu.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Thu, 22 Jan 2015 13:21:57 +0100
branch: TOR_BUG_9701
changeset 15: b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

 /*
 ******************************************************************************
 *
 *   Copyright (C) 2002-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  ucnvbocu.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2002mar27
 *   created by: Markus W. Scherer
 *
 *   This is an implementation of the Binary Ordered Compression for Unicode,
 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
 */
 #include "unicode/utypes.h"
 #if !UCONFIG_NO_CONVERSION
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_cb.h"
 #include "unicode/utf16.h"
 #include "putilimp.h"
 #include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 #include "uassert.h"
 /* BOCU-1 constants and macros ---------------------------------------------- */
 /*
  * BOCU-1 encodes the code points of a Unicode string as
  * a sequence of byte-encoded differences (slope detection),
  * preserving lexical order.
  *
  * Optimize the difference-taking for runs of Unicode text within
  * small scripts:
  *
  * Most small scripts are allocated within aligned 128-blocks of Unicode
  * code points. Lexical order is preserved if the "previous code point" state
  * is always moved into the middle of such a block.
  *
  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  * areas into the middle of those areas.
  *
  * C0 control codes and space are encoded with their US-ASCII bytes.
  * "prev" is reset for C0 controls but not for space.
  */
 /* initial value for "prev": middle of the ASCII range */
 #define BOCU1_ASCII_PREV        0x40
 /* bounding byte values for differences */
 #define BOCU1_MIN               0x21
 #define BOCU1_MIDDLE            0x90
 #define BOCU1_MAX_LEAD          0xfe
 #define BOCU1_MAX_TRAIL         0xff
 #define BOCU1_RESET             0xff
 /* number of lead bytes */
 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
 /* adjust trail byte counts for the use of some C0 control byte values */
 #define BOCU1_TRAIL_CONTROLS_COUNT  20
 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
 /* number of trail bytes */
 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
 /*
  * number of positive and negative single-byte codes
  * (counting 0==BOCU1_MIDDLE among the positive ones)
  */
 #define BOCU1_SINGLE            64
 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
 #define BOCU1_LEAD_2            43
 #define BOCU1_LEAD_3            3
 #define BOCU1_LEAD_4            1
 /* The difference value range for single-byters. */
 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
 /* The difference value range for double-byters. */
 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
 /* The difference value range for 3-byters. */
 #define BOCU1_REACH_POS_3   \
     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
 /* The lead byte start values. */
 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
      /* ==BOCU1_MAX_LEAD */
 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
      /* ==BOCU1_MIN+1 */
 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
 #define BOCU1_LENGTH_FROM_LEAD(lead) \
     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
 /* The length of a byte sequence, according to its packed form. */
 #define BOCU1_LENGTH_FROM_PACKED(packed) \
     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 /*
  * 12 commonly used C0 control codes (and space) are only used to encode
  * themselves directly,
  * which makes BOCU-1 MIME-usable and reasonably safe for
  * ASCII-oriented software.
  *
  * These controls are
  *  0   NUL
  *
  *  7   BEL
  *  8   BS
  *
  *  9   TAB
  *  a   LF
  *  b   VT
  *  c   FF
  *  d   CR
  *
  *  e   SO
  *  f   SI
  *
  * 1a   SUB
  * 1b   ESC
  *
  * The other 20 C0 controls are also encoded directly (to preserve order)
  * but are also used as trail bytes in difference encoding
  * (for better compression).
  */
 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 /*
  * Byte value map for control codes,
  * from external byte values 0x00..0x20
  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
  * External byte values that are illegal as trail bytes are mapped to -1.
  */
 static const int8_t
 bocu1ByteToTrail[BOCU1_MIN]={
 /*  0     1     2     3     4     5     6     7    */
     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
 /*  8     9     a     b     c     d     e     f    */
     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 /*  10    11    12    13    14    15    16    17   */
 x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
 /*  18    19    1a    1b    1c    1d    1e    1f   */
 x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
 /*  20   */
     -1
 };
 /*
  * Byte value map for control codes,
  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
  * to external byte values 0x00..0x20.
  */
 static const int8_t
 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
 /*  0     1     2     3     4     5     6     7    */
 x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 /*  8     9     a     b     c     d     e     f    */
 x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 /*  10    11    12    13   */
 x1c, 0x1d, 0x1e, 0x1f
 };
 /**
  * Integer division and modulo with negative numerators
  * yields negative modulo results and quotients that are one more than
  * what we need here.
  * This macro adjust the results so that the modulo-value m is always >=0.
  *
  * For positive n, the if() condition is always FALSE.
  *
  * @param n Number to be split into quotient and rest.
  *          Will be modified to contain the quotient.
  * @param d Divisor.
  * @param m Output variable for the rest (modulo result).
  */
 #define NEGDIVMOD(n, d, m) { \
     (m)=(n)%(d); \
     (n)/=(d); \
     if((m)<0) { \
         --(n); \
         (m)+=(d); \
     } \
 }
 /* Faster versions of packDiff() for single-byte-encoded diff values. */
 /** Is a diff value encodable in a single byte? */
 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
 /** Encode a diff value in a single byte. */
 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
 /** Is a diff value encodable in two bytes? */
 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
 /* BOCU-1 implementation functions ------------------------------------------ */
 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
 /**
  * Compute the next "previous" value for differencing
  * from the current code point.
  *
  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
  * @return "previous code point" state value
  */
 static inline int32_t
 bocu1Prev(int32_t c) {
     /* compute new prev */
     if(/* 0x3040<=c && */ c<=0x309f) {
         /* Hiragana is not 128-aligned */
         return 0x3070;
     } else if(0x4e00<=c && c<=0x9fa5) {
         /* CJK Unihan */
         return 0x4e00-BOCU1_REACH_NEG_2;
     } else if(0xac00<=c /* && c<=0xd7a3 */) {
         /* Korean Hangul */
         return (0xd7a3+0xac00)/2;
     } else {
         /* mostly small scripts */
         return BOCU1_SIMPLE_PREV(c);
     }
 }
 /** Fast version of bocu1Prev() for most scripts. */
 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
 /*
  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
  * The UConverter fields are used as follows:
  *
  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  *
  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
  */
 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
 /**
  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
  * and return a packed integer with them.
  *
  * The encoding favors small absolute differences with short encodings
  * to compress runs of same-script characters.
  *
  * Optimized version with unrolled loops and fewer floating-point operations
  * than the standard packDiff().
  *
  * @param diff difference value -0x10ffff..0x10ffff
  * @return
  *      0x010000zz for 1-byte sequence zz
  *      0x0200yyzz for 2-byte sequence yy zz
  *      0x03xxyyzz for 3-byte sequence xx yy zz
  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
  */
 static int32_t
 packDiff(int32_t diff) {
     int32_t result, m;
     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
     if(diff>=BOCU1_REACH_NEG_1) {
         /* mostly positive differences, and single-byte negative ones */
 #if 0   /* single-byte case handled in macros, see below */
         if(diff<=BOCU1_REACH_POS_1) {
             /* single byte */
             return 0x01000000|(BOCU1_MIDDLE+diff);
         } else
 #endif
         if(diff<=BOCU1_REACH_POS_2) {
             /* two bytes */
             diff-=BOCU1_REACH_POS_1+1;
             result=0x02000000;
             m=diff%BOCU1_TRAIL_COUNT;
             diff/=BOCU1_TRAIL_COUNT;
             result|=BOCU1_TRAIL_TO_BYTE(m);
             result|=(BOCU1_START_POS_2+diff)<<8;
         } else if(diff<=BOCU1_REACH_POS_3) {
             /* three bytes */
             diff-=BOCU1_REACH_POS_2+1;
             result=0x03000000;
             m=diff%BOCU1_TRAIL_COUNT;
             diff/=BOCU1_TRAIL_COUNT;
             result|=BOCU1_TRAIL_TO_BYTE(m);
             m=diff%BOCU1_TRAIL_COUNT;
             diff/=BOCU1_TRAIL_COUNT;
             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
             result|=(BOCU1_START_POS_3+diff)<<16;
         } else {
             /* four bytes */
             diff-=BOCU1_REACH_POS_3+1;
             m=diff%BOCU1_TRAIL_COUNT;
             diff/=BOCU1_TRAIL_COUNT;
             result=BOCU1_TRAIL_TO_BYTE(m);
             m=diff%BOCU1_TRAIL_COUNT;
             diff/=BOCU1_TRAIL_COUNT;
             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
             /*
              * We know that / and % would deliver quotient 0 and rest=diff.
              * Avoid division and modulo for performance.
              */
             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
             result|=((uint32_t)BOCU1_START_POS_4)<<24;
         }
     } else {
         /* two- to four-byte negative differences */
         if(diff>=BOCU1_REACH_NEG_2) {
             /* two bytes */
             diff-=BOCU1_REACH_NEG_1;
             result=0x02000000;
             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
             result|=BOCU1_TRAIL_TO_BYTE(m);
             result|=(BOCU1_START_NEG_2+diff)<<8;
         } else if(diff>=BOCU1_REACH_NEG_3) {
             /* three bytes */
             diff-=BOCU1_REACH_NEG_2;
             result=0x03000000;
             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
             result|=BOCU1_TRAIL_TO_BYTE(m);
             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
             result|=(BOCU1_START_NEG_3+diff)<<16;
         } else {
             /* four bytes */
             diff-=BOCU1_REACH_NEG_3;
             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
             result=BOCU1_TRAIL_TO_BYTE(m);
             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
             /*
              * We know that NEGDIVMOD would deliver
              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
              * Avoid division and modulo for performance.
              */
             m=diff+BOCU1_TRAIL_COUNT;
             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
             result|=BOCU1_MIN<<24;
         }
     }
     return result;
 }
 static void
 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                              UErrorCode *pErrorCode) {
     UConverter *cnv;
     const UChar *source, *sourceLimit;
     uint8_t *target;
     int32_t targetCapacity;
     int32_t *offsets;
     int32_t prev, c, diff;
     int32_t sourceIndex, nextSourceIndex;
 U_ALIGN_CODE(16)
     /* set up the local pointers */
     cnv=pArgs->converter;
     source=pArgs->source;
     sourceLimit=pArgs->sourceLimit;
     target=(uint8_t *)pArgs->target;
     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
     offsets=pArgs->offsets;
     /* get the converter state from UConverter */
     c=cnv->fromUChar32;
     prev=(int32_t)cnv->fromUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
     }
     /* sourceIndex=-1 if the current character began in the previous buffer */
     sourceIndex= c==0 ? 0 : -1;
     nextSourceIndex=0;
     /* conversion loop */
     if(c!=0 && targetCapacity>0) {
         goto getTrail;
     }
 fastSingle:
     /* fast loop for single-byte differences */
     /* use only one loop counter variable, targetCapacity, not also source */
     diff=(int32_t)(sourceLimit-source);
     if(targetCapacity>diff) {
         targetCapacity=diff;
     }
     while(targetCapacity>0 && (c=*source)<0x3000) {
         if(c<=0x20) {
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(uint8_t)c;
             *offsets++=nextSourceIndex++;
             ++source;
             --targetCapacity;
         } else {
             diff=c-prev;
             if(DIFF_IS_SINGLE(diff)) {
                 prev=BOCU1_SIMPLE_PREV(c);
                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                 *offsets++=nextSourceIndex++;
                 ++source;
                 --targetCapacity;
             } else {
                 break;
             }
         }
     }
     /* restore real values */
     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
     /* regular loop for all cases */
     while(source<sourceLimit) {
         if(targetCapacity>0) {
             c=*source++;
             ++nextSourceIndex;
             if(c<=0x20) {
                 /*
                  * ISO C0 control & space:
                  * Encode directly for MIME compatibility,
                  * and reset state except for space, to not disrupt compression.
                  */
                 if(c!=0x20) {
                     prev=BOCU1_ASCII_PREV;
                 }
                 *target++=(uint8_t)c;
                 *offsets++=sourceIndex;
                 --targetCapacity;
                 sourceIndex=nextSourceIndex;
                 continue;
             }
             if(U16_IS_LEAD(c)) {
 getTrail:
                 if(source<sourceLimit) {
                     /* test the following code unit */
                     UChar trail=*source;
                     if(U16_IS_TRAIL(trail)) {
                         ++source;
                         ++nextSourceIndex;
                         c=U16_GET_SUPPLEMENTARY(c, trail);
                     }
                 } else {
                     /* no more input */
                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                     break;
                 }
             }
             /*
              * all other Unicode code points c==U+0021..U+10ffff
              * are encoded with the difference c-prev
              *
              * a new prev is computed from c,
              * placed in the middle of a 0x80-block (for most small scripts) or
              * in the middle of the Unihan and Hangul blocks
              * to statistically minimize the following difference
              */
             diff=c-prev;
             prev=BOCU1_PREV(c);
             if(DIFF_IS_SINGLE(diff)) {
                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                 *offsets++=sourceIndex;
                 --targetCapacity;
                 sourceIndex=nextSourceIndex;
                 if(c<0x3000) {
                     goto fastSingle;
                 }
             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                 /* optimize 2-byte case */
                 int32_t m;
                 if(diff>=0) {
                     diff-=BOCU1_REACH_POS_1+1;
                     m=diff%BOCU1_TRAIL_COUNT;
                     diff/=BOCU1_TRAIL_COUNT;
                     diff+=BOCU1_START_POS_2;
                 } else {
                     diff-=BOCU1_REACH_NEG_1;
                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                     diff+=BOCU1_START_NEG_2;
                 }
                 *target++=(uint8_t)diff;
                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                 *offsets++=sourceIndex;
                 *offsets++=sourceIndex;
                 targetCapacity-=2;
                 sourceIndex=nextSourceIndex;
             } else {
                 int32_t length; /* will be 2..4 */
                 diff=packDiff(diff);
                 length=BOCU1_LENGTH_FROM_PACKED(diff);
                 /* write the output character bytes from diff and length */
                 /* from the first if in the loop we know that targetCapacity>0 */
                 if(length<=targetCapacity) {
                     switch(length) {
                         /* each branch falls through to the next one */
                     case 4:
                         *target++=(uint8_t)(diff>>24);
                         *offsets++=sourceIndex;
                     case 3: /*fall through*/
                         *target++=(uint8_t)(diff>>16);
                         *offsets++=sourceIndex;
                     case 2: /*fall through*/
                         *target++=(uint8_t)(diff>>8);
                         *offsets++=sourceIndex;
                     /* case 1: handled above */
                         *target++=(uint8_t)diff;
                         *offsets++=sourceIndex;
                     default:
                         /* will never occur */
                         break;
                     }
                     targetCapacity-=length;
                     sourceIndex=nextSourceIndex;
                 } else {
                     uint8_t *charErrorBuffer;
                     /*
                      * We actually do this backwards here:
                      * In order to save an intermediate variable, we output
                      * first to the overflow buffer what does not fit into the
                      * regular target.
                      */
                     /* we know that 1<=targetCapacity<length<=4 */
                     length-=targetCapacity;
                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                     switch(length) {
                         /* each branch falls through to the next one */
                     case 3:
                         *charErrorBuffer++=(uint8_t)(diff>>16);
                     case 2: /*fall through*/
                         *charErrorBuffer++=(uint8_t)(diff>>8);
                     case 1: /*fall through*/
                         *charErrorBuffer=(uint8_t)diff;
                     default:
                         /* will never occur */
                         break;
                     }
                     cnv->charErrorBufferLength=(int8_t)length;
                     /* now output what fits into the regular target */
                     diff>>=8*length; /* length was reduced by targetCapacity */
                     switch(targetCapacity) {
                         /* each branch falls through to the next one */
                     case 3:
                         *target++=(uint8_t)(diff>>16);
                         *offsets++=sourceIndex;
                     case 2: /*fall through*/
                         *target++=(uint8_t)(diff>>8);
                         *offsets++=sourceIndex;
                     case 1: /*fall through*/
                         *target++=(uint8_t)diff;
                         *offsets++=sourceIndex;
                     default:
                         /* will never occur */
                         break;
                     }
                     /* target overflow */
                     targetCapacity=0;
                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                     break;
                 }
             }
         } else {
             /* target is full */
             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
             break;
         }
     }
     /* set the converter state back into UConverter */
     cnv->fromUChar32= c<0 ? -c : 0;
     cnv->fromUnicodeStatus=(uint32_t)prev;
     /* write back the updated pointers */
     pArgs->source=source;
     pArgs->target=(char *)target;
     pArgs->offsets=offsets;
 }
 /*
  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
  * If a change is made in the original function, then either
  * change this function the same way or
  * re-copy the original function and remove the variables
  * offsets, sourceIndex, and nextSourceIndex.
  */
 static void
 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
                   UErrorCode *pErrorCode) {
     UConverter *cnv;
     const UChar *source, *sourceLimit;
     uint8_t *target;
     int32_t targetCapacity;
     int32_t prev, c, diff;
     /* set up the local pointers */
     cnv=pArgs->converter;
     source=pArgs->source;
     sourceLimit=pArgs->sourceLimit;
     target=(uint8_t *)pArgs->target;
     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
     /* get the converter state from UConverter */
     c=cnv->fromUChar32;
     prev=(int32_t)cnv->fromUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
     }
     /* conversion loop */
     if(c!=0 && targetCapacity>0) {
         goto getTrail;
     }
 fastSingle:
     /* fast loop for single-byte differences */
     /* use only one loop counter variable, targetCapacity, not also source */
     diff=(int32_t)(sourceLimit-source);
     if(targetCapacity>diff) {
         targetCapacity=diff;
     }
     while(targetCapacity>0 && (c=*source)<0x3000) {
         if(c<=0x20) {
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(uint8_t)c;
         } else {
             diff=c-prev;
             if(DIFF_IS_SINGLE(diff)) {
                 prev=BOCU1_SIMPLE_PREV(c);
                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
             } else {
                 break;
             }
         }
         ++source;
         --targetCapacity;
     }
     /* restore real values */
     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
     /* regular loop for all cases */
     while(source<sourceLimit) {
         if(targetCapacity>0) {
             c=*source++;
             if(c<=0x20) {
                 /*
                  * ISO C0 control & space:
                  * Encode directly for MIME compatibility,
                  * and reset state except for space, to not disrupt compression.
                  */
                 if(c!=0x20) {
                     prev=BOCU1_ASCII_PREV;
                 }
                 *target++=(uint8_t)c;
                 --targetCapacity;
                 continue;
             }
             if(U16_IS_LEAD(c)) {
 getTrail:
                 if(source<sourceLimit) {
                     /* test the following code unit */
                     UChar trail=*source;
                     if(U16_IS_TRAIL(trail)) {
                         ++source;
                         c=U16_GET_SUPPLEMENTARY(c, trail);
                     }
                 } else {
                     /* no more input */
                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                     break;
                 }
             }
             /*
              * all other Unicode code points c==U+0021..U+10ffff
              * are encoded with the difference c-prev
              *
              * a new prev is computed from c,
              * placed in the middle of a 0x80-block (for most small scripts) or
              * in the middle of the Unihan and Hangul blocks
              * to statistically minimize the following difference
              */
             diff=c-prev;
             prev=BOCU1_PREV(c);
             if(DIFF_IS_SINGLE(diff)) {
                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                 --targetCapacity;
                 if(c<0x3000) {
                     goto fastSingle;
                 }
             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                 /* optimize 2-byte case */
                 int32_t m;
                 if(diff>=0) {
                     diff-=BOCU1_REACH_POS_1+1;
                     m=diff%BOCU1_TRAIL_COUNT;
                     diff/=BOCU1_TRAIL_COUNT;
                     diff+=BOCU1_START_POS_2;
                 } else {
                     diff-=BOCU1_REACH_NEG_1;
                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                     diff+=BOCU1_START_NEG_2;
                 }
                 *target++=(uint8_t)diff;
                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                 targetCapacity-=2;
             } else {
                 int32_t length; /* will be 2..4 */
                 diff=packDiff(diff);
                 length=BOCU1_LENGTH_FROM_PACKED(diff);
                 /* write the output character bytes from diff and length */
                 /* from the first if in the loop we know that targetCapacity>0 */
                 if(length<=targetCapacity) {
                     switch(length) {
                         /* each branch falls through to the next one */
                     case 4:
                         *target++=(uint8_t)(diff>>24);
                     case 3: /*fall through*/
                         *target++=(uint8_t)(diff>>16);
                     /* case 2: handled above */
                         *target++=(uint8_t)(diff>>8);
                     /* case 1: handled above */
                         *target++=(uint8_t)diff;
                     default:
                         /* will never occur */
                         break;
                     }
                     targetCapacity-=length;
                 } else {
                     uint8_t *charErrorBuffer;
                     /*
                      * We actually do this backwards here:
                      * In order to save an intermediate variable, we output
                      * first to the overflow buffer what does not fit into the
                      * regular target.
                      */
                     /* we know that 1<=targetCapacity<length<=4 */
                     length-=targetCapacity;
                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                     switch(length) {
                         /* each branch falls through to the next one */
                     case 3:
                         *charErrorBuffer++=(uint8_t)(diff>>16);
                     case 2: /*fall through*/
                         *charErrorBuffer++=(uint8_t)(diff>>8);
                     case 1: /*fall through*/
                         *charErrorBuffer=(uint8_t)diff;
                     default:
                         /* will never occur */
                         break;
                     }
                     cnv->charErrorBufferLength=(int8_t)length;
                     /* now output what fits into the regular target */
                     diff>>=8*length; /* length was reduced by targetCapacity */
                     switch(targetCapacity) {
                         /* each branch falls through to the next one */
                     case 3:
                         *target++=(uint8_t)(diff>>16);
                     case 2: /*fall through*/
                         *target++=(uint8_t)(diff>>8);
                     case 1: /*fall through*/
                         *target++=(uint8_t)diff;
                     default:
                         /* will never occur */
                         break;
                     }
                     /* target overflow */
                     targetCapacity=0;
                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                     break;
                 }
             }
         } else {
             /* target is full */
             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
             break;
         }
     }
     /* set the converter state back into UConverter */
     cnv->fromUChar32= c<0 ? -c : 0;
     cnv->fromUnicodeStatus=(uint32_t)prev;
     /* write back the updated pointers */
     pArgs->source=source;
     pArgs->target=(char *)target;
 }
 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 /**
  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
  *
  * @param b lead byte;
  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
  * @return (diff<<2)|count
  */
 static inline int32_t
 decodeBocu1LeadByte(int32_t b) {
     int32_t diff, count;
     if(b>=BOCU1_START_NEG_2) {
         /* positive difference */
         if(b<BOCU1_START_POS_3) {
             /* two bytes */
             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
             count=1;
         } else if(b<BOCU1_START_POS_4) {
             /* three bytes */
             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
             count=2;
         } else {
             /* four bytes */
             diff=BOCU1_REACH_POS_3+1;
             count=3;
         }
     } else {
         /* negative difference */
         if(b>=BOCU1_START_NEG_3) {
             /* two bytes */
             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
             count=1;
         } else if(b>BOCU1_MIN) {
             /* three bytes */
             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
             count=2;
         } else {
             /* four bytes */
             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
             count=3;
         }
     }
     /* return the state for decoding the trail byte(s) */
     return (diff<<2)|count;
 }
 /**
  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
  *
  * @param count number of remaining trail bytes including this one
  * @param b trail byte
  * @return new delta for diff including b - <0 indicates an error
  *
  * @see decodeBocu1
  */
 static inline int32_t
 decodeBocu1TrailByte(int32_t count, int32_t b) {
     if(b<=0x20) {
         /* skip some C0 controls and make the trail byte range contiguous */
         b=bocu1ByteToTrail[b];
         /* b<0 for an illegal trail byte value will result in return<0 below */
 #if BOCU1_MAX_TRAIL<0xff
     } else if(b>BOCU1_MAX_TRAIL) {
         return -99;
 #endif
     } else {
         b-=BOCU1_TRAIL_BYTE_OFFSET;
     }
     /* add trail byte into difference and decrement count */
     if(count==1) {
         return b;
     } else if(count==2) {
         return b*BOCU1_TRAIL_COUNT;
     } else /* count==3 */ {
         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
     }
 }
 static void
 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
     UConverter *cnv;
     const uint8_t *source, *sourceLimit;
     UChar *target;
     const UChar *targetLimit;
     int32_t *offsets;
     int32_t prev, count, diff, c;
     int8_t byteIndex;
     uint8_t *bytes;
     int32_t sourceIndex, nextSourceIndex;
     /* set up the local pointers */
     cnv=pArgs->converter;
     source=(const uint8_t *)pArgs->source;
     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
     target=pArgs->target;
     targetLimit=pArgs->targetLimit;
     offsets=pArgs->offsets;
     /* get the converter state from UConverter */
     prev=(int32_t)cnv->toUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
     }
     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
     count=diff&3;
     diff>>=2;
     byteIndex=cnv->toULength;
     bytes=cnv->toUBytes;
     /* sourceIndex=-1 if the current character began in the previous buffer */
     sourceIndex=byteIndex==0 ? 0 : -1;
     nextSourceIndex=0;
     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
     if(count>0 && byteIndex>0 && target<targetLimit) {
         goto getTrail;
     }
 fastSingle:
     /* fast loop for single-byte differences */
     /* use count as the only loop counter variable */
     diff=(int32_t)(sourceLimit-source);
     count=(int32_t)(pArgs->targetLimit-target);
     if(count>diff) {
         count=diff;
     }
     while(count>0) {
         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
             c=prev+(c-BOCU1_MIDDLE);
             if(c<0x3000) {
                 *target++=(UChar)c;
                 *offsets++=nextSourceIndex++;
                 prev=BOCU1_SIMPLE_PREV(c);
             } else {
                 break;
             }
         } else if(c<=0x20) {
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(UChar)c;
             *offsets++=nextSourceIndex++;
         } else {
             break;
         }
         ++source;
         --count;
     }
     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
     /* decode a sequence of single and lead bytes */
     while(source<sourceLimit) {
         if(target>=targetLimit) {
             /* target is full */
             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
             break;
         }
         ++nextSourceIndex;
         c=*source++;
         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
             /* Write a code point directly from a single-byte difference. */
             c=prev+(c-BOCU1_MIDDLE);
             if(c<0x3000) {
                 *target++=(UChar)c;
                 *offsets++=sourceIndex;
                 prev=BOCU1_SIMPLE_PREV(c);
                 sourceIndex=nextSourceIndex;
                 goto fastSingle;
             }
         } else if(c<=0x20) {
             /*
              * Direct-encoded C0 control code or space.
              * Reset prev for C0 control codes but not for space.
              */
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(UChar)c;
             *offsets++=sourceIndex;
             sourceIndex=nextSourceIndex;
             continue;
         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
             /* Optimize two-byte case. */
             if(c>=BOCU1_MIDDLE) {
                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
             } else {
                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
             }
             /* trail byte */
             ++nextSourceIndex;
             c=decodeBocu1TrailByte(1, *source++);
             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                 bytes[0]=source[-2];
                 bytes[1]=source[-1];
                 byteIndex=2;
                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         } else if(c==BOCU1_RESET) {
             /* only reset the state, no code point */
             prev=BOCU1_ASCII_PREV;
             sourceIndex=nextSourceIndex;
             continue;
         } else {
             /*
              * For multi-byte difference lead bytes, set the decoder state
              * with the partial difference value from the lead byte and
              * with the number of trail bytes.
              */
             bytes[0]=(uint8_t)c;
             byteIndex=1;
             diff=decodeBocu1LeadByte(c);
             count=diff&3;
             diff>>=2;
 getTrail:
             for(;;) {
                 if(source>=sourceLimit) {
                     goto endloop;
                 }
                 ++nextSourceIndex;
                 c=bytes[byteIndex++]=*source++;
                 /* trail byte in any position */
                 c=decodeBocu1TrailByte(count, c);
                 if(c<0) {
                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                     goto endloop;
                 }
                 diff+=c;
                 if(--count==0) {
                     /* final trail byte, deliver a code point */
                     byteIndex=0;
                     c=prev+diff;
                     if((uint32_t)c>0x10ffff) {
                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                         goto endloop;
                     }
                     break;
                 }
             }
         }
         /* calculate the next prev and output c */
         prev=BOCU1_PREV(c);
         if(c<=0xffff) {
             *target++=(UChar)c;
             *offsets++=sourceIndex;
         } else {
             /* output surrogate pair */
             *target++=U16_LEAD(c);
             if(target<targetLimit) {
                 *target++=U16_TRAIL(c);
                 *offsets++=sourceIndex;
                 *offsets++=sourceIndex;
             } else {
                 /* target overflow */
                 *offsets++=sourceIndex;
                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
                 cnv->UCharErrorBufferLength=1;
                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                 break;
             }
         }
         sourceIndex=nextSourceIndex;
     }
 endloop:
     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
         /* set the converter state in UConverter to deal with the next character */
         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
         cnv->mode=0;
     } else {
         /* set the converter state back into UConverter */
         cnv->toUnicodeStatus=(uint32_t)prev;
         cnv->mode=(diff<<2)|count;
     }
     cnv->toULength=byteIndex;
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     pArgs->offsets=offsets;
     return;
 }
 /*
  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
  * If a change is made in the original function, then either
  * change this function the same way or
  * re-copy the original function and remove the variables
  * offsets, sourceIndex, and nextSourceIndex.
  */
 static void
 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
                 UErrorCode *pErrorCode) {
     UConverter *cnv;
     const uint8_t *source, *sourceLimit;
     UChar *target;
     const UChar *targetLimit;
     int32_t prev, count, diff, c;
     int8_t byteIndex;
     uint8_t *bytes;
 U_ALIGN_CODE(16)
     /* set up the local pointers */
     cnv=pArgs->converter;
     source=(const uint8_t *)pArgs->source;
     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
     target=pArgs->target;
     targetLimit=pArgs->targetLimit;
     /* get the converter state from UConverter */
     prev=(int32_t)cnv->toUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
     }
     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
     count=diff&3;
     diff>>=2;
     byteIndex=cnv->toULength;
     bytes=cnv->toUBytes;
     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
     if(count>0 && byteIndex>0 && target<targetLimit) {
         goto getTrail;
     }
 fastSingle:
     /* fast loop for single-byte differences */
     /* use count as the only loop counter variable */
     diff=(int32_t)(sourceLimit-source);
     count=(int32_t)(pArgs->targetLimit-target);
     if(count>diff) {
         count=diff;
     }
     while(count>0) {
         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
             c=prev+(c-BOCU1_MIDDLE);
             if(c<0x3000) {
                 *target++=(UChar)c;
                 prev=BOCU1_SIMPLE_PREV(c);
             } else {
                 break;
             }
         } else if(c<=0x20) {
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(UChar)c;
         } else {
             break;
         }
         ++source;
         --count;
     }
     /* decode a sequence of single and lead bytes */
     while(source<sourceLimit) {
         if(target>=targetLimit) {
             /* target is full */
             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
             break;
         }
         c=*source++;
         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
             /* Write a code point directly from a single-byte difference. */
             c=prev+(c-BOCU1_MIDDLE);
             if(c<0x3000) {
                 *target++=(UChar)c;
                 prev=BOCU1_SIMPLE_PREV(c);
                 goto fastSingle;
             }
         } else if(c<=0x20) {
             /*
              * Direct-encoded C0 control code or space.
              * Reset prev for C0 control codes but not for space.
              */
             if(c!=0x20) {
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(UChar)c;
             continue;
         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
             /* Optimize two-byte case. */
             if(c>=BOCU1_MIDDLE) {
                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
             } else {
                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
             }
             /* trail byte */
             c=decodeBocu1TrailByte(1, *source++);
             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                 bytes[0]=source[-2];
                 bytes[1]=source[-1];
                 byteIndex=2;
                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         } else if(c==BOCU1_RESET) {
             /* only reset the state, no code point */
             prev=BOCU1_ASCII_PREV;
             continue;
         } else {
             /*
              * For multi-byte difference lead bytes, set the decoder state
              * with the partial difference value from the lead byte and
              * with the number of trail bytes.
              */
             bytes[0]=(uint8_t)c;
             byteIndex=1;
             diff=decodeBocu1LeadByte(c);
             count=diff&3;
             diff>>=2;
 getTrail:
             for(;;) {
                 if(source>=sourceLimit) {
                     goto endloop;
                 }
                 c=bytes[byteIndex++]=*source++;
                 /* trail byte in any position */
                 c=decodeBocu1TrailByte(count, c);
                 if(c<0) {
                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                     goto endloop;
                 }
                 diff+=c;
                 if(--count==0) {
                     /* final trail byte, deliver a code point */
                     byteIndex=0;
                     c=prev+diff;
                     if((uint32_t)c>0x10ffff) {
                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                         goto endloop;
                     }
                     break;
                 }
             }
         }
         /* calculate the next prev and output c */
         prev=BOCU1_PREV(c);
         if(c<=0xffff) {
             *target++=(UChar)c;
         } else {
             /* output surrogate pair */
             *target++=U16_LEAD(c);
             if(target<targetLimit) {
                 *target++=U16_TRAIL(c);
             } else {
                 /* target overflow */
                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
                 cnv->UCharErrorBufferLength=1;
                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                 break;
             }
         }
     }
 endloop:
     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
         /* set the converter state in UConverter to deal with the next character */
         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
         cnv->mode=0;
     } else {
         /* set the converter state back into UConverter */
         cnv->toUnicodeStatus=(uint32_t)prev;
         cnv->mode=(diff<<2)|count;
     }
     cnv->toULength=byteIndex;
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     return;
 }
 /* miscellaneous ------------------------------------------------------------ */
 static const UConverterImpl _Bocu1Impl={
     UCNV_BOCU1,
     NULL,
     NULL,
     NULL,
     NULL,
     NULL,
     _Bocu1ToUnicode,
     _Bocu1ToUnicodeWithOffsets,
     _Bocu1FromUnicode,
     _Bocu1FromUnicodeWithOffsets,
     NULL,
     NULL,
     NULL,
     NULL,
     NULL,
     ucnv_getCompleteUnicodeSet,
     NULL,
     NULL
 };
 static const UConverterStaticData _Bocu1StaticData={
     sizeof(UConverterStaticData),
     "BOCU-1",
 , /* CCSID for BOCU-1 */
     UCNV_IBM, UCNV_BOCU1,
 , 4, /* one UChar generates at least 1 byte and at most 4 bytes */
     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
     FALSE, FALSE,
 ,
 ,
     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };
 const UConverterSharedData _Bocu1Data={
     sizeof(UConverterSharedData), ~((uint32_t)0),
     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
 ,
     UCNV_MBCS_TABLE_INITIALIZER
 };
 #endif

The Tor Browser / annotate

intl/icu/source/common/ucnvbocu.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/ucnvbocu.cpp