The Tor Browser: diff intl/icu/source/common/ucnvbocu.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnvbocu.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1402 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2002-2011, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  ucnvbocu.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2002mar27
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   This is an implementation of the Binary Ordered Compression for Unicode,
    1.20 +*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
    1.21 +*/
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +
    1.25 +#if !UCONFIG_NO_CONVERSION
    1.26 +
    1.27 +#include "unicode/ucnv.h"
    1.28 +#include "unicode/ucnv_cb.h"
    1.29 +#include "unicode/utf16.h"
    1.30 +#include "putilimp.h"
    1.31 +#include "ucnv_bld.h"
    1.32 +#include "ucnv_cnv.h"
    1.33 +#include "uassert.h"
    1.34 +
    1.35 +/* BOCU-1 constants and macros ---------------------------------------------- */
    1.36 +
    1.37 +/*
    1.38 + * BOCU-1 encodes the code points of a Unicode string as
    1.39 + * a sequence of byte-encoded differences (slope detection),
    1.40 + * preserving lexical order.
    1.41 + *
    1.42 + * Optimize the difference-taking for runs of Unicode text within
    1.43 + * small scripts:
    1.44 + *
    1.45 + * Most small scripts are allocated within aligned 128-blocks of Unicode
    1.46 + * code points. Lexical order is preserved if the "previous code point" state
    1.47 + * is always moved into the middle of such a block.
    1.48 + *
    1.49 + * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
    1.50 + * areas into the middle of those areas.
    1.51 + *
    1.52 + * C0 control codes and space are encoded with their US-ASCII bytes.
    1.53 + * "prev" is reset for C0 controls but not for space.
    1.54 + */
    1.55 +
    1.56 +/* initial value for "prev": 0x40, the middle of the ASCII range 0x00..0x7f */
    1.57 +#define BOCU1_ASCII_PREV        0x40
    1.58 +
    1.59 +/* bounding byte values for differences; BOCU1_RESET has the same value (0xff) as BOCU1_MAX_TRAIL */
    1.60 +#define BOCU1_MIN               0x21
    1.61 +#define BOCU1_MIDDLE            0x90
    1.62 +#define BOCU1_MAX_LEAD          0xfe
    1.63 +#define BOCU1_MAX_TRAIL         0xff
    1.64 +#define BOCU1_RESET             0xff
    1.65 +
    1.66 +/* number of lead bytes: 0x21..0xfe, i.e. 222 */
    1.67 +#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
    1.68 +
    1.69 +/* adjust trail byte counts for the use of 20 C0 control byte values as trail bytes (offset is 0x21-20==13) */
    1.70 +#define BOCU1_TRAIL_CONTROLS_COUNT  20
    1.71 +#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
    1.72 +
    1.73 +/* number of trail byte values: the 223 bytes 0x21..0xff plus the 20 control bytes, i.e. 243 */
    1.74 +#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
    1.75 +
    1.76 +/*
    1.77 + * number of positive and number of negative single-byte codes
    1.78 + * (counting 0==BOCU1_MIDDLE among the positive ones)
    1.79 + */
    1.80 +#define BOCU1_SINGLE            64
    1.81 +
    1.82 +/* number of lead bytes for 2/3/4-byte sequences, the same for positive and for negative differences */
    1.83 +#define BOCU1_LEAD_2            43
    1.84 +#define BOCU1_LEAD_3            3
    1.85 +#define BOCU1_LEAD_4            1
    1.86 +
    1.87 +/* The difference value range for single-byters: -64..63. */
    1.88 +#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
    1.89 +#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
    1.90 +
    1.91 +/* The difference value range for double-byters: -10513..10512. */
    1.92 +#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    1.93 +#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    1.94 +
    1.95 +/* The difference value range for 3-byters: -187660..187659. */
    1.96 +#define BOCU1_REACH_POS_3   \
    1.97 +    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
    1.98 +
    1.99 +#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
   1.100 +
   1.101 +/* The lead byte start values: 0xd0, 0xfb, 0xfe for positive and 0x50, 0x25, 0x22 for negative differences. */
   1.102 +#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
   1.103 +#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
   1.104 +#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
   1.105 +     /* ==BOCU1_MAX_LEAD */
   1.106 +
   1.107 +#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
   1.108 +#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
   1.109 +#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
   1.110 +     /* ==BOCU1_MIN+1 */
   1.111 +
   1.112 +/* The length (1..4) of a byte sequence, according to the lead byte (!=BOCU1_RESET); tests the nested lead byte ranges from the inside out. */
   1.113 +#define BOCU1_LENGTH_FROM_LEAD(lead) \
   1.114 +    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
   1.115 +     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
   1.116 +     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
   1.117 +
   1.118 +/* The length of a byte sequence, according to its packed form: the top byte holds the length 1..3, or else the lead byte of a 4-byte sequence. */
   1.119 +#define BOCU1_LENGTH_FROM_PACKED(packed) \
   1.120 +    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
   1.121 +
   1.122 +/*
   1.123 + * 12 commonly used C0 control codes (and space) are only used to encode
   1.124 + * themselves directly,
   1.125 + * which makes BOCU-1 MIME-usable and reasonably safe for
   1.126 + * ASCII-oriented software.
   1.127 + *
   1.128 + * These controls are
   1.129 + *  0   NUL
   1.130 + *
   1.131 + *  7   BEL
   1.132 + *  8   BS
   1.133 + *
   1.134 + *  9   TAB
   1.135 + *  a   LF
   1.136 + *  b   VT
   1.137 + *  c   FF
   1.138 + *  d   CR
   1.139 + *
   1.140 + *  e   SO
   1.141 + *  f   SI
   1.142 + *
   1.143 + * 1a   SUB
   1.144 + * 1b   ESC
   1.145 + *
   1.146 + * The other 20 C0 controls are also encoded directly (to preserve order)
   1.147 + * but are also used as trail bytes in difference encoding
   1.148 + * (for better compression); BOCU1_TRAIL_TO_BYTE maps trail values 0..19 to them and 20..242 to bytes 0x21..0xff.
   1.149 + */
   1.150 +#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
   1.151 +
   1.152 +/*
   1.153 + * Byte value map for control codes and space (the inverse of bocu1TrailToByte),
   1.154 + * from external byte values 0x00..0x20 (one entry per byte below BOCU1_MIN)
   1.155 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
   1.156 + * External byte values that are illegal as trail bytes are mapped to -1.
   1.157 + */
   1.158 +static const int8_t
   1.159 +bocu1ByteToTrail[BOCU1_MIN]={
   1.160 +/*  0     1     2     3     4     5     6     7    */
   1.161 +    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
   1.162 +
   1.163 +/*  8     9     a     b     c     d     e     f    */
   1.164 +    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
   1.165 +
   1.166 +/*  10    11    12    13    14    15    16    17   */
   1.167 +    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
   1.168 +
   1.169 +/*  18    19    1a    1b    1c    1d    1e    1f   */
   1.170 +    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
   1.171 +
   1.172 +/*  20   */
   1.173 +    -1
   1.174 +};
   1.175 +
   1.176 +/*
   1.177 + * Byte value map for control codes (indexed by BOCU1_TRAIL_TO_BYTE for t<20),
   1.178 + * from trail byte values 0..19 (0..0x13) as used in the difference calculation
   1.179 + * to external byte values 0x01..0x06, 0x10..0x19 and 0x1c..0x1f.
   1.180 + */
   1.181 +static const int8_t
   1.182 +bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
   1.183 +/*  0     1     2     3     4     5     6     7    */
   1.184 +    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
   1.185 +
   1.186 +/*  8     9     a     b     c     d     e     f    */
   1.187 +    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
   1.188 +
   1.189 +/*  10    11    12    13   */
   1.190 +    0x1c, 0x1d, 0x1e, 0x1f
   1.191 +};
   1.192 +
   1.193 +/**
   1.194 + * Integer division and modulo with negative numerators
   1.195 + * yield negative modulo results and quotients that are one more than
   1.196 + * what we need here.
   1.197 + * This macro adjusts the results so that the modulo-value m is always >=0.
   1.198 + * It expands to a braced block and evaluates n, d and m more than once.
   1.199 + * For n>=0, the if() condition is always FALSE.
   1.200 + *
   1.201 + * @param n Number to be split into quotient and rest.
   1.202 + *          Will be modified to contain the quotient.
   1.203 + * @param d Divisor.
   1.204 + * @param m Output variable for the rest (modulo result).
   1.205 + */
   1.206 +#define NEGDIVMOD(n, d, m) { \
   1.207 +    (m)=(n)%(d); \
   1.208 +    (n)/=(d); \
   1.209 +    if((m)<0) { \
   1.210 +        --(n); \
   1.211 +        (m)+=(d); \
   1.212 +    } \
   1.213 +}
   1.214 +
   1.215 +/* Macros that classify diff values and encode single-byte ones without calling packDiff(). */
   1.216 +
   1.217 +/** Is a diff value encodable in a single byte (-64..63)? */
   1.218 +#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
   1.219 +
   1.220 +/** Encode a single-byte diff value as the byte BOCU1_MIDDLE+diff (0x50..0xcf). */
   1.221 +#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
   1.222 +
   1.223 +/** Is a diff value encodable in at most two bytes (-10513..10512)? Also true for single-byte diffs. */
   1.224 +#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
   1.225 +
   1.226 +/* BOCU-1 implementation functions ------------------------------------------ */
   1.227 +/* "prev" for most scripts: offset 0x40 (the middle) within the aligned 128-block that contains c */
   1.228 +#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
   1.229 +
   1.230 +/**
   1.231 + * Compute the next "previous" value for differencing
   1.232 + * from the current code point.
   1.233 + *
   1.234 + * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
   1.235 + * @return "previous code point" state value
   1.236 + */
   1.237 +static inline int32_t
   1.238 +bocu1Prev(int32_t c) {
   1.239 +    /* compute new prev; the commented-out bounds below rely on the documented range of c */
   1.240 +    if(/* 0x3040<=c && */ c<=0x309f) {
   1.241 +        /* Hiragana is not 128-aligned: use the middle of 0x3040..0x309f */
   1.242 +        return 0x3070;
   1.243 +    } else if(0x4e00<=c && c<=0x9fa5) {
   1.244 +        /* CJK Unihan: place prev so that U+4E00 is exactly at the negative two-byte reach */
   1.245 +        return 0x4e00-BOCU1_REACH_NEG_2;
   1.246 +    } else if(0xac00<=c /* && c<=0xd7a3 */) {
   1.247 +        /* Korean Hangul: middle of 0xac00..0xd7a3 */
   1.248 +        return (0xd7a3+0xac00)/2;
   1.249 +    } else {
   1.250 +        /* mostly small scripts: middle of the aligned 128-block */
   1.251 +        return BOCU1_SIMPLE_PREV(c);
   1.252 +    }
   1.253 +}
   1.254 +
   1.255 +/** Fast version of bocu1Prev() for most scripts: only c in 0x3040..0xd7a3 needs the function call. */
   1.256 +#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
   1.257 +
   1.258 +/*
   1.259 + * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
   1.260 + * The UConverter fields are used as follows:
   1.261 + *
   1.262 + * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
   1.263 + *
   1.264 + * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
   1.265 + * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
   1.266 + */
   1.267 +
   1.268 +/* BOCU-1-from-Unicode conversion functions --------------------------------- */
   1.269 +
   1.270 +/**
   1.271 + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
   1.272 + * and return a packed integer with them.
   1.273 + *
   1.274 + * The encoding favors small absolute differences with short encodings
   1.275 + * to compress runs of same-script characters.
   1.276 + *
   1.277 + * Optimized version of the standard packDiff() with unrolled loops; it uses only integer
   1.278 + * arithmetic and skips the last division/modulo in the four-byte cases.
   1.279 + *
   1.280 + * @param diff difference value -0x10ffff..0x10ffff
   1.281 + * @return
   1.282 + *      0x010000zz for 1-byte sequence zz (not produced here: the U_ASSERT excludes single-byte diffs)
   1.283 + *      0x0200yyzz for 2-byte sequence yy zz
   1.284 + *      0x03xxyyzz for 3-byte sequence xx yy zz
   1.285 + *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
   1.286 + */
   1.287 +static int32_t
   1.288 +packDiff(int32_t diff) {
   1.289 +    int32_t result, m;
   1.290 +
   1.291 +    U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
   1.292 +    if(diff>=BOCU1_REACH_NEG_1) {
   1.293 +        /* mostly positive differences, and single-byte negative ones */
   1.294 +#if 0   /* single-byte case handled in macros, see below */
   1.295 +        if(diff<=BOCU1_REACH_POS_1) {
   1.296 +            /* single byte */
   1.297 +            return 0x01000000|(BOCU1_MIDDLE+diff);
   1.298 +        } else
   1.299 +#endif
   1.300 +        if(diff<=BOCU1_REACH_POS_2) {
   1.301 +            /* two bytes */
   1.302 +            diff-=BOCU1_REACH_POS_1+1;
   1.303 +            result=0x02000000;
   1.304 +
   1.305 +            m=diff%BOCU1_TRAIL_COUNT;
   1.306 +            diff/=BOCU1_TRAIL_COUNT;
   1.307 +            result|=BOCU1_TRAIL_TO_BYTE(m);
   1.308 +
   1.309 +            result|=(BOCU1_START_POS_2+diff)<<8;
   1.310 +        } else if(diff<=BOCU1_REACH_POS_3) {
   1.311 +            /* three bytes */
   1.312 +            diff-=BOCU1_REACH_POS_2+1;
   1.313 +            result=0x03000000;
   1.314 +
   1.315 +            m=diff%BOCU1_TRAIL_COUNT;
   1.316 +            diff/=BOCU1_TRAIL_COUNT;
   1.317 +            result|=BOCU1_TRAIL_TO_BYTE(m);
   1.318 +
   1.319 +            m=diff%BOCU1_TRAIL_COUNT;
   1.320 +            diff/=BOCU1_TRAIL_COUNT;
   1.321 +            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   1.322 +
   1.323 +            result|=(BOCU1_START_POS_3+diff)<<16;
   1.324 +        } else {
   1.325 +            /* four bytes */
   1.326 +            diff-=BOCU1_REACH_POS_3+1;
   1.327 +
   1.328 +            m=diff%BOCU1_TRAIL_COUNT;
   1.329 +            diff/=BOCU1_TRAIL_COUNT;
   1.330 +            result=BOCU1_TRAIL_TO_BYTE(m);
   1.331 +
   1.332 +            m=diff%BOCU1_TRAIL_COUNT;
   1.333 +            diff/=BOCU1_TRAIL_COUNT;
   1.334 +            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   1.335 +
   1.336 +            /*
   1.337 +             * We know that / and % would deliver quotient 0 and rest=diff.
   1.338 +             * Avoid division and modulo for performance.
   1.339 +             */
   1.340 +            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
   1.341 +
   1.342 +            result|=((uint32_t)BOCU1_START_POS_4)<<24;
   1.343 +        }
   1.344 +    } else {
   1.345 +        /* two- to four-byte negative differences */
   1.346 +        if(diff>=BOCU1_REACH_NEG_2) {
   1.347 +            /* two bytes */
   1.348 +            diff-=BOCU1_REACH_NEG_1;
   1.349 +            result=0x02000000;
   1.350 +
   1.351 +            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.352 +            result|=BOCU1_TRAIL_TO_BYTE(m);
   1.353 +
   1.354 +            result|=(BOCU1_START_NEG_2+diff)<<8;
   1.355 +        } else if(diff>=BOCU1_REACH_NEG_3) {
   1.356 +            /* three bytes */
   1.357 +            diff-=BOCU1_REACH_NEG_2;
   1.358 +            result=0x03000000;
   1.359 +
   1.360 +            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.361 +            result|=BOCU1_TRAIL_TO_BYTE(m);
   1.362 +
   1.363 +            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.364 +            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   1.365 +
   1.366 +            result|=(BOCU1_START_NEG_3+diff)<<16;
   1.367 +        } else {
   1.368 +            /* four bytes */
   1.369 +            diff-=BOCU1_REACH_NEG_3;
   1.370 +
   1.371 +            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.372 +            result=BOCU1_TRAIL_TO_BYTE(m);
   1.373 +
   1.374 +            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.375 +            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
   1.376 +
   1.377 +            /*
   1.378 +             * We know that NEGDIVMOD would deliver
   1.379 +             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
   1.380 +             * Avoid division and modulo for performance.
   1.381 +             */
   1.382 +            m=diff+BOCU1_TRAIL_COUNT;
   1.383 +            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
   1.384 +
   1.385 +            result|=BOCU1_MIN<<24;
   1.386 +        }
   1.387 +    }
   1.388 +    return result;
   1.389 +}
   1.390 +/* Encode the UTF-16 code units pArgs->source..sourceLimit as BOCU-1 bytes into pArgs->target, storing one source index in pArgs->offsets per byte written there. */
   1.391 +/* Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the output does not fit; bytes of a partly written sequence that do not fit go to cnv->charErrorBuffer. */
   1.392 +static void
   1.393 +_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   1.394 +                             UErrorCode *pErrorCode) {
   1.395 +    UConverter *cnv;
   1.396 +    const UChar *source, *sourceLimit;
   1.397 +    uint8_t *target;
   1.398 +    int32_t targetCapacity;
   1.399 +    int32_t *offsets;
   1.400 +
   1.401 +    int32_t prev, c, diff;
   1.402 +
   1.403 +    int32_t sourceIndex, nextSourceIndex;
   1.404 +
   1.405 +U_ALIGN_CODE(16)
   1.406 +
   1.407 +    /* set up the local pointers */
   1.408 +    cnv=pArgs->converter;
   1.409 +    source=pArgs->source;
   1.410 +    sourceLimit=pArgs->sourceLimit;
   1.411 +    target=(uint8_t *)pArgs->target;
   1.412 +    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1.413 +    offsets=pArgs->offsets;
   1.414 +
   1.415 +    /* get the converter state from UConverter */
   1.416 +    c=cnv->fromUChar32;
   1.417 +    prev=(int32_t)cnv->fromUnicodeStatus;
   1.418 +    if(prev==0) {
   1.419 +        prev=BOCU1_ASCII_PREV;
   1.420 +    }
   1.421 +
   1.422 +    /* sourceIndex=-1 if the current character began in the previous buffer */
   1.423 +    sourceIndex= c==0 ? 0 : -1;
   1.424 +    nextSourceIndex=0;
   1.425 +
   1.426 +    /* conversion loop: first continue with a lead surrogate stored by a previous call, if there is room for output */
   1.427 +    if(c!=0 && targetCapacity>0) {
   1.428 +        goto getTrail;
   1.429 +    }
   1.430 +
   1.431 +fastSingle:
   1.432 +    /* fast loop for single-byte differences */
   1.433 +    /* use only one loop counter variable, targetCapacity, not also source */
   1.434 +    diff=(int32_t)(sourceLimit-source);
   1.435 +    if(targetCapacity>diff) {
   1.436 +        targetCapacity=diff;
   1.437 +    }
   1.438 +    while(targetCapacity>0 && (c=*source)<0x3000) {
   1.439 +        if(c<=0x20) {
   1.440 +            if(c!=0x20) {
   1.441 +                prev=BOCU1_ASCII_PREV;
   1.442 +            }
   1.443 +            *target++=(uint8_t)c;
   1.444 +            *offsets++=nextSourceIndex++;
   1.445 +            ++source;
   1.446 +            --targetCapacity;
   1.447 +        } else {
   1.448 +            diff=c-prev;
   1.449 +            if(DIFF_IS_SINGLE(diff)) {
   1.450 +                prev=BOCU1_SIMPLE_PREV(c);
   1.451 +                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   1.452 +                *offsets++=nextSourceIndex++;
   1.453 +                ++source;
   1.454 +                --targetCapacity;
   1.455 +            } else {
   1.456 +                break;
   1.457 +            }
   1.458 +        }
   1.459 +    }
   1.460 +    /* restore real values: the fast loop may have clamped targetCapacity to the remaining source length */
   1.461 +    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
   1.462 +    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   1.463 +
   1.464 +    /* regular loop for all cases */
   1.465 +    while(source<sourceLimit) {
   1.466 +        if(targetCapacity>0) {
   1.467 +            c=*source++;
   1.468 +            ++nextSourceIndex;
   1.469 +
   1.470 +            if(c<=0x20) {
   1.471 +                /*
   1.472 +                 * ISO C0 control & space:
   1.473 +                 * Encode directly for MIME compatibility,
   1.474 +                 * and reset state except for space, to not disrupt compression.
   1.475 +                 */
   1.476 +                if(c!=0x20) {
   1.477 +                    prev=BOCU1_ASCII_PREV;
   1.478 +                }
   1.479 +                *target++=(uint8_t)c;
   1.480 +                *offsets++=sourceIndex;
   1.481 +                --targetCapacity;
   1.482 +
   1.483 +                sourceIndex=nextSourceIndex;
   1.484 +                continue;
   1.485 +            }
   1.486 +
   1.487 +            if(U16_IS_LEAD(c)) {
   1.488 +getTrail:
   1.489 +                if(source<sourceLimit) {
   1.490 +                    /* test the following code unit */
   1.491 +                    UChar trail=*source;
   1.492 +                    if(U16_IS_TRAIL(trail)) {
   1.493 +                        ++source;
   1.494 +                        ++nextSourceIndex;
   1.495 +                        c=U16_GET_SUPPLEMENTARY(c, trail);
   1.496 +                    }
   1.497 +                } else {
   1.498 +                    /* no more input */
   1.499 +                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
   1.500 +                    break;
   1.501 +                }
   1.502 +            }
   1.503 +
   1.504 +            /*
   1.505 +             * all other Unicode code points c==U+0021..U+10ffff
   1.506 +             * are encoded with the difference c-prev
   1.507 +             *
   1.508 +             * a new prev is computed from c,
   1.509 +             * placed in the middle of a 0x80-block (for most small scripts) or
   1.510 +             * in the middle of the Unihan and Hangul blocks
   1.511 +             * to statistically minimize the following difference
   1.512 +             */
   1.513 +            diff=c-prev;
   1.514 +            prev=BOCU1_PREV(c);
   1.515 +            if(DIFF_IS_SINGLE(diff)) {
   1.516 +                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   1.517 +                *offsets++=sourceIndex;
   1.518 +                --targetCapacity;
   1.519 +                sourceIndex=nextSourceIndex;
   1.520 +                if(c<0x3000) {
   1.521 +                    goto fastSingle;
   1.522 +                }
   1.523 +            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
   1.524 +                /* optimize 2-byte case */
   1.525 +                int32_t m;
   1.526 +
   1.527 +                if(diff>=0) {
   1.528 +                    diff-=BOCU1_REACH_POS_1+1;
   1.529 +                    m=diff%BOCU1_TRAIL_COUNT;
   1.530 +                    diff/=BOCU1_TRAIL_COUNT;
   1.531 +                    diff+=BOCU1_START_POS_2;
   1.532 +                } else {
   1.533 +                    diff-=BOCU1_REACH_NEG_1;
   1.534 +                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.535 +                    diff+=BOCU1_START_NEG_2;
   1.536 +                }
   1.537 +                *target++=(uint8_t)diff;
   1.538 +                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
   1.539 +                *offsets++=sourceIndex;
   1.540 +                *offsets++=sourceIndex;
   1.541 +                targetCapacity-=2;
   1.542 +                sourceIndex=nextSourceIndex;
   1.543 +            } else {
   1.544 +                int32_t length; /* will be 2..4 */
   1.545 +
   1.546 +                diff=packDiff(diff);
   1.547 +                length=BOCU1_LENGTH_FROM_PACKED(diff);
   1.548 +
   1.549 +                /* write the output character bytes from diff and length */
   1.550 +                /* from the first if in the loop we know that targetCapacity>0 */
   1.551 +                if(length<=targetCapacity) {
   1.552 +                    switch(length) {
   1.553 +                        /* each branch falls through to the next one */
   1.554 +                    case 4:
   1.555 +                        *target++=(uint8_t)(diff>>24);
   1.556 +                        *offsets++=sourceIndex;
   1.557 +                    case 3: /*fall through*/
   1.558 +                        *target++=(uint8_t)(diff>>16);
   1.559 +                        *offsets++=sourceIndex;
   1.560 +                    case 2: /*fall through*/
   1.561 +                        *target++=(uint8_t)(diff>>8);
   1.562 +                        *offsets++=sourceIndex;
   1.563 +                    /* case 1: handled above */
   1.564 +                        *target++=(uint8_t)diff;
   1.565 +                        *offsets++=sourceIndex;
   1.566 +                    default:
   1.567 +                        /* will never occur */
   1.568 +                        break;
   1.569 +                    }
   1.570 +                    targetCapacity-=length;
   1.571 +                    sourceIndex=nextSourceIndex;
   1.572 +                } else {
   1.573 +                    uint8_t *charErrorBuffer;
   1.574 +
   1.575 +                    /*
   1.576 +                     * We actually do this backwards here:
   1.577 +                     * In order to save an intermediate variable, we output
   1.578 +                     * first to the overflow buffer what does not fit into the
   1.579 +                     * regular target.
   1.580 +                     */
   1.581 +                    /* we know that 1<=targetCapacity<length<=4 */
   1.582 +                    length-=targetCapacity;
   1.583 +                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   1.584 +                    switch(length) {
   1.585 +                        /* each branch falls through to the next one */
   1.586 +                    case 3:
   1.587 +                        *charErrorBuffer++=(uint8_t)(diff>>16);
   1.588 +                    case 2: /*fall through*/
   1.589 +                        *charErrorBuffer++=(uint8_t)(diff>>8);
   1.590 +                    case 1: /*fall through*/
   1.591 +                        *charErrorBuffer=(uint8_t)diff;
   1.592 +                    default:
   1.593 +                        /* will never occur */
   1.594 +                        break;
   1.595 +                    }
   1.596 +                    cnv->charErrorBufferLength=(int8_t)length;
   1.597 +
   1.598 +                    /* now output what fits into the regular target */
   1.599 +                    diff>>=8*length; /* length was reduced by targetCapacity */
   1.600 +                    switch(targetCapacity) {
   1.601 +                        /* each branch falls through to the next one */
   1.602 +                    case 3:
   1.603 +                        *target++=(uint8_t)(diff>>16);
   1.604 +                        *offsets++=sourceIndex;
   1.605 +                    case 2: /*fall through*/
   1.606 +                        *target++=(uint8_t)(diff>>8);
   1.607 +                        *offsets++=sourceIndex;
   1.608 +                    case 1: /*fall through*/
   1.609 +                        *target++=(uint8_t)diff;
   1.610 +                        *offsets++=sourceIndex;
   1.611 +                    default:
   1.612 +                        /* will never occur */
   1.613 +                        break;
   1.614 +                    }
   1.615 +
   1.616 +                    /* target overflow */
   1.617 +                    targetCapacity=0;
   1.618 +                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.619 +                    break;
   1.620 +                }
   1.621 +            }
   1.622 +        } else {
   1.623 +            /* target is full */
   1.624 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.625 +            break;
   1.626 +        }
   1.627 +    }
   1.628 +
   1.629 +    /* set the converter state back into UConverter; NOTE(review): a lead surrogate loaded from fromUChar32 stays positive in c when targetCapacity==0 on entry and is then stored back as 0 - confirm callers never do that */
   1.630 +    cnv->fromUChar32= c<0 ? -c : 0;
   1.631 +    cnv->fromUnicodeStatus=(uint32_t)prev;
   1.632 +
   1.633 +    /* write back the updated pointers */
   1.634 +    pArgs->source=source;
   1.635 +    pArgs->target=(char *)target;
   1.636 +    pArgs->offsets=offsets;
   1.637 +}
   1.638 +
   1.639 +/*
   1.640 + * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
   1.641 + * If a change is made in the original function, then either
   1.642 + * change this function the same way or
   1.643 + * re-copy the original function and remove the variables
   1.644 + * offsets, sourceIndex, and nextSourceIndex.
   1.645 + */
   1.646 +static void
   1.647 +_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
   1.648 +                  UErrorCode *pErrorCode) {
   1.649 +    UConverter *cnv;
   1.650 +    const UChar *source, *sourceLimit;
   1.651 +    uint8_t *target;
   1.652 +    int32_t targetCapacity;
   1.653 +
   1.654 +    int32_t prev, c, diff;
   1.655 +
   1.656 +    /* set up the local pointers */
   1.657 +    cnv=pArgs->converter;
   1.658 +    source=pArgs->source;
   1.659 +    sourceLimit=pArgs->sourceLimit;
   1.660 +    target=(uint8_t *)pArgs->target;
   1.661 +    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1.662 +
   1.663 +    /* get the converter state from UConverter */
   1.664 +    c=cnv->fromUChar32;
   1.665 +    prev=(int32_t)cnv->fromUnicodeStatus;
   1.666 +    if(prev==0) {
   1.667 +        prev=BOCU1_ASCII_PREV;
   1.668 +    }
   1.669 +
   1.670 +    /* conversion loop */
   1.671 +    if(c!=0 && targetCapacity>0) {
   1.672 +        goto getTrail;
   1.673 +    }
   1.674 +
   1.675 +fastSingle:
   1.676 +    /* fast loop for single-byte differences */
   1.677 +    /* use only one loop counter variable, targetCapacity, not also source */
   1.678 +    diff=(int32_t)(sourceLimit-source);
   1.679 +    if(targetCapacity>diff) {
   1.680 +        targetCapacity=diff;
   1.681 +    }
   1.682 +    while(targetCapacity>0 && (c=*source)<0x3000) {
   1.683 +        if(c<=0x20) {
   1.684 +            if(c!=0x20) {
   1.685 +                prev=BOCU1_ASCII_PREV;
   1.686 +            }
   1.687 +            *target++=(uint8_t)c;
   1.688 +        } else {
   1.689 +            diff=c-prev;
   1.690 +            if(DIFF_IS_SINGLE(diff)) {
   1.691 +                prev=BOCU1_SIMPLE_PREV(c);
   1.692 +                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   1.693 +            } else {
   1.694 +                break;
   1.695 +            }
   1.696 +        }
   1.697 +        ++source;
   1.698 +        --targetCapacity;
   1.699 +    }
   1.700 +    /* restore real values */
   1.701 +    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
   1.702 +
   1.703 +    /* regular loop for all cases */
   1.704 +    while(source<sourceLimit) {
   1.705 +        if(targetCapacity>0) {
   1.706 +            c=*source++;
   1.707 +
   1.708 +            if(c<=0x20) {
   1.709 +                /*
   1.710 +                 * ISO C0 control & space:
   1.711 +                 * Encode directly for MIME compatibility,
   1.712 +                 * and reset state except for space, to not disrupt compression.
   1.713 +                 */
   1.714 +                if(c!=0x20) {
   1.715 +                    prev=BOCU1_ASCII_PREV;
   1.716 +                }
   1.717 +                *target++=(uint8_t)c;
   1.718 +                --targetCapacity;
   1.719 +                continue;
   1.720 +            }
   1.721 +
   1.722 +            if(U16_IS_LEAD(c)) {
   1.723 +getTrail:
   1.724 +                if(source<sourceLimit) {
   1.725 +                    /* test the following code unit */
   1.726 +                    UChar trail=*source;
   1.727 +                    if(U16_IS_TRAIL(trail)) {
   1.728 +                        ++source;
   1.729 +                        c=U16_GET_SUPPLEMENTARY(c, trail);
   1.730 +                    }
   1.731 +                } else {
   1.732 +                    /* no more input */
   1.733 +                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
   1.734 +                    break;
   1.735 +                }
   1.736 +            }
   1.737 +
   1.738 +            /*
   1.739 +             * all other Unicode code points c==U+0021..U+10ffff
   1.740 +             * are encoded with the difference c-prev
   1.741 +             *
   1.742 +             * a new prev is computed from c,
   1.743 +             * placed in the middle of a 0x80-block (for most small scripts) or
   1.744 +             * in the middle of the Unihan and Hangul blocks
   1.745 +             * to statistically minimize the following difference
   1.746 +             */
   1.747 +            diff=c-prev;
   1.748 +            prev=BOCU1_PREV(c);
   1.749 +            if(DIFF_IS_SINGLE(diff)) {
   1.750 +                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
   1.751 +                --targetCapacity;
   1.752 +                if(c<0x3000) {
   1.753 +                    goto fastSingle;
   1.754 +                }
   1.755 +            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
   1.756 +                /* optimize 2-byte case */
   1.757 +                int32_t m;
   1.758 +
   1.759 +                if(diff>=0) {
   1.760 +                    diff-=BOCU1_REACH_POS_1+1;
   1.761 +                    m=diff%BOCU1_TRAIL_COUNT;
   1.762 +                    diff/=BOCU1_TRAIL_COUNT;
   1.763 +                    diff+=BOCU1_START_POS_2;
   1.764 +                } else {
   1.765 +                    diff-=BOCU1_REACH_NEG_1;
   1.766 +                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
   1.767 +                    diff+=BOCU1_START_NEG_2;
   1.768 +                }
   1.769 +                *target++=(uint8_t)diff;
   1.770 +                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
   1.771 +                targetCapacity-=2;
   1.772 +            } else {
   1.773 +                int32_t length; /* will be 2..4 */
   1.774 +
   1.775 +                diff=packDiff(diff);
   1.776 +                length=BOCU1_LENGTH_FROM_PACKED(diff);
   1.777 +
   1.778 +                /* write the output character bytes from diff and length */
   1.779 +                /* from the first if in the loop we know that targetCapacity>0 */
   1.780 +                if(length<=targetCapacity) {
   1.781 +                    switch(length) {
   1.782 +                        /* each branch falls through to the next one */
   1.783 +                    case 4:
   1.784 +                        *target++=(uint8_t)(diff>>24);
   1.785 +                    case 3: /*fall through*/
   1.786 +                        *target++=(uint8_t)(diff>>16);
   1.787 +                    /* case 2: handled above */
   1.788 +                        *target++=(uint8_t)(diff>>8);
   1.789 +                    /* case 1: handled above */
   1.790 +                        *target++=(uint8_t)diff;
   1.791 +                    default:
   1.792 +                        /* will never occur */
   1.793 +                        break;
   1.794 +                    }
   1.795 +                    targetCapacity-=length;
   1.796 +                } else {
   1.797 +                    uint8_t *charErrorBuffer;
   1.798 +
   1.799 +                    /*
   1.800 +                     * We actually do this backwards here:
   1.801 +                     * In order to save an intermediate variable, we output
   1.802 +                     * first to the overflow buffer what does not fit into the
   1.803 +                     * regular target.
   1.804 +                     */
   1.805 +                    /* we know that 1<=targetCapacity<length<=4 */
   1.806 +                    length-=targetCapacity;
   1.807 +                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   1.808 +                    switch(length) {
   1.809 +                        /* each branch falls through to the next one */
   1.810 +                    case 3:
   1.811 +                        *charErrorBuffer++=(uint8_t)(diff>>16);
   1.812 +                    case 2: /*fall through*/
   1.813 +                        *charErrorBuffer++=(uint8_t)(diff>>8);
   1.814 +                    case 1: /*fall through*/
   1.815 +                        *charErrorBuffer=(uint8_t)diff;
   1.816 +                    default:
   1.817 +                        /* will never occur */
   1.818 +                        break;
   1.819 +                    }
   1.820 +                    cnv->charErrorBufferLength=(int8_t)length;
   1.821 +
   1.822 +                    /* now output what fits into the regular target */
   1.823 +                    diff>>=8*length; /* length was reduced by targetCapacity */
   1.824 +                    switch(targetCapacity) {
   1.825 +                        /* each branch falls through to the next one */
   1.826 +                    case 3:
   1.827 +                        *target++=(uint8_t)(diff>>16);
   1.828 +                    case 2: /*fall through*/
   1.829 +                        *target++=(uint8_t)(diff>>8);
   1.830 +                    case 1: /*fall through*/
   1.831 +                        *target++=(uint8_t)diff;
   1.832 +                    default:
   1.833 +                        /* will never occur */
   1.834 +                        break;
   1.835 +                    }
   1.836 +
   1.837 +                    /* target overflow */
   1.838 +                    targetCapacity=0;
   1.839 +                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.840 +                    break;
   1.841 +                }
   1.842 +            }
   1.843 +        } else {
   1.844 +            /* target is full */
   1.845 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.846 +            break;
   1.847 +        }
   1.848 +    }
   1.849 +
   1.850 +    /* set the converter state back into UConverter */
   1.851 +    cnv->fromUChar32= c<0 ? -c : 0;
   1.852 +    cnv->fromUnicodeStatus=(uint32_t)prev;
   1.853 +
   1.854 +    /* write back the updated pointers */
   1.855 +    pArgs->source=source;
   1.856 +    pArgs->target=(char *)target;
   1.857 +}
   1.858 +
   1.859 +/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
   1.860 +
   1.861 +/**
   1.862 + * Function for BOCU-1 decoder; handles multi-byte lead bytes.
         + * Derives from the lead byte alone the partial difference and the number
         + * of trail bytes that still have to be decoded.
   1.863 + *
   1.864 + * @param b lead byte;
   1.865 + *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
   1.866 + * @return (diff<<2)|count: the partial difference (may be negative) above
         + *         the low two bits, which hold the number of trail bytes (1..3)
   1.867 + */
   1.868 +static inline int32_t
   1.869 +decodeBocu1LeadByte(int32_t b) {
   1.870 +    int32_t diff, count;
   1.871 +
   1.872 +    if(b>=BOCU1_START_NEG_2) { /* with the documented range of b this means b>=BOCU1_START_POS_2 */
   1.873 +        /* positive difference */
   1.874 +        if(b<BOCU1_START_POS_3) {
   1.875 +            /* two bytes */
   1.876 +            diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1.877 +            count=1;
   1.878 +        } else if(b<BOCU1_START_POS_4) {
   1.879 +            /* three bytes */
   1.880 +            diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
   1.881 +            count=2;
   1.882 +        } else {
   1.883 +            /* four bytes */
   1.884 +            diff=BOCU1_REACH_POS_3+1;
   1.885 +            count=3;
   1.886 +        }
   1.887 +    } else {
   1.888 +        /* negative difference */
   1.889 +        if(b>=BOCU1_START_NEG_3) {
   1.890 +            /* two bytes */
   1.891 +            diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1.892 +            count=1;
   1.893 +        } else if(b>BOCU1_MIN) {
   1.894 +            /* three bytes */
   1.895 +            diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
   1.896 +            count=2;
   1.897 +        } else {
   1.898 +            /* four bytes */
   1.899 +            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
   1.900 +            count=3;
   1.901 +        }
   1.902 +    }
   1.903 +
   1.904 +    /* return the state for decoding the trail byte(s) */
         +    /* shift in unsigned arithmetic: diff may be negative, and left-shifting a negative signed value is undefined */
   1.905 +    return (int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
   1.906 +}
   1.907 +
   1.908 +/**
   1.909 + * BOCU-1 decoder helper: turns one trail byte into its weighted share of the difference.
   1.910 + *
   1.911 + * @param count how many trail bytes are still outstanding, this one included
   1.912 + * @param b the trail byte
   1.913 + * @return amount to add to diff for b; <0 if b is not a legal trail byte
   1.914 + *
   1.915 + * @see decodeBocu1
   1.916 + */
   1.917 +static inline int32_t
   1.918 +decodeBocu1TrailByte(int32_t count, int32_t b) {
   1.919 +    int32_t trail;
   1.920 +
   1.921 +    if(b>0x20) {
   1.922 +#if BOCU1_MAX_TRAIL<0xff
   1.923 +        if(b>BOCU1_MAX_TRAIL) {
   1.924 +            return -99;
   1.925 +        }
   1.926 +#endif
   1.927 +        trail=b-BOCU1_TRAIL_BYTE_OFFSET;
   1.928 +    } else {
   1.929 +        /* C0 range: the table makes the trail values contiguous; an illegal byte maps to <0, so the result is <0 */
   1.930 +        trail=bocu1ByteToTrail[b];
   1.931 +    }
   1.932 +
   1.933 +    /* weight the trail value by its position: the last trail byte counts once, earlier ones by powers of BOCU1_TRAIL_COUNT */
   1.934 +    switch(count) {
   1.935 +    case 1: return trail;
   1.936 +    case 2: return trail*BOCU1_TRAIL_COUNT;
   1.937 +    default: return trail*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); /* count==3 */
   1.938 +    }
   1.939 +}
   1.940 +
   1.941 +static void
   1.942 +_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1.943 +                           UErrorCode *pErrorCode) {
   1.944 +    UConverter *cnv;
   1.945 +    const uint8_t *source, *sourceLimit;
   1.946 +    UChar *target;
   1.947 +    const UChar *targetLimit;
   1.948 +    int32_t *offsets;
   1.949 +
   1.950 +    int32_t prev, count, diff, c;
   1.951 +
   1.952 +    int8_t byteIndex;
   1.953 +    uint8_t *bytes;
   1.954 +
   1.955 +    int32_t sourceIndex, nextSourceIndex;
   1.956 +
   1.957 +    /* set up the local pointers */
   1.958 +    cnv=pArgs->converter;
   1.959 +    source=(const uint8_t *)pArgs->source;
   1.960 +    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1.961 +    target=pArgs->target;
   1.962 +    targetLimit=pArgs->targetLimit;
   1.963 +    offsets=pArgs->offsets;
   1.964 +
   1.965 +    /* get the converter state from UConverter */
   1.966 +    prev=(int32_t)cnv->toUnicodeStatus;
   1.967 +    if(prev==0) {
   1.968 +        prev=BOCU1_ASCII_PREV;
   1.969 +    }
   1.970 +    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1.971 +    count=diff&3;
   1.972 +    diff>>=2;
   1.973 +
   1.974 +    byteIndex=cnv->toULength;
   1.975 +    bytes=cnv->toUBytes;
   1.976 +
   1.977 +    /* sourceIndex=-1 if the current character began in the previous buffer */
   1.978 +    sourceIndex=byteIndex==0 ? 0 : -1;
   1.979 +    nextSourceIndex=0;
   1.980 +
   1.981 +    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
   1.982 +    if(count>0 && byteIndex>0 && target<targetLimit) {
   1.983 +        goto getTrail;
   1.984 +    }
   1.985 +
   1.986 +fastSingle:
   1.987 +    /* fast loop for single-byte differences */
   1.988 +    /* use count as the only loop counter variable */
   1.989 +    diff=(int32_t)(sourceLimit-source);
   1.990 +    count=(int32_t)(pArgs->targetLimit-target);
   1.991 +    if(count>diff) {
   1.992 +        count=diff;
   1.993 +    }
   1.994 +    while(count>0) {
   1.995 +        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1.996 +            c=prev+(c-BOCU1_MIDDLE);
   1.997 +            if(c<0x3000) {
   1.998 +                *target++=(UChar)c;
   1.999 +                *offsets++=nextSourceIndex++;
  1.1000 +                prev=BOCU1_SIMPLE_PREV(c);
  1.1001 +            } else {
  1.1002 +                break;
  1.1003 +            }
  1.1004 +        } else if(c<=0x20) {
  1.1005 +            if(c!=0x20) {
  1.1006 +                prev=BOCU1_ASCII_PREV;
  1.1007 +            }
  1.1008 +            *target++=(UChar)c;
  1.1009 +            *offsets++=nextSourceIndex++;
  1.1010 +        } else {
  1.1011 +            break;
  1.1012 +        }
  1.1013 +        ++source;
  1.1014 +        --count;
  1.1015 +    }
  1.1016 +    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
  1.1017 +
  1.1018 +    /* decode a sequence of single and lead bytes */
  1.1019 +    while(source<sourceLimit) {
  1.1020 +        if(target>=targetLimit) {
  1.1021 +            /* target is full */
  1.1022 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1023 +            break;
  1.1024 +        }
  1.1025 +
  1.1026 +        ++nextSourceIndex;
  1.1027 +        c=*source++;
  1.1028 +        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1.1029 +            /* Write a code point directly from a single-byte difference. */
  1.1030 +            c=prev+(c-BOCU1_MIDDLE);
  1.1031 +            if(c<0x3000) {
  1.1032 +                *target++=(UChar)c;
  1.1033 +                *offsets++=sourceIndex;
  1.1034 +                prev=BOCU1_SIMPLE_PREV(c);
  1.1035 +                sourceIndex=nextSourceIndex;
  1.1036 +                goto fastSingle;
  1.1037 +            }
  1.1038 +        } else if(c<=0x20) {
  1.1039 +            /*
  1.1040 +             * Direct-encoded C0 control code or space.
  1.1041 +             * Reset prev for C0 control codes but not for space.
  1.1042 +             */
  1.1043 +            if(c!=0x20) {
  1.1044 +                prev=BOCU1_ASCII_PREV;
  1.1045 +            }
  1.1046 +            *target++=(UChar)c;
  1.1047 +            *offsets++=sourceIndex;
  1.1048 +            sourceIndex=nextSourceIndex;
  1.1049 +            continue;
  1.1050 +        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1.1051 +            /* Optimize two-byte case. */
  1.1052 +            if(c>=BOCU1_MIDDLE) {
  1.1053 +                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1.1054 +            } else {
  1.1055 +                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1.1056 +            }
  1.1057 +
  1.1058 +            /* trail byte */
  1.1059 +            ++nextSourceIndex;
  1.1060 +            c=decodeBocu1TrailByte(1, *source++);
  1.1061 +            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1.1062 +                bytes[0]=source[-2];
  1.1063 +                bytes[1]=source[-1];
  1.1064 +                byteIndex=2;
  1.1065 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1066 +                break;
  1.1067 +            }
  1.1068 +        } else if(c==BOCU1_RESET) {
  1.1069 +            /* only reset the state, no code point */
  1.1070 +            prev=BOCU1_ASCII_PREV;
  1.1071 +            sourceIndex=nextSourceIndex;
  1.1072 +            continue;
  1.1073 +        } else {
  1.1074 +            /*
  1.1075 +             * For multi-byte difference lead bytes, set the decoder state
  1.1076 +             * with the partial difference value from the lead byte and
  1.1077 +             * with the number of trail bytes.
  1.1078 +             */
  1.1079 +            bytes[0]=(uint8_t)c;
  1.1080 +            byteIndex=1;
  1.1081 +
  1.1082 +            diff=decodeBocu1LeadByte(c);
  1.1083 +            count=diff&3;
  1.1084 +            diff>>=2;
  1.1085 +getTrail:
  1.1086 +            for(;;) {
  1.1087 +                if(source>=sourceLimit) {
  1.1088 +                    goto endloop;
  1.1089 +                }
  1.1090 +                ++nextSourceIndex;
  1.1091 +                c=bytes[byteIndex++]=*source++;
  1.1092 +
  1.1093 +                /* trail byte in any position */
  1.1094 +                c=decodeBocu1TrailByte(count, c);
  1.1095 +                if(c<0) {
  1.1096 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1097 +                    goto endloop;
  1.1098 +                }
  1.1099 +
  1.1100 +                diff+=c;
  1.1101 +                if(--count==0) {
  1.1102 +                    /* final trail byte, deliver a code point */
  1.1103 +                    byteIndex=0;
  1.1104 +                    c=prev+diff;
  1.1105 +                    if((uint32_t)c>0x10ffff) {
  1.1106 +                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1107 +                        goto endloop;
  1.1108 +                    }
  1.1109 +                    break;
  1.1110 +                }
  1.1111 +            }
  1.1112 +        }
  1.1113 +
  1.1114 +        /* calculate the next prev and output c */
  1.1115 +        prev=BOCU1_PREV(c);
  1.1116 +        if(c<=0xffff) {
  1.1117 +            *target++=(UChar)c;
  1.1118 +            *offsets++=sourceIndex;
  1.1119 +        } else {
  1.1120 +            /* output surrogate pair */
  1.1121 +            *target++=U16_LEAD(c);
  1.1122 +            if(target<targetLimit) {
  1.1123 +                *target++=U16_TRAIL(c);
  1.1124 +                *offsets++=sourceIndex;
  1.1125 +                *offsets++=sourceIndex;
  1.1126 +            } else {
  1.1127 +                /* target overflow */
  1.1128 +                *offsets++=sourceIndex;
  1.1129 +                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1.1130 +                cnv->UCharErrorBufferLength=1;
  1.1131 +                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1132 +                break;
  1.1133 +            }
  1.1134 +        }
  1.1135 +        sourceIndex=nextSourceIndex;
  1.1136 +    }
  1.1137 +endloop:
  1.1138 +
  1.1139 +    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1.1140 +        /* set the converter state in UConverter to deal with the next character */
  1.1141 +        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1.1142 +        cnv->mode=0;
  1.1143 +    } else {
  1.1144 +        /* set the converter state back into UConverter */
  1.1145 +        cnv->toUnicodeStatus=(uint32_t)prev;
  1.1146 +        cnv->mode=(diff<<2)|count;
  1.1147 +    }
  1.1148 +    cnv->toULength=byteIndex;
  1.1149 +
  1.1150 +    /* write back the updated pointers */
  1.1151 +    pArgs->source=(const char *)source;
  1.1152 +    pArgs->target=target;
  1.1153 +    pArgs->offsets=offsets;
  1.1154 +    return;
  1.1155 +}
  1.1156 +
  1.1157 +/*
  1.1158 + * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
  1.1159 + * If a change is made in the original function, then either
  1.1160 + * change this function the same way or
  1.1161 + * re-copy the original function and remove the variables
  1.1162 + * offsets, sourceIndex, and nextSourceIndex.
         + *
         + * Decoder state kept in the UConverter between calls:
         + * toUnicodeStatus=prev, mode=(diff<<2)|count for an unfinished multi-byte
         + * sequence, toUBytes/toULength=the bytes of that sequence seen so far.
         + *
         + * pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full, or to
         + * U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result outside 0..0x10ffff.
  1.1163 + */
  1.1164 +static void
  1.1165 +_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
  1.1166 +                UErrorCode *pErrorCode) {
  1.1167 +    UConverter *cnv;
  1.1168 +    const uint8_t *source, *sourceLimit;
  1.1169 +    UChar *target;
  1.1170 +    const UChar *targetLimit;
  1.1171 +
  1.1172 +    int32_t prev, count, diff, c;
  1.1173 +
  1.1174 +    int8_t byteIndex;
  1.1175 +    uint8_t *bytes;
  1.1176 +
  1.1177 +U_ALIGN_CODE(16)
  1.1178 +
  1.1179 +    /* set up the local pointers */
  1.1180 +    cnv=pArgs->converter;
  1.1181 +    source=(const uint8_t *)pArgs->source;
  1.1182 +    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1.1183 +    target=pArgs->target;
  1.1184 +    targetLimit=pArgs->targetLimit;
  1.1185 +
  1.1186 +    /* get the converter state from UConverter */
  1.1187 +    prev=(int32_t)cnv->toUnicodeStatus;
  1.1188 +    if(prev==0) {
  1.1189 +        prev=BOCU1_ASCII_PREV;
  1.1190 +    }
  1.1191 +    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  1.1192 +    count=diff&3;
  1.1193 +    diff>>=2;
  1.1194 +
  1.1195 +    byteIndex=cnv->toULength;
  1.1196 +    bytes=cnv->toUBytes;
  1.1197 +
  1.1198 +    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  1.1199 +    if(count>0 && byteIndex>0) {
         +        if(target<targetLimit) {
  1.1200 +            goto getTrail;
         +        }
         +        /*
         +         * Mid-sequence but no room for output: leave diff and count untouched.
         +         * The fast loop below reuses them as counters, which would destroy
         +         * the saved state of the pending sequence.
         +         */
         +        if(source<sourceLimit) {
         +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
         +        }
         +        goto endloop;
  1.1201 +    }
  1.1202 +
  1.1203 +fastSingle:
  1.1204 +    /* fast loop for single-byte differences */
  1.1205 +    /* use count as the only loop counter variable */
  1.1206 +    diff=(int32_t)(sourceLimit-source);
  1.1207 +    count=(int32_t)(pArgs->targetLimit-target);
  1.1208 +    if(count>diff) {
  1.1209 +        count=diff;
  1.1210 +    }
  1.1211 +    while(count>0) {
  1.1212 +        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  1.1213 +            c=prev+(c-BOCU1_MIDDLE);
  1.1214 +            if(c<0x3000) {
  1.1215 +                *target++=(UChar)c;
  1.1216 +                prev=BOCU1_SIMPLE_PREV(c);
  1.1217 +            } else {
  1.1218 +                break;
  1.1219 +            }
  1.1220 +        } else if(c<=0x20) {
  1.1221 +            if(c!=0x20) {
  1.1222 +                prev=BOCU1_ASCII_PREV;
  1.1223 +            }
  1.1224 +            *target++=(UChar)c;
  1.1225 +        } else {
  1.1226 +            break;
  1.1227 +        }
  1.1228 +        ++source;
  1.1229 +        --count;
  1.1230 +    }
  1.1231 +
  1.1232 +    /* decode a sequence of single and lead bytes */
  1.1233 +    while(source<sourceLimit) {
  1.1234 +        if(target>=targetLimit) {
  1.1235 +            /* target is full */
  1.1236 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1237 +            break;
  1.1238 +        }
  1.1239 +
  1.1240 +        c=*source++;
  1.1241 +        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1.1242 +            /* Write a code point directly from a single-byte difference. */
  1.1243 +            c=prev+(c-BOCU1_MIDDLE);
  1.1244 +            if(c<0x3000) {
  1.1245 +                *target++=(UChar)c;
  1.1246 +                prev=BOCU1_SIMPLE_PREV(c);
  1.1247 +                goto fastSingle;
  1.1248 +            }
  1.1249 +        } else if(c<=0x20) {
  1.1250 +            /*
  1.1251 +             * Direct-encoded C0 control code or space.
  1.1252 +             * Reset prev for C0 control codes but not for space.
  1.1253 +             */
  1.1254 +            if(c!=0x20) {
  1.1255 +                prev=BOCU1_ASCII_PREV;
  1.1256 +            }
  1.1257 +            *target++=(UChar)c;
  1.1258 +            continue;
  1.1259 +        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1.1260 +            /* Optimize two-byte case. */
  1.1261 +            if(c>=BOCU1_MIDDLE) {
  1.1262 +                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1.1263 +            } else {
  1.1264 +                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1.1265 +            }
  1.1266 +
  1.1267 +            /* trail byte */
  1.1268 +            c=decodeBocu1TrailByte(1, *source++);
  1.1269 +            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1.1270 +                bytes[0]=source[-2];
  1.1271 +                bytes[1]=source[-1];
  1.1272 +                byteIndex=2;
  1.1273 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1274 +                break;
  1.1275 +            }
  1.1276 +        } else if(c==BOCU1_RESET) {
  1.1277 +            /* only reset the state, no code point */
  1.1278 +            prev=BOCU1_ASCII_PREV;
  1.1279 +            continue;
  1.1280 +        } else {
  1.1281 +            /*
  1.1282 +             * For multi-byte difference lead bytes, set the decoder state
  1.1283 +             * with the partial difference value from the lead byte and
  1.1284 +             * with the number of trail bytes.
  1.1285 +             */
  1.1286 +            bytes[0]=(uint8_t)c;
  1.1287 +            byteIndex=1;
  1.1288 +
  1.1289 +            diff=decodeBocu1LeadByte(c);
  1.1290 +            count=diff&3;
  1.1291 +            diff>>=2;
  1.1292 +getTrail:
  1.1293 +            for(;;) {
  1.1294 +                if(source>=sourceLimit) {
  1.1295 +                    goto endloop; /* input ends mid-sequence: diff, count and bytes are saved below */
  1.1296 +                }
  1.1297 +                c=bytes[byteIndex++]=*source++;
  1.1298 +
  1.1299 +                /* trail byte in any position */
  1.1300 +                c=decodeBocu1TrailByte(count, c);
  1.1301 +                if(c<0) {
  1.1302 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1303 +                    goto endloop;
  1.1304 +                }
  1.1305 +
  1.1306 +                diff+=c;
  1.1307 +                if(--count==0) {
  1.1308 +                    /* final trail byte, deliver a code point */
  1.1309 +                    byteIndex=0;
  1.1310 +                    c=prev+diff;
  1.1311 +                    if((uint32_t)c>0x10ffff) {
  1.1312 +                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1313 +                        goto endloop;
  1.1314 +                    }
  1.1315 +                    break;
  1.1316 +                }
  1.1317 +            }
  1.1318 +        }
  1.1319 +
  1.1320 +        /* calculate the next prev and output c */
  1.1321 +        prev=BOCU1_PREV(c);
  1.1322 +        if(c<=0xffff) {
  1.1323 +            *target++=(UChar)c;
  1.1324 +        } else {
  1.1325 +            /* output surrogate pair */
  1.1326 +            *target++=U16_LEAD(c);
  1.1327 +            if(target<targetLimit) {
  1.1328 +                *target++=U16_TRAIL(c);
  1.1329 +            } else {
  1.1330 +                /* target overflow */
  1.1331 +                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1.1332 +                cnv->UCharErrorBufferLength=1;
  1.1333 +                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1334 +                break;
  1.1335 +            }
  1.1336 +        }
  1.1337 +    }
  1.1338 +endloop:
  1.1339 +
  1.1340 +    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1.1341 +        /* set the converter state in UConverter to deal with the next character */
  1.1342 +        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1.1343 +        cnv->mode=0;
  1.1344 +    } else {
  1.1345 +        /* set the converter state back into UConverter */
  1.1346 +        cnv->toUnicodeStatus=(uint32_t)prev;
         +        /* shift as unsigned: diff may be negative or a large loop counter; a signed left shift would be undefined */
  1.1347 +        cnv->mode=(int32_t)(((uint32_t)diff<<2)|(uint32_t)count);
  1.1348 +    }
  1.1349 +    cnv->toULength=byteIndex;
  1.1350 +
  1.1351 +    /* write back the updated pointers */
  1.1352 +    pArgs->source=(const char *)source;
  1.1353 +    pArgs->target=target;
  1.1354 +    return;
  1.1355 +}
  1.1356 +
  1.1357 +/* miscellaneous ------------------------------------------------------------ */
  1.1358 +
  1.1359 +static const UConverterImpl _Bocu1Impl={ /* implementation table for BOCU-1; positional, so slot meanings come from the UConverterImpl declaration (not visible here) */
  1.1360 +    UCNV_BOCU1,
  1.1361 +
  1.1362 +    NULL, /* NULL slots here and below: presumably optional hooks this converter does not need - confirm against UConverterImpl */
  1.1363 +    NULL,
  1.1364 +
  1.1365 +    NULL,
  1.1366 +    NULL,
  1.1367 +    NULL,
  1.1368 +
  1.1369 +    _Bocu1ToUnicode, /* BOCU-1 decoder without offset tracking */
  1.1370 +    _Bocu1ToUnicodeWithOffsets, /* BOCU-1 decoder that also fills pArgs->offsets */
  1.1371 +    _Bocu1FromUnicode, /* encoder counterpart without offsets (by its name) */
  1.1372 +    _Bocu1FromUnicodeWithOffsets, /* encoder counterpart with offsets (by its name) */
  1.1373 +    NULL,
  1.1374 +
  1.1375 +    NULL,
  1.1376 +    NULL,
  1.1377 +    NULL,
  1.1378 +    NULL,
  1.1379 +    ucnv_getCompleteUnicodeSet, /* generic helper declared elsewhere; by its name, reports the complete Unicode set - TODO confirm */
  1.1380 +
  1.1381 +    NULL,
  1.1382 +    NULL
  1.1383 +};
  1.1384 +
  1.1385 +static const UConverterStaticData _Bocu1StaticData={ /* static description of the BOCU-1 converter; positional, field order follows UConverterStaticData (declared elsewhere) */
  1.1386 +    sizeof(UConverterStaticData),
  1.1387 +    "BOCU-1", /* presumably the converter name field - confirm against UConverterStaticData */
  1.1388 +    1214, /* CCSID for BOCU-1 */
  1.1389 +    UCNV_IBM, UCNV_BOCU1,
  1.1390 +    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
  1.1391 +    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
  1.1392 +    FALSE, FALSE,
  1.1393 +    0,
  1.1394 +    0,
  1.1395 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1396 +};
  1.1397 +
  1.1398 +const UConverterSharedData _Bocu1Data={ /* shared-data object for BOCU-1; positional, field order follows UConverterSharedData (declared elsewhere) */
  1.1399 +    sizeof(UConverterSharedData), ~((uint32_t)0),
  1.1400 +    NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, /* ties together the static data and the implementation table defined above */
  1.1401 +    0,
  1.1402 +    UCNV_MBCS_TABLE_INITIALIZER
  1.1403 +};
  1.1404 +
  1.1405 +#endif
The Tor Browser / file diff

diff: intl/icu/source/common/ucnvbocu.cpp

intl/icu/source/common/ucnvbocu.cpp