1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnvbocu.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1402 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2002-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: ucnvbocu.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2002mar27 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This is an implementation of the Binary Ordered Compression for Unicode, 1.20 +* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 1.21 +*/ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 + 1.25 +#if !UCONFIG_NO_CONVERSION 1.26 + 1.27 +#include "unicode/ucnv.h" 1.28 +#include "unicode/ucnv_cb.h" 1.29 +#include "unicode/utf16.h" 1.30 +#include "putilimp.h" 1.31 +#include "ucnv_bld.h" 1.32 +#include "ucnv_cnv.h" 1.33 +#include "uassert.h" 1.34 + 1.35 +/* BOCU-1 constants and macros ---------------------------------------------- */ 1.36 + 1.37 +/* 1.38 + * BOCU-1 encodes the code points of a Unicode string as 1.39 + * a sequence of byte-encoded differences (slope detection), 1.40 + * preserving lexical order. 1.41 + * 1.42 + * Optimize the difference-taking for runs of Unicode text within 1.43 + * small scripts: 1.44 + * 1.45 + * Most small scripts are allocated within aligned 128-blocks of Unicode 1.46 + * code points. Lexical order is preserved if the "previous code point" state 1.47 + * is always moved into the middle of such a block. 1.48 + * 1.49 + * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 1.50 + * areas into the middle of those areas. 1.51 + * 1.52 + * C0 control codes and space are encoded with their US-ASCII bytes. 1.53 + * "prev" is reset for C0 controls but not for space. 1.54 + */ 1.55 + 1.56 +/* initial value for "prev": middle of the ASCII range */ 1.57 +#define BOCU1_ASCII_PREV 0x40 1.58 + 1.59 +/* bounding byte values for differences */ 1.60 +#define BOCU1_MIN 0x21 1.61 +#define BOCU1_MIDDLE 0x90 1.62 +#define BOCU1_MAX_LEAD 0xfe 1.63 +#define BOCU1_MAX_TRAIL 0xff 1.64 +#define BOCU1_RESET 0xff 1.65 + 1.66 +/* number of lead bytes */ 1.67 +#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) 1.68 + 1.69 +/* adjust trail byte counts for the use of some C0 control byte values */ 1.70 +#define BOCU1_TRAIL_CONTROLS_COUNT 20 1.71 +#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) 1.72 + 1.73 +/* number of trail bytes */ 1.74 +#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) 1.75 + 1.76 +/* 1.77 + * number of positive and negative single-byte codes 1.78 + * (counting 0==BOCU1_MIDDLE among the positive ones) 1.79 + */ 1.80 +#define BOCU1_SINGLE 64 1.81 + 1.82 +/* number of lead bytes for positive and negative 2/3/4-byte sequences */ 1.83 +#define BOCU1_LEAD_2 43 1.84 +#define BOCU1_LEAD_3 3 1.85 +#define BOCU1_LEAD_4 1 1.86 + 1.87 +/* The difference value range for single-byters. */ 1.88 +#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) 1.89 +#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) 1.90 + 1.91 +/* The difference value range for double-byters. */ 1.92 +#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 1.93 +#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 1.94 + 1.95 +/* The difference value range for 3-byters. */ 1.96 +#define BOCU1_REACH_POS_3 \ 1.97 + (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 1.98 + 1.99 +#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 1.100 + 1.101 +/* The lead byte start values. */ 1.102 +#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) 1.103 +#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) 1.104 +#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) 1.105 + /* ==BOCU1_MAX_LEAD */ 1.106 + 1.107 +#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) 1.108 +#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) 1.109 +#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) 1.110 + /* ==BOCU1_MIN+1 */ 1.111 + 1.112 +/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ 1.113 +#define BOCU1_LENGTH_FROM_LEAD(lead) \ 1.114 + ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ 1.115 + (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ 1.116 + (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) 1.117 + 1.118 +/* The length of a byte sequence, according to its packed form. */ 1.119 +#define BOCU1_LENGTH_FROM_PACKED(packed) \ 1.120 + ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) 1.121 + 1.122 +/* 1.123 + * 12 commonly used C0 control codes (and space) are only used to encode 1.124 + * themselves directly, 1.125 + * which makes BOCU-1 MIME-usable and reasonably safe for 1.126 + * ASCII-oriented software. 1.127 + * 1.128 + * These controls are 1.129 + * 0 NUL 1.130 + * 1.131 + * 7 BEL 1.132 + * 8 BS 1.133 + * 1.134 + * 9 TAB 1.135 + * a LF 1.136 + * b VT 1.137 + * c FF 1.138 + * d CR 1.139 + * 1.140 + * e SO 1.141 + * f SI 1.142 + * 1.143 + * 1a SUB 1.144 + * 1b ESC 1.145 + * 1.146 + * The other 20 C0 controls are also encoded directly (to preserve order) 1.147 + * but are also used as trail bytes in difference encoding 1.148 + * (for better compression). 1.149 + */ 1.150 +#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 1.151 + 1.152 +/* 1.153 + * Byte value map for control codes, 1.154 + * from external byte values 0x00..0x20 1.155 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 1.156 + * External byte values that are illegal as trail bytes are mapped to -1. 1.157 + */ 1.158 +static const int8_t 1.159 +bocu1ByteToTrail[BOCU1_MIN]={ 1.160 +/* 0 1 2 3 4 5 6 7 */ 1.161 + -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 1.162 + 1.163 +/* 8 9 a b c d e f */ 1.164 + -1, -1, -1, -1, -1, -1, -1, -1, 1.165 + 1.166 +/* 10 11 12 13 14 15 16 17 */ 1.167 + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 1.168 + 1.169 +/* 18 19 1a 1b 1c 1d 1e 1f */ 1.170 + 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 1.171 + 1.172 +/* 20 */ 1.173 + -1 1.174 +}; 1.175 + 1.176 +/* 1.177 + * Byte value map for control codes, 1.178 + * from trail byte values 0..19 (0..0x13) as used in the difference calculation 1.179 + * to external byte values 0x00..0x20. 1.180 + */ 1.181 +static const int8_t 1.182 +bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 1.183 +/* 0 1 2 3 4 5 6 7 */ 1.184 + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 1.185 + 1.186 +/* 8 9 a b c d e f */ 1.187 + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 1.188 + 1.189 +/* 10 11 12 13 */ 1.190 + 0x1c, 0x1d, 0x1e, 0x1f 1.191 +}; 1.192 + 1.193 +/** 1.194 + * Integer division and modulo with negative numerators 1.195 + * yields negative modulo results and quotients that are one more than 1.196 + * what we need here. 1.197 + * This macro adjust the results so that the modulo-value m is always >=0. 1.198 + * 1.199 + * For positive n, the if() condition is always FALSE. 1.200 + * 1.201 + * @param n Number to be split into quotient and rest. 1.202 + * Will be modified to contain the quotient. 1.203 + * @param d Divisor. 1.204 + * @param m Output variable for the rest (modulo result). 1.205 + */ 1.206 +#define NEGDIVMOD(n, d, m) { \ 1.207 + (m)=(n)%(d); \ 1.208 + (n)/=(d); \ 1.209 + if((m)<0) { \ 1.210 + --(n); \ 1.211 + (m)+=(d); \ 1.212 + } \ 1.213 +} 1.214 + 1.215 +/* Faster versions of packDiff() for single-byte-encoded diff values. */ 1.216 + 1.217 +/** Is a diff value encodable in a single byte? */ 1.218 +#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) 1.219 + 1.220 +/** Encode a diff value in a single byte. */ 1.221 +#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) 1.222 + 1.223 +/** Is a diff value encodable in two bytes? */ 1.224 +#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) 1.225 + 1.226 +/* BOCU-1 implementation functions ------------------------------------------ */ 1.227 + 1.228 +#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 1.229 + 1.230 +/** 1.231 + * Compute the next "previous" value for differencing 1.232 + * from the current code point. 1.233 + * 1.234 + * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 1.235 + * @return "previous code point" state value 1.236 + */ 1.237 +static inline int32_t 1.238 +bocu1Prev(int32_t c) { 1.239 + /* compute new prev */ 1.240 + if(/* 0x3040<=c && */ c<=0x309f) { 1.241 + /* Hiragana is not 128-aligned */ 1.242 + return 0x3070; 1.243 + } else if(0x4e00<=c && c<=0x9fa5) { 1.244 + /* CJK Unihan */ 1.245 + return 0x4e00-BOCU1_REACH_NEG_2; 1.246 + } else if(0xac00<=c /* && c<=0xd7a3 */) { 1.247 + /* Korean Hangul */ 1.248 + return (0xd7a3+0xac00)/2; 1.249 + } else { 1.250 + /* mostly small scripts */ 1.251 + return BOCU1_SIMPLE_PREV(c); 1.252 + } 1.253 +} 1.254 + 1.255 +/** Fast version of bocu1Prev() for most scripts. */ 1.256 +#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 1.257 + 1.258 +/* 1.259 + * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 1.260 + * The UConverter fields are used as follows: 1.261 + * 1.262 + * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 1.263 + * 1.264 + * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 1.265 + * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 1.266 + */ 1.267 + 1.268 +/* BOCU-1-from-Unicode conversion functions --------------------------------- */ 1.269 + 1.270 +/** 1.271 + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 1.272 + * and return a packed integer with them. 1.273 + * 1.274 + * The encoding favors small absolute differences with short encodings 1.275 + * to compress runs of same-script characters. 1.276 + * 1.277 + * Optimized version with unrolled loops and fewer floating-point operations 1.278 + * than the standard packDiff(). 1.279 + * 1.280 + * @param diff difference value -0x10ffff..0x10ffff 1.281 + * @return 1.282 + * 0x010000zz for 1-byte sequence zz 1.283 + * 0x0200yyzz for 2-byte sequence yy zz 1.284 + * 0x03xxyyzz for 3-byte sequence xx yy zz 1.285 + * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 1.286 + */ 1.287 +static int32_t 1.288 +packDiff(int32_t diff) { 1.289 + int32_t result, m; 1.290 + 1.291 + U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ 1.292 + if(diff>=BOCU1_REACH_NEG_1) { 1.293 + /* mostly positive differences, and single-byte negative ones */ 1.294 +#if 0 /* single-byte case handled in macros, see below */ 1.295 + if(diff<=BOCU1_REACH_POS_1) { 1.296 + /* single byte */ 1.297 + return 0x01000000|(BOCU1_MIDDLE+diff); 1.298 + } else 1.299 +#endif 1.300 + if(diff<=BOCU1_REACH_POS_2) { 1.301 + /* two bytes */ 1.302 + diff-=BOCU1_REACH_POS_1+1; 1.303 + result=0x02000000; 1.304 + 1.305 + m=diff%BOCU1_TRAIL_COUNT; 1.306 + diff/=BOCU1_TRAIL_COUNT; 1.307 + result|=BOCU1_TRAIL_TO_BYTE(m); 1.308 + 1.309 + result|=(BOCU1_START_POS_2+diff)<<8; 1.310 + } else if(diff<=BOCU1_REACH_POS_3) { 1.311 + /* three bytes */ 1.312 + diff-=BOCU1_REACH_POS_2+1; 1.313 + result=0x03000000; 1.314 + 1.315 + m=diff%BOCU1_TRAIL_COUNT; 1.316 + diff/=BOCU1_TRAIL_COUNT; 1.317 + result|=BOCU1_TRAIL_TO_BYTE(m); 1.318 + 1.319 + m=diff%BOCU1_TRAIL_COUNT; 1.320 + diff/=BOCU1_TRAIL_COUNT; 1.321 + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 1.322 + 1.323 + result|=(BOCU1_START_POS_3+diff)<<16; 1.324 + } else { 1.325 + /* four bytes */ 1.326 + diff-=BOCU1_REACH_POS_3+1; 1.327 + 1.328 + m=diff%BOCU1_TRAIL_COUNT; 1.329 + diff/=BOCU1_TRAIL_COUNT; 1.330 + result=BOCU1_TRAIL_TO_BYTE(m); 1.331 + 1.332 + m=diff%BOCU1_TRAIL_COUNT; 1.333 + diff/=BOCU1_TRAIL_COUNT; 1.334 + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 1.335 + 1.336 + /* 1.337 + * We know that / and % would deliver quotient 0 and rest=diff. 1.338 + * Avoid division and modulo for performance. 1.339 + */ 1.340 + result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; 1.341 + 1.342 + result|=((uint32_t)BOCU1_START_POS_4)<<24; 1.343 + } 1.344 + } else { 1.345 + /* two- to four-byte negative differences */ 1.346 + if(diff>=BOCU1_REACH_NEG_2) { 1.347 + /* two bytes */ 1.348 + diff-=BOCU1_REACH_NEG_1; 1.349 + result=0x02000000; 1.350 + 1.351 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.352 + result|=BOCU1_TRAIL_TO_BYTE(m); 1.353 + 1.354 + result|=(BOCU1_START_NEG_2+diff)<<8; 1.355 + } else if(diff>=BOCU1_REACH_NEG_3) { 1.356 + /* three bytes */ 1.357 + diff-=BOCU1_REACH_NEG_2; 1.358 + result=0x03000000; 1.359 + 1.360 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.361 + result|=BOCU1_TRAIL_TO_BYTE(m); 1.362 + 1.363 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.364 + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 1.365 + 1.366 + result|=(BOCU1_START_NEG_3+diff)<<16; 1.367 + } else { 1.368 + /* four bytes */ 1.369 + diff-=BOCU1_REACH_NEG_3; 1.370 + 1.371 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.372 + result=BOCU1_TRAIL_TO_BYTE(m); 1.373 + 1.374 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.375 + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 1.376 + 1.377 + /* 1.378 + * We know that NEGDIVMOD would deliver 1.379 + * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. 1.380 + * Avoid division and modulo for performance. 1.381 + */ 1.382 + m=diff+BOCU1_TRAIL_COUNT; 1.383 + result|=BOCU1_TRAIL_TO_BYTE(m)<<16; 1.384 + 1.385 + result|=BOCU1_MIN<<24; 1.386 + } 1.387 + } 1.388 + return result; 1.389 +} 1.390 + 1.391 + 1.392 +static void 1.393 +_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.394 + UErrorCode *pErrorCode) { 1.395 + UConverter *cnv; 1.396 + const UChar *source, *sourceLimit; 1.397 + uint8_t *target; 1.398 + int32_t targetCapacity; 1.399 + int32_t *offsets; 1.400 + 1.401 + int32_t prev, c, diff; 1.402 + 1.403 + int32_t sourceIndex, nextSourceIndex; 1.404 + 1.405 +U_ALIGN_CODE(16) 1.406 + 1.407 + /* set up the local pointers */ 1.408 + cnv=pArgs->converter; 1.409 + source=pArgs->source; 1.410 + sourceLimit=pArgs->sourceLimit; 1.411 + target=(uint8_t *)pArgs->target; 1.412 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.413 + offsets=pArgs->offsets; 1.414 + 1.415 + /* get the converter state from UConverter */ 1.416 + c=cnv->fromUChar32; 1.417 + prev=(int32_t)cnv->fromUnicodeStatus; 1.418 + if(prev==0) { 1.419 + prev=BOCU1_ASCII_PREV; 1.420 + } 1.421 + 1.422 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.423 + sourceIndex= c==0 ? 0 : -1; 1.424 + nextSourceIndex=0; 1.425 + 1.426 + /* conversion loop */ 1.427 + if(c!=0 && targetCapacity>0) { 1.428 + goto getTrail; 1.429 + } 1.430 + 1.431 +fastSingle: 1.432 + /* fast loop for single-byte differences */ 1.433 + /* use only one loop counter variable, targetCapacity, not also source */ 1.434 + diff=(int32_t)(sourceLimit-source); 1.435 + if(targetCapacity>diff) { 1.436 + targetCapacity=diff; 1.437 + } 1.438 + while(targetCapacity>0 && (c=*source)<0x3000) { 1.439 + if(c<=0x20) { 1.440 + if(c!=0x20) { 1.441 + prev=BOCU1_ASCII_PREV; 1.442 + } 1.443 + *target++=(uint8_t)c; 1.444 + *offsets++=nextSourceIndex++; 1.445 + ++source; 1.446 + --targetCapacity; 1.447 + } else { 1.448 + diff=c-prev; 1.449 + if(DIFF_IS_SINGLE(diff)) { 1.450 + prev=BOCU1_SIMPLE_PREV(c); 1.451 + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 1.452 + *offsets++=nextSourceIndex++; 1.453 + ++source; 1.454 + --targetCapacity; 1.455 + } else { 1.456 + break; 1.457 + } 1.458 + } 1.459 + } 1.460 + /* restore real values */ 1.461 + targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 1.462 + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1.463 + 1.464 + /* regular loop for all cases */ 1.465 + while(source<sourceLimit) { 1.466 + if(targetCapacity>0) { 1.467 + c=*source++; 1.468 + ++nextSourceIndex; 1.469 + 1.470 + if(c<=0x20) { 1.471 + /* 1.472 + * ISO C0 control & space: 1.473 + * Encode directly for MIME compatibility, 1.474 + * and reset state except for space, to not disrupt compression. 1.475 + */ 1.476 + if(c!=0x20) { 1.477 + prev=BOCU1_ASCII_PREV; 1.478 + } 1.479 + *target++=(uint8_t)c; 1.480 + *offsets++=sourceIndex; 1.481 + --targetCapacity; 1.482 + 1.483 + sourceIndex=nextSourceIndex; 1.484 + continue; 1.485 + } 1.486 + 1.487 + if(U16_IS_LEAD(c)) { 1.488 +getTrail: 1.489 + if(source<sourceLimit) { 1.490 + /* test the following code unit */ 1.491 + UChar trail=*source; 1.492 + if(U16_IS_TRAIL(trail)) { 1.493 + ++source; 1.494 + ++nextSourceIndex; 1.495 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.496 + } 1.497 + } else { 1.498 + /* no more input */ 1.499 + c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 1.500 + break; 1.501 + } 1.502 + } 1.503 + 1.504 + /* 1.505 + * all other Unicode code points c==U+0021..U+10ffff 1.506 + * are encoded with the difference c-prev 1.507 + * 1.508 + * a new prev is computed from c, 1.509 + * placed in the middle of a 0x80-block (for most small scripts) or 1.510 + * in the middle of the Unihan and Hangul blocks 1.511 + * to statistically minimize the following difference 1.512 + */ 1.513 + diff=c-prev; 1.514 + prev=BOCU1_PREV(c); 1.515 + if(DIFF_IS_SINGLE(diff)) { 1.516 + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 1.517 + *offsets++=sourceIndex; 1.518 + --targetCapacity; 1.519 + sourceIndex=nextSourceIndex; 1.520 + if(c<0x3000) { 1.521 + goto fastSingle; 1.522 + } 1.523 + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 1.524 + /* optimize 2-byte case */ 1.525 + int32_t m; 1.526 + 1.527 + if(diff>=0) { 1.528 + diff-=BOCU1_REACH_POS_1+1; 1.529 + m=diff%BOCU1_TRAIL_COUNT; 1.530 + diff/=BOCU1_TRAIL_COUNT; 1.531 + diff+=BOCU1_START_POS_2; 1.532 + } else { 1.533 + diff-=BOCU1_REACH_NEG_1; 1.534 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.535 + diff+=BOCU1_START_NEG_2; 1.536 + } 1.537 + *target++=(uint8_t)diff; 1.538 + *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 1.539 + *offsets++=sourceIndex; 1.540 + *offsets++=sourceIndex; 1.541 + targetCapacity-=2; 1.542 + sourceIndex=nextSourceIndex; 1.543 + } else { 1.544 + int32_t length; /* will be 2..4 */ 1.545 + 1.546 + diff=packDiff(diff); 1.547 + length=BOCU1_LENGTH_FROM_PACKED(diff); 1.548 + 1.549 + /* write the output character bytes from diff and length */ 1.550 + /* from the first if in the loop we know that targetCapacity>0 */ 1.551 + if(length<=targetCapacity) { 1.552 + switch(length) { 1.553 + /* each branch falls through to the next one */ 1.554 + case 4: 1.555 + *target++=(uint8_t)(diff>>24); 1.556 + *offsets++=sourceIndex; 1.557 + case 3: /*fall through*/ 1.558 + *target++=(uint8_t)(diff>>16); 1.559 + *offsets++=sourceIndex; 1.560 + case 2: /*fall through*/ 1.561 + *target++=(uint8_t)(diff>>8); 1.562 + *offsets++=sourceIndex; 1.563 + /* case 1: handled above */ 1.564 + *target++=(uint8_t)diff; 1.565 + *offsets++=sourceIndex; 1.566 + default: 1.567 + /* will never occur */ 1.568 + break; 1.569 + } 1.570 + targetCapacity-=length; 1.571 + sourceIndex=nextSourceIndex; 1.572 + } else { 1.573 + uint8_t *charErrorBuffer; 1.574 + 1.575 + /* 1.576 + * We actually do this backwards here: 1.577 + * In order to save an intermediate variable, we output 1.578 + * first to the overflow buffer what does not fit into the 1.579 + * regular target. 1.580 + */ 1.581 + /* we know that 1<=targetCapacity<length<=4 */ 1.582 + length-=targetCapacity; 1.583 + charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 1.584 + switch(length) { 1.585 + /* each branch falls through to the next one */ 1.586 + case 3: 1.587 + *charErrorBuffer++=(uint8_t)(diff>>16); 1.588 + case 2: /*fall through*/ 1.589 + *charErrorBuffer++=(uint8_t)(diff>>8); 1.590 + case 1: /*fall through*/ 1.591 + *charErrorBuffer=(uint8_t)diff; 1.592 + default: 1.593 + /* will never occur */ 1.594 + break; 1.595 + } 1.596 + cnv->charErrorBufferLength=(int8_t)length; 1.597 + 1.598 + /* now output what fits into the regular target */ 1.599 + diff>>=8*length; /* length was reduced by targetCapacity */ 1.600 + switch(targetCapacity) { 1.601 + /* each branch falls through to the next one */ 1.602 + case 3: 1.603 + *target++=(uint8_t)(diff>>16); 1.604 + *offsets++=sourceIndex; 1.605 + case 2: /*fall through*/ 1.606 + *target++=(uint8_t)(diff>>8); 1.607 + *offsets++=sourceIndex; 1.608 + case 1: /*fall through*/ 1.609 + *target++=(uint8_t)diff; 1.610 + *offsets++=sourceIndex; 1.611 + default: 1.612 + /* will never occur */ 1.613 + break; 1.614 + } 1.615 + 1.616 + /* target overflow */ 1.617 + targetCapacity=0; 1.618 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.619 + break; 1.620 + } 1.621 + } 1.622 + } else { 1.623 + /* target is full */ 1.624 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.625 + break; 1.626 + } 1.627 + } 1.628 + 1.629 + /* set the converter state back into UConverter */ 1.630 + cnv->fromUChar32= c<0 ? -c : 0; 1.631 + cnv->fromUnicodeStatus=(uint32_t)prev; 1.632 + 1.633 + /* write back the updated pointers */ 1.634 + pArgs->source=source; 1.635 + pArgs->target=(char *)target; 1.636 + pArgs->offsets=offsets; 1.637 +} 1.638 + 1.639 +/* 1.640 + * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. 1.641 + * If a change is made in the original function, then either 1.642 + * change this function the same way or 1.643 + * re-copy the original function and remove the variables 1.644 + * offsets, sourceIndex, and nextSourceIndex. 1.645 + */ 1.646 +static void 1.647 +_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, 1.648 + UErrorCode *pErrorCode) { 1.649 + UConverter *cnv; 1.650 + const UChar *source, *sourceLimit; 1.651 + uint8_t *target; 1.652 + int32_t targetCapacity; 1.653 + 1.654 + int32_t prev, c, diff; 1.655 + 1.656 + /* set up the local pointers */ 1.657 + cnv=pArgs->converter; 1.658 + source=pArgs->source; 1.659 + sourceLimit=pArgs->sourceLimit; 1.660 + target=(uint8_t *)pArgs->target; 1.661 + targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1.662 + 1.663 + /* get the converter state from UConverter */ 1.664 + c=cnv->fromUChar32; 1.665 + prev=(int32_t)cnv->fromUnicodeStatus; 1.666 + if(prev==0) { 1.667 + prev=BOCU1_ASCII_PREV; 1.668 + } 1.669 + 1.670 + /* conversion loop */ 1.671 + if(c!=0 && targetCapacity>0) { 1.672 + goto getTrail; 1.673 + } 1.674 + 1.675 +fastSingle: 1.676 + /* fast loop for single-byte differences */ 1.677 + /* use only one loop counter variable, targetCapacity, not also source */ 1.678 + diff=(int32_t)(sourceLimit-source); 1.679 + if(targetCapacity>diff) { 1.680 + targetCapacity=diff; 1.681 + } 1.682 + while(targetCapacity>0 && (c=*source)<0x3000) { 1.683 + if(c<=0x20) { 1.684 + if(c!=0x20) { 1.685 + prev=BOCU1_ASCII_PREV; 1.686 + } 1.687 + *target++=(uint8_t)c; 1.688 + } else { 1.689 + diff=c-prev; 1.690 + if(DIFF_IS_SINGLE(diff)) { 1.691 + prev=BOCU1_SIMPLE_PREV(c); 1.692 + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 1.693 + } else { 1.694 + break; 1.695 + } 1.696 + } 1.697 + ++source; 1.698 + --targetCapacity; 1.699 + } 1.700 + /* restore real values */ 1.701 + targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 1.702 + 1.703 + /* regular loop for all cases */ 1.704 + while(source<sourceLimit) { 1.705 + if(targetCapacity>0) { 1.706 + c=*source++; 1.707 + 1.708 + if(c<=0x20) { 1.709 + /* 1.710 + * ISO C0 control & space: 1.711 + * Encode directly for MIME compatibility, 1.712 + * and reset state except for space, to not disrupt compression. 1.713 + */ 1.714 + if(c!=0x20) { 1.715 + prev=BOCU1_ASCII_PREV; 1.716 + } 1.717 + *target++=(uint8_t)c; 1.718 + --targetCapacity; 1.719 + continue; 1.720 + } 1.721 + 1.722 + if(U16_IS_LEAD(c)) { 1.723 +getTrail: 1.724 + if(source<sourceLimit) { 1.725 + /* test the following code unit */ 1.726 + UChar trail=*source; 1.727 + if(U16_IS_TRAIL(trail)) { 1.728 + ++source; 1.729 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.730 + } 1.731 + } else { 1.732 + /* no more input */ 1.733 + c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 1.734 + break; 1.735 + } 1.736 + } 1.737 + 1.738 + /* 1.739 + * all other Unicode code points c==U+0021..U+10ffff 1.740 + * are encoded with the difference c-prev 1.741 + * 1.742 + * a new prev is computed from c, 1.743 + * placed in the middle of a 0x80-block (for most small scripts) or 1.744 + * in the middle of the Unihan and Hangul blocks 1.745 + * to statistically minimize the following difference 1.746 + */ 1.747 + diff=c-prev; 1.748 + prev=BOCU1_PREV(c); 1.749 + if(DIFF_IS_SINGLE(diff)) { 1.750 + *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 1.751 + --targetCapacity; 1.752 + if(c<0x3000) { 1.753 + goto fastSingle; 1.754 + } 1.755 + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 1.756 + /* optimize 2-byte case */ 1.757 + int32_t m; 1.758 + 1.759 + if(diff>=0) { 1.760 + diff-=BOCU1_REACH_POS_1+1; 1.761 + m=diff%BOCU1_TRAIL_COUNT; 1.762 + diff/=BOCU1_TRAIL_COUNT; 1.763 + diff+=BOCU1_START_POS_2; 1.764 + } else { 1.765 + diff-=BOCU1_REACH_NEG_1; 1.766 + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 1.767 + diff+=BOCU1_START_NEG_2; 1.768 + } 1.769 + *target++=(uint8_t)diff; 1.770 + *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 1.771 + targetCapacity-=2; 1.772 + } else { 1.773 + int32_t length; /* will be 2..4 */ 1.774 + 1.775 + diff=packDiff(diff); 1.776 + length=BOCU1_LENGTH_FROM_PACKED(diff); 1.777 + 1.778 + /* write the output character bytes from diff and length */ 1.779 + /* from the first if in the loop we know that targetCapacity>0 */ 1.780 + if(length<=targetCapacity) { 1.781 + switch(length) { 1.782 + /* each branch falls through to the next one */ 1.783 + case 4: 1.784 + *target++=(uint8_t)(diff>>24); 1.785 + case 3: /*fall through*/ 1.786 + *target++=(uint8_t)(diff>>16); 1.787 + /* case 2: handled above */ 1.788 + *target++=(uint8_t)(diff>>8); 1.789 + /* case 1: handled above */ 1.790 + *target++=(uint8_t)diff; 1.791 + default: 1.792 + /* will never occur */ 1.793 + break; 1.794 + } 1.795 + targetCapacity-=length; 1.796 + } else { 1.797 + uint8_t *charErrorBuffer; 1.798 + 1.799 + /* 1.800 + * We actually do this backwards here: 1.801 + * In order to save an intermediate variable, we output 1.802 + * first to the overflow buffer what does not fit into the 1.803 + * regular target. 1.804 + */ 1.805 + /* we know that 1<=targetCapacity<length<=4 */ 1.806 + length-=targetCapacity; 1.807 + charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 1.808 + switch(length) { 1.809 + /* each branch falls through to the next one */ 1.810 + case 3: 1.811 + *charErrorBuffer++=(uint8_t)(diff>>16); 1.812 + case 2: /*fall through*/ 1.813 + *charErrorBuffer++=(uint8_t)(diff>>8); 1.814 + case 1: /*fall through*/ 1.815 + *charErrorBuffer=(uint8_t)diff; 1.816 + default: 1.817 + /* will never occur */ 1.818 + break; 1.819 + } 1.820 + cnv->charErrorBufferLength=(int8_t)length; 1.821 + 1.822 + /* now output what fits into the regular target */ 1.823 + diff>>=8*length; /* length was reduced by targetCapacity */ 1.824 + switch(targetCapacity) { 1.825 + /* each branch falls through to the next one */ 1.826 + case 3: 1.827 + *target++=(uint8_t)(diff>>16); 1.828 + case 2: /*fall through*/ 1.829 + *target++=(uint8_t)(diff>>8); 1.830 + case 1: /*fall through*/ 1.831 + *target++=(uint8_t)diff; 1.832 + default: 1.833 + /* will never occur */ 1.834 + break; 1.835 + } 1.836 + 1.837 + /* target overflow */ 1.838 + targetCapacity=0; 1.839 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.840 + break; 1.841 + } 1.842 + } 1.843 + } else { 1.844 + /* target is full */ 1.845 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.846 + break; 1.847 + } 1.848 + } 1.849 + 1.850 + /* set the converter state back into UConverter */ 1.851 + cnv->fromUChar32= c<0 ? -c : 0; 1.852 + cnv->fromUnicodeStatus=(uint32_t)prev; 1.853 + 1.854 + /* write back the updated pointers */ 1.855 + pArgs->source=source; 1.856 + pArgs->target=(char *)target; 1.857 +} 1.858 + 1.859 +/* BOCU-1-to-Unicode conversion functions ----------------------------------- */ 1.860 + 1.861 +/** 1.862 + * Function for BOCU-1 decoder; handles multi-byte lead bytes. 1.863 + * 1.864 + * @param b lead byte; 1.865 + * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 1.866 + * @return (diff<<2)|count 1.867 + */ 1.868 +static inline int32_t 1.869 +decodeBocu1LeadByte(int32_t b) { 1.870 + int32_t diff, count; 1.871 + 1.872 + if(b>=BOCU1_START_NEG_2) { 1.873 + /* positive difference */ 1.874 + if(b<BOCU1_START_POS_3) { 1.875 + /* two bytes */ 1.876 + diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1.877 + count=1; 1.878 + } else if(b<BOCU1_START_POS_4) { 1.879 + /* three bytes */ 1.880 + diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 1.881 + count=2; 1.882 + } else { 1.883 + /* four bytes */ 1.884 + diff=BOCU1_REACH_POS_3+1; 1.885 + count=3; 1.886 + } 1.887 + } else { 1.888 + /* negative difference */ 1.889 + if(b>=BOCU1_START_NEG_3) { 1.890 + /* two bytes */ 1.891 + diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1.892 + count=1; 1.893 + } else if(b>BOCU1_MIN) { 1.894 + /* three bytes */ 1.895 + diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 1.896 + count=2; 1.897 + } else { 1.898 + /* four bytes */ 1.899 + diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 1.900 + count=3; 1.901 + } 1.902 + } 1.903 + 1.904 + /* return the state for decoding the trail byte(s) */ 1.905 + return (diff<<2)|count; 1.906 +} 1.907 + 1.908 +/** 1.909 + * Function for BOCU-1 decoder; handles multi-byte trail bytes. 1.910 + * 1.911 + * @param count number of remaining trail bytes including this one 1.912 + * @param b trail byte 1.913 + * @return new delta for diff including b - <0 indicates an error 1.914 + * 1.915 + * @see decodeBocu1 1.916 + */ 1.917 +static inline int32_t 1.918 +decodeBocu1TrailByte(int32_t count, int32_t b) { 1.919 + if(b<=0x20) { 1.920 + /* skip some C0 controls and make the trail byte range contiguous */ 1.921 + b=bocu1ByteToTrail[b]; 1.922 + /* b<0 for an illegal trail byte value will result in return<0 below */ 1.923 +#if BOCU1_MAX_TRAIL<0xff 1.924 + } else if(b>BOCU1_MAX_TRAIL) { 1.925 + return -99; 1.926 +#endif 1.927 + } else { 1.928 + b-=BOCU1_TRAIL_BYTE_OFFSET; 1.929 + } 1.930 + 1.931 + /* add trail byte into difference and decrement count */ 1.932 + if(count==1) { 1.933 + return b; 1.934 + } else if(count==2) { 1.935 + return b*BOCU1_TRAIL_COUNT; 1.936 + } else /* count==3 */ { 1.937 + return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); 1.938 + } 1.939 +} 1.940 + 1.941 +static void 1.942 +_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.943 + UErrorCode *pErrorCode) { 1.944 + UConverter *cnv; 1.945 + const uint8_t *source, *sourceLimit; 1.946 + UChar *target; 1.947 + const UChar *targetLimit; 1.948 + int32_t *offsets; 1.949 + 1.950 + int32_t prev, count, diff, c; 1.951 + 1.952 + int8_t byteIndex; 1.953 + uint8_t *bytes; 1.954 + 1.955 + int32_t sourceIndex, nextSourceIndex; 1.956 + 1.957 + /* set up the local pointers */ 1.958 + cnv=pArgs->converter; 1.959 + source=(const uint8_t *)pArgs->source; 1.960 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.961 + target=pArgs->target; 1.962 + targetLimit=pArgs->targetLimit; 1.963 + offsets=pArgs->offsets; 1.964 + 1.965 + /* get the converter state from UConverter */ 1.966 + prev=(int32_t)cnv->toUnicodeStatus; 1.967 + if(prev==0) { 1.968 + prev=BOCU1_ASCII_PREV; 1.969 + } 1.970 + diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1.971 + count=diff&3; 1.972 + diff>>=2; 1.973 + 1.974 + byteIndex=cnv->toULength; 1.975 + bytes=cnv->toUBytes; 1.976 + 1.977 + /* sourceIndex=-1 if the current character began in the previous buffer */ 1.978 + sourceIndex=byteIndex==0 ? 0 : -1; 1.979 + nextSourceIndex=0; 1.980 + 1.981 + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1.982 + if(count>0 && byteIndex>0 && target<targetLimit) { 1.983 + goto getTrail; 1.984 + } 1.985 + 1.986 +fastSingle: 1.987 + /* fast loop for single-byte differences */ 1.988 + /* use count as the only loop counter variable */ 1.989 + diff=(int32_t)(sourceLimit-source); 1.990 + count=(int32_t)(pArgs->targetLimit-target); 1.991 + if(count>diff) { 1.992 + count=diff; 1.993 + } 1.994 + while(count>0) { 1.995 + if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1.996 + c=prev+(c-BOCU1_MIDDLE); 1.997 + if(c<0x3000) { 1.998 + *target++=(UChar)c; 1.999 + *offsets++=nextSourceIndex++; 1.1000 + prev=BOCU1_SIMPLE_PREV(c); 1.1001 + } else { 1.1002 + break; 1.1003 + } 1.1004 + } else if(c<=0x20) { 1.1005 + if(c!=0x20) { 1.1006 + prev=BOCU1_ASCII_PREV; 1.1007 + } 1.1008 + *target++=(UChar)c; 1.1009 + *offsets++=nextSourceIndex++; 1.1010 + } else { 1.1011 + break; 1.1012 + } 1.1013 + ++source; 1.1014 + --count; 1.1015 + } 1.1016 + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1.1017 + 1.1018 + /* decode a sequence of single and lead bytes */ 1.1019 + while(source<sourceLimit) { 1.1020 + if(target>=targetLimit) { 1.1021 + /* target is full */ 1.1022 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1023 + break; 1.1024 + } 1.1025 + 1.1026 + ++nextSourceIndex; 1.1027 + c=*source++; 1.1028 + if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1.1029 + /* Write a code point directly from a single-byte difference. */ 1.1030 + c=prev+(c-BOCU1_MIDDLE); 1.1031 + if(c<0x3000) { 1.1032 + *target++=(UChar)c; 1.1033 + *offsets++=sourceIndex; 1.1034 + prev=BOCU1_SIMPLE_PREV(c); 1.1035 + sourceIndex=nextSourceIndex; 1.1036 + goto fastSingle; 1.1037 + } 1.1038 + } else if(c<=0x20) { 1.1039 + /* 1.1040 + * Direct-encoded C0 control code or space. 1.1041 + * Reset prev for C0 control codes but not for space. 1.1042 + */ 1.1043 + if(c!=0x20) { 1.1044 + prev=BOCU1_ASCII_PREV; 1.1045 + } 1.1046 + *target++=(UChar)c; 1.1047 + *offsets++=sourceIndex; 1.1048 + sourceIndex=nextSourceIndex; 1.1049 + continue; 1.1050 + } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1.1051 + /* Optimize two-byte case. */ 1.1052 + if(c>=BOCU1_MIDDLE) { 1.1053 + diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1.1054 + } else { 1.1055 + diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1.1056 + } 1.1057 + 1.1058 + /* trail byte */ 1.1059 + ++nextSourceIndex; 1.1060 + c=decodeBocu1TrailByte(1, *source++); 1.1061 + if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1.1062 + bytes[0]=source[-2]; 1.1063 + bytes[1]=source[-1]; 1.1064 + byteIndex=2; 1.1065 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1066 + break; 1.1067 + } 1.1068 + } else if(c==BOCU1_RESET) { 1.1069 + /* only reset the state, no code point */ 1.1070 + prev=BOCU1_ASCII_PREV; 1.1071 + sourceIndex=nextSourceIndex; 1.1072 + continue; 1.1073 + } else { 1.1074 + /* 1.1075 + * For multi-byte difference lead bytes, set the decoder state 1.1076 + * with the partial difference value from the lead byte and 1.1077 + * with the number of trail bytes. 1.1078 + */ 1.1079 + bytes[0]=(uint8_t)c; 1.1080 + byteIndex=1; 1.1081 + 1.1082 + diff=decodeBocu1LeadByte(c); 1.1083 + count=diff&3; 1.1084 + diff>>=2; 1.1085 +getTrail: 1.1086 + for(;;) { 1.1087 + if(source>=sourceLimit) { 1.1088 + goto endloop; 1.1089 + } 1.1090 + ++nextSourceIndex; 1.1091 + c=bytes[byteIndex++]=*source++; 1.1092 + 1.1093 + /* trail byte in any position */ 1.1094 + c=decodeBocu1TrailByte(count, c); 1.1095 + if(c<0) { 1.1096 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1097 + goto endloop; 1.1098 + } 1.1099 + 1.1100 + diff+=c; 1.1101 + if(--count==0) { 1.1102 + /* final trail byte, deliver a code point */ 1.1103 + byteIndex=0; 1.1104 + c=prev+diff; 1.1105 + if((uint32_t)c>0x10ffff) { 1.1106 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1107 + goto endloop; 1.1108 + } 1.1109 + break; 1.1110 + } 1.1111 + } 1.1112 + } 1.1113 + 1.1114 + /* calculate the next prev and output c */ 1.1115 + prev=BOCU1_PREV(c); 1.1116 + if(c<=0xffff) { 1.1117 + *target++=(UChar)c; 1.1118 + *offsets++=sourceIndex; 1.1119 + } else { 1.1120 + /* output surrogate pair */ 1.1121 + *target++=U16_LEAD(c); 1.1122 + if(target<targetLimit) { 1.1123 + *target++=U16_TRAIL(c); 1.1124 + *offsets++=sourceIndex; 1.1125 + *offsets++=sourceIndex; 1.1126 + } else { 1.1127 + /* target overflow */ 1.1128 + *offsets++=sourceIndex; 1.1129 + cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1.1130 + cnv->UCharErrorBufferLength=1; 1.1131 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1132 + break; 1.1133 + } 1.1134 + } 1.1135 + sourceIndex=nextSourceIndex; 1.1136 + } 1.1137 +endloop: 1.1138 + 1.1139 + if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1.1140 + /* set the converter state in UConverter to deal with the next character */ 1.1141 + cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1.1142 + cnv->mode=0; 1.1143 + } else { 1.1144 + /* set the converter state back into UConverter */ 1.1145 + cnv->toUnicodeStatus=(uint32_t)prev; 1.1146 + cnv->mode=(diff<<2)|count; 1.1147 + } 1.1148 + cnv->toULength=byteIndex; 1.1149 + 1.1150 + /* write back the updated pointers */ 1.1151 + pArgs->source=(const char *)source; 1.1152 + pArgs->target=target; 1.1153 + pArgs->offsets=offsets; 1.1154 + return; 1.1155 +} 1.1156 + 1.1157 +/* 1.1158 + * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. 1.1159 + * If a change is made in the original function, then either 1.1160 + * change this function the same way or 1.1161 + * re-copy the original function and remove the variables 1.1162 + * offsets, sourceIndex, and nextSourceIndex. 1.1163 + */ 1.1164 +static void 1.1165 +_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1.1166 + UErrorCode *pErrorCode) { 1.1167 + UConverter *cnv; 1.1168 + const uint8_t *source, *sourceLimit; 1.1169 + UChar *target; 1.1170 + const UChar *targetLimit; 1.1171 + 1.1172 + int32_t prev, count, diff, c; 1.1173 + 1.1174 + int8_t byteIndex; 1.1175 + uint8_t *bytes; 1.1176 + 1.1177 +U_ALIGN_CODE(16) 1.1178 + 1.1179 + /* set up the local pointers */ 1.1180 + cnv=pArgs->converter; 1.1181 + source=(const uint8_t *)pArgs->source; 1.1182 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.1183 + target=pArgs->target; 1.1184 + targetLimit=pArgs->targetLimit; 1.1185 + 1.1186 + /* get the converter state from UConverter */ 1.1187 + prev=(int32_t)cnv->toUnicodeStatus; 1.1188 + if(prev==0) { 1.1189 + prev=BOCU1_ASCII_PREV; 1.1190 + } 1.1191 + diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1.1192 + count=diff&3; 1.1193 + diff>>=2; 1.1194 + 1.1195 + byteIndex=cnv->toULength; 1.1196 + bytes=cnv->toUBytes; 1.1197 + 1.1198 + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1.1199 + if(count>0 && byteIndex>0 && target<targetLimit) { 1.1200 + goto getTrail; 1.1201 + } 1.1202 + 1.1203 +fastSingle: 1.1204 + /* fast loop for single-byte differences */ 1.1205 + /* use count as the only loop counter variable */ 1.1206 + diff=(int32_t)(sourceLimit-source); 1.1207 + count=(int32_t)(pArgs->targetLimit-target); 1.1208 + if(count>diff) { 1.1209 + count=diff; 1.1210 + } 1.1211 + while(count>0) { 1.1212 + if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1.1213 + c=prev+(c-BOCU1_MIDDLE); 1.1214 + if(c<0x3000) { 1.1215 + *target++=(UChar)c; 1.1216 + prev=BOCU1_SIMPLE_PREV(c); 1.1217 + } else { 1.1218 + break; 1.1219 + } 1.1220 + } else if(c<=0x20) { 1.1221 + if(c!=0x20) { 1.1222 + prev=BOCU1_ASCII_PREV; 1.1223 + } 1.1224 + *target++=(UChar)c; 1.1225 + } else { 1.1226 + break; 1.1227 + } 1.1228 + ++source; 1.1229 + --count; 1.1230 + } 1.1231 + 1.1232 + /* decode a sequence of single and lead bytes */ 1.1233 + while(source<sourceLimit) { 1.1234 + if(target>=targetLimit) { 1.1235 + /* target is full */ 1.1236 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1237 + break; 1.1238 + } 1.1239 + 1.1240 + c=*source++; 1.1241 + if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1.1242 + /* Write a code point directly from a single-byte difference. */ 1.1243 + c=prev+(c-BOCU1_MIDDLE); 1.1244 + if(c<0x3000) { 1.1245 + *target++=(UChar)c; 1.1246 + prev=BOCU1_SIMPLE_PREV(c); 1.1247 + goto fastSingle; 1.1248 + } 1.1249 + } else if(c<=0x20) { 1.1250 + /* 1.1251 + * Direct-encoded C0 control code or space. 1.1252 + * Reset prev for C0 control codes but not for space. 1.1253 + */ 1.1254 + if(c!=0x20) { 1.1255 + prev=BOCU1_ASCII_PREV; 1.1256 + } 1.1257 + *target++=(UChar)c; 1.1258 + continue; 1.1259 + } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1.1260 + /* Optimize two-byte case. */ 1.1261 + if(c>=BOCU1_MIDDLE) { 1.1262 + diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1.1263 + } else { 1.1264 + diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1.1265 + } 1.1266 + 1.1267 + /* trail byte */ 1.1268 + c=decodeBocu1TrailByte(1, *source++); 1.1269 + if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1.1270 + bytes[0]=source[-2]; 1.1271 + bytes[1]=source[-1]; 1.1272 + byteIndex=2; 1.1273 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1274 + break; 1.1275 + } 1.1276 + } else if(c==BOCU1_RESET) { 1.1277 + /* only reset the state, no code point */ 1.1278 + prev=BOCU1_ASCII_PREV; 1.1279 + continue; 1.1280 + } else { 1.1281 + /* 1.1282 + * For multi-byte difference lead bytes, set the decoder state 1.1283 + * with the partial difference value from the lead byte and 1.1284 + * with the number of trail bytes. 1.1285 + */ 1.1286 + bytes[0]=(uint8_t)c; 1.1287 + byteIndex=1; 1.1288 + 1.1289 + diff=decodeBocu1LeadByte(c); 1.1290 + count=diff&3; 1.1291 + diff>>=2; 1.1292 +getTrail: 1.1293 + for(;;) { 1.1294 + if(source>=sourceLimit) { 1.1295 + goto endloop; 1.1296 + } 1.1297 + c=bytes[byteIndex++]=*source++; 1.1298 + 1.1299 + /* trail byte in any position */ 1.1300 + c=decodeBocu1TrailByte(count, c); 1.1301 + if(c<0) { 1.1302 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1303 + goto endloop; 1.1304 + } 1.1305 + 1.1306 + diff+=c; 1.1307 + if(--count==0) { 1.1308 + /* final trail byte, deliver a code point */ 1.1309 + byteIndex=0; 1.1310 + c=prev+diff; 1.1311 + if((uint32_t)c>0x10ffff) { 1.1312 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1313 + goto endloop; 1.1314 + } 1.1315 + break; 1.1316 + } 1.1317 + } 1.1318 + } 1.1319 + 1.1320 + /* calculate the next prev and output c */ 1.1321 + prev=BOCU1_PREV(c); 1.1322 + if(c<=0xffff) { 1.1323 + *target++=(UChar)c; 1.1324 + } else { 1.1325 + /* output surrogate pair */ 1.1326 + *target++=U16_LEAD(c); 1.1327 + if(target<targetLimit) { 1.1328 + *target++=U16_TRAIL(c); 1.1329 + } else { 1.1330 + /* target overflow */ 1.1331 + cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1.1332 + cnv->UCharErrorBufferLength=1; 1.1333 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1334 + break; 1.1335 + } 1.1336 + } 1.1337 + } 1.1338 +endloop: 1.1339 + 1.1340 + if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1.1341 + /* set the converter state in UConverter to deal with the next character */ 1.1342 + cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1.1343 + cnv->mode=0; 1.1344 + } else { 1.1345 + /* set the converter state back into UConverter */ 1.1346 + cnv->toUnicodeStatus=(uint32_t)prev; 1.1347 + cnv->mode=(diff<<2)|count; 1.1348 + } 1.1349 + cnv->toULength=byteIndex; 1.1350 + 1.1351 + /* write back the updated pointers */ 1.1352 + pArgs->source=(const char *)source; 1.1353 + pArgs->target=target; 1.1354 + return; 1.1355 +} 1.1356 + 1.1357 +/* miscellaneous ------------------------------------------------------------ */ 1.1358 + 1.1359 +static const UConverterImpl _Bocu1Impl={ 1.1360 + UCNV_BOCU1, 1.1361 + 1.1362 + NULL, 1.1363 + NULL, 1.1364 + 1.1365 + NULL, 1.1366 + NULL, 1.1367 + NULL, 1.1368 + 1.1369 + _Bocu1ToUnicode, 1.1370 + _Bocu1ToUnicodeWithOffsets, 1.1371 + _Bocu1FromUnicode, 1.1372 + _Bocu1FromUnicodeWithOffsets, 1.1373 + NULL, 1.1374 + 1.1375 + NULL, 1.1376 + NULL, 1.1377 + NULL, 1.1378 + NULL, 1.1379 + ucnv_getCompleteUnicodeSet, 1.1380 + 1.1381 + NULL, 1.1382 + NULL 1.1383 +}; 1.1384 + 1.1385 +static const UConverterStaticData _Bocu1StaticData={ 1.1386 + sizeof(UConverterStaticData), 1.1387 + "BOCU-1", 1.1388 + 1214, /* CCSID for BOCU-1 */ 1.1389 + UCNV_IBM, UCNV_BOCU1, 1.1390 + 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ 1.1391 + { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1.1392 + FALSE, FALSE, 1.1393 + 0, 1.1394 + 0, 1.1395 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1396 +}; 1.1397 + 1.1398 +const UConverterSharedData _Bocu1Data={ 1.1399 + sizeof(UConverterSharedData), ~((uint32_t)0), 1.1400 + NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, 1.1401 + 0, 1.1402 + UCNV_MBCS_TABLE_INITIALIZER 1.1403 +}; 1.1404 + 1.1405 +#endif