intl/icu/source/common/ucnvbocu.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2002-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: ucnvbocu.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2002mar27
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * This is an implementation of the Binary Ordered Compression for Unicode,
michael@0 17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
michael@0 18 */
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21
michael@0 22 #if !UCONFIG_NO_CONVERSION
michael@0 23
michael@0 24 #include "unicode/ucnv.h"
michael@0 25 #include "unicode/ucnv_cb.h"
michael@0 26 #include "unicode/utf16.h"
michael@0 27 #include "putilimp.h"
michael@0 28 #include "ucnv_bld.h"
michael@0 29 #include "ucnv_cnv.h"
michael@0 30 #include "uassert.h"
michael@0 31
michael@0 32 /* BOCU-1 constants and macros ---------------------------------------------- */
michael@0 33
michael@0 34 /*
michael@0 35 * BOCU-1 encodes the code points of a Unicode string as
michael@0 36 * a sequence of byte-encoded differences (slope detection),
michael@0 37 * preserving lexical order.
michael@0 38 *
michael@0 39 * Optimize the difference-taking for runs of Unicode text within
michael@0 40 * small scripts:
michael@0 41 *
michael@0 42 * Most small scripts are allocated within aligned 128-blocks of Unicode
michael@0 43 * code points. Lexical order is preserved if the "previous code point" state
michael@0 44 * is always moved into the middle of such a block.
michael@0 45 *
michael@0 46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
michael@0 47 * areas into the middle of those areas.
michael@0 48 *
michael@0 49 * C0 control codes and space are encoded with their US-ASCII bytes.
michael@0 50 * "prev" is reset for C0 controls but not for space.
michael@0 51 */
michael@0 52
/* "prev" starts out in the middle of the ASCII range */
#define BOCU1_ASCII_PREV            0x40

/* byte values that delimit the encoding of differences */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
#define BOCU1_RESET                 0xff

/* how many lead byte values there are */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD - BOCU1_MIN + 1)

/* some C0 control byte values serve as trail bytes; adjust the counts for them */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN - BOCU1_TRAIL_CONTROLS_COUNT)

/* how many trail byte values there are */
#define BOCU1_TRAIL_COUNT \
    ((BOCU1_MAX_TRAIL - BOCU1_MIN + 1) + BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * how many single-byte codes exist on the positive and on the negative side
 * (0==BOCU1_MIDDLE is counted as one of the positive ones)
 */
#define BOCU1_SINGLE                64

/* lead byte counts for 2/3/4-byte sequences, the same for both signs */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* range of difference values that fit into one byte */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE - 1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* range of difference values that fit into two bytes */
#define BOCU1_REACH_POS_2 \
    (BOCU1_REACH_POS_1 + BOCU1_LEAD_2 * BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 \
    (BOCU1_REACH_NEG_1 - BOCU1_LEAD_2 * BOCU1_TRAIL_COUNT)

/* range of difference values that fit into three bytes */
#define BOCU1_REACH_POS_3 \
    (BOCU1_REACH_POS_2 + BOCU1_LEAD_3 * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2 - BOCU1_LEAD_3 * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT)

/* first lead byte value of each positive sequence length */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE + BOCU1_REACH_POS_1 + 1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2 + BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3 + BOCU1_LEAD_3)
/* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* boundary lead byte value of each negative sequence length */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE + BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2 - BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3 - BOCU1_LEAD_3)
/* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/* Number of bytes in a sequence, derived from its lead byte (which must not be BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead) >= BOCU1_START_NEG_2 && (lead) < BOCU1_START_POS_2) ? 1 : \
     ((lead) >= BOCU1_START_NEG_3 && (lead) < BOCU1_START_POS_3) ? 2 : \
     ((lead) >= BOCU1_START_NEG_4 && (lead) < BOCU1_START_POS_4) ? 3 : 4)

/*
 * Number of bytes in a sequence, derived from its packed form:
 * the top byte holds the length 1..3, anything larger is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed) >= 0x04000000 ? 4 : (packed) >> 24)
michael@0 114
michael@0 115 /* The length of a byte sequence, according to its packed form. */
michael@0 116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
michael@0 117 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
michael@0 118
michael@0 119 /*
michael@0 120 * 12 commonly used C0 control codes (and space) are only used to encode
michael@0 121 * themselves directly,
michael@0 122 * which makes BOCU-1 MIME-usable and reasonably safe for
michael@0 123 * ASCII-oriented software.
michael@0 124 *
michael@0 125 * These controls are
michael@0 126 * 0 NUL
michael@0 127 *
michael@0 128 * 7 BEL
michael@0 129 * 8 BS
michael@0 130 *
michael@0 131 * 9 TAB
michael@0 132 * a LF
michael@0 133 * b VT
michael@0 134 * c FF
michael@0 135 * d CR
michael@0 136 *
michael@0 137 * e SO
michael@0 138 * f SI
michael@0 139 *
michael@0 140 * 1a SUB
michael@0 141 * 1b ESC
michael@0 142 *
michael@0 143 * The other 20 C0 controls are also encoded directly (to preserve order)
michael@0 144 * but are also used as trail bytes in difference encoding
michael@0 145 * (for better compression).
michael@0 146 */
michael@0 147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
michael@0 148
michael@0 149 /*
michael@0 150 * Byte value map for control codes,
michael@0 151 * from external byte values 0x00..0x20
michael@0 152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
michael@0 153 * External byte values that are illegal as trail bytes are mapped to -1.
michael@0 154 */
michael@0 155 static const int8_t
michael@0 156 bocu1ByteToTrail[BOCU1_MIN]={
michael@0 157 /* 0 1 2 3 4 5 6 7 */
michael@0 158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
michael@0 159
michael@0 160 /* 8 9 a b c d e f */
michael@0 161 -1, -1, -1, -1, -1, -1, -1, -1,
michael@0 162
michael@0 163 /* 10 11 12 13 14 15 16 17 */
michael@0 164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
michael@0 165
michael@0 166 /* 18 19 1a 1b 1c 1d 1e 1f */
michael@0 167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
michael@0 168
michael@0 169 /* 20 */
michael@0 170 -1
michael@0 171 };
michael@0 172
michael@0 173 /*
michael@0 174 * Byte value map for control codes,
michael@0 175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
michael@0 176 * to external byte values 0x00..0x20.
michael@0 177 */
michael@0 178 static const int8_t
michael@0 179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
michael@0 180 /* 0 1 2 3 4 5 6 7 */
michael@0 181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
michael@0 182
michael@0 183 /* 8 9 a b c d e f */
michael@0 184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
michael@0 185
michael@0 186 /* 10 11 12 13 */
michael@0 187 0x1c, 0x1d, 0x1e, 0x1f
michael@0 188 };
michael@0 189
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by a
 * semicolon, forms exactly one statement and can be used in an unbraced
 * if/else. Note that each argument is evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
michael@0 211
/* Shortcuts that let callers skip packDiff() for short encodings. */

/** Does this diff value fit into a single byte? */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The one byte that encodes a single-byte diff value. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** Does this diff value fit into two bytes? */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
michael@0 222
michael@0 223 /* BOCU-1 implementation functions ------------------------------------------ */
michael@0 224
michael@0 225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
michael@0 226
michael@0 227 /**
michael@0 228 * Compute the next "previous" value for differencing
michael@0 229 * from the current code point.
michael@0 230 *
michael@0 231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
michael@0 232 * @return "previous code point" state value
michael@0 233 */
michael@0 234 static inline int32_t
michael@0 235 bocu1Prev(int32_t c) {
michael@0 236 /* compute new prev */
michael@0 237 if(/* 0x3040<=c && */ c<=0x309f) {
michael@0 238 /* Hiragana is not 128-aligned */
michael@0 239 return 0x3070;
michael@0 240 } else if(0x4e00<=c && c<=0x9fa5) {
michael@0 241 /* CJK Unihan */
michael@0 242 return 0x4e00-BOCU1_REACH_NEG_2;
michael@0 243 } else if(0xac00<=c /* && c<=0xd7a3 */) {
michael@0 244 /* Korean Hangul */
michael@0 245 return (0xd7a3+0xac00)/2;
michael@0 246 } else {
michael@0 247 /* mostly small scripts */
michael@0 248 return BOCU1_SIMPLE_PREV(c);
michael@0 249 }
michael@0 250 }
michael@0 251
michael@0 252 /** Fast version of bocu1Prev() for most scripts. */
michael@0 253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
michael@0 254
michael@0 255 /*
michael@0 256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
michael@0 257 * The UConverter fields are used as follows:
michael@0 258 *
michael@0 259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
michael@0 260 *
michael@0 261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
michael@0 262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
michael@0 263 */
michael@0 264
michael@0 265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
michael@0 266
michael@0 267 /**
michael@0 268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
michael@0 269 * and return a packed integer with them.
michael@0 270 *
michael@0 271 * The encoding favors small absolute differences with short encodings
michael@0 272 * to compress runs of same-script characters.
michael@0 273 *
michael@0 274 * Optimized version with unrolled loops and fewer floating-point operations
michael@0 275 * than the standard packDiff().
michael@0 276 *
michael@0 277 * @param diff difference value -0x10ffff..0x10ffff
michael@0 278 * @return
michael@0 279 * 0x010000zz for 1-byte sequence zz
michael@0 280 * 0x0200yyzz for 2-byte sequence yy zz
michael@0 281 * 0x03xxyyzz for 3-byte sequence xx yy zz
michael@0 282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
michael@0 283 */
michael@0 284 static int32_t
michael@0 285 packDiff(int32_t diff) {
michael@0 286 int32_t result, m;
michael@0 287
michael@0 288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
michael@0 289 if(diff>=BOCU1_REACH_NEG_1) {
michael@0 290 /* mostly positive differences, and single-byte negative ones */
michael@0 291 #if 0 /* single-byte case handled in macros, see below */
michael@0 292 if(diff<=BOCU1_REACH_POS_1) {
michael@0 293 /* single byte */
michael@0 294 return 0x01000000|(BOCU1_MIDDLE+diff);
michael@0 295 } else
michael@0 296 #endif
michael@0 297 if(diff<=BOCU1_REACH_POS_2) {
michael@0 298 /* two bytes */
michael@0 299 diff-=BOCU1_REACH_POS_1+1;
michael@0 300 result=0x02000000;
michael@0 301
michael@0 302 m=diff%BOCU1_TRAIL_COUNT;
michael@0 303 diff/=BOCU1_TRAIL_COUNT;
michael@0 304 result|=BOCU1_TRAIL_TO_BYTE(m);
michael@0 305
michael@0 306 result|=(BOCU1_START_POS_2+diff)<<8;
michael@0 307 } else if(diff<=BOCU1_REACH_POS_3) {
michael@0 308 /* three bytes */
michael@0 309 diff-=BOCU1_REACH_POS_2+1;
michael@0 310 result=0x03000000;
michael@0 311
michael@0 312 m=diff%BOCU1_TRAIL_COUNT;
michael@0 313 diff/=BOCU1_TRAIL_COUNT;
michael@0 314 result|=BOCU1_TRAIL_TO_BYTE(m);
michael@0 315
michael@0 316 m=diff%BOCU1_TRAIL_COUNT;
michael@0 317 diff/=BOCU1_TRAIL_COUNT;
michael@0 318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
michael@0 319
michael@0 320 result|=(BOCU1_START_POS_3+diff)<<16;
michael@0 321 } else {
michael@0 322 /* four bytes */
michael@0 323 diff-=BOCU1_REACH_POS_3+1;
michael@0 324
michael@0 325 m=diff%BOCU1_TRAIL_COUNT;
michael@0 326 diff/=BOCU1_TRAIL_COUNT;
michael@0 327 result=BOCU1_TRAIL_TO_BYTE(m);
michael@0 328
michael@0 329 m=diff%BOCU1_TRAIL_COUNT;
michael@0 330 diff/=BOCU1_TRAIL_COUNT;
michael@0 331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
michael@0 332
michael@0 333 /*
michael@0 334 * We know that / and % would deliver quotient 0 and rest=diff.
michael@0 335 * Avoid division and modulo for performance.
michael@0 336 */
michael@0 337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
michael@0 338
michael@0 339 result|=((uint32_t)BOCU1_START_POS_4)<<24;
michael@0 340 }
michael@0 341 } else {
michael@0 342 /* two- to four-byte negative differences */
michael@0 343 if(diff>=BOCU1_REACH_NEG_2) {
michael@0 344 /* two bytes */
michael@0 345 diff-=BOCU1_REACH_NEG_1;
michael@0 346 result=0x02000000;
michael@0 347
michael@0 348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 349 result|=BOCU1_TRAIL_TO_BYTE(m);
michael@0 350
michael@0 351 result|=(BOCU1_START_NEG_2+diff)<<8;
michael@0 352 } else if(diff>=BOCU1_REACH_NEG_3) {
michael@0 353 /* three bytes */
michael@0 354 diff-=BOCU1_REACH_NEG_2;
michael@0 355 result=0x03000000;
michael@0 356
michael@0 357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 358 result|=BOCU1_TRAIL_TO_BYTE(m);
michael@0 359
michael@0 360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
michael@0 362
michael@0 363 result|=(BOCU1_START_NEG_3+diff)<<16;
michael@0 364 } else {
michael@0 365 /* four bytes */
michael@0 366 diff-=BOCU1_REACH_NEG_3;
michael@0 367
michael@0 368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 369 result=BOCU1_TRAIL_TO_BYTE(m);
michael@0 370
michael@0 371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
michael@0 373
michael@0 374 /*
michael@0 375 * We know that NEGDIVMOD would deliver
michael@0 376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
michael@0 377 * Avoid division and modulo for performance.
michael@0 378 */
michael@0 379 m=diff+BOCU1_TRAIL_COUNT;
michael@0 380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
michael@0 381
michael@0 382 result|=BOCU1_MIN<<24;
michael@0 383 }
michael@0 384 }
michael@0 385 return result;
michael@0 386 }
michael@0 387
michael@0 388
michael@0 389 static void
michael@0 390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0 391 UErrorCode *pErrorCode) {
michael@0 392 UConverter *cnv;
michael@0 393 const UChar *source, *sourceLimit;
michael@0 394 uint8_t *target;
michael@0 395 int32_t targetCapacity;
michael@0 396 int32_t *offsets;
michael@0 397
michael@0 398 int32_t prev, c, diff;
michael@0 399
michael@0 400 int32_t sourceIndex, nextSourceIndex;
michael@0 401
michael@0 402 U_ALIGN_CODE(16)
michael@0 403
michael@0 404 /* set up the local pointers */
michael@0 405 cnv=pArgs->converter;
michael@0 406 source=pArgs->source;
michael@0 407 sourceLimit=pArgs->sourceLimit;
michael@0 408 target=(uint8_t *)pArgs->target;
michael@0 409 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 410 offsets=pArgs->offsets;
michael@0 411
michael@0 412 /* get the converter state from UConverter */
michael@0 413 c=cnv->fromUChar32;
michael@0 414 prev=(int32_t)cnv->fromUnicodeStatus;
michael@0 415 if(prev==0) {
michael@0 416 prev=BOCU1_ASCII_PREV;
michael@0 417 }
michael@0 418
michael@0 419 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 420 sourceIndex= c==0 ? 0 : -1;
michael@0 421 nextSourceIndex=0;
michael@0 422
michael@0 423 /* conversion loop */
michael@0 424 if(c!=0 && targetCapacity>0) {
michael@0 425 goto getTrail;
michael@0 426 }
michael@0 427
michael@0 428 fastSingle:
michael@0 429 /* fast loop for single-byte differences */
michael@0 430 /* use only one loop counter variable, targetCapacity, not also source */
michael@0 431 diff=(int32_t)(sourceLimit-source);
michael@0 432 if(targetCapacity>diff) {
michael@0 433 targetCapacity=diff;
michael@0 434 }
michael@0 435 while(targetCapacity>0 && (c=*source)<0x3000) {
michael@0 436 if(c<=0x20) {
michael@0 437 if(c!=0x20) {
michael@0 438 prev=BOCU1_ASCII_PREV;
michael@0 439 }
michael@0 440 *target++=(uint8_t)c;
michael@0 441 *offsets++=nextSourceIndex++;
michael@0 442 ++source;
michael@0 443 --targetCapacity;
michael@0 444 } else {
michael@0 445 diff=c-prev;
michael@0 446 if(DIFF_IS_SINGLE(diff)) {
michael@0 447 prev=BOCU1_SIMPLE_PREV(c);
michael@0 448 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0 449 *offsets++=nextSourceIndex++;
michael@0 450 ++source;
michael@0 451 --targetCapacity;
michael@0 452 } else {
michael@0 453 break;
michael@0 454 }
michael@0 455 }
michael@0 456 }
michael@0 457 /* restore real values */
michael@0 458 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
michael@0 459 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
michael@0 460
michael@0 461 /* regular loop for all cases */
michael@0 462 while(source<sourceLimit) {
michael@0 463 if(targetCapacity>0) {
michael@0 464 c=*source++;
michael@0 465 ++nextSourceIndex;
michael@0 466
michael@0 467 if(c<=0x20) {
michael@0 468 /*
michael@0 469 * ISO C0 control & space:
michael@0 470 * Encode directly for MIME compatibility,
michael@0 471 * and reset state except for space, to not disrupt compression.
michael@0 472 */
michael@0 473 if(c!=0x20) {
michael@0 474 prev=BOCU1_ASCII_PREV;
michael@0 475 }
michael@0 476 *target++=(uint8_t)c;
michael@0 477 *offsets++=sourceIndex;
michael@0 478 --targetCapacity;
michael@0 479
michael@0 480 sourceIndex=nextSourceIndex;
michael@0 481 continue;
michael@0 482 }
michael@0 483
michael@0 484 if(U16_IS_LEAD(c)) {
michael@0 485 getTrail:
michael@0 486 if(source<sourceLimit) {
michael@0 487 /* test the following code unit */
michael@0 488 UChar trail=*source;
michael@0 489 if(U16_IS_TRAIL(trail)) {
michael@0 490 ++source;
michael@0 491 ++nextSourceIndex;
michael@0 492 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 493 }
michael@0 494 } else {
michael@0 495 /* no more input */
michael@0 496 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
michael@0 497 break;
michael@0 498 }
michael@0 499 }
michael@0 500
michael@0 501 /*
michael@0 502 * all other Unicode code points c==U+0021..U+10ffff
michael@0 503 * are encoded with the difference c-prev
michael@0 504 *
michael@0 505 * a new prev is computed from c,
michael@0 506 * placed in the middle of a 0x80-block (for most small scripts) or
michael@0 507 * in the middle of the Unihan and Hangul blocks
michael@0 508 * to statistically minimize the following difference
michael@0 509 */
michael@0 510 diff=c-prev;
michael@0 511 prev=BOCU1_PREV(c);
michael@0 512 if(DIFF_IS_SINGLE(diff)) {
michael@0 513 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0 514 *offsets++=sourceIndex;
michael@0 515 --targetCapacity;
michael@0 516 sourceIndex=nextSourceIndex;
michael@0 517 if(c<0x3000) {
michael@0 518 goto fastSingle;
michael@0 519 }
michael@0 520 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
michael@0 521 /* optimize 2-byte case */
michael@0 522 int32_t m;
michael@0 523
michael@0 524 if(diff>=0) {
michael@0 525 diff-=BOCU1_REACH_POS_1+1;
michael@0 526 m=diff%BOCU1_TRAIL_COUNT;
michael@0 527 diff/=BOCU1_TRAIL_COUNT;
michael@0 528 diff+=BOCU1_START_POS_2;
michael@0 529 } else {
michael@0 530 diff-=BOCU1_REACH_NEG_1;
michael@0 531 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 532 diff+=BOCU1_START_NEG_2;
michael@0 533 }
michael@0 534 *target++=(uint8_t)diff;
michael@0 535 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
michael@0 536 *offsets++=sourceIndex;
michael@0 537 *offsets++=sourceIndex;
michael@0 538 targetCapacity-=2;
michael@0 539 sourceIndex=nextSourceIndex;
michael@0 540 } else {
michael@0 541 int32_t length; /* will be 2..4 */
michael@0 542
michael@0 543 diff=packDiff(diff);
michael@0 544 length=BOCU1_LENGTH_FROM_PACKED(diff);
michael@0 545
michael@0 546 /* write the output character bytes from diff and length */
michael@0 547 /* from the first if in the loop we know that targetCapacity>0 */
michael@0 548 if(length<=targetCapacity) {
michael@0 549 switch(length) {
michael@0 550 /* each branch falls through to the next one */
michael@0 551 case 4:
michael@0 552 *target++=(uint8_t)(diff>>24);
michael@0 553 *offsets++=sourceIndex;
michael@0 554 case 3: /*fall through*/
michael@0 555 *target++=(uint8_t)(diff>>16);
michael@0 556 *offsets++=sourceIndex;
michael@0 557 case 2: /*fall through*/
michael@0 558 *target++=(uint8_t)(diff>>8);
michael@0 559 *offsets++=sourceIndex;
michael@0 560 /* case 1: handled above */
michael@0 561 *target++=(uint8_t)diff;
michael@0 562 *offsets++=sourceIndex;
michael@0 563 default:
michael@0 564 /* will never occur */
michael@0 565 break;
michael@0 566 }
michael@0 567 targetCapacity-=length;
michael@0 568 sourceIndex=nextSourceIndex;
michael@0 569 } else {
michael@0 570 uint8_t *charErrorBuffer;
michael@0 571
michael@0 572 /*
michael@0 573 * We actually do this backwards here:
michael@0 574 * In order to save an intermediate variable, we output
michael@0 575 * first to the overflow buffer what does not fit into the
michael@0 576 * regular target.
michael@0 577 */
michael@0 578 /* we know that 1<=targetCapacity<length<=4 */
michael@0 579 length-=targetCapacity;
michael@0 580 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
michael@0 581 switch(length) {
michael@0 582 /* each branch falls through to the next one */
michael@0 583 case 3:
michael@0 584 *charErrorBuffer++=(uint8_t)(diff>>16);
michael@0 585 case 2: /*fall through*/
michael@0 586 *charErrorBuffer++=(uint8_t)(diff>>8);
michael@0 587 case 1: /*fall through*/
michael@0 588 *charErrorBuffer=(uint8_t)diff;
michael@0 589 default:
michael@0 590 /* will never occur */
michael@0 591 break;
michael@0 592 }
michael@0 593 cnv->charErrorBufferLength=(int8_t)length;
michael@0 594
michael@0 595 /* now output what fits into the regular target */
michael@0 596 diff>>=8*length; /* length was reduced by targetCapacity */
michael@0 597 switch(targetCapacity) {
michael@0 598 /* each branch falls through to the next one */
michael@0 599 case 3:
michael@0 600 *target++=(uint8_t)(diff>>16);
michael@0 601 *offsets++=sourceIndex;
michael@0 602 case 2: /*fall through*/
michael@0 603 *target++=(uint8_t)(diff>>8);
michael@0 604 *offsets++=sourceIndex;
michael@0 605 case 1: /*fall through*/
michael@0 606 *target++=(uint8_t)diff;
michael@0 607 *offsets++=sourceIndex;
michael@0 608 default:
michael@0 609 /* will never occur */
michael@0 610 break;
michael@0 611 }
michael@0 612
michael@0 613 /* target overflow */
michael@0 614 targetCapacity=0;
michael@0 615 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 616 break;
michael@0 617 }
michael@0 618 }
michael@0 619 } else {
michael@0 620 /* target is full */
michael@0 621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 622 break;
michael@0 623 }
michael@0 624 }
michael@0 625
michael@0 626 /* set the converter state back into UConverter */
michael@0 627 cnv->fromUChar32= c<0 ? -c : 0;
michael@0 628 cnv->fromUnicodeStatus=(uint32_t)prev;
michael@0 629
michael@0 630 /* write back the updated pointers */
michael@0 631 pArgs->source=source;
michael@0 632 pArgs->target=(char *)target;
michael@0 633 pArgs->offsets=offsets;
michael@0 634 }
michael@0 635
michael@0 636 /*
michael@0 637 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
michael@0 638 * If a change is made in the original function, then either
michael@0 639 * change this function the same way or
michael@0 640 * re-copy the original function and remove the variables
michael@0 641 * offsets, sourceIndex, and nextSourceIndex.
michael@0 642 */
michael@0 643 static void
michael@0 644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
michael@0 645 UErrorCode *pErrorCode) {
michael@0 646 UConverter *cnv;
michael@0 647 const UChar *source, *sourceLimit;
michael@0 648 uint8_t *target;
michael@0 649 int32_t targetCapacity;
michael@0 650
michael@0 651 int32_t prev, c, diff;
michael@0 652
michael@0 653 /* set up the local pointers */
michael@0 654 cnv=pArgs->converter;
michael@0 655 source=pArgs->source;
michael@0 656 sourceLimit=pArgs->sourceLimit;
michael@0 657 target=(uint8_t *)pArgs->target;
michael@0 658 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
michael@0 659
michael@0 660 /* get the converter state from UConverter */
michael@0 661 c=cnv->fromUChar32;
michael@0 662 prev=(int32_t)cnv->fromUnicodeStatus;
michael@0 663 if(prev==0) {
michael@0 664 prev=BOCU1_ASCII_PREV;
michael@0 665 }
michael@0 666
michael@0 667 /* conversion loop */
michael@0 668 if(c!=0 && targetCapacity>0) {
michael@0 669 goto getTrail;
michael@0 670 }
michael@0 671
michael@0 672 fastSingle:
michael@0 673 /* fast loop for single-byte differences */
michael@0 674 /* use only one loop counter variable, targetCapacity, not also source */
michael@0 675 diff=(int32_t)(sourceLimit-source);
michael@0 676 if(targetCapacity>diff) {
michael@0 677 targetCapacity=diff;
michael@0 678 }
michael@0 679 while(targetCapacity>0 && (c=*source)<0x3000) {
michael@0 680 if(c<=0x20) {
michael@0 681 if(c!=0x20) {
michael@0 682 prev=BOCU1_ASCII_PREV;
michael@0 683 }
michael@0 684 *target++=(uint8_t)c;
michael@0 685 } else {
michael@0 686 diff=c-prev;
michael@0 687 if(DIFF_IS_SINGLE(diff)) {
michael@0 688 prev=BOCU1_SIMPLE_PREV(c);
michael@0 689 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0 690 } else {
michael@0 691 break;
michael@0 692 }
michael@0 693 }
michael@0 694 ++source;
michael@0 695 --targetCapacity;
michael@0 696 }
michael@0 697 /* restore real values */
michael@0 698 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
michael@0 699
michael@0 700 /* regular loop for all cases */
michael@0 701 while(source<sourceLimit) {
michael@0 702 if(targetCapacity>0) {
michael@0 703 c=*source++;
michael@0 704
michael@0 705 if(c<=0x20) {
michael@0 706 /*
michael@0 707 * ISO C0 control & space:
michael@0 708 * Encode directly for MIME compatibility,
michael@0 709 * and reset state except for space, to not disrupt compression.
michael@0 710 */
michael@0 711 if(c!=0x20) {
michael@0 712 prev=BOCU1_ASCII_PREV;
michael@0 713 }
michael@0 714 *target++=(uint8_t)c;
michael@0 715 --targetCapacity;
michael@0 716 continue;
michael@0 717 }
michael@0 718
michael@0 719 if(U16_IS_LEAD(c)) {
michael@0 720 getTrail:
michael@0 721 if(source<sourceLimit) {
michael@0 722 /* test the following code unit */
michael@0 723 UChar trail=*source;
michael@0 724 if(U16_IS_TRAIL(trail)) {
michael@0 725 ++source;
michael@0 726 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 727 }
michael@0 728 } else {
michael@0 729 /* no more input */
michael@0 730 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
michael@0 731 break;
michael@0 732 }
michael@0 733 }
michael@0 734
michael@0 735 /*
michael@0 736 * all other Unicode code points c==U+0021..U+10ffff
michael@0 737 * are encoded with the difference c-prev
michael@0 738 *
michael@0 739 * a new prev is computed from c,
michael@0 740 * placed in the middle of a 0x80-block (for most small scripts) or
michael@0 741 * in the middle of the Unihan and Hangul blocks
michael@0 742 * to statistically minimize the following difference
michael@0 743 */
michael@0 744 diff=c-prev;
michael@0 745 prev=BOCU1_PREV(c);
michael@0 746 if(DIFF_IS_SINGLE(diff)) {
michael@0 747 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
michael@0 748 --targetCapacity;
michael@0 749 if(c<0x3000) {
michael@0 750 goto fastSingle;
michael@0 751 }
michael@0 752 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
michael@0 753 /* optimize 2-byte case */
michael@0 754 int32_t m;
michael@0 755
michael@0 756 if(diff>=0) {
michael@0 757 diff-=BOCU1_REACH_POS_1+1;
michael@0 758 m=diff%BOCU1_TRAIL_COUNT;
michael@0 759 diff/=BOCU1_TRAIL_COUNT;
michael@0 760 diff+=BOCU1_START_POS_2;
michael@0 761 } else {
michael@0 762 diff-=BOCU1_REACH_NEG_1;
michael@0 763 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
michael@0 764 diff+=BOCU1_START_NEG_2;
michael@0 765 }
michael@0 766 *target++=(uint8_t)diff;
michael@0 767 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
michael@0 768 targetCapacity-=2;
michael@0 769 } else {
michael@0 770 int32_t length; /* will be 2..4 */
michael@0 771
michael@0 772 diff=packDiff(diff);
michael@0 773 length=BOCU1_LENGTH_FROM_PACKED(diff);
michael@0 774
michael@0 775 /* write the output character bytes from diff and length */
michael@0 776 /* from the first if in the loop we know that targetCapacity>0 */
michael@0 777 if(length<=targetCapacity) {
michael@0 778 switch(length) {
michael@0 779 /* each branch falls through to the next one */
michael@0 780 case 4:
michael@0 781 *target++=(uint8_t)(diff>>24);
michael@0 782 case 3: /*fall through*/
michael@0 783 *target++=(uint8_t)(diff>>16);
michael@0 784 /* case 2: handled above */
michael@0 785 *target++=(uint8_t)(diff>>8);
michael@0 786 /* case 1: handled above */
michael@0 787 *target++=(uint8_t)diff;
michael@0 788 default:
michael@0 789 /* will never occur */
michael@0 790 break;
michael@0 791 }
michael@0 792 targetCapacity-=length;
michael@0 793 } else {
michael@0 794 uint8_t *charErrorBuffer;
michael@0 795
michael@0 796 /*
michael@0 797 * We actually do this backwards here:
michael@0 798 * In order to save an intermediate variable, we output
michael@0 799 * first to the overflow buffer what does not fit into the
michael@0 800 * regular target.
michael@0 801 */
michael@0 802 /* we know that 1<=targetCapacity<length<=4 */
michael@0 803 length-=targetCapacity;
michael@0 804 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
michael@0 805 switch(length) {
michael@0 806 /* each branch falls through to the next one */
michael@0 807 case 3:
michael@0 808 *charErrorBuffer++=(uint8_t)(diff>>16);
michael@0 809 case 2: /*fall through*/
michael@0 810 *charErrorBuffer++=(uint8_t)(diff>>8);
michael@0 811 case 1: /*fall through*/
michael@0 812 *charErrorBuffer=(uint8_t)diff;
michael@0 813 default:
michael@0 814 /* will never occur */
michael@0 815 break;
michael@0 816 }
michael@0 817 cnv->charErrorBufferLength=(int8_t)length;
michael@0 818
michael@0 819 /* now output what fits into the regular target */
michael@0 820 diff>>=8*length; /* length was reduced by targetCapacity */
michael@0 821 switch(targetCapacity) {
michael@0 822 /* each branch falls through to the next one */
michael@0 823 case 3:
michael@0 824 *target++=(uint8_t)(diff>>16);
michael@0 825 case 2: /*fall through*/
michael@0 826 *target++=(uint8_t)(diff>>8);
michael@0 827 case 1: /*fall through*/
michael@0 828 *target++=(uint8_t)diff;
michael@0 829 default:
michael@0 830 /* will never occur */
michael@0 831 break;
michael@0 832 }
michael@0 833
michael@0 834 /* target overflow */
michael@0 835 targetCapacity=0;
michael@0 836 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 837 break;
michael@0 838 }
michael@0 839 }
michael@0 840 } else {
michael@0 841 /* target is full */
michael@0 842 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 843 break;
michael@0 844 }
michael@0 845 }
michael@0 846
michael@0 847 /* set the converter state back into UConverter */
michael@0 848 cnv->fromUChar32= c<0 ? -c : 0;
michael@0 849 cnv->fromUnicodeStatus=(uint32_t)prev;
michael@0 850
michael@0 851 /* write back the updated pointers */
michael@0 852 pArgs->source=source;
michael@0 853 pArgs->target=(char *)target;
michael@0 854 }
michael@0 855
michael@0 856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
michael@0 857
michael@0 858 /**
michael@0 859 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
michael@0 860 *
michael@0 861 * @param b lead byte;
michael@0 862 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
michael@0 863 * @return (diff<<2)|count
michael@0 864 */
michael@0 865 static inline int32_t
michael@0 866 decodeBocu1LeadByte(int32_t b) {
michael@0 867 int32_t diff, count;
michael@0 868
michael@0 869 if(b>=BOCU1_START_NEG_2) {
michael@0 870 /* positive difference */
michael@0 871 if(b<BOCU1_START_POS_3) {
michael@0 872 /* two bytes */
michael@0 873 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0 874 count=1;
michael@0 875 } else if(b<BOCU1_START_POS_4) {
michael@0 876 /* three bytes */
michael@0 877 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
michael@0 878 count=2;
michael@0 879 } else {
michael@0 880 /* four bytes */
michael@0 881 diff=BOCU1_REACH_POS_3+1;
michael@0 882 count=3;
michael@0 883 }
michael@0 884 } else {
michael@0 885 /* negative difference */
michael@0 886 if(b>=BOCU1_START_NEG_3) {
michael@0 887 /* two bytes */
michael@0 888 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0 889 count=1;
michael@0 890 } else if(b>BOCU1_MIN) {
michael@0 891 /* three bytes */
michael@0 892 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
michael@0 893 count=2;
michael@0 894 } else {
michael@0 895 /* four bytes */
michael@0 896 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
michael@0 897 count=3;
michael@0 898 }
michael@0 899 }
michael@0 900
michael@0 901 /* return the state for decoding the trail byte(s) */
michael@0 902 return (diff<<2)|count;
michael@0 903 }
michael@0 904
michael@0 905 /**
michael@0 906 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
michael@0 907 *
michael@0 908 * @param count number of remaining trail bytes including this one
michael@0 909 * @param b trail byte
michael@0 910 * @return new delta for diff including b - <0 indicates an error
michael@0 911 *
michael@0 912 * @see decodeBocu1
michael@0 913 */
michael@0 914 static inline int32_t
michael@0 915 decodeBocu1TrailByte(int32_t count, int32_t b) {
michael@0 916 if(b<=0x20) {
michael@0 917 /* skip some C0 controls and make the trail byte range contiguous */
michael@0 918 b=bocu1ByteToTrail[b];
michael@0 919 /* b<0 for an illegal trail byte value will result in return<0 below */
michael@0 920 #if BOCU1_MAX_TRAIL<0xff
michael@0 921 } else if(b>BOCU1_MAX_TRAIL) {
michael@0 922 return -99;
michael@0 923 #endif
michael@0 924 } else {
michael@0 925 b-=BOCU1_TRAIL_BYTE_OFFSET;
michael@0 926 }
michael@0 927
michael@0 928 /* add trail byte into difference and decrement count */
michael@0 929 if(count==1) {
michael@0 930 return b;
michael@0 931 } else if(count==2) {
michael@0 932 return b*BOCU1_TRAIL_COUNT;
michael@0 933 } else /* count==3 */ {
michael@0 934 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
michael@0 935 }
michael@0 936 }
michael@0 937
michael@0 938 static void
michael@0 939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 940 UErrorCode *pErrorCode) {
michael@0 941 UConverter *cnv;
michael@0 942 const uint8_t *source, *sourceLimit;
michael@0 943 UChar *target;
michael@0 944 const UChar *targetLimit;
michael@0 945 int32_t *offsets;
michael@0 946
michael@0 947 int32_t prev, count, diff, c;
michael@0 948
michael@0 949 int8_t byteIndex;
michael@0 950 uint8_t *bytes;
michael@0 951
michael@0 952 int32_t sourceIndex, nextSourceIndex;
michael@0 953
michael@0 954 /* set up the local pointers */
michael@0 955 cnv=pArgs->converter;
michael@0 956 source=(const uint8_t *)pArgs->source;
michael@0 957 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 958 target=pArgs->target;
michael@0 959 targetLimit=pArgs->targetLimit;
michael@0 960 offsets=pArgs->offsets;
michael@0 961
michael@0 962 /* get the converter state from UConverter */
michael@0 963 prev=(int32_t)cnv->toUnicodeStatus;
michael@0 964 if(prev==0) {
michael@0 965 prev=BOCU1_ASCII_PREV;
michael@0 966 }
michael@0 967 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
michael@0 968 count=diff&3;
michael@0 969 diff>>=2;
michael@0 970
michael@0 971 byteIndex=cnv->toULength;
michael@0 972 bytes=cnv->toUBytes;
michael@0 973
michael@0 974 /* sourceIndex=-1 if the current character began in the previous buffer */
michael@0 975 sourceIndex=byteIndex==0 ? 0 : -1;
michael@0 976 nextSourceIndex=0;
michael@0 977
michael@0 978 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
michael@0 979 if(count>0 && byteIndex>0 && target<targetLimit) {
michael@0 980 goto getTrail;
michael@0 981 }
michael@0 982
michael@0 983 fastSingle:
michael@0 984 /* fast loop for single-byte differences */
michael@0 985 /* use count as the only loop counter variable */
michael@0 986 diff=(int32_t)(sourceLimit-source);
michael@0 987 count=(int32_t)(pArgs->targetLimit-target);
michael@0 988 if(count>diff) {
michael@0 989 count=diff;
michael@0 990 }
michael@0 991 while(count>0) {
michael@0 992 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
michael@0 993 c=prev+(c-BOCU1_MIDDLE);
michael@0 994 if(c<0x3000) {
michael@0 995 *target++=(UChar)c;
michael@0 996 *offsets++=nextSourceIndex++;
michael@0 997 prev=BOCU1_SIMPLE_PREV(c);
michael@0 998 } else {
michael@0 999 break;
michael@0 1000 }
michael@0 1001 } else if(c<=0x20) {
michael@0 1002 if(c!=0x20) {
michael@0 1003 prev=BOCU1_ASCII_PREV;
michael@0 1004 }
michael@0 1005 *target++=(UChar)c;
michael@0 1006 *offsets++=nextSourceIndex++;
michael@0 1007 } else {
michael@0 1008 break;
michael@0 1009 }
michael@0 1010 ++source;
michael@0 1011 --count;
michael@0 1012 }
michael@0 1013 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
michael@0 1014
michael@0 1015 /* decode a sequence of single and lead bytes */
michael@0 1016 while(source<sourceLimit) {
michael@0 1017 if(target>=targetLimit) {
michael@0 1018 /* target is full */
michael@0 1019 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1020 break;
michael@0 1021 }
michael@0 1022
michael@0 1023 ++nextSourceIndex;
michael@0 1024 c=*source++;
michael@0 1025 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
michael@0 1026 /* Write a code point directly from a single-byte difference. */
michael@0 1027 c=prev+(c-BOCU1_MIDDLE);
michael@0 1028 if(c<0x3000) {
michael@0 1029 *target++=(UChar)c;
michael@0 1030 *offsets++=sourceIndex;
michael@0 1031 prev=BOCU1_SIMPLE_PREV(c);
michael@0 1032 sourceIndex=nextSourceIndex;
michael@0 1033 goto fastSingle;
michael@0 1034 }
michael@0 1035 } else if(c<=0x20) {
michael@0 1036 /*
michael@0 1037 * Direct-encoded C0 control code or space.
michael@0 1038 * Reset prev for C0 control codes but not for space.
michael@0 1039 */
michael@0 1040 if(c!=0x20) {
michael@0 1041 prev=BOCU1_ASCII_PREV;
michael@0 1042 }
michael@0 1043 *target++=(UChar)c;
michael@0 1044 *offsets++=sourceIndex;
michael@0 1045 sourceIndex=nextSourceIndex;
michael@0 1046 continue;
michael@0 1047 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
michael@0 1048 /* Optimize two-byte case. */
michael@0 1049 if(c>=BOCU1_MIDDLE) {
michael@0 1050 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0 1051 } else {
michael@0 1052 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0 1053 }
michael@0 1054
michael@0 1055 /* trail byte */
michael@0 1056 ++nextSourceIndex;
michael@0 1057 c=decodeBocu1TrailByte(1, *source++);
michael@0 1058 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
michael@0 1059 bytes[0]=source[-2];
michael@0 1060 bytes[1]=source[-1];
michael@0 1061 byteIndex=2;
michael@0 1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1063 break;
michael@0 1064 }
michael@0 1065 } else if(c==BOCU1_RESET) {
michael@0 1066 /* only reset the state, no code point */
michael@0 1067 prev=BOCU1_ASCII_PREV;
michael@0 1068 sourceIndex=nextSourceIndex;
michael@0 1069 continue;
michael@0 1070 } else {
michael@0 1071 /*
michael@0 1072 * For multi-byte difference lead bytes, set the decoder state
michael@0 1073 * with the partial difference value from the lead byte and
michael@0 1074 * with the number of trail bytes.
michael@0 1075 */
michael@0 1076 bytes[0]=(uint8_t)c;
michael@0 1077 byteIndex=1;
michael@0 1078
michael@0 1079 diff=decodeBocu1LeadByte(c);
michael@0 1080 count=diff&3;
michael@0 1081 diff>>=2;
michael@0 1082 getTrail:
michael@0 1083 for(;;) {
michael@0 1084 if(source>=sourceLimit) {
michael@0 1085 goto endloop;
michael@0 1086 }
michael@0 1087 ++nextSourceIndex;
michael@0 1088 c=bytes[byteIndex++]=*source++;
michael@0 1089
michael@0 1090 /* trail byte in any position */
michael@0 1091 c=decodeBocu1TrailByte(count, c);
michael@0 1092 if(c<0) {
michael@0 1093 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1094 goto endloop;
michael@0 1095 }
michael@0 1096
michael@0 1097 diff+=c;
michael@0 1098 if(--count==0) {
michael@0 1099 /* final trail byte, deliver a code point */
michael@0 1100 byteIndex=0;
michael@0 1101 c=prev+diff;
michael@0 1102 if((uint32_t)c>0x10ffff) {
michael@0 1103 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1104 goto endloop;
michael@0 1105 }
michael@0 1106 break;
michael@0 1107 }
michael@0 1108 }
michael@0 1109 }
michael@0 1110
michael@0 1111 /* calculate the next prev and output c */
michael@0 1112 prev=BOCU1_PREV(c);
michael@0 1113 if(c<=0xffff) {
michael@0 1114 *target++=(UChar)c;
michael@0 1115 *offsets++=sourceIndex;
michael@0 1116 } else {
michael@0 1117 /* output surrogate pair */
michael@0 1118 *target++=U16_LEAD(c);
michael@0 1119 if(target<targetLimit) {
michael@0 1120 *target++=U16_TRAIL(c);
michael@0 1121 *offsets++=sourceIndex;
michael@0 1122 *offsets++=sourceIndex;
michael@0 1123 } else {
michael@0 1124 /* target overflow */
michael@0 1125 *offsets++=sourceIndex;
michael@0 1126 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
michael@0 1127 cnv->UCharErrorBufferLength=1;
michael@0 1128 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1129 break;
michael@0 1130 }
michael@0 1131 }
michael@0 1132 sourceIndex=nextSourceIndex;
michael@0 1133 }
michael@0 1134 endloop:
michael@0 1135
michael@0 1136 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
michael@0 1137 /* set the converter state in UConverter to deal with the next character */
michael@0 1138 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
michael@0 1139 cnv->mode=0;
michael@0 1140 } else {
michael@0 1141 /* set the converter state back into UConverter */
michael@0 1142 cnv->toUnicodeStatus=(uint32_t)prev;
michael@0 1143 cnv->mode=(diff<<2)|count;
michael@0 1144 }
michael@0 1145 cnv->toULength=byteIndex;
michael@0 1146
michael@0 1147 /* write back the updated pointers */
michael@0 1148 pArgs->source=(const char *)source;
michael@0 1149 pArgs->target=target;
michael@0 1150 pArgs->offsets=offsets;
michael@0 1151 return;
michael@0 1152 }
michael@0 1153
michael@0 1154 /*
michael@0 1155 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
michael@0 1156 * If a change is made in the original function, then either
michael@0 1157 * change this function the same way or
michael@0 1158 * re-copy the original function and remove the variables
michael@0 1159 * offsets, sourceIndex, and nextSourceIndex.
michael@0 1160 */
michael@0 1161 static void
michael@0 1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
michael@0 1163 UErrorCode *pErrorCode) {
michael@0 1164 UConverter *cnv;
michael@0 1165 const uint8_t *source, *sourceLimit;
michael@0 1166 UChar *target;
michael@0 1167 const UChar *targetLimit;
michael@0 1168
michael@0 1169 int32_t prev, count, diff, c;
michael@0 1170
michael@0 1171 int8_t byteIndex;
michael@0 1172 uint8_t *bytes;
michael@0 1173
michael@0 1174 U_ALIGN_CODE(16)
michael@0 1175
michael@0 1176 /* set up the local pointers */
michael@0 1177 cnv=pArgs->converter;
michael@0 1178 source=(const uint8_t *)pArgs->source;
michael@0 1179 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 1180 target=pArgs->target;
michael@0 1181 targetLimit=pArgs->targetLimit;
michael@0 1182
michael@0 1183 /* get the converter state from UConverter */
michael@0 1184 prev=(int32_t)cnv->toUnicodeStatus;
michael@0 1185 if(prev==0) {
michael@0 1186 prev=BOCU1_ASCII_PREV;
michael@0 1187 }
michael@0 1188 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
michael@0 1189 count=diff&3;
michael@0 1190 diff>>=2;
michael@0 1191
michael@0 1192 byteIndex=cnv->toULength;
michael@0 1193 bytes=cnv->toUBytes;
michael@0 1194
michael@0 1195 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
michael@0 1196 if(count>0 && byteIndex>0 && target<targetLimit) {
michael@0 1197 goto getTrail;
michael@0 1198 }
michael@0 1199
michael@0 1200 fastSingle:
michael@0 1201 /* fast loop for single-byte differences */
michael@0 1202 /* use count as the only loop counter variable */
michael@0 1203 diff=(int32_t)(sourceLimit-source);
michael@0 1204 count=(int32_t)(pArgs->targetLimit-target);
michael@0 1205 if(count>diff) {
michael@0 1206 count=diff;
michael@0 1207 }
michael@0 1208 while(count>0) {
michael@0 1209 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
michael@0 1210 c=prev+(c-BOCU1_MIDDLE);
michael@0 1211 if(c<0x3000) {
michael@0 1212 *target++=(UChar)c;
michael@0 1213 prev=BOCU1_SIMPLE_PREV(c);
michael@0 1214 } else {
michael@0 1215 break;
michael@0 1216 }
michael@0 1217 } else if(c<=0x20) {
michael@0 1218 if(c!=0x20) {
michael@0 1219 prev=BOCU1_ASCII_PREV;
michael@0 1220 }
michael@0 1221 *target++=(UChar)c;
michael@0 1222 } else {
michael@0 1223 break;
michael@0 1224 }
michael@0 1225 ++source;
michael@0 1226 --count;
michael@0 1227 }
michael@0 1228
michael@0 1229 /* decode a sequence of single and lead bytes */
michael@0 1230 while(source<sourceLimit) {
michael@0 1231 if(target>=targetLimit) {
michael@0 1232 /* target is full */
michael@0 1233 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1234 break;
michael@0 1235 }
michael@0 1236
michael@0 1237 c=*source++;
michael@0 1238 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
michael@0 1239 /* Write a code point directly from a single-byte difference. */
michael@0 1240 c=prev+(c-BOCU1_MIDDLE);
michael@0 1241 if(c<0x3000) {
michael@0 1242 *target++=(UChar)c;
michael@0 1243 prev=BOCU1_SIMPLE_PREV(c);
michael@0 1244 goto fastSingle;
michael@0 1245 }
michael@0 1246 } else if(c<=0x20) {
michael@0 1247 /*
michael@0 1248 * Direct-encoded C0 control code or space.
michael@0 1249 * Reset prev for C0 control codes but not for space.
michael@0 1250 */
michael@0 1251 if(c!=0x20) {
michael@0 1252 prev=BOCU1_ASCII_PREV;
michael@0 1253 }
michael@0 1254 *target++=(UChar)c;
michael@0 1255 continue;
michael@0 1256 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
michael@0 1257 /* Optimize two-byte case. */
michael@0 1258 if(c>=BOCU1_MIDDLE) {
michael@0 1259 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
michael@0 1260 } else {
michael@0 1261 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
michael@0 1262 }
michael@0 1263
michael@0 1264 /* trail byte */
michael@0 1265 c=decodeBocu1TrailByte(1, *source++);
michael@0 1266 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
michael@0 1267 bytes[0]=source[-2];
michael@0 1268 bytes[1]=source[-1];
michael@0 1269 byteIndex=2;
michael@0 1270 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1271 break;
michael@0 1272 }
michael@0 1273 } else if(c==BOCU1_RESET) {
michael@0 1274 /* only reset the state, no code point */
michael@0 1275 prev=BOCU1_ASCII_PREV;
michael@0 1276 continue;
michael@0 1277 } else {
michael@0 1278 /*
michael@0 1279 * For multi-byte difference lead bytes, set the decoder state
michael@0 1280 * with the partial difference value from the lead byte and
michael@0 1281 * with the number of trail bytes.
michael@0 1282 */
michael@0 1283 bytes[0]=(uint8_t)c;
michael@0 1284 byteIndex=1;
michael@0 1285
michael@0 1286 diff=decodeBocu1LeadByte(c);
michael@0 1287 count=diff&3;
michael@0 1288 diff>>=2;
michael@0 1289 getTrail:
michael@0 1290 for(;;) {
michael@0 1291 if(source>=sourceLimit) {
michael@0 1292 goto endloop;
michael@0 1293 }
michael@0 1294 c=bytes[byteIndex++]=*source++;
michael@0 1295
michael@0 1296 /* trail byte in any position */
michael@0 1297 c=decodeBocu1TrailByte(count, c);
michael@0 1298 if(c<0) {
michael@0 1299 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1300 goto endloop;
michael@0 1301 }
michael@0 1302
michael@0 1303 diff+=c;
michael@0 1304 if(--count==0) {
michael@0 1305 /* final trail byte, deliver a code point */
michael@0 1306 byteIndex=0;
michael@0 1307 c=prev+diff;
michael@0 1308 if((uint32_t)c>0x10ffff) {
michael@0 1309 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1310 goto endloop;
michael@0 1311 }
michael@0 1312 break;
michael@0 1313 }
michael@0 1314 }
michael@0 1315 }
michael@0 1316
michael@0 1317 /* calculate the next prev and output c */
michael@0 1318 prev=BOCU1_PREV(c);
michael@0 1319 if(c<=0xffff) {
michael@0 1320 *target++=(UChar)c;
michael@0 1321 } else {
michael@0 1322 /* output surrogate pair */
michael@0 1323 *target++=U16_LEAD(c);
michael@0 1324 if(target<targetLimit) {
michael@0 1325 *target++=U16_TRAIL(c);
michael@0 1326 } else {
michael@0 1327 /* target overflow */
michael@0 1328 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
michael@0 1329 cnv->UCharErrorBufferLength=1;
michael@0 1330 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1331 break;
michael@0 1332 }
michael@0 1333 }
michael@0 1334 }
michael@0 1335 endloop:
michael@0 1336
michael@0 1337 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
michael@0 1338 /* set the converter state in UConverter to deal with the next character */
michael@0 1339 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
michael@0 1340 cnv->mode=0;
michael@0 1341 } else {
michael@0 1342 /* set the converter state back into UConverter */
michael@0 1343 cnv->toUnicodeStatus=(uint32_t)prev;
michael@0 1344 cnv->mode=(diff<<2)|count;
michael@0 1345 }
michael@0 1346 cnv->toULength=byteIndex;
michael@0 1347
michael@0 1348 /* write back the updated pointers */
michael@0 1349 pArgs->source=(const char *)source;
michael@0 1350 pArgs->target=target;
michael@0 1351 return;
michael@0 1352 }
michael@0 1353
michael@0 1354 /* miscellaneous ------------------------------------------------------------ */
michael@0 1355
michael@0 1356 static const UConverterImpl _Bocu1Impl={
michael@0 1357 UCNV_BOCU1,
michael@0 1358
michael@0 1359 NULL,
michael@0 1360 NULL,
michael@0 1361
michael@0 1362 NULL,
michael@0 1363 NULL,
michael@0 1364 NULL,
michael@0 1365
michael@0 1366 _Bocu1ToUnicode,
michael@0 1367 _Bocu1ToUnicodeWithOffsets,
michael@0 1368 _Bocu1FromUnicode,
michael@0 1369 _Bocu1FromUnicodeWithOffsets,
michael@0 1370 NULL,
michael@0 1371
michael@0 1372 NULL,
michael@0 1373 NULL,
michael@0 1374 NULL,
michael@0 1375 NULL,
michael@0 1376 ucnv_getCompleteUnicodeSet,
michael@0 1377
michael@0 1378 NULL,
michael@0 1379 NULL
michael@0 1380 };
michael@0 1381
michael@0 1382 static const UConverterStaticData _Bocu1StaticData={
michael@0 1383 sizeof(UConverterStaticData),
michael@0 1384 "BOCU-1",
michael@0 1385 1214, /* CCSID for BOCU-1 */
michael@0 1386 UCNV_IBM, UCNV_BOCU1,
michael@0 1387 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
michael@0 1388 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
michael@0 1389 FALSE, FALSE,
michael@0 1390 0,
michael@0 1391 0,
michael@0 1392 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 1393 };
michael@0 1394
michael@0 1395 const UConverterSharedData _Bocu1Data={
michael@0 1396 sizeof(UConverterSharedData), ~((uint32_t)0),
michael@0 1397 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
michael@0 1398 0,
michael@0 1399 UCNV_MBCS_TABLE_INITIALIZER
michael@0 1400 };
michael@0 1401
michael@0 1402 #endif

mercurial