|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 2002-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: ucnvbocu.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2002mar27 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * This is an implementation of the Binary Ordered Compression for Unicode, |
|
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ |
|
18 */ |
|
19 |
|
20 #include "unicode/utypes.h" |
|
21 |
|
22 #if !UCONFIG_NO_CONVERSION |
|
23 |
|
24 #include "unicode/ucnv.h" |
|
25 #include "unicode/ucnv_cb.h" |
|
26 #include "unicode/utf16.h" |
|
27 #include "putilimp.h" |
|
28 #include "ucnv_bld.h" |
|
29 #include "ucnv_cnv.h" |
|
30 #include "uassert.h" |
|
31 |
|
32 /* BOCU-1 constants and macros ---------------------------------------------- */ |
|
33 |
|
34 /* |
|
35 * BOCU-1 encodes the code points of a Unicode string as |
|
36 * a sequence of byte-encoded differences (slope detection), |
|
37 * preserving lexical order. |
|
38 * |
|
39 * Optimize the difference-taking for runs of Unicode text within |
|
40 * small scripts: |
|
41 * |
|
42 * Most small scripts are allocated within aligned 128-blocks of Unicode |
|
43 * code points. Lexical order is preserved if the "previous code point" state |
|
44 * is always moved into the middle of such a block. |
|
45 * |
|
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul |
|
47 * areas into the middle of those areas. |
|
48 * |
|
49 * C0 control codes and space are encoded with their US-ASCII bytes. |
|
50 * "prev" is reset for C0 controls but not for space. |
|
51 */ |
|
52 |
|
/* Starting value of the "prev" state: the midpoint of the ASCII range. */
#define BOCU1_ASCII_PREV        0x40

/* Byte values that delimit the ranges used for encoding differences. */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* How many lead byte values there are. */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many trail byte values there are. */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Count of single-byte codes on each side of the middle
 * (the code for difference 0, BOCU1_MIDDLE, is counted as positive).
 */
#define BOCU1_SINGLE            64

/* Lead byte counts per sign for sequences of 2, 3 and 4 bytes. */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* Largest and smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3   (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4 equals BOCU1_MAX_LEAD */

/* Lower bound (inclusive) of each negative range, counted from the middle. */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4 equals BOCU1_MIN+1 */

/* Number of bytes in a sequence, derived from its lead byte (which must not be BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Number of bytes in a sequence, derived from the packed integer form:
 * values below 0x04000000 carry the length in their top byte,
 * everything else is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
|
118 |
|
/*
 * Twelve frequently used C0 control codes, and the space character,
 * only ever stand for themselves in BOCU-1 output. This keeps the
 * encoding usable in MIME and reasonably safe for ASCII-oriented software.
 *
 * The twelve controls are:
 *    0 NUL
 *
 *    7 BEL
 *    8 BS
 *
 *    9 TAB
 *    a LF
 *    b VT
 *    c FF
 *    d CR
 *
 *    e SO
 *    f SI
 *
 *   1a SUB
 *   1b ESC
 *
 * The remaining 20 C0 controls are likewise encoded as themselves
 * (which preserves ordering), but their byte values are additionally
 * used as trail bytes of multi-byte differences to improve compression.
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value used in the difference arithmetic
 * to its external byte: the first BOCU1_TRAIL_CONTROLS_COUNT values go
 * through the bocu1TrailToByte table, all others are shifted by
 * BOCU1_TRAIL_BYTE_OFFSET.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
|
148 |
|
149 /* |
|
150 * Byte value map for control codes, |
|
151 * from external byte values 0x00..0x20 |
|
152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. |
|
153 * External byte values that are illegal as trail bytes are mapped to -1. |
|
154 */ |
|
155 static const int8_t |
|
156 bocu1ByteToTrail[BOCU1_MIN]={ |
|
157 /* 0 1 2 3 4 5 6 7 */ |
|
158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, |
|
159 |
|
160 /* 8 9 a b c d e f */ |
|
161 -1, -1, -1, -1, -1, -1, -1, -1, |
|
162 |
|
163 /* 10 11 12 13 14 15 16 17 */ |
|
164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, |
|
165 |
|
166 /* 18 19 1a 1b 1c 1d 1e 1f */ |
|
167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, |
|
168 |
|
169 /* 20 */ |
|
170 -1 |
|
171 }; |
|
172 |
|
173 /* |
|
174 * Byte value map for control codes, |
|
175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation |
|
176 * to external byte values 0x00..0x20. |
|
177 */ |
|
178 static const int8_t |
|
179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ |
|
180 /* 0 1 2 3 4 5 6 7 */ |
|
181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, |
|
182 |
|
183 /* 8 9 a b c d e f */ |
|
184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, |
|
185 |
|
186 /* 10 11 12 13 */ |
|
187 0x1c, 0x1d, 0x1e, 0x1f |
|
188 }; |
|
189 |
|
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The expansion is wrapped in do { } while(0) so that the macro behaves
 * like a single statement: "NEGDIVMOD(n, d, m);" can be used as the body
 * of an unbraced if/else. Note that every argument is evaluated more than
 * once, so arguments must not have side effects.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
|
211 |
|
/* Shortcuts that avoid calling packDiff() for short encodings. */

/** True if the difference fits into a single output byte. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The single output byte for a difference for which DIFF_IS_SINGLE() holds. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if the difference lies within the two-byte reach (this includes the single-byte range). */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)

/* BOCU-1 implementation functions ------------------------------------------ */

/* New "prev" for small scripts: offset BOCU1_ASCII_PREV within c's aligned 128-block. */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
|
226 |
|
227 /** |
|
228 * Compute the next "previous" value for differencing |
|
229 * from the current code point. |
|
230 * |
|
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) |
|
232 * @return "previous code point" state value |
|
233 */ |
|
234 static inline int32_t |
|
235 bocu1Prev(int32_t c) { |
|
236 /* compute new prev */ |
|
237 if(/* 0x3040<=c && */ c<=0x309f) { |
|
238 /* Hiragana is not 128-aligned */ |
|
239 return 0x3070; |
|
240 } else if(0x4e00<=c && c<=0x9fa5) { |
|
241 /* CJK Unihan */ |
|
242 return 0x4e00-BOCU1_REACH_NEG_2; |
|
243 } else if(0xac00<=c /* && c<=0xd7a3 */) { |
|
244 /* Korean Hangul */ |
|
245 return (0xd7a3+0xac00)/2; |
|
246 } else { |
|
247 /* mostly small scripts */ |
|
248 return BOCU1_SIMPLE_PREV(c); |
|
249 } |
|
250 } |
|
251 |
|
/**
 * New "prev" state for code point c: only 0x3040..0xd7a3 needs the
 * bocu1Prev() function, everything else uses the inline simple formula.
 */
#define BOCU1_PREV(c) ((c)>=0x3040 && (c)<=0xd7a3 ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
|
254 |
|
255 /* |
|
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. |
|
257 * The UConverter fields are used as follows: |
|
258 * |
|
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) |
|
260 * |
|
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) |
|
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) |
|
263 */ |
|
264 |
|
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */ |
|
266 |
|
267 /** |
|
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes |
|
269 * and return a packed integer with them. |
|
270 * |
|
271 * The encoding favors small absolute differences with short encodings |
|
272 * to compress runs of same-script characters. |
|
273 * |
|
274 * Optimized version with unrolled loops and fewer floating-point operations |
|
275 * than the standard packDiff(). |
|
276 * |
|
277 * @param diff difference value -0x10ffff..0x10ffff |
|
278 * @return |
|
279 * 0x010000zz for 1-byte sequence zz |
|
280 * 0x0200yyzz for 2-byte sequence yy zz |
|
281 * 0x03xxyyzz for 3-byte sequence xx yy zz |
|
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) |
|
283 */ |
|
284 static int32_t |
|
285 packDiff(int32_t diff) { |
|
286 int32_t result, m; |
|
287 |
|
288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ |
|
289 if(diff>=BOCU1_REACH_NEG_1) { |
|
290 /* mostly positive differences, and single-byte negative ones */ |
|
291 #if 0 /* single-byte case handled in macros, see below */ |
|
292 if(diff<=BOCU1_REACH_POS_1) { |
|
293 /* single byte */ |
|
294 return 0x01000000|(BOCU1_MIDDLE+diff); |
|
295 } else |
|
296 #endif |
|
297 if(diff<=BOCU1_REACH_POS_2) { |
|
298 /* two bytes */ |
|
299 diff-=BOCU1_REACH_POS_1+1; |
|
300 result=0x02000000; |
|
301 |
|
302 m=diff%BOCU1_TRAIL_COUNT; |
|
303 diff/=BOCU1_TRAIL_COUNT; |
|
304 result|=BOCU1_TRAIL_TO_BYTE(m); |
|
305 |
|
306 result|=(BOCU1_START_POS_2+diff)<<8; |
|
307 } else if(diff<=BOCU1_REACH_POS_3) { |
|
308 /* three bytes */ |
|
309 diff-=BOCU1_REACH_POS_2+1; |
|
310 result=0x03000000; |
|
311 |
|
312 m=diff%BOCU1_TRAIL_COUNT; |
|
313 diff/=BOCU1_TRAIL_COUNT; |
|
314 result|=BOCU1_TRAIL_TO_BYTE(m); |
|
315 |
|
316 m=diff%BOCU1_TRAIL_COUNT; |
|
317 diff/=BOCU1_TRAIL_COUNT; |
|
318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; |
|
319 |
|
320 result|=(BOCU1_START_POS_3+diff)<<16; |
|
321 } else { |
|
322 /* four bytes */ |
|
323 diff-=BOCU1_REACH_POS_3+1; |
|
324 |
|
325 m=diff%BOCU1_TRAIL_COUNT; |
|
326 diff/=BOCU1_TRAIL_COUNT; |
|
327 result=BOCU1_TRAIL_TO_BYTE(m); |
|
328 |
|
329 m=diff%BOCU1_TRAIL_COUNT; |
|
330 diff/=BOCU1_TRAIL_COUNT; |
|
331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; |
|
332 |
|
333 /* |
|
334 * We know that / and % would deliver quotient 0 and rest=diff. |
|
335 * Avoid division and modulo for performance. |
|
336 */ |
|
337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; |
|
338 |
|
339 result|=((uint32_t)BOCU1_START_POS_4)<<24; |
|
340 } |
|
341 } else { |
|
342 /* two- to four-byte negative differences */ |
|
343 if(diff>=BOCU1_REACH_NEG_2) { |
|
344 /* two bytes */ |
|
345 diff-=BOCU1_REACH_NEG_1; |
|
346 result=0x02000000; |
|
347 |
|
348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
349 result|=BOCU1_TRAIL_TO_BYTE(m); |
|
350 |
|
351 result|=(BOCU1_START_NEG_2+diff)<<8; |
|
352 } else if(diff>=BOCU1_REACH_NEG_3) { |
|
353 /* three bytes */ |
|
354 diff-=BOCU1_REACH_NEG_2; |
|
355 result=0x03000000; |
|
356 |
|
357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
358 result|=BOCU1_TRAIL_TO_BYTE(m); |
|
359 |
|
360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; |
|
362 |
|
363 result|=(BOCU1_START_NEG_3+diff)<<16; |
|
364 } else { |
|
365 /* four bytes */ |
|
366 diff-=BOCU1_REACH_NEG_3; |
|
367 |
|
368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
369 result=BOCU1_TRAIL_TO_BYTE(m); |
|
370 |
|
371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; |
|
373 |
|
374 /* |
|
375 * We know that NEGDIVMOD would deliver |
|
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. |
|
377 * Avoid division and modulo for performance. |
|
378 */ |
|
379 m=diff+BOCU1_TRAIL_COUNT; |
|
380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16; |
|
381 |
|
382 result|=BOCU1_MIN<<24; |
|
383 } |
|
384 } |
|
385 return result; |
|
386 } |
|
387 |
|
388 |
|
389 static void |
|
390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
|
391 UErrorCode *pErrorCode) { |
|
392 UConverter *cnv; |
|
393 const UChar *source, *sourceLimit; |
|
394 uint8_t *target; |
|
395 int32_t targetCapacity; |
|
396 int32_t *offsets; |
|
397 |
|
398 int32_t prev, c, diff; |
|
399 |
|
400 int32_t sourceIndex, nextSourceIndex; |
|
401 |
|
402 U_ALIGN_CODE(16) |
|
403 |
|
404 /* set up the local pointers */ |
|
405 cnv=pArgs->converter; |
|
406 source=pArgs->source; |
|
407 sourceLimit=pArgs->sourceLimit; |
|
408 target=(uint8_t *)pArgs->target; |
|
409 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
410 offsets=pArgs->offsets; |
|
411 |
|
412 /* get the converter state from UConverter */ |
|
413 c=cnv->fromUChar32; |
|
414 prev=(int32_t)cnv->fromUnicodeStatus; |
|
415 if(prev==0) { |
|
416 prev=BOCU1_ASCII_PREV; |
|
417 } |
|
418 |
|
419 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
420 sourceIndex= c==0 ? 0 : -1; |
|
421 nextSourceIndex=0; |
|
422 |
|
423 /* conversion loop */ |
|
424 if(c!=0 && targetCapacity>0) { |
|
425 goto getTrail; |
|
426 } |
|
427 |
|
428 fastSingle: |
|
429 /* fast loop for single-byte differences */ |
|
430 /* use only one loop counter variable, targetCapacity, not also source */ |
|
431 diff=(int32_t)(sourceLimit-source); |
|
432 if(targetCapacity>diff) { |
|
433 targetCapacity=diff; |
|
434 } |
|
435 while(targetCapacity>0 && (c=*source)<0x3000) { |
|
436 if(c<=0x20) { |
|
437 if(c!=0x20) { |
|
438 prev=BOCU1_ASCII_PREV; |
|
439 } |
|
440 *target++=(uint8_t)c; |
|
441 *offsets++=nextSourceIndex++; |
|
442 ++source; |
|
443 --targetCapacity; |
|
444 } else { |
|
445 diff=c-prev; |
|
446 if(DIFF_IS_SINGLE(diff)) { |
|
447 prev=BOCU1_SIMPLE_PREV(c); |
|
448 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); |
|
449 *offsets++=nextSourceIndex++; |
|
450 ++source; |
|
451 --targetCapacity; |
|
452 } else { |
|
453 break; |
|
454 } |
|
455 } |
|
456 } |
|
457 /* restore real values */ |
|
458 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); |
|
459 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ |
|
460 |
|
461 /* regular loop for all cases */ |
|
462 while(source<sourceLimit) { |
|
463 if(targetCapacity>0) { |
|
464 c=*source++; |
|
465 ++nextSourceIndex; |
|
466 |
|
467 if(c<=0x20) { |
|
468 /* |
|
469 * ISO C0 control & space: |
|
470 * Encode directly for MIME compatibility, |
|
471 * and reset state except for space, to not disrupt compression. |
|
472 */ |
|
473 if(c!=0x20) { |
|
474 prev=BOCU1_ASCII_PREV; |
|
475 } |
|
476 *target++=(uint8_t)c; |
|
477 *offsets++=sourceIndex; |
|
478 --targetCapacity; |
|
479 |
|
480 sourceIndex=nextSourceIndex; |
|
481 continue; |
|
482 } |
|
483 |
|
484 if(U16_IS_LEAD(c)) { |
|
485 getTrail: |
|
486 if(source<sourceLimit) { |
|
487 /* test the following code unit */ |
|
488 UChar trail=*source; |
|
489 if(U16_IS_TRAIL(trail)) { |
|
490 ++source; |
|
491 ++nextSourceIndex; |
|
492 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
493 } |
|
494 } else { |
|
495 /* no more input */ |
|
496 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ |
|
497 break; |
|
498 } |
|
499 } |
|
500 |
|
501 /* |
|
502 * all other Unicode code points c==U+0021..U+10ffff |
|
503 * are encoded with the difference c-prev |
|
504 * |
|
505 * a new prev is computed from c, |
|
506 * placed in the middle of a 0x80-block (for most small scripts) or |
|
507 * in the middle of the Unihan and Hangul blocks |
|
508 * to statistically minimize the following difference |
|
509 */ |
|
510 diff=c-prev; |
|
511 prev=BOCU1_PREV(c); |
|
512 if(DIFF_IS_SINGLE(diff)) { |
|
513 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); |
|
514 *offsets++=sourceIndex; |
|
515 --targetCapacity; |
|
516 sourceIndex=nextSourceIndex; |
|
517 if(c<0x3000) { |
|
518 goto fastSingle; |
|
519 } |
|
520 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { |
|
521 /* optimize 2-byte case */ |
|
522 int32_t m; |
|
523 |
|
524 if(diff>=0) { |
|
525 diff-=BOCU1_REACH_POS_1+1; |
|
526 m=diff%BOCU1_TRAIL_COUNT; |
|
527 diff/=BOCU1_TRAIL_COUNT; |
|
528 diff+=BOCU1_START_POS_2; |
|
529 } else { |
|
530 diff-=BOCU1_REACH_NEG_1; |
|
531 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
532 diff+=BOCU1_START_NEG_2; |
|
533 } |
|
534 *target++=(uint8_t)diff; |
|
535 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); |
|
536 *offsets++=sourceIndex; |
|
537 *offsets++=sourceIndex; |
|
538 targetCapacity-=2; |
|
539 sourceIndex=nextSourceIndex; |
|
540 } else { |
|
541 int32_t length; /* will be 2..4 */ |
|
542 |
|
543 diff=packDiff(diff); |
|
544 length=BOCU1_LENGTH_FROM_PACKED(diff); |
|
545 |
|
546 /* write the output character bytes from diff and length */ |
|
547 /* from the first if in the loop we know that targetCapacity>0 */ |
|
548 if(length<=targetCapacity) { |
|
549 switch(length) { |
|
550 /* each branch falls through to the next one */ |
|
551 case 4: |
|
552 *target++=(uint8_t)(diff>>24); |
|
553 *offsets++=sourceIndex; |
|
554 case 3: /*fall through*/ |
|
555 *target++=(uint8_t)(diff>>16); |
|
556 *offsets++=sourceIndex; |
|
557 case 2: /*fall through*/ |
|
558 *target++=(uint8_t)(diff>>8); |
|
559 *offsets++=sourceIndex; |
|
560 /* case 1: handled above */ |
|
561 *target++=(uint8_t)diff; |
|
562 *offsets++=sourceIndex; |
|
563 default: |
|
564 /* will never occur */ |
|
565 break; |
|
566 } |
|
567 targetCapacity-=length; |
|
568 sourceIndex=nextSourceIndex; |
|
569 } else { |
|
570 uint8_t *charErrorBuffer; |
|
571 |
|
572 /* |
|
573 * We actually do this backwards here: |
|
574 * In order to save an intermediate variable, we output |
|
575 * first to the overflow buffer what does not fit into the |
|
576 * regular target. |
|
577 */ |
|
578 /* we know that 1<=targetCapacity<length<=4 */ |
|
579 length-=targetCapacity; |
|
580 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; |
|
581 switch(length) { |
|
582 /* each branch falls through to the next one */ |
|
583 case 3: |
|
584 *charErrorBuffer++=(uint8_t)(diff>>16); |
|
585 case 2: /*fall through*/ |
|
586 *charErrorBuffer++=(uint8_t)(diff>>8); |
|
587 case 1: /*fall through*/ |
|
588 *charErrorBuffer=(uint8_t)diff; |
|
589 default: |
|
590 /* will never occur */ |
|
591 break; |
|
592 } |
|
593 cnv->charErrorBufferLength=(int8_t)length; |
|
594 |
|
595 /* now output what fits into the regular target */ |
|
596 diff>>=8*length; /* length was reduced by targetCapacity */ |
|
597 switch(targetCapacity) { |
|
598 /* each branch falls through to the next one */ |
|
599 case 3: |
|
600 *target++=(uint8_t)(diff>>16); |
|
601 *offsets++=sourceIndex; |
|
602 case 2: /*fall through*/ |
|
603 *target++=(uint8_t)(diff>>8); |
|
604 *offsets++=sourceIndex; |
|
605 case 1: /*fall through*/ |
|
606 *target++=(uint8_t)diff; |
|
607 *offsets++=sourceIndex; |
|
608 default: |
|
609 /* will never occur */ |
|
610 break; |
|
611 } |
|
612 |
|
613 /* target overflow */ |
|
614 targetCapacity=0; |
|
615 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
616 break; |
|
617 } |
|
618 } |
|
619 } else { |
|
620 /* target is full */ |
|
621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
622 break; |
|
623 } |
|
624 } |
|
625 |
|
626 /* set the converter state back into UConverter */ |
|
627 cnv->fromUChar32= c<0 ? -c : 0; |
|
628 cnv->fromUnicodeStatus=(uint32_t)prev; |
|
629 |
|
630 /* write back the updated pointers */ |
|
631 pArgs->source=source; |
|
632 pArgs->target=(char *)target; |
|
633 pArgs->offsets=offsets; |
|
634 } |
|
635 |
|
636 /* |
|
637 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. |
|
638 * If a change is made in the original function, then either |
|
639 * change this function the same way or |
|
640 * re-copy the original function and remove the variables |
|
641 * offsets, sourceIndex, and nextSourceIndex. |
|
642 */ |
|
643 static void |
|
644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, |
|
645 UErrorCode *pErrorCode) { |
|
646 UConverter *cnv; |
|
647 const UChar *source, *sourceLimit; |
|
648 uint8_t *target; |
|
649 int32_t targetCapacity; |
|
650 |
|
651 int32_t prev, c, diff; |
|
652 |
|
653 /* set up the local pointers */ |
|
654 cnv=pArgs->converter; |
|
655 source=pArgs->source; |
|
656 sourceLimit=pArgs->sourceLimit; |
|
657 target=(uint8_t *)pArgs->target; |
|
658 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
659 |
|
660 /* get the converter state from UConverter */ |
|
661 c=cnv->fromUChar32; |
|
662 prev=(int32_t)cnv->fromUnicodeStatus; |
|
663 if(prev==0) { |
|
664 prev=BOCU1_ASCII_PREV; |
|
665 } |
|
666 |
|
667 /* conversion loop */ |
|
668 if(c!=0 && targetCapacity>0) { |
|
669 goto getTrail; |
|
670 } |
|
671 |
|
672 fastSingle: |
|
673 /* fast loop for single-byte differences */ |
|
674 /* use only one loop counter variable, targetCapacity, not also source */ |
|
675 diff=(int32_t)(sourceLimit-source); |
|
676 if(targetCapacity>diff) { |
|
677 targetCapacity=diff; |
|
678 } |
|
679 while(targetCapacity>0 && (c=*source)<0x3000) { |
|
680 if(c<=0x20) { |
|
681 if(c!=0x20) { |
|
682 prev=BOCU1_ASCII_PREV; |
|
683 } |
|
684 *target++=(uint8_t)c; |
|
685 } else { |
|
686 diff=c-prev; |
|
687 if(DIFF_IS_SINGLE(diff)) { |
|
688 prev=BOCU1_SIMPLE_PREV(c); |
|
689 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); |
|
690 } else { |
|
691 break; |
|
692 } |
|
693 } |
|
694 ++source; |
|
695 --targetCapacity; |
|
696 } |
|
697 /* restore real values */ |
|
698 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); |
|
699 |
|
700 /* regular loop for all cases */ |
|
701 while(source<sourceLimit) { |
|
702 if(targetCapacity>0) { |
|
703 c=*source++; |
|
704 |
|
705 if(c<=0x20) { |
|
706 /* |
|
707 * ISO C0 control & space: |
|
708 * Encode directly for MIME compatibility, |
|
709 * and reset state except for space, to not disrupt compression. |
|
710 */ |
|
711 if(c!=0x20) { |
|
712 prev=BOCU1_ASCII_PREV; |
|
713 } |
|
714 *target++=(uint8_t)c; |
|
715 --targetCapacity; |
|
716 continue; |
|
717 } |
|
718 |
|
719 if(U16_IS_LEAD(c)) { |
|
720 getTrail: |
|
721 if(source<sourceLimit) { |
|
722 /* test the following code unit */ |
|
723 UChar trail=*source; |
|
724 if(U16_IS_TRAIL(trail)) { |
|
725 ++source; |
|
726 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
727 } |
|
728 } else { |
|
729 /* no more input */ |
|
730 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ |
|
731 break; |
|
732 } |
|
733 } |
|
734 |
|
735 /* |
|
736 * all other Unicode code points c==U+0021..U+10ffff |
|
737 * are encoded with the difference c-prev |
|
738 * |
|
739 * a new prev is computed from c, |
|
740 * placed in the middle of a 0x80-block (for most small scripts) or |
|
741 * in the middle of the Unihan and Hangul blocks |
|
742 * to statistically minimize the following difference |
|
743 */ |
|
744 diff=c-prev; |
|
745 prev=BOCU1_PREV(c); |
|
746 if(DIFF_IS_SINGLE(diff)) { |
|
747 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); |
|
748 --targetCapacity; |
|
749 if(c<0x3000) { |
|
750 goto fastSingle; |
|
751 } |
|
752 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { |
|
753 /* optimize 2-byte case */ |
|
754 int32_t m; |
|
755 |
|
756 if(diff>=0) { |
|
757 diff-=BOCU1_REACH_POS_1+1; |
|
758 m=diff%BOCU1_TRAIL_COUNT; |
|
759 diff/=BOCU1_TRAIL_COUNT; |
|
760 diff+=BOCU1_START_POS_2; |
|
761 } else { |
|
762 diff-=BOCU1_REACH_NEG_1; |
|
763 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
|
764 diff+=BOCU1_START_NEG_2; |
|
765 } |
|
766 *target++=(uint8_t)diff; |
|
767 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); |
|
768 targetCapacity-=2; |
|
769 } else { |
|
770 int32_t length; /* will be 2..4 */ |
|
771 |
|
772 diff=packDiff(diff); |
|
773 length=BOCU1_LENGTH_FROM_PACKED(diff); |
|
774 |
|
775 /* write the output character bytes from diff and length */ |
|
776 /* from the first if in the loop we know that targetCapacity>0 */ |
|
777 if(length<=targetCapacity) { |
|
778 switch(length) { |
|
779 /* each branch falls through to the next one */ |
|
780 case 4: |
|
781 *target++=(uint8_t)(diff>>24); |
|
782 case 3: /*fall through*/ |
|
783 *target++=(uint8_t)(diff>>16); |
|
784 /* case 2: handled above */ |
|
785 *target++=(uint8_t)(diff>>8); |
|
786 /* case 1: handled above */ |
|
787 *target++=(uint8_t)diff; |
|
788 default: |
|
789 /* will never occur */ |
|
790 break; |
|
791 } |
|
792 targetCapacity-=length; |
|
793 } else { |
|
794 uint8_t *charErrorBuffer; |
|
795 |
|
796 /* |
|
797 * We actually do this backwards here: |
|
798 * In order to save an intermediate variable, we output |
|
799 * first to the overflow buffer what does not fit into the |
|
800 * regular target. |
|
801 */ |
|
802 /* we know that 1<=targetCapacity<length<=4 */ |
|
803 length-=targetCapacity; |
|
804 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; |
|
805 switch(length) { |
|
806 /* each branch falls through to the next one */ |
|
807 case 3: |
|
808 *charErrorBuffer++=(uint8_t)(diff>>16); |
|
809 case 2: /*fall through*/ |
|
810 *charErrorBuffer++=(uint8_t)(diff>>8); |
|
811 case 1: /*fall through*/ |
|
812 *charErrorBuffer=(uint8_t)diff; |
|
813 default: |
|
814 /* will never occur */ |
|
815 break; |
|
816 } |
|
817 cnv->charErrorBufferLength=(int8_t)length; |
|
818 |
|
819 /* now output what fits into the regular target */ |
|
820 diff>>=8*length; /* length was reduced by targetCapacity */ |
|
821 switch(targetCapacity) { |
|
822 /* each branch falls through to the next one */ |
|
823 case 3: |
|
824 *target++=(uint8_t)(diff>>16); |
|
825 case 2: /*fall through*/ |
|
826 *target++=(uint8_t)(diff>>8); |
|
827 case 1: /*fall through*/ |
|
828 *target++=(uint8_t)diff; |
|
829 default: |
|
830 /* will never occur */ |
|
831 break; |
|
832 } |
|
833 |
|
834 /* target overflow */ |
|
835 targetCapacity=0; |
|
836 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
837 break; |
|
838 } |
|
839 } |
|
840 } else { |
|
841 /* target is full */ |
|
842 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
843 break; |
|
844 } |
|
845 } |
|
846 |
|
847 /* set the converter state back into UConverter */ |
|
848 cnv->fromUChar32= c<0 ? -c : 0; |
|
849 cnv->fromUnicodeStatus=(uint32_t)prev; |
|
850 |
|
851 /* write back the updated pointers */ |
|
852 pArgs->source=source; |
|
853 pArgs->target=(char *)target; |
|
854 } |
|
855 |
|
856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */ |
|
857 |
|
858 /** |
|
859 * Function for BOCU-1 decoder; handles multi-byte lead bytes. |
|
860 * |
|
861 * @param b lead byte; |
|
862 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD |
|
863 * @return (diff<<2)|count |
|
864 */ |
|
865 static inline int32_t |
|
866 decodeBocu1LeadByte(int32_t b) { |
|
867 int32_t diff, count; |
|
868 |
|
869 if(b>=BOCU1_START_NEG_2) { |
|
870 /* positive difference */ |
|
871 if(b<BOCU1_START_POS_3) { |
|
872 /* two bytes */ |
|
873 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; |
|
874 count=1; |
|
875 } else if(b<BOCU1_START_POS_4) { |
|
876 /* three bytes */ |
|
877 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; |
|
878 count=2; |
|
879 } else { |
|
880 /* four bytes */ |
|
881 diff=BOCU1_REACH_POS_3+1; |
|
882 count=3; |
|
883 } |
|
884 } else { |
|
885 /* negative difference */ |
|
886 if(b>=BOCU1_START_NEG_3) { |
|
887 /* two bytes */ |
|
888 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; |
|
889 count=1; |
|
890 } else if(b>BOCU1_MIN) { |
|
891 /* three bytes */ |
|
892 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; |
|
893 count=2; |
|
894 } else { |
|
895 /* four bytes */ |
|
896 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; |
|
897 count=3; |
|
898 } |
|
899 } |
|
900 |
|
901 /* return the state for decoding the trail byte(s) */ |
|
902 return (diff<<2)|count; |
|
903 } |
|
904 |
|
905 /** |
|
906 * Function for BOCU-1 decoder; handles multi-byte trail bytes. |
|
907 * |
|
908 * @param count number of remaining trail bytes including this one |
|
909 * @param b trail byte |
|
910 * @return new delta for diff including b - <0 indicates an error |
|
911 * |
|
912 * @see decodeBocu1 |
|
913 */ |
|
914 static inline int32_t |
|
915 decodeBocu1TrailByte(int32_t count, int32_t b) { |
|
916 if(b<=0x20) { |
|
917 /* skip some C0 controls and make the trail byte range contiguous */ |
|
918 b=bocu1ByteToTrail[b]; |
|
919 /* b<0 for an illegal trail byte value will result in return<0 below */ |
|
920 #if BOCU1_MAX_TRAIL<0xff |
|
921 } else if(b>BOCU1_MAX_TRAIL) { |
|
922 return -99; |
|
923 #endif |
|
924 } else { |
|
925 b-=BOCU1_TRAIL_BYTE_OFFSET; |
|
926 } |
|
927 |
|
928 /* add trail byte into difference and decrement count */ |
|
929 if(count==1) { |
|
930 return b; |
|
931 } else if(count==2) { |
|
932 return b*BOCU1_TRAIL_COUNT; |
|
933 } else /* count==3 */ { |
|
934 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); |
|
935 } |
|
936 } |
|
937 |
|
938 static void |
|
939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
940 UErrorCode *pErrorCode) { |
|
941 UConverter *cnv; |
|
942 const uint8_t *source, *sourceLimit; |
|
943 UChar *target; |
|
944 const UChar *targetLimit; |
|
945 int32_t *offsets; |
|
946 |
|
947 int32_t prev, count, diff, c; |
|
948 |
|
949 int8_t byteIndex; |
|
950 uint8_t *bytes; |
|
951 |
|
952 int32_t sourceIndex, nextSourceIndex; |
|
953 |
|
954 /* set up the local pointers */ |
|
955 cnv=pArgs->converter; |
|
956 source=(const uint8_t *)pArgs->source; |
|
957 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
958 target=pArgs->target; |
|
959 targetLimit=pArgs->targetLimit; |
|
960 offsets=pArgs->offsets; |
|
961 |
|
962 /* get the converter state from UConverter */ |
|
963 prev=(int32_t)cnv->toUnicodeStatus; |
|
964 if(prev==0) { |
|
965 prev=BOCU1_ASCII_PREV; |
|
966 } |
|
967 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ |
|
968 count=diff&3; |
|
969 diff>>=2; |
|
970 |
|
971 byteIndex=cnv->toULength; |
|
972 bytes=cnv->toUBytes; |
|
973 |
|
974 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
975 sourceIndex=byteIndex==0 ? 0 : -1; |
|
976 nextSourceIndex=0; |
|
977 |
|
978 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ |
|
979 if(count>0 && byteIndex>0 && target<targetLimit) { |
|
980 goto getTrail; |
|
981 } |
|
982 |
|
983 fastSingle: |
|
984 /* fast loop for single-byte differences */ |
|
985 /* use count as the only loop counter variable */ |
|
986 diff=(int32_t)(sourceLimit-source); |
|
987 count=(int32_t)(pArgs->targetLimit-target); |
|
988 if(count>diff) { |
|
989 count=diff; |
|
990 } |
|
991 while(count>0) { |
|
992 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { |
|
993 c=prev+(c-BOCU1_MIDDLE); |
|
994 if(c<0x3000) { |
|
995 *target++=(UChar)c; |
|
996 *offsets++=nextSourceIndex++; |
|
997 prev=BOCU1_SIMPLE_PREV(c); |
|
998 } else { |
|
999 break; |
|
1000 } |
|
1001 } else if(c<=0x20) { |
|
1002 if(c!=0x20) { |
|
1003 prev=BOCU1_ASCII_PREV; |
|
1004 } |
|
1005 *target++=(UChar)c; |
|
1006 *offsets++=nextSourceIndex++; |
|
1007 } else { |
|
1008 break; |
|
1009 } |
|
1010 ++source; |
|
1011 --count; |
|
1012 } |
|
1013 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ |
|
1014 |
|
1015 /* decode a sequence of single and lead bytes */ |
|
1016 while(source<sourceLimit) { |
|
1017 if(target>=targetLimit) { |
|
1018 /* target is full */ |
|
1019 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1020 break; |
|
1021 } |
|
1022 |
|
1023 ++nextSourceIndex; |
|
1024 c=*source++; |
|
1025 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { |
|
1026 /* Write a code point directly from a single-byte difference. */ |
|
1027 c=prev+(c-BOCU1_MIDDLE); |
|
1028 if(c<0x3000) { |
|
1029 *target++=(UChar)c; |
|
1030 *offsets++=sourceIndex; |
|
1031 prev=BOCU1_SIMPLE_PREV(c); |
|
1032 sourceIndex=nextSourceIndex; |
|
1033 goto fastSingle; |
|
1034 } |
|
1035 } else if(c<=0x20) { |
|
1036 /* |
|
1037 * Direct-encoded C0 control code or space. |
|
1038 * Reset prev for C0 control codes but not for space. |
|
1039 */ |
|
1040 if(c!=0x20) { |
|
1041 prev=BOCU1_ASCII_PREV; |
|
1042 } |
|
1043 *target++=(UChar)c; |
|
1044 *offsets++=sourceIndex; |
|
1045 sourceIndex=nextSourceIndex; |
|
1046 continue; |
|
1047 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { |
|
1048 /* Optimize two-byte case. */ |
|
1049 if(c>=BOCU1_MIDDLE) { |
|
1050 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; |
|
1051 } else { |
|
1052 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; |
|
1053 } |
|
1054 |
|
1055 /* trail byte */ |
|
1056 ++nextSourceIndex; |
|
1057 c=decodeBocu1TrailByte(1, *source++); |
|
1058 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { |
|
1059 bytes[0]=source[-2]; |
|
1060 bytes[1]=source[-1]; |
|
1061 byteIndex=2; |
|
1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1063 break; |
|
1064 } |
|
1065 } else if(c==BOCU1_RESET) { |
|
1066 /* only reset the state, no code point */ |
|
1067 prev=BOCU1_ASCII_PREV; |
|
1068 sourceIndex=nextSourceIndex; |
|
1069 continue; |
|
1070 } else { |
|
1071 /* |
|
1072 * For multi-byte difference lead bytes, set the decoder state |
|
1073 * with the partial difference value from the lead byte and |
|
1074 * with the number of trail bytes. |
|
1075 */ |
|
1076 bytes[0]=(uint8_t)c; |
|
1077 byteIndex=1; |
|
1078 |
|
1079 diff=decodeBocu1LeadByte(c); |
|
1080 count=diff&3; |
|
1081 diff>>=2; |
|
1082 getTrail: |
|
1083 for(;;) { |
|
1084 if(source>=sourceLimit) { |
|
1085 goto endloop; |
|
1086 } |
|
1087 ++nextSourceIndex; |
|
1088 c=bytes[byteIndex++]=*source++; |
|
1089 |
|
1090 /* trail byte in any position */ |
|
1091 c=decodeBocu1TrailByte(count, c); |
|
1092 if(c<0) { |
|
1093 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1094 goto endloop; |
|
1095 } |
|
1096 |
|
1097 diff+=c; |
|
1098 if(--count==0) { |
|
1099 /* final trail byte, deliver a code point */ |
|
1100 byteIndex=0; |
|
1101 c=prev+diff; |
|
1102 if((uint32_t)c>0x10ffff) { |
|
1103 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1104 goto endloop; |
|
1105 } |
|
1106 break; |
|
1107 } |
|
1108 } |
|
1109 } |
|
1110 |
|
1111 /* calculate the next prev and output c */ |
|
1112 prev=BOCU1_PREV(c); |
|
1113 if(c<=0xffff) { |
|
1114 *target++=(UChar)c; |
|
1115 *offsets++=sourceIndex; |
|
1116 } else { |
|
1117 /* output surrogate pair */ |
|
1118 *target++=U16_LEAD(c); |
|
1119 if(target<targetLimit) { |
|
1120 *target++=U16_TRAIL(c); |
|
1121 *offsets++=sourceIndex; |
|
1122 *offsets++=sourceIndex; |
|
1123 } else { |
|
1124 /* target overflow */ |
|
1125 *offsets++=sourceIndex; |
|
1126 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); |
|
1127 cnv->UCharErrorBufferLength=1; |
|
1128 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1129 break; |
|
1130 } |
|
1131 } |
|
1132 sourceIndex=nextSourceIndex; |
|
1133 } |
|
1134 endloop: |
|
1135 |
|
1136 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { |
|
1137 /* set the converter state in UConverter to deal with the next character */ |
|
1138 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; |
|
1139 cnv->mode=0; |
|
1140 } else { |
|
1141 /* set the converter state back into UConverter */ |
|
1142 cnv->toUnicodeStatus=(uint32_t)prev; |
|
1143 cnv->mode=(diff<<2)|count; |
|
1144 } |
|
1145 cnv->toULength=byteIndex; |
|
1146 |
|
1147 /* write back the updated pointers */ |
|
1148 pArgs->source=(const char *)source; |
|
1149 pArgs->target=target; |
|
1150 pArgs->offsets=offsets; |
|
1151 return; |
|
1152 } |
|
1153 |
|
1154 /* |
|
1155 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. |
|
1156 * If a change is made in the original function, then either |
|
1157 * change this function the same way or |
|
1158 * re-copy the original function and remove the variables |
|
1159 * offsets, sourceIndex, and nextSourceIndex. |
|
1160 */ |
|
1161 static void |
|
1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, |
|
1163 UErrorCode *pErrorCode) { |
|
1164 UConverter *cnv; |
|
1165 const uint8_t *source, *sourceLimit; |
|
1166 UChar *target; |
|
1167 const UChar *targetLimit; |
|
1168 |
|
1169 int32_t prev, count, diff, c; |
|
1170 |
|
1171 int8_t byteIndex; |
|
1172 uint8_t *bytes; |
|
1173 |
|
1174 U_ALIGN_CODE(16) |
|
1175 |
|
1176 /* set up the local pointers */ |
|
1177 cnv=pArgs->converter; |
|
1178 source=(const uint8_t *)pArgs->source; |
|
1179 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
1180 target=pArgs->target; |
|
1181 targetLimit=pArgs->targetLimit; |
|
1182 |
|
1183 /* get the converter state from UConverter */ |
|
1184 prev=(int32_t)cnv->toUnicodeStatus; |
|
1185 if(prev==0) { |
|
1186 prev=BOCU1_ASCII_PREV; |
|
1187 } |
|
1188 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ |
|
1189 count=diff&3; |
|
1190 diff>>=2; |
|
1191 |
|
1192 byteIndex=cnv->toULength; |
|
1193 bytes=cnv->toUBytes; |
|
1194 |
|
1195 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ |
|
1196 if(count>0 && byteIndex>0 && target<targetLimit) { |
|
1197 goto getTrail; |
|
1198 } |
|
1199 |
|
1200 fastSingle: |
|
1201 /* fast loop for single-byte differences */ |
|
1202 /* use count as the only loop counter variable */ |
|
1203 diff=(int32_t)(sourceLimit-source); |
|
1204 count=(int32_t)(pArgs->targetLimit-target); |
|
1205 if(count>diff) { |
|
1206 count=diff; |
|
1207 } |
|
1208 while(count>0) { |
|
1209 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { |
|
1210 c=prev+(c-BOCU1_MIDDLE); |
|
1211 if(c<0x3000) { |
|
1212 *target++=(UChar)c; |
|
1213 prev=BOCU1_SIMPLE_PREV(c); |
|
1214 } else { |
|
1215 break; |
|
1216 } |
|
1217 } else if(c<=0x20) { |
|
1218 if(c!=0x20) { |
|
1219 prev=BOCU1_ASCII_PREV; |
|
1220 } |
|
1221 *target++=(UChar)c; |
|
1222 } else { |
|
1223 break; |
|
1224 } |
|
1225 ++source; |
|
1226 --count; |
|
1227 } |
|
1228 |
|
1229 /* decode a sequence of single and lead bytes */ |
|
1230 while(source<sourceLimit) { |
|
1231 if(target>=targetLimit) { |
|
1232 /* target is full */ |
|
1233 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1234 break; |
|
1235 } |
|
1236 |
|
1237 c=*source++; |
|
1238 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { |
|
1239 /* Write a code point directly from a single-byte difference. */ |
|
1240 c=prev+(c-BOCU1_MIDDLE); |
|
1241 if(c<0x3000) { |
|
1242 *target++=(UChar)c; |
|
1243 prev=BOCU1_SIMPLE_PREV(c); |
|
1244 goto fastSingle; |
|
1245 } |
|
1246 } else if(c<=0x20) { |
|
1247 /* |
|
1248 * Direct-encoded C0 control code or space. |
|
1249 * Reset prev for C0 control codes but not for space. |
|
1250 */ |
|
1251 if(c!=0x20) { |
|
1252 prev=BOCU1_ASCII_PREV; |
|
1253 } |
|
1254 *target++=(UChar)c; |
|
1255 continue; |
|
1256 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { |
|
1257 /* Optimize two-byte case. */ |
|
1258 if(c>=BOCU1_MIDDLE) { |
|
1259 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; |
|
1260 } else { |
|
1261 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; |
|
1262 } |
|
1263 |
|
1264 /* trail byte */ |
|
1265 c=decodeBocu1TrailByte(1, *source++); |
|
1266 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { |
|
1267 bytes[0]=source[-2]; |
|
1268 bytes[1]=source[-1]; |
|
1269 byteIndex=2; |
|
1270 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1271 break; |
|
1272 } |
|
1273 } else if(c==BOCU1_RESET) { |
|
1274 /* only reset the state, no code point */ |
|
1275 prev=BOCU1_ASCII_PREV; |
|
1276 continue; |
|
1277 } else { |
|
1278 /* |
|
1279 * For multi-byte difference lead bytes, set the decoder state |
|
1280 * with the partial difference value from the lead byte and |
|
1281 * with the number of trail bytes. |
|
1282 */ |
|
1283 bytes[0]=(uint8_t)c; |
|
1284 byteIndex=1; |
|
1285 |
|
1286 diff=decodeBocu1LeadByte(c); |
|
1287 count=diff&3; |
|
1288 diff>>=2; |
|
1289 getTrail: |
|
1290 for(;;) { |
|
1291 if(source>=sourceLimit) { |
|
1292 goto endloop; |
|
1293 } |
|
1294 c=bytes[byteIndex++]=*source++; |
|
1295 |
|
1296 /* trail byte in any position */ |
|
1297 c=decodeBocu1TrailByte(count, c); |
|
1298 if(c<0) { |
|
1299 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1300 goto endloop; |
|
1301 } |
|
1302 |
|
1303 diff+=c; |
|
1304 if(--count==0) { |
|
1305 /* final trail byte, deliver a code point */ |
|
1306 byteIndex=0; |
|
1307 c=prev+diff; |
|
1308 if((uint32_t)c>0x10ffff) { |
|
1309 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1310 goto endloop; |
|
1311 } |
|
1312 break; |
|
1313 } |
|
1314 } |
|
1315 } |
|
1316 |
|
1317 /* calculate the next prev and output c */ |
|
1318 prev=BOCU1_PREV(c); |
|
1319 if(c<=0xffff) { |
|
1320 *target++=(UChar)c; |
|
1321 } else { |
|
1322 /* output surrogate pair */ |
|
1323 *target++=U16_LEAD(c); |
|
1324 if(target<targetLimit) { |
|
1325 *target++=U16_TRAIL(c); |
|
1326 } else { |
|
1327 /* target overflow */ |
|
1328 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); |
|
1329 cnv->UCharErrorBufferLength=1; |
|
1330 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1331 break; |
|
1332 } |
|
1333 } |
|
1334 } |
|
1335 endloop: |
|
1336 |
|
1337 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { |
|
1338 /* set the converter state in UConverter to deal with the next character */ |
|
1339 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; |
|
1340 cnv->mode=0; |
|
1341 } else { |
|
1342 /* set the converter state back into UConverter */ |
|
1343 cnv->toUnicodeStatus=(uint32_t)prev; |
|
1344 cnv->mode=(diff<<2)|count; |
|
1345 } |
|
1346 cnv->toULength=byteIndex; |
|
1347 |
|
1348 /* write back the updated pointers */ |
|
1349 pArgs->source=(const char *)source; |
|
1350 pArgs->target=target; |
|
1351 return; |
|
1352 } |
|
1353 |
|
1354 /* miscellaneous ------------------------------------------------------------ */ |
|
1355 |
|
1356 static const UConverterImpl _Bocu1Impl={ |
|
1357 UCNV_BOCU1, |
|
1358 |
|
1359 NULL, |
|
1360 NULL, |
|
1361 |
|
1362 NULL, |
|
1363 NULL, |
|
1364 NULL, |
|
1365 |
|
1366 _Bocu1ToUnicode, |
|
1367 _Bocu1ToUnicodeWithOffsets, |
|
1368 _Bocu1FromUnicode, |
|
1369 _Bocu1FromUnicodeWithOffsets, |
|
1370 NULL, |
|
1371 |
|
1372 NULL, |
|
1373 NULL, |
|
1374 NULL, |
|
1375 NULL, |
|
1376 ucnv_getCompleteUnicodeSet, |
|
1377 |
|
1378 NULL, |
|
1379 NULL |
|
1380 }; |
|
1381 |
|
1382 static const UConverterStaticData _Bocu1StaticData={ |
|
1383 sizeof(UConverterStaticData), |
|
1384 "BOCU-1", |
|
1385 1214, /* CCSID for BOCU-1 */ |
|
1386 UCNV_IBM, UCNV_BOCU1, |
|
1387 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ |
|
1388 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ |
|
1389 FALSE, FALSE, |
|
1390 0, |
|
1391 0, |
|
1392 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
1393 }; |
|
1394 |
|
1395 const UConverterSharedData _Bocu1Data={ |
|
1396 sizeof(UConverterSharedData), ~((uint32_t)0), |
|
1397 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, |
|
1398 0, |
|
1399 UCNV_MBCS_TABLE_INITIALIZER |
|
1400 }; |
|
1401 |
|
1402 #endif |