|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 2000-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: ucnvscsu.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2000nov18 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * This is an implementation of the Standard Compression Scheme for Unicode |
|
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ . |
|
18 * Reserved commands and window settings are treated as illegal sequences and |
|
19 * will result in callback calls. |
|
20 */ |
|
21 |
|
22 #include "unicode/utypes.h" |
|
23 |
|
24 #if !UCONFIG_NO_CONVERSION |
|
25 |
|
26 #include "unicode/ucnv.h" |
|
27 #include "unicode/ucnv_cb.h" |
|
28 #include "unicode/utf16.h" |
|
29 #include "ucnv_bld.h" |
|
30 #include "ucnv_cnv.h" |
|
31 #include "cmemory.h" |
|
32 |
|
33 /* SCSU definitions --------------------------------------------------------- */ |
|
34 |
|
/*
 * SCSU tag (command) byte values.
 * The S* tags are recognized in single-byte mode, the U* tags in Unicode mode.
 * For tags that come in groups of eight, only the first and last are named.
 */
enum {
    /* single-byte mode */
    SQ0=0x01,   /* quote from window pair 0 */
    SQ7=0x08,   /* quote from window pair 7 */
    SDX=0x0b,   /* define a window as extended */
    Srs=0x0c,   /* reserved */
    SQU=0x0e,   /* quote a single Unicode character */
    SCU=0x0f,   /* change to Unicode mode */
    SC0=0x10,   /* select window 0 */
    SC7=0x17,   /* select window 7 */
    SD0=0x18,   /* define and select window 0 */
    SD7=0x1f,   /* define and select window 7 */

    /* Unicode mode */
    UC0=0xe0,   /* select window 0 */
    UC7=0xe7,   /* select window 7 */
    UD0=0xe8,   /* define and select window 0 */
    UD7=0xef,   /* define and select window 7 */
    UQU=0xf0,   /* quote a single Unicode character */
    UDX=0xf1,   /* define a window as extended */
    Urs=0xf2    /* reserved */
};
|
56 |
|
/* thresholds for interpreting the one-byte argument of a "define window" command */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window because no short-run alphabets are found there.
     * Argument values from gapThreshold on therefore get gapOffset added.
     */
    gapThreshold=0x68,
    gapOffset=0xac00,

    /* argument values from reservedStart up to (excluding) fixedThreshold are reserved */
    reservedStart=0xa8,

    /* argument values from fixedThreshold on select an entry of the fixed-offsets table */
    fixedThreshold=0xf9
};
|
72 |
|
/* start offsets of the 8 static windows; these never change */
static const uint32_t staticOffsets[8]={
    0x0000u,    /* ASCII for quoted tags */
    0x0080u,    /* Latin-1 Supplement (for access to punctuation) */
    0x0100u,    /* Latin Extended-A */
    0x0300u,    /* Combining Diacritical Marks */
    0x2000u,    /* General Punctuation */
    0x2080u,    /* Currency Symbols */
    0x2100u,    /* Letterlike Symbols and Number Forms */
    0x3000u     /* CJK Symbols and Punctuation */
};
|
84 |
|
/* start offsets that the 8 dynamic (sliding) windows have after a reset */
static const uint32_t initialDynamicOffsets[8]={
    0x0080u,    /* Latin-1 */
    0x00c0u,    /* Latin Extended A */
    0x0400u,    /* Cyrillic */
    0x0600u,    /* Arabic */
    0x0900u,    /* Devanagari */
    0x3040u,    /* Hiragana */
    0x30a0u,    /* Katakana */
    0xff00u     /* Fullwidth ASCII */
};
|
96 |
|
/*
 * Predefined window offsets, indexed by (argument byte - 0xF9),
 * i.e. entry 0 belongs to argument 0xF9 and entry 6 to argument 0xFF.
 */
static const uint32_t fixedOffsets[7]={
    0x00c0u,    /* 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250u,    /* 0xFA: IPA extensions */
    0x0370u,    /* 0xFB: Greek */
    0x0530u,    /* 0xFC: Armenian */
    0x3040u,    /* 0xFD: Hiragana */
    0x30a0u,    /* 0xFE: Katakana */
    0xff60u     /* 0xFF: Halfwidth Katakana */
};
|
107 |
|
/* states of the SCSU-to-Unicode state machine */
enum {
    readCommand=0,      /* not inside a multi-byte sequence */
    quotePairOne=1,     /* expecting the first byte of a quoted 16-bit unit */
    quotePairTwo=2,     /* expecting the second byte of a 16-bit unit */
    quoteOne=3,         /* expecting the single byte after an SQn tag */
    definePairOne=4,    /* expecting the first argument byte of SDX/UDX */
    definePairTwo=5,    /* expecting the second argument byte of SDX/UDX */
    defineOne=6         /* expecting the argument byte of SDn/UDn */
};
|
118 |
|
119 typedef struct SCSUData { |
|
120 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ |
|
121 uint32_t toUDynamicOffsets[8]; |
|
122 uint32_t fromUDynamicOffsets[8]; |
|
123 |
|
124 /* state machine state - toUnicode */ |
|
125 UBool toUIsSingleByteMode; |
|
126 uint8_t toUState; |
|
127 int8_t toUQuoteWindow, toUDynamicWindow; |
|
128 uint8_t toUByteOne; |
|
129 uint8_t toUPadding[3]; |
|
130 |
|
131 /* state machine state - fromUnicode */ |
|
132 UBool fromUIsSingleByteMode; |
|
133 int8_t fromUDynamicWindow; |
|
134 |
|
135 /* |
|
136 * windowUse[] keeps track of the use of the dynamic windows: |
|
137 * At nextWindowUseIndex there is the least recently used window, |
|
138 * and the following windows (in a wrapping manner) are more and more |
|
139 * recently used. |
|
140 * At nextWindowUseIndex-1 there is the most recently used window. |
|
141 */ |
|
142 uint8_t locale; |
|
143 int8_t nextWindowUseIndex; |
|
144 int8_t windowUse[8]; |
|
145 } SCSUData; |
|
146 |
|
/* initial contents of SCSUData.windowUse[]: generic order, and the order used for locale l_ja */
static const int8_t initialWindowUse[8]={
    7, 0, 3, 2, 4, 5, 6, 1
};
static const int8_t initialWindowUse_ja[8]={
    3, 2, 4, 1, 0, 7, 5, 6
};
|
149 |
|
/* values for SCSUData.locale */
enum {
    lGeneric=0,     /* any locale other than Japanese */
    l_ja=1          /* locale "ja" or "ja_..." */
};
|
153 |
|
154 /* SCSU setup functions ----------------------------------------------------- */ |
|
155 |
|
156 static void |
|
157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { |
|
158 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
|
159 |
|
160 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
161 /* reset toUnicode */ |
|
162 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); |
|
163 |
|
164 scsu->toUIsSingleByteMode=TRUE; |
|
165 scsu->toUState=readCommand; |
|
166 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; |
|
167 scsu->toUByteOne=0; |
|
168 |
|
169 cnv->toULength=0; |
|
170 } |
|
171 if(choice!=UCNV_RESET_TO_UNICODE) { |
|
172 /* reset fromUnicode */ |
|
173 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); |
|
174 |
|
175 scsu->fromUIsSingleByteMode=TRUE; |
|
176 scsu->fromUDynamicWindow=0; |
|
177 |
|
178 scsu->nextWindowUseIndex=0; |
|
179 switch(scsu->locale) { |
|
180 case l_ja: |
|
181 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); |
|
182 break; |
|
183 default: |
|
184 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); |
|
185 break; |
|
186 } |
|
187 |
|
188 cnv->fromUChar32=0; |
|
189 } |
|
190 } |
|
191 |
|
192 static void |
|
193 _SCSUOpen(UConverter *cnv, |
|
194 UConverterLoadArgs *pArgs, |
|
195 UErrorCode *pErrorCode) { |
|
196 const char *locale=pArgs->locale; |
|
197 if(pArgs->onlyTestIsLoadable) { |
|
198 return; |
|
199 } |
|
200 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); |
|
201 if(cnv->extraInfo!=NULL) { |
|
202 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { |
|
203 ((SCSUData *)cnv->extraInfo)->locale=l_ja; |
|
204 } else { |
|
205 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; |
|
206 } |
|
207 _SCSUReset(cnv, UCNV_RESET_BOTH); |
|
208 } else { |
|
209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
210 } |
|
211 |
|
212 /* Set the substitution character U+fffd as a Unicode string. */ |
|
213 cnv->subUChars[0]=0xfffd; |
|
214 cnv->subCharLen=-1; |
|
215 } |
|
216 |
|
217 static void |
|
218 _SCSUClose(UConverter *cnv) { |
|
219 if(cnv->extraInfo!=NULL) { |
|
220 if(!cnv->isExtraLocal) { |
|
221 uprv_free(cnv->extraInfo); |
|
222 } |
|
223 cnv->extraInfo=NULL; |
|
224 } |
|
225 } |
|
226 |
|
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */ |
|
228 |
|
229 static void |
|
230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
231 UErrorCode *pErrorCode) { |
|
232 UConverter *cnv; |
|
233 SCSUData *scsu; |
|
234 const uint8_t *source, *sourceLimit; |
|
235 UChar *target; |
|
236 const UChar *targetLimit; |
|
237 int32_t *offsets; |
|
238 UBool isSingleByteMode; |
|
239 uint8_t state, byteOne; |
|
240 int8_t quoteWindow, dynamicWindow; |
|
241 |
|
242 int32_t sourceIndex, nextSourceIndex; |
|
243 |
|
244 uint8_t b; |
|
245 |
|
246 /* set up the local pointers */ |
|
247 cnv=pArgs->converter; |
|
248 scsu=(SCSUData *)cnv->extraInfo; |
|
249 |
|
250 source=(const uint8_t *)pArgs->source; |
|
251 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
252 target=pArgs->target; |
|
253 targetLimit=pArgs->targetLimit; |
|
254 offsets=pArgs->offsets; |
|
255 |
|
256 /* get the state machine state */ |
|
257 isSingleByteMode=scsu->toUIsSingleByteMode; |
|
258 state=scsu->toUState; |
|
259 quoteWindow=scsu->toUQuoteWindow; |
|
260 dynamicWindow=scsu->toUDynamicWindow; |
|
261 byteOne=scsu->toUByteOne; |
|
262 |
|
263 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
264 sourceIndex=state==readCommand ? 0 : -1; |
|
265 nextSourceIndex=0; |
|
266 |
|
267 /* |
|
268 * conversion "loop" |
|
269 * |
|
270 * For performance, this is not a normal C loop. |
|
271 * Instead, there are two code blocks for the two SCSU modes. |
|
272 * The function branches to either one, and a change of the mode is done with a goto to |
|
273 * the other branch. |
|
274 * |
|
275 * Each branch has two conventional loops: |
|
276 * - a fast-path loop for the most common codes in the mode |
|
277 * - a loop for all other codes in the mode |
|
278 * When the fast-path runs into a code that it cannot handle, its loop ends and it |
|
279 * runs into the following loop to handle the other codes. |
|
280 * The end of the input or output buffer is also handled by the slower loop. |
|
281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
|
282 * |
|
283 * The callback handling is done by returning with an error code. |
|
284 * The conversion framework actually calls the callback function. |
|
285 */ |
|
286 if(isSingleByteMode) { |
|
287 /* fast path for single-byte mode */ |
|
288 if(state==readCommand) { |
|
289 fastSingle: |
|
290 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { |
|
291 ++source; |
|
292 ++nextSourceIndex; |
|
293 if(b<=0x7f) { |
|
294 /* write US-ASCII graphic character or DEL */ |
|
295 *target++=(UChar)b; |
|
296 if(offsets!=NULL) { |
|
297 *offsets++=sourceIndex; |
|
298 } |
|
299 } else { |
|
300 /* write from dynamic window */ |
|
301 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
|
302 if(c<=0xffff) { |
|
303 *target++=(UChar)c; |
|
304 if(offsets!=NULL) { |
|
305 *offsets++=sourceIndex; |
|
306 } |
|
307 } else { |
|
308 /* output surrogate pair */ |
|
309 *target++=(UChar)(0xd7c0+(c>>10)); |
|
310 if(target<targetLimit) { |
|
311 *target++=(UChar)(0xdc00|(c&0x3ff)); |
|
312 if(offsets!=NULL) { |
|
313 *offsets++=sourceIndex; |
|
314 *offsets++=sourceIndex; |
|
315 } |
|
316 } else { |
|
317 /* target overflow */ |
|
318 if(offsets!=NULL) { |
|
319 *offsets++=sourceIndex; |
|
320 } |
|
321 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
|
322 cnv->UCharErrorBufferLength=1; |
|
323 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
324 goto endloop; |
|
325 } |
|
326 } |
|
327 } |
|
328 sourceIndex=nextSourceIndex; |
|
329 } |
|
330 } |
|
331 |
|
332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
|
333 singleByteMode: |
|
334 while(source<sourceLimit) { |
|
335 if(target>=targetLimit) { |
|
336 /* target is full */ |
|
337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
338 break; |
|
339 } |
|
340 b=*source++; |
|
341 ++nextSourceIndex; |
|
342 switch(state) { |
|
343 case readCommand: |
|
344 /* redundant conditions are commented out */ |
|
345 /* here: b<0x20 because otherwise we would be in fastSingle */ |
|
346 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
|
347 /* CR/LF/TAB/NUL */ |
|
348 *target++=(UChar)b; |
|
349 if(offsets!=NULL) { |
|
350 *offsets++=sourceIndex; |
|
351 } |
|
352 sourceIndex=nextSourceIndex; |
|
353 goto fastSingle; |
|
354 } else if(SC0<=b) { |
|
355 if(b<=SC7) { |
|
356 dynamicWindow=(int8_t)(b-SC0); |
|
357 sourceIndex=nextSourceIndex; |
|
358 goto fastSingle; |
|
359 } else /* if(SD0<=b && b<=SD7) */ { |
|
360 dynamicWindow=(int8_t)(b-SD0); |
|
361 state=defineOne; |
|
362 } |
|
363 } else if(/* SQ0<=b && */ b<=SQ7) { |
|
364 quoteWindow=(int8_t)(b-SQ0); |
|
365 state=quoteOne; |
|
366 } else if(b==SDX) { |
|
367 state=definePairOne; |
|
368 } else if(b==SQU) { |
|
369 state=quotePairOne; |
|
370 } else if(b==SCU) { |
|
371 sourceIndex=nextSourceIndex; |
|
372 isSingleByteMode=FALSE; |
|
373 goto fastUnicode; |
|
374 } else /* Srs */ { |
|
375 /* callback(illegal) */ |
|
376 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
377 cnv->toUBytes[0]=b; |
|
378 cnv->toULength=1; |
|
379 goto endloop; |
|
380 } |
|
381 |
|
382 /* store the first byte of a multibyte sequence in toUBytes[] */ |
|
383 cnv->toUBytes[0]=b; |
|
384 cnv->toULength=1; |
|
385 break; |
|
386 case quotePairOne: |
|
387 byteOne=b; |
|
388 cnv->toUBytes[1]=b; |
|
389 cnv->toULength=2; |
|
390 state=quotePairTwo; |
|
391 break; |
|
392 case quotePairTwo: |
|
393 *target++=(UChar)((byteOne<<8)|b); |
|
394 if(offsets!=NULL) { |
|
395 *offsets++=sourceIndex; |
|
396 } |
|
397 sourceIndex=nextSourceIndex; |
|
398 state=readCommand; |
|
399 goto fastSingle; |
|
400 case quoteOne: |
|
401 if(b<0x80) { |
|
402 /* all static offsets are in the BMP */ |
|
403 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
|
404 if(offsets!=NULL) { |
|
405 *offsets++=sourceIndex; |
|
406 } |
|
407 } else { |
|
408 /* write from dynamic window */ |
|
409 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
|
410 if(c<=0xffff) { |
|
411 *target++=(UChar)c; |
|
412 if(offsets!=NULL) { |
|
413 *offsets++=sourceIndex; |
|
414 } |
|
415 } else { |
|
416 /* output surrogate pair */ |
|
417 *target++=(UChar)(0xd7c0+(c>>10)); |
|
418 if(target<targetLimit) { |
|
419 *target++=(UChar)(0xdc00|(c&0x3ff)); |
|
420 if(offsets!=NULL) { |
|
421 *offsets++=sourceIndex; |
|
422 *offsets++=sourceIndex; |
|
423 } |
|
424 } else { |
|
425 /* target overflow */ |
|
426 if(offsets!=NULL) { |
|
427 *offsets++=sourceIndex; |
|
428 } |
|
429 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
|
430 cnv->UCharErrorBufferLength=1; |
|
431 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
432 goto endloop; |
|
433 } |
|
434 } |
|
435 } |
|
436 sourceIndex=nextSourceIndex; |
|
437 state=readCommand; |
|
438 goto fastSingle; |
|
439 case definePairOne: |
|
440 dynamicWindow=(int8_t)((b>>5)&7); |
|
441 byteOne=(uint8_t)(b&0x1f); |
|
442 cnv->toUBytes[1]=b; |
|
443 cnv->toULength=2; |
|
444 state=definePairTwo; |
|
445 break; |
|
446 case definePairTwo: |
|
447 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); |
|
448 sourceIndex=nextSourceIndex; |
|
449 state=readCommand; |
|
450 goto fastSingle; |
|
451 case defineOne: |
|
452 if(b==0) { |
|
453 /* callback(illegal): Reserved window offset value 0 */ |
|
454 cnv->toUBytes[1]=b; |
|
455 cnv->toULength=2; |
|
456 goto endloop; |
|
457 } else if(b<gapThreshold) { |
|
458 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
|
459 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
|
460 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
|
461 } else if(b>=fixedThreshold) { |
|
462 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
|
463 } else { |
|
464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ |
|
465 cnv->toUBytes[1]=b; |
|
466 cnv->toULength=2; |
|
467 goto endloop; |
|
468 } |
|
469 sourceIndex=nextSourceIndex; |
|
470 state=readCommand; |
|
471 goto fastSingle; |
|
472 } |
|
473 } |
|
474 } else { |
|
475 /* fast path for Unicode mode */ |
|
476 if(state==readCommand) { |
|
477 fastUnicode: |
|
478 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { |
|
479 *target++=(UChar)((b<<8)|source[1]); |
|
480 if(offsets!=NULL) { |
|
481 *offsets++=sourceIndex; |
|
482 } |
|
483 sourceIndex=nextSourceIndex; |
|
484 nextSourceIndex+=2; |
|
485 source+=2; |
|
486 } |
|
487 } |
|
488 |
|
489 /* normal state machine for Unicode mode */ |
|
490 /* unicodeByteMode: */ |
|
491 while(source<sourceLimit) { |
|
492 if(target>=targetLimit) { |
|
493 /* target is full */ |
|
494 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
495 break; |
|
496 } |
|
497 b=*source++; |
|
498 ++nextSourceIndex; |
|
499 switch(state) { |
|
500 case readCommand: |
|
501 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
|
502 byteOne=b; |
|
503 cnv->toUBytes[0]=b; |
|
504 cnv->toULength=1; |
|
505 state=quotePairTwo; |
|
506 } else if(/* UC0<=b && */ b<=UC7) { |
|
507 dynamicWindow=(int8_t)(b-UC0); |
|
508 sourceIndex=nextSourceIndex; |
|
509 isSingleByteMode=TRUE; |
|
510 goto fastSingle; |
|
511 } else if(/* UD0<=b && */ b<=UD7) { |
|
512 dynamicWindow=(int8_t)(b-UD0); |
|
513 isSingleByteMode=TRUE; |
|
514 cnv->toUBytes[0]=b; |
|
515 cnv->toULength=1; |
|
516 state=defineOne; |
|
517 goto singleByteMode; |
|
518 } else if(b==UDX) { |
|
519 isSingleByteMode=TRUE; |
|
520 cnv->toUBytes[0]=b; |
|
521 cnv->toULength=1; |
|
522 state=definePairOne; |
|
523 goto singleByteMode; |
|
524 } else if(b==UQU) { |
|
525 cnv->toUBytes[0]=b; |
|
526 cnv->toULength=1; |
|
527 state=quotePairOne; |
|
528 } else /* Urs */ { |
|
529 /* callback(illegal) */ |
|
530 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
531 cnv->toUBytes[0]=b; |
|
532 cnv->toULength=1; |
|
533 goto endloop; |
|
534 } |
|
535 break; |
|
536 case quotePairOne: |
|
537 byteOne=b; |
|
538 cnv->toUBytes[1]=b; |
|
539 cnv->toULength=2; |
|
540 state=quotePairTwo; |
|
541 break; |
|
542 case quotePairTwo: |
|
543 *target++=(UChar)((byteOne<<8)|b); |
|
544 if(offsets!=NULL) { |
|
545 *offsets++=sourceIndex; |
|
546 } |
|
547 sourceIndex=nextSourceIndex; |
|
548 state=readCommand; |
|
549 goto fastUnicode; |
|
550 } |
|
551 } |
|
552 } |
|
553 endloop: |
|
554 |
|
555 /* set the converter state back into UConverter */ |
|
556 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
|
557 /* reset to deal with the next character */ |
|
558 state=readCommand; |
|
559 } else if(state==readCommand) { |
|
560 /* not in a multi-byte sequence, reset toULength */ |
|
561 cnv->toULength=0; |
|
562 } |
|
563 scsu->toUIsSingleByteMode=isSingleByteMode; |
|
564 scsu->toUState=state; |
|
565 scsu->toUQuoteWindow=quoteWindow; |
|
566 scsu->toUDynamicWindow=dynamicWindow; |
|
567 scsu->toUByteOne=byteOne; |
|
568 |
|
569 /* write back the updated pointers */ |
|
570 pArgs->source=(const char *)source; |
|
571 pArgs->target=target; |
|
572 pArgs->offsets=offsets; |
|
573 return; |
|
574 } |
|
575 |
|
576 /* |
|
577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. |
|
578 * If a change is made in the original function, then either |
|
579 * change this function the same way or |
|
580 * re-copy the original function and remove the variables |
|
581 * offsets, sourceIndex, and nextSourceIndex. |
|
582 */ |
|
583 static void |
|
584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, |
|
585 UErrorCode *pErrorCode) { |
|
586 UConverter *cnv; |
|
587 SCSUData *scsu; |
|
588 const uint8_t *source, *sourceLimit; |
|
589 UChar *target; |
|
590 const UChar *targetLimit; |
|
591 UBool isSingleByteMode; |
|
592 uint8_t state, byteOne; |
|
593 int8_t quoteWindow, dynamicWindow; |
|
594 |
|
595 uint8_t b; |
|
596 |
|
597 /* set up the local pointers */ |
|
598 cnv=pArgs->converter; |
|
599 scsu=(SCSUData *)cnv->extraInfo; |
|
600 |
|
601 source=(const uint8_t *)pArgs->source; |
|
602 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
603 target=pArgs->target; |
|
604 targetLimit=pArgs->targetLimit; |
|
605 |
|
606 /* get the state machine state */ |
|
607 isSingleByteMode=scsu->toUIsSingleByteMode; |
|
608 state=scsu->toUState; |
|
609 quoteWindow=scsu->toUQuoteWindow; |
|
610 dynamicWindow=scsu->toUDynamicWindow; |
|
611 byteOne=scsu->toUByteOne; |
|
612 |
|
613 /* |
|
614 * conversion "loop" |
|
615 * |
|
616 * For performance, this is not a normal C loop. |
|
617 * Instead, there are two code blocks for the two SCSU modes. |
|
618 * The function branches to either one, and a change of the mode is done with a goto to |
|
619 * the other branch. |
|
620 * |
|
621 * Each branch has two conventional loops: |
|
622 * - a fast-path loop for the most common codes in the mode |
|
623 * - a loop for all other codes in the mode |
|
624 * When the fast-path runs into a code that it cannot handle, its loop ends and it |
|
625 * runs into the following loop to handle the other codes. |
|
626 * The end of the input or output buffer is also handled by the slower loop. |
|
627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
|
628 * |
|
629 * The callback handling is done by returning with an error code. |
|
630 * The conversion framework actually calls the callback function. |
|
631 */ |
|
632 if(isSingleByteMode) { |
|
633 /* fast path for single-byte mode */ |
|
634 if(state==readCommand) { |
|
635 fastSingle: |
|
636 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { |
|
637 ++source; |
|
638 if(b<=0x7f) { |
|
639 /* write US-ASCII graphic character or DEL */ |
|
640 *target++=(UChar)b; |
|
641 } else { |
|
642 /* write from dynamic window */ |
|
643 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
|
644 if(c<=0xffff) { |
|
645 *target++=(UChar)c; |
|
646 } else { |
|
647 /* output surrogate pair */ |
|
648 *target++=(UChar)(0xd7c0+(c>>10)); |
|
649 if(target<targetLimit) { |
|
650 *target++=(UChar)(0xdc00|(c&0x3ff)); |
|
651 } else { |
|
652 /* target overflow */ |
|
653 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
|
654 cnv->UCharErrorBufferLength=1; |
|
655 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
656 goto endloop; |
|
657 } |
|
658 } |
|
659 } |
|
660 } |
|
661 } |
|
662 |
|
663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
|
664 singleByteMode: |
|
665 while(source<sourceLimit) { |
|
666 if(target>=targetLimit) { |
|
667 /* target is full */ |
|
668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
669 break; |
|
670 } |
|
671 b=*source++; |
|
672 switch(state) { |
|
673 case readCommand: |
|
674 /* redundant conditions are commented out */ |
|
675 /* here: b<0x20 because otherwise we would be in fastSingle */ |
|
676 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
|
677 /* CR/LF/TAB/NUL */ |
|
678 *target++=(UChar)b; |
|
679 goto fastSingle; |
|
680 } else if(SC0<=b) { |
|
681 if(b<=SC7) { |
|
682 dynamicWindow=(int8_t)(b-SC0); |
|
683 goto fastSingle; |
|
684 } else /* if(SD0<=b && b<=SD7) */ { |
|
685 dynamicWindow=(int8_t)(b-SD0); |
|
686 state=defineOne; |
|
687 } |
|
688 } else if(/* SQ0<=b && */ b<=SQ7) { |
|
689 quoteWindow=(int8_t)(b-SQ0); |
|
690 state=quoteOne; |
|
691 } else if(b==SDX) { |
|
692 state=definePairOne; |
|
693 } else if(b==SQU) { |
|
694 state=quotePairOne; |
|
695 } else if(b==SCU) { |
|
696 isSingleByteMode=FALSE; |
|
697 goto fastUnicode; |
|
698 } else /* Srs */ { |
|
699 /* callback(illegal) */ |
|
700 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
701 cnv->toUBytes[0]=b; |
|
702 cnv->toULength=1; |
|
703 goto endloop; |
|
704 } |
|
705 |
|
706 /* store the first byte of a multibyte sequence in toUBytes[] */ |
|
707 cnv->toUBytes[0]=b; |
|
708 cnv->toULength=1; |
|
709 break; |
|
710 case quotePairOne: |
|
711 byteOne=b; |
|
712 cnv->toUBytes[1]=b; |
|
713 cnv->toULength=2; |
|
714 state=quotePairTwo; |
|
715 break; |
|
716 case quotePairTwo: |
|
717 *target++=(UChar)((byteOne<<8)|b); |
|
718 state=readCommand; |
|
719 goto fastSingle; |
|
720 case quoteOne: |
|
721 if(b<0x80) { |
|
722 /* all static offsets are in the BMP */ |
|
723 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
|
724 } else { |
|
725 /* write from dynamic window */ |
|
726 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
|
727 if(c<=0xffff) { |
|
728 *target++=(UChar)c; |
|
729 } else { |
|
730 /* output surrogate pair */ |
|
731 *target++=(UChar)(0xd7c0+(c>>10)); |
|
732 if(target<targetLimit) { |
|
733 *target++=(UChar)(0xdc00|(c&0x3ff)); |
|
734 } else { |
|
735 /* target overflow */ |
|
736 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
|
737 cnv->UCharErrorBufferLength=1; |
|
738 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
739 goto endloop; |
|
740 } |
|
741 } |
|
742 } |
|
743 state=readCommand; |
|
744 goto fastSingle; |
|
745 case definePairOne: |
|
746 dynamicWindow=(int8_t)((b>>5)&7); |
|
747 byteOne=(uint8_t)(b&0x1f); |
|
748 cnv->toUBytes[1]=b; |
|
749 cnv->toULength=2; |
|
750 state=definePairTwo; |
|
751 break; |
|
752 case definePairTwo: |
|
753 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); |
|
754 state=readCommand; |
|
755 goto fastSingle; |
|
756 case defineOne: |
|
757 if(b==0) { |
|
758 /* callback(illegal): Reserved window offset value 0 */ |
|
759 cnv->toUBytes[1]=b; |
|
760 cnv->toULength=2; |
|
761 goto endloop; |
|
762 } else if(b<gapThreshold) { |
|
763 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
|
764 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
|
765 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
|
766 } else if(b>=fixedThreshold) { |
|
767 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
|
768 } else { |
|
769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ |
|
770 cnv->toUBytes[1]=b; |
|
771 cnv->toULength=2; |
|
772 goto endloop; |
|
773 } |
|
774 state=readCommand; |
|
775 goto fastSingle; |
|
776 } |
|
777 } |
|
778 } else { |
|
779 /* fast path for Unicode mode */ |
|
780 if(state==readCommand) { |
|
781 fastUnicode: |
|
782 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { |
|
783 *target++=(UChar)((b<<8)|source[1]); |
|
784 source+=2; |
|
785 } |
|
786 } |
|
787 |
|
788 /* normal state machine for Unicode mode */ |
|
789 /* unicodeByteMode: */ |
|
790 while(source<sourceLimit) { |
|
791 if(target>=targetLimit) { |
|
792 /* target is full */ |
|
793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
794 break; |
|
795 } |
|
796 b=*source++; |
|
797 switch(state) { |
|
798 case readCommand: |
|
799 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
|
800 byteOne=b; |
|
801 cnv->toUBytes[0]=b; |
|
802 cnv->toULength=1; |
|
803 state=quotePairTwo; |
|
804 } else if(/* UC0<=b && */ b<=UC7) { |
|
805 dynamicWindow=(int8_t)(b-UC0); |
|
806 isSingleByteMode=TRUE; |
|
807 goto fastSingle; |
|
808 } else if(/* UD0<=b && */ b<=UD7) { |
|
809 dynamicWindow=(int8_t)(b-UD0); |
|
810 isSingleByteMode=TRUE; |
|
811 cnv->toUBytes[0]=b; |
|
812 cnv->toULength=1; |
|
813 state=defineOne; |
|
814 goto singleByteMode; |
|
815 } else if(b==UDX) { |
|
816 isSingleByteMode=TRUE; |
|
817 cnv->toUBytes[0]=b; |
|
818 cnv->toULength=1; |
|
819 state=definePairOne; |
|
820 goto singleByteMode; |
|
821 } else if(b==UQU) { |
|
822 cnv->toUBytes[0]=b; |
|
823 cnv->toULength=1; |
|
824 state=quotePairOne; |
|
825 } else /* Urs */ { |
|
826 /* callback(illegal) */ |
|
827 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
828 cnv->toUBytes[0]=b; |
|
829 cnv->toULength=1; |
|
830 goto endloop; |
|
831 } |
|
832 break; |
|
833 case quotePairOne: |
|
834 byteOne=b; |
|
835 cnv->toUBytes[1]=b; |
|
836 cnv->toULength=2; |
|
837 state=quotePairTwo; |
|
838 break; |
|
839 case quotePairTwo: |
|
840 *target++=(UChar)((byteOne<<8)|b); |
|
841 state=readCommand; |
|
842 goto fastUnicode; |
|
843 } |
|
844 } |
|
845 } |
|
846 endloop: |
|
847 |
|
848 /* set the converter state back into UConverter */ |
|
849 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
|
850 /* reset to deal with the next character */ |
|
851 state=readCommand; |
|
852 } else if(state==readCommand) { |
|
853 /* not in a multi-byte sequence, reset toULength */ |
|
854 cnv->toULength=0; |
|
855 } |
|
856 scsu->toUIsSingleByteMode=isSingleByteMode; |
|
857 scsu->toUState=state; |
|
858 scsu->toUQuoteWindow=quoteWindow; |
|
859 scsu->toUDynamicWindow=dynamicWindow; |
|
860 scsu->toUByteOne=byteOne; |
|
861 |
|
862 /* write back the updated pointers */ |
|
863 pArgs->source=(const char *)source; |
|
864 pArgs->target=target; |
|
865 return; |
|
866 } |
|
867 |
|
868 /* SCSU-from-Unicode conversion functions ----------------------------------- */ |
|
869 |
|
870 /* |
|
871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve |
|
872 * reasonable results. The lookahead is minimal. |
|
873 * Many cases are simple: |
|
874 * A character fits directly into the current mode, a dynamic or static window, |
|
875 * or is not compressible. These cases are tested first. |
|
876 * Real compression heuristics are applied to the rest, in code branches for |
|
877 * single/Unicode mode and BMP/supplementary code points. |
|
878 * The heuristics used here are extremely simple. |
|
879 */ |
|
880 |
|
/*
 * Find the first of the 8 windows that contains c, i.e. the first index i
 * with offsets[i]<=c<=offsets[i]+0x7f.
 * Returns the window number, or -1 if no window contains c.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t window=0;

    while(window<8) {
        /* unsigned subtraction: wraps to a large value if c is below the window start */
        uint32_t delta=c-offsets[window];
        if(delta<=0x7f) {
            return window;
        }
        ++window;
    }
    return -1;
}
|
892 |
|
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ |
|
894 static UBool |
|
895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { |
|
896 return (UBool)(c<=offset+0x7f && |
|
897 (c>=offset || (c<=0x7f && |
|
898 (c>=0x20 || (1UL<<c)&0x2601)))); |
|
899 /* binary 0010 0110 0000 0001, |
|
900 check for b==0xd || b==0xa || b==9 || b==0 */ |
|
901 } |
|
902 |
|
903 /* |
|
904 * getNextDynamicWindow returns the next dynamic window to be redefined |
|
905 */ |
|
906 static int8_t |
|
907 getNextDynamicWindow(SCSUData *scsu) { |
|
908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; |
|
909 if(++scsu->nextWindowUseIndex==8) { |
|
910 scsu->nextWindowUseIndex=0; |
|
911 } |
|
912 return window; |
|
913 } |
|
914 |
|
915 /* |
|
916 * useDynamicWindow() adjusts |
|
917 * windowUse[] and nextWindowUseIndex for the algorithm to choose |
|
918 * the next dynamic window to be defined; |
|
919 * a subclass may override it and provide its own algorithm. |
|
920 */ |
|
921 static void |
|
922 useDynamicWindow(SCSUData *scsu, int8_t window) { |
|
923 /* |
|
924 * move the existing window, which just became the most recently used one, |
|
925 * up in windowUse[] to nextWindowUseIndex-1 |
|
926 */ |
|
927 |
|
928 /* first, find the index of the window - backwards to favor the more recently used windows */ |
|
929 int i, j; |
|
930 |
|
931 i=scsu->nextWindowUseIndex; |
|
932 do { |
|
933 if(--i<0) { |
|
934 i=7; |
|
935 } |
|
936 } while(scsu->windowUse[i]!=window); |
|
937 |
|
938 /* now copy each windowUse[i+1] to [i] */ |
|
939 j=i+1; |
|
940 if(j==8) { |
|
941 j=0; |
|
942 } |
|
943 while(j!=scsu->nextWindowUseIndex) { |
|
944 scsu->windowUse[i]=scsu->windowUse[j]; |
|
945 i=j; |
|
946 if(++j==8) { j=0; } |
|
947 } |
|
948 |
|
949 /* finally, set the window into the most recently used index */ |
|
950 scsu->windowUse[i]=window; |
|
951 } |
|
952 |
|
953 /* |
|
954 * calculate the offset and the code for a dynamic window that contains the character |
|
955 * takes fixed offsets into account |
|
956 * the offset of the window is stored in the offset variable, |
|
957 * the code is returned |
|
958 * |
|
959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code |
|
960 */ |
|
961 static int |
|
962 getDynamicOffset(uint32_t c, uint32_t *pOffset) { |
|
963 int i; |
|
964 |
|
965 for(i=0; i<7; ++i) { |
|
966 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { |
|
967 *pOffset=fixedOffsets[i]; |
|
968 return 0xf9+i; |
|
969 } |
|
970 } |
|
971 |
|
972 if(c<0x80) { |
|
973 /* No dynamic window for US-ASCII. */ |
|
974 return -1; |
|
975 } else if(c<0x3400 || |
|
976 (uint32_t)(c-0x10000)<(0x14000-0x10000) || |
|
977 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) |
|
978 ) { |
|
979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ |
|
980 *pOffset=c&0x7fffff80; |
|
981 return (int)(c>>7); |
|
982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { |
|
983 /* For these characters we need to take the gapOffset into account. */ |
|
984 *pOffset=c&0x7fffff80; |
|
985 return (int)((c-gapOffset)>>7); |
|
986 } else { |
|
987 return -1; |
|
988 } |
|
989 } |
|
990 |
|
/*
 * Idea for better compression:
 * - Save SCSUData and other state before really starting work.
 * - At endloop, check whether the output would have been smaller using only Unicode mode.
 * - Do not do this if a callback has been called.
 * - If Unicode mode would be smaller, then override the results with it;
 *   this may need an SCU at the beginning.
 * - This requires different buffer handling!
 *
 * Drawback, or need for corrective handling:
 * It is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible,
 * not only for compression but also for HTML/XML documents with following
 * charset/encoding announcers.
 *
 * How to achieve both?
 * - Only replace the result after an SDX or SCU?
 */
|
1007 |
|
1008 static void |
|
1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
|
1010 UErrorCode *pErrorCode) { |
|
1011 UConverter *cnv; |
|
1012 SCSUData *scsu; |
|
1013 const UChar *source, *sourceLimit; |
|
1014 uint8_t *target; |
|
1015 int32_t targetCapacity; |
|
1016 int32_t *offsets; |
|
1017 |
|
1018 UBool isSingleByteMode; |
|
1019 uint8_t dynamicWindow; |
|
1020 uint32_t currentOffset; |
|
1021 |
|
1022 uint32_t c, delta; |
|
1023 |
|
1024 int32_t sourceIndex, nextSourceIndex; |
|
1025 |
|
1026 int32_t length; |
|
1027 |
|
1028 /* variables for compression heuristics */ |
|
1029 uint32_t offset; |
|
1030 UChar lead, trail; |
|
1031 int code; |
|
1032 int8_t window; |
|
1033 |
|
1034 /* set up the local pointers */ |
|
1035 cnv=pArgs->converter; |
|
1036 scsu=(SCSUData *)cnv->extraInfo; |
|
1037 |
|
1038 /* set up the local pointers */ |
|
1039 source=pArgs->source; |
|
1040 sourceLimit=pArgs->sourceLimit; |
|
1041 target=(uint8_t *)pArgs->target; |
|
1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
1043 offsets=pArgs->offsets; |
|
1044 |
|
1045 /* get the state machine state */ |
|
1046 isSingleByteMode=scsu->fromUIsSingleByteMode; |
|
1047 dynamicWindow=scsu->fromUDynamicWindow; |
|
1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1049 |
|
1050 c=cnv->fromUChar32; |
|
1051 |
|
1052 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
1053 sourceIndex= c==0 ? 0 : -1; |
|
1054 nextSourceIndex=0; |
|
1055 |
|
1056 /* similar conversion "loop" as in toUnicode */ |
|
1057 loop: |
|
1058 if(isSingleByteMode) { |
|
1059 if(c!=0 && targetCapacity>0) { |
|
1060 goto getTrailSingle; |
|
1061 } |
|
1062 |
|
1063 /* state machine for single-byte mode */ |
|
1064 /* singleByteMode: */ |
|
1065 while(source<sourceLimit) { |
|
1066 if(targetCapacity<=0) { |
|
1067 /* target is full */ |
|
1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1069 break; |
|
1070 } |
|
1071 c=*source++; |
|
1072 ++nextSourceIndex; |
|
1073 |
|
1074 if((c-0x20)<=0x5f) { |
|
1075 /* pass US-ASCII graphic character through */ |
|
1076 *target++=(uint8_t)c; |
|
1077 if(offsets!=NULL) { |
|
1078 *offsets++=sourceIndex; |
|
1079 } |
|
1080 --targetCapacity; |
|
1081 } else if(c<0x20) { |
|
1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
|
1083 /* CR/LF/TAB/NUL */ |
|
1084 *target++=(uint8_t)c; |
|
1085 if(offsets!=NULL) { |
|
1086 *offsets++=sourceIndex; |
|
1087 } |
|
1088 --targetCapacity; |
|
1089 } else { |
|
1090 /* quote C0 control character */ |
|
1091 c|=SQ0<<8; |
|
1092 length=2; |
|
1093 goto outputBytes; |
|
1094 } |
|
1095 } else if((delta=c-currentOffset)<=0x7f) { |
|
1096 /* use the current dynamic window */ |
|
1097 *target++=(uint8_t)(delta|0x80); |
|
1098 if(offsets!=NULL) { |
|
1099 *offsets++=sourceIndex; |
|
1100 } |
|
1101 --targetCapacity; |
|
1102 } else if(U16_IS_SURROGATE(c)) { |
|
1103 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1104 getTrailSingle: |
|
1105 lead=(UChar)c; |
|
1106 if(source<sourceLimit) { |
|
1107 /* test the following code unit */ |
|
1108 trail=*source; |
|
1109 if(U16_IS_TRAIL(trail)) { |
|
1110 ++source; |
|
1111 ++nextSourceIndex; |
|
1112 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
1113 /* convert this surrogate code point */ |
|
1114 /* exit this condition tree */ |
|
1115 } else { |
|
1116 /* this is an unmatched lead code unit (1st surrogate) */ |
|
1117 /* callback(illegal) */ |
|
1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1119 goto endloop; |
|
1120 } |
|
1121 } else { |
|
1122 /* no more input */ |
|
1123 break; |
|
1124 } |
|
1125 } else { |
|
1126 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
1127 /* callback(illegal) */ |
|
1128 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1129 goto endloop; |
|
1130 } |
|
1131 |
|
1132 /* compress supplementary character U+10000..U+10ffff */ |
|
1133 if((delta=c-currentOffset)<=0x7f) { |
|
1134 /* use the current dynamic window */ |
|
1135 *target++=(uint8_t)(delta|0x80); |
|
1136 if(offsets!=NULL) { |
|
1137 *offsets++=sourceIndex; |
|
1138 } |
|
1139 --targetCapacity; |
|
1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1141 /* there is a dynamic window that contains this character, change to it */ |
|
1142 dynamicWindow=window; |
|
1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1144 useDynamicWindow(scsu, dynamicWindow); |
|
1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1146 length=2; |
|
1147 goto outputBytes; |
|
1148 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1149 /* might check if there are more characters in this window to come */ |
|
1150 /* define an extended window with this character */ |
|
1151 code-=0x200; |
|
1152 dynamicWindow=getNextDynamicWindow(scsu); |
|
1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1154 useDynamicWindow(scsu, dynamicWindow); |
|
1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1156 length=4; |
|
1157 goto outputBytes; |
|
1158 } else { |
|
1159 /* change to Unicode mode and output this (lead, trail) pair */ |
|
1160 isSingleByteMode=FALSE; |
|
1161 *target++=(uint8_t)SCU; |
|
1162 if(offsets!=NULL) { |
|
1163 *offsets++=sourceIndex; |
|
1164 } |
|
1165 --targetCapacity; |
|
1166 c=((uint32_t)lead<<16)|trail; |
|
1167 length=4; |
|
1168 goto outputBytes; |
|
1169 } |
|
1170 } else if(c<0xa0) { |
|
1171 /* quote C1 control character */ |
|
1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
|
1173 length=2; |
|
1174 goto outputBytes; |
|
1175 } else if(c==0xfeff || c>=0xfff0) { |
|
1176 /* quote signature character=byte order mark and specials */ |
|
1177 c|=SQU<<16; |
|
1178 length=3; |
|
1179 goto outputBytes; |
|
1180 } else { |
|
1181 /* compress all other BMP characters */ |
|
1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1183 /* there is a window defined that contains this character - switch to it or quote from it? */ |
|
1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
|
1185 /* change to dynamic window */ |
|
1186 dynamicWindow=window; |
|
1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1188 useDynamicWindow(scsu, dynamicWindow); |
|
1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1190 length=2; |
|
1191 goto outputBytes; |
|
1192 } else { |
|
1193 /* quote from dynamic window */ |
|
1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
|
1195 length=2; |
|
1196 goto outputBytes; |
|
1197 } |
|
1198 } else if((window=getWindow(staticOffsets, c))>=0) { |
|
1199 /* quote from static window */ |
|
1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
|
1201 length=2; |
|
1202 goto outputBytes; |
|
1203 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1204 /* define a dynamic window with this character */ |
|
1205 dynamicWindow=getNextDynamicWindow(scsu); |
|
1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1207 useDynamicWindow(scsu, dynamicWindow); |
|
1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1209 length=3; |
|
1210 goto outputBytes; |
|
1211 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
|
1212 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
|
1213 ) { |
|
1214 /* |
|
1215 * this character is not compressible (a BMP ideograph or similar); |
|
1216 * switch to Unicode mode if this is the last character in the block |
|
1217 * or there is at least one more ideograph following immediately |
|
1218 */ |
|
1219 isSingleByteMode=FALSE; |
|
1220 c|=SCU<<16; |
|
1221 length=3; |
|
1222 goto outputBytes; |
|
1223 } else { |
|
1224 /* quote Unicode */ |
|
1225 c|=SQU<<16; |
|
1226 length=3; |
|
1227 goto outputBytes; |
|
1228 } |
|
1229 } |
|
1230 |
|
1231 /* normal end of conversion: prepare for a new character */ |
|
1232 c=0; |
|
1233 sourceIndex=nextSourceIndex; |
|
1234 } |
|
1235 } else { |
|
1236 if(c!=0 && targetCapacity>0) { |
|
1237 goto getTrailUnicode; |
|
1238 } |
|
1239 |
|
1240 /* state machine for Unicode mode */ |
|
1241 /* unicodeByteMode: */ |
|
1242 while(source<sourceLimit) { |
|
1243 if(targetCapacity<=0) { |
|
1244 /* target is full */ |
|
1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1246 break; |
|
1247 } |
|
1248 c=*source++; |
|
1249 ++nextSourceIndex; |
|
1250 |
|
1251 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
|
1252 /* not compressible, write character directly */ |
|
1253 if(targetCapacity>=2) { |
|
1254 *target++=(uint8_t)(c>>8); |
|
1255 *target++=(uint8_t)c; |
|
1256 if(offsets!=NULL) { |
|
1257 *offsets++=sourceIndex; |
|
1258 *offsets++=sourceIndex; |
|
1259 } |
|
1260 targetCapacity-=2; |
|
1261 } else { |
|
1262 length=2; |
|
1263 goto outputBytes; |
|
1264 } |
|
1265 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
|
1266 /* compress BMP character if the following one is not an uncompressible ideograph */ |
|
1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
|
1268 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
|
1269 /* ASCII digit or letter */ |
|
1270 isSingleByteMode=TRUE; |
|
1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
|
1272 length=2; |
|
1273 goto outputBytes; |
|
1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1275 /* there is a dynamic window that contains this character, change to it */ |
|
1276 isSingleByteMode=TRUE; |
|
1277 dynamicWindow=window; |
|
1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1279 useDynamicWindow(scsu, dynamicWindow); |
|
1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1281 length=2; |
|
1282 goto outputBytes; |
|
1283 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1284 /* define a dynamic window with this character */ |
|
1285 isSingleByteMode=TRUE; |
|
1286 dynamicWindow=getNextDynamicWindow(scsu); |
|
1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1288 useDynamicWindow(scsu, dynamicWindow); |
|
1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1290 length=3; |
|
1291 goto outputBytes; |
|
1292 } |
|
1293 } |
|
1294 |
|
1295 /* don't know how to compress this character, just write it directly */ |
|
1296 length=2; |
|
1297 goto outputBytes; |
|
1298 } else if(c<0xe000) { |
|
1299 /* c is a surrogate */ |
|
1300 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1301 getTrailUnicode: |
|
1302 lead=(UChar)c; |
|
1303 if(source<sourceLimit) { |
|
1304 /* test the following code unit */ |
|
1305 trail=*source; |
|
1306 if(U16_IS_TRAIL(trail)) { |
|
1307 ++source; |
|
1308 ++nextSourceIndex; |
|
1309 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
1310 /* convert this surrogate code point */ |
|
1311 /* exit this condition tree */ |
|
1312 } else { |
|
1313 /* this is an unmatched lead code unit (1st surrogate) */ |
|
1314 /* callback(illegal) */ |
|
1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1316 goto endloop; |
|
1317 } |
|
1318 } else { |
|
1319 /* no more input */ |
|
1320 break; |
|
1321 } |
|
1322 } else { |
|
1323 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
1324 /* callback(illegal) */ |
|
1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1326 goto endloop; |
|
1327 } |
|
1328 |
|
1329 /* compress supplementary character */ |
|
1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
|
1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
|
1332 ) { |
|
1333 /* |
|
1334 * there is a dynamic window that contains this character and |
|
1335 * the following character is not uncompressible, |
|
1336 * change to the window |
|
1337 */ |
|
1338 isSingleByteMode=TRUE; |
|
1339 dynamicWindow=window; |
|
1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1341 useDynamicWindow(scsu, dynamicWindow); |
|
1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1343 length=2; |
|
1344 goto outputBytes; |
|
1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
|
1346 (code=getDynamicOffset(c, &offset))>=0 |
|
1347 ) { |
|
1348 /* two supplementary characters in (probably) the same window - define an extended one */ |
|
1349 isSingleByteMode=TRUE; |
|
1350 code-=0x200; |
|
1351 dynamicWindow=getNextDynamicWindow(scsu); |
|
1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1353 useDynamicWindow(scsu, dynamicWindow); |
|
1354 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1355 length=4; |
|
1356 goto outputBytes; |
|
1357 } else { |
|
1358 /* don't know how to compress this character, just write it directly */ |
|
1359 c=((uint32_t)lead<<16)|trail; |
|
1360 length=4; |
|
1361 goto outputBytes; |
|
1362 } |
|
1363 } else /* 0xe000<=c<0xf300 */ { |
|
1364 /* quote to avoid SCSU tags */ |
|
1365 c|=UQU<<16; |
|
1366 length=3; |
|
1367 goto outputBytes; |
|
1368 } |
|
1369 |
|
1370 /* normal end of conversion: prepare for a new character */ |
|
1371 c=0; |
|
1372 sourceIndex=nextSourceIndex; |
|
1373 } |
|
1374 } |
|
1375 endloop: |
|
1376 |
|
1377 /* set the converter state back into UConverter */ |
|
1378 scsu->fromUIsSingleByteMode=isSingleByteMode; |
|
1379 scsu->fromUDynamicWindow=dynamicWindow; |
|
1380 |
|
1381 cnv->fromUChar32=c; |
|
1382 |
|
1383 /* write back the updated pointers */ |
|
1384 pArgs->source=source; |
|
1385 pArgs->target=(char *)target; |
|
1386 pArgs->offsets=offsets; |
|
1387 return; |
|
1388 |
|
1389 outputBytes: |
|
1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
|
1391 /* from the first if in the loop we know that targetCapacity>0 */ |
|
1392 if(length<=targetCapacity) { |
|
1393 if(offsets==NULL) { |
|
1394 switch(length) { |
|
1395 /* each branch falls through to the next one */ |
|
1396 case 4: |
|
1397 *target++=(uint8_t)(c>>24); |
|
1398 case 3: /*fall through*/ |
|
1399 *target++=(uint8_t)(c>>16); |
|
1400 case 2: /*fall through*/ |
|
1401 *target++=(uint8_t)(c>>8); |
|
1402 case 1: /*fall through*/ |
|
1403 *target++=(uint8_t)c; |
|
1404 default: |
|
1405 /* will never occur */ |
|
1406 break; |
|
1407 } |
|
1408 } else { |
|
1409 switch(length) { |
|
1410 /* each branch falls through to the next one */ |
|
1411 case 4: |
|
1412 *target++=(uint8_t)(c>>24); |
|
1413 *offsets++=sourceIndex; |
|
1414 case 3: /*fall through*/ |
|
1415 *target++=(uint8_t)(c>>16); |
|
1416 *offsets++=sourceIndex; |
|
1417 case 2: /*fall through*/ |
|
1418 *target++=(uint8_t)(c>>8); |
|
1419 *offsets++=sourceIndex; |
|
1420 case 1: /*fall through*/ |
|
1421 *target++=(uint8_t)c; |
|
1422 *offsets++=sourceIndex; |
|
1423 default: |
|
1424 /* will never occur */ |
|
1425 break; |
|
1426 } |
|
1427 } |
|
1428 targetCapacity-=length; |
|
1429 |
|
1430 /* normal end of conversion: prepare for a new character */ |
|
1431 c=0; |
|
1432 sourceIndex=nextSourceIndex; |
|
1433 goto loop; |
|
1434 } else { |
|
1435 uint8_t *p; |
|
1436 |
|
1437 /* |
|
1438 * We actually do this backwards here: |
|
1439 * In order to save an intermediate variable, we output |
|
1440 * first to the overflow buffer what does not fit into the |
|
1441 * regular target. |
|
1442 */ |
|
1443 /* we know that 0<=targetCapacity<length<=4 */ |
|
1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
|
1445 length-=targetCapacity; |
|
1446 p=(uint8_t *)cnv->charErrorBuffer; |
|
1447 switch(length) { |
|
1448 /* each branch falls through to the next one */ |
|
1449 case 4: |
|
1450 *p++=(uint8_t)(c>>24); |
|
1451 case 3: /*fall through*/ |
|
1452 *p++=(uint8_t)(c>>16); |
|
1453 case 2: /*fall through*/ |
|
1454 *p++=(uint8_t)(c>>8); |
|
1455 case 1: /*fall through*/ |
|
1456 *p=(uint8_t)c; |
|
1457 default: |
|
1458 /* will never occur */ |
|
1459 break; |
|
1460 } |
|
1461 cnv->charErrorBufferLength=(int8_t)length; |
|
1462 |
|
1463 /* now output what fits into the regular target */ |
|
1464 c>>=8*length; /* length was reduced by targetCapacity */ |
|
1465 switch(targetCapacity) { |
|
1466 /* each branch falls through to the next one */ |
|
1467 case 3: |
|
1468 *target++=(uint8_t)(c>>16); |
|
1469 if(offsets!=NULL) { |
|
1470 *offsets++=sourceIndex; |
|
1471 } |
|
1472 case 2: /*fall through*/ |
|
1473 *target++=(uint8_t)(c>>8); |
|
1474 if(offsets!=NULL) { |
|
1475 *offsets++=sourceIndex; |
|
1476 } |
|
1477 case 1: /*fall through*/ |
|
1478 *target++=(uint8_t)c; |
|
1479 if(offsets!=NULL) { |
|
1480 *offsets++=sourceIndex; |
|
1481 } |
|
1482 default: |
|
1483 break; |
|
1484 } |
|
1485 |
|
1486 /* target overflow */ |
|
1487 targetCapacity=0; |
|
1488 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1489 c=0; |
|
1490 goto endloop; |
|
1491 } |
|
1492 } |
|
1493 |
|
1494 /* |
|
1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. |
|
1496 * If a change is made in the original function, then either |
|
1497 * change this function the same way or |
|
1498 * re-copy the original function and remove the variables |
|
1499 * offsets, sourceIndex, and nextSourceIndex. |
|
1500 */ |
|
1501 static void |
|
1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, |
|
1503 UErrorCode *pErrorCode) { |
|
1504 UConverter *cnv; |
|
1505 SCSUData *scsu; |
|
1506 const UChar *source, *sourceLimit; |
|
1507 uint8_t *target; |
|
1508 int32_t targetCapacity; |
|
1509 |
|
1510 UBool isSingleByteMode; |
|
1511 uint8_t dynamicWindow; |
|
1512 uint32_t currentOffset; |
|
1513 |
|
1514 uint32_t c, delta; |
|
1515 |
|
1516 int32_t length; |
|
1517 |
|
1518 /* variables for compression heuristics */ |
|
1519 uint32_t offset; |
|
1520 UChar lead, trail; |
|
1521 int code; |
|
1522 int8_t window; |
|
1523 |
|
1524 /* set up the local pointers */ |
|
1525 cnv=pArgs->converter; |
|
1526 scsu=(SCSUData *)cnv->extraInfo; |
|
1527 |
|
1528 /* set up the local pointers */ |
|
1529 source=pArgs->source; |
|
1530 sourceLimit=pArgs->sourceLimit; |
|
1531 target=(uint8_t *)pArgs->target; |
|
1532 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
1533 |
|
1534 /* get the state machine state */ |
|
1535 isSingleByteMode=scsu->fromUIsSingleByteMode; |
|
1536 dynamicWindow=scsu->fromUDynamicWindow; |
|
1537 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1538 |
|
1539 c=cnv->fromUChar32; |
|
1540 |
|
1541 /* similar conversion "loop" as in toUnicode */ |
|
1542 loop: |
|
1543 if(isSingleByteMode) { |
|
1544 if(c!=0 && targetCapacity>0) { |
|
1545 goto getTrailSingle; |
|
1546 } |
|
1547 |
|
1548 /* state machine for single-byte mode */ |
|
1549 /* singleByteMode: */ |
|
1550 while(source<sourceLimit) { |
|
1551 if(targetCapacity<=0) { |
|
1552 /* target is full */ |
|
1553 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1554 break; |
|
1555 } |
|
1556 c=*source++; |
|
1557 |
|
1558 if((c-0x20)<=0x5f) { |
|
1559 /* pass US-ASCII graphic character through */ |
|
1560 *target++=(uint8_t)c; |
|
1561 --targetCapacity; |
|
1562 } else if(c<0x20) { |
|
1563 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
|
1564 /* CR/LF/TAB/NUL */ |
|
1565 *target++=(uint8_t)c; |
|
1566 --targetCapacity; |
|
1567 } else { |
|
1568 /* quote C0 control character */ |
|
1569 c|=SQ0<<8; |
|
1570 length=2; |
|
1571 goto outputBytes; |
|
1572 } |
|
1573 } else if((delta=c-currentOffset)<=0x7f) { |
|
1574 /* use the current dynamic window */ |
|
1575 *target++=(uint8_t)(delta|0x80); |
|
1576 --targetCapacity; |
|
1577 } else if(U16_IS_SURROGATE(c)) { |
|
1578 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1579 getTrailSingle: |
|
1580 lead=(UChar)c; |
|
1581 if(source<sourceLimit) { |
|
1582 /* test the following code unit */ |
|
1583 trail=*source; |
|
1584 if(U16_IS_TRAIL(trail)) { |
|
1585 ++source; |
|
1586 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
1587 /* convert this surrogate code point */ |
|
1588 /* exit this condition tree */ |
|
1589 } else { |
|
1590 /* this is an unmatched lead code unit (1st surrogate) */ |
|
1591 /* callback(illegal) */ |
|
1592 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1593 goto endloop; |
|
1594 } |
|
1595 } else { |
|
1596 /* no more input */ |
|
1597 break; |
|
1598 } |
|
1599 } else { |
|
1600 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
1601 /* callback(illegal) */ |
|
1602 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1603 goto endloop; |
|
1604 } |
|
1605 |
|
1606 /* compress supplementary character U+10000..U+10ffff */ |
|
1607 if((delta=c-currentOffset)<=0x7f) { |
|
1608 /* use the current dynamic window */ |
|
1609 *target++=(uint8_t)(delta|0x80); |
|
1610 --targetCapacity; |
|
1611 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1612 /* there is a dynamic window that contains this character, change to it */ |
|
1613 dynamicWindow=window; |
|
1614 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1615 useDynamicWindow(scsu, dynamicWindow); |
|
1616 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1617 length=2; |
|
1618 goto outputBytes; |
|
1619 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1620 /* might check if there are more characters in this window to come */ |
|
1621 /* define an extended window with this character */ |
|
1622 code-=0x200; |
|
1623 dynamicWindow=getNextDynamicWindow(scsu); |
|
1624 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1625 useDynamicWindow(scsu, dynamicWindow); |
|
1626 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1627 length=4; |
|
1628 goto outputBytes; |
|
1629 } else { |
|
1630 /* change to Unicode mode and output this (lead, trail) pair */ |
|
1631 isSingleByteMode=FALSE; |
|
1632 *target++=(uint8_t)SCU; |
|
1633 --targetCapacity; |
|
1634 c=((uint32_t)lead<<16)|trail; |
|
1635 length=4; |
|
1636 goto outputBytes; |
|
1637 } |
|
1638 } else if(c<0xa0) { |
|
1639 /* quote C1 control character */ |
|
1640 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
|
1641 length=2; |
|
1642 goto outputBytes; |
|
1643 } else if(c==0xfeff || c>=0xfff0) { |
|
1644 /* quote signature character=byte order mark and specials */ |
|
1645 c|=SQU<<16; |
|
1646 length=3; |
|
1647 goto outputBytes; |
|
1648 } else { |
|
1649 /* compress all other BMP characters */ |
|
1650 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1651 /* there is a window defined that contains this character - switch to it or quote from it? */ |
|
1652 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
|
1653 /* change to dynamic window */ |
|
1654 dynamicWindow=window; |
|
1655 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1656 useDynamicWindow(scsu, dynamicWindow); |
|
1657 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1658 length=2; |
|
1659 goto outputBytes; |
|
1660 } else { |
|
1661 /* quote from dynamic window */ |
|
1662 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
|
1663 length=2; |
|
1664 goto outputBytes; |
|
1665 } |
|
1666 } else if((window=getWindow(staticOffsets, c))>=0) { |
|
1667 /* quote from static window */ |
|
1668 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
|
1669 length=2; |
|
1670 goto outputBytes; |
|
1671 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1672 /* define a dynamic window with this character */ |
|
1673 dynamicWindow=getNextDynamicWindow(scsu); |
|
1674 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1675 useDynamicWindow(scsu, dynamicWindow); |
|
1676 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1677 length=3; |
|
1678 goto outputBytes; |
|
1679 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
|
1680 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
|
1681 ) { |
|
1682 /* |
|
1683 * this character is not compressible (a BMP ideograph or similar); |
|
1684 * switch to Unicode mode if this is the last character in the block |
|
1685 * or there is at least one more ideograph following immediately |
|
1686 */ |
|
1687 isSingleByteMode=FALSE; |
|
1688 c|=SCU<<16; |
|
1689 length=3; |
|
1690 goto outputBytes; |
|
1691 } else { |
|
1692 /* quote Unicode */ |
|
1693 c|=SQU<<16; |
|
1694 length=3; |
|
1695 goto outputBytes; |
|
1696 } |
|
1697 } |
|
1698 |
|
1699 /* normal end of conversion: prepare for a new character */ |
|
1700 c=0; |
|
1701 } |
|
1702 } else { |
|
1703 if(c!=0 && targetCapacity>0) { |
|
1704 goto getTrailUnicode; |
|
1705 } |
|
1706 |
|
1707 /* state machine for Unicode mode */ |
|
1708 /* unicodeByteMode: */ |
|
1709 while(source<sourceLimit) { |
|
1710 if(targetCapacity<=0) { |
|
1711 /* target is full */ |
|
1712 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1713 break; |
|
1714 } |
|
1715 c=*source++; |
|
1716 |
|
1717 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
|
1718 /* not compressible, write character directly */ |
|
1719 if(targetCapacity>=2) { |
|
1720 *target++=(uint8_t)(c>>8); |
|
1721 *target++=(uint8_t)c; |
|
1722 targetCapacity-=2; |
|
1723 } else { |
|
1724 length=2; |
|
1725 goto outputBytes; |
|
1726 } |
|
1727 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
|
1728 /* compress BMP character if the following one is not an uncompressible ideograph */ |
|
1729 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
|
1730 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
|
1731 /* ASCII digit or letter */ |
|
1732 isSingleByteMode=TRUE; |
|
1733 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
|
1734 length=2; |
|
1735 goto outputBytes; |
|
1736 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
|
1737 /* there is a dynamic window that contains this character, change to it */ |
|
1738 isSingleByteMode=TRUE; |
|
1739 dynamicWindow=window; |
|
1740 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1741 useDynamicWindow(scsu, dynamicWindow); |
|
1742 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1743 length=2; |
|
1744 goto outputBytes; |
|
1745 } else if((code=getDynamicOffset(c, &offset))>=0) { |
|
1746 /* define a dynamic window with this character */ |
|
1747 isSingleByteMode=TRUE; |
|
1748 dynamicWindow=getNextDynamicWindow(scsu); |
|
1749 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1750 useDynamicWindow(scsu, dynamicWindow); |
|
1751 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1752 length=3; |
|
1753 goto outputBytes; |
|
1754 } |
|
1755 } |
|
1756 |
|
1757 /* don't know how to compress this character, just write it directly */ |
|
1758 length=2; |
|
1759 goto outputBytes; |
|
1760 } else if(c<0xe000) { |
|
1761 /* c is a surrogate */ |
|
1762 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1763 getTrailUnicode: |
|
1764 lead=(UChar)c; |
|
1765 if(source<sourceLimit) { |
|
1766 /* test the following code unit */ |
|
1767 trail=*source; |
|
1768 if(U16_IS_TRAIL(trail)) { |
|
1769 ++source; |
|
1770 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
1771 /* convert this surrogate code point */ |
|
1772 /* exit this condition tree */ |
|
1773 } else { |
|
1774 /* this is an unmatched lead code unit (1st surrogate) */ |
|
1775 /* callback(illegal) */ |
|
1776 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1777 goto endloop; |
|
1778 } |
|
1779 } else { |
|
1780 /* no more input */ |
|
1781 break; |
|
1782 } |
|
1783 } else { |
|
1784 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
1785 /* callback(illegal) */ |
|
1786 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1787 goto endloop; |
|
1788 } |
|
1789 |
|
1790 /* compress supplementary character */ |
|
1791 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
|
1792 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
|
1793 ) { |
|
1794 /* |
|
1795 * there is a dynamic window that contains this character and |
|
1796 * the following character is not uncompressible, |
|
1797 * change to the window |
|
1798 */ |
|
1799 isSingleByteMode=TRUE; |
|
1800 dynamicWindow=window; |
|
1801 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
|
1802 useDynamicWindow(scsu, dynamicWindow); |
|
1803 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
|
1804 length=2; |
|
1805 goto outputBytes; |
|
1806 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
|
1807 (code=getDynamicOffset(c, &offset))>=0 |
|
1808 ) { |
|
1809 /* two supplementary characters in (probably) the same window - define an extended one */ |
|
1810 isSingleByteMode=TRUE; |
|
1811 code-=0x200; |
|
1812 dynamicWindow=getNextDynamicWindow(scsu); |
|
1813 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
|
1814 useDynamicWindow(scsu, dynamicWindow); |
|
1815 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
|
1816 length=4; |
|
1817 goto outputBytes; |
|
1818 } else { |
|
1819 /* don't know how to compress this character, just write it directly */ |
|
1820 c=((uint32_t)lead<<16)|trail; |
|
1821 length=4; |
|
1822 goto outputBytes; |
|
1823 } |
|
1824 } else /* 0xe000<=c<0xf300 */ { |
|
1825 /* quote to avoid SCSU tags */ |
|
1826 c|=UQU<<16; |
|
1827 length=3; |
|
1828 goto outputBytes; |
|
1829 } |
|
1830 |
|
1831 /* normal end of conversion: prepare for a new character */ |
|
1832 c=0; |
|
1833 } |
|
1834 } |
|
1835 endloop: |
|
1836 |
|
1837 /* set the converter state back into UConverter */ |
|
1838 scsu->fromUIsSingleByteMode=isSingleByteMode; |
|
1839 scsu->fromUDynamicWindow=dynamicWindow; |
|
1840 |
|
1841 cnv->fromUChar32=c; |
|
1842 |
|
1843 /* write back the updated pointers */ |
|
1844 pArgs->source=source; |
|
1845 pArgs->target=(char *)target; |
|
1846 return; |
|
1847 |
|
1848 outputBytes: |
|
1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
|
1850 /* from the first if in the loop we know that targetCapacity>0 */ |
|
1851 if(length<=targetCapacity) { |
|
1852 switch(length) { |
|
1853 /* each branch falls through to the next one */ |
|
1854 case 4: |
|
1855 *target++=(uint8_t)(c>>24); |
|
1856 case 3: /*fall through*/ |
|
1857 *target++=(uint8_t)(c>>16); |
|
1858 case 2: /*fall through*/ |
|
1859 *target++=(uint8_t)(c>>8); |
|
1860 case 1: /*fall through*/ |
|
1861 *target++=(uint8_t)c; |
|
1862 default: |
|
1863 /* will never occur */ |
|
1864 break; |
|
1865 } |
|
1866 targetCapacity-=length; |
|
1867 |
|
1868 /* normal end of conversion: prepare for a new character */ |
|
1869 c=0; |
|
1870 goto loop; |
|
1871 } else { |
|
1872 uint8_t *p; |
|
1873 |
|
1874 /* |
|
1875 * We actually do this backwards here: |
|
1876 * In order to save an intermediate variable, we output |
|
1877 * first to the overflow buffer what does not fit into the |
|
1878 * regular target. |
|
1879 */ |
|
1880 /* we know that 0<=targetCapacity<length<=4 */ |
|
1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
|
1882 length-=targetCapacity; |
|
1883 p=(uint8_t *)cnv->charErrorBuffer; |
|
1884 switch(length) { |
|
1885 /* each branch falls through to the next one */ |
|
1886 case 4: |
|
1887 *p++=(uint8_t)(c>>24); |
|
1888 case 3: /*fall through*/ |
|
1889 *p++=(uint8_t)(c>>16); |
|
1890 case 2: /*fall through*/ |
|
1891 *p++=(uint8_t)(c>>8); |
|
1892 case 1: /*fall through*/ |
|
1893 *p=(uint8_t)c; |
|
1894 default: |
|
1895 /* will never occur */ |
|
1896 break; |
|
1897 } |
|
1898 cnv->charErrorBufferLength=(int8_t)length; |
|
1899 |
|
1900 /* now output what fits into the regular target */ |
|
1901 c>>=8*length; /* length was reduced by targetCapacity */ |
|
1902 switch(targetCapacity) { |
|
1903 /* each branch falls through to the next one */ |
|
1904 case 3: |
|
1905 *target++=(uint8_t)(c>>16); |
|
1906 case 2: /*fall through*/ |
|
1907 *target++=(uint8_t)(c>>8); |
|
1908 case 1: /*fall through*/ |
|
1909 *target++=(uint8_t)c; |
|
1910 default: |
|
1911 break; |
|
1912 } |
|
1913 |
|
1914 /* target overflow */ |
|
1915 targetCapacity=0; |
|
1916 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1917 c=0; |
|
1918 goto endloop; |
|
1919 } |
|
1920 } |
|
1921 |
|
1922 /* miscellaneous ------------------------------------------------------------ */ |
|
1923 |
|
1924 static const char * |
|
1925 _SCSUGetName(const UConverter *cnv) { |
|
1926 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
|
1927 |
|
1928 switch(scsu->locale) { |
|
1929 case l_ja: |
|
1930 return "SCSU,locale=ja"; |
|
1931 default: |
|
1932 return "SCSU"; |
|
1933 } |
|
1934 } |
|
1935 |
|
1936 /* structure for SafeClone calculations */ |
|
1937 struct cloneSCSUStruct |
|
1938 { |
|
1939 UConverter cnv; |
|
1940 SCSUData mydata; |
|
1941 }; |
|
1942 |
|
1943 static UConverter * |
|
1944 _SCSUSafeClone(const UConverter *cnv, |
|
1945 void *stackBuffer, |
|
1946 int32_t *pBufferSize, |
|
1947 UErrorCode *status) |
|
1948 { |
|
1949 struct cloneSCSUStruct * localClone; |
|
1950 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); |
|
1951 |
|
1952 if (U_FAILURE(*status)){ |
|
1953 return 0; |
|
1954 } |
|
1955 |
|
1956 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
|
1957 *pBufferSize = bufferSizeNeeded; |
|
1958 return 0; |
|
1959 } |
|
1960 |
|
1961 localClone = (struct cloneSCSUStruct *)stackBuffer; |
|
1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
|
1963 |
|
1964 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); |
|
1965 localClone->cnv.extraInfo = &localClone->mydata; |
|
1966 localClone->cnv.isExtraLocal = TRUE; |
|
1967 |
|
1968 return &localClone->cnv; |
|
1969 } |
|
1970 |
|
1971 |
|
1972 static const UConverterImpl _SCSUImpl={ |
|
1973 UCNV_SCSU, |
|
1974 |
|
1975 NULL, |
|
1976 NULL, |
|
1977 |
|
1978 _SCSUOpen, |
|
1979 _SCSUClose, |
|
1980 _SCSUReset, |
|
1981 |
|
1982 _SCSUToUnicode, |
|
1983 _SCSUToUnicodeWithOffsets, |
|
1984 _SCSUFromUnicode, |
|
1985 _SCSUFromUnicodeWithOffsets, |
|
1986 NULL, |
|
1987 |
|
1988 NULL, |
|
1989 _SCSUGetName, |
|
1990 NULL, |
|
1991 _SCSUSafeClone, |
|
1992 ucnv_getCompleteUnicodeSet |
|
1993 }; |
|
1994 |
|
1995 static const UConverterStaticData _SCSUStaticData={ |
|
1996 sizeof(UConverterStaticData), |
|
1997 "SCSU", |
|
1998 1212, /* CCSID for SCSU */ |
|
1999 UCNV_IBM, UCNV_SCSU, |
|
2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ |
|
2001 /* |
|
2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode |
|
2003 * substitution string. |
|
2004 */ |
|
2005 { 0x0e, 0xff, 0xfd, 0 }, 3, |
|
2006 FALSE, FALSE, |
|
2007 0, |
|
2008 0, |
|
2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
2010 }; |
|
2011 |
|
2012 const UConverterSharedData _SCSUData={ |
|
2013 sizeof(UConverterSharedData), ~((uint32_t)0), |
|
2014 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, |
|
2015 0 |
|
2016 }; |
|
2017 |
|
2018 #endif |