| |
1 /* |
| |
2 ****************************************************************************** |
| |
3 * |
| |
4 * Copyright (C) 2000-2011, International Business Machines |
| |
5 * Corporation and others. All Rights Reserved. |
| |
6 * |
| |
7 ****************************************************************************** |
| |
8 * file name: ucnvscsu.c |
| |
9 * encoding: US-ASCII |
| |
10 * tab size: 8 (not used) |
| |
11 * indentation:4 |
| |
12 * |
| |
13 * created on: 2000nov18 |
| |
14 * created by: Markus W. Scherer |
| |
15 * |
| |
16 * This is an implementation of the Standard Compression Scheme for Unicode |
| |
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ . |
| |
18 * Reserved commands and window settings are treated as illegal sequences and |
| |
19 * will result in callback calls. |
| |
20 */ |
| |
21 |
| |
22 #include "unicode/utypes.h" |
| |
23 |
| |
24 #if !UCONFIG_NO_CONVERSION |
| |
25 |
| |
26 #include "unicode/ucnv.h" |
| |
27 #include "unicode/ucnv_cb.h" |
| |
28 #include "unicode/utf16.h" |
| |
29 #include "ucnv_bld.h" |
| |
30 #include "ucnv_cnv.h" |
| |
31 #include "cmemory.h" |
| |
32 |
| |
33 /* SCSU definitions --------------------------------------------------------- */ |
| |
34 |
| |
/*
 * SCSU command (tag) byte values.
 * The S* tags are tested by the single-byte-mode state machine,
 * the U* tags by the Unicode-mode state machine.
 * For each contiguous range of tags only the first and last value are named.
 */
enum {
    /* single-byte mode tags */
    SQ0 = 0x01,     /* quote from window pair 0 */
    SQ7 = 0x08,     /* quote from window pair 7 */
    SDX = 0x0B,     /* define a window as extended */
    Srs = 0x0C,     /* reserved */
    SQU = 0x0E,     /* quote a single Unicode character */
    SCU = 0x0F,     /* change to Unicode mode */
    SC0 = 0x10,     /* select window 0 */
    SC7 = 0x17,     /* select window 7 */
    SD0 = 0x18,     /* define and select window 0 */
    SD7 = 0x1F,     /* define and select window 7 */

    /* Unicode mode tags */
    UC0 = 0xE0,     /* select window 0 */
    UC7 = 0xE7,     /* select window 7 */
    UD0 = 0xE8,     /* define and select window 0 */
    UD7 = 0xEF,     /* define and select window 7 */
    UQU = 0xF0,     /* quote a single Unicode character */
    UDX = 0xF1,     /* define a window as extended */
    Urs = 0xF2      /* reserved */
};
| |
56 |
| |
/* Thresholds for interpreting the one-byte window offset value of an SDn/UDn command. */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since no short-run alphabets are found in these areas.
     * Therefore gapOffset is added to all values from gapThreshold.
     */
    gapThreshold   = 0x68,
    gapOffset      = 0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart  = 0xA8,

    /* values from fixedThreshold index the table of predefined fixed offsets */
    fixedThreshold = 0xF9
};
| |
72 |
| |
/* Start offsets of the 8 static windows; these never change. */
static const uint32_t staticOffsets[8] = {
    /* 0 */ 0x0000,     /* ASCII for quoted tags */
    /* 1 */ 0x0080,     /* Latin-1 Supplement (for access to punctuation) */
    /* 2 */ 0x0100,     /* Latin Extended-A */
    /* 3 */ 0x0300,     /* Combining Diacritical Marks */
    /* 4 */ 0x2000,     /* General Punctuation */
    /* 5 */ 0x2080,     /* Currency Symbols */
    /* 6 */ 0x2100,     /* Letterlike Symbols and Number Forms */
    /* 7 */ 0x3000      /* CJK Symbols and Punctuation */
};
| |
84 |
| |
/* Start offsets that the 8 dynamic (sliding) windows have after a reset. */
static const uint32_t initialDynamicOffsets[8] = {
    /* 0 */ 0x0080,     /* Latin-1 */
    /* 1 */ 0x00C0,     /* Latin Extended A */
    /* 2 */ 0x0400,     /* Cyrillic */
    /* 3 */ 0x0600,     /* Arabic */
    /* 4 */ 0x0900,     /* Devanagari */
    /* 5 */ 0x3040,     /* Hiragana */
    /* 6 */ 0x30A0,     /* Katakana */
    /* 7 */ 0xFF00      /* Fullwidth ASCII */
};
| |
96 |
| |
/*
 * Predefined window offsets, indexed by (offset byte - fixedThreshold),
 * i.e. entry 0 belongs to offset byte 0xF9 and the last entry to 0xFF.
 */
static const uint32_t fixedOffsets[] = {
    0x00C0,     /* 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250,     /* 0xFA: IPA extensions */
    0x0370,     /* 0xFB: Greek */
    0x0530,     /* 0xFC: Armenian */
    0x3040,     /* 0xFD: Hiragana */
    0x30A0,     /* 0xFE: Katakana */
    0xFF60      /* 0xFF: Halfwidth Katakana */
};
| |
107 |
| |
/*
 * State values of the toUnicode state machine.
 * readCommand is the state between characters; every other state means
 * that the decoder is inside a multi-byte sequence.
 */
enum {
    readCommand   = 0,  /* expecting a command byte or a directly encoded character */
    quotePairOne  = 1,  /* expecting the first byte of a quoted 16-bit unit */
    quotePairTwo  = 2,  /* expecting the second byte of a 16-bit unit */
    quoteOne      = 3,  /* expecting the single byte after an SQn command */
    definePairOne = 4,  /* expecting the first argument byte of SDX/UDX */
    definePairTwo = 5,  /* expecting the second argument byte of SDX/UDX */
    defineOne     = 6   /* expecting the offset byte of SDn/UDn */
};
| |
118 |
| |
119 typedef struct SCSUData { |
| |
120 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ |
| |
121 uint32_t toUDynamicOffsets[8]; |
| |
122 uint32_t fromUDynamicOffsets[8]; |
| |
123 |
| |
124 /* state machine state - toUnicode */ |
| |
125 UBool toUIsSingleByteMode; |
| |
126 uint8_t toUState; |
| |
127 int8_t toUQuoteWindow, toUDynamicWindow; |
| |
128 uint8_t toUByteOne; |
| |
129 uint8_t toUPadding[3]; |
| |
130 |
| |
131 /* state machine state - fromUnicode */ |
| |
132 UBool fromUIsSingleByteMode; |
| |
133 int8_t fromUDynamicWindow; |
| |
134 |
| |
135 /* |
| |
136 * windowUse[] keeps track of the use of the dynamic windows: |
| |
137 * At nextWindowUseIndex there is the least recently used window, |
| |
138 * and the following windows (in a wrapping manner) are more and more |
| |
139 * recently used. |
| |
140 * At nextWindowUseIndex-1 there is the most recently used window. |
| |
141 */ |
| |
142 uint8_t locale; |
| |
143 int8_t nextWindowUseIndex; |
| |
144 int8_t windowUse[8]; |
| |
145 } SCSUData; |
| |
146 |
| |
/*
 * Initial contents of SCSUData.windowUse[] (least recently used window first):
 * the generic order, and the order used for the "ja" locale.
 */
static const int8_t initialWindowUse[8] = {
    7, 0, 3, 2, 4, 5, 6, 1
};
static const int8_t initialWindowUse_ja[8] = {
    3, 2, 4, 1, 0, 7, 5, 6
};
| |
149 |
| |
/* Values for SCSUData.locale. */
enum {
    lGeneric = 0,   /* any locale other than "ja" */
    l_ja     = 1    /* locale "ja" or "ja_..." */
};
| |
153 |
| |
154 /* SCSU setup functions ----------------------------------------------------- */ |
| |
155 |
| |
156 static void |
| |
157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { |
| |
158 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
| |
159 |
| |
160 if(choice<=UCNV_RESET_TO_UNICODE) { |
| |
161 /* reset toUnicode */ |
| |
162 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); |
| |
163 |
| |
164 scsu->toUIsSingleByteMode=TRUE; |
| |
165 scsu->toUState=readCommand; |
| |
166 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; |
| |
167 scsu->toUByteOne=0; |
| |
168 |
| |
169 cnv->toULength=0; |
| |
170 } |
| |
171 if(choice!=UCNV_RESET_TO_UNICODE) { |
| |
172 /* reset fromUnicode */ |
| |
173 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); |
| |
174 |
| |
175 scsu->fromUIsSingleByteMode=TRUE; |
| |
176 scsu->fromUDynamicWindow=0; |
| |
177 |
| |
178 scsu->nextWindowUseIndex=0; |
| |
179 switch(scsu->locale) { |
| |
180 case l_ja: |
| |
181 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); |
| |
182 break; |
| |
183 default: |
| |
184 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); |
| |
185 break; |
| |
186 } |
| |
187 |
| |
188 cnv->fromUChar32=0; |
| |
189 } |
| |
190 } |
| |
191 |
| |
192 static void |
| |
193 _SCSUOpen(UConverter *cnv, |
| |
194 UConverterLoadArgs *pArgs, |
| |
195 UErrorCode *pErrorCode) { |
| |
196 const char *locale=pArgs->locale; |
| |
197 if(pArgs->onlyTestIsLoadable) { |
| |
198 return; |
| |
199 } |
| |
200 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); |
| |
201 if(cnv->extraInfo!=NULL) { |
| |
202 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { |
| |
203 ((SCSUData *)cnv->extraInfo)->locale=l_ja; |
| |
204 } else { |
| |
205 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; |
| |
206 } |
| |
207 _SCSUReset(cnv, UCNV_RESET_BOTH); |
| |
208 } else { |
| |
209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| |
210 } |
| |
211 |
| |
212 /* Set the substitution character U+fffd as a Unicode string. */ |
| |
213 cnv->subUChars[0]=0xfffd; |
| |
214 cnv->subCharLen=-1; |
| |
215 } |
| |
216 |
| |
217 static void |
| |
218 _SCSUClose(UConverter *cnv) { |
| |
219 if(cnv->extraInfo!=NULL) { |
| |
220 if(!cnv->isExtraLocal) { |
| |
221 uprv_free(cnv->extraInfo); |
| |
222 } |
| |
223 cnv->extraInfo=NULL; |
| |
224 } |
| |
225 } |
| |
226 |
| |
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */ |
| |
228 |
| |
229 static void |
| |
230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| |
231 UErrorCode *pErrorCode) { |
| |
232 UConverter *cnv; |
| |
233 SCSUData *scsu; |
| |
234 const uint8_t *source, *sourceLimit; |
| |
235 UChar *target; |
| |
236 const UChar *targetLimit; |
| |
237 int32_t *offsets; |
| |
238 UBool isSingleByteMode; |
| |
239 uint8_t state, byteOne; |
| |
240 int8_t quoteWindow, dynamicWindow; |
| |
241 |
| |
242 int32_t sourceIndex, nextSourceIndex; |
| |
243 |
| |
244 uint8_t b; |
| |
245 |
| |
246 /* set up the local pointers */ |
| |
247 cnv=pArgs->converter; |
| |
248 scsu=(SCSUData *)cnv->extraInfo; |
| |
249 |
| |
250 source=(const uint8_t *)pArgs->source; |
| |
251 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| |
252 target=pArgs->target; |
| |
253 targetLimit=pArgs->targetLimit; |
| |
254 offsets=pArgs->offsets; |
| |
255 |
| |
256 /* get the state machine state */ |
| |
257 isSingleByteMode=scsu->toUIsSingleByteMode; |
| |
258 state=scsu->toUState; |
| |
259 quoteWindow=scsu->toUQuoteWindow; |
| |
260 dynamicWindow=scsu->toUDynamicWindow; |
| |
261 byteOne=scsu->toUByteOne; |
| |
262 |
| |
263 /* sourceIndex=-1 if the current character began in the previous buffer */ |
| |
264 sourceIndex=state==readCommand ? 0 : -1; |
| |
265 nextSourceIndex=0; |
| |
266 |
| |
267 /* |
| |
268 * conversion "loop" |
| |
269 * |
| |
270 * For performance, this is not a normal C loop. |
| |
271 * Instead, there are two code blocks for the two SCSU modes. |
| |
272 * The function branches to either one, and a change of the mode is done with a goto to |
| |
273 * the other branch. |
| |
274 * |
| |
275 * Each branch has two conventional loops: |
| |
276 * - a fast-path loop for the most common codes in the mode |
| |
277 * - a loop for all other codes in the mode |
| |
278 * When the fast-path runs into a code that it cannot handle, its loop ends and it |
| |
279 * runs into the following loop to handle the other codes. |
| |
280 * The end of the input or output buffer is also handled by the slower loop. |
| |
281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
| |
282 * |
| |
283 * The callback handling is done by returning with an error code. |
| |
284 * The conversion framework actually calls the callback function. |
| |
285 */ |
| |
286 if(isSingleByteMode) { |
| |
287 /* fast path for single-byte mode */ |
| |
288 if(state==readCommand) { |
| |
289 fastSingle: |
| |
290 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { |
| |
291 ++source; |
| |
292 ++nextSourceIndex; |
| |
293 if(b<=0x7f) { |
| |
294 /* write US-ASCII graphic character or DEL */ |
| |
295 *target++=(UChar)b; |
| |
296 if(offsets!=NULL) { |
| |
297 *offsets++=sourceIndex; |
| |
298 } |
| |
299 } else { |
| |
300 /* write from dynamic window */ |
| |
301 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
| |
302 if(c<=0xffff) { |
| |
303 *target++=(UChar)c; |
| |
304 if(offsets!=NULL) { |
| |
305 *offsets++=sourceIndex; |
| |
306 } |
| |
307 } else { |
| |
308 /* output surrogate pair */ |
| |
309 *target++=(UChar)(0xd7c0+(c>>10)); |
| |
310 if(target<targetLimit) { |
| |
311 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| |
312 if(offsets!=NULL) { |
| |
313 *offsets++=sourceIndex; |
| |
314 *offsets++=sourceIndex; |
| |
315 } |
| |
316 } else { |
| |
317 /* target overflow */ |
| |
318 if(offsets!=NULL) { |
| |
319 *offsets++=sourceIndex; |
| |
320 } |
| |
321 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| |
322 cnv->UCharErrorBufferLength=1; |
| |
323 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
324 goto endloop; |
| |
325 } |
| |
326 } |
| |
327 } |
| |
328 sourceIndex=nextSourceIndex; |
| |
329 } |
| |
330 } |
| |
331 |
| |
332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
| |
333 singleByteMode: |
| |
334 while(source<sourceLimit) { |
| |
335 if(target>=targetLimit) { |
| |
336 /* target is full */ |
| |
337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
338 break; |
| |
339 } |
| |
340 b=*source++; |
| |
341 ++nextSourceIndex; |
| |
342 switch(state) { |
| |
343 case readCommand: |
| |
344 /* redundant conditions are commented out */ |
| |
345 /* here: b<0x20 because otherwise we would be in fastSingle */ |
| |
346 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
| |
347 /* CR/LF/TAB/NUL */ |
| |
348 *target++=(UChar)b; |
| |
349 if(offsets!=NULL) { |
| |
350 *offsets++=sourceIndex; |
| |
351 } |
| |
352 sourceIndex=nextSourceIndex; |
| |
353 goto fastSingle; |
| |
354 } else if(SC0<=b) { |
| |
355 if(b<=SC7) { |
| |
356 dynamicWindow=(int8_t)(b-SC0); |
| |
357 sourceIndex=nextSourceIndex; |
| |
358 goto fastSingle; |
| |
359 } else /* if(SD0<=b && b<=SD7) */ { |
| |
360 dynamicWindow=(int8_t)(b-SD0); |
| |
361 state=defineOne; |
| |
362 } |
| |
363 } else if(/* SQ0<=b && */ b<=SQ7) { |
| |
364 quoteWindow=(int8_t)(b-SQ0); |
| |
365 state=quoteOne; |
| |
366 } else if(b==SDX) { |
| |
367 state=definePairOne; |
| |
368 } else if(b==SQU) { |
| |
369 state=quotePairOne; |
| |
370 } else if(b==SCU) { |
| |
371 sourceIndex=nextSourceIndex; |
| |
372 isSingleByteMode=FALSE; |
| |
373 goto fastUnicode; |
| |
374 } else /* Srs */ { |
| |
375 /* callback(illegal) */ |
| |
376 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
377 cnv->toUBytes[0]=b; |
| |
378 cnv->toULength=1; |
| |
379 goto endloop; |
| |
380 } |
| |
381 |
| |
382 /* store the first byte of a multibyte sequence in toUBytes[] */ |
| |
383 cnv->toUBytes[0]=b; |
| |
384 cnv->toULength=1; |
| |
385 break; |
| |
386 case quotePairOne: |
| |
387 byteOne=b; |
| |
388 cnv->toUBytes[1]=b; |
| |
389 cnv->toULength=2; |
| |
390 state=quotePairTwo; |
| |
391 break; |
| |
392 case quotePairTwo: |
| |
393 *target++=(UChar)((byteOne<<8)|b); |
| |
394 if(offsets!=NULL) { |
| |
395 *offsets++=sourceIndex; |
| |
396 } |
| |
397 sourceIndex=nextSourceIndex; |
| |
398 state=readCommand; |
| |
399 goto fastSingle; |
| |
400 case quoteOne: |
| |
401 if(b<0x80) { |
| |
402 /* all static offsets are in the BMP */ |
| |
403 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
| |
404 if(offsets!=NULL) { |
| |
405 *offsets++=sourceIndex; |
| |
406 } |
| |
407 } else { |
| |
408 /* write from dynamic window */ |
| |
409 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
| |
410 if(c<=0xffff) { |
| |
411 *target++=(UChar)c; |
| |
412 if(offsets!=NULL) { |
| |
413 *offsets++=sourceIndex; |
| |
414 } |
| |
415 } else { |
| |
416 /* output surrogate pair */ |
| |
417 *target++=(UChar)(0xd7c0+(c>>10)); |
| |
418 if(target<targetLimit) { |
| |
419 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| |
420 if(offsets!=NULL) { |
| |
421 *offsets++=sourceIndex; |
| |
422 *offsets++=sourceIndex; |
| |
423 } |
| |
424 } else { |
| |
425 /* target overflow */ |
| |
426 if(offsets!=NULL) { |
| |
427 *offsets++=sourceIndex; |
| |
428 } |
| |
429 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| |
430 cnv->UCharErrorBufferLength=1; |
| |
431 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
432 goto endloop; |
| |
433 } |
| |
434 } |
| |
435 } |
| |
436 sourceIndex=nextSourceIndex; |
| |
437 state=readCommand; |
| |
438 goto fastSingle; |
| |
439 case definePairOne: |
| |
440 dynamicWindow=(int8_t)((b>>5)&7); |
| |
441 byteOne=(uint8_t)(b&0x1f); |
| |
442 cnv->toUBytes[1]=b; |
| |
443 cnv->toULength=2; |
| |
444 state=definePairTwo; |
| |
445 break; |
| |
446 case definePairTwo: |
| |
447 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); |
| |
448 sourceIndex=nextSourceIndex; |
| |
449 state=readCommand; |
| |
450 goto fastSingle; |
| |
451 case defineOne: |
| |
452 if(b==0) { |
| |
453 /* callback(illegal): Reserved window offset value 0 */ |
| |
454 cnv->toUBytes[1]=b; |
| |
455 cnv->toULength=2; |
| |
456 goto endloop; |
| |
457 } else if(b<gapThreshold) { |
| |
458 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
| |
459 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
| |
460 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
| |
461 } else if(b>=fixedThreshold) { |
| |
462 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
| |
463 } else { |
| |
464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ |
| |
465 cnv->toUBytes[1]=b; |
| |
466 cnv->toULength=2; |
| |
467 goto endloop; |
| |
468 } |
| |
469 sourceIndex=nextSourceIndex; |
| |
470 state=readCommand; |
| |
471 goto fastSingle; |
| |
472 } |
| |
473 } |
| |
474 } else { |
| |
475 /* fast path for Unicode mode */ |
| |
476 if(state==readCommand) { |
| |
477 fastUnicode: |
| |
478 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { |
| |
479 *target++=(UChar)((b<<8)|source[1]); |
| |
480 if(offsets!=NULL) { |
| |
481 *offsets++=sourceIndex; |
| |
482 } |
| |
483 sourceIndex=nextSourceIndex; |
| |
484 nextSourceIndex+=2; |
| |
485 source+=2; |
| |
486 } |
| |
487 } |
| |
488 |
| |
489 /* normal state machine for Unicode mode */ |
| |
490 /* unicodeByteMode: */ |
| |
491 while(source<sourceLimit) { |
| |
492 if(target>=targetLimit) { |
| |
493 /* target is full */ |
| |
494 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
495 break; |
| |
496 } |
| |
497 b=*source++; |
| |
498 ++nextSourceIndex; |
| |
499 switch(state) { |
| |
500 case readCommand: |
| |
501 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
| |
502 byteOne=b; |
| |
503 cnv->toUBytes[0]=b; |
| |
504 cnv->toULength=1; |
| |
505 state=quotePairTwo; |
| |
506 } else if(/* UC0<=b && */ b<=UC7) { |
| |
507 dynamicWindow=(int8_t)(b-UC0); |
| |
508 sourceIndex=nextSourceIndex; |
| |
509 isSingleByteMode=TRUE; |
| |
510 goto fastSingle; |
| |
511 } else if(/* UD0<=b && */ b<=UD7) { |
| |
512 dynamicWindow=(int8_t)(b-UD0); |
| |
513 isSingleByteMode=TRUE; |
| |
514 cnv->toUBytes[0]=b; |
| |
515 cnv->toULength=1; |
| |
516 state=defineOne; |
| |
517 goto singleByteMode; |
| |
518 } else if(b==UDX) { |
| |
519 isSingleByteMode=TRUE; |
| |
520 cnv->toUBytes[0]=b; |
| |
521 cnv->toULength=1; |
| |
522 state=definePairOne; |
| |
523 goto singleByteMode; |
| |
524 } else if(b==UQU) { |
| |
525 cnv->toUBytes[0]=b; |
| |
526 cnv->toULength=1; |
| |
527 state=quotePairOne; |
| |
528 } else /* Urs */ { |
| |
529 /* callback(illegal) */ |
| |
530 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
531 cnv->toUBytes[0]=b; |
| |
532 cnv->toULength=1; |
| |
533 goto endloop; |
| |
534 } |
| |
535 break; |
| |
536 case quotePairOne: |
| |
537 byteOne=b; |
| |
538 cnv->toUBytes[1]=b; |
| |
539 cnv->toULength=2; |
| |
540 state=quotePairTwo; |
| |
541 break; |
| |
542 case quotePairTwo: |
| |
543 *target++=(UChar)((byteOne<<8)|b); |
| |
544 if(offsets!=NULL) { |
| |
545 *offsets++=sourceIndex; |
| |
546 } |
| |
547 sourceIndex=nextSourceIndex; |
| |
548 state=readCommand; |
| |
549 goto fastUnicode; |
| |
550 } |
| |
551 } |
| |
552 } |
| |
553 endloop: |
| |
554 |
| |
555 /* set the converter state back into UConverter */ |
| |
556 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| |
557 /* reset to deal with the next character */ |
| |
558 state=readCommand; |
| |
559 } else if(state==readCommand) { |
| |
560 /* not in a multi-byte sequence, reset toULength */ |
| |
561 cnv->toULength=0; |
| |
562 } |
| |
563 scsu->toUIsSingleByteMode=isSingleByteMode; |
| |
564 scsu->toUState=state; |
| |
565 scsu->toUQuoteWindow=quoteWindow; |
| |
566 scsu->toUDynamicWindow=dynamicWindow; |
| |
567 scsu->toUByteOne=byteOne; |
| |
568 |
| |
569 /* write back the updated pointers */ |
| |
570 pArgs->source=(const char *)source; |
| |
571 pArgs->target=target; |
| |
572 pArgs->offsets=offsets; |
| |
573 return; |
| |
574 } |
| |
575 |
| |
576 /* |
| |
577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. |
| |
578 * If a change is made in the original function, then either |
| |
579 * change this function the same way or |
| |
580 * re-copy the original function and remove the variables |
| |
581 * offsets, sourceIndex, and nextSourceIndex. |
| |
582 */ |
| |
583 static void |
| |
584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, |
| |
585 UErrorCode *pErrorCode) { |
| |
586 UConverter *cnv; |
| |
587 SCSUData *scsu; |
| |
588 const uint8_t *source, *sourceLimit; |
| |
589 UChar *target; |
| |
590 const UChar *targetLimit; |
| |
591 UBool isSingleByteMode; |
| |
592 uint8_t state, byteOne; |
| |
593 int8_t quoteWindow, dynamicWindow; |
| |
594 |
| |
595 uint8_t b; |
| |
596 |
| |
597 /* set up the local pointers */ |
| |
598 cnv=pArgs->converter; |
| |
599 scsu=(SCSUData *)cnv->extraInfo; |
| |
600 |
| |
601 source=(const uint8_t *)pArgs->source; |
| |
602 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| |
603 target=pArgs->target; |
| |
604 targetLimit=pArgs->targetLimit; |
| |
605 |
| |
606 /* get the state machine state */ |
| |
607 isSingleByteMode=scsu->toUIsSingleByteMode; |
| |
608 state=scsu->toUState; |
| |
609 quoteWindow=scsu->toUQuoteWindow; |
| |
610 dynamicWindow=scsu->toUDynamicWindow; |
| |
611 byteOne=scsu->toUByteOne; |
| |
612 |
| |
613 /* |
| |
614 * conversion "loop" |
| |
615 * |
| |
616 * For performance, this is not a normal C loop. |
| |
617 * Instead, there are two code blocks for the two SCSU modes. |
| |
618 * The function branches to either one, and a change of the mode is done with a goto to |
| |
619 * the other branch. |
| |
620 * |
| |
621 * Each branch has two conventional loops: |
| |
622 * - a fast-path loop for the most common codes in the mode |
| |
623 * - a loop for all other codes in the mode |
| |
624 * When the fast-path runs into a code that it cannot handle, its loop ends and it |
| |
625 * runs into the following loop to handle the other codes. |
| |
626 * The end of the input or output buffer is also handled by the slower loop. |
| |
627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. |
| |
628 * |
| |
629 * The callback handling is done by returning with an error code. |
| |
630 * The conversion framework actually calls the callback function. |
| |
631 */ |
| |
632 if(isSingleByteMode) { |
| |
633 /* fast path for single-byte mode */ |
| |
634 if(state==readCommand) { |
| |
635 fastSingle: |
| |
636 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { |
| |
637 ++source; |
| |
638 if(b<=0x7f) { |
| |
639 /* write US-ASCII graphic character or DEL */ |
| |
640 *target++=(UChar)b; |
| |
641 } else { |
| |
642 /* write from dynamic window */ |
| |
643 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
| |
644 if(c<=0xffff) { |
| |
645 *target++=(UChar)c; |
| |
646 } else { |
| |
647 /* output surrogate pair */ |
| |
648 *target++=(UChar)(0xd7c0+(c>>10)); |
| |
649 if(target<targetLimit) { |
| |
650 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| |
651 } else { |
| |
652 /* target overflow */ |
| |
653 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| |
654 cnv->UCharErrorBufferLength=1; |
| |
655 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
656 goto endloop; |
| |
657 } |
| |
658 } |
| |
659 } |
| |
660 } |
| |
661 } |
| |
662 |
| |
663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ |
| |
664 singleByteMode: |
| |
665 while(source<sourceLimit) { |
| |
666 if(target>=targetLimit) { |
| |
667 /* target is full */ |
| |
668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
669 break; |
| |
670 } |
| |
671 b=*source++; |
| |
672 switch(state) { |
| |
673 case readCommand: |
| |
674 /* redundant conditions are commented out */ |
| |
675 /* here: b<0x20 because otherwise we would be in fastSingle */ |
| |
676 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
| |
677 /* CR/LF/TAB/NUL */ |
| |
678 *target++=(UChar)b; |
| |
679 goto fastSingle; |
| |
680 } else if(SC0<=b) { |
| |
681 if(b<=SC7) { |
| |
682 dynamicWindow=(int8_t)(b-SC0); |
| |
683 goto fastSingle; |
| |
684 } else /* if(SD0<=b && b<=SD7) */ { |
| |
685 dynamicWindow=(int8_t)(b-SD0); |
| |
686 state=defineOne; |
| |
687 } |
| |
688 } else if(/* SQ0<=b && */ b<=SQ7) { |
| |
689 quoteWindow=(int8_t)(b-SQ0); |
| |
690 state=quoteOne; |
| |
691 } else if(b==SDX) { |
| |
692 state=definePairOne; |
| |
693 } else if(b==SQU) { |
| |
694 state=quotePairOne; |
| |
695 } else if(b==SCU) { |
| |
696 isSingleByteMode=FALSE; |
| |
697 goto fastUnicode; |
| |
698 } else /* Srs */ { |
| |
699 /* callback(illegal) */ |
| |
700 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
701 cnv->toUBytes[0]=b; |
| |
702 cnv->toULength=1; |
| |
703 goto endloop; |
| |
704 } |
| |
705 |
| |
706 /* store the first byte of a multibyte sequence in toUBytes[] */ |
| |
707 cnv->toUBytes[0]=b; |
| |
708 cnv->toULength=1; |
| |
709 break; |
| |
710 case quotePairOne: |
| |
711 byteOne=b; |
| |
712 cnv->toUBytes[1]=b; |
| |
713 cnv->toULength=2; |
| |
714 state=quotePairTwo; |
| |
715 break; |
| |
716 case quotePairTwo: |
| |
717 *target++=(UChar)((byteOne<<8)|b); |
| |
718 state=readCommand; |
| |
719 goto fastSingle; |
| |
720 case quoteOne: |
| |
721 if(b<0x80) { |
| |
722 /* all static offsets are in the BMP */ |
| |
723 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
| |
724 } else { |
| |
725 /* write from dynamic window */ |
| |
726 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
| |
727 if(c<=0xffff) { |
| |
728 *target++=(UChar)c; |
| |
729 } else { |
| |
730 /* output surrogate pair */ |
| |
731 *target++=(UChar)(0xd7c0+(c>>10)); |
| |
732 if(target<targetLimit) { |
| |
733 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| |
734 } else { |
| |
735 /* target overflow */ |
| |
736 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| |
737 cnv->UCharErrorBufferLength=1; |
| |
738 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
739 goto endloop; |
| |
740 } |
| |
741 } |
| |
742 } |
| |
743 state=readCommand; |
| |
744 goto fastSingle; |
| |
745 case definePairOne: |
| |
746 dynamicWindow=(int8_t)((b>>5)&7); |
| |
747 byteOne=(uint8_t)(b&0x1f); |
| |
748 cnv->toUBytes[1]=b; |
| |
749 cnv->toULength=2; |
| |
750 state=definePairTwo; |
| |
751 break; |
| |
752 case definePairTwo: |
| |
753 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); |
| |
754 state=readCommand; |
| |
755 goto fastSingle; |
| |
756 case defineOne: |
| |
757 if(b==0) { |
| |
758 /* callback(illegal): Reserved window offset value 0 */ |
| |
759 cnv->toUBytes[1]=b; |
| |
760 cnv->toULength=2; |
| |
761 goto endloop; |
| |
762 } else if(b<gapThreshold) { |
| |
763 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
| |
764 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { |
| |
765 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
| |
766 } else if(b>=fixedThreshold) { |
| |
767 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; |
| |
768 } else { |
| |
769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ |
| |
770 cnv->toUBytes[1]=b; |
| |
771 cnv->toULength=2; |
| |
772 goto endloop; |
| |
773 } |
| |
774 state=readCommand; |
| |
775 goto fastSingle; |
| |
776 } |
| |
777 } |
| |
778 } else { |
| |
779 /* fast path for Unicode mode */ |
| |
780 if(state==readCommand) { |
| |
781 fastUnicode: |
| |
782 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { |
| |
783 *target++=(UChar)((b<<8)|source[1]); |
| |
784 source+=2; |
| |
785 } |
| |
786 } |
| |
787 |
| |
788 /* normal state machine for Unicode mode */ |
| |
789 /* unicodeByteMode: */ |
| |
790 while(source<sourceLimit) { |
| |
791 if(target>=targetLimit) { |
| |
792 /* target is full */ |
| |
793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
794 break; |
| |
795 } |
| |
796 b=*source++; |
| |
797 switch(state) { |
| |
798 case readCommand: |
| |
799 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
| |
800 byteOne=b; |
| |
801 cnv->toUBytes[0]=b; |
| |
802 cnv->toULength=1; |
| |
803 state=quotePairTwo; |
| |
804 } else if(/* UC0<=b && */ b<=UC7) { |
| |
805 dynamicWindow=(int8_t)(b-UC0); |
| |
806 isSingleByteMode=TRUE; |
| |
807 goto fastSingle; |
| |
808 } else if(/* UD0<=b && */ b<=UD7) { |
| |
809 dynamicWindow=(int8_t)(b-UD0); |
| |
810 isSingleByteMode=TRUE; |
| |
811 cnv->toUBytes[0]=b; |
| |
812 cnv->toULength=1; |
| |
813 state=defineOne; |
| |
814 goto singleByteMode; |
| |
815 } else if(b==UDX) { |
| |
816 isSingleByteMode=TRUE; |
| |
817 cnv->toUBytes[0]=b; |
| |
818 cnv->toULength=1; |
| |
819 state=definePairOne; |
| |
820 goto singleByteMode; |
| |
821 } else if(b==UQU) { |
| |
822 cnv->toUBytes[0]=b; |
| |
823 cnv->toULength=1; |
| |
824 state=quotePairOne; |
| |
825 } else /* Urs */ { |
| |
826 /* callback(illegal) */ |
| |
827 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
828 cnv->toUBytes[0]=b; |
| |
829 cnv->toULength=1; |
| |
830 goto endloop; |
| |
831 } |
| |
832 break; |
| |
833 case quotePairOne: |
| |
834 byteOne=b; |
| |
835 cnv->toUBytes[1]=b; |
| |
836 cnv->toULength=2; |
| |
837 state=quotePairTwo; |
| |
838 break; |
| |
839 case quotePairTwo: |
| |
840 *target++=(UChar)((byteOne<<8)|b); |
| |
841 state=readCommand; |
| |
842 goto fastUnicode; |
| |
843 } |
| |
844 } |
| |
845 } |
| |
846 endloop: |
| |
847 |
| |
848 /* set the converter state back into UConverter */ |
| |
849 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| |
850 /* reset to deal with the next character */ |
| |
851 state=readCommand; |
| |
852 } else if(state==readCommand) { |
| |
853 /* not in a multi-byte sequence, reset toULength */ |
| |
854 cnv->toULength=0; |
| |
855 } |
| |
856 scsu->toUIsSingleByteMode=isSingleByteMode; |
| |
857 scsu->toUState=state; |
| |
858 scsu->toUQuoteWindow=quoteWindow; |
| |
859 scsu->toUDynamicWindow=dynamicWindow; |
| |
860 scsu->toUByteOne=byteOne; |
| |
861 |
| |
862 /* write back the updated pointers */ |
| |
863 pArgs->source=(const char *)source; |
| |
864 pArgs->target=target; |
| |
865 return; |
| |
866 } |
| |
867 |
| |
868 /* SCSU-from-Unicode conversion functions ----------------------------------- */ |
| |
869 |
| |
870 /* |
| |
871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve |
| |
872 * reasonable results. The lookahead is minimal. |
| |
873 * Many cases are simple: |
| |
874 * A character fits directly into the current mode, a dynamic or static window, |
| |
875 * or is not compressible. These cases are tested first. |
| |
876 * Real compression heuristics are applied to the rest, in code branches for |
| |
877 * single/Unicode mode and BMP/supplementary code points. |
| |
878 * The heuristics used here are extremely simple. |
| |
879 */ |
| |
880 |
| |
/*
 * Find the window that contains the character.
 *
 * offsets: start offsets of 8 windows, each covering 0x80 code points
 * c:       the code point to look up
 * Returns the index (0..7) of the first window with
 * offsets[i]<=c<=offsets[i]+0x7f, or -1 if there is none.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t window=0;

    while(window<8) {
        /* unsigned difference: wraps to a large value when c is below the window start */
        uint32_t delta=(uint32_t)(c-offsets[window]);
        if(delta<0x80) {
            return window;
        }
        ++window;
    }
    return -1;
}
| |
892 |
| |
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ |
| |
894 static UBool |
| |
895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { |
| |
896 return (UBool)(c<=offset+0x7f && |
| |
897 (c>=offset || (c<=0x7f && |
| |
898 (c>=0x20 || (1UL<<c)&0x2601)))); |
| |
899 /* binary 0010 0110 0000 0001, |
| |
900 check for b==0xd || b==0xa || b==9 || b==0 */ |
| |
901 } |
| |
902 |
| |
903 /* |
| |
904 * getNextDynamicWindow returns the next dynamic window to be redefined |
| |
905 */ |
| |
906 static int8_t |
| |
907 getNextDynamicWindow(SCSUData *scsu) { |
| |
908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; |
| |
909 if(++scsu->nextWindowUseIndex==8) { |
| |
910 scsu->nextWindowUseIndex=0; |
| |
911 } |
| |
912 return window; |
| |
913 } |
| |
914 |
| |
915 /* |
| |
916 * useDynamicWindow() adjusts |
| |
917 * windowUse[] and nextWindowUseIndex for the algorithm to choose |
| |
918 * the next dynamic window to be defined; |
| |
919 * a subclass may override it and provide its own algorithm. |
| |
920 */ |
| |
921 static void |
| |
922 useDynamicWindow(SCSUData *scsu, int8_t window) { |
| |
923 /* |
| |
924 * move the existing window, which just became the most recently used one, |
| |
925 * up in windowUse[] to nextWindowUseIndex-1 |
| |
926 */ |
| |
927 |
| |
928 /* first, find the index of the window - backwards to favor the more recently used windows */ |
| |
929 int i, j; |
| |
930 |
| |
931 i=scsu->nextWindowUseIndex; |
| |
932 do { |
| |
933 if(--i<0) { |
| |
934 i=7; |
| |
935 } |
| |
936 } while(scsu->windowUse[i]!=window); |
| |
937 |
| |
938 /* now copy each windowUse[i+1] to [i] */ |
| |
939 j=i+1; |
| |
940 if(j==8) { |
| |
941 j=0; |
| |
942 } |
| |
943 while(j!=scsu->nextWindowUseIndex) { |
| |
944 scsu->windowUse[i]=scsu->windowUse[j]; |
| |
945 i=j; |
| |
946 if(++j==8) { j=0; } |
| |
947 } |
| |
948 |
| |
949 /* finally, set the window into the most recently used index */ |
| |
950 scsu->windowUse[i]=window; |
| |
951 } |
| |
952 |
| |
953 /* |
| |
954 * calculate the offset and the code for a dynamic window that contains the character |
| |
955 * takes fixed offsets into account |
| |
956 * the offset of the window is stored in the offset variable, |
| |
957 * the code is returned |
| |
958 * |
| |
959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code |
| |
960 */ |
| |
961 static int |
| |
962 getDynamicOffset(uint32_t c, uint32_t *pOffset) { |
| |
963 int i; |
| |
964 |
| |
965 for(i=0; i<7; ++i) { |
| |
966 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { |
| |
967 *pOffset=fixedOffsets[i]; |
| |
968 return 0xf9+i; |
| |
969 } |
| |
970 } |
| |
971 |
| |
972 if(c<0x80) { |
| |
973 /* No dynamic window for US-ASCII. */ |
| |
974 return -1; |
| |
975 } else if(c<0x3400 || |
| |
976 (uint32_t)(c-0x10000)<(0x14000-0x10000) || |
| |
977 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) |
| |
978 ) { |
| |
979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ |
| |
980 *pOffset=c&0x7fffff80; |
| |
981 return (int)(c>>7); |
| |
982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { |
| |
983 /* For these characters we need to take the gapOffset into account. */ |
| |
984 *pOffset=c&0x7fffff80; |
| |
985 return (int)((c-gapOffset)>>7); |
| |
986 } else { |
| |
987 return -1; |
| |
988 } |
| |
989 } |
| |
990 |
| |
991 /* |
| |
992 * Idea for compression: |
| |
993 * - save SCSUData and other state before really starting work |
| |
994 * - at endloop, see if compression could be better with just unicode mode |
| |
995 * - don't do this if a callback has been called |
| |
996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning |
| |
997 * - different buffer handling! |
| |
998 * |
| |
999 * Drawback or need for corrective handling: |
| |
1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and |
| |
1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible |
| |
1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers. |
| |
1003 * |
| |
1004 * How to achieve both? |
| |
1005 * - Only replace the result after an SDX or SCU? |
| |
1006 */ |
| |
1007 |
| |
1008 static void |
| |
1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
| |
1010 UErrorCode *pErrorCode) { |
| |
1011 UConverter *cnv; |
| |
1012 SCSUData *scsu; |
| |
1013 const UChar *source, *sourceLimit; |
| |
1014 uint8_t *target; |
| |
1015 int32_t targetCapacity; |
| |
1016 int32_t *offsets; |
| |
1017 |
| |
1018 UBool isSingleByteMode; |
| |
1019 uint8_t dynamicWindow; |
| |
1020 uint32_t currentOffset; |
| |
1021 |
| |
1022 uint32_t c, delta; |
| |
1023 |
| |
1024 int32_t sourceIndex, nextSourceIndex; |
| |
1025 |
| |
1026 int32_t length; |
| |
1027 |
| |
1028 /* variables for compression heuristics */ |
| |
1029 uint32_t offset; |
| |
1030 UChar lead, trail; |
| |
1031 int code; |
| |
1032 int8_t window; |
| |
1033 |
| |
1034 /* set up the local pointers */ |
| |
1035 cnv=pArgs->converter; |
| |
1036 scsu=(SCSUData *)cnv->extraInfo; |
| |
1037 |
| |
1038 /* set up the local pointers */ |
| |
1039 source=pArgs->source; |
| |
1040 sourceLimit=pArgs->sourceLimit; |
| |
1041 target=(uint8_t *)pArgs->target; |
| |
1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
| |
1043 offsets=pArgs->offsets; |
| |
1044 |
| |
1045 /* get the state machine state */ |
| |
1046 isSingleByteMode=scsu->fromUIsSingleByteMode; |
| |
1047 dynamicWindow=scsu->fromUDynamicWindow; |
| |
1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1049 |
| |
1050 c=cnv->fromUChar32; |
| |
1051 |
| |
1052 /* sourceIndex=-1 if the current character began in the previous buffer */ |
| |
1053 sourceIndex= c==0 ? 0 : -1; |
| |
1054 nextSourceIndex=0; |
| |
1055 |
| |
1056 /* similar conversion "loop" as in toUnicode */ |
| |
1057 loop: |
| |
1058 if(isSingleByteMode) { |
| |
1059 if(c!=0 && targetCapacity>0) { |
| |
1060 goto getTrailSingle; |
| |
1061 } |
| |
1062 |
| |
1063 /* state machine for single-byte mode */ |
| |
1064 /* singleByteMode: */ |
| |
1065 while(source<sourceLimit) { |
| |
1066 if(targetCapacity<=0) { |
| |
1067 /* target is full */ |
| |
1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1069 break; |
| |
1070 } |
| |
1071 c=*source++; |
| |
1072 ++nextSourceIndex; |
| |
1073 |
| |
1074 if((c-0x20)<=0x5f) { |
| |
1075 /* pass US-ASCII graphic character through */ |
| |
1076 *target++=(uint8_t)c; |
| |
1077 if(offsets!=NULL) { |
| |
1078 *offsets++=sourceIndex; |
| |
1079 } |
| |
1080 --targetCapacity; |
| |
1081 } else if(c<0x20) { |
| |
1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
| |
1083 /* CR/LF/TAB/NUL */ |
| |
1084 *target++=(uint8_t)c; |
| |
1085 if(offsets!=NULL) { |
| |
1086 *offsets++=sourceIndex; |
| |
1087 } |
| |
1088 --targetCapacity; |
| |
1089 } else { |
| |
1090 /* quote C0 control character */ |
| |
1091 c|=SQ0<<8; |
| |
1092 length=2; |
| |
1093 goto outputBytes; |
| |
1094 } |
| |
1095 } else if((delta=c-currentOffset)<=0x7f) { |
| |
1096 /* use the current dynamic window */ |
| |
1097 *target++=(uint8_t)(delta|0x80); |
| |
1098 if(offsets!=NULL) { |
| |
1099 *offsets++=sourceIndex; |
| |
1100 } |
| |
1101 --targetCapacity; |
| |
1102 } else if(U16_IS_SURROGATE(c)) { |
| |
1103 if(U16_IS_SURROGATE_LEAD(c)) { |
| |
1104 getTrailSingle: |
| |
1105 lead=(UChar)c; |
| |
1106 if(source<sourceLimit) { |
| |
1107 /* test the following code unit */ |
| |
1108 trail=*source; |
| |
1109 if(U16_IS_TRAIL(trail)) { |
| |
1110 ++source; |
| |
1111 ++nextSourceIndex; |
| |
1112 c=U16_GET_SUPPLEMENTARY(c, trail); |
| |
1113 /* convert this surrogate code point */ |
| |
1114 /* exit this condition tree */ |
| |
1115 } else { |
| |
1116 /* this is an unmatched lead code unit (1st surrogate) */ |
| |
1117 /* callback(illegal) */ |
| |
1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1119 goto endloop; |
| |
1120 } |
| |
1121 } else { |
| |
1122 /* no more input */ |
| |
1123 break; |
| |
1124 } |
| |
1125 } else { |
| |
1126 /* this is an unmatched trail code unit (2nd surrogate) */ |
| |
1127 /* callback(illegal) */ |
| |
1128 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1129 goto endloop; |
| |
1130 } |
| |
1131 |
| |
1132 /* compress supplementary character U+10000..U+10ffff */ |
| |
1133 if((delta=c-currentOffset)<=0x7f) { |
| |
1134 /* use the current dynamic window */ |
| |
1135 *target++=(uint8_t)(delta|0x80); |
| |
1136 if(offsets!=NULL) { |
| |
1137 *offsets++=sourceIndex; |
| |
1138 } |
| |
1139 --targetCapacity; |
| |
1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1141 /* there is a dynamic window that contains this character, change to it */ |
| |
1142 dynamicWindow=window; |
| |
1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1144 useDynamicWindow(scsu, dynamicWindow); |
| |
1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1146 length=2; |
| |
1147 goto outputBytes; |
| |
1148 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1149 /* might check if there are more characters in this window to come */ |
| |
1150 /* define an extended window with this character */ |
| |
1151 code-=0x200; |
| |
1152 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1154 useDynamicWindow(scsu, dynamicWindow); |
| |
1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1156 length=4; |
| |
1157 goto outputBytes; |
| |
1158 } else { |
| |
1159 /* change to Unicode mode and output this (lead, trail) pair */ |
| |
1160 isSingleByteMode=FALSE; |
| |
1161 *target++=(uint8_t)SCU; |
| |
1162 if(offsets!=NULL) { |
| |
1163 *offsets++=sourceIndex; |
| |
1164 } |
| |
1165 --targetCapacity; |
| |
1166 c=((uint32_t)lead<<16)|trail; |
| |
1167 length=4; |
| |
1168 goto outputBytes; |
| |
1169 } |
| |
1170 } else if(c<0xa0) { |
| |
1171 /* quote C1 control character */ |
| |
1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
| |
1173 length=2; |
| |
1174 goto outputBytes; |
| |
1175 } else if(c==0xfeff || c>=0xfff0) { |
| |
1176 /* quote signature character=byte order mark and specials */ |
| |
1177 c|=SQU<<16; |
| |
1178 length=3; |
| |
1179 goto outputBytes; |
| |
1180 } else { |
| |
1181 /* compress all other BMP characters */ |
| |
1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1183 /* there is a window defined that contains this character - switch to it or quote from it? */ |
| |
1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
| |
1185 /* change to dynamic window */ |
| |
1186 dynamicWindow=window; |
| |
1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1188 useDynamicWindow(scsu, dynamicWindow); |
| |
1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1190 length=2; |
| |
1191 goto outputBytes; |
| |
1192 } else { |
| |
1193 /* quote from dynamic window */ |
| |
1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
| |
1195 length=2; |
| |
1196 goto outputBytes; |
| |
1197 } |
| |
1198 } else if((window=getWindow(staticOffsets, c))>=0) { |
| |
1199 /* quote from static window */ |
| |
1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
| |
1201 length=2; |
| |
1202 goto outputBytes; |
| |
1203 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1204 /* define a dynamic window with this character */ |
| |
1205 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1207 useDynamicWindow(scsu, dynamicWindow); |
| |
1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1209 length=3; |
| |
1210 goto outputBytes; |
| |
1211 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
| |
1212 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
| |
1213 ) { |
| |
1214 /* |
| |
1215 * this character is not compressible (a BMP ideograph or similar); |
| |
1216 * switch to Unicode mode if this is the last character in the block |
| |
1217 * or there is at least one more ideograph following immediately |
| |
1218 */ |
| |
1219 isSingleByteMode=FALSE; |
| |
1220 c|=SCU<<16; |
| |
1221 length=3; |
| |
1222 goto outputBytes; |
| |
1223 } else { |
| |
1224 /* quote Unicode */ |
| |
1225 c|=SQU<<16; |
| |
1226 length=3; |
| |
1227 goto outputBytes; |
| |
1228 } |
| |
1229 } |
| |
1230 |
| |
1231 /* normal end of conversion: prepare for a new character */ |
| |
1232 c=0; |
| |
1233 sourceIndex=nextSourceIndex; |
| |
1234 } |
| |
1235 } else { |
| |
1236 if(c!=0 && targetCapacity>0) { |
| |
1237 goto getTrailUnicode; |
| |
1238 } |
| |
1239 |
| |
1240 /* state machine for Unicode mode */ |
| |
1241 /* unicodeByteMode: */ |
| |
1242 while(source<sourceLimit) { |
| |
1243 if(targetCapacity<=0) { |
| |
1244 /* target is full */ |
| |
1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1246 break; |
| |
1247 } |
| |
1248 c=*source++; |
| |
1249 ++nextSourceIndex; |
| |
1250 |
| |
1251 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
| |
1252 /* not compressible, write character directly */ |
| |
1253 if(targetCapacity>=2) { |
| |
1254 *target++=(uint8_t)(c>>8); |
| |
1255 *target++=(uint8_t)c; |
| |
1256 if(offsets!=NULL) { |
| |
1257 *offsets++=sourceIndex; |
| |
1258 *offsets++=sourceIndex; |
| |
1259 } |
| |
1260 targetCapacity-=2; |
| |
1261 } else { |
| |
1262 length=2; |
| |
1263 goto outputBytes; |
| |
1264 } |
| |
1265 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
| |
1266 /* compress BMP character if the following one is not an uncompressible ideograph */ |
| |
1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
| |
1268 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
| |
1269 /* ASCII digit or letter */ |
| |
1270 isSingleByteMode=TRUE; |
| |
1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
| |
1272 length=2; |
| |
1273 goto outputBytes; |
| |
1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1275 /* there is a dynamic window that contains this character, change to it */ |
| |
1276 isSingleByteMode=TRUE; |
| |
1277 dynamicWindow=window; |
| |
1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1279 useDynamicWindow(scsu, dynamicWindow); |
| |
1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1281 length=2; |
| |
1282 goto outputBytes; |
| |
1283 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1284 /* define a dynamic window with this character */ |
| |
1285 isSingleByteMode=TRUE; |
| |
1286 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1288 useDynamicWindow(scsu, dynamicWindow); |
| |
1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1290 length=3; |
| |
1291 goto outputBytes; |
| |
1292 } |
| |
1293 } |
| |
1294 |
| |
1295 /* don't know how to compress this character, just write it directly */ |
| |
1296 length=2; |
| |
1297 goto outputBytes; |
| |
1298 } else if(c<0xe000) { |
| |
1299 /* c is a surrogate */ |
| |
1300 if(U16_IS_SURROGATE_LEAD(c)) { |
| |
1301 getTrailUnicode: |
| |
1302 lead=(UChar)c; |
| |
1303 if(source<sourceLimit) { |
| |
1304 /* test the following code unit */ |
| |
1305 trail=*source; |
| |
1306 if(U16_IS_TRAIL(trail)) { |
| |
1307 ++source; |
| |
1308 ++nextSourceIndex; |
| |
1309 c=U16_GET_SUPPLEMENTARY(c, trail); |
| |
1310 /* convert this surrogate code point */ |
| |
1311 /* exit this condition tree */ |
| |
1312 } else { |
| |
1313 /* this is an unmatched lead code unit (1st surrogate) */ |
| |
1314 /* callback(illegal) */ |
| |
1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1316 goto endloop; |
| |
1317 } |
| |
1318 } else { |
| |
1319 /* no more input */ |
| |
1320 break; |
| |
1321 } |
| |
1322 } else { |
| |
1323 /* this is an unmatched trail code unit (2nd surrogate) */ |
| |
1324 /* callback(illegal) */ |
| |
1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1326 goto endloop; |
| |
1327 } |
| |
1328 |
| |
1329 /* compress supplementary character */ |
| |
1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
| |
1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
| |
1332 ) { |
| |
1333 /* |
| |
1334 * there is a dynamic window that contains this character and |
| |
1335 * the following character is not uncompressible, |
| |
1336 * change to the window |
| |
1337 */ |
| |
1338 isSingleByteMode=TRUE; |
| |
1339 dynamicWindow=window; |
| |
1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1341 useDynamicWindow(scsu, dynamicWindow); |
| |
1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1343 length=2; |
| |
1344 goto outputBytes; |
| |
1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
| |
1346 (code=getDynamicOffset(c, &offset))>=0 |
| |
1347 ) { |
| |
1348 /* two supplementary characters in (probably) the same window - define an extended one */ |
| |
1349 isSingleByteMode=TRUE; |
| |
1350 code-=0x200; |
| |
1351 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1353 useDynamicWindow(scsu, dynamicWindow); |
| |
1354 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1355 length=4; |
| |
1356 goto outputBytes; |
| |
1357 } else { |
| |
1358 /* don't know how to compress this character, just write it directly */ |
| |
1359 c=((uint32_t)lead<<16)|trail; |
| |
1360 length=4; |
| |
1361 goto outputBytes; |
| |
1362 } |
| |
1363 } else /* 0xe000<=c<0xf300 */ { |
| |
1364 /* quote to avoid SCSU tags */ |
| |
1365 c|=UQU<<16; |
| |
1366 length=3; |
| |
1367 goto outputBytes; |
| |
1368 } |
| |
1369 |
| |
1370 /* normal end of conversion: prepare for a new character */ |
| |
1371 c=0; |
| |
1372 sourceIndex=nextSourceIndex; |
| |
1373 } |
| |
1374 } |
| |
1375 endloop: |
| |
1376 |
| |
1377 /* set the converter state back into UConverter */ |
| |
1378 scsu->fromUIsSingleByteMode=isSingleByteMode; |
| |
1379 scsu->fromUDynamicWindow=dynamicWindow; |
| |
1380 |
| |
1381 cnv->fromUChar32=c; |
| |
1382 |
| |
1383 /* write back the updated pointers */ |
| |
1384 pArgs->source=source; |
| |
1385 pArgs->target=(char *)target; |
| |
1386 pArgs->offsets=offsets; |
| |
1387 return; |
| |
1388 |
| |
1389 outputBytes: |
| |
1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
| |
1391 /* from the first if in the loop we know that targetCapacity>0 */ |
| |
1392 if(length<=targetCapacity) { |
| |
1393 if(offsets==NULL) { |
| |
1394 switch(length) { |
| |
1395 /* each branch falls through to the next one */ |
| |
1396 case 4: |
| |
1397 *target++=(uint8_t)(c>>24); |
| |
1398 case 3: /*fall through*/ |
| |
1399 *target++=(uint8_t)(c>>16); |
| |
1400 case 2: /*fall through*/ |
| |
1401 *target++=(uint8_t)(c>>8); |
| |
1402 case 1: /*fall through*/ |
| |
1403 *target++=(uint8_t)c; |
| |
1404 default: |
| |
1405 /* will never occur */ |
| |
1406 break; |
| |
1407 } |
| |
1408 } else { |
| |
1409 switch(length) { |
| |
1410 /* each branch falls through to the next one */ |
| |
1411 case 4: |
| |
1412 *target++=(uint8_t)(c>>24); |
| |
1413 *offsets++=sourceIndex; |
| |
1414 case 3: /*fall through*/ |
| |
1415 *target++=(uint8_t)(c>>16); |
| |
1416 *offsets++=sourceIndex; |
| |
1417 case 2: /*fall through*/ |
| |
1418 *target++=(uint8_t)(c>>8); |
| |
1419 *offsets++=sourceIndex; |
| |
1420 case 1: /*fall through*/ |
| |
1421 *target++=(uint8_t)c; |
| |
1422 *offsets++=sourceIndex; |
| |
1423 default: |
| |
1424 /* will never occur */ |
| |
1425 break; |
| |
1426 } |
| |
1427 } |
| |
1428 targetCapacity-=length; |
| |
1429 |
| |
1430 /* normal end of conversion: prepare for a new character */ |
| |
1431 c=0; |
| |
1432 sourceIndex=nextSourceIndex; |
| |
1433 goto loop; |
| |
1434 } else { |
| |
1435 uint8_t *p; |
| |
1436 |
| |
1437 /* |
| |
1438 * We actually do this backwards here: |
| |
1439 * In order to save an intermediate variable, we output |
| |
1440 * first to the overflow buffer what does not fit into the |
| |
1441 * regular target. |
| |
1442 */ |
| |
1443 /* we know that 0<=targetCapacity<length<=4 */ |
| |
1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
| |
1445 length-=targetCapacity; |
| |
1446 p=(uint8_t *)cnv->charErrorBuffer; |
| |
1447 switch(length) { |
| |
1448 /* each branch falls through to the next one */ |
| |
1449 case 4: |
| |
1450 *p++=(uint8_t)(c>>24); |
| |
1451 case 3: /*fall through*/ |
| |
1452 *p++=(uint8_t)(c>>16); |
| |
1453 case 2: /*fall through*/ |
| |
1454 *p++=(uint8_t)(c>>8); |
| |
1455 case 1: /*fall through*/ |
| |
1456 *p=(uint8_t)c; |
| |
1457 default: |
| |
1458 /* will never occur */ |
| |
1459 break; |
| |
1460 } |
| |
1461 cnv->charErrorBufferLength=(int8_t)length; |
| |
1462 |
| |
1463 /* now output what fits into the regular target */ |
| |
1464 c>>=8*length; /* length was reduced by targetCapacity */ |
| |
1465 switch(targetCapacity) { |
| |
1466 /* each branch falls through to the next one */ |
| |
1467 case 3: |
| |
1468 *target++=(uint8_t)(c>>16); |
| |
1469 if(offsets!=NULL) { |
| |
1470 *offsets++=sourceIndex; |
| |
1471 } |
| |
1472 case 2: /*fall through*/ |
| |
1473 *target++=(uint8_t)(c>>8); |
| |
1474 if(offsets!=NULL) { |
| |
1475 *offsets++=sourceIndex; |
| |
1476 } |
| |
1477 case 1: /*fall through*/ |
| |
1478 *target++=(uint8_t)c; |
| |
1479 if(offsets!=NULL) { |
| |
1480 *offsets++=sourceIndex; |
| |
1481 } |
| |
1482 default: |
| |
1483 break; |
| |
1484 } |
| |
1485 |
| |
1486 /* target overflow */ |
| |
1487 targetCapacity=0; |
| |
1488 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1489 c=0; |
| |
1490 goto endloop; |
| |
1491 } |
| |
1492 } |
| |
1493 |
| |
1494 /* |
| |
1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. |
| |
1496 * If a change is made in the original function, then either |
| |
1497 * change this function the same way or |
| |
1498 * re-copy the original function and remove the variables |
| |
1499 * offsets, sourceIndex, and nextSourceIndex. |
| |
1500 */ |
| |
1501 static void |
| |
1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, |
| |
1503 UErrorCode *pErrorCode) { |
| |
1504 UConverter *cnv; |
| |
1505 SCSUData *scsu; |
| |
1506 const UChar *source, *sourceLimit; |
| |
1507 uint8_t *target; |
| |
1508 int32_t targetCapacity; |
| |
1509 |
| |
1510 UBool isSingleByteMode; |
| |
1511 uint8_t dynamicWindow; |
| |
1512 uint32_t currentOffset; |
| |
1513 |
| |
1514 uint32_t c, delta; |
| |
1515 |
| |
1516 int32_t length; |
| |
1517 |
| |
1518 /* variables for compression heuristics */ |
| |
1519 uint32_t offset; |
| |
1520 UChar lead, trail; |
| |
1521 int code; |
| |
1522 int8_t window; |
| |
1523 |
| |
1524 /* set up the local pointers */ |
| |
1525 cnv=pArgs->converter; |
| |
1526 scsu=(SCSUData *)cnv->extraInfo; |
| |
1527 |
| |
1528 /* set up the local pointers */ |
| |
1529 source=pArgs->source; |
| |
1530 sourceLimit=pArgs->sourceLimit; |
| |
1531 target=(uint8_t *)pArgs->target; |
| |
1532 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
| |
1533 |
| |
1534 /* get the state machine state */ |
| |
1535 isSingleByteMode=scsu->fromUIsSingleByteMode; |
| |
1536 dynamicWindow=scsu->fromUDynamicWindow; |
| |
1537 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1538 |
| |
1539 c=cnv->fromUChar32; |
| |
1540 |
| |
1541 /* similar conversion "loop" as in toUnicode */ |
| |
1542 loop: |
| |
1543 if(isSingleByteMode) { |
| |
1544 if(c!=0 && targetCapacity>0) { |
| |
1545 goto getTrailSingle; |
| |
1546 } |
| |
1547 |
| |
1548 /* state machine for single-byte mode */ |
| |
1549 /* singleByteMode: */ |
| |
1550 while(source<sourceLimit) { |
| |
1551 if(targetCapacity<=0) { |
| |
1552 /* target is full */ |
| |
1553 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1554 break; |
| |
1555 } |
| |
1556 c=*source++; |
| |
1557 |
| |
1558 if((c-0x20)<=0x5f) { |
| |
1559 /* pass US-ASCII graphic character through */ |
| |
1560 *target++=(uint8_t)c; |
| |
1561 --targetCapacity; |
| |
1562 } else if(c<0x20) { |
| |
1563 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { |
| |
1564 /* CR/LF/TAB/NUL */ |
| |
1565 *target++=(uint8_t)c; |
| |
1566 --targetCapacity; |
| |
1567 } else { |
| |
1568 /* quote C0 control character */ |
| |
1569 c|=SQ0<<8; |
| |
1570 length=2; |
| |
1571 goto outputBytes; |
| |
1572 } |
| |
1573 } else if((delta=c-currentOffset)<=0x7f) { |
| |
1574 /* use the current dynamic window */ |
| |
1575 *target++=(uint8_t)(delta|0x80); |
| |
1576 --targetCapacity; |
| |
1577 } else if(U16_IS_SURROGATE(c)) { |
| |
1578 if(U16_IS_SURROGATE_LEAD(c)) { |
| |
1579 getTrailSingle: |
| |
1580 lead=(UChar)c; |
| |
1581 if(source<sourceLimit) { |
| |
1582 /* test the following code unit */ |
| |
1583 trail=*source; |
| |
1584 if(U16_IS_TRAIL(trail)) { |
| |
1585 ++source; |
| |
1586 c=U16_GET_SUPPLEMENTARY(c, trail); |
| |
1587 /* convert this surrogate code point */ |
| |
1588 /* exit this condition tree */ |
| |
1589 } else { |
| |
1590 /* this is an unmatched lead code unit (1st surrogate) */ |
| |
1591 /* callback(illegal) */ |
| |
1592 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1593 goto endloop; |
| |
1594 } |
| |
1595 } else { |
| |
1596 /* no more input */ |
| |
1597 break; |
| |
1598 } |
| |
1599 } else { |
| |
1600 /* this is an unmatched trail code unit (2nd surrogate) */ |
| |
1601 /* callback(illegal) */ |
| |
1602 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1603 goto endloop; |
| |
1604 } |
| |
1605 |
| |
1606 /* compress supplementary character U+10000..U+10ffff */ |
| |
1607 if((delta=c-currentOffset)<=0x7f) { |
| |
1608 /* use the current dynamic window */ |
| |
1609 *target++=(uint8_t)(delta|0x80); |
| |
1610 --targetCapacity; |
| |
1611 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1612 /* there is a dynamic window that contains this character, change to it */ |
| |
1613 dynamicWindow=window; |
| |
1614 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1615 useDynamicWindow(scsu, dynamicWindow); |
| |
1616 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1617 length=2; |
| |
1618 goto outputBytes; |
| |
1619 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1620 /* might check if there are more characters in this window to come */ |
| |
1621 /* define an extended window with this character */ |
| |
1622 code-=0x200; |
| |
1623 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1624 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1625 useDynamicWindow(scsu, dynamicWindow); |
| |
1626 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1627 length=4; |
| |
1628 goto outputBytes; |
| |
1629 } else { |
| |
1630 /* change to Unicode mode and output this (lead, trail) pair */ |
| |
1631 isSingleByteMode=FALSE; |
| |
1632 *target++=(uint8_t)SCU; |
| |
1633 --targetCapacity; |
| |
1634 c=((uint32_t)lead<<16)|trail; |
| |
1635 length=4; |
| |
1636 goto outputBytes; |
| |
1637 } |
| |
1638 } else if(c<0xa0) { |
| |
1639 /* quote C1 control character */ |
| |
1640 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
| |
1641 length=2; |
| |
1642 goto outputBytes; |
| |
1643 } else if(c==0xfeff || c>=0xfff0) { |
| |
1644 /* quote signature character=byte order mark and specials */ |
| |
1645 c|=SQU<<16; |
| |
1646 length=3; |
| |
1647 goto outputBytes; |
| |
1648 } else { |
| |
1649 /* compress all other BMP characters */ |
| |
1650 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1651 /* there is a window defined that contains this character - switch to it or quote from it? */ |
| |
1652 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { |
| |
1653 /* change to dynamic window */ |
| |
1654 dynamicWindow=window; |
| |
1655 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1656 useDynamicWindow(scsu, dynamicWindow); |
| |
1657 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1658 length=2; |
| |
1659 goto outputBytes; |
| |
1660 } else { |
| |
1661 /* quote from dynamic window */ |
| |
1662 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; |
| |
1663 length=2; |
| |
1664 goto outputBytes; |
| |
1665 } |
| |
1666 } else if((window=getWindow(staticOffsets, c))>=0) { |
| |
1667 /* quote from static window */ |
| |
1668 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
| |
1669 length=2; |
| |
1670 goto outputBytes; |
| |
1671 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1672 /* define a dynamic window with this character */ |
| |
1673 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1674 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1675 useDynamicWindow(scsu, dynamicWindow); |
| |
1676 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1677 length=3; |
| |
1678 goto outputBytes; |
| |
1679 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
| |
1680 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
| |
1681 ) { |
| |
1682 /* |
| |
1683 * this character is not compressible (a BMP ideograph or similar); |
| |
1684 * switch to Unicode mode if this is the last character in the block |
| |
1685 * or there is at least one more ideograph following immediately |
| |
1686 */ |
| |
1687 isSingleByteMode=FALSE; |
| |
1688 c|=SCU<<16; |
| |
1689 length=3; |
| |
1690 goto outputBytes; |
| |
1691 } else { |
| |
1692 /* quote Unicode */ |
| |
1693 c|=SQU<<16; |
| |
1694 length=3; |
| |
1695 goto outputBytes; |
| |
1696 } |
| |
1697 } |
| |
1698 |
| |
1699 /* normal end of conversion: prepare for a new character */ |
| |
1700 c=0; |
| |
1701 } |
| |
1702 } else { |
| |
1703 if(c!=0 && targetCapacity>0) { |
| |
1704 goto getTrailUnicode; |
| |
1705 } |
| |
1706 |
| |
1707 /* state machine for Unicode mode */ |
| |
1708 /* unicodeByteMode: */ |
| |
1709 while(source<sourceLimit) { |
| |
1710 if(targetCapacity<=0) { |
| |
1711 /* target is full */ |
| |
1712 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1713 break; |
| |
1714 } |
| |
1715 c=*source++; |
| |
1716 |
| |
1717 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
| |
1718 /* not compressible, write character directly */ |
| |
1719 if(targetCapacity>=2) { |
| |
1720 *target++=(uint8_t)(c>>8); |
| |
1721 *target++=(uint8_t)c; |
| |
1722 targetCapacity-=2; |
| |
1723 } else { |
| |
1724 length=2; |
| |
1725 goto outputBytes; |
| |
1726 } |
| |
1727 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { |
| |
1728 /* compress BMP character if the following one is not an uncompressible ideograph */ |
| |
1729 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { |
| |
1730 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { |
| |
1731 /* ASCII digit or letter */ |
| |
1732 isSingleByteMode=TRUE; |
| |
1733 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
| |
1734 length=2; |
| |
1735 goto outputBytes; |
| |
1736 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| |
1737 /* there is a dynamic window that contains this character, change to it */ |
| |
1738 isSingleByteMode=TRUE; |
| |
1739 dynamicWindow=window; |
| |
1740 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1741 useDynamicWindow(scsu, dynamicWindow); |
| |
1742 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1743 length=2; |
| |
1744 goto outputBytes; |
| |
1745 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| |
1746 /* define a dynamic window with this character */ |
| |
1747 isSingleByteMode=TRUE; |
| |
1748 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1749 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1750 useDynamicWindow(scsu, dynamicWindow); |
| |
1751 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1752 length=3; |
| |
1753 goto outputBytes; |
| |
1754 } |
| |
1755 } |
| |
1756 |
| |
1757 /* don't know how to compress this character, just write it directly */ |
| |
1758 length=2; |
| |
1759 goto outputBytes; |
| |
1760 } else if(c<0xe000) { |
| |
1761 /* c is a surrogate */ |
| |
1762 if(U16_IS_SURROGATE_LEAD(c)) { |
| |
1763 getTrailUnicode: |
| |
1764 lead=(UChar)c; |
| |
1765 if(source<sourceLimit) { |
| |
1766 /* test the following code unit */ |
| |
1767 trail=*source; |
| |
1768 if(U16_IS_TRAIL(trail)) { |
| |
1769 ++source; |
| |
1770 c=U16_GET_SUPPLEMENTARY(c, trail); |
| |
1771 /* convert this surrogate code point */ |
| |
1772 /* exit this condition tree */ |
| |
1773 } else { |
| |
1774 /* this is an unmatched lead code unit (1st surrogate) */ |
| |
1775 /* callback(illegal) */ |
| |
1776 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1777 goto endloop; |
| |
1778 } |
| |
1779 } else { |
| |
1780 /* no more input */ |
| |
1781 break; |
| |
1782 } |
| |
1783 } else { |
| |
1784 /* this is an unmatched trail code unit (2nd surrogate) */ |
| |
1785 /* callback(illegal) */ |
| |
1786 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| |
1787 goto endloop; |
| |
1788 } |
| |
1789 |
| |
1790 /* compress supplementary character */ |
| |
1791 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
| |
1792 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) |
| |
1793 ) { |
| |
1794 /* |
| |
1795 * there is a dynamic window that contains this character and |
| |
1796 * the following character is not uncompressible, |
| |
1797 * change to the window |
| |
1798 */ |
| |
1799 isSingleByteMode=TRUE; |
| |
1800 dynamicWindow=window; |
| |
1801 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| |
1802 useDynamicWindow(scsu, dynamicWindow); |
| |
1803 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| |
1804 length=2; |
| |
1805 goto outputBytes; |
| |
1806 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ |
| |
1807 (code=getDynamicOffset(c, &offset))>=0 |
| |
1808 ) { |
| |
1809 /* two supplementary characters in (probably) the same window - define an extended one */ |
| |
1810 isSingleByteMode=TRUE; |
| |
1811 code-=0x200; |
| |
1812 dynamicWindow=getNextDynamicWindow(scsu); |
| |
1813 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; |
| |
1814 useDynamicWindow(scsu, dynamicWindow); |
| |
1815 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; |
| |
1816 length=4; |
| |
1817 goto outputBytes; |
| |
1818 } else { |
| |
1819 /* don't know how to compress this character, just write it directly */ |
| |
1820 c=((uint32_t)lead<<16)|trail; |
| |
1821 length=4; |
| |
1822 goto outputBytes; |
| |
1823 } |
| |
1824 } else /* 0xe000<=c<0xf300 */ { |
| |
1825 /* quote to avoid SCSU tags */ |
| |
1826 c|=UQU<<16; |
| |
1827 length=3; |
| |
1828 goto outputBytes; |
| |
1829 } |
| |
1830 |
| |
1831 /* normal end of conversion: prepare for a new character */ |
| |
1832 c=0; |
| |
1833 } |
| |
1834 } |
| |
1835 endloop: |
| |
1836 |
| |
1837 /* set the converter state back into UConverter */ |
| |
1838 scsu->fromUIsSingleByteMode=isSingleByteMode; |
| |
1839 scsu->fromUDynamicWindow=dynamicWindow; |
| |
1840 |
| |
1841 cnv->fromUChar32=c; |
| |
1842 |
| |
1843 /* write back the updated pointers */ |
| |
1844 pArgs->source=source; |
| |
1845 pArgs->target=(char *)target; |
| |
1846 return; |
| |
1847 |
| |
1848 outputBytes: |
| |
1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ |
| |
1850 /* from the first if in the loop we know that targetCapacity>0 */ |
| |
1851 if(length<=targetCapacity) { |
| |
1852 switch(length) { |
| |
1853 /* each branch falls through to the next one */ |
| |
1854 case 4: |
| |
1855 *target++=(uint8_t)(c>>24); |
| |
1856 case 3: /*fall through*/ |
| |
1857 *target++=(uint8_t)(c>>16); |
| |
1858 case 2: /*fall through*/ |
| |
1859 *target++=(uint8_t)(c>>8); |
| |
1860 case 1: /*fall through*/ |
| |
1861 *target++=(uint8_t)c; |
| |
1862 default: |
| |
1863 /* will never occur */ |
| |
1864 break; |
| |
1865 } |
| |
1866 targetCapacity-=length; |
| |
1867 |
| |
1868 /* normal end of conversion: prepare for a new character */ |
| |
1869 c=0; |
| |
1870 goto loop; |
| |
1871 } else { |
| |
1872 uint8_t *p; |
| |
1873 |
| |
1874 /* |
| |
1875 * We actually do this backwards here: |
| |
1876 * In order to save an intermediate variable, we output |
| |
1877 * first to the overflow buffer what does not fit into the |
| |
1878 * regular target. |
| |
1879 */ |
| |
1880 /* we know that 0<=targetCapacity<length<=4 */ |
| |
1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ |
| |
1882 length-=targetCapacity; |
| |
1883 p=(uint8_t *)cnv->charErrorBuffer; |
| |
1884 switch(length) { |
| |
1885 /* each branch falls through to the next one */ |
| |
1886 case 4: |
| |
1887 *p++=(uint8_t)(c>>24); |
| |
1888 case 3: /*fall through*/ |
| |
1889 *p++=(uint8_t)(c>>16); |
| |
1890 case 2: /*fall through*/ |
| |
1891 *p++=(uint8_t)(c>>8); |
| |
1892 case 1: /*fall through*/ |
| |
1893 *p=(uint8_t)c; |
| |
1894 default: |
| |
1895 /* will never occur */ |
| |
1896 break; |
| |
1897 } |
| |
1898 cnv->charErrorBufferLength=(int8_t)length; |
| |
1899 |
| |
1900 /* now output what fits into the regular target */ |
| |
1901 c>>=8*length; /* length was reduced by targetCapacity */ |
| |
1902 switch(targetCapacity) { |
| |
1903 /* each branch falls through to the next one */ |
| |
1904 case 3: |
| |
1905 *target++=(uint8_t)(c>>16); |
| |
1906 case 2: /*fall through*/ |
| |
1907 *target++=(uint8_t)(c>>8); |
| |
1908 case 1: /*fall through*/ |
| |
1909 *target++=(uint8_t)c; |
| |
1910 default: |
| |
1911 break; |
| |
1912 } |
| |
1913 |
| |
1914 /* target overflow */ |
| |
1915 targetCapacity=0; |
| |
1916 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| |
1917 c=0; |
| |
1918 goto endloop; |
| |
1919 } |
| |
1920 } |
| |
1921 |
| |
1922 /* miscellaneous ------------------------------------------------------------ */ |
| |
1923 |
| |
1924 static const char * |
| |
1925 _SCSUGetName(const UConverter *cnv) { |
| |
1926 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
| |
1927 |
| |
1928 switch(scsu->locale) { |
| |
1929 case l_ja: |
| |
1930 return "SCSU,locale=ja"; |
| |
1931 default: |
| |
1932 return "SCSU"; |
| |
1933 } |
| |
1934 } |
| |
1935 |
| |
1936 /* structure for SafeClone calculations */ |
| |
1937 struct cloneSCSUStruct |
| |
1938 { |
| |
1939 UConverter cnv; |
| |
1940 SCSUData mydata; |
| |
1941 }; |
| |
1942 |
| |
1943 static UConverter * |
| |
1944 _SCSUSafeClone(const UConverter *cnv, |
| |
1945 void *stackBuffer, |
| |
1946 int32_t *pBufferSize, |
| |
1947 UErrorCode *status) |
| |
1948 { |
| |
1949 struct cloneSCSUStruct * localClone; |
| |
1950 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); |
| |
1951 |
| |
1952 if (U_FAILURE(*status)){ |
| |
1953 return 0; |
| |
1954 } |
| |
1955 |
| |
1956 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
| |
1957 *pBufferSize = bufferSizeNeeded; |
| |
1958 return 0; |
| |
1959 } |
| |
1960 |
| |
1961 localClone = (struct cloneSCSUStruct *)stackBuffer; |
| |
1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
| |
1963 |
| |
1964 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); |
| |
1965 localClone->cnv.extraInfo = &localClone->mydata; |
| |
1966 localClone->cnv.isExtraLocal = TRUE; |
| |
1967 |
| |
1968 return &localClone->cnv; |
| |
1969 } |
| |
1970 |
| |
1971 |
| |
1972 static const UConverterImpl _SCSUImpl={ |
| |
1973 UCNV_SCSU, |
| |
1974 |
| |
1975 NULL, |
| |
1976 NULL, |
| |
1977 |
| |
1978 _SCSUOpen, |
| |
1979 _SCSUClose, |
| |
1980 _SCSUReset, |
| |
1981 |
| |
1982 _SCSUToUnicode, |
| |
1983 _SCSUToUnicodeWithOffsets, |
| |
1984 _SCSUFromUnicode, |
| |
1985 _SCSUFromUnicodeWithOffsets, |
| |
1986 NULL, |
| |
1987 |
| |
1988 NULL, |
| |
1989 _SCSUGetName, |
| |
1990 NULL, |
| |
1991 _SCSUSafeClone, |
| |
1992 ucnv_getCompleteUnicodeSet |
| |
1993 }; |
| |
1994 |
| |
1995 static const UConverterStaticData _SCSUStaticData={ |
| |
1996 sizeof(UConverterStaticData), |
| |
1997 "SCSU", |
| |
1998 1212, /* CCSID for SCSU */ |
| |
1999 UCNV_IBM, UCNV_SCSU, |
| |
2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ |
| |
2001 /* |
| |
2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode |
| |
2003 * substitution string. |
| |
2004 */ |
| |
2005 { 0x0e, 0xff, 0xfd, 0 }, 3, |
| |
2006 FALSE, FALSE, |
| |
2007 0, |
| |
2008 0, |
| |
2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| |
2010 }; |
| |
2011 |
| |
2012 const UConverterSharedData _SCSUData={ |
| |
2013 sizeof(UConverterSharedData), ~((uint32_t)0), |
| |
2014 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, |
| |
2015 0 |
| |
2016 }; |
| |
2017 |
| |
2018 #endif |