|
/*
**********************************************************************
*   Copyright (C) 2002-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u16.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
*/
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 |
|
19 #if !UCONFIG_NO_CONVERSION |
|
20 |
|
21 #include "unicode/ucnv.h" |
|
22 #include "ucnv_bld.h" |
|
23 #include "ucnv_cnv.h" |
|
24 #include "cmemory.h" |
|
25 |
|
/*
 * Value stored in UConverter.fromUnicodeStatus while a byte order mark
 * still has to be written before the first converted output.
 */
enum { UCNV_NEED_TO_WRITE_BOM = 1 };
|
29 |
|
30 /* |
|
31 * The UTF-16 toUnicode implementation is also used for the Java-specific |
|
32 * "with BOM" variants of UTF-16BE and UTF-16LE. |
|
33 */ |
|
34 static void |
|
35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
36 UErrorCode *pErrorCode); |
|
37 |
|
/* UTF-16BE ----------------------------------------------------------------- */

/*
 * _UTF16PEFromUnicodeWithOffsets names the fromUnicode function that matches
 * the platform's own byte order: the LE function on little-endian builds,
 * the BE function otherwise.
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
|
45 |
|
46 |
|
47 static void |
|
48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
|
49 UErrorCode *pErrorCode) { |
|
50 UConverter *cnv; |
|
51 const UChar *source; |
|
52 char *target; |
|
53 int32_t *offsets; |
|
54 |
|
55 uint32_t targetCapacity, length, sourceIndex; |
|
56 UChar c, trail; |
|
57 char overflow[4]; |
|
58 |
|
59 source=pArgs->source; |
|
60 length=(int32_t)(pArgs->sourceLimit-source); |
|
61 if(length<=0) { |
|
62 /* no input, nothing to do */ |
|
63 return; |
|
64 } |
|
65 |
|
66 cnv=pArgs->converter; |
|
67 |
|
68 /* write the BOM if necessary */ |
|
69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
|
70 static const char bom[]={ (char)0xfe, (char)0xff }; |
|
71 ucnv_fromUWriteBytes(cnv, |
|
72 bom, 2, |
|
73 &pArgs->target, pArgs->targetLimit, |
|
74 &pArgs->offsets, -1, |
|
75 pErrorCode); |
|
76 cnv->fromUnicodeStatus=0; |
|
77 } |
|
78 |
|
79 target=pArgs->target; |
|
80 if(target >= pArgs->targetLimit) { |
|
81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
82 return; |
|
83 } |
|
84 |
|
85 targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
|
86 offsets=pArgs->offsets; |
|
87 sourceIndex=0; |
|
88 |
|
89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ |
|
90 |
|
91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { |
|
92 /* the last buffer ended with a lead surrogate, output the surrogate pair */ |
|
93 ++source; |
|
94 --length; |
|
95 target[0]=(uint8_t)(c>>8); |
|
96 target[1]=(uint8_t)c; |
|
97 target[2]=(uint8_t)(trail>>8); |
|
98 target[3]=(uint8_t)trail; |
|
99 target+=4; |
|
100 targetCapacity-=4; |
|
101 if(offsets!=NULL) { |
|
102 *offsets++=-1; |
|
103 *offsets++=-1; |
|
104 *offsets++=-1; |
|
105 *offsets++=-1; |
|
106 } |
|
107 sourceIndex=1; |
|
108 cnv->fromUChar32=c=0; |
|
109 } |
|
110 |
|
111 if(c==0) { |
|
112 /* copy an even number of bytes for complete UChars */ |
|
113 uint32_t count=2*length; |
|
114 if(count>targetCapacity) { |
|
115 count=targetCapacity&~1; |
|
116 } |
|
117 /* count is even */ |
|
118 targetCapacity-=count; |
|
119 count>>=1; |
|
120 length-=count; |
|
121 |
|
122 if(offsets==NULL) { |
|
123 while(count>0) { |
|
124 c=*source++; |
|
125 if(U16_IS_SINGLE(c)) { |
|
126 target[0]=(uint8_t)(c>>8); |
|
127 target[1]=(uint8_t)c; |
|
128 target+=2; |
|
129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
|
130 ++source; |
|
131 --count; |
|
132 target[0]=(uint8_t)(c>>8); |
|
133 target[1]=(uint8_t)c; |
|
134 target[2]=(uint8_t)(trail>>8); |
|
135 target[3]=(uint8_t)trail; |
|
136 target+=4; |
|
137 } else { |
|
138 break; |
|
139 } |
|
140 --count; |
|
141 } |
|
142 } else { |
|
143 while(count>0) { |
|
144 c=*source++; |
|
145 if(U16_IS_SINGLE(c)) { |
|
146 target[0]=(uint8_t)(c>>8); |
|
147 target[1]=(uint8_t)c; |
|
148 target+=2; |
|
149 *offsets++=sourceIndex; |
|
150 *offsets++=sourceIndex++; |
|
151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
|
152 ++source; |
|
153 --count; |
|
154 target[0]=(uint8_t)(c>>8); |
|
155 target[1]=(uint8_t)c; |
|
156 target[2]=(uint8_t)(trail>>8); |
|
157 target[3]=(uint8_t)trail; |
|
158 target+=4; |
|
159 *offsets++=sourceIndex; |
|
160 *offsets++=sourceIndex; |
|
161 *offsets++=sourceIndex; |
|
162 *offsets++=sourceIndex; |
|
163 sourceIndex+=2; |
|
164 } else { |
|
165 break; |
|
166 } |
|
167 --count; |
|
168 } |
|
169 } |
|
170 |
|
171 if(count==0) { |
|
172 /* done with the loop for complete UChars */ |
|
173 if(length>0 && targetCapacity>0) { |
|
174 /* |
|
175 * there is more input and some target capacity - |
|
176 * it must be targetCapacity==1 because otherwise |
|
177 * the above would have copied more; |
|
178 * prepare for overflow output |
|
179 */ |
|
180 if(U16_IS_SINGLE(c=*source++)) { |
|
181 overflow[0]=(char)(c>>8); |
|
182 overflow[1]=(char)c; |
|
183 length=2; /* 2 bytes to output */ |
|
184 c=0; |
|
185 /* } else { keep c for surrogate handling, length will be set there */ |
|
186 } |
|
187 } else { |
|
188 length=0; |
|
189 c=0; |
|
190 } |
|
191 } else { |
|
192 /* keep c for surrogate handling, length will be set there */ |
|
193 targetCapacity+=2*count; |
|
194 } |
|
195 } else { |
|
196 length=0; /* from here on, length counts the bytes in overflow[] */ |
|
197 } |
|
198 |
|
199 if(c!=0) { |
|
200 /* |
|
201 * c is a surrogate, and |
|
202 * - source or target too short |
|
203 * - or the surrogate is unmatched |
|
204 */ |
|
205 length=0; |
|
206 if(U16_IS_SURROGATE_LEAD(c)) { |
|
207 if(source<pArgs->sourceLimit) { |
|
208 if(U16_IS_TRAIL(trail=*source)) { |
|
209 /* output the surrogate pair, will overflow (see conditions comment above) */ |
|
210 ++source; |
|
211 overflow[0]=(char)(c>>8); |
|
212 overflow[1]=(char)c; |
|
213 overflow[2]=(char)(trail>>8); |
|
214 overflow[3]=(char)trail; |
|
215 length=4; /* 4 bytes to output */ |
|
216 c=0; |
|
217 } else { |
|
218 /* unmatched lead surrogate */ |
|
219 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
220 } |
|
221 } else { |
|
222 /* see if the trail surrogate is in the next buffer */ |
|
223 } |
|
224 } else { |
|
225 /* unmatched trail surrogate */ |
|
226 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
227 } |
|
228 cnv->fromUChar32=c; |
|
229 } |
|
230 |
|
231 if(length>0) { |
|
232 /* output length bytes with overflow (length>targetCapacity>0) */ |
|
233 ucnv_fromUWriteBytes(cnv, |
|
234 overflow, length, |
|
235 (char **)&target, pArgs->targetLimit, |
|
236 &offsets, sourceIndex, |
|
237 pErrorCode); |
|
238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
|
239 } |
|
240 |
|
241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
|
242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
243 } |
|
244 |
|
245 /* write back the updated pointers */ |
|
246 pArgs->source=source; |
|
247 pArgs->target=(char *)target; |
|
248 pArgs->offsets=offsets; |
|
249 } |
|
250 |
|
251 static void |
|
252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
253 UErrorCode *pErrorCode) { |
|
254 UConverter *cnv; |
|
255 const uint8_t *source; |
|
256 UChar *target; |
|
257 int32_t *offsets; |
|
258 |
|
259 uint32_t targetCapacity, length, count, sourceIndex; |
|
260 UChar c, trail; |
|
261 |
|
262 if(pArgs->converter->mode<8) { |
|
263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
|
264 return; |
|
265 } |
|
266 |
|
267 cnv=pArgs->converter; |
|
268 source=(const uint8_t *)pArgs->source; |
|
269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
|
270 if(length<=0 && cnv->toUnicodeStatus==0) { |
|
271 /* no input, nothing to do */ |
|
272 return; |
|
273 } |
|
274 |
|
275 target=pArgs->target; |
|
276 if(target >= pArgs->targetLimit) { |
|
277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
278 return; |
|
279 } |
|
280 |
|
281 targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
|
282 offsets=pArgs->offsets; |
|
283 sourceIndex=0; |
|
284 c=0; |
|
285 |
|
286 /* complete a partial UChar or pair from the last call */ |
|
287 if(cnv->toUnicodeStatus!=0) { |
|
288 /* |
|
289 * special case: single byte from a previous buffer, |
|
290 * where the byte turned out not to belong to a trail surrogate |
|
291 * and the preceding, unmatched lead surrogate was put into toUBytes[] |
|
292 * for error handling |
|
293 */ |
|
294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
|
295 cnv->toULength=1; |
|
296 cnv->toUnicodeStatus=0; |
|
297 } |
|
298 if((count=cnv->toULength)!=0) { |
|
299 uint8_t *p=cnv->toUBytes; |
|
300 do { |
|
301 p[count++]=*source++; |
|
302 ++sourceIndex; |
|
303 --length; |
|
304 if(count==2) { |
|
305 c=((UChar)p[0]<<8)|p[1]; |
|
306 if(U16_IS_SINGLE(c)) { |
|
307 /* output the BMP code point */ |
|
308 *target++=c; |
|
309 if(offsets!=NULL) { |
|
310 *offsets++=-1; |
|
311 } |
|
312 --targetCapacity; |
|
313 count=0; |
|
314 c=0; |
|
315 break; |
|
316 } else if(U16_IS_SURROGATE_LEAD(c)) { |
|
317 /* continue collecting bytes for the trail surrogate */ |
|
318 c=0; /* avoid unnecessary surrogate handling below */ |
|
319 } else { |
|
320 /* fall through to error handling for an unmatched trail surrogate */ |
|
321 break; |
|
322 } |
|
323 } else if(count==4) { |
|
324 c=((UChar)p[0]<<8)|p[1]; |
|
325 trail=((UChar)p[2]<<8)|p[3]; |
|
326 if(U16_IS_TRAIL(trail)) { |
|
327 /* output the surrogate pair */ |
|
328 *target++=c; |
|
329 if(targetCapacity>=2) { |
|
330 *target++=trail; |
|
331 if(offsets!=NULL) { |
|
332 *offsets++=-1; |
|
333 *offsets++=-1; |
|
334 } |
|
335 targetCapacity-=2; |
|
336 } else /* targetCapacity==1 */ { |
|
337 targetCapacity=0; |
|
338 cnv->UCharErrorBuffer[0]=trail; |
|
339 cnv->UCharErrorBufferLength=1; |
|
340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
341 } |
|
342 count=0; |
|
343 c=0; |
|
344 break; |
|
345 } else { |
|
346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ |
|
347 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
348 |
|
349 /* back out reading the code unit after it */ |
|
350 if(((const uint8_t *)pArgs->source-source)>=2) { |
|
351 source-=2; |
|
352 } else { |
|
353 /* |
|
354 * if the trail unit's first byte was in a previous buffer, then |
|
355 * we need to put it into a special place because toUBytes[] will be |
|
356 * used for the lead unit's bytes |
|
357 */ |
|
358 cnv->toUnicodeStatus=0x100|p[2]; |
|
359 --source; |
|
360 } |
|
361 cnv->toULength=2; |
|
362 |
|
363 /* write back the updated pointers */ |
|
364 pArgs->source=(const char *)source; |
|
365 pArgs->target=target; |
|
366 pArgs->offsets=offsets; |
|
367 return; |
|
368 } |
|
369 } |
|
370 } while(length>0); |
|
371 cnv->toULength=(int8_t)count; |
|
372 } |
|
373 |
|
374 /* copy an even number of bytes for complete UChars */ |
|
375 count=2*targetCapacity; |
|
376 if(count>length) { |
|
377 count=length&~1; |
|
378 } |
|
379 if(c==0 && count>0) { |
|
380 length-=count; |
|
381 count>>=1; |
|
382 targetCapacity-=count; |
|
383 if(offsets==NULL) { |
|
384 do { |
|
385 c=((UChar)source[0]<<8)|source[1]; |
|
386 source+=2; |
|
387 if(U16_IS_SINGLE(c)) { |
|
388 *target++=c; |
|
389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
|
390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
|
391 ) { |
|
392 source+=2; |
|
393 --count; |
|
394 *target++=c; |
|
395 *target++=trail; |
|
396 } else { |
|
397 break; |
|
398 } |
|
399 } while(--count>0); |
|
400 } else { |
|
401 do { |
|
402 c=((UChar)source[0]<<8)|source[1]; |
|
403 source+=2; |
|
404 if(U16_IS_SINGLE(c)) { |
|
405 *target++=c; |
|
406 *offsets++=sourceIndex; |
|
407 sourceIndex+=2; |
|
408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
|
409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
|
410 ) { |
|
411 source+=2; |
|
412 --count; |
|
413 *target++=c; |
|
414 *target++=trail; |
|
415 *offsets++=sourceIndex; |
|
416 *offsets++=sourceIndex; |
|
417 sourceIndex+=4; |
|
418 } else { |
|
419 break; |
|
420 } |
|
421 } while(--count>0); |
|
422 } |
|
423 |
|
424 if(count==0) { |
|
425 /* done with the loop for complete UChars */ |
|
426 c=0; |
|
427 } else { |
|
428 /* keep c for surrogate handling, trail will be set there */ |
|
429 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ |
|
430 targetCapacity+=count; |
|
431 } |
|
432 } |
|
433 |
|
434 if(c!=0) { |
|
435 /* |
|
436 * c is a surrogate, and |
|
437 * - source or target too short |
|
438 * - or the surrogate is unmatched |
|
439 */ |
|
440 cnv->toUBytes[0]=(uint8_t)(c>>8); |
|
441 cnv->toUBytes[1]=(uint8_t)c; |
|
442 cnv->toULength=2; |
|
443 |
|
444 if(U16_IS_SURROGATE_LEAD(c)) { |
|
445 if(length>=2) { |
|
446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { |
|
447 /* output the surrogate pair, will overflow (see conditions comment above) */ |
|
448 source+=2; |
|
449 length-=2; |
|
450 *target++=c; |
|
451 if(offsets!=NULL) { |
|
452 *offsets++=sourceIndex; |
|
453 } |
|
454 cnv->UCharErrorBuffer[0]=trail; |
|
455 cnv->UCharErrorBufferLength=1; |
|
456 cnv->toULength=0; |
|
457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
458 } else { |
|
459 /* unmatched lead surrogate */ |
|
460 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
461 } |
|
462 } else { |
|
463 /* see if the trail surrogate is in the next buffer */ |
|
464 } |
|
465 } else { |
|
466 /* unmatched trail surrogate */ |
|
467 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
468 } |
|
469 } |
|
470 |
|
471 if(U_SUCCESS(*pErrorCode)) { |
|
472 /* check for a remaining source byte */ |
|
473 if(length>0) { |
|
474 if(targetCapacity==0) { |
|
475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
476 } else { |
|
477 /* it must be length==1 because otherwise the above would have copied more */ |
|
478 cnv->toUBytes[cnv->toULength++]=*source++; |
|
479 } |
|
480 } |
|
481 } |
|
482 |
|
483 /* write back the updated pointers */ |
|
484 pArgs->source=(const char *)source; |
|
485 pArgs->target=target; |
|
486 pArgs->offsets=offsets; |
|
487 } |
|
488 |
|
489 static UChar32 |
|
490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
|
491 const uint8_t *s, *sourceLimit; |
|
492 UChar32 c; |
|
493 |
|
494 if(pArgs->converter->mode<8) { |
|
495 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
|
496 } |
|
497 |
|
498 s=(const uint8_t *)pArgs->source; |
|
499 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
500 |
|
501 if(s>=sourceLimit) { |
|
502 /* no input */ |
|
503 *err=U_INDEX_OUTOFBOUNDS_ERROR; |
|
504 return 0xffff; |
|
505 } |
|
506 |
|
507 if(s+2>sourceLimit) { |
|
508 /* only one byte: truncated UChar */ |
|
509 pArgs->converter->toUBytes[0]=*s++; |
|
510 pArgs->converter->toULength=1; |
|
511 pArgs->source=(const char *)s; |
|
512 *err = U_TRUNCATED_CHAR_FOUND; |
|
513 return 0xffff; |
|
514 } |
|
515 |
|
516 /* get one UChar */ |
|
517 c=((UChar32)*s<<8)|s[1]; |
|
518 s+=2; |
|
519 |
|
520 /* check for a surrogate pair */ |
|
521 if(U_IS_SURROGATE(c)) { |
|
522 if(U16_IS_SURROGATE_LEAD(c)) { |
|
523 if(s+2<=sourceLimit) { |
|
524 UChar trail; |
|
525 |
|
526 /* get a second UChar and see if it is a trail surrogate */ |
|
527 trail=((UChar)*s<<8)|s[1]; |
|
528 if(U16_IS_TRAIL(trail)) { |
|
529 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
530 s+=2; |
|
531 } else { |
|
532 /* unmatched lead surrogate */ |
|
533 c=-2; |
|
534 } |
|
535 } else { |
|
536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ |
|
537 uint8_t *bytes=pArgs->converter->toUBytes; |
|
538 s-=2; |
|
539 pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
|
540 do { |
|
541 *bytes++=*s++; |
|
542 } while(s<sourceLimit); |
|
543 |
|
544 c=0xffff; |
|
545 *err=U_TRUNCATED_CHAR_FOUND; |
|
546 } |
|
547 } else { |
|
548 /* unmatched trail surrogate */ |
|
549 c=-2; |
|
550 } |
|
551 |
|
552 if(c<0) { |
|
553 /* write the unmatched surrogate */ |
|
554 uint8_t *bytes=pArgs->converter->toUBytes; |
|
555 pArgs->converter->toULength=2; |
|
556 *bytes=*(s-2); |
|
557 bytes[1]=*(s-1); |
|
558 |
|
559 c=0xffff; |
|
560 *err=U_ILLEGAL_CHAR_FOUND; |
|
561 } |
|
562 } |
|
563 |
|
564 pArgs->source=(const char *)s; |
|
565 return c; |
|
566 } |
|
567 |
|
568 static void |
|
569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { |
|
570 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
571 /* reset toUnicode state */ |
|
572 if(UCNV_GET_VERSION(cnv)==0) { |
|
573 cnv->mode=8; /* no BOM handling */ |
|
574 } else { |
|
575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ |
|
576 } |
|
577 } |
|
578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
|
579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ |
|
580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
|
581 } |
|
582 } |
|
583 |
|
584 static void |
|
585 _UTF16BEOpen(UConverter *cnv, |
|
586 UConverterLoadArgs *pArgs, |
|
587 UErrorCode *pErrorCode) { |
|
588 if(UCNV_GET_VERSION(cnv)<=1) { |
|
589 _UTF16BEReset(cnv, UCNV_RESET_BOTH); |
|
590 } else { |
|
591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
592 } |
|
593 } |
|
594 |
|
595 static const char * |
|
596 _UTF16BEGetName(const UConverter *cnv) { |
|
597 if(UCNV_GET_VERSION(cnv)==0) { |
|
598 return "UTF-16BE"; |
|
599 } else { |
|
600 return "UTF-16BE,version=1"; |
|
601 } |
|
602 } |
|
603 |
|
604 static const UConverterImpl _UTF16BEImpl={ |
|
605 UCNV_UTF16_BigEndian, |
|
606 |
|
607 NULL, |
|
608 NULL, |
|
609 |
|
610 _UTF16BEOpen, |
|
611 NULL, |
|
612 _UTF16BEReset, |
|
613 |
|
614 _UTF16BEToUnicodeWithOffsets, |
|
615 _UTF16BEToUnicodeWithOffsets, |
|
616 _UTF16BEFromUnicodeWithOffsets, |
|
617 _UTF16BEFromUnicodeWithOffsets, |
|
618 _UTF16BEGetNextUChar, |
|
619 |
|
620 NULL, |
|
621 _UTF16BEGetName, |
|
622 NULL, |
|
623 NULL, |
|
624 ucnv_getNonSurrogateUnicodeSet |
|
625 }; |
|
626 |
|
627 static const UConverterStaticData _UTF16BEStaticData={ |
|
628 sizeof(UConverterStaticData), |
|
629 "UTF-16BE", |
|
630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, |
|
631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, |
|
632 0, |
|
633 0, |
|
634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
635 }; |
|
636 |
|
637 |
|
638 const UConverterSharedData _UTF16BEData={ |
|
639 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, |
|
641 0 |
|
642 }; |
|
643 |
|
644 /* UTF-16LE ----------------------------------------------------------------- */ |
|
645 |
|
646 static void |
|
647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
|
648 UErrorCode *pErrorCode) { |
|
649 UConverter *cnv; |
|
650 const UChar *source; |
|
651 char *target; |
|
652 int32_t *offsets; |
|
653 |
|
654 uint32_t targetCapacity, length, sourceIndex; |
|
655 UChar c, trail; |
|
656 char overflow[4]; |
|
657 |
|
658 source=pArgs->source; |
|
659 length=(int32_t)(pArgs->sourceLimit-source); |
|
660 if(length<=0) { |
|
661 /* no input, nothing to do */ |
|
662 return; |
|
663 } |
|
664 |
|
665 cnv=pArgs->converter; |
|
666 |
|
667 /* write the BOM if necessary */ |
|
668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
|
669 static const char bom[]={ (char)0xff, (char)0xfe }; |
|
670 ucnv_fromUWriteBytes(cnv, |
|
671 bom, 2, |
|
672 &pArgs->target, pArgs->targetLimit, |
|
673 &pArgs->offsets, -1, |
|
674 pErrorCode); |
|
675 cnv->fromUnicodeStatus=0; |
|
676 } |
|
677 |
|
678 target=pArgs->target; |
|
679 if(target >= pArgs->targetLimit) { |
|
680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
681 return; |
|
682 } |
|
683 |
|
684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
|
685 offsets=pArgs->offsets; |
|
686 sourceIndex=0; |
|
687 |
|
688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ |
|
689 |
|
690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { |
|
691 /* the last buffer ended with a lead surrogate, output the surrogate pair */ |
|
692 ++source; |
|
693 --length; |
|
694 target[0]=(uint8_t)c; |
|
695 target[1]=(uint8_t)(c>>8); |
|
696 target[2]=(uint8_t)trail; |
|
697 target[3]=(uint8_t)(trail>>8); |
|
698 target+=4; |
|
699 targetCapacity-=4; |
|
700 if(offsets!=NULL) { |
|
701 *offsets++=-1; |
|
702 *offsets++=-1; |
|
703 *offsets++=-1; |
|
704 *offsets++=-1; |
|
705 } |
|
706 sourceIndex=1; |
|
707 cnv->fromUChar32=c=0; |
|
708 } |
|
709 |
|
710 if(c==0) { |
|
711 /* copy an even number of bytes for complete UChars */ |
|
712 uint32_t count=2*length; |
|
713 if(count>targetCapacity) { |
|
714 count=targetCapacity&~1; |
|
715 } |
|
716 /* count is even */ |
|
717 targetCapacity-=count; |
|
718 count>>=1; |
|
719 length-=count; |
|
720 |
|
721 if(offsets==NULL) { |
|
722 while(count>0) { |
|
723 c=*source++; |
|
724 if(U16_IS_SINGLE(c)) { |
|
725 target[0]=(uint8_t)c; |
|
726 target[1]=(uint8_t)(c>>8); |
|
727 target+=2; |
|
728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
|
729 ++source; |
|
730 --count; |
|
731 target[0]=(uint8_t)c; |
|
732 target[1]=(uint8_t)(c>>8); |
|
733 target[2]=(uint8_t)trail; |
|
734 target[3]=(uint8_t)(trail>>8); |
|
735 target+=4; |
|
736 } else { |
|
737 break; |
|
738 } |
|
739 --count; |
|
740 } |
|
741 } else { |
|
742 while(count>0) { |
|
743 c=*source++; |
|
744 if(U16_IS_SINGLE(c)) { |
|
745 target[0]=(uint8_t)c; |
|
746 target[1]=(uint8_t)(c>>8); |
|
747 target+=2; |
|
748 *offsets++=sourceIndex; |
|
749 *offsets++=sourceIndex++; |
|
750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
|
751 ++source; |
|
752 --count; |
|
753 target[0]=(uint8_t)c; |
|
754 target[1]=(uint8_t)(c>>8); |
|
755 target[2]=(uint8_t)trail; |
|
756 target[3]=(uint8_t)(trail>>8); |
|
757 target+=4; |
|
758 *offsets++=sourceIndex; |
|
759 *offsets++=sourceIndex; |
|
760 *offsets++=sourceIndex; |
|
761 *offsets++=sourceIndex; |
|
762 sourceIndex+=2; |
|
763 } else { |
|
764 break; |
|
765 } |
|
766 --count; |
|
767 } |
|
768 } |
|
769 |
|
770 if(count==0) { |
|
771 /* done with the loop for complete UChars */ |
|
772 if(length>0 && targetCapacity>0) { |
|
773 /* |
|
774 * there is more input and some target capacity - |
|
775 * it must be targetCapacity==1 because otherwise |
|
776 * the above would have copied more; |
|
777 * prepare for overflow output |
|
778 */ |
|
779 if(U16_IS_SINGLE(c=*source++)) { |
|
780 overflow[0]=(char)c; |
|
781 overflow[1]=(char)(c>>8); |
|
782 length=2; /* 2 bytes to output */ |
|
783 c=0; |
|
784 /* } else { keep c for surrogate handling, length will be set there */ |
|
785 } |
|
786 } else { |
|
787 length=0; |
|
788 c=0; |
|
789 } |
|
790 } else { |
|
791 /* keep c for surrogate handling, length will be set there */ |
|
792 targetCapacity+=2*count; |
|
793 } |
|
794 } else { |
|
795 length=0; /* from here on, length counts the bytes in overflow[] */ |
|
796 } |
|
797 |
|
798 if(c!=0) { |
|
799 /* |
|
800 * c is a surrogate, and |
|
801 * - source or target too short |
|
802 * - or the surrogate is unmatched |
|
803 */ |
|
804 length=0; |
|
805 if(U16_IS_SURROGATE_LEAD(c)) { |
|
806 if(source<pArgs->sourceLimit) { |
|
807 if(U16_IS_TRAIL(trail=*source)) { |
|
808 /* output the surrogate pair, will overflow (see conditions comment above) */ |
|
809 ++source; |
|
810 overflow[0]=(char)c; |
|
811 overflow[1]=(char)(c>>8); |
|
812 overflow[2]=(char)trail; |
|
813 overflow[3]=(char)(trail>>8); |
|
814 length=4; /* 4 bytes to output */ |
|
815 c=0; |
|
816 } else { |
|
817 /* unmatched lead surrogate */ |
|
818 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
819 } |
|
820 } else { |
|
821 /* see if the trail surrogate is in the next buffer */ |
|
822 } |
|
823 } else { |
|
824 /* unmatched trail surrogate */ |
|
825 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
826 } |
|
827 cnv->fromUChar32=c; |
|
828 } |
|
829 |
|
830 if(length>0) { |
|
831 /* output length bytes with overflow (length>targetCapacity>0) */ |
|
832 ucnv_fromUWriteBytes(cnv, |
|
833 overflow, length, |
|
834 &target, pArgs->targetLimit, |
|
835 &offsets, sourceIndex, |
|
836 pErrorCode); |
|
837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
|
838 } |
|
839 |
|
840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
|
841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
842 } |
|
843 |
|
844 /* write back the updated pointers */ |
|
845 pArgs->source=source; |
|
846 pArgs->target=target; |
|
847 pArgs->offsets=offsets; |
|
848 } |
|
849 |
|
850 static void |
|
851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
852 UErrorCode *pErrorCode) { |
|
853 UConverter *cnv; |
|
854 const uint8_t *source; |
|
855 UChar *target; |
|
856 int32_t *offsets; |
|
857 |
|
858 uint32_t targetCapacity, length, count, sourceIndex; |
|
859 UChar c, trail; |
|
860 |
|
861 if(pArgs->converter->mode<8) { |
|
862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
|
863 return; |
|
864 } |
|
865 |
|
866 cnv=pArgs->converter; |
|
867 source=(const uint8_t *)pArgs->source; |
|
868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
|
869 if(length<=0 && cnv->toUnicodeStatus==0) { |
|
870 /* no input, nothing to do */ |
|
871 return; |
|
872 } |
|
873 |
|
874 target=pArgs->target; |
|
875 if(target >= pArgs->targetLimit) { |
|
876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
877 return; |
|
878 } |
|
879 |
|
880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
|
881 offsets=pArgs->offsets; |
|
882 sourceIndex=0; |
|
883 c=0; |
|
884 |
|
885 /* complete a partial UChar or pair from the last call */ |
|
886 if(cnv->toUnicodeStatus!=0) { |
|
887 /* |
|
888 * special case: single byte from a previous buffer, |
|
889 * where the byte turned out not to belong to a trail surrogate |
|
890 * and the preceding, unmatched lead surrogate was put into toUBytes[] |
|
891 * for error handling |
|
892 */ |
|
893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
|
894 cnv->toULength=1; |
|
895 cnv->toUnicodeStatus=0; |
|
896 } |
|
897 if((count=cnv->toULength)!=0) { |
|
898 uint8_t *p=cnv->toUBytes; |
|
899 do { |
|
900 p[count++]=*source++; |
|
901 ++sourceIndex; |
|
902 --length; |
|
903 if(count==2) { |
|
904 c=((UChar)p[1]<<8)|p[0]; |
|
905 if(U16_IS_SINGLE(c)) { |
|
906 /* output the BMP code point */ |
|
907 *target++=c; |
|
908 if(offsets!=NULL) { |
|
909 *offsets++=-1; |
|
910 } |
|
911 --targetCapacity; |
|
912 count=0; |
|
913 c=0; |
|
914 break; |
|
915 } else if(U16_IS_SURROGATE_LEAD(c)) { |
|
916 /* continue collecting bytes for the trail surrogate */ |
|
917 c=0; /* avoid unnecessary surrogate handling below */ |
|
918 } else { |
|
919 /* fall through to error handling for an unmatched trail surrogate */ |
|
920 break; |
|
921 } |
|
922 } else if(count==4) { |
|
923 c=((UChar)p[1]<<8)|p[0]; |
|
924 trail=((UChar)p[3]<<8)|p[2]; |
|
925 if(U16_IS_TRAIL(trail)) { |
|
926 /* output the surrogate pair */ |
|
927 *target++=c; |
|
928 if(targetCapacity>=2) { |
|
929 *target++=trail; |
|
930 if(offsets!=NULL) { |
|
931 *offsets++=-1; |
|
932 *offsets++=-1; |
|
933 } |
|
934 targetCapacity-=2; |
|
935 } else /* targetCapacity==1 */ { |
|
936 targetCapacity=0; |
|
937 cnv->UCharErrorBuffer[0]=trail; |
|
938 cnv->UCharErrorBufferLength=1; |
|
939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
940 } |
|
941 count=0; |
|
942 c=0; |
|
943 break; |
|
944 } else { |
|
945 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ |
|
946 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
947 |
|
948 /* back out reading the code unit after it */ |
|
949 if(((const uint8_t *)pArgs->source-source)>=2) { |
|
950 source-=2; |
|
951 } else { |
|
952 /* |
|
953 * if the trail unit's first byte was in a previous buffer, then |
|
954 * we need to put it into a special place because toUBytes[] will be |
|
955 * used for the lead unit's bytes |
|
956 */ |
|
957 cnv->toUnicodeStatus=0x100|p[2]; |
|
958 --source; |
|
959 } |
|
960 cnv->toULength=2; |
|
961 |
|
962 /* write back the updated pointers */ |
|
963 pArgs->source=(const char *)source; |
|
964 pArgs->target=target; |
|
965 pArgs->offsets=offsets; |
|
966 return; |
|
967 } |
|
968 } |
|
969 } while(length>0); |
|
970 cnv->toULength=(int8_t)count; |
|
971 } |
|
972 |
|
973 /* copy an even number of bytes for complete UChars */ |
|
974 count=2*targetCapacity; |
|
975 if(count>length) { |
|
976 count=length&~1; |
|
977 } |
|
978 if(c==0 && count>0) { |
|
979 length-=count; |
|
980 count>>=1; |
|
981 targetCapacity-=count; |
|
982 if(offsets==NULL) { |
|
983 do { |
|
984 c=((UChar)source[1]<<8)|source[0]; |
|
985 source+=2; |
|
986 if(U16_IS_SINGLE(c)) { |
|
987 *target++=c; |
|
988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
|
989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
|
990 ) { |
|
991 source+=2; |
|
992 --count; |
|
993 *target++=c; |
|
994 *target++=trail; |
|
995 } else { |
|
996 break; |
|
997 } |
|
998 } while(--count>0); |
|
999 } else { |
|
1000 do { |
|
1001 c=((UChar)source[1]<<8)|source[0]; |
|
1002 source+=2; |
|
1003 if(U16_IS_SINGLE(c)) { |
|
1004 *target++=c; |
|
1005 *offsets++=sourceIndex; |
|
1006 sourceIndex+=2; |
|
1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
|
1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
|
1009 ) { |
|
1010 source+=2; |
|
1011 --count; |
|
1012 *target++=c; |
|
1013 *target++=trail; |
|
1014 *offsets++=sourceIndex; |
|
1015 *offsets++=sourceIndex; |
|
1016 sourceIndex+=4; |
|
1017 } else { |
|
1018 break; |
|
1019 } |
|
1020 } while(--count>0); |
|
1021 } |
|
1022 |
|
1023 if(count==0) { |
|
1024 /* done with the loop for complete UChars */ |
|
1025 c=0; |
|
1026 } else { |
|
1027 /* keep c for surrogate handling, trail will be set there */ |
|
1028 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ |
|
1029 targetCapacity+=count; |
|
1030 } |
|
1031 } |
|
1032 |
|
1033 if(c!=0) { |
|
1034 /* |
|
1035 * c is a surrogate, and |
|
1036 * - source or target too short |
|
1037 * - or the surrogate is unmatched |
|
1038 */ |
|
1039 cnv->toUBytes[0]=(uint8_t)c; |
|
1040 cnv->toUBytes[1]=(uint8_t)(c>>8); |
|
1041 cnv->toULength=2; |
|
1042 |
|
1043 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1044 if(length>=2) { |
|
1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { |
|
1046 /* output the surrogate pair, will overflow (see conditions comment above) */ |
|
1047 source+=2; |
|
1048 length-=2; |
|
1049 *target++=c; |
|
1050 if(offsets!=NULL) { |
|
1051 *offsets++=sourceIndex; |
|
1052 } |
|
1053 cnv->UCharErrorBuffer[0]=trail; |
|
1054 cnv->UCharErrorBufferLength=1; |
|
1055 cnv->toULength=0; |
|
1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1057 } else { |
|
1058 /* unmatched lead surrogate */ |
|
1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1060 } |
|
1061 } else { |
|
1062 /* see if the trail surrogate is in the next buffer */ |
|
1063 } |
|
1064 } else { |
|
1065 /* unmatched trail surrogate */ |
|
1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
1067 } |
|
1068 } |
|
1069 |
|
1070 if(U_SUCCESS(*pErrorCode)) { |
|
1071 /* check for a remaining source byte */ |
|
1072 if(length>0) { |
|
1073 if(targetCapacity==0) { |
|
1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1075 } else { |
|
1076 /* it must be length==1 because otherwise the above would have copied more */ |
|
1077 cnv->toUBytes[cnv->toULength++]=*source++; |
|
1078 } |
|
1079 } |
|
1080 } |
|
1081 |
|
1082 /* write back the updated pointers */ |
|
1083 pArgs->source=(const char *)source; |
|
1084 pArgs->target=target; |
|
1085 pArgs->offsets=offsets; |
|
1086 } |
|
1087 |
|
1088 static UChar32 |
|
1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
|
1090 const uint8_t *s, *sourceLimit; |
|
1091 UChar32 c; |
|
1092 |
|
1093 if(pArgs->converter->mode<8) { |
|
1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
|
1095 } |
|
1096 |
|
1097 s=(const uint8_t *)pArgs->source; |
|
1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
1099 |
|
1100 if(s>=sourceLimit) { |
|
1101 /* no input */ |
|
1102 *err=U_INDEX_OUTOFBOUNDS_ERROR; |
|
1103 return 0xffff; |
|
1104 } |
|
1105 |
|
1106 if(s+2>sourceLimit) { |
|
1107 /* only one byte: truncated UChar */ |
|
1108 pArgs->converter->toUBytes[0]=*s++; |
|
1109 pArgs->converter->toULength=1; |
|
1110 pArgs->source=(const char *)s; |
|
1111 *err = U_TRUNCATED_CHAR_FOUND; |
|
1112 return 0xffff; |
|
1113 } |
|
1114 |
|
1115 /* get one UChar */ |
|
1116 c=((UChar32)s[1]<<8)|*s; |
|
1117 s+=2; |
|
1118 |
|
1119 /* check for a surrogate pair */ |
|
1120 if(U_IS_SURROGATE(c)) { |
|
1121 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1122 if(s+2<=sourceLimit) { |
|
1123 UChar trail; |
|
1124 |
|
1125 /* get a second UChar and see if it is a trail surrogate */ |
|
1126 trail=((UChar)s[1]<<8)|*s; |
|
1127 if(U16_IS_TRAIL(trail)) { |
|
1128 c=U16_GET_SUPPLEMENTARY(c, trail); |
|
1129 s+=2; |
|
1130 } else { |
|
1131 /* unmatched lead surrogate */ |
|
1132 c=-2; |
|
1133 } |
|
1134 } else { |
|
1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ |
|
1136 uint8_t *bytes=pArgs->converter->toUBytes; |
|
1137 s-=2; |
|
1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
|
1139 do { |
|
1140 *bytes++=*s++; |
|
1141 } while(s<sourceLimit); |
|
1142 |
|
1143 c=0xffff; |
|
1144 *err=U_TRUNCATED_CHAR_FOUND; |
|
1145 } |
|
1146 } else { |
|
1147 /* unmatched trail surrogate */ |
|
1148 c=-2; |
|
1149 } |
|
1150 |
|
1151 if(c<0) { |
|
1152 /* write the unmatched surrogate */ |
|
1153 uint8_t *bytes=pArgs->converter->toUBytes; |
|
1154 pArgs->converter->toULength=2; |
|
1155 *bytes=*(s-2); |
|
1156 bytes[1]=*(s-1); |
|
1157 |
|
1158 c=0xffff; |
|
1159 *err=U_ILLEGAL_CHAR_FOUND; |
|
1160 } |
|
1161 } |
|
1162 |
|
1163 pArgs->source=(const char *)s; |
|
1164 return c; |
|
1165 } |
|
1166 |
|
1167 static void |
|
1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { |
|
1169 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
1170 /* reset toUnicode state */ |
|
1171 if(UCNV_GET_VERSION(cnv)==0) { |
|
1172 cnv->mode=8; /* no BOM handling */ |
|
1173 } else { |
|
1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ |
|
1175 } |
|
1176 } |
|
1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
|
1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ |
|
1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
|
1180 } |
|
1181 } |
|
1182 |
|
1183 static void |
|
1184 _UTF16LEOpen(UConverter *cnv, |
|
1185 UConverterLoadArgs *pArgs, |
|
1186 UErrorCode *pErrorCode) { |
|
1187 if(UCNV_GET_VERSION(cnv)<=1) { |
|
1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH); |
|
1189 } else { |
|
1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1191 } |
|
1192 } |
|
1193 |
|
1194 static const char * |
|
1195 _UTF16LEGetName(const UConverter *cnv) { |
|
1196 if(UCNV_GET_VERSION(cnv)==0) { |
|
1197 return "UTF-16LE"; |
|
1198 } else { |
|
1199 return "UTF-16LE,version=1"; |
|
1200 } |
|
1201 } |
|
1202 |
|
1203 static const UConverterImpl _UTF16LEImpl={ |
|
1204 UCNV_UTF16_LittleEndian, |
|
1205 |
|
1206 NULL, |
|
1207 NULL, |
|
1208 |
|
1209 _UTF16LEOpen, |
|
1210 NULL, |
|
1211 _UTF16LEReset, |
|
1212 |
|
1213 _UTF16LEToUnicodeWithOffsets, |
|
1214 _UTF16LEToUnicodeWithOffsets, |
|
1215 _UTF16LEFromUnicodeWithOffsets, |
|
1216 _UTF16LEFromUnicodeWithOffsets, |
|
1217 _UTF16LEGetNextUChar, |
|
1218 |
|
1219 NULL, |
|
1220 _UTF16LEGetName, |
|
1221 NULL, |
|
1222 NULL, |
|
1223 ucnv_getNonSurrogateUnicodeSet |
|
1224 }; |
|
1225 |
|
1226 |
|
1227 static const UConverterStaticData _UTF16LEStaticData={ |
|
1228 sizeof(UConverterStaticData), |
|
1229 "UTF-16LE", |
|
1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, |
|
1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, |
|
1232 0, |
|
1233 0, |
|
1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
1235 }; |
|
1236 |
|
1237 |
|
1238 const UConverterSharedData _UTF16LEData={ |
|
1239 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, |
|
1241 0 |
|
1242 }; |
|
1243 |
|
1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */ |
|
1245 |
|
1246 /* |
|
1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE |
|
1248 * accordingly. |
|
1249 * This is a simpler version of the UTF-32 converter, with |
|
1250 * fewer states for shorter BOMs. |
|
1251 * |
|
1252 * State values: |
|
1253 * 0 initial state |
|
1254 * 1 saw first byte |
|
1255 * 2..5 - |
|
1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 |
|
1257 * 8 UTF-16BE mode |
|
1258 * 9 UTF-16LE mode |
|
1259 * |
|
1260 * During detection: state==number of initial bytes seen so far. |
|
1261 * |
|
1262 * On output, emit U+FEFF as the first code point. |
|
1263 * |
|
1264 * Variants: |
|
1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. |
|
1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and |
|
1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. |
|
1268 */ |
|
1269 |
|
1270 static void |
|
1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { |
|
1272 if(choice<=UCNV_RESET_TO_UNICODE) { |
|
1273 /* reset toUnicode: state=0 */ |
|
1274 cnv->mode=0; |
|
1275 } |
|
1276 if(choice!=UCNV_RESET_TO_UNICODE) { |
|
1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */ |
|
1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
|
1279 } |
|
1280 } |
|
1281 |
|
1282 static const UConverterSharedData _UTF16v2Data; |
|
1283 |
|
1284 static void |
|
1285 _UTF16Open(UConverter *cnv, |
|
1286 UConverterLoadArgs *pArgs, |
|
1287 UErrorCode *pErrorCode) { |
|
1288 if(UCNV_GET_VERSION(cnv)<=2) { |
|
1289 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { |
|
1290 /* |
|
1291 * Switch implementation, and switch the staticData that's different |
|
1292 * and was copied into the UConverter. |
|
1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) |
|
1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. |
|
1295 */ |
|
1296 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; |
|
1297 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); |
|
1298 } |
|
1299 _UTF16Reset(cnv, UCNV_RESET_BOTH); |
|
1300 } else { |
|
1301 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1302 } |
|
1303 } |
|
1304 |
|
1305 static const char * |
|
1306 _UTF16GetName(const UConverter *cnv) { |
|
1307 if(UCNV_GET_VERSION(cnv)==0) { |
|
1308 return "UTF-16"; |
|
1309 } else if(UCNV_GET_VERSION(cnv)==1) { |
|
1310 return "UTF-16,version=1"; |
|
1311 } else { |
|
1312 return "UTF-16,version=2"; |
|
1313 } |
|
1314 } |
|
1315 |
|
1316 const UConverterSharedData _UTF16Data; |
|
1317 |
|
1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) |
|
1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) |
|
1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) |
|
1321 |
|
1322 static void |
|
1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
1324 UErrorCode *pErrorCode) { |
|
1325 UConverter *cnv=pArgs->converter; |
|
1326 const char *source=pArgs->source; |
|
1327 const char *sourceLimit=pArgs->sourceLimit; |
|
1328 int32_t *offsets=pArgs->offsets; |
|
1329 |
|
1330 int32_t state, offsetDelta; |
|
1331 uint8_t b; |
|
1332 |
|
1333 state=cnv->mode; |
|
1334 |
|
1335 /* |
|
1336 * If we detect a BOM in this buffer, then we must add the BOM size to the |
|
1337 * offsets because the actual converter function will not see and count the BOM. |
|
1338 * offsetDelta will have the number of the BOM bytes that are in the current buffer. |
|
1339 */ |
|
1340 offsetDelta=0; |
|
1341 |
|
1342 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { |
|
1343 switch(state) { |
|
1344 case 0: |
|
1345 cnv->toUBytes[0]=(uint8_t)*source++; |
|
1346 cnv->toULength=1; |
|
1347 state=1; |
|
1348 break; |
|
1349 case 1: |
|
1350 /* |
|
1351 * Only inside this switch case can the state variable |
|
1352 * temporarily take two additional values: |
|
1353 * 6: BOM error, continue with BE |
|
1354 * 7: BOM error, continue with LE |
|
1355 */ |
|
1356 b=*source; |
|
1357 if(cnv->toUBytes[0]==0xfe && b==0xff) { |
|
1358 if(IS_UTF16LE(cnv)) { |
|
1359 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ |
|
1360 } else { |
|
1361 state=8; /* detect UTF-16BE */ |
|
1362 } |
|
1363 } else if(cnv->toUBytes[0]==0xff && b==0xfe) { |
|
1364 if(IS_UTF16BE(cnv)) { |
|
1365 state=6; /* illegal reverse BOM for Java "UnicodeBig" */ |
|
1366 } else { |
|
1367 state=9; /* detect UTF-16LE */ |
|
1368 } |
|
1369 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { |
|
1370 state=6; /* illegal missing BOM for Java "Unicode" */ |
|
1371 } |
|
1372 if(state>=8) { |
|
1373 /* BOM detected, consume it */ |
|
1374 ++source; |
|
1375 cnv->toULength=0; |
|
1376 offsetDelta=(int32_t)(source-pArgs->source); |
|
1377 } else if(state<6) { |
|
1378 /* ok: no BOM, and not a reverse BOM */ |
|
1379 if(source!=pArgs->source) { |
|
1380 /* reset the source for a correct first offset */ |
|
1381 source=pArgs->source; |
|
1382 cnv->toULength=0; |
|
1383 } |
|
1384 if(IS_UTF16LE(cnv)) { |
|
1385 /* Make Java "UnicodeLittle" default to LE. */ |
|
1386 state=9; |
|
1387 } else { |
|
1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ |
|
1389 state=8; |
|
1390 } |
|
1391 } else { |
|
1392 /* |
|
1393 * error: missing BOM, or reverse BOM |
|
1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM. |
|
1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. |
|
1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. |
|
1397 */ |
|
1398 /* report the non-BOM or reverse BOM as an illegal sequence */ |
|
1399 cnv->toUBytes[1]=b; |
|
1400 cnv->toULength=2; |
|
1401 pArgs->source=source+1; |
|
1402 /* continue with conversion if the callback resets the error */ |
|
1403 /* |
|
1404 * Make Java "Unicode" default to BE like standard UTF-16. |
|
1405 * Make Java "UnicodeBig" and "UnicodeLittle" default |
|
1406 * to their normal endiannesses. |
|
1407 */ |
|
1408 cnv->mode=state+2; |
|
1409 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; |
|
1410 return; |
|
1411 } |
|
1412 /* convert the rest of the stream */ |
|
1413 cnv->mode=state; |
|
1414 continue; |
|
1415 case 8: |
|
1416 /* call UTF-16BE */ |
|
1417 pArgs->source=source; |
|
1418 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
|
1419 source=pArgs->source; |
|
1420 break; |
|
1421 case 9: |
|
1422 /* call UTF-16LE */ |
|
1423 pArgs->source=source; |
|
1424 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
|
1425 source=pArgs->source; |
|
1426 break; |
|
1427 default: |
|
1428 break; /* does not occur */ |
|
1429 } |
|
1430 } |
|
1431 |
|
1432 /* add BOM size to offsets - see comment at offsetDelta declaration */ |
|
1433 if(offsets!=NULL && offsetDelta!=0) { |
|
1434 int32_t *offsetsLimit=pArgs->offsets; |
|
1435 while(offsets<offsetsLimit) { |
|
1436 *offsets++ += offsetDelta; |
|
1437 } |
|
1438 } |
|
1439 |
|
1440 pArgs->source=source; |
|
1441 |
|
1442 if(source==sourceLimit && pArgs->flush) { |
|
1443 /* handle truncated input */ |
|
1444 switch(state) { |
|
1445 case 0: |
|
1446 break; /* no input at all, nothing to do */ |
|
1447 case 8: |
|
1448 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
|
1449 break; |
|
1450 case 9: |
|
1451 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
|
1452 break; |
|
1453 default: |
|
1454 /* 0<state<8: framework will report truncation, nothing to do here */ |
|
1455 break; |
|
1456 } |
|
1457 } |
|
1458 |
|
1459 cnv->mode=state; |
|
1460 } |
|
1461 |
|
1462 static UChar32 |
|
1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, |
|
1464 UErrorCode *pErrorCode) { |
|
1465 switch(pArgs->converter->mode) { |
|
1466 case 8: |
|
1467 return _UTF16BEGetNextUChar(pArgs, pErrorCode); |
|
1468 case 9: |
|
1469 return _UTF16LEGetNextUChar(pArgs, pErrorCode); |
|
1470 default: |
|
1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
|
1472 } |
|
1473 } |
|
1474 |
|
1475 static const UConverterImpl _UTF16Impl = { |
|
1476 UCNV_UTF16, |
|
1477 |
|
1478 NULL, |
|
1479 NULL, |
|
1480 |
|
1481 _UTF16Open, |
|
1482 NULL, |
|
1483 _UTF16Reset, |
|
1484 |
|
1485 _UTF16ToUnicodeWithOffsets, |
|
1486 _UTF16ToUnicodeWithOffsets, |
|
1487 _UTF16PEFromUnicodeWithOffsets, |
|
1488 _UTF16PEFromUnicodeWithOffsets, |
|
1489 _UTF16GetNextUChar, |
|
1490 |
|
1491 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
|
1492 _UTF16GetName, |
|
1493 NULL, |
|
1494 NULL, |
|
1495 ucnv_getNonSurrogateUnicodeSet |
|
1496 }; |
|
1497 |
|
1498 static const UConverterStaticData _UTF16StaticData = { |
|
1499 sizeof(UConverterStaticData), |
|
1500 "UTF-16", |
|
1501 1204, /* CCSID for BOM sensitive UTF-16 */ |
|
1502 UCNV_IBM, UCNV_UTF16, 2, 2, |
|
1503 #if U_IS_BIG_ENDIAN |
|
1504 { 0xff, 0xfd, 0, 0 }, 2, |
|
1505 #else |
|
1506 { 0xfd, 0xff, 0, 0 }, 2, |
|
1507 #endif |
|
1508 FALSE, FALSE, |
|
1509 0, |
|
1510 0, |
|
1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
1512 }; |
|
1513 |
|
1514 const UConverterSharedData _UTF16Data = { |
|
1515 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
1516 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, |
|
1517 0 |
|
1518 }; |
|
1519 |
|
1520 static const UConverterImpl _UTF16v2Impl = { |
|
1521 UCNV_UTF16, |
|
1522 |
|
1523 NULL, |
|
1524 NULL, |
|
1525 |
|
1526 _UTF16Open, |
|
1527 NULL, |
|
1528 _UTF16Reset, |
|
1529 |
|
1530 _UTF16ToUnicodeWithOffsets, |
|
1531 _UTF16ToUnicodeWithOffsets, |
|
1532 _UTF16BEFromUnicodeWithOffsets, |
|
1533 _UTF16BEFromUnicodeWithOffsets, |
|
1534 _UTF16GetNextUChar, |
|
1535 |
|
1536 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
|
1537 _UTF16GetName, |
|
1538 NULL, |
|
1539 NULL, |
|
1540 ucnv_getNonSurrogateUnicodeSet |
|
1541 }; |
|
1542 |
|
1543 static const UConverterStaticData _UTF16v2StaticData = { |
|
1544 sizeof(UConverterStaticData), |
|
1545 "UTF-16,version=2", |
|
1546 1204, /* CCSID for BOM sensitive UTF-16 */ |
|
1547 UCNV_IBM, UCNV_UTF16, 2, 2, |
|
1548 { 0xff, 0xfd, 0, 0 }, 2, |
|
1549 FALSE, FALSE, |
|
1550 0, |
|
1551 0, |
|
1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
1553 }; |
|
1554 |
|
1555 static const UConverterSharedData _UTF16v2Data = { |
|
1556 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
1557 NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, |
|
1558 0 |
|
1559 }; |
|
1560 |
|
1561 #endif |