|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2000-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * file name: ucnvlat1.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2000feb07 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 |
|
17 #if !UCONFIG_NO_CONVERSION |
|
18 |
|
19 #include "unicode/ucnv.h" |
|
20 #include "unicode/uset.h" |
|
21 #include "unicode/utf8.h" |
|
22 #include "ucnv_bld.h" |
|
23 #include "ucnv_cnv.h" |
|
24 |
|
25 /* control optimizations according to the platform */ |
|
26 #define LATIN1_UNROLL_FROM_UNICODE 1 |
|
27 |
|
28 /* ISO 8859-1 --------------------------------------------------------------- */ |
|
29 |
|
30 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
|
31 static void |
|
32 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
33 UErrorCode *pErrorCode) { |
|
34 const uint8_t *source; |
|
35 UChar *target; |
|
36 int32_t targetCapacity, length; |
|
37 int32_t *offsets; |
|
38 |
|
39 int32_t sourceIndex; |
|
40 |
|
41 /* set up the local pointers */ |
|
42 source=(const uint8_t *)pArgs->source; |
|
43 target=pArgs->target; |
|
44 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
45 offsets=pArgs->offsets; |
|
46 |
|
47 sourceIndex=0; |
|
48 |
|
49 /* |
|
50 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
|
51 * for the minimum of the sourceLength and targetCapacity |
|
52 */ |
|
53 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
|
54 if(length<=targetCapacity) { |
|
55 targetCapacity=length; |
|
56 } else { |
|
57 /* target will be full */ |
|
58 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
59 length=targetCapacity; |
|
60 } |
|
61 |
|
62 if(targetCapacity>=8) { |
|
63 /* This loop is unrolled for speed and improved pipelining. */ |
|
64 int32_t count, loops; |
|
65 |
|
66 loops=count=targetCapacity>>3; |
|
67 length=targetCapacity&=0x7; |
|
68 do { |
|
69 target[0]=source[0]; |
|
70 target[1]=source[1]; |
|
71 target[2]=source[2]; |
|
72 target[3]=source[3]; |
|
73 target[4]=source[4]; |
|
74 target[5]=source[5]; |
|
75 target[6]=source[6]; |
|
76 target[7]=source[7]; |
|
77 target+=8; |
|
78 source+=8; |
|
79 } while(--count>0); |
|
80 |
|
81 if(offsets!=NULL) { |
|
82 do { |
|
83 offsets[0]=sourceIndex++; |
|
84 offsets[1]=sourceIndex++; |
|
85 offsets[2]=sourceIndex++; |
|
86 offsets[3]=sourceIndex++; |
|
87 offsets[4]=sourceIndex++; |
|
88 offsets[5]=sourceIndex++; |
|
89 offsets[6]=sourceIndex++; |
|
90 offsets[7]=sourceIndex++; |
|
91 offsets+=8; |
|
92 } while(--loops>0); |
|
93 } |
|
94 } |
|
95 |
|
96 /* conversion loop */ |
|
97 while(targetCapacity>0) { |
|
98 *target++=*source++; |
|
99 --targetCapacity; |
|
100 } |
|
101 |
|
102 /* write back the updated pointers */ |
|
103 pArgs->source=(const char *)source; |
|
104 pArgs->target=target; |
|
105 |
|
106 /* set offsets */ |
|
107 if(offsets!=NULL) { |
|
108 while(length>0) { |
|
109 *offsets++=sourceIndex++; |
|
110 --length; |
|
111 } |
|
112 pArgs->offsets=offsets; |
|
113 } |
|
114 } |
|
115 |
|
116 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */ |
|
117 static UChar32 |
|
118 _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, |
|
119 UErrorCode *pErrorCode) { |
|
120 const uint8_t *source=(const uint8_t *)pArgs->source; |
|
121 if(source<(const uint8_t *)pArgs->sourceLimit) { |
|
122 pArgs->source=(const char *)(source+1); |
|
123 return *source; |
|
124 } |
|
125 |
|
126 /* no output because of empty input */ |
|
127 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
128 return 0xffff; |
|
129 } |
|
130 |
|
131 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */ |
|
132 static void |
|
133 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
|
134 UErrorCode *pErrorCode) { |
|
135 UConverter *cnv; |
|
136 const UChar *source, *sourceLimit; |
|
137 uint8_t *target, *oldTarget; |
|
138 int32_t targetCapacity, length; |
|
139 int32_t *offsets; |
|
140 |
|
141 UChar32 cp; |
|
142 UChar c, max; |
|
143 |
|
144 int32_t sourceIndex; |
|
145 |
|
146 /* set up the local pointers */ |
|
147 cnv=pArgs->converter; |
|
148 source=pArgs->source; |
|
149 sourceLimit=pArgs->sourceLimit; |
|
150 target=oldTarget=(uint8_t *)pArgs->target; |
|
151 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
152 offsets=pArgs->offsets; |
|
153 |
|
154 if(cnv->sharedData==&_Latin1Data) { |
|
155 max=0xff; /* Latin-1 */ |
|
156 } else { |
|
157 max=0x7f; /* US-ASCII */ |
|
158 } |
|
159 |
|
160 /* get the converter state from UConverter */ |
|
161 cp=cnv->fromUChar32; |
|
162 |
|
163 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
164 sourceIndex= cp==0 ? 0 : -1; |
|
165 |
|
166 /* |
|
167 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
|
168 * for the minimum of the sourceLength and targetCapacity |
|
169 */ |
|
170 length=(int32_t)(sourceLimit-source); |
|
171 if(length<targetCapacity) { |
|
172 targetCapacity=length; |
|
173 } |
|
174 |
|
175 /* conversion loop */ |
|
176 if(cp!=0 && targetCapacity>0) { |
|
177 goto getTrail; |
|
178 } |
|
179 |
|
180 #if LATIN1_UNROLL_FROM_UNICODE |
|
181 /* unroll the loop with the most common case */ |
|
182 if(targetCapacity>=16) { |
|
183 int32_t count, loops; |
|
184 UChar u, oredChars; |
|
185 |
|
186 loops=count=targetCapacity>>4; |
|
187 do { |
|
188 oredChars=u=*source++; |
|
189 *target++=(uint8_t)u; |
|
190 oredChars|=u=*source++; |
|
191 *target++=(uint8_t)u; |
|
192 oredChars|=u=*source++; |
|
193 *target++=(uint8_t)u; |
|
194 oredChars|=u=*source++; |
|
195 *target++=(uint8_t)u; |
|
196 oredChars|=u=*source++; |
|
197 *target++=(uint8_t)u; |
|
198 oredChars|=u=*source++; |
|
199 *target++=(uint8_t)u; |
|
200 oredChars|=u=*source++; |
|
201 *target++=(uint8_t)u; |
|
202 oredChars|=u=*source++; |
|
203 *target++=(uint8_t)u; |
|
204 oredChars|=u=*source++; |
|
205 *target++=(uint8_t)u; |
|
206 oredChars|=u=*source++; |
|
207 *target++=(uint8_t)u; |
|
208 oredChars|=u=*source++; |
|
209 *target++=(uint8_t)u; |
|
210 oredChars|=u=*source++; |
|
211 *target++=(uint8_t)u; |
|
212 oredChars|=u=*source++; |
|
213 *target++=(uint8_t)u; |
|
214 oredChars|=u=*source++; |
|
215 *target++=(uint8_t)u; |
|
216 oredChars|=u=*source++; |
|
217 *target++=(uint8_t)u; |
|
218 oredChars|=u=*source++; |
|
219 *target++=(uint8_t)u; |
|
220 |
|
221 /* were all 16 entries really valid? */ |
|
222 if(oredChars>max) { |
|
223 /* no, return to the first of these 16 */ |
|
224 source-=16; |
|
225 target-=16; |
|
226 break; |
|
227 } |
|
228 } while(--count>0); |
|
229 count=loops-count; |
|
230 targetCapacity-=16*count; |
|
231 |
|
232 if(offsets!=NULL) { |
|
233 oldTarget+=16*count; |
|
234 while(count>0) { |
|
235 *offsets++=sourceIndex++; |
|
236 *offsets++=sourceIndex++; |
|
237 *offsets++=sourceIndex++; |
|
238 *offsets++=sourceIndex++; |
|
239 *offsets++=sourceIndex++; |
|
240 *offsets++=sourceIndex++; |
|
241 *offsets++=sourceIndex++; |
|
242 *offsets++=sourceIndex++; |
|
243 *offsets++=sourceIndex++; |
|
244 *offsets++=sourceIndex++; |
|
245 *offsets++=sourceIndex++; |
|
246 *offsets++=sourceIndex++; |
|
247 *offsets++=sourceIndex++; |
|
248 *offsets++=sourceIndex++; |
|
249 *offsets++=sourceIndex++; |
|
250 *offsets++=sourceIndex++; |
|
251 --count; |
|
252 } |
|
253 } |
|
254 } |
|
255 #endif |
|
256 |
|
257 /* conversion loop */ |
|
258 c=0; |
|
259 while(targetCapacity>0 && (c=*source++)<=max) { |
|
260 /* convert the Unicode code point */ |
|
261 *target++=(uint8_t)c; |
|
262 --targetCapacity; |
|
263 } |
|
264 |
|
265 if(c>max) { |
|
266 cp=c; |
|
267 if(!U_IS_SURROGATE(cp)) { |
|
268 /* callback(unassigned) */ |
|
269 } else if(U_IS_SURROGATE_LEAD(cp)) { |
|
270 getTrail: |
|
271 if(source<sourceLimit) { |
|
272 /* test the following code unit */ |
|
273 UChar trail=*source; |
|
274 if(U16_IS_TRAIL(trail)) { |
|
275 ++source; |
|
276 cp=U16_GET_SUPPLEMENTARY(cp, trail); |
|
277 /* this codepage does not map supplementary code points */ |
|
278 /* callback(unassigned) */ |
|
279 } else { |
|
280 /* this is an unmatched lead code unit (1st surrogate) */ |
|
281 /* callback(illegal) */ |
|
282 } |
|
283 } else { |
|
284 /* no more input */ |
|
285 cnv->fromUChar32=cp; |
|
286 goto noMoreInput; |
|
287 } |
|
288 } else { |
|
289 /* this is an unmatched trail code unit (2nd surrogate) */ |
|
290 /* callback(illegal) */ |
|
291 } |
|
292 |
|
293 *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; |
|
294 cnv->fromUChar32=cp; |
|
295 } |
|
296 noMoreInput: |
|
297 |
|
298 /* set offsets since the start */ |
|
299 if(offsets!=NULL) { |
|
300 size_t count=target-oldTarget; |
|
301 while(count>0) { |
|
302 *offsets++=sourceIndex++; |
|
303 --count; |
|
304 } |
|
305 } |
|
306 |
|
307 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { |
|
308 /* target is full */ |
|
309 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
310 } |
|
311 |
|
312 /* write back the updated pointers */ |
|
313 pArgs->source=source; |
|
314 pArgs->target=(char *)target; |
|
315 pArgs->offsets=offsets; |
|
316 } |
|
317 |
|
318 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */ |
|
319 static void |
|
320 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
|
321 UConverterToUnicodeArgs *pToUArgs, |
|
322 UErrorCode *pErrorCode) { |
|
323 UConverter *utf8; |
|
324 const uint8_t *source, *sourceLimit; |
|
325 uint8_t *target; |
|
326 int32_t targetCapacity; |
|
327 |
|
328 UChar32 c; |
|
329 uint8_t b, t1; |
|
330 |
|
331 /* set up the local pointers */ |
|
332 utf8=pToUArgs->converter; |
|
333 source=(uint8_t *)pToUArgs->source; |
|
334 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
|
335 target=(uint8_t *)pFromUArgs->target; |
|
336 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
|
337 |
|
338 /* get the converter state from the UTF-8 UConverter */ |
|
339 c=(UChar32)utf8->toUnicodeStatus; |
|
340 if(c!=0 && source<sourceLimit) { |
|
341 if(targetCapacity==0) { |
|
342 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
343 return; |
|
344 } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { |
|
345 ++source; |
|
346 *target++=(uint8_t)(((c&3)<<6)|t1); |
|
347 --targetCapacity; |
|
348 |
|
349 utf8->toUnicodeStatus=0; |
|
350 utf8->toULength=0; |
|
351 } else { |
|
352 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
|
353 *pErrorCode=U_USING_DEFAULT_WARNING; |
|
354 return; |
|
355 } |
|
356 } |
|
357 |
|
358 /* |
|
359 * Make sure that the last byte sequence before sourceLimit is complete |
|
360 * or runs into a lead byte. |
|
361 * In the conversion loop compare source with sourceLimit only once |
|
362 * per multi-byte character. |
|
363 * For Latin-1, adjust sourceLimit only for 1 trail byte because |
|
364 * the conversion loop handles at most 2-byte sequences. |
|
365 */ |
|
366 if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { |
|
367 --sourceLimit; |
|
368 } |
|
369 |
|
370 /* conversion loop */ |
|
371 while(source<sourceLimit) { |
|
372 if(targetCapacity>0) { |
|
373 b=*source++; |
|
374 if((int8_t)b>=0) { |
|
375 /* convert ASCII */ |
|
376 *target++=(uint8_t)b; |
|
377 --targetCapacity; |
|
378 } else if( /* handle U+0080..U+00FF inline */ |
|
379 b>=0xc2 && b<=0xc3 && |
|
380 (t1=(uint8_t)(*source-0x80)) <= 0x3f |
|
381 ) { |
|
382 ++source; |
|
383 *target++=(uint8_t)(((b&3)<<6)|t1); |
|
384 --targetCapacity; |
|
385 } else { |
|
386 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
|
387 pToUArgs->source=(char *)(source-1); |
|
388 pFromUArgs->target=(char *)target; |
|
389 *pErrorCode=U_USING_DEFAULT_WARNING; |
|
390 return; |
|
391 } |
|
392 } else { |
|
393 /* target is full */ |
|
394 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
395 break; |
|
396 } |
|
397 } |
|
398 |
|
399 /* |
|
400 * The sourceLimit may have been adjusted before the conversion loop |
|
401 * to stop before a truncated sequence. |
|
402 * If so, then collect the truncated sequence now. |
|
403 * For Latin-1, there is at most exactly one lead byte because of the |
|
404 * smaller sourceLimit adjustment logic. |
|
405 */ |
|
406 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { |
|
407 utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; |
|
408 utf8->toULength=1; |
|
409 utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; |
|
410 } |
|
411 |
|
412 /* write back the updated pointers */ |
|
413 pToUArgs->source=(char *)source; |
|
414 pFromUArgs->target=(char *)target; |
|
415 } |
|
416 |
|
417 static void |
|
418 _Latin1GetUnicodeSet(const UConverter *cnv, |
|
419 const USetAdder *sa, |
|
420 UConverterUnicodeSet which, |
|
421 UErrorCode *pErrorCode) { |
|
422 sa->addRange(sa->set, 0, 0xff); |
|
423 } |
|
424 |
|
425 static const UConverterImpl _Latin1Impl={ |
|
426 UCNV_LATIN_1, |
|
427 |
|
428 NULL, |
|
429 NULL, |
|
430 |
|
431 NULL, |
|
432 NULL, |
|
433 NULL, |
|
434 |
|
435 _Latin1ToUnicodeWithOffsets, |
|
436 _Latin1ToUnicodeWithOffsets, |
|
437 _Latin1FromUnicodeWithOffsets, |
|
438 _Latin1FromUnicodeWithOffsets, |
|
439 _Latin1GetNextUChar, |
|
440 |
|
441 NULL, |
|
442 NULL, |
|
443 NULL, |
|
444 NULL, |
|
445 _Latin1GetUnicodeSet, |
|
446 |
|
447 NULL, |
|
448 ucnv_Latin1FromUTF8 |
|
449 }; |
|
450 |
|
451 static const UConverterStaticData _Latin1StaticData={ |
|
452 sizeof(UConverterStaticData), |
|
453 "ISO-8859-1", |
|
454 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, |
|
455 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, |
|
456 0, |
|
457 0, |
|
458 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
459 }; |
|
460 |
|
461 const UConverterSharedData _Latin1Data={ |
|
462 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
463 NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, |
|
464 0 |
|
465 }; |
|
466 |
|
467 /* US-ASCII ----------------------------------------------------------------- */ |
|
468 |
|
469 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */ |
|
470 static void |
|
471 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
|
472 UErrorCode *pErrorCode) { |
|
473 const uint8_t *source, *sourceLimit; |
|
474 UChar *target, *oldTarget; |
|
475 int32_t targetCapacity, length; |
|
476 int32_t *offsets; |
|
477 |
|
478 int32_t sourceIndex; |
|
479 |
|
480 uint8_t c; |
|
481 |
|
482 /* set up the local pointers */ |
|
483 source=(const uint8_t *)pArgs->source; |
|
484 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
|
485 target=oldTarget=pArgs->target; |
|
486 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
|
487 offsets=pArgs->offsets; |
|
488 |
|
489 /* sourceIndex=-1 if the current character began in the previous buffer */ |
|
490 sourceIndex=0; |
|
491 |
|
492 /* |
|
493 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
|
494 * for the minimum of the sourceLength and targetCapacity |
|
495 */ |
|
496 length=(int32_t)(sourceLimit-source); |
|
497 if(length<targetCapacity) { |
|
498 targetCapacity=length; |
|
499 } |
|
500 |
|
501 if(targetCapacity>=8) { |
|
502 /* This loop is unrolled for speed and improved pipelining. */ |
|
503 int32_t count, loops; |
|
504 UChar oredChars; |
|
505 |
|
506 loops=count=targetCapacity>>3; |
|
507 do { |
|
508 oredChars=target[0]=source[0]; |
|
509 oredChars|=target[1]=source[1]; |
|
510 oredChars|=target[2]=source[2]; |
|
511 oredChars|=target[3]=source[3]; |
|
512 oredChars|=target[4]=source[4]; |
|
513 oredChars|=target[5]=source[5]; |
|
514 oredChars|=target[6]=source[6]; |
|
515 oredChars|=target[7]=source[7]; |
|
516 |
|
517 /* were all 16 entries really valid? */ |
|
518 if(oredChars>0x7f) { |
|
519 /* no, return to the first of these 16 */ |
|
520 break; |
|
521 } |
|
522 source+=8; |
|
523 target+=8; |
|
524 } while(--count>0); |
|
525 count=loops-count; |
|
526 targetCapacity-=count*8; |
|
527 |
|
528 if(offsets!=NULL) { |
|
529 oldTarget+=count*8; |
|
530 while(count>0) { |
|
531 offsets[0]=sourceIndex++; |
|
532 offsets[1]=sourceIndex++; |
|
533 offsets[2]=sourceIndex++; |
|
534 offsets[3]=sourceIndex++; |
|
535 offsets[4]=sourceIndex++; |
|
536 offsets[5]=sourceIndex++; |
|
537 offsets[6]=sourceIndex++; |
|
538 offsets[7]=sourceIndex++; |
|
539 offsets+=8; |
|
540 --count; |
|
541 } |
|
542 } |
|
543 } |
|
544 |
|
545 /* conversion loop */ |
|
546 c=0; |
|
547 while(targetCapacity>0 && (c=*source++)<=0x7f) { |
|
548 *target++=c; |
|
549 --targetCapacity; |
|
550 } |
|
551 |
|
552 if(c>0x7f) { |
|
553 /* callback(illegal); copy the current bytes to toUBytes[] */ |
|
554 UConverter *cnv=pArgs->converter; |
|
555 cnv->toUBytes[0]=c; |
|
556 cnv->toULength=1; |
|
557 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
558 } else if(source<sourceLimit && target>=pArgs->targetLimit) { |
|
559 /* target is full */ |
|
560 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
561 } |
|
562 |
|
563 /* set offsets since the start */ |
|
564 if(offsets!=NULL) { |
|
565 size_t count=target-oldTarget; |
|
566 while(count>0) { |
|
567 *offsets++=sourceIndex++; |
|
568 --count; |
|
569 } |
|
570 } |
|
571 |
|
572 /* write back the updated pointers */ |
|
573 pArgs->source=(const char *)source; |
|
574 pArgs->target=target; |
|
575 pArgs->offsets=offsets; |
|
576 } |
|
577 |
|
578 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */ |
|
579 static UChar32 |
|
580 _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, |
|
581 UErrorCode *pErrorCode) { |
|
582 const uint8_t *source; |
|
583 uint8_t b; |
|
584 |
|
585 source=(const uint8_t *)pArgs->source; |
|
586 if(source<(const uint8_t *)pArgs->sourceLimit) { |
|
587 b=*source++; |
|
588 pArgs->source=(const char *)source; |
|
589 if(b<=0x7f) { |
|
590 return b; |
|
591 } else { |
|
592 UConverter *cnv=pArgs->converter; |
|
593 cnv->toUBytes[0]=b; |
|
594 cnv->toULength=1; |
|
595 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
|
596 return 0xffff; |
|
597 } |
|
598 } |
|
599 |
|
600 /* no output because of empty input */ |
|
601 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
602 return 0xffff; |
|
603 } |
|
604 |
|
605 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */ |
|
606 static void |
|
607 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
|
608 UConverterToUnicodeArgs *pToUArgs, |
|
609 UErrorCode *pErrorCode) { |
|
610 const uint8_t *source, *sourceLimit; |
|
611 uint8_t *target; |
|
612 int32_t targetCapacity, length; |
|
613 |
|
614 uint8_t c; |
|
615 |
|
616 if(pToUArgs->converter->toUnicodeStatus!=0) { |
|
617 /* no handling of partial UTF-8 characters here, fall back to pivoting */ |
|
618 *pErrorCode=U_USING_DEFAULT_WARNING; |
|
619 return; |
|
620 } |
|
621 |
|
622 /* set up the local pointers */ |
|
623 source=(const uint8_t *)pToUArgs->source; |
|
624 sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; |
|
625 target=(uint8_t *)pFromUArgs->target; |
|
626 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
|
627 |
|
628 /* |
|
629 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter |
|
630 * for the minimum of the sourceLength and targetCapacity |
|
631 */ |
|
632 length=(int32_t)(sourceLimit-source); |
|
633 if(length<targetCapacity) { |
|
634 targetCapacity=length; |
|
635 } |
|
636 |
|
637 /* unroll the loop with the most common case */ |
|
638 if(targetCapacity>=16) { |
|
639 int32_t count, loops; |
|
640 uint8_t oredChars; |
|
641 |
|
642 loops=count=targetCapacity>>4; |
|
643 do { |
|
644 oredChars=*target++=*source++; |
|
645 oredChars|=*target++=*source++; |
|
646 oredChars|=*target++=*source++; |
|
647 oredChars|=*target++=*source++; |
|
648 oredChars|=*target++=*source++; |
|
649 oredChars|=*target++=*source++; |
|
650 oredChars|=*target++=*source++; |
|
651 oredChars|=*target++=*source++; |
|
652 oredChars|=*target++=*source++; |
|
653 oredChars|=*target++=*source++; |
|
654 oredChars|=*target++=*source++; |
|
655 oredChars|=*target++=*source++; |
|
656 oredChars|=*target++=*source++; |
|
657 oredChars|=*target++=*source++; |
|
658 oredChars|=*target++=*source++; |
|
659 oredChars|=*target++=*source++; |
|
660 |
|
661 /* were all 16 entries really valid? */ |
|
662 if(oredChars>0x7f) { |
|
663 /* no, return to the first of these 16 */ |
|
664 source-=16; |
|
665 target-=16; |
|
666 break; |
|
667 } |
|
668 } while(--count>0); |
|
669 count=loops-count; |
|
670 targetCapacity-=16*count; |
|
671 } |
|
672 |
|
673 /* conversion loop */ |
|
674 c=0; |
|
675 while(targetCapacity>0 && (c=*source)<=0x7f) { |
|
676 ++source; |
|
677 *target++=c; |
|
678 --targetCapacity; |
|
679 } |
|
680 |
|
681 if(c>0x7f) { |
|
682 /* non-ASCII character, handle in standard converter */ |
|
683 *pErrorCode=U_USING_DEFAULT_WARNING; |
|
684 } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { |
|
685 /* target is full */ |
|
686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
687 } |
|
688 |
|
689 /* write back the updated pointers */ |
|
690 pToUArgs->source=(const char *)source; |
|
691 pFromUArgs->target=(char *)target; |
|
692 } |
|
693 |
|
694 static void |
|
695 _ASCIIGetUnicodeSet(const UConverter *cnv, |
|
696 const USetAdder *sa, |
|
697 UConverterUnicodeSet which, |
|
698 UErrorCode *pErrorCode) { |
|
699 sa->addRange(sa->set, 0, 0x7f); |
|
700 } |
|
701 |
|
702 static const UConverterImpl _ASCIIImpl={ |
|
703 UCNV_US_ASCII, |
|
704 |
|
705 NULL, |
|
706 NULL, |
|
707 |
|
708 NULL, |
|
709 NULL, |
|
710 NULL, |
|
711 |
|
712 _ASCIIToUnicodeWithOffsets, |
|
713 _ASCIIToUnicodeWithOffsets, |
|
714 _Latin1FromUnicodeWithOffsets, |
|
715 _Latin1FromUnicodeWithOffsets, |
|
716 _ASCIIGetNextUChar, |
|
717 |
|
718 NULL, |
|
719 NULL, |
|
720 NULL, |
|
721 NULL, |
|
722 _ASCIIGetUnicodeSet, |
|
723 |
|
724 NULL, |
|
725 ucnv_ASCIIFromUTF8 |
|
726 }; |
|
727 |
|
728 static const UConverterStaticData _ASCIIStaticData={ |
|
729 sizeof(UConverterStaticData), |
|
730 "US-ASCII", |
|
731 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, |
|
732 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, |
|
733 0, |
|
734 0, |
|
735 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
|
736 }; |
|
737 |
|
738 const UConverterSharedData _ASCIIData={ |
|
739 sizeof(UConverterSharedData), ~((uint32_t) 0), |
|
740 NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, |
|
741 0 |
|
742 }; |
|
743 |
|
744 #endif |