intl/uconv/ucvja/nsJapaneseToUnicode.cpp

branch
TOR_BUG_9701
changeset 8
97036ab72558
equal deleted inserted replaced
-1:000000000000 0:54dd996ae5d1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #include "nsJapaneseToUnicode.h"
6
7 #include "nsUCSupport.h"
8
9 #include "japanese.map"
10
11 #include "nsICharsetConverterManager.h"
12 #include "nsServiceManagerUtils.h"
13
14 #include "mozilla/Assertions.h"
15
// Lead-byte index tables used by the decoders in this file.
// HTML5 says to use Windows-31J instead of the real Shift_JIS for decoding,
// so both the Shift_JIS and the JIS X 0208 indexes come from gCP932Index.
#define SJIS_INDEX gCP932Index[0]
#define JIS0208_INDEX gCP932Index[1]

#define JIS0212_INDEX gJIS0212Index

// Character substituted by the Shift_JIS decoder for a well-formed two-byte
// sequence that has no mapping (U+30FB).
#define SJIS_UNMAPPED 0x30fb
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd

// True when byte b lies in 0xA1..0xFE. Both sides are narrowed to uint8_t so
// the test gives the same answer whether b is a signed or an unsigned char.
// Note: b is evaluated twice, so do not pass an expression with side effects.
#define IN_GR_RANGE(b) \
  ((uint8_t(0xa1) <= uint8_t(b)) && (uint8_t(b) <= uint8_t(0xfe)))
25
26 NS_IMETHODIMP nsShiftJISToUnicode::Convert(
27 const char * aSrc, int32_t * aSrcLen,
28 char16_t * aDest, int32_t * aDestLen)
29 {
30 static const uint8_t sbIdx[256] =
31 {
32 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00 */
33 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08 */
34 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10 */
35 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18 */
36 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20 */
37 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x28 */
38 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x30 */
39 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38 */
40 0, 1, 2, 3, 4, 5, 6, 7, /* 0x40 */
41 8, 9, 10, 11, 12, 13, 14, 15, /* 0x48 */
42 16, 17, 18, 19, 20, 21, 22, 23, /* 0x50 */
43 24, 25, 26, 27, 28, 29, 30, 31, /* 0x58 */
44 32, 33, 34, 35, 36, 37, 38, 39, /* 0x60 */
45 40, 41, 42, 43, 44, 45, 46, 47, /* 0x68 */
46 48, 49, 50, 51, 52, 53, 54, 55, /* 0x70 */
47 56, 57, 58, 59, 60, 61, 62, 0xFF, /* 0x78 */
48 63, 64, 65, 66, 67, 68, 69, 70, /* 0x80 */
49 71, 72, 73, 74, 75, 76, 77, 78, /* 0x88 */
50 79, 80, 81, 82, 83, 84, 85, 86, /* 0x90 */
51 87, 88, 89, 90, 91, 92, 93, 94, /* 0x98 */
52 95, 96, 97, 98, 99, 100, 101, 102, /* 0xa0 */
53 103, 104, 105, 106, 107, 108, 109, 110, /* 0xa8 */
54 111, 112, 113, 114, 115, 116, 117, 118, /* 0xb0 */
55 119, 120, 121, 122, 123, 124, 125, 126, /* 0xb8 */
56 127, 128, 129, 130, 131, 132, 133, 134, /* 0xc0 */
57 135, 136, 137, 138, 139, 140, 141, 142, /* 0xc8 */
58 143, 144, 145, 146, 147, 148, 149, 150, /* 0xd0 */
59 151, 152, 153, 154, 155, 156, 157, 158, /* 0xd8 */
60 159, 160, 161, 162, 163, 164, 165, 166, /* 0xe0 */
61 167, 168, 169, 170, 171, 172, 173, 174, /* 0xe8 */
62 175, 176, 177, 178, 179, 180, 181, 182, /* 0xf0 */
63 183, 184, 185, 186, 187, 0xFF, 0xFF, 0xFF, /* 0xf8 */
64 };
65
66 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
67 const unsigned char* src =(unsigned char*) aSrc;
68 char16_t* destEnd = aDest + *aDestLen;
69 char16_t* dest = aDest;
70 while (src < srcEnd) {
71 switch (mState) {
72 case 0:
73 if (*src <= 0x80) {
74 // ASCII
75 *dest++ = (char16_t) *src;
76 if (dest >= destEnd) {
77 goto error1;
78 }
79 } else {
80 mData = SJIS_INDEX[*src & 0x7F];
81 if (mData < 0xE000) {
82 mState = 1; // two bytes
83 } else if (mData < 0xF000) {
84 mState = 2; // EUDC
85 } else {
86 *dest++ = mData; // JIS 0201
87 if (dest >= destEnd) {
88 goto error1;
89 }
90 }
91 }
92 break;
93
94 case 1: // Index to table
95 {
96 MOZ_ASSERT(mData < 0xE000);
97 uint8_t off = sbIdx[*src];
98
99 // Error handling: in the case where the second octet is not in the
100 // valid ranges 0x40-0x7E 0x80-0xFC, unconsume the invalid octet and
101 // interpret it as the ASCII value. In the case where the second
102 // octet is in the valid range but there is no mapping for the
103 // 2-octet sequence, do not unconsume.
104 if(0xFF == off) {
105 src--;
106 if (mErrBehavior == kOnError_Signal)
107 goto error_invalidchar;
108 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
109 } else {
110 char16_t ch = gJapaneseMap[mData+off];
111 if(ch == 0xfffd) {
112 if (mErrBehavior == kOnError_Signal)
113 goto error_invalidchar;
114 ch = SJIS_UNMAPPED;
115 }
116 *dest++ = ch;
117 }
118 mState = 0;
119 if(dest >= destEnd)
120 goto error1;
121 }
122 break;
123
124 case 2: // EUDC
125 {
126 MOZ_ASSERT(0xE000 <= mData && mData < 0xF000);
127 uint8_t off = sbIdx[*src];
128
129 // Error handling as in case 1
130 if(0xFF == off) {
131 src--;
132 if (mErrBehavior == kOnError_Signal)
133 goto error_invalidchar;
134
135 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
136 } else {
137 *dest++ = mData + off;
138 }
139 mState = 0;
140 if(dest >= destEnd)
141 goto error1;
142 }
143 break;
144
145 }
146 src++;
147 }
148 *aDestLen = dest - aDest;
149 return NS_OK;
150 error_invalidchar:
151 *aDestLen = dest - aDest;
152 *aSrcLen = src - (const unsigned char*)aSrc;
153 return NS_ERROR_ILLEGAL_INPUT;
154 error1:
155 *aDestLen = dest - aDest;
156 src++;
157 if ((mState == 0) && (src == srcEnd)) {
158 return NS_OK;
159 }
160 *aSrcLen = src - (const unsigned char*)aSrc;
161 return NS_OK_UDEC_MOREOUTPUT;
162 }
163
164 char16_t
165 nsShiftJISToUnicode::GetCharacterForUnMapped()
166 {
167 return char16_t(SJIS_UNMAPPED);
168 }
169
170 NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
171 const char * aSrc, int32_t * aSrcLen,
172 char16_t * aDest, int32_t * aDestLen)
173 {
174 static const uint8_t sbIdx[256] =
175 {
176 /* 0x0X */
177 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
178 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
179 /* 0x1X */
180 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
181 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
182 /* 0x2X */
183 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
184 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
185 /* 0x3X */
186 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
187 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
188 /* 0x4X */
189 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
190 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
191 /* 0x5X */
192 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
193 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
194 /* 0x6X */
195 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
196 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
197 /* 0x7X */
198 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
199 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
200 /* 0x8X */
201 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
202 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
203 /* 0x9X */
204 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
205 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
206 /* 0xAX */
207 0xFF, 0, 1, 2, 3, 4, 5, 6,
208 7, 8 , 9, 10, 11, 12, 13, 14,
209 /* 0xBX */
210 15, 16, 17, 18, 19, 20, 21, 22,
211 23, 24, 25, 26, 27, 28, 29, 30,
212 /* 0xCX */
213 31, 32, 33, 34, 35, 36, 37, 38,
214 39, 40, 41, 42, 43, 44, 45, 46,
215 /* 0xDX */
216 47, 48, 49, 50, 51, 52, 53, 54,
217 55, 56, 57, 58, 59, 60, 61, 62,
218 /* 0xEX */
219 63, 64, 65, 66, 67, 68, 69, 70,
220 71, 72, 73, 74, 75, 76, 77, 78,
221 /* 0xFX */
222 79, 80, 81, 82, 83, 84, 85, 86,
223 87, 88, 89, 90, 91, 92, 93, 0xFF,
224 };
225
226 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
227 const unsigned char* src =(unsigned char*) aSrc;
228 char16_t* destEnd = aDest + *aDestLen;
229 char16_t* dest = aDest;
230 while((src < srcEnd))
231 {
232 switch(mState)
233 {
234 case 0:
235 if(*src & 0x80 && *src != (unsigned char)0xa0)
236 {
237 mData = JIS0208_INDEX[*src & 0x7F];
238 if(mData != 0xFFFD )
239 {
240 mState = 1; // two byte JIS0208
241 } else {
242 if( 0x8e == *src) {
243 // JIS 0201
244 mState = 2; // JIS0201
245 } else if(0x8f == *src) {
246 // JIS 0212
247 mState = 3; // JIS0212
248 } else {
249 // others
250 if (mErrBehavior == kOnError_Signal)
251 goto error_invalidchar;
252 *dest++ = 0xFFFD;
253 if(dest >= destEnd)
254 goto error1;
255 }
256 }
257 } else {
258 // ASCII
259 *dest++ = (char16_t) *src;
260 if(dest >= destEnd)
261 goto error1;
262 }
263 break;
264
265 case 1: // Index to table
266 {
267 uint8_t off = sbIdx[*src];
268 if(0xFF == off) {
269 if (mErrBehavior == kOnError_Signal)
270 goto error_invalidchar;
271 *dest++ = 0xFFFD;
272 // if the first byte is valid for EUC-JP but the second
273 // is not while being a valid US-ASCII, save it
274 // instead of eating it up !
275 if ( (uint8_t)*src < (uint8_t)0x7f )
276 --src;
277 } else {
278 *dest++ = gJapaneseMap[mData+off];
279 }
280 mState = 0;
281 if(dest >= destEnd)
282 goto error1;
283 }
284 break;
285
286 case 2: // JIS 0201
287 {
288 if((0xA1 <= *src) && (*src <= 0xDF)) {
289 *dest++ = (0xFF61-0x00A1) + *src;
290 } else {
291 if (mErrBehavior == kOnError_Signal)
292 goto error_invalidchar;
293 *dest++ = 0xFFFD;
294 // if 0x8e is not followed by a valid JIS X 0201 byte
295 // but by a valid US-ASCII, save it instead of eating it up.
296 if ( (uint8_t)*src < (uint8_t)0x7f )
297 --src;
298 }
299 mState = 0;
300 if(dest >= destEnd)
301 goto error1;
302 }
303 break;
304
305 case 3: // JIS 0212
306 {
307 if (IN_GR_RANGE(*src))
308 {
309 mData = JIS0212_INDEX[*src & 0x7F];
310 if(mData != 0xFFFD )
311 {
312 mState = 4;
313 } else {
314 mState = 5; // error
315 }
316 } else {
317 // First "JIS 0212" byte is not in the valid GR range: save it
318 if (mErrBehavior == kOnError_Signal)
319 goto error_invalidchar;
320 *dest++ = 0xFFFD;
321 --src;
322 mState = 0;
323 if(dest >= destEnd)
324 goto error1;
325 }
326 }
327 break;
328 case 4:
329 {
330 uint8_t off = sbIdx[*src];
331 if(0xFF != off) {
332 *dest++ = gJapaneseMap[mData+off];
333 mState = 0;
334 if(dest >= destEnd)
335 goto error1;
336 break;
337 }
338 // else fall through to error handler
339 }
340 case 5: // two bytes undefined
341 {
342 if (mErrBehavior == kOnError_Signal)
343 goto error_invalidchar;
344 *dest++ = 0xFFFD;
345 // Undefined JIS 0212 two byte sequence. If the second byte is in
346 // the valid range for a two byte sequence (0xa1 - 0xfe) consume
347 // both bytes. Otherwise resynchronize on the second byte.
348 if (!IN_GR_RANGE(*src))
349 --src;
350 mState = 0;
351 if(dest >= destEnd)
352 goto error1;
353 }
354 break;
355 }
356 src++;
357 }
358 *aDestLen = dest - aDest;
359 return NS_OK;
360 error_invalidchar:
361 *aDestLen = dest - aDest;
362 *aSrcLen = src - (const unsigned char*)aSrc;
363 return NS_ERROR_ILLEGAL_INPUT;
364 error1:
365 *aDestLen = dest - aDest;
366 src++;
367 if ((mState == 0) && (src == srcEnd)) {
368 return NS_OK;
369 }
370 *aSrcLen = src - (const unsigned char*)aSrc;
371 return NS_OK_UDEC_MOREOUTPUT;
372 }
373
374
375
376 NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
377 const char * aSrc, int32_t * aSrcLen,
378 char16_t * aDest, int32_t * aDestLen)
379 {
380 static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
381
382 static const uint16_t fbIdx[128] =
383 {
384 /* 0x8X */
385 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
386 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
387 /* 0x9X */
388 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
389 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
390 /* 0xAX */
391 0xFFFD, 0, 94, 94* 2, 94* 3, 94* 4, 94* 5, 94* 6,
392 94* 7, 94* 8 , 94* 9, 94*10, 94*11, 94*12, 94*13, 94*14,
393 /* 0xBX */
394 94*15, 94*16, 94*17, 94*18, 94*19, 94*20, 94*21, 94*22,
395 94*23, 94*24, 94*25, 94*26, 94*27, 94*28, 94*29, 94*30,
396 /* 0xCX */
397 94*31, 94*32, 94*33, 94*34, 94*35, 94*36, 94*37, 94*38,
398 94*39, 94*40, 94*41, 94*42, 94*43, 94*44, 94*45, 94*46,
399 /* 0xDX */
400 94*47, 94*48, 94*49, 94*50, 94*51, 94*52, 94*53, 94*54,
401 94*55, 94*56, 94*57, 94*58, 94*59, 94*60, 94*61, 94*62,
402 /* 0xEX */
403 94*63, 94*64, 94*65, 94*66, 94*67, 94*68, 94*69, 94*70,
404 94*71, 94*72, 94*73, 94*74, 94*75, 94*76, 94*77, 94*78,
405 /* 0xFX */
406 94*79, 94*80, 94*81, 94*82, 94*83, 94*84, 94*85, 94*86,
407 94*87, 94*88, 94*89, 94*90, 94*91, 94*92, 94*93, 0xFFFD,
408 };
409 static const uint8_t sbIdx[256] =
410 {
411 /* 0x0X */
412 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
413 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
414 /* 0x1X */
415 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
416 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
417 /* 0x2X */
418 0xFF, 0, 1, 2, 3, 4, 5, 6,
419 7, 8 , 9, 10, 11, 12, 13, 14,
420 /* 0x3X */
421 15, 16, 17, 18, 19, 20, 21, 22,
422 23, 24, 25, 26, 27, 28, 29, 30,
423 /* 0x4X */
424 31, 32, 33, 34, 35, 36, 37, 38,
425 39, 40, 41, 42, 43, 44, 45, 46,
426 /* 0x5X */
427 47, 48, 49, 50, 51, 52, 53, 54,
428 55, 56, 57, 58, 59, 60, 61, 62,
429 /* 0x6X */
430 63, 64, 65, 66, 67, 68, 69, 70,
431 71, 72, 73, 74, 75, 76, 77, 78,
432 /* 0x7X */
433 79, 80, 81, 82, 83, 84, 85, 86,
434 87, 88, 89, 90, 91, 92, 93, 0xFF,
435 /* 0x8X */
436 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
437 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
438 /* 0x9X */
439 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
440 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
441 /* 0xAX */
442 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
443 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
444 /* 0xBX */
445 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
446 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
447 /* 0xCX */
448 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
449 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
450 /* 0xDX */
451 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
452 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
453 /* 0xEX */
454 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
455 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
456 /* 0xFX */
457 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
458 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
459 };
460
461 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
462 const unsigned char* src =(unsigned char*) aSrc;
463 char16_t* destEnd = aDest + *aDestLen;
464 char16_t* dest = aDest;
465 while((src < srcEnd))
466 {
467
468 switch(mState)
469 {
470 case mState_ASCII:
471 if(0x1b == *src)
472 {
473 mLastLegalState = mState;
474 mState = mState_ESC;
475 } else if(*src & 0x80) {
476 if (mErrBehavior == kOnError_Signal)
477 goto error3;
478 if (CHECK_OVERRUN(dest, destEnd, 1))
479 goto error1;
480 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
481 } else {
482 if (CHECK_OVERRUN(dest, destEnd, 1))
483 goto error1;
484 *dest++ = (char16_t) *src;
485 }
486 break;
487
488 case mState_ESC:
489 if( '(' == *src) {
490 mState = mState_ESC_28;
491 } else if ('$' == *src) {
492 mState = mState_ESC_24;
493 } else if ('.' == *src) { // for ISO-2022-JP-2
494 mState = mState_ESC_2e;
495 } else if ('N' == *src) { // for ISO-2022-JP-2
496 mState = mState_ESC_4e;
497 } else {
498 if (CHECK_OVERRUN(dest, destEnd, 2))
499 goto error1;
500 *dest++ = (char16_t) 0x1b;
501 if (0x80 & *src) {
502 if (mErrBehavior == kOnError_Signal)
503 goto error3;
504 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
505 } else {
506 *dest++ = (char16_t) *src;
507 }
508 mState = mLastLegalState;
509 }
510 break;
511
512 case mState_ESC_28: // ESC (
513 if( 'B' == *src) {
514 mState = mState_ASCII;
515 if (mRunLength == 0) {
516 if (CHECK_OVERRUN(dest, destEnd, 1))
517 goto error1;
518 *dest++ = 0xFFFD;
519 }
520 mRunLength = 0;
521 } else if ('J' == *src) {
522 mState = mState_JISX0201_1976Roman;
523 if (mRunLength == 0 && mLastLegalState != mState_ASCII) {
524 if (CHECK_OVERRUN(dest, destEnd, 1))
525 goto error1;
526 if (mErrBehavior == kOnError_Signal)
527 goto error3;
528 *dest++ = 0xFFFD;
529 }
530 mRunLength = 0;
531 } else if ('I' == *src) {
532 mState = mState_JISX0201_1976Kana;
533 mRunLength = 0;
534 } else {
535 if (CHECK_OVERRUN(dest, destEnd, 3))
536 goto error1;
537 *dest++ = (char16_t) 0x1b;
538 *dest++ = (char16_t) '(';
539 if (0x80 & *src) {
540 if (mErrBehavior == kOnError_Signal)
541 goto error3;
542 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
543 } else {
544 *dest++ = (char16_t) *src;
545 }
546 mState = mLastLegalState;
547 }
548 break;
549
550 case mState_ESC_24: // ESC $
551 if( '@' == *src) {
552 mState = mState_JISX0208_1978;
553 mRunLength = 0;
554 } else if ('A' == *src) {
555 mState = mState_GB2312_1980;
556 mRunLength = 0;
557 } else if ('B' == *src) {
558 mState = mState_JISX0208_1983;
559 mRunLength = 0;
560 } else if ('(' == *src) {
561 mState = mState_ESC_24_28;
562 } else {
563 if (CHECK_OVERRUN(dest, destEnd, 3))
564 goto error1;
565 *dest++ = (char16_t) 0x1b;
566 *dest++ = (char16_t) '$';
567 if (0x80 & *src) {
568 if (mErrBehavior == kOnError_Signal)
569 goto error3;
570 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
571 } else {
572 *dest++ = (char16_t) *src;
573 }
574 mState = mLastLegalState;
575 }
576 break;
577
578 case mState_ESC_24_28: // ESC $ (
579 if( 'C' == *src) {
580 mState = mState_KSC5601_1987;
581 mRunLength = 0;
582 } else if ('D' == *src) {
583 mState = mState_JISX0212_1990;
584 mRunLength = 0;
585 } else {
586 if (CHECK_OVERRUN(dest, destEnd, 4))
587 goto error1;
588 *dest++ = (char16_t) 0x1b;
589 *dest++ = (char16_t) '$';
590 *dest++ = (char16_t) '(';
591 if (0x80 & *src) {
592 if (mErrBehavior == kOnError_Signal)
593 goto error3;
594 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
595 } else {
596 *dest++ = (char16_t) *src;
597 }
598 mState = mLastLegalState;
599 }
600 break;
601
602 case mState_JISX0201_1976Roman:
603 if(0x1b == *src) {
604 mLastLegalState = mState;
605 mState = mState_ESC;
606 } else if(*src & 0x80) {
607 if (mErrBehavior == kOnError_Signal)
608 goto error3;
609 if (CHECK_OVERRUN(dest, destEnd, 1))
610 goto error1;
611 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
612 ++mRunLength;
613 } else {
614 // XXX We need to decide how to handle \ and ~ here
615 // we may need a if statement here for '\' and '~'
616 // to map them to Yen and Overbar
617 if (CHECK_OVERRUN(dest, destEnd, 1))
618 goto error1;
619 *dest++ = (char16_t) *src;
620 ++mRunLength;
621 }
622 break;
623
624 case mState_JISX0201_1976Kana:
625 if(0x1b == *src) {
626 mLastLegalState = mState;
627 mState = mState_ESC;
628 } else {
629 if (CHECK_OVERRUN(dest, destEnd, 1))
630 goto error1;
631 if((0x21 <= *src) && (*src <= 0x5F)) {
632 *dest++ = (0xFF61-0x0021) + *src;
633 } else {
634 if (mErrBehavior == kOnError_Signal)
635 goto error3;
636 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
637 }
638 ++mRunLength;
639 }
640 break;
641
642 case mState_JISX0208_1978:
643 if(0x1b == *src) {
644 mLastLegalState = mState;
645 mState = mState_ESC;
646 } else if(*src & 0x80) {
647 mLastLegalState = mState;
648 mState = mState_ERROR;
649 } else {
650 mData = JIS0208_INDEX[*src & 0x7F];
651 if (0xFFFD == mData) {
652 if (mErrBehavior == kOnError_Signal)
653 goto error3;
654 mState = mState_ERROR;
655 } else {
656 mState = mState_JISX0208_1978_2ndbyte;
657 }
658 }
659 break;
660
661 case mState_GB2312_1980:
662 if(0x1b == *src) {
663 mLastLegalState = mState;
664 mState = mState_ESC;
665 } else if(*src & 0x80) {
666 mLastLegalState = mState;
667 mState = mState_ERROR;
668 } else {
669 mData = fbIdx[*src & 0x7F];
670 if (0xFFFD == mData) {
671 if (mErrBehavior == kOnError_Signal)
672 goto error3;
673 mState = mState_ERROR;
674 } else {
675 mState = mState_GB2312_1980_2ndbyte;
676 }
677 }
678 break;
679
680 case mState_JISX0208_1983:
681 if(0x1b == *src) {
682 mLastLegalState = mState;
683 mState = mState_ESC;
684 } else if(*src & 0x80) {
685 mLastLegalState = mState;
686 mState = mState_ERROR;
687 } else {
688 mData = JIS0208_INDEX[*src & 0x7F];
689 if (0xFFFD == mData) {
690 if (mErrBehavior == kOnError_Signal)
691 goto error3;
692 mState = mState_ERROR;
693 } else {
694 mState = mState_JISX0208_1983_2ndbyte;
695 }
696 }
697 break;
698
699 case mState_KSC5601_1987:
700 if(0x1b == *src) {
701 mLastLegalState = mState;
702 mState = mState_ESC;
703 } else if(*src & 0x80) {
704 mLastLegalState = mState;
705 mState = mState_ERROR;
706 } else {
707 mData = fbIdx[*src & 0x7F];
708 if (0xFFFD == mData) {
709 if (mErrBehavior == kOnError_Signal)
710 goto error3;
711 mState = mState_ERROR;
712 } else {
713 mState = mState_KSC5601_1987_2ndbyte;
714 }
715 }
716 break;
717
718 case mState_JISX0212_1990:
719 if(0x1b == *src) {
720 mLastLegalState = mState;
721 mState = mState_ESC;
722 } else if(*src & 0x80) {
723 mLastLegalState = mState;
724 mState = mState_ERROR;
725 } else {
726 mData = JIS0212_INDEX[*src & 0x7F];
727 if (0xFFFD == mData) {
728 if (mErrBehavior == kOnError_Signal)
729 goto error3;
730 mState = mState_ERROR;
731 } else {
732 mState = mState_JISX0212_1990_2ndbyte;
733 }
734 }
735 break;
736
737 case mState_JISX0208_1978_2ndbyte:
738 {
739 if (CHECK_OVERRUN(dest, destEnd, 1))
740 goto error1;
741 uint8_t off = sbIdx[*src];
742 if(0xFF == off) {
743 if (mErrBehavior == kOnError_Signal)
744 goto error3;
745 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
746 } else {
747 // XXX We need to map from JIS X 0208 1983 to 1987
748 // in the next line before pass to *dest++
749 *dest++ = gJapaneseMap[mData+off];
750 }
751 ++mRunLength;
752 mState = mState_JISX0208_1978;
753 }
754 break;
755
756 case mState_GB2312_1980_2ndbyte:
757 {
758 if (CHECK_OVERRUN(dest, destEnd, 1))
759 goto error1;
760 uint8_t off = sbIdx[*src];
761 if(0xFF == off) {
762 if (mErrBehavior == kOnError_Signal)
763 goto error3;
764 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
765 } else {
766 if (!mGB2312Decoder) {
767 // creating a delegate converter (GB2312)
768 nsresult rv;
769 nsCOMPtr<nsICharsetConverterManager> ccm =
770 do_GetService(kCharsetConverterManagerCID, &rv);
771 if (NS_SUCCEEDED(rv)) {
772 rv = ccm->GetUnicodeDecoderRaw("GB2312", &mGB2312Decoder);
773 }
774 }
775 if (!mGB2312Decoder) {// failed creating a delegate converter
776 goto error2;
777 } else {
778 unsigned char gb[2];
779 char16_t uni;
780 int32_t gbLen = 2, uniLen = 1;
781 // ((mData/94)+0x21) is the original 1st byte.
782 // *src is the present 2nd byte.
783 // Put 2 bytes (one character) to gb[] with GB2312 encoding.
784 gb[0] = ((mData / 94) + 0x21) | 0x80;
785 gb[1] = *src | 0x80;
786 // Convert GB2312 to unicode.
787 mGB2312Decoder->Convert((const char *)gb, &gbLen,
788 &uni, &uniLen);
789 *dest++ = uni;
790 }
791 }
792 ++mRunLength;
793 mState = mState_GB2312_1980;
794 }
795 break;
796
797 case mState_JISX0208_1983_2ndbyte:
798 {
799 if (CHECK_OVERRUN(dest, destEnd, 1))
800 goto error1;
801 uint8_t off = sbIdx[*src];
802 if(0xFF == off) {
803 if (mErrBehavior == kOnError_Signal)
804 goto error3;
805 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
806 } else {
807 *dest++ = gJapaneseMap[mData+off];
808 }
809 ++mRunLength;
810 mState = mState_JISX0208_1983;
811 }
812 break;
813
814 case mState_KSC5601_1987_2ndbyte:
815 {
816 if (CHECK_OVERRUN(dest, destEnd, 1))
817 goto error1;
818 uint8_t off = sbIdx[*src];
819 if(0xFF == off) {
820 if (mErrBehavior == kOnError_Signal)
821 goto error3;
822 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
823 } else {
824 if (!mEUCKRDecoder) {
825 // creating a delegate converter (EUC-KR)
826 nsresult rv;
827 nsCOMPtr<nsICharsetConverterManager> ccm =
828 do_GetService(kCharsetConverterManagerCID, &rv);
829 if (NS_SUCCEEDED(rv)) {
830 rv = ccm->GetUnicodeDecoderRaw("EUC-KR", &mEUCKRDecoder);
831 }
832 }
833 if (!mEUCKRDecoder) {// failed creating a delegate converter
834 goto error2;
835 } else {
836 unsigned char ksc[2];
837 char16_t uni;
838 int32_t kscLen = 2, uniLen = 1;
839 // ((mData/94)+0x21) is the original 1st byte.
840 // *src is the present 2nd byte.
841 // Put 2 bytes (one character) to ksc[] with EUC-KR encoding.
842 ksc[0] = ((mData / 94) + 0x21) | 0x80;
843 ksc[1] = *src | 0x80;
844 // Convert EUC-KR to unicode.
845 mEUCKRDecoder->Convert((const char *)ksc, &kscLen,
846 &uni, &uniLen);
847 *dest++ = uni;
848 }
849 }
850 ++mRunLength;
851 mState = mState_KSC5601_1987;
852 }
853 break;
854
855 case mState_JISX0212_1990_2ndbyte:
856 {
857 uint8_t off = sbIdx[*src];
858 if (CHECK_OVERRUN(dest, destEnd, 1))
859 goto error1;
860 if(0xFF == off) {
861 if (mErrBehavior == kOnError_Signal)
862 goto error3;
863 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
864 } else {
865 *dest++ = gJapaneseMap[mData+off];
866 }
867 ++mRunLength;
868 mState = mState_JISX0212_1990;
869 }
870 break;
871
872 case mState_ESC_2e: // ESC .
873 // "ESC ." will designate 96 character set to G2.
874 mState = mLastLegalState;
875 if( 'A' == *src) {
876 G2charset = G2_ISO88591;
877 } else if ('F' == *src) {
878 G2charset = G2_ISO88597;
879 } else {
880 if (CHECK_OVERRUN(dest, destEnd, 3))
881 goto error1;
882 *dest++ = (char16_t) 0x1b;
883 *dest++ = (char16_t) '.';
884 if (0x80 & *src) {
885 if (mErrBehavior == kOnError_Signal)
886 goto error3;
887 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
888 } else {
889 *dest++ = (char16_t) *src;
890 }
891 }
892 break;
893
894 case mState_ESC_4e: // ESC N
895 // "ESC N" is the SS2 sequence, that invoke a G2 designated
896 // character set. Since SS2 is effective only for next one
897 // character, mState should be returned to the last status.
898 mState = mLastLegalState;
899 if((0x20 <= *src) && (*src <= 0x7F)) {
900 if (CHECK_OVERRUN(dest, destEnd, 1))
901 goto error1;
902 if (G2_ISO88591 == G2charset) {
903 *dest++ = *src | 0x80;
904 } else if (G2_ISO88597 == G2charset) {
905 if (!mISO88597Decoder) {
906 // creating a delegate converter (ISO-8859-7)
907 nsresult rv;
908 nsCOMPtr<nsICharsetConverterManager> ccm =
909 do_GetService(kCharsetConverterManagerCID, &rv);
910 if (NS_SUCCEEDED(rv)) {
911 rv = ccm->GetUnicodeDecoderRaw("ISO-8859-7", &mISO88597Decoder);
912 }
913 }
914 if (!mISO88597Decoder) {// failed creating a delegate converter
915 goto error2;
916 } else {
917 // Put one character with ISO-8859-7 encoding.
918 unsigned char gr = *src | 0x80;
919 char16_t uni;
920 int32_t grLen = 1, uniLen = 1;
921 // Convert ISO-8859-7 to unicode.
922 mISO88597Decoder->Convert((const char *)&gr, &grLen,
923 &uni, &uniLen);
924 *dest++ = uni;
925 }
926 } else {// G2charset is G2_unknown (not designated yet)
927 if (mErrBehavior == kOnError_Signal)
928 goto error3;
929 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
930 }
931 ++mRunLength;
932 } else {
933 if (CHECK_OVERRUN(dest, destEnd, 3))
934 goto error1;
935 *dest++ = (char16_t) 0x1b;
936 *dest++ = (char16_t) 'N';
937 if (0x80 & *src) {
938 if (mErrBehavior == kOnError_Signal)
939 goto error3;
940 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
941 } else {
942 *dest++ = (char16_t) *src;
943 }
944 }
945 break;
946
947 case mState_ERROR:
948 mState = mLastLegalState;
949 if (mErrBehavior == kOnError_Signal) {
950 mRunLength = 0;
951 goto error3;
952 }
953 if (CHECK_OVERRUN(dest, destEnd, 1))
954 goto error1;
955 *dest++ = UNICODE_REPLACEMENT_CHARACTER;
956 ++mRunLength;
957 break;
958
959 } // switch
960 src++;
961 }
962 *aDestLen = dest - aDest;
963 return NS_OK;
964 error1:
965 *aDestLen = dest - aDest;
966 *aSrcLen = src - (const unsigned char*)aSrc;
967 return NS_OK_UDEC_MOREOUTPUT;
968 error2:
969 *aDestLen = dest - aDest;
970 *aSrcLen = src - (const unsigned char*)aSrc;
971 return NS_ERROR_UNEXPECTED;
972 error3:
973 *aDestLen = dest - aDest;
974 *aSrcLen = src - (const unsigned char*)aSrc;
975 return NS_ERROR_ILLEGAL_INPUT;
976 }

mercurial