/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
5 #include "nsJapaneseToUnicode.h" |
|
6 |
|
7 #include "nsUCSupport.h" |
|
8 |
|
9 #include "japanese.map" |
|
10 |
|
11 #include "nsICharsetConverterManager.h" |
|
12 #include "nsServiceManagerUtils.h" |
|
13 |
|
14 #include "mozilla/Assertions.h" |
|
15 |
|
// HTML5 says to use Windows-31J instead of the real Shift_JIS for decoding,
// so both lead-byte index tables are taken from the CP932 data.
#define SJIS_INDEX gCP932Index[0]
#define JIS0208_INDEX gCP932Index[1]

// Lead-byte index table used for JIS X 0212 sequences.
#define JIS0212_INDEX gJIS0212Index
// Code unit substituted by the Shift_JIS decoder for a well-formed but
// unmapped two-byte sequence.
#define SJIS_UNMAPPED 0x30fb
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
// True when byte b lies in 0xa1..0xfe inclusive. Note that b is evaluated
// twice, so pass an expression without side effects.
#define IN_GR_RANGE(b) \
  ((uint8_t(b) >= uint8_t(0xa1)) && (uint8_t(b) <= uint8_t(0xfe)))
|
25 |
|
26 NS_IMETHODIMP nsShiftJISToUnicode::Convert( |
|
27 const char * aSrc, int32_t * aSrcLen, |
|
28 char16_t * aDest, int32_t * aDestLen) |
|
29 { |
|
30 static const uint8_t sbIdx[256] = |
|
31 { |
|
32 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00 */ |
|
33 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08 */ |
|
34 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10 */ |
|
35 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18 */ |
|
36 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20 */ |
|
37 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x28 */ |
|
38 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x30 */ |
|
39 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38 */ |
|
40 0, 1, 2, 3, 4, 5, 6, 7, /* 0x40 */ |
|
41 8, 9, 10, 11, 12, 13, 14, 15, /* 0x48 */ |
|
42 16, 17, 18, 19, 20, 21, 22, 23, /* 0x50 */ |
|
43 24, 25, 26, 27, 28, 29, 30, 31, /* 0x58 */ |
|
44 32, 33, 34, 35, 36, 37, 38, 39, /* 0x60 */ |
|
45 40, 41, 42, 43, 44, 45, 46, 47, /* 0x68 */ |
|
46 48, 49, 50, 51, 52, 53, 54, 55, /* 0x70 */ |
|
47 56, 57, 58, 59, 60, 61, 62, 0xFF, /* 0x78 */ |
|
48 63, 64, 65, 66, 67, 68, 69, 70, /* 0x80 */ |
|
49 71, 72, 73, 74, 75, 76, 77, 78, /* 0x88 */ |
|
50 79, 80, 81, 82, 83, 84, 85, 86, /* 0x90 */ |
|
51 87, 88, 89, 90, 91, 92, 93, 94, /* 0x98 */ |
|
52 95, 96, 97, 98, 99, 100, 101, 102, /* 0xa0 */ |
|
53 103, 104, 105, 106, 107, 108, 109, 110, /* 0xa8 */ |
|
54 111, 112, 113, 114, 115, 116, 117, 118, /* 0xb0 */ |
|
55 119, 120, 121, 122, 123, 124, 125, 126, /* 0xb8 */ |
|
56 127, 128, 129, 130, 131, 132, 133, 134, /* 0xc0 */ |
|
57 135, 136, 137, 138, 139, 140, 141, 142, /* 0xc8 */ |
|
58 143, 144, 145, 146, 147, 148, 149, 150, /* 0xd0 */ |
|
59 151, 152, 153, 154, 155, 156, 157, 158, /* 0xd8 */ |
|
60 159, 160, 161, 162, 163, 164, 165, 166, /* 0xe0 */ |
|
61 167, 168, 169, 170, 171, 172, 173, 174, /* 0xe8 */ |
|
62 175, 176, 177, 178, 179, 180, 181, 182, /* 0xf0 */ |
|
63 183, 184, 185, 186, 187, 0xFF, 0xFF, 0xFF, /* 0xf8 */ |
|
64 }; |
|
65 |
|
66 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; |
|
67 const unsigned char* src =(unsigned char*) aSrc; |
|
68 char16_t* destEnd = aDest + *aDestLen; |
|
69 char16_t* dest = aDest; |
|
70 while (src < srcEnd) { |
|
71 switch (mState) { |
|
72 case 0: |
|
73 if (*src <= 0x80) { |
|
74 // ASCII |
|
75 *dest++ = (char16_t) *src; |
|
76 if (dest >= destEnd) { |
|
77 goto error1; |
|
78 } |
|
79 } else { |
|
80 mData = SJIS_INDEX[*src & 0x7F]; |
|
81 if (mData < 0xE000) { |
|
82 mState = 1; // two bytes |
|
83 } else if (mData < 0xF000) { |
|
84 mState = 2; // EUDC |
|
85 } else { |
|
86 *dest++ = mData; // JIS 0201 |
|
87 if (dest >= destEnd) { |
|
88 goto error1; |
|
89 } |
|
90 } |
|
91 } |
|
92 break; |
|
93 |
|
94 case 1: // Index to table |
|
95 { |
|
96 MOZ_ASSERT(mData < 0xE000); |
|
97 uint8_t off = sbIdx[*src]; |
|
98 |
|
99 // Error handling: in the case where the second octet is not in the |
|
100 // valid ranges 0x40-0x7E 0x80-0xFC, unconsume the invalid octet and |
|
101 // interpret it as the ASCII value. In the case where the second |
|
102 // octet is in the valid range but there is no mapping for the |
|
103 // 2-octet sequence, do not unconsume. |
|
104 if(0xFF == off) { |
|
105 src--; |
|
106 if (mErrBehavior == kOnError_Signal) |
|
107 goto error_invalidchar; |
|
108 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
109 } else { |
|
110 char16_t ch = gJapaneseMap[mData+off]; |
|
111 if(ch == 0xfffd) { |
|
112 if (mErrBehavior == kOnError_Signal) |
|
113 goto error_invalidchar; |
|
114 ch = SJIS_UNMAPPED; |
|
115 } |
|
116 *dest++ = ch; |
|
117 } |
|
118 mState = 0; |
|
119 if(dest >= destEnd) |
|
120 goto error1; |
|
121 } |
|
122 break; |
|
123 |
|
124 case 2: // EUDC |
|
125 { |
|
126 MOZ_ASSERT(0xE000 <= mData && mData < 0xF000); |
|
127 uint8_t off = sbIdx[*src]; |
|
128 |
|
129 // Error handling as in case 1 |
|
130 if(0xFF == off) { |
|
131 src--; |
|
132 if (mErrBehavior == kOnError_Signal) |
|
133 goto error_invalidchar; |
|
134 |
|
135 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
136 } else { |
|
137 *dest++ = mData + off; |
|
138 } |
|
139 mState = 0; |
|
140 if(dest >= destEnd) |
|
141 goto error1; |
|
142 } |
|
143 break; |
|
144 |
|
145 } |
|
146 src++; |
|
147 } |
|
148 *aDestLen = dest - aDest; |
|
149 return NS_OK; |
|
150 error_invalidchar: |
|
151 *aDestLen = dest - aDest; |
|
152 *aSrcLen = src - (const unsigned char*)aSrc; |
|
153 return NS_ERROR_ILLEGAL_INPUT; |
|
154 error1: |
|
155 *aDestLen = dest - aDest; |
|
156 src++; |
|
157 if ((mState == 0) && (src == srcEnd)) { |
|
158 return NS_OK; |
|
159 } |
|
160 *aSrcLen = src - (const unsigned char*)aSrc; |
|
161 return NS_OK_UDEC_MOREOUTPUT; |
|
162 } |
|
163 |
|
164 char16_t |
|
165 nsShiftJISToUnicode::GetCharacterForUnMapped() |
|
166 { |
|
167 return char16_t(SJIS_UNMAPPED); |
|
168 } |
|
169 |
|
170 NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert( |
|
171 const char * aSrc, int32_t * aSrcLen, |
|
172 char16_t * aDest, int32_t * aDestLen) |
|
173 { |
|
174 static const uint8_t sbIdx[256] = |
|
175 { |
|
176 /* 0x0X */ |
|
177 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
178 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
179 /* 0x1X */ |
|
180 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
181 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
182 /* 0x2X */ |
|
183 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
184 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
185 /* 0x3X */ |
|
186 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
187 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
188 /* 0x4X */ |
|
189 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
190 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
191 /* 0x5X */ |
|
192 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
193 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
194 /* 0x6X */ |
|
195 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
196 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
197 /* 0x7X */ |
|
198 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
199 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
200 /* 0x8X */ |
|
201 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
202 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
203 /* 0x9X */ |
|
204 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
205 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
206 /* 0xAX */ |
|
207 0xFF, 0, 1, 2, 3, 4, 5, 6, |
|
208 7, 8 , 9, 10, 11, 12, 13, 14, |
|
209 /* 0xBX */ |
|
210 15, 16, 17, 18, 19, 20, 21, 22, |
|
211 23, 24, 25, 26, 27, 28, 29, 30, |
|
212 /* 0xCX */ |
|
213 31, 32, 33, 34, 35, 36, 37, 38, |
|
214 39, 40, 41, 42, 43, 44, 45, 46, |
|
215 /* 0xDX */ |
|
216 47, 48, 49, 50, 51, 52, 53, 54, |
|
217 55, 56, 57, 58, 59, 60, 61, 62, |
|
218 /* 0xEX */ |
|
219 63, 64, 65, 66, 67, 68, 69, 70, |
|
220 71, 72, 73, 74, 75, 76, 77, 78, |
|
221 /* 0xFX */ |
|
222 79, 80, 81, 82, 83, 84, 85, 86, |
|
223 87, 88, 89, 90, 91, 92, 93, 0xFF, |
|
224 }; |
|
225 |
|
226 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; |
|
227 const unsigned char* src =(unsigned char*) aSrc; |
|
228 char16_t* destEnd = aDest + *aDestLen; |
|
229 char16_t* dest = aDest; |
|
230 while((src < srcEnd)) |
|
231 { |
|
232 switch(mState) |
|
233 { |
|
234 case 0: |
|
235 if(*src & 0x80 && *src != (unsigned char)0xa0) |
|
236 { |
|
237 mData = JIS0208_INDEX[*src & 0x7F]; |
|
238 if(mData != 0xFFFD ) |
|
239 { |
|
240 mState = 1; // two byte JIS0208 |
|
241 } else { |
|
242 if( 0x8e == *src) { |
|
243 // JIS 0201 |
|
244 mState = 2; // JIS0201 |
|
245 } else if(0x8f == *src) { |
|
246 // JIS 0212 |
|
247 mState = 3; // JIS0212 |
|
248 } else { |
|
249 // others |
|
250 if (mErrBehavior == kOnError_Signal) |
|
251 goto error_invalidchar; |
|
252 *dest++ = 0xFFFD; |
|
253 if(dest >= destEnd) |
|
254 goto error1; |
|
255 } |
|
256 } |
|
257 } else { |
|
258 // ASCII |
|
259 *dest++ = (char16_t) *src; |
|
260 if(dest >= destEnd) |
|
261 goto error1; |
|
262 } |
|
263 break; |
|
264 |
|
265 case 1: // Index to table |
|
266 { |
|
267 uint8_t off = sbIdx[*src]; |
|
268 if(0xFF == off) { |
|
269 if (mErrBehavior == kOnError_Signal) |
|
270 goto error_invalidchar; |
|
271 *dest++ = 0xFFFD; |
|
272 // if the first byte is valid for EUC-JP but the second |
|
273 // is not while being a valid US-ASCII, save it |
|
274 // instead of eating it up ! |
|
275 if ( (uint8_t)*src < (uint8_t)0x7f ) |
|
276 --src; |
|
277 } else { |
|
278 *dest++ = gJapaneseMap[mData+off]; |
|
279 } |
|
280 mState = 0; |
|
281 if(dest >= destEnd) |
|
282 goto error1; |
|
283 } |
|
284 break; |
|
285 |
|
286 case 2: // JIS 0201 |
|
287 { |
|
288 if((0xA1 <= *src) && (*src <= 0xDF)) { |
|
289 *dest++ = (0xFF61-0x00A1) + *src; |
|
290 } else { |
|
291 if (mErrBehavior == kOnError_Signal) |
|
292 goto error_invalidchar; |
|
293 *dest++ = 0xFFFD; |
|
294 // if 0x8e is not followed by a valid JIS X 0201 byte |
|
295 // but by a valid US-ASCII, save it instead of eating it up. |
|
296 if ( (uint8_t)*src < (uint8_t)0x7f ) |
|
297 --src; |
|
298 } |
|
299 mState = 0; |
|
300 if(dest >= destEnd) |
|
301 goto error1; |
|
302 } |
|
303 break; |
|
304 |
|
305 case 3: // JIS 0212 |
|
306 { |
|
307 if (IN_GR_RANGE(*src)) |
|
308 { |
|
309 mData = JIS0212_INDEX[*src & 0x7F]; |
|
310 if(mData != 0xFFFD ) |
|
311 { |
|
312 mState = 4; |
|
313 } else { |
|
314 mState = 5; // error |
|
315 } |
|
316 } else { |
|
317 // First "JIS 0212" byte is not in the valid GR range: save it |
|
318 if (mErrBehavior == kOnError_Signal) |
|
319 goto error_invalidchar; |
|
320 *dest++ = 0xFFFD; |
|
321 --src; |
|
322 mState = 0; |
|
323 if(dest >= destEnd) |
|
324 goto error1; |
|
325 } |
|
326 } |
|
327 break; |
|
328 case 4: |
|
329 { |
|
330 uint8_t off = sbIdx[*src]; |
|
331 if(0xFF != off) { |
|
332 *dest++ = gJapaneseMap[mData+off]; |
|
333 mState = 0; |
|
334 if(dest >= destEnd) |
|
335 goto error1; |
|
336 break; |
|
337 } |
|
338 // else fall through to error handler |
|
339 } |
|
340 case 5: // two bytes undefined |
|
341 { |
|
342 if (mErrBehavior == kOnError_Signal) |
|
343 goto error_invalidchar; |
|
344 *dest++ = 0xFFFD; |
|
345 // Undefined JIS 0212 two byte sequence. If the second byte is in |
|
346 // the valid range for a two byte sequence (0xa1 - 0xfe) consume |
|
347 // both bytes. Otherwise resynchronize on the second byte. |
|
348 if (!IN_GR_RANGE(*src)) |
|
349 --src; |
|
350 mState = 0; |
|
351 if(dest >= destEnd) |
|
352 goto error1; |
|
353 } |
|
354 break; |
|
355 } |
|
356 src++; |
|
357 } |
|
358 *aDestLen = dest - aDest; |
|
359 return NS_OK; |
|
360 error_invalidchar: |
|
361 *aDestLen = dest - aDest; |
|
362 *aSrcLen = src - (const unsigned char*)aSrc; |
|
363 return NS_ERROR_ILLEGAL_INPUT; |
|
364 error1: |
|
365 *aDestLen = dest - aDest; |
|
366 src++; |
|
367 if ((mState == 0) && (src == srcEnd)) { |
|
368 return NS_OK; |
|
369 } |
|
370 *aSrcLen = src - (const unsigned char*)aSrc; |
|
371 return NS_OK_UDEC_MOREOUTPUT; |
|
372 } |
|
373 |
|
374 |
|
375 |
|
376 NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert( |
|
377 const char * aSrc, int32_t * aSrcLen, |
|
378 char16_t * aDest, int32_t * aDestLen) |
|
379 { |
|
380 static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID); |
|
381 |
|
382 static const uint16_t fbIdx[128] = |
|
383 { |
|
384 /* 0x8X */ |
|
385 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, |
|
386 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, |
|
387 /* 0x9X */ |
|
388 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, |
|
389 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, |
|
390 /* 0xAX */ |
|
391 0xFFFD, 0, 94, 94* 2, 94* 3, 94* 4, 94* 5, 94* 6, |
|
392 94* 7, 94* 8 , 94* 9, 94*10, 94*11, 94*12, 94*13, 94*14, |
|
393 /* 0xBX */ |
|
394 94*15, 94*16, 94*17, 94*18, 94*19, 94*20, 94*21, 94*22, |
|
395 94*23, 94*24, 94*25, 94*26, 94*27, 94*28, 94*29, 94*30, |
|
396 /* 0xCX */ |
|
397 94*31, 94*32, 94*33, 94*34, 94*35, 94*36, 94*37, 94*38, |
|
398 94*39, 94*40, 94*41, 94*42, 94*43, 94*44, 94*45, 94*46, |
|
399 /* 0xDX */ |
|
400 94*47, 94*48, 94*49, 94*50, 94*51, 94*52, 94*53, 94*54, |
|
401 94*55, 94*56, 94*57, 94*58, 94*59, 94*60, 94*61, 94*62, |
|
402 /* 0xEX */ |
|
403 94*63, 94*64, 94*65, 94*66, 94*67, 94*68, 94*69, 94*70, |
|
404 94*71, 94*72, 94*73, 94*74, 94*75, 94*76, 94*77, 94*78, |
|
405 /* 0xFX */ |
|
406 94*79, 94*80, 94*81, 94*82, 94*83, 94*84, 94*85, 94*86, |
|
407 94*87, 94*88, 94*89, 94*90, 94*91, 94*92, 94*93, 0xFFFD, |
|
408 }; |
|
409 static const uint8_t sbIdx[256] = |
|
410 { |
|
411 /* 0x0X */ |
|
412 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
413 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
414 /* 0x1X */ |
|
415 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
416 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
417 /* 0x2X */ |
|
418 0xFF, 0, 1, 2, 3, 4, 5, 6, |
|
419 7, 8 , 9, 10, 11, 12, 13, 14, |
|
420 /* 0x3X */ |
|
421 15, 16, 17, 18, 19, 20, 21, 22, |
|
422 23, 24, 25, 26, 27, 28, 29, 30, |
|
423 /* 0x4X */ |
|
424 31, 32, 33, 34, 35, 36, 37, 38, |
|
425 39, 40, 41, 42, 43, 44, 45, 46, |
|
426 /* 0x5X */ |
|
427 47, 48, 49, 50, 51, 52, 53, 54, |
|
428 55, 56, 57, 58, 59, 60, 61, 62, |
|
429 /* 0x6X */ |
|
430 63, 64, 65, 66, 67, 68, 69, 70, |
|
431 71, 72, 73, 74, 75, 76, 77, 78, |
|
432 /* 0x7X */ |
|
433 79, 80, 81, 82, 83, 84, 85, 86, |
|
434 87, 88, 89, 90, 91, 92, 93, 0xFF, |
|
435 /* 0x8X */ |
|
436 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
437 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
438 /* 0x9X */ |
|
439 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
440 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
441 /* 0xAX */ |
|
442 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
443 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
444 /* 0xBX */ |
|
445 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
446 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
447 /* 0xCX */ |
|
448 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
449 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
450 /* 0xDX */ |
|
451 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
452 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
453 /* 0xEX */ |
|
454 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
455 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
456 /* 0xFX */ |
|
457 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
458 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
|
459 }; |
|
460 |
|
461 const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; |
|
462 const unsigned char* src =(unsigned char*) aSrc; |
|
463 char16_t* destEnd = aDest + *aDestLen; |
|
464 char16_t* dest = aDest; |
|
465 while((src < srcEnd)) |
|
466 { |
|
467 |
|
468 switch(mState) |
|
469 { |
|
470 case mState_ASCII: |
|
471 if(0x1b == *src) |
|
472 { |
|
473 mLastLegalState = mState; |
|
474 mState = mState_ESC; |
|
475 } else if(*src & 0x80) { |
|
476 if (mErrBehavior == kOnError_Signal) |
|
477 goto error3; |
|
478 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
479 goto error1; |
|
480 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
481 } else { |
|
482 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
483 goto error1; |
|
484 *dest++ = (char16_t) *src; |
|
485 } |
|
486 break; |
|
487 |
|
488 case mState_ESC: |
|
489 if( '(' == *src) { |
|
490 mState = mState_ESC_28; |
|
491 } else if ('$' == *src) { |
|
492 mState = mState_ESC_24; |
|
493 } else if ('.' == *src) { // for ISO-2022-JP-2 |
|
494 mState = mState_ESC_2e; |
|
495 } else if ('N' == *src) { // for ISO-2022-JP-2 |
|
496 mState = mState_ESC_4e; |
|
497 } else { |
|
498 if (CHECK_OVERRUN(dest, destEnd, 2)) |
|
499 goto error1; |
|
500 *dest++ = (char16_t) 0x1b; |
|
501 if (0x80 & *src) { |
|
502 if (mErrBehavior == kOnError_Signal) |
|
503 goto error3; |
|
504 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
505 } else { |
|
506 *dest++ = (char16_t) *src; |
|
507 } |
|
508 mState = mLastLegalState; |
|
509 } |
|
510 break; |
|
511 |
|
512 case mState_ESC_28: // ESC ( |
|
513 if( 'B' == *src) { |
|
514 mState = mState_ASCII; |
|
515 if (mRunLength == 0) { |
|
516 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
517 goto error1; |
|
518 *dest++ = 0xFFFD; |
|
519 } |
|
520 mRunLength = 0; |
|
521 } else if ('J' == *src) { |
|
522 mState = mState_JISX0201_1976Roman; |
|
523 if (mRunLength == 0 && mLastLegalState != mState_ASCII) { |
|
524 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
525 goto error1; |
|
526 if (mErrBehavior == kOnError_Signal) |
|
527 goto error3; |
|
528 *dest++ = 0xFFFD; |
|
529 } |
|
530 mRunLength = 0; |
|
531 } else if ('I' == *src) { |
|
532 mState = mState_JISX0201_1976Kana; |
|
533 mRunLength = 0; |
|
534 } else { |
|
535 if (CHECK_OVERRUN(dest, destEnd, 3)) |
|
536 goto error1; |
|
537 *dest++ = (char16_t) 0x1b; |
|
538 *dest++ = (char16_t) '('; |
|
539 if (0x80 & *src) { |
|
540 if (mErrBehavior == kOnError_Signal) |
|
541 goto error3; |
|
542 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
543 } else { |
|
544 *dest++ = (char16_t) *src; |
|
545 } |
|
546 mState = mLastLegalState; |
|
547 } |
|
548 break; |
|
549 |
|
550 case mState_ESC_24: // ESC $ |
|
551 if( '@' == *src) { |
|
552 mState = mState_JISX0208_1978; |
|
553 mRunLength = 0; |
|
554 } else if ('A' == *src) { |
|
555 mState = mState_GB2312_1980; |
|
556 mRunLength = 0; |
|
557 } else if ('B' == *src) { |
|
558 mState = mState_JISX0208_1983; |
|
559 mRunLength = 0; |
|
560 } else if ('(' == *src) { |
|
561 mState = mState_ESC_24_28; |
|
562 } else { |
|
563 if (CHECK_OVERRUN(dest, destEnd, 3)) |
|
564 goto error1; |
|
565 *dest++ = (char16_t) 0x1b; |
|
566 *dest++ = (char16_t) '$'; |
|
567 if (0x80 & *src) { |
|
568 if (mErrBehavior == kOnError_Signal) |
|
569 goto error3; |
|
570 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
571 } else { |
|
572 *dest++ = (char16_t) *src; |
|
573 } |
|
574 mState = mLastLegalState; |
|
575 } |
|
576 break; |
|
577 |
|
578 case mState_ESC_24_28: // ESC $ ( |
|
579 if( 'C' == *src) { |
|
580 mState = mState_KSC5601_1987; |
|
581 mRunLength = 0; |
|
582 } else if ('D' == *src) { |
|
583 mState = mState_JISX0212_1990; |
|
584 mRunLength = 0; |
|
585 } else { |
|
586 if (CHECK_OVERRUN(dest, destEnd, 4)) |
|
587 goto error1; |
|
588 *dest++ = (char16_t) 0x1b; |
|
589 *dest++ = (char16_t) '$'; |
|
590 *dest++ = (char16_t) '('; |
|
591 if (0x80 & *src) { |
|
592 if (mErrBehavior == kOnError_Signal) |
|
593 goto error3; |
|
594 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
595 } else { |
|
596 *dest++ = (char16_t) *src; |
|
597 } |
|
598 mState = mLastLegalState; |
|
599 } |
|
600 break; |
|
601 |
|
602 case mState_JISX0201_1976Roman: |
|
603 if(0x1b == *src) { |
|
604 mLastLegalState = mState; |
|
605 mState = mState_ESC; |
|
606 } else if(*src & 0x80) { |
|
607 if (mErrBehavior == kOnError_Signal) |
|
608 goto error3; |
|
609 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
610 goto error1; |
|
611 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
612 ++mRunLength; |
|
613 } else { |
|
614 // XXX We need to decide how to handle \ and ~ here |
|
615 // we may need a if statement here for '\' and '~' |
|
616 // to map them to Yen and Overbar |
|
617 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
618 goto error1; |
|
619 *dest++ = (char16_t) *src; |
|
620 ++mRunLength; |
|
621 } |
|
622 break; |
|
623 |
|
624 case mState_JISX0201_1976Kana: |
|
625 if(0x1b == *src) { |
|
626 mLastLegalState = mState; |
|
627 mState = mState_ESC; |
|
628 } else { |
|
629 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
630 goto error1; |
|
631 if((0x21 <= *src) && (*src <= 0x5F)) { |
|
632 *dest++ = (0xFF61-0x0021) + *src; |
|
633 } else { |
|
634 if (mErrBehavior == kOnError_Signal) |
|
635 goto error3; |
|
636 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
637 } |
|
638 ++mRunLength; |
|
639 } |
|
640 break; |
|
641 |
|
642 case mState_JISX0208_1978: |
|
643 if(0x1b == *src) { |
|
644 mLastLegalState = mState; |
|
645 mState = mState_ESC; |
|
646 } else if(*src & 0x80) { |
|
647 mLastLegalState = mState; |
|
648 mState = mState_ERROR; |
|
649 } else { |
|
650 mData = JIS0208_INDEX[*src & 0x7F]; |
|
651 if (0xFFFD == mData) { |
|
652 if (mErrBehavior == kOnError_Signal) |
|
653 goto error3; |
|
654 mState = mState_ERROR; |
|
655 } else { |
|
656 mState = mState_JISX0208_1978_2ndbyte; |
|
657 } |
|
658 } |
|
659 break; |
|
660 |
|
661 case mState_GB2312_1980: |
|
662 if(0x1b == *src) { |
|
663 mLastLegalState = mState; |
|
664 mState = mState_ESC; |
|
665 } else if(*src & 0x80) { |
|
666 mLastLegalState = mState; |
|
667 mState = mState_ERROR; |
|
668 } else { |
|
669 mData = fbIdx[*src & 0x7F]; |
|
670 if (0xFFFD == mData) { |
|
671 if (mErrBehavior == kOnError_Signal) |
|
672 goto error3; |
|
673 mState = mState_ERROR; |
|
674 } else { |
|
675 mState = mState_GB2312_1980_2ndbyte; |
|
676 } |
|
677 } |
|
678 break; |
|
679 |
|
680 case mState_JISX0208_1983: |
|
681 if(0x1b == *src) { |
|
682 mLastLegalState = mState; |
|
683 mState = mState_ESC; |
|
684 } else if(*src & 0x80) { |
|
685 mLastLegalState = mState; |
|
686 mState = mState_ERROR; |
|
687 } else { |
|
688 mData = JIS0208_INDEX[*src & 0x7F]; |
|
689 if (0xFFFD == mData) { |
|
690 if (mErrBehavior == kOnError_Signal) |
|
691 goto error3; |
|
692 mState = mState_ERROR; |
|
693 } else { |
|
694 mState = mState_JISX0208_1983_2ndbyte; |
|
695 } |
|
696 } |
|
697 break; |
|
698 |
|
699 case mState_KSC5601_1987: |
|
700 if(0x1b == *src) { |
|
701 mLastLegalState = mState; |
|
702 mState = mState_ESC; |
|
703 } else if(*src & 0x80) { |
|
704 mLastLegalState = mState; |
|
705 mState = mState_ERROR; |
|
706 } else { |
|
707 mData = fbIdx[*src & 0x7F]; |
|
708 if (0xFFFD == mData) { |
|
709 if (mErrBehavior == kOnError_Signal) |
|
710 goto error3; |
|
711 mState = mState_ERROR; |
|
712 } else { |
|
713 mState = mState_KSC5601_1987_2ndbyte; |
|
714 } |
|
715 } |
|
716 break; |
|
717 |
|
718 case mState_JISX0212_1990: |
|
719 if(0x1b == *src) { |
|
720 mLastLegalState = mState; |
|
721 mState = mState_ESC; |
|
722 } else if(*src & 0x80) { |
|
723 mLastLegalState = mState; |
|
724 mState = mState_ERROR; |
|
725 } else { |
|
726 mData = JIS0212_INDEX[*src & 0x7F]; |
|
727 if (0xFFFD == mData) { |
|
728 if (mErrBehavior == kOnError_Signal) |
|
729 goto error3; |
|
730 mState = mState_ERROR; |
|
731 } else { |
|
732 mState = mState_JISX0212_1990_2ndbyte; |
|
733 } |
|
734 } |
|
735 break; |
|
736 |
|
737 case mState_JISX0208_1978_2ndbyte: |
|
738 { |
|
739 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
740 goto error1; |
|
741 uint8_t off = sbIdx[*src]; |
|
742 if(0xFF == off) { |
|
743 if (mErrBehavior == kOnError_Signal) |
|
744 goto error3; |
|
745 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
746 } else { |
|
747 // XXX We need to map from JIS X 0208 1983 to 1987 |
|
748 // in the next line before pass to *dest++ |
|
749 *dest++ = gJapaneseMap[mData+off]; |
|
750 } |
|
751 ++mRunLength; |
|
752 mState = mState_JISX0208_1978; |
|
753 } |
|
754 break; |
|
755 |
|
756 case mState_GB2312_1980_2ndbyte: |
|
757 { |
|
758 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
759 goto error1; |
|
760 uint8_t off = sbIdx[*src]; |
|
761 if(0xFF == off) { |
|
762 if (mErrBehavior == kOnError_Signal) |
|
763 goto error3; |
|
764 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
765 } else { |
|
766 if (!mGB2312Decoder) { |
|
767 // creating a delegate converter (GB2312) |
|
768 nsresult rv; |
|
769 nsCOMPtr<nsICharsetConverterManager> ccm = |
|
770 do_GetService(kCharsetConverterManagerCID, &rv); |
|
771 if (NS_SUCCEEDED(rv)) { |
|
772 rv = ccm->GetUnicodeDecoderRaw("GB2312", &mGB2312Decoder); |
|
773 } |
|
774 } |
|
775 if (!mGB2312Decoder) {// failed creating a delegate converter |
|
776 goto error2; |
|
777 } else { |
|
778 unsigned char gb[2]; |
|
779 char16_t uni; |
|
780 int32_t gbLen = 2, uniLen = 1; |
|
781 // ((mData/94)+0x21) is the original 1st byte. |
|
782 // *src is the present 2nd byte. |
|
783 // Put 2 bytes (one character) to gb[] with GB2312 encoding. |
|
784 gb[0] = ((mData / 94) + 0x21) | 0x80; |
|
785 gb[1] = *src | 0x80; |
|
786 // Convert GB2312 to unicode. |
|
787 mGB2312Decoder->Convert((const char *)gb, &gbLen, |
|
788 &uni, &uniLen); |
|
789 *dest++ = uni; |
|
790 } |
|
791 } |
|
792 ++mRunLength; |
|
793 mState = mState_GB2312_1980; |
|
794 } |
|
795 break; |
|
796 |
|
797 case mState_JISX0208_1983_2ndbyte: |
|
798 { |
|
799 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
800 goto error1; |
|
801 uint8_t off = sbIdx[*src]; |
|
802 if(0xFF == off) { |
|
803 if (mErrBehavior == kOnError_Signal) |
|
804 goto error3; |
|
805 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
806 } else { |
|
807 *dest++ = gJapaneseMap[mData+off]; |
|
808 } |
|
809 ++mRunLength; |
|
810 mState = mState_JISX0208_1983; |
|
811 } |
|
812 break; |
|
813 |
|
814 case mState_KSC5601_1987_2ndbyte: |
|
815 { |
|
816 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
817 goto error1; |
|
818 uint8_t off = sbIdx[*src]; |
|
819 if(0xFF == off) { |
|
820 if (mErrBehavior == kOnError_Signal) |
|
821 goto error3; |
|
822 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
823 } else { |
|
824 if (!mEUCKRDecoder) { |
|
825 // creating a delegate converter (EUC-KR) |
|
826 nsresult rv; |
|
827 nsCOMPtr<nsICharsetConverterManager> ccm = |
|
828 do_GetService(kCharsetConverterManagerCID, &rv); |
|
829 if (NS_SUCCEEDED(rv)) { |
|
830 rv = ccm->GetUnicodeDecoderRaw("EUC-KR", &mEUCKRDecoder); |
|
831 } |
|
832 } |
|
833 if (!mEUCKRDecoder) {// failed creating a delegate converter |
|
834 goto error2; |
|
835 } else { |
|
836 unsigned char ksc[2]; |
|
837 char16_t uni; |
|
838 int32_t kscLen = 2, uniLen = 1; |
|
839 // ((mData/94)+0x21) is the original 1st byte. |
|
840 // *src is the present 2nd byte. |
|
841 // Put 2 bytes (one character) to ksc[] with EUC-KR encoding. |
|
842 ksc[0] = ((mData / 94) + 0x21) | 0x80; |
|
843 ksc[1] = *src | 0x80; |
|
844 // Convert EUC-KR to unicode. |
|
845 mEUCKRDecoder->Convert((const char *)ksc, &kscLen, |
|
846 &uni, &uniLen); |
|
847 *dest++ = uni; |
|
848 } |
|
849 } |
|
850 ++mRunLength; |
|
851 mState = mState_KSC5601_1987; |
|
852 } |
|
853 break; |
|
854 |
|
855 case mState_JISX0212_1990_2ndbyte: |
|
856 { |
|
857 uint8_t off = sbIdx[*src]; |
|
858 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
859 goto error1; |
|
860 if(0xFF == off) { |
|
861 if (mErrBehavior == kOnError_Signal) |
|
862 goto error3; |
|
863 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
864 } else { |
|
865 *dest++ = gJapaneseMap[mData+off]; |
|
866 } |
|
867 ++mRunLength; |
|
868 mState = mState_JISX0212_1990; |
|
869 } |
|
870 break; |
|
871 |
|
872 case mState_ESC_2e: // ESC . |
|
873 // "ESC ." will designate 96 character set to G2. |
|
874 mState = mLastLegalState; |
|
875 if( 'A' == *src) { |
|
876 G2charset = G2_ISO88591; |
|
877 } else if ('F' == *src) { |
|
878 G2charset = G2_ISO88597; |
|
879 } else { |
|
880 if (CHECK_OVERRUN(dest, destEnd, 3)) |
|
881 goto error1; |
|
882 *dest++ = (char16_t) 0x1b; |
|
883 *dest++ = (char16_t) '.'; |
|
884 if (0x80 & *src) { |
|
885 if (mErrBehavior == kOnError_Signal) |
|
886 goto error3; |
|
887 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
888 } else { |
|
889 *dest++ = (char16_t) *src; |
|
890 } |
|
891 } |
|
892 break; |
|
893 |
|
894 case mState_ESC_4e: // ESC N |
|
895 // "ESC N" is the SS2 sequence, that invoke a G2 designated |
|
896 // character set. Since SS2 is effective only for next one |
|
897 // character, mState should be returned to the last status. |
|
898 mState = mLastLegalState; |
|
899 if((0x20 <= *src) && (*src <= 0x7F)) { |
|
900 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
901 goto error1; |
|
902 if (G2_ISO88591 == G2charset) { |
|
903 *dest++ = *src | 0x80; |
|
904 } else if (G2_ISO88597 == G2charset) { |
|
905 if (!mISO88597Decoder) { |
|
906 // creating a delegate converter (ISO-8859-7) |
|
907 nsresult rv; |
|
908 nsCOMPtr<nsICharsetConverterManager> ccm = |
|
909 do_GetService(kCharsetConverterManagerCID, &rv); |
|
910 if (NS_SUCCEEDED(rv)) { |
|
911 rv = ccm->GetUnicodeDecoderRaw("ISO-8859-7", &mISO88597Decoder); |
|
912 } |
|
913 } |
|
914 if (!mISO88597Decoder) {// failed creating a delegate converter |
|
915 goto error2; |
|
916 } else { |
|
917 // Put one character with ISO-8859-7 encoding. |
|
918 unsigned char gr = *src | 0x80; |
|
919 char16_t uni; |
|
920 int32_t grLen = 1, uniLen = 1; |
|
921 // Convert ISO-8859-7 to unicode. |
|
922 mISO88597Decoder->Convert((const char *)&gr, &grLen, |
|
923 &uni, &uniLen); |
|
924 *dest++ = uni; |
|
925 } |
|
926 } else {// G2charset is G2_unknown (not designated yet) |
|
927 if (mErrBehavior == kOnError_Signal) |
|
928 goto error3; |
|
929 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
930 } |
|
931 ++mRunLength; |
|
932 } else { |
|
933 if (CHECK_OVERRUN(dest, destEnd, 3)) |
|
934 goto error1; |
|
935 *dest++ = (char16_t) 0x1b; |
|
936 *dest++ = (char16_t) 'N'; |
|
937 if (0x80 & *src) { |
|
938 if (mErrBehavior == kOnError_Signal) |
|
939 goto error3; |
|
940 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
941 } else { |
|
942 *dest++ = (char16_t) *src; |
|
943 } |
|
944 } |
|
945 break; |
|
946 |
|
947 case mState_ERROR: |
|
948 mState = mLastLegalState; |
|
949 if (mErrBehavior == kOnError_Signal) { |
|
950 mRunLength = 0; |
|
951 goto error3; |
|
952 } |
|
953 if (CHECK_OVERRUN(dest, destEnd, 1)) |
|
954 goto error1; |
|
955 *dest++ = UNICODE_REPLACEMENT_CHARACTER; |
|
956 ++mRunLength; |
|
957 break; |
|
958 |
|
959 } // switch |
|
960 src++; |
|
961 } |
|
962 *aDestLen = dest - aDest; |
|
963 return NS_OK; |
|
964 error1: |
|
965 *aDestLen = dest - aDest; |
|
966 *aSrcLen = src - (const unsigned char*)aSrc; |
|
967 return NS_OK_UDEC_MOREOUTPUT; |
|
968 error2: |
|
969 *aDestLen = dest - aDest; |
|
970 *aSrcLen = src - (const unsigned char*)aSrc; |
|
971 return NS_ERROR_UNEXPECTED; |
|
972 error3: |
|
973 *aDestLen = dest - aDest; |
|
974 *aSrcLen = src - (const unsigned char*)aSrc; |
|
975 return NS_ERROR_ILLEGAL_INPUT; |
|
976 } |