1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/uconv/ucvja/nsJapaneseToUnicode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,976 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +#include "nsJapaneseToUnicode.h" 1.9 + 1.10 +#include "nsUCSupport.h" 1.11 + 1.12 +#include "japanese.map" 1.13 + 1.14 +#include "nsICharsetConverterManager.h" 1.15 +#include "nsServiceManagerUtils.h" 1.16 + 1.17 +#include "mozilla/Assertions.h" 1.18 + 1.19 +// HTML5 says to use Windows-31J instead of the real Shift_JIS for decoding 1.20 +#define SJIS_INDEX gCP932Index[0] 1.21 +#define JIS0208_INDEX gCP932Index[1] 1.22 + 1.23 +#define JIS0212_INDEX gJIS0212Index 1.24 +#define SJIS_UNMAPPED 0x30fb 1.25 +#define UNICODE_REPLACEMENT_CHARACTER 0xfffd 1.26 +#define IN_GR_RANGE(b) \ 1.27 + ((uint8_t(0xa1) <= uint8_t(b)) && (uint8_t(b) <= uint8_t(0xfe))) 1.28 + 1.29 +NS_IMETHODIMP nsShiftJISToUnicode::Convert( 1.30 + const char * aSrc, int32_t * aSrcLen, 1.31 + char16_t * aDest, int32_t * aDestLen) 1.32 +{ 1.33 + static const uint8_t sbIdx[256] = 1.34 + { 1.35 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00 */ 1.36 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08 */ 1.37 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10 */ 1.38 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18 */ 1.39 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20 */ 1.40 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x28 */ 1.41 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x30 */ 1.42 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38 */ 1.43 + 0, 1, 2, 3, 4, 5, 6, 7, /* 0x40 */ 1.44 + 8, 9, 10, 11, 12, 13, 14, 15, /* 0x48 */ 1.45 + 16, 17, 18, 19, 20, 21, 22, 23, /* 0x50 */ 1.46 + 24, 25, 26, 27, 28, 29, 30, 31, /* 0x58 */ 1.47 + 32, 33, 34, 35, 36, 37, 38, 39, /* 0x60 */ 1.48 + 40, 41, 42, 43, 44, 45, 46, 47, /* 0x68 */ 1.49 + 48, 49, 50, 51, 52, 53, 54, 55, /* 0x70 */ 1.50 + 56, 57, 58, 59, 60, 61, 62, 0xFF, /* 0x78 */ 1.51 + 63, 64, 65, 66, 67, 68, 69, 70, /* 0x80 */ 1.52 + 71, 72, 73, 74, 75, 76, 77, 78, /* 0x88 */ 1.53 + 79, 80, 81, 82, 83, 84, 85, 86, /* 0x90 */ 1.54 + 87, 88, 89, 90, 91, 92, 93, 94, /* 0x98 */ 1.55 + 95, 96, 97, 98, 99, 100, 101, 102, /* 0xa0 */ 1.56 + 103, 104, 105, 106, 107, 108, 109, 110, /* 0xa8 */ 1.57 + 111, 112, 113, 114, 115, 116, 117, 118, /* 0xb0 */ 1.58 + 119, 120, 121, 122, 123, 124, 125, 126, /* 0xb8 */ 1.59 + 127, 128, 129, 130, 131, 132, 133, 134, /* 0xc0 */ 1.60 + 135, 136, 137, 138, 139, 140, 141, 142, /* 0xc8 */ 1.61 + 143, 144, 145, 146, 147, 148, 149, 150, /* 0xd0 */ 1.62 + 151, 152, 153, 154, 155, 156, 157, 158, /* 0xd8 */ 1.63 + 159, 160, 161, 162, 163, 164, 165, 166, /* 0xe0 */ 1.64 + 167, 168, 169, 170, 171, 172, 173, 174, /* 0xe8 */ 1.65 + 175, 176, 177, 178, 179, 180, 181, 182, /* 0xf0 */ 1.66 + 183, 184, 185, 186, 187, 0xFF, 0xFF, 0xFF, /* 0xf8 */ 1.67 + }; 1.68 + 1.69 + const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; 1.70 + const unsigned char* src =(unsigned char*) aSrc; 1.71 + char16_t* destEnd = aDest + *aDestLen; 1.72 + char16_t* dest = aDest; 1.73 + while (src < srcEnd) { 1.74 + switch (mState) { 1.75 + case 0: 1.76 + if (*src <= 0x80) { 1.77 + // ASCII 1.78 + *dest++ = (char16_t) *src; 1.79 + if (dest >= destEnd) { 1.80 + goto error1; 1.81 + } 1.82 + } else { 1.83 + mData = SJIS_INDEX[*src & 0x7F]; 1.84 + if (mData < 0xE000) { 1.85 + mState = 1; // two bytes 1.86 + } else if (mData < 0xF000) { 1.87 + mState = 2; // EUDC 1.88 + } else { 1.89 + *dest++ = mData; // JIS 0201 1.90 + if (dest >= destEnd) { 1.91 + goto error1; 1.92 + } 1.93 + } 1.94 + } 1.95 + break; 1.96 + 1.97 + case 1: // Index to table 1.98 + { 1.99 + MOZ_ASSERT(mData < 0xE000); 1.100 + uint8_t off = sbIdx[*src]; 1.101 + 1.102 + // Error handling: in the case where the second octet is not in the 1.103 + // valid ranges 0x40-0x7E 0x80-0xFC, unconsume the invalid octet and 1.104 + // interpret it as the ASCII value. In the case where the second 1.105 + // octet is in the valid range but there is no mapping for the 1.106 + // 2-octet sequence, do not unconsume. 1.107 + if(0xFF == off) { 1.108 + src--; 1.109 + if (mErrBehavior == kOnError_Signal) 1.110 + goto error_invalidchar; 1.111 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.112 + } else { 1.113 + char16_t ch = gJapaneseMap[mData+off]; 1.114 + if(ch == 0xfffd) { 1.115 + if (mErrBehavior == kOnError_Signal) 1.116 + goto error_invalidchar; 1.117 + ch = SJIS_UNMAPPED; 1.118 + } 1.119 + *dest++ = ch; 1.120 + } 1.121 + mState = 0; 1.122 + if(dest >= destEnd) 1.123 + goto error1; 1.124 + } 1.125 + break; 1.126 + 1.127 + case 2: // EUDC 1.128 + { 1.129 + MOZ_ASSERT(0xE000 <= mData && mData < 0xF000); 1.130 + uint8_t off = sbIdx[*src]; 1.131 + 1.132 + // Error handling as in case 1 1.133 + if(0xFF == off) { 1.134 + src--; 1.135 + if (mErrBehavior == kOnError_Signal) 1.136 + goto error_invalidchar; 1.137 + 1.138 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.139 + } else { 1.140 + *dest++ = mData + off; 1.141 + } 1.142 + mState = 0; 1.143 + if(dest >= destEnd) 1.144 + goto error1; 1.145 + } 1.146 + break; 1.147 + 1.148 + } 1.149 + src++; 1.150 + } 1.151 + *aDestLen = dest - aDest; 1.152 + return NS_OK; 1.153 +error_invalidchar: 1.154 + *aDestLen = dest - aDest; 1.155 + *aSrcLen = src - (const unsigned char*)aSrc; 1.156 + return NS_ERROR_ILLEGAL_INPUT; 1.157 +error1: 1.158 + *aDestLen = dest - aDest; 1.159 + src++; 1.160 + if ((mState == 0) && (src == srcEnd)) { 1.161 + return NS_OK; 1.162 + } 1.163 + *aSrcLen = src - (const unsigned char*)aSrc; 1.164 + return NS_OK_UDEC_MOREOUTPUT; 1.165 +} 1.166 + 1.167 +char16_t 1.168 +nsShiftJISToUnicode::GetCharacterForUnMapped() 1.169 +{ 1.170 + return char16_t(SJIS_UNMAPPED); 1.171 +} 1.172 + 1.173 +NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert( 1.174 + const char * aSrc, int32_t * aSrcLen, 1.175 + char16_t * aDest, int32_t * aDestLen) 1.176 +{ 1.177 + static const uint8_t sbIdx[256] = 1.178 + { 1.179 +/* 0x0X */ 1.180 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.181 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.182 +/* 0x1X */ 1.183 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.184 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.185 +/* 0x2X */ 1.186 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.187 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.188 +/* 0x3X */ 1.189 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.190 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.191 +/* 0x4X */ 1.192 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.193 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.194 +/* 0x5X */ 1.195 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.196 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.197 +/* 0x6X */ 1.198 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.199 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.200 +/* 0x7X */ 1.201 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.202 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.203 +/* 0x8X */ 1.204 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.205 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.206 +/* 0x9X */ 1.207 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.208 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.209 +/* 0xAX */ 1.210 + 0xFF, 0, 1, 2, 3, 4, 5, 6, 1.211 + 7, 8 , 9, 10, 11, 12, 13, 14, 1.212 +/* 0xBX */ 1.213 + 15, 16, 17, 18, 19, 20, 21, 22, 1.214 + 23, 24, 25, 26, 27, 28, 29, 30, 1.215 +/* 0xCX */ 1.216 + 31, 32, 33, 34, 35, 36, 37, 38, 1.217 + 39, 40, 41, 42, 43, 44, 45, 46, 1.218 +/* 0xDX */ 1.219 + 47, 48, 49, 50, 51, 52, 53, 54, 1.220 + 55, 56, 57, 58, 59, 60, 61, 62, 1.221 +/* 0xEX */ 1.222 + 63, 64, 65, 66, 67, 68, 69, 70, 1.223 + 71, 72, 73, 74, 75, 76, 77, 78, 1.224 +/* 0xFX */ 1.225 + 79, 80, 81, 82, 83, 84, 85, 86, 1.226 + 87, 88, 89, 90, 91, 92, 93, 0xFF, 1.227 + }; 1.228 + 1.229 + const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; 1.230 + const unsigned char* src =(unsigned char*) aSrc; 1.231 + char16_t* destEnd = aDest + *aDestLen; 1.232 + char16_t* dest = aDest; 1.233 + while((src < srcEnd)) 1.234 + { 1.235 + switch(mState) 1.236 + { 1.237 + case 0: 1.238 + if(*src & 0x80 && *src != (unsigned char)0xa0) 1.239 + { 1.240 + mData = JIS0208_INDEX[*src & 0x7F]; 1.241 + if(mData != 0xFFFD ) 1.242 + { 1.243 + mState = 1; // two byte JIS0208 1.244 + } else { 1.245 + if( 0x8e == *src) { 1.246 + // JIS 0201 1.247 + mState = 2; // JIS0201 1.248 + } else if(0x8f == *src) { 1.249 + // JIS 0212 1.250 + mState = 3; // JIS0212 1.251 + } else { 1.252 + // others 1.253 + if (mErrBehavior == kOnError_Signal) 1.254 + goto error_invalidchar; 1.255 + *dest++ = 0xFFFD; 1.256 + if(dest >= destEnd) 1.257 + goto error1; 1.258 + } 1.259 + } 1.260 + } else { 1.261 + // ASCII 1.262 + *dest++ = (char16_t) *src; 1.263 + if(dest >= destEnd) 1.264 + goto error1; 1.265 + } 1.266 + break; 1.267 + 1.268 + case 1: // Index to table 1.269 + { 1.270 + uint8_t off = sbIdx[*src]; 1.271 + if(0xFF == off) { 1.272 + if (mErrBehavior == kOnError_Signal) 1.273 + goto error_invalidchar; 1.274 + *dest++ = 0xFFFD; 1.275 + // if the first byte is valid for EUC-JP but the second 1.276 + // is not while being a valid US-ASCII, save it 1.277 + // instead of eating it up ! 1.278 + if ( (uint8_t)*src < (uint8_t)0x7f ) 1.279 + --src; 1.280 + } else { 1.281 + *dest++ = gJapaneseMap[mData+off]; 1.282 + } 1.283 + mState = 0; 1.284 + if(dest >= destEnd) 1.285 + goto error1; 1.286 + } 1.287 + break; 1.288 + 1.289 + case 2: // JIS 0201 1.290 + { 1.291 + if((0xA1 <= *src) && (*src <= 0xDF)) { 1.292 + *dest++ = (0xFF61-0x00A1) + *src; 1.293 + } else { 1.294 + if (mErrBehavior == kOnError_Signal) 1.295 + goto error_invalidchar; 1.296 + *dest++ = 0xFFFD; 1.297 + // if 0x8e is not followed by a valid JIS X 0201 byte 1.298 + // but by a valid US-ASCII, save it instead of eating it up. 1.299 + if ( (uint8_t)*src < (uint8_t)0x7f ) 1.300 + --src; 1.301 + } 1.302 + mState = 0; 1.303 + if(dest >= destEnd) 1.304 + goto error1; 1.305 + } 1.306 + break; 1.307 + 1.308 + case 3: // JIS 0212 1.309 + { 1.310 + if (IN_GR_RANGE(*src)) 1.311 + { 1.312 + mData = JIS0212_INDEX[*src & 0x7F]; 1.313 + if(mData != 0xFFFD ) 1.314 + { 1.315 + mState = 4; 1.316 + } else { 1.317 + mState = 5; // error 1.318 + } 1.319 + } else { 1.320 + // First "JIS 0212" byte is not in the valid GR range: save it 1.321 + if (mErrBehavior == kOnError_Signal) 1.322 + goto error_invalidchar; 1.323 + *dest++ = 0xFFFD; 1.324 + --src; 1.325 + mState = 0; 1.326 + if(dest >= destEnd) 1.327 + goto error1; 1.328 + } 1.329 + } 1.330 + break; 1.331 + case 4: 1.332 + { 1.333 + uint8_t off = sbIdx[*src]; 1.334 + if(0xFF != off) { 1.335 + *dest++ = gJapaneseMap[mData+off]; 1.336 + mState = 0; 1.337 + if(dest >= destEnd) 1.338 + goto error1; 1.339 + break; 1.340 + } 1.341 + // else fall through to error handler 1.342 + } 1.343 + case 5: // two bytes undefined 1.344 + { 1.345 + if (mErrBehavior == kOnError_Signal) 1.346 + goto error_invalidchar; 1.347 + *dest++ = 0xFFFD; 1.348 + // Undefined JIS 0212 two byte sequence. If the second byte is in 1.349 + // the valid range for a two byte sequence (0xa1 - 0xfe) consume 1.350 + // both bytes. Otherwise resynchronize on the second byte. 1.351 + if (!IN_GR_RANGE(*src)) 1.352 + --src; 1.353 + mState = 0; 1.354 + if(dest >= destEnd) 1.355 + goto error1; 1.356 + } 1.357 + break; 1.358 + } 1.359 + src++; 1.360 + } 1.361 + *aDestLen = dest - aDest; 1.362 + return NS_OK; 1.363 +error_invalidchar: 1.364 + *aDestLen = dest - aDest; 1.365 + *aSrcLen = src - (const unsigned char*)aSrc; 1.366 + return NS_ERROR_ILLEGAL_INPUT; 1.367 +error1: 1.368 + *aDestLen = dest - aDest; 1.369 + src++; 1.370 + if ((mState == 0) && (src == srcEnd)) { 1.371 + return NS_OK; 1.372 + } 1.373 + *aSrcLen = src - (const unsigned char*)aSrc; 1.374 + return NS_OK_UDEC_MOREOUTPUT; 1.375 +} 1.376 + 1.377 + 1.378 + 1.379 +NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert( 1.380 + const char * aSrc, int32_t * aSrcLen, 1.381 + char16_t * aDest, int32_t * aDestLen) 1.382 +{ 1.383 + static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID); 1.384 + 1.385 + static const uint16_t fbIdx[128] = 1.386 + { 1.387 +/* 0x8X */ 1.388 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 1.389 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 1.390 +/* 0x9X */ 1.391 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 1.392 + 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 1.393 +/* 0xAX */ 1.394 + 0xFFFD, 0, 94, 94* 2, 94* 3, 94* 4, 94* 5, 94* 6, 1.395 + 94* 7, 94* 8 , 94* 9, 94*10, 94*11, 94*12, 94*13, 94*14, 1.396 +/* 0xBX */ 1.397 + 94*15, 94*16, 94*17, 94*18, 94*19, 94*20, 94*21, 94*22, 1.398 + 94*23, 94*24, 94*25, 94*26, 94*27, 94*28, 94*29, 94*30, 1.399 +/* 0xCX */ 1.400 + 94*31, 94*32, 94*33, 94*34, 94*35, 94*36, 94*37, 94*38, 1.401 + 94*39, 94*40, 94*41, 94*42, 94*43, 94*44, 94*45, 94*46, 1.402 +/* 0xDX */ 1.403 + 94*47, 94*48, 94*49, 94*50, 94*51, 94*52, 94*53, 94*54, 1.404 + 94*55, 94*56, 94*57, 94*58, 94*59, 94*60, 94*61, 94*62, 1.405 +/* 0xEX */ 1.406 + 94*63, 94*64, 94*65, 94*66, 94*67, 94*68, 94*69, 94*70, 1.407 + 94*71, 94*72, 94*73, 94*74, 94*75, 94*76, 94*77, 94*78, 1.408 +/* 0xFX */ 1.409 + 94*79, 94*80, 94*81, 94*82, 94*83, 94*84, 94*85, 94*86, 1.410 + 94*87, 94*88, 94*89, 94*90, 94*91, 94*92, 94*93, 0xFFFD, 1.411 + }; 1.412 + static const uint8_t sbIdx[256] = 1.413 + { 1.414 +/* 0x0X */ 1.415 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.416 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.417 +/* 0x1X */ 1.418 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.419 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.420 +/* 0x2X */ 1.421 + 0xFF, 0, 1, 2, 3, 4, 5, 6, 1.422 + 7, 8 , 9, 10, 11, 12, 13, 14, 1.423 +/* 0x3X */ 1.424 + 15, 16, 17, 18, 19, 20, 21, 22, 1.425 + 23, 24, 25, 26, 27, 28, 29, 30, 1.426 +/* 0x4X */ 1.427 + 31, 32, 33, 34, 35, 36, 37, 38, 1.428 + 39, 40, 41, 42, 43, 44, 45, 46, 1.429 +/* 0x5X */ 1.430 + 47, 48, 49, 50, 51, 52, 53, 54, 1.431 + 55, 56, 57, 58, 59, 60, 61, 62, 1.432 +/* 0x6X */ 1.433 + 63, 64, 65, 66, 67, 68, 69, 70, 1.434 + 71, 72, 73, 74, 75, 76, 77, 78, 1.435 +/* 0x7X */ 1.436 + 79, 80, 81, 82, 83, 84, 85, 86, 1.437 + 87, 88, 89, 90, 91, 92, 93, 0xFF, 1.438 +/* 0x8X */ 1.439 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.440 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.441 +/* 0x9X */ 1.442 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.443 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.444 +/* 0xAX */ 1.445 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.446 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.447 +/* 0xBX */ 1.448 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.449 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.450 +/* 0xCX */ 1.451 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.452 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.453 +/* 0xDX */ 1.454 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.455 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.456 +/* 0xEX */ 1.457 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.458 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.459 +/* 0xFX */ 1.460 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.461 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1.462 + }; 1.463 + 1.464 + const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; 1.465 + const unsigned char* src =(unsigned char*) aSrc; 1.466 + char16_t* destEnd = aDest + *aDestLen; 1.467 + char16_t* dest = aDest; 1.468 + while((src < srcEnd)) 1.469 + { 1.470 + 1.471 + switch(mState) 1.472 + { 1.473 + case mState_ASCII: 1.474 + if(0x1b == *src) 1.475 + { 1.476 + mLastLegalState = mState; 1.477 + mState = mState_ESC; 1.478 + } else if(*src & 0x80) { 1.479 + if (mErrBehavior == kOnError_Signal) 1.480 + goto error3; 1.481 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.482 + goto error1; 1.483 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.484 + } else { 1.485 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.486 + goto error1; 1.487 + *dest++ = (char16_t) *src; 1.488 + } 1.489 + break; 1.490 + 1.491 + case mState_ESC: 1.492 + if( '(' == *src) { 1.493 + mState = mState_ESC_28; 1.494 + } else if ('$' == *src) { 1.495 + mState = mState_ESC_24; 1.496 + } else if ('.' == *src) { // for ISO-2022-JP-2 1.497 + mState = mState_ESC_2e; 1.498 + } else if ('N' == *src) { // for ISO-2022-JP-2 1.499 + mState = mState_ESC_4e; 1.500 + } else { 1.501 + if (CHECK_OVERRUN(dest, destEnd, 2)) 1.502 + goto error1; 1.503 + *dest++ = (char16_t) 0x1b; 1.504 + if (0x80 & *src) { 1.505 + if (mErrBehavior == kOnError_Signal) 1.506 + goto error3; 1.507 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.508 + } else { 1.509 + *dest++ = (char16_t) *src; 1.510 + } 1.511 + mState = mLastLegalState; 1.512 + } 1.513 + break; 1.514 + 1.515 + case mState_ESC_28: // ESC ( 1.516 + if( 'B' == *src) { 1.517 + mState = mState_ASCII; 1.518 + if (mRunLength == 0) { 1.519 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.520 + goto error1; 1.521 + *dest++ = 0xFFFD; 1.522 + } 1.523 + mRunLength = 0; 1.524 + } else if ('J' == *src) { 1.525 + mState = mState_JISX0201_1976Roman; 1.526 + if (mRunLength == 0 && mLastLegalState != mState_ASCII) { 1.527 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.528 + goto error1; 1.529 + if (mErrBehavior == kOnError_Signal) 1.530 + goto error3; 1.531 + *dest++ = 0xFFFD; 1.532 + } 1.533 + mRunLength = 0; 1.534 + } else if ('I' == *src) { 1.535 + mState = mState_JISX0201_1976Kana; 1.536 + mRunLength = 0; 1.537 + } else { 1.538 + if (CHECK_OVERRUN(dest, destEnd, 3)) 1.539 + goto error1; 1.540 + *dest++ = (char16_t) 0x1b; 1.541 + *dest++ = (char16_t) '('; 1.542 + if (0x80 & *src) { 1.543 + if (mErrBehavior == kOnError_Signal) 1.544 + goto error3; 1.545 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.546 + } else { 1.547 + *dest++ = (char16_t) *src; 1.548 + } 1.549 + mState = mLastLegalState; 1.550 + } 1.551 + break; 1.552 + 1.553 + case mState_ESC_24: // ESC $ 1.554 + if( '@' == *src) { 1.555 + mState = mState_JISX0208_1978; 1.556 + mRunLength = 0; 1.557 + } else if ('A' == *src) { 1.558 + mState = mState_GB2312_1980; 1.559 + mRunLength = 0; 1.560 + } else if ('B' == *src) { 1.561 + mState = mState_JISX0208_1983; 1.562 + mRunLength = 0; 1.563 + } else if ('(' == *src) { 1.564 + mState = mState_ESC_24_28; 1.565 + } else { 1.566 + if (CHECK_OVERRUN(dest, destEnd, 3)) 1.567 + goto error1; 1.568 + *dest++ = (char16_t) 0x1b; 1.569 + *dest++ = (char16_t) '$'; 1.570 + if (0x80 & *src) { 1.571 + if (mErrBehavior == kOnError_Signal) 1.572 + goto error3; 1.573 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.574 + } else { 1.575 + *dest++ = (char16_t) *src; 1.576 + } 1.577 + mState = mLastLegalState; 1.578 + } 1.579 + break; 1.580 + 1.581 + case mState_ESC_24_28: // ESC $ ( 1.582 + if( 'C' == *src) { 1.583 + mState = mState_KSC5601_1987; 1.584 + mRunLength = 0; 1.585 + } else if ('D' == *src) { 1.586 + mState = mState_JISX0212_1990; 1.587 + mRunLength = 0; 1.588 + } else { 1.589 + if (CHECK_OVERRUN(dest, destEnd, 4)) 1.590 + goto error1; 1.591 + *dest++ = (char16_t) 0x1b; 1.592 + *dest++ = (char16_t) '$'; 1.593 + *dest++ = (char16_t) '('; 1.594 + if (0x80 & *src) { 1.595 + if (mErrBehavior == kOnError_Signal) 1.596 + goto error3; 1.597 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.598 + } else { 1.599 + *dest++ = (char16_t) *src; 1.600 + } 1.601 + mState = mLastLegalState; 1.602 + } 1.603 + break; 1.604 + 1.605 + case mState_JISX0201_1976Roman: 1.606 + if(0x1b == *src) { 1.607 + mLastLegalState = mState; 1.608 + mState = mState_ESC; 1.609 + } else if(*src & 0x80) { 1.610 + if (mErrBehavior == kOnError_Signal) 1.611 + goto error3; 1.612 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.613 + goto error1; 1.614 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.615 + ++mRunLength; 1.616 + } else { 1.617 + // XXX We need to decide how to handle \ and ~ here 1.618 + // we may need a if statement here for '\' and '~' 1.619 + // to map them to Yen and Overbar 1.620 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.621 + goto error1; 1.622 + *dest++ = (char16_t) *src; 1.623 + ++mRunLength; 1.624 + } 1.625 + break; 1.626 + 1.627 + case mState_JISX0201_1976Kana: 1.628 + if(0x1b == *src) { 1.629 + mLastLegalState = mState; 1.630 + mState = mState_ESC; 1.631 + } else { 1.632 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.633 + goto error1; 1.634 + if((0x21 <= *src) && (*src <= 0x5F)) { 1.635 + *dest++ = (0xFF61-0x0021) + *src; 1.636 + } else { 1.637 + if (mErrBehavior == kOnError_Signal) 1.638 + goto error3; 1.639 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.640 + } 1.641 + ++mRunLength; 1.642 + } 1.643 + break; 1.644 + 1.645 + case mState_JISX0208_1978: 1.646 + if(0x1b == *src) { 1.647 + mLastLegalState = mState; 1.648 + mState = mState_ESC; 1.649 + } else if(*src & 0x80) { 1.650 + mLastLegalState = mState; 1.651 + mState = mState_ERROR; 1.652 + } else { 1.653 + mData = JIS0208_INDEX[*src & 0x7F]; 1.654 + if (0xFFFD == mData) { 1.655 + if (mErrBehavior == kOnError_Signal) 1.656 + goto error3; 1.657 + mState = mState_ERROR; 1.658 + } else { 1.659 + mState = mState_JISX0208_1978_2ndbyte; 1.660 + } 1.661 + } 1.662 + break; 1.663 + 1.664 + case mState_GB2312_1980: 1.665 + if(0x1b == *src) { 1.666 + mLastLegalState = mState; 1.667 + mState = mState_ESC; 1.668 + } else if(*src & 0x80) { 1.669 + mLastLegalState = mState; 1.670 + mState = mState_ERROR; 1.671 + } else { 1.672 + mData = fbIdx[*src & 0x7F]; 1.673 + if (0xFFFD == mData) { 1.674 + if (mErrBehavior == kOnError_Signal) 1.675 + goto error3; 1.676 + mState = mState_ERROR; 1.677 + } else { 1.678 + mState = mState_GB2312_1980_2ndbyte; 1.679 + } 1.680 + } 1.681 + break; 1.682 + 1.683 + case mState_JISX0208_1983: 1.684 + if(0x1b == *src) { 1.685 + mLastLegalState = mState; 1.686 + mState = mState_ESC; 1.687 + } else if(*src & 0x80) { 1.688 + mLastLegalState = mState; 1.689 + mState = mState_ERROR; 1.690 + } else { 1.691 + mData = JIS0208_INDEX[*src & 0x7F]; 1.692 + if (0xFFFD == mData) { 1.693 + if (mErrBehavior == kOnError_Signal) 1.694 + goto error3; 1.695 + mState = mState_ERROR; 1.696 + } else { 1.697 + mState = mState_JISX0208_1983_2ndbyte; 1.698 + } 1.699 + } 1.700 + break; 1.701 + 1.702 + case mState_KSC5601_1987: 1.703 + if(0x1b == *src) { 1.704 + mLastLegalState = mState; 1.705 + mState = mState_ESC; 1.706 + } else if(*src & 0x80) { 1.707 + mLastLegalState = mState; 1.708 + mState = mState_ERROR; 1.709 + } else { 1.710 + mData = fbIdx[*src & 0x7F]; 1.711 + if (0xFFFD == mData) { 1.712 + if (mErrBehavior == kOnError_Signal) 1.713 + goto error3; 1.714 + mState = mState_ERROR; 1.715 + } else { 1.716 + mState = mState_KSC5601_1987_2ndbyte; 1.717 + } 1.718 + } 1.719 + break; 1.720 + 1.721 + case mState_JISX0212_1990: 1.722 + if(0x1b == *src) { 1.723 + mLastLegalState = mState; 1.724 + mState = mState_ESC; 1.725 + } else if(*src & 0x80) { 1.726 + mLastLegalState = mState; 1.727 + mState = mState_ERROR; 1.728 + } else { 1.729 + mData = JIS0212_INDEX[*src & 0x7F]; 1.730 + if (0xFFFD == mData) { 1.731 + if (mErrBehavior == kOnError_Signal) 1.732 + goto error3; 1.733 + mState = mState_ERROR; 1.734 + } else { 1.735 + mState = mState_JISX0212_1990_2ndbyte; 1.736 + } 1.737 + } 1.738 + break; 1.739 + 1.740 + case mState_JISX0208_1978_2ndbyte: 1.741 + { 1.742 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.743 + goto error1; 1.744 + uint8_t off = sbIdx[*src]; 1.745 + if(0xFF == off) { 1.746 + if (mErrBehavior == kOnError_Signal) 1.747 + goto error3; 1.748 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.749 + } else { 1.750 + // XXX We need to map from JIS X 0208 1983 to 1987 1.751 + // in the next line before pass to *dest++ 1.752 + *dest++ = gJapaneseMap[mData+off]; 1.753 + } 1.754 + ++mRunLength; 1.755 + mState = mState_JISX0208_1978; 1.756 + } 1.757 + break; 1.758 + 1.759 + case mState_GB2312_1980_2ndbyte: 1.760 + { 1.761 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.762 + goto error1; 1.763 + uint8_t off = sbIdx[*src]; 1.764 + if(0xFF == off) { 1.765 + if (mErrBehavior == kOnError_Signal) 1.766 + goto error3; 1.767 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.768 + } else { 1.769 + if (!mGB2312Decoder) { 1.770 + // creating a delegate converter (GB2312) 1.771 + nsresult rv; 1.772 + nsCOMPtr<nsICharsetConverterManager> ccm = 1.773 + do_GetService(kCharsetConverterManagerCID, &rv); 1.774 + if (NS_SUCCEEDED(rv)) { 1.775 + rv = ccm->GetUnicodeDecoderRaw("GB2312", &mGB2312Decoder); 1.776 + } 1.777 + } 1.778 + if (!mGB2312Decoder) {// failed creating a delegate converter 1.779 + goto error2; 1.780 + } else { 1.781 + unsigned char gb[2]; 1.782 + char16_t uni; 1.783 + int32_t gbLen = 2, uniLen = 1; 1.784 + // ((mData/94)+0x21) is the original 1st byte. 1.785 + // *src is the present 2nd byte. 1.786 + // Put 2 bytes (one character) to gb[] with GB2312 encoding. 1.787 + gb[0] = ((mData / 94) + 0x21) | 0x80; 1.788 + gb[1] = *src | 0x80; 1.789 + // Convert GB2312 to unicode. 1.790 + mGB2312Decoder->Convert((const char *)gb, &gbLen, 1.791 + &uni, &uniLen); 1.792 + *dest++ = uni; 1.793 + } 1.794 + } 1.795 + ++mRunLength; 1.796 + mState = mState_GB2312_1980; 1.797 + } 1.798 + break; 1.799 + 1.800 + case mState_JISX0208_1983_2ndbyte: 1.801 + { 1.802 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.803 + goto error1; 1.804 + uint8_t off = sbIdx[*src]; 1.805 + if(0xFF == off) { 1.806 + if (mErrBehavior == kOnError_Signal) 1.807 + goto error3; 1.808 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.809 + } else { 1.810 + *dest++ = gJapaneseMap[mData+off]; 1.811 + } 1.812 + ++mRunLength; 1.813 + mState = mState_JISX0208_1983; 1.814 + } 1.815 + break; 1.816 + 1.817 + case mState_KSC5601_1987_2ndbyte: 1.818 + { 1.819 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.820 + goto error1; 1.821 + uint8_t off = sbIdx[*src]; 1.822 + if(0xFF == off) { 1.823 + if (mErrBehavior == kOnError_Signal) 1.824 + goto error3; 1.825 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.826 + } else { 1.827 + if (!mEUCKRDecoder) { 1.828 + // creating a delegate converter (EUC-KR) 1.829 + nsresult rv; 1.830 + nsCOMPtr<nsICharsetConverterManager> ccm = 1.831 + do_GetService(kCharsetConverterManagerCID, &rv); 1.832 + if (NS_SUCCEEDED(rv)) { 1.833 + rv = ccm->GetUnicodeDecoderRaw("EUC-KR", &mEUCKRDecoder); 1.834 + } 1.835 + } 1.836 + if (!mEUCKRDecoder) {// failed creating a delegate converter 1.837 + goto error2; 1.838 + } else { 1.839 + unsigned char ksc[2]; 1.840 + char16_t uni; 1.841 + int32_t kscLen = 2, uniLen = 1; 1.842 + // ((mData/94)+0x21) is the original 1st byte. 1.843 + // *src is the present 2nd byte. 1.844 + // Put 2 bytes (one character) to ksc[] with EUC-KR encoding. 1.845 + ksc[0] = ((mData / 94) + 0x21) | 0x80; 1.846 + ksc[1] = *src | 0x80; 1.847 + // Convert EUC-KR to unicode. 1.848 + mEUCKRDecoder->Convert((const char *)ksc, &kscLen, 1.849 + &uni, &uniLen); 1.850 + *dest++ = uni; 1.851 + } 1.852 + } 1.853 + ++mRunLength; 1.854 + mState = mState_KSC5601_1987; 1.855 + } 1.856 + break; 1.857 + 1.858 + case mState_JISX0212_1990_2ndbyte: 1.859 + { 1.860 + uint8_t off = sbIdx[*src]; 1.861 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.862 + goto error1; 1.863 + if(0xFF == off) { 1.864 + if (mErrBehavior == kOnError_Signal) 1.865 + goto error3; 1.866 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.867 + } else { 1.868 + *dest++ = gJapaneseMap[mData+off]; 1.869 + } 1.870 + ++mRunLength; 1.871 + mState = mState_JISX0212_1990; 1.872 + } 1.873 + break; 1.874 + 1.875 + case mState_ESC_2e: // ESC . 1.876 + // "ESC ." will designate 96 character set to G2. 1.877 + mState = mLastLegalState; 1.878 + if( 'A' == *src) { 1.879 + G2charset = G2_ISO88591; 1.880 + } else if ('F' == *src) { 1.881 + G2charset = G2_ISO88597; 1.882 + } else { 1.883 + if (CHECK_OVERRUN(dest, destEnd, 3)) 1.884 + goto error1; 1.885 + *dest++ = (char16_t) 0x1b; 1.886 + *dest++ = (char16_t) '.'; 1.887 + if (0x80 & *src) { 1.888 + if (mErrBehavior == kOnError_Signal) 1.889 + goto error3; 1.890 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.891 + } else { 1.892 + *dest++ = (char16_t) *src; 1.893 + } 1.894 + } 1.895 + break; 1.896 + 1.897 + case mState_ESC_4e: // ESC N 1.898 + // "ESC N" is the SS2 sequence, that invoke a G2 designated 1.899 + // character set. Since SS2 is effective only for next one 1.900 + // character, mState should be returned to the last status. 1.901 + mState = mLastLegalState; 1.902 + if((0x20 <= *src) && (*src <= 0x7F)) { 1.903 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.904 + goto error1; 1.905 + if (G2_ISO88591 == G2charset) { 1.906 + *dest++ = *src | 0x80; 1.907 + } else if (G2_ISO88597 == G2charset) { 1.908 + if (!mISO88597Decoder) { 1.909 + // creating a delegate converter (ISO-8859-7) 1.910 + nsresult rv; 1.911 + nsCOMPtr<nsICharsetConverterManager> ccm = 1.912 + do_GetService(kCharsetConverterManagerCID, &rv); 1.913 + if (NS_SUCCEEDED(rv)) { 1.914 + rv = ccm->GetUnicodeDecoderRaw("ISO-8859-7", &mISO88597Decoder); 1.915 + } 1.916 + } 1.917 + if (!mISO88597Decoder) {// failed creating a delegate converter 1.918 + goto error2; 1.919 + } else { 1.920 + // Put one character with ISO-8859-7 encoding. 1.921 + unsigned char gr = *src | 0x80; 1.922 + char16_t uni; 1.923 + int32_t grLen = 1, uniLen = 1; 1.924 + // Convert ISO-8859-7 to unicode. 1.925 + mISO88597Decoder->Convert((const char *)&gr, &grLen, 1.926 + &uni, &uniLen); 1.927 + *dest++ = uni; 1.928 + } 1.929 + } else {// G2charset is G2_unknown (not designated yet) 1.930 + if (mErrBehavior == kOnError_Signal) 1.931 + goto error3; 1.932 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.933 + } 1.934 + ++mRunLength; 1.935 + } else { 1.936 + if (CHECK_OVERRUN(dest, destEnd, 3)) 1.937 + goto error1; 1.938 + *dest++ = (char16_t) 0x1b; 1.939 + *dest++ = (char16_t) 'N'; 1.940 + if (0x80 & *src) { 1.941 + if (mErrBehavior == kOnError_Signal) 1.942 + goto error3; 1.943 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.944 + } else { 1.945 + *dest++ = (char16_t) *src; 1.946 + } 1.947 + } 1.948 + break; 1.949 + 1.950 + case mState_ERROR: 1.951 + mState = mLastLegalState; 1.952 + if (mErrBehavior == kOnError_Signal) { 1.953 + mRunLength = 0; 1.954 + goto error3; 1.955 + } 1.956 + if (CHECK_OVERRUN(dest, destEnd, 1)) 1.957 + goto error1; 1.958 + *dest++ = UNICODE_REPLACEMENT_CHARACTER; 1.959 + ++mRunLength; 1.960 + break; 1.961 + 1.962 + } // switch 1.963 + src++; 1.964 + } 1.965 + *aDestLen = dest - aDest; 1.966 + return NS_OK; 1.967 +error1: 1.968 + *aDestLen = dest - aDest; 1.969 + *aSrcLen = src - (const unsigned char*)aSrc; 1.970 + return NS_OK_UDEC_MOREOUTPUT; 1.971 +error2: 1.972 + *aDestLen = dest - aDest; 1.973 + *aSrcLen = src - (const unsigned char*)aSrc; 1.974 + return NS_ERROR_UNEXPECTED; 1.975 +error3: 1.976 + *aDestLen = dest - aDest; 1.977 + *aSrcLen = src - (const unsigned char*)aSrc; 1.978 + return NS_ERROR_ILLEGAL_INPUT; 1.979 +}