michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: #include "nsJapaneseToUnicode.h" michael@0: michael@0: #include "nsUCSupport.h" michael@0: michael@0: #include "japanese.map" michael@0: michael@0: #include "nsICharsetConverterManager.h" michael@0: #include "nsServiceManagerUtils.h" michael@0: michael@0: #include "mozilla/Assertions.h" michael@0: michael@0: // HTML5 says to use Windows-31J instead of the real Shift_JIS for decoding michael@0: #define SJIS_INDEX gCP932Index[0] michael@0: #define JIS0208_INDEX gCP932Index[1] michael@0: michael@0: #define JIS0212_INDEX gJIS0212Index michael@0: #define SJIS_UNMAPPED 0x30fb michael@0: #define UNICODE_REPLACEMENT_CHARACTER 0xfffd michael@0: #define IN_GR_RANGE(b) \ michael@0: ((uint8_t(0xa1) <= uint8_t(b)) && (uint8_t(b) <= uint8_t(0xfe))) michael@0: michael@0: NS_IMETHODIMP nsShiftJISToUnicode::Convert( michael@0: const char * aSrc, int32_t * aSrcLen, michael@0: char16_t * aDest, int32_t * aDestLen) michael@0: { michael@0: static const uint8_t sbIdx[256] = michael@0: { michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x28 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x30 */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38 */ michael@0: 0, 1, 2, 3, 4, 5, 6, 7, /* 0x40 */ michael@0: 8, 9, 10, 11, 12, 13, 14, 15, /* 0x48 */ michael@0: 16, 17, 18, 19, 20, 21, 22, 23, /* 0x50 */ michael@0: 24, 25, 26, 27, 28, 29, 30, 31, /* 0x58 */ michael@0: 32, 33, 34, 35, 36, 37, 38, 39, /* 0x60 */ michael@0: 40, 41, 42, 43, 44, 45, 46, 47, /* 0x68 */ michael@0: 48, 49, 50, 51, 52, 53, 54, 55, /* 0x70 */ michael@0: 56, 57, 58, 59, 60, 61, 62, 0xFF, /* 0x78 */ michael@0: 63, 64, 65, 66, 67, 68, 69, 70, /* 0x80 */ michael@0: 71, 72, 73, 74, 75, 76, 77, 78, /* 0x88 */ michael@0: 79, 80, 81, 82, 83, 84, 85, 86, /* 0x90 */ michael@0: 87, 88, 89, 90, 91, 92, 93, 94, /* 0x98 */ michael@0: 95, 96, 97, 98, 99, 100, 101, 102, /* 0xa0 */ michael@0: 103, 104, 105, 106, 107, 108, 109, 110, /* 0xa8 */ michael@0: 111, 112, 113, 114, 115, 116, 117, 118, /* 0xb0 */ michael@0: 119, 120, 121, 122, 123, 124, 125, 126, /* 0xb8 */ michael@0: 127, 128, 129, 130, 131, 132, 133, 134, /* 0xc0 */ michael@0: 135, 136, 137, 138, 139, 140, 141, 142, /* 0xc8 */ michael@0: 143, 144, 145, 146, 147, 148, 149, 150, /* 0xd0 */ michael@0: 151, 152, 153, 154, 155, 156, 157, 158, /* 0xd8 */ michael@0: 159, 160, 161, 162, 163, 164, 165, 166, /* 0xe0 */ michael@0: 167, 168, 169, 170, 171, 172, 173, 174, /* 0xe8 */ michael@0: 175, 176, 177, 178, 179, 180, 181, 182, /* 0xf0 */ michael@0: 183, 184, 185, 186, 187, 0xFF, 0xFF, 0xFF, /* 0xf8 */ michael@0: }; michael@0: michael@0: const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; michael@0: const unsigned char* src =(unsigned char*) aSrc; michael@0: char16_t* destEnd = aDest + *aDestLen; michael@0: char16_t* dest = aDest; michael@0: while (src < srcEnd) { michael@0: switch (mState) { michael@0: case 0: michael@0: if (*src <= 0x80) { michael@0: // ASCII michael@0: *dest++ = (char16_t) *src; michael@0: if (dest >= destEnd) { michael@0: goto error1; michael@0: } michael@0: } else { michael@0: mData = SJIS_INDEX[*src & 0x7F]; michael@0: if (mData < 0xE000) { michael@0: mState = 1; // two bytes michael@0: } else if (mData < 0xF000) { michael@0: mState = 2; // EUDC michael@0: } else { michael@0: *dest++ = mData; // JIS 0201 michael@0: if (dest >= destEnd) { michael@0: goto error1; michael@0: } michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case 1: // Index to table michael@0: { michael@0: MOZ_ASSERT(mData < 0xE000); michael@0: uint8_t off = sbIdx[*src]; michael@0: michael@0: // Error handling: in the case where the second octet is not in the michael@0: // valid ranges 0x40-0x7E 0x80-0xFC, unconsume the invalid octet and michael@0: // interpret it as the ASCII value. In the case where the second michael@0: // octet is in the valid range but there is no mapping for the michael@0: // 2-octet sequence, do not unconsume. michael@0: if(0xFF == off) { michael@0: src--; michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: char16_t ch = gJapaneseMap[mData+off]; michael@0: if(ch == 0xfffd) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: ch = SJIS_UNMAPPED; michael@0: } michael@0: *dest++ = ch; michael@0: } michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: michael@0: case 2: // EUDC michael@0: { michael@0: MOZ_ASSERT(0xE000 <= mData && mData < 0xF000); michael@0: uint8_t off = sbIdx[*src]; michael@0: michael@0: // Error handling as in case 1 michael@0: if(0xFF == off) { michael@0: src--; michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = mData + off; michael@0: } michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: michael@0: } michael@0: src++; michael@0: } michael@0: *aDestLen = dest - aDest; michael@0: return NS_OK; michael@0: error_invalidchar: michael@0: *aDestLen = dest - aDest; michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_ERROR_ILLEGAL_INPUT; michael@0: error1: michael@0: *aDestLen = dest - aDest; michael@0: src++; michael@0: if ((mState == 0) && (src == srcEnd)) { michael@0: return NS_OK; michael@0: } michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_OK_UDEC_MOREOUTPUT; michael@0: } michael@0: michael@0: char16_t michael@0: nsShiftJISToUnicode::GetCharacterForUnMapped() michael@0: { michael@0: return char16_t(SJIS_UNMAPPED); michael@0: } michael@0: michael@0: NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert( michael@0: const char * aSrc, int32_t * aSrcLen, michael@0: char16_t * aDest, int32_t * aDestLen) michael@0: { michael@0: static const uint8_t sbIdx[256] = michael@0: { michael@0: /* 0x0X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x1X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x2X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x3X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x4X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x5X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x6X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x7X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x8X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x9X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xAX */ michael@0: 0xFF, 0, 1, 2, 3, 4, 5, 6, michael@0: 7, 8 , 9, 10, 11, 12, 13, 14, michael@0: /* 0xBX */ michael@0: 15, 16, 17, 18, 19, 20, 21, 22, michael@0: 23, 24, 25, 26, 27, 28, 29, 30, michael@0: /* 0xCX */ michael@0: 31, 32, 33, 34, 35, 36, 37, 38, michael@0: 39, 40, 41, 42, 43, 44, 45, 46, michael@0: /* 0xDX */ michael@0: 47, 48, 49, 50, 51, 52, 53, 54, michael@0: 55, 56, 57, 58, 59, 60, 61, 62, michael@0: /* 0xEX */ michael@0: 63, 64, 65, 66, 67, 68, 69, 70, michael@0: 71, 72, 73, 74, 75, 76, 77, 78, michael@0: /* 0xFX */ michael@0: 79, 80, 81, 82, 83, 84, 85, 86, michael@0: 87, 88, 89, 90, 91, 92, 93, 0xFF, michael@0: }; michael@0: michael@0: const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; michael@0: const unsigned char* src =(unsigned char*) aSrc; michael@0: char16_t* destEnd = aDest + *aDestLen; michael@0: char16_t* dest = aDest; michael@0: while((src < srcEnd)) michael@0: { michael@0: switch(mState) michael@0: { michael@0: case 0: michael@0: if(*src & 0x80 && *src != (unsigned char)0xa0) michael@0: { michael@0: mData = JIS0208_INDEX[*src & 0x7F]; michael@0: if(mData != 0xFFFD ) michael@0: { michael@0: mState = 1; // two byte JIS0208 michael@0: } else { michael@0: if( 0x8e == *src) { michael@0: // JIS 0201 michael@0: mState = 2; // JIS0201 michael@0: } else if(0x8f == *src) { michael@0: // JIS 0212 michael@0: mState = 3; // JIS0212 michael@0: } else { michael@0: // others michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = 0xFFFD; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: } michael@0: } else { michael@0: // ASCII michael@0: *dest++ = (char16_t) *src; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: michael@0: case 1: // Index to table michael@0: { michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = 0xFFFD; michael@0: // if the first byte is valid for EUC-JP but the second michael@0: // is not while being a valid US-ASCII, save it michael@0: // instead of eating it up ! michael@0: if ( (uint8_t)*src < (uint8_t)0x7f ) michael@0: --src; michael@0: } else { michael@0: *dest++ = gJapaneseMap[mData+off]; michael@0: } michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: michael@0: case 2: // JIS 0201 michael@0: { michael@0: if((0xA1 <= *src) && (*src <= 0xDF)) { michael@0: *dest++ = (0xFF61-0x00A1) + *src; michael@0: } else { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = 0xFFFD; michael@0: // if 0x8e is not followed by a valid JIS X 0201 byte michael@0: // but by a valid US-ASCII, save it instead of eating it up. michael@0: if ( (uint8_t)*src < (uint8_t)0x7f ) michael@0: --src; michael@0: } michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: michael@0: case 3: // JIS 0212 michael@0: { michael@0: if (IN_GR_RANGE(*src)) michael@0: { michael@0: mData = JIS0212_INDEX[*src & 0x7F]; michael@0: if(mData != 0xFFFD ) michael@0: { michael@0: mState = 4; michael@0: } else { michael@0: mState = 5; // error michael@0: } michael@0: } else { michael@0: // First "JIS 0212" byte is not in the valid GR range: save it michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = 0xFFFD; michael@0: --src; michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: } michael@0: break; michael@0: case 4: michael@0: { michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF != off) { michael@0: *dest++ = gJapaneseMap[mData+off]; michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: break; michael@0: } michael@0: // else fall through to error handler michael@0: } michael@0: case 5: // two bytes undefined michael@0: { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error_invalidchar; michael@0: *dest++ = 0xFFFD; michael@0: // Undefined JIS 0212 two byte sequence. If the second byte is in michael@0: // the valid range for a two byte sequence (0xa1 - 0xfe) consume michael@0: // both bytes. Otherwise resynchronize on the second byte. michael@0: if (!IN_GR_RANGE(*src)) michael@0: --src; michael@0: mState = 0; michael@0: if(dest >= destEnd) michael@0: goto error1; michael@0: } michael@0: break; michael@0: } michael@0: src++; michael@0: } michael@0: *aDestLen = dest - aDest; michael@0: return NS_OK; michael@0: error_invalidchar: michael@0: *aDestLen = dest - aDest; michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_ERROR_ILLEGAL_INPUT; michael@0: error1: michael@0: *aDestLen = dest - aDest; michael@0: src++; michael@0: if ((mState == 0) && (src == srcEnd)) { michael@0: return NS_OK; michael@0: } michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_OK_UDEC_MOREOUTPUT; michael@0: } michael@0: michael@0: michael@0: michael@0: NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert( michael@0: const char * aSrc, int32_t * aSrcLen, michael@0: char16_t * aDest, int32_t * aDestLen) michael@0: { michael@0: static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID); michael@0: michael@0: static const uint16_t fbIdx[128] = michael@0: { michael@0: /* 0x8X */ michael@0: 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, michael@0: 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, michael@0: /* 0x9X */ michael@0: 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, michael@0: 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, michael@0: /* 0xAX */ michael@0: 0xFFFD, 0, 94, 94* 2, 94* 3, 94* 4, 94* 5, 94* 6, michael@0: 94* 7, 94* 8 , 94* 9, 94*10, 94*11, 94*12, 94*13, 94*14, michael@0: /* 0xBX */ michael@0: 94*15, 94*16, 94*17, 94*18, 94*19, 94*20, 94*21, 94*22, michael@0: 94*23, 94*24, 94*25, 94*26, 94*27, 94*28, 94*29, 94*30, michael@0: /* 0xCX */ michael@0: 94*31, 94*32, 94*33, 94*34, 94*35, 94*36, 94*37, 94*38, michael@0: 94*39, 94*40, 94*41, 94*42, 94*43, 94*44, 94*45, 94*46, michael@0: /* 0xDX */ michael@0: 94*47, 94*48, 94*49, 94*50, 94*51, 94*52, 94*53, 94*54, michael@0: 94*55, 94*56, 94*57, 94*58, 94*59, 94*60, 94*61, 94*62, michael@0: /* 0xEX */ michael@0: 94*63, 94*64, 94*65, 94*66, 94*67, 94*68, 94*69, 94*70, michael@0: 94*71, 94*72, 94*73, 94*74, 94*75, 94*76, 94*77, 94*78, michael@0: /* 0xFX */ michael@0: 94*79, 94*80, 94*81, 94*82, 94*83, 94*84, 94*85, 94*86, michael@0: 94*87, 94*88, 94*89, 94*90, 94*91, 94*92, 94*93, 0xFFFD, michael@0: }; michael@0: static const uint8_t sbIdx[256] = michael@0: { michael@0: /* 0x0X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x1X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x2X */ michael@0: 0xFF, 0, 1, 2, 3, 4, 5, 6, michael@0: 7, 8 , 9, 10, 11, 12, 13, 14, michael@0: /* 0x3X */ michael@0: 15, 16, 17, 18, 19, 20, 21, 22, michael@0: 23, 24, 25, 26, 27, 28, 29, 30, michael@0: /* 0x4X */ michael@0: 31, 32, 33, 34, 35, 36, 37, 38, michael@0: 39, 40, 41, 42, 43, 44, 45, 46, michael@0: /* 0x5X */ michael@0: 47, 48, 49, 50, 51, 52, 53, 54, michael@0: 55, 56, 57, 58, 59, 60, 61, 62, michael@0: /* 0x6X */ michael@0: 63, 64, 65, 66, 67, 68, 69, 70, michael@0: 71, 72, 73, 74, 75, 76, 77, 78, michael@0: /* 0x7X */ michael@0: 79, 80, 81, 82, 83, 84, 85, 86, michael@0: 87, 88, 89, 90, 91, 92, 93, 0xFF, michael@0: /* 0x8X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0x9X */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xAX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xBX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xCX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xDX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xEX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: /* 0xFX */ michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, michael@0: }; michael@0: michael@0: const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen; michael@0: const unsigned char* src =(unsigned char*) aSrc; michael@0: char16_t* destEnd = aDest + *aDestLen; michael@0: char16_t* dest = aDest; michael@0: while((src < srcEnd)) michael@0: { michael@0: michael@0: switch(mState) michael@0: { michael@0: case mState_ASCII: michael@0: if(0x1b == *src) michael@0: { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC: michael@0: if( '(' == *src) { michael@0: mState = mState_ESC_28; michael@0: } else if ('$' == *src) { michael@0: mState = mState_ESC_24; michael@0: } else if ('.' == *src) { // for ISO-2022-JP-2 michael@0: mState = mState_ESC_2e; michael@0: } else if ('N' == *src) { // for ISO-2022-JP-2 michael@0: mState = mState_ESC_4e; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 2)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: mState = mLastLegalState; michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC_28: // ESC ( michael@0: if( 'B' == *src) { michael@0: mState = mState_ASCII; michael@0: if (mRunLength == 0) { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = 0xFFFD; michael@0: } michael@0: mRunLength = 0; michael@0: } else if ('J' == *src) { michael@0: mState = mState_JISX0201_1976Roman; michael@0: if (mRunLength == 0 && mLastLegalState != mState_ASCII) { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = 0xFFFD; michael@0: } michael@0: mRunLength = 0; michael@0: } else if ('I' == *src) { michael@0: mState = mState_JISX0201_1976Kana; michael@0: mRunLength = 0; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 3)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: *dest++ = (char16_t) '('; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: mState = mLastLegalState; michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC_24: // ESC $ michael@0: if( '@' == *src) { michael@0: mState = mState_JISX0208_1978; michael@0: mRunLength = 0; michael@0: } else if ('A' == *src) { michael@0: mState = mState_GB2312_1980; michael@0: mRunLength = 0; michael@0: } else if ('B' == *src) { michael@0: mState = mState_JISX0208_1983; michael@0: mRunLength = 0; michael@0: } else if ('(' == *src) { michael@0: mState = mState_ESC_24_28; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 3)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: *dest++ = (char16_t) '$'; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: mState = mLastLegalState; michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC_24_28: // ESC $ ( michael@0: if( 'C' == *src) { michael@0: mState = mState_KSC5601_1987; michael@0: mRunLength = 0; michael@0: } else if ('D' == *src) { michael@0: mState = mState_JISX0212_1990; michael@0: mRunLength = 0; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 4)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: *dest++ = (char16_t) '$'; michael@0: *dest++ = (char16_t) '('; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: mState = mLastLegalState; michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0201_1976Roman: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: ++mRunLength; michael@0: } else { michael@0: // XXX We need to decide how to handle \ and ~ here michael@0: // we may need a if statement here for '\' and '~' michael@0: // to map them to Yen and Overbar michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = (char16_t) *src; michael@0: ++mRunLength; michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0201_1976Kana: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: if((0x21 <= *src) && (*src <= 0x5F)) { michael@0: *dest++ = (0xFF61-0x0021) + *src; michael@0: } else { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } michael@0: ++mRunLength; michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0208_1978: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mData = JIS0208_INDEX[*src & 0x7F]; michael@0: if (0xFFFD == mData) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mState = mState_JISX0208_1978_2ndbyte; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_GB2312_1980: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mData = fbIdx[*src & 0x7F]; michael@0: if (0xFFFD == mData) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mState = mState_GB2312_1980_2ndbyte; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0208_1983: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mData = JIS0208_INDEX[*src & 0x7F]; michael@0: if (0xFFFD == mData) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mState = mState_JISX0208_1983_2ndbyte; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_KSC5601_1987: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mData = fbIdx[*src & 0x7F]; michael@0: if (0xFFFD == mData) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mState = mState_KSC5601_1987_2ndbyte; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0212_1990: michael@0: if(0x1b == *src) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ESC; michael@0: } else if(*src & 0x80) { michael@0: mLastLegalState = mState; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mData = JIS0212_INDEX[*src & 0x7F]; michael@0: if (0xFFFD == mData) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: mState = mState_ERROR; michael@0: } else { michael@0: mState = mState_JISX0212_1990_2ndbyte; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0208_1978_2ndbyte: michael@0: { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: // XXX We need to map from JIS X 0208 1983 to 1987 michael@0: // in the next line before pass to *dest++ michael@0: *dest++ = gJapaneseMap[mData+off]; michael@0: } michael@0: ++mRunLength; michael@0: mState = mState_JISX0208_1978; michael@0: } michael@0: break; michael@0: michael@0: case mState_GB2312_1980_2ndbyte: michael@0: { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: if (!mGB2312Decoder) { michael@0: // creating a delegate converter (GB2312) michael@0: nsresult rv; michael@0: nsCOMPtr ccm = michael@0: do_GetService(kCharsetConverterManagerCID, &rv); michael@0: if (NS_SUCCEEDED(rv)) { michael@0: rv = ccm->GetUnicodeDecoderRaw("GB2312", &mGB2312Decoder); michael@0: } michael@0: } michael@0: if (!mGB2312Decoder) {// failed creating a delegate converter michael@0: goto error2; michael@0: } else { michael@0: unsigned char gb[2]; michael@0: char16_t uni; michael@0: int32_t gbLen = 2, uniLen = 1; michael@0: // ((mData/94)+0x21) is the original 1st byte. michael@0: // *src is the present 2nd byte. michael@0: // Put 2 bytes (one character) to gb[] with GB2312 encoding. michael@0: gb[0] = ((mData / 94) + 0x21) | 0x80; michael@0: gb[1] = *src | 0x80; michael@0: // Convert GB2312 to unicode. michael@0: mGB2312Decoder->Convert((const char *)gb, &gbLen, michael@0: &uni, &uniLen); michael@0: *dest++ = uni; michael@0: } michael@0: } michael@0: ++mRunLength; michael@0: mState = mState_GB2312_1980; michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0208_1983_2ndbyte: michael@0: { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = gJapaneseMap[mData+off]; michael@0: } michael@0: ++mRunLength; michael@0: mState = mState_JISX0208_1983; michael@0: } michael@0: break; michael@0: michael@0: case mState_KSC5601_1987_2ndbyte: michael@0: { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: uint8_t off = sbIdx[*src]; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: if (!mEUCKRDecoder) { michael@0: // creating a delegate converter (EUC-KR) michael@0: nsresult rv; michael@0: nsCOMPtr ccm = michael@0: do_GetService(kCharsetConverterManagerCID, &rv); michael@0: if (NS_SUCCEEDED(rv)) { michael@0: rv = ccm->GetUnicodeDecoderRaw("EUC-KR", &mEUCKRDecoder); michael@0: } michael@0: } michael@0: if (!mEUCKRDecoder) {// failed creating a delegate converter michael@0: goto error2; michael@0: } else { michael@0: unsigned char ksc[2]; michael@0: char16_t uni; michael@0: int32_t kscLen = 2, uniLen = 1; michael@0: // ((mData/94)+0x21) is the original 1st byte. michael@0: // *src is the present 2nd byte. michael@0: // Put 2 bytes (one character) to ksc[] with EUC-KR encoding. michael@0: ksc[0] = ((mData / 94) + 0x21) | 0x80; michael@0: ksc[1] = *src | 0x80; michael@0: // Convert EUC-KR to unicode. michael@0: mEUCKRDecoder->Convert((const char *)ksc, &kscLen, michael@0: &uni, &uniLen); michael@0: *dest++ = uni; michael@0: } michael@0: } michael@0: ++mRunLength; michael@0: mState = mState_KSC5601_1987; michael@0: } michael@0: break; michael@0: michael@0: case mState_JISX0212_1990_2ndbyte: michael@0: { michael@0: uint8_t off = sbIdx[*src]; michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: if(0xFF == off) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = gJapaneseMap[mData+off]; michael@0: } michael@0: ++mRunLength; michael@0: mState = mState_JISX0212_1990; michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC_2e: // ESC . michael@0: // "ESC ." will designate 96 character set to G2. michael@0: mState = mLastLegalState; michael@0: if( 'A' == *src) { michael@0: G2charset = G2_ISO88591; michael@0: } else if ('F' == *src) { michael@0: G2charset = G2_ISO88597; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 3)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: *dest++ = (char16_t) '.'; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_ESC_4e: // ESC N michael@0: // "ESC N" is the SS2 sequence, that invoke a G2 designated michael@0: // character set. Since SS2 is effective only for next one michael@0: // character, mState should be returned to the last status. michael@0: mState = mLastLegalState; michael@0: if((0x20 <= *src) && (*src <= 0x7F)) { michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: if (G2_ISO88591 == G2charset) { michael@0: *dest++ = *src | 0x80; michael@0: } else if (G2_ISO88597 == G2charset) { michael@0: if (!mISO88597Decoder) { michael@0: // creating a delegate converter (ISO-8859-7) michael@0: nsresult rv; michael@0: nsCOMPtr ccm = michael@0: do_GetService(kCharsetConverterManagerCID, &rv); michael@0: if (NS_SUCCEEDED(rv)) { michael@0: rv = ccm->GetUnicodeDecoderRaw("ISO-8859-7", &mISO88597Decoder); michael@0: } michael@0: } michael@0: if (!mISO88597Decoder) {// failed creating a delegate converter michael@0: goto error2; michael@0: } else { michael@0: // Put one character with ISO-8859-7 encoding. michael@0: unsigned char gr = *src | 0x80; michael@0: char16_t uni; michael@0: int32_t grLen = 1, uniLen = 1; michael@0: // Convert ISO-8859-7 to unicode. michael@0: mISO88597Decoder->Convert((const char *)&gr, &grLen, michael@0: &uni, &uniLen); michael@0: *dest++ = uni; michael@0: } michael@0: } else {// G2charset is G2_unknown (not designated yet) michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } michael@0: ++mRunLength; michael@0: } else { michael@0: if (CHECK_OVERRUN(dest, destEnd, 3)) michael@0: goto error1; michael@0: *dest++ = (char16_t) 0x1b; michael@0: *dest++ = (char16_t) 'N'; michael@0: if (0x80 & *src) { michael@0: if (mErrBehavior == kOnError_Signal) michael@0: goto error3; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: } else { michael@0: *dest++ = (char16_t) *src; michael@0: } michael@0: } michael@0: break; michael@0: michael@0: case mState_ERROR: michael@0: mState = mLastLegalState; michael@0: if (mErrBehavior == kOnError_Signal) { michael@0: mRunLength = 0; michael@0: goto error3; michael@0: } michael@0: if (CHECK_OVERRUN(dest, destEnd, 1)) michael@0: goto error1; michael@0: *dest++ = UNICODE_REPLACEMENT_CHARACTER; michael@0: ++mRunLength; michael@0: break; michael@0: michael@0: } // switch michael@0: src++; michael@0: } michael@0: *aDestLen = dest - aDest; michael@0: return NS_OK; michael@0: error1: michael@0: *aDestLen = dest - aDest; michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_OK_UDEC_MOREOUTPUT; michael@0: error2: michael@0: *aDestLen = dest - aDest; michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_ERROR_UNEXPECTED; michael@0: error3: michael@0: *aDestLen = dest - aDest; michael@0: *aSrcLen = src - (const unsigned char*)aSrc; michael@0: return NS_ERROR_ILLEGAL_INPUT; michael@0: }