/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
 * A character set converter from HZ to Unicode.
 *
 *
 * @created         08/Sept/1999
 * @author  Yueheng Xu, Yueheng.Xu@intel.com
 *
 * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ
 *       encoded Chinese chars, as defined in RFC1843 available at
 *       http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
 *       and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
 *
 *       Earlier versions of the converter said:
 *        "In an effort to match the similar extended capability of Microsoft
 *         Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
 *         mixed in a HZ string.
 *         But this should not be a recommended practice for HTML authors."
 *       However, testing in current versions of IE shows that it only accepts
 *       8-bit characters when the converter is in GB state, and when in ASCII
 *       state each single 8-bit character is converted to U+FFFD.
 *
 *       The conversion priorities are as follows: first convert 8-bit GB code; then,
 *       consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
 *       state ( default to ASCII state ) of the string, each 7-bit char is converted as an
 *       ASCII, or two 7-bit chars are converted into a Chinese character.
michael@0: */ michael@0: michael@0: michael@0: michael@0: #include "nsHZToUnicode.h" michael@0: #include "gbku.h" michael@0: #include "mozilla/Telemetry.h" michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // Class nsHZToUnicode [implementation] michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // Subclassing of nsTablesDecoderSupport class [implementation] michael@0: michael@0: #define HZ_STATE_GB 1 michael@0: #define HZ_STATE_ASCII 2 michael@0: #define HZ_STATE_ODD_BYTE_FLAG 0x80 michael@0: #define HZLEAD1 '~' michael@0: #define HZLEAD2 '{' michael@0: #define HZLEAD3 '}' michael@0: #define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG)) michael@0: #define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG)) michael@0: michael@0: using namespace mozilla; michael@0: michael@0: nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1) michael@0: { michael@0: mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state michael@0: mRunLength = 0; michael@0: mOddByte = 0; michael@0: Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true); michael@0: } michael@0: michael@0: //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp. 
michael@0: NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff( michael@0: const char* aSrc, michael@0: int32_t * aSrcLength, michael@0: char16_t *aDest, michael@0: int32_t * aDestLength) michael@0: { michael@0: int32_t i=0; michael@0: int32_t iSrcLength = *aSrcLength; michael@0: int32_t iDestlen = 0; michael@0: *aSrcLength=0; michael@0: nsresult res = NS_OK; michael@0: char oddByte = mOddByte; michael@0: michael@0: for (i=0; i= (*aDestLength)) { michael@0: res = NS_OK_UDEC_MOREOUTPUT; michael@0: break; michael@0: } michael@0: michael@0: char srcByte = *aSrc++; michael@0: (*aSrcLength)++; michael@0: michael@0: if (!HZ_ODD_BYTE_STATE) { michael@0: if (srcByte == HZLEAD1 || michael@0: (HZ_ENCODING_STATE == HZ_STATE_GB && michael@0: (UINT8_IN_RANGE(0x21, srcByte, 0x7E) || michael@0: UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) { michael@0: oddByte = srcByte; michael@0: mHZState |= HZ_STATE_ODD_BYTE_FLAG; michael@0: } else { michael@0: *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING : michael@0: CAST_CHAR_TO_UNICHAR(srcByte); michael@0: iDestlen++; michael@0: } michael@0: } else { michael@0: if (oddByte & 0x80) { michael@0: // Accept legal 8-bit GB 2312-80 sequences in GB mode only michael@0: NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB, michael@0: "Invalid lead byte in ASCII mode"); michael@0: *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) && michael@0: UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ? 
michael@0: mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING; michael@0: mRunLength++; michael@0: iDestlen++; michael@0: // otherwise, it is a 7-bit byte michael@0: // The source will be an ASCII or a 7-bit HZ code depending on oddByte michael@0: } else if (oddByte == HZLEAD1) { // if it is lead by '~' michael@0: switch (srcByte) { michael@0: case HZLEAD2: michael@0: // we got a '~{' michael@0: // we are switching to HZ state michael@0: mHZState = HZ_STATE_GB; michael@0: mRunLength = 0; michael@0: break; michael@0: michael@0: case HZLEAD3: michael@0: // we got a '~}' michael@0: // we are switching to ASCII state michael@0: mHZState = HZ_STATE_ASCII; michael@0: if (mRunLength == 0) { michael@0: *aDest++ = UCS2_NO_MAPPING; michael@0: iDestlen++; michael@0: } michael@0: mRunLength = 0; michael@0: break; michael@0: michael@0: case HZLEAD1: michael@0: // we got a '~~', process like an ASCII, but no state change michael@0: *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte); michael@0: iDestlen++; michael@0: mRunLength++; michael@0: break; michael@0: michael@0: default: michael@0: // Undefined ESC sequence '~X': treat as an error if X is a michael@0: // printable character or we are in ASCII mode, and resynchronize michael@0: // on the second character. michael@0: // michael@0: // N.B. For compatibility with other implementations, we treat '~\n' michael@0: // as an illegal sequence even though RFC1843 permits it, and for michael@0: // the same reason we pass through control characters including '\n' michael@0: // and ' ' even in GB mode. michael@0: if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) { michael@0: *aDest++ = UCS2_NO_MAPPING; michael@0: iDestlen++; michael@0: } michael@0: aSrc--; michael@0: (*aSrcLength)--; michael@0: i--; michael@0: break; michael@0: } michael@0: } else if (HZ_ENCODING_STATE == HZ_STATE_GB) { michael@0: *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) && michael@0: UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ? 
michael@0: mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) : michael@0: UCS2_NO_MAPPING; michael@0: mRunLength++; michael@0: iDestlen++; michael@0: } else { michael@0: NS_NOTREACHED("2-byte sequence that we don't know how to handle"); michael@0: *aDest++ = UCS2_NO_MAPPING; michael@0: iDestlen++; michael@0: } michael@0: oddByte = 0; michael@0: mHZState &= ~HZ_STATE_ODD_BYTE_FLAG; michael@0: } michael@0: } // for loop michael@0: mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0; michael@0: *aDestLength = iDestlen; michael@0: return res; michael@0: } michael@0: michael@0: