1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/uconv/ucvcn/nsHZToUnicode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,176 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +/** 1.9 + * A character set converter from HZ to Unicode. 1.10 + * 1.11 + * 1.12 + * @created 08/Sept/1999 1.13 + * @author Yueheng Xu, Yueheng.Xu@intel.com 1.14 + * 1.15 + * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ 1.16 + * encoded Chinese chars, as it is defined in RFC1843 available at 1.17 + * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html 1.18 + * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html. 1.19 + * 1.20 + * Earlier versions of the converter said: 1.21 + * "In an effort to match the similar extended capability of Microsoft 1.22 + * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars 1.23 + * mixed in a HZ string. 1.24 + * But this should not be a recommended practice for HTML authors." 1.25 + * However, testing in current versions of IE shows that it only accepts 1.26 + * 8-bit characters when the converter is in GB state, and when in ASCII 1.27 + * state each single 8-bit character is converted to U+FFFD. 1.28 + * 1.29 + * The conversion priorities are as follows: first convert 8-bit GB code; then, 1.30 + * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current 1.31 + * state (defaulting to ASCII state) of the string, each 7-bit char is converted as an 1.32 + * ASCII, or two 7-bit chars are converted into a Chinese character. 
1.33 + */ 1.34 + 1.35 + 1.36 + 1.37 +#include "nsHZToUnicode.h" 1.38 +#include "gbku.h" 1.39 +#include "mozilla/Telemetry.h" 1.40 + 1.41 +//---------------------------------------------------------------------- 1.42 +// Class nsHZToUnicode [implementation] 1.43 + 1.44 +//---------------------------------------------------------------------- 1.45 +// Subclassing of nsTablesDecoderSupport class [implementation] 1.46 + 1.47 +#define HZ_STATE_GB 1 1.48 +#define HZ_STATE_ASCII 2 1.49 +#define HZ_STATE_ODD_BYTE_FLAG 0x80 1.50 +#define HZLEAD1 '~' 1.51 +#define HZLEAD2 '{' 1.52 +#define HZLEAD3 '}' 1.53 +#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG)) 1.54 +#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG)) 1.55 + 1.56 +using namespace mozilla; 1.57 + 1.58 +nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1) 1.59 +{ 1.60 + mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state 1.61 + mRunLength = 0; 1.62 + mOddByte = 0; 1.63 + Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true); 1.64 +} 1.65 + 1.66 +//Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp. 
1.67 +NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff( 1.68 + const char* aSrc, 1.69 + int32_t * aSrcLength, 1.70 + char16_t *aDest, 1.71 + int32_t * aDestLength) 1.72 +{ 1.73 + int32_t i=0; 1.74 + int32_t iSrcLength = *aSrcLength; 1.75 + int32_t iDestlen = 0; 1.76 + *aSrcLength=0; 1.77 + nsresult res = NS_OK; 1.78 + char oddByte = mOddByte; 1.79 + 1.80 + for (i=0; i<iSrcLength; i++) { 1.81 + if (iDestlen >= (*aDestLength)) { 1.82 + res = NS_OK_UDEC_MOREOUTPUT; 1.83 + break; 1.84 + } 1.85 + 1.86 + char srcByte = *aSrc++; 1.87 + (*aSrcLength)++; 1.88 + 1.89 + if (!HZ_ODD_BYTE_STATE) { 1.90 + if (srcByte == HZLEAD1 || 1.91 + (HZ_ENCODING_STATE == HZ_STATE_GB && 1.92 + (UINT8_IN_RANGE(0x21, srcByte, 0x7E) || 1.93 + UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) { 1.94 + oddByte = srcByte; 1.95 + mHZState |= HZ_STATE_ODD_BYTE_FLAG; 1.96 + } else { 1.97 + *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING : 1.98 + CAST_CHAR_TO_UNICHAR(srcByte); 1.99 + iDestlen++; 1.100 + } 1.101 + } else { 1.102 + if (oddByte & 0x80) { 1.103 + // Accept legal 8-bit GB 2312-80 sequences in GB mode only 1.104 + NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB, 1.105 + "Invalid lead byte in ASCII mode"); 1.106 + *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) && 1.107 + UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ? 
1.108 + mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING; 1.109 + mRunLength++; 1.110 + iDestlen++; 1.111 + // otherwise, it is a 7-bit byte 1.112 + // The source will be an ASCII or a 7-bit HZ code depending on oddByte 1.113 + } else if (oddByte == HZLEAD1) { // if it is lead by '~' 1.114 + switch (srcByte) { 1.115 + case HZLEAD2: 1.116 + // we got a '~{' 1.117 + // we are switching to HZ state 1.118 + mHZState = HZ_STATE_GB; 1.119 + mRunLength = 0; 1.120 + break; 1.121 + 1.122 + case HZLEAD3: 1.123 + // we got a '~}' 1.124 + // we are switching to ASCII state 1.125 + mHZState = HZ_STATE_ASCII; 1.126 + if (mRunLength == 0) { 1.127 + *aDest++ = UCS2_NO_MAPPING; 1.128 + iDestlen++; 1.129 + } 1.130 + mRunLength = 0; 1.131 + break; 1.132 + 1.133 + case HZLEAD1: 1.134 + // we got a '~~', process like an ASCII, but no state change 1.135 + *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte); 1.136 + iDestlen++; 1.137 + mRunLength++; 1.138 + break; 1.139 + 1.140 + default: 1.141 + // Undefined ESC sequence '~X': treat as an error if X is a 1.142 + // printable character or we are in ASCII mode, and resynchronize 1.143 + // on the second character. 1.144 + // 1.145 + // N.B. For compatibility with other implementations, we treat '~\n' 1.146 + // as an illegal sequence even though RFC1843 permits it, and for 1.147 + // the same reason we pass through control characters including '\n' 1.148 + // and ' ' even in GB mode. 1.149 + if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) { 1.150 + *aDest++ = UCS2_NO_MAPPING; 1.151 + iDestlen++; 1.152 + } 1.153 + aSrc--; 1.154 + (*aSrcLength)--; 1.155 + i--; 1.156 + break; 1.157 + } 1.158 + } else if (HZ_ENCODING_STATE == HZ_STATE_GB) { 1.159 + *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) && 1.160 + UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ? 
1.161 + mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) : 1.162 + UCS2_NO_MAPPING; 1.163 + mRunLength++; 1.164 + iDestlen++; 1.165 + } else { 1.166 + NS_NOTREACHED("2-byte sequence that we don't know how to handle"); 1.167 + *aDest++ = UCS2_NO_MAPPING; 1.168 + iDestlen++; 1.169 + } 1.170 + oddByte = 0; 1.171 + mHZState &= ~HZ_STATE_ODD_BYTE_FLAG; 1.172 + } 1.173 + } // for loop 1.174 + mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0; 1.175 + *aDestLength = iDestlen; 1.176 + return res; 1.177 +} 1.178 + 1.179 +