intl/uconv/ucvcn/nsHZToUnicode.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 /**
michael@0 6 * A character set converter from HZ to Unicode.
michael@0 7 *
michael@0 8 *
michael@0 9 * @created 08/Sept/1999
michael@0 10 * @author Yueheng Xu, Yueheng.Xu@intel.com
michael@0 11 *
michael@0 12 * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ
michael@0 13 * encoded Chinese chars, as it is defined in RFC1843 available at
michael@0 14 * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
michael@0 15 * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
michael@0 16 *
michael@0 17 * Earlier versions of the converter said:
michael@0 18 * "In an effort to match the similar extended capability of Microsoft
michael@0 19 * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
michael@0 20 * mixed in a HZ string.
michael@0 21 * But this should not be a recommended practice for HTML authors."
michael@0 22 * However, testing in current versions of IE shows that it only accepts
michael@0 23 * 8-bit characters when the converter is in GB state, and when in ASCII
michael@0 24 * state each single 8-bit character is converted to U+FFFD
michael@0 25 *
michael@0 26 * The conversion priority is as follows: first convert 8-bit GB code; then,
michael@0 27 * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
michael@0 28 * state ( default to ASCII state ) of the string, each 7-bit char is converted as an
michael@0 29 * ASCII, or two 7-bit chars are converted into a Chinese character.
michael@0 30 */
michael@0 31
michael@0 32
michael@0 33
michael@0 34 #include "nsHZToUnicode.h"
michael@0 35 #include "gbku.h"
michael@0 36 #include "mozilla/Telemetry.h"
michael@0 37
michael@0 38 //----------------------------------------------------------------------
michael@0 39 // Class nsHZToUnicode [implementation]
michael@0 40
michael@0 41 //----------------------------------------------------------------------
michael@0 42 // Subclassing of nsTablesDecoderSupport class [implementation]
michael@0 43
// Encoding modes stored in mHZState.
#define HZ_STATE_GB    1
#define HZ_STATE_ASCII 2

// Flag bit OR-ed into mHZState while the first byte of a two-byte sequence
// has been read and its second byte is still outstanding.
#define HZ_STATE_ODD_BYTE_FLAG 0x80

// Bytes making up the HZ escape sequences '~{', '~}' and '~~'.
#define HZLEAD1 '~'
#define HZLEAD2 '{'
#define HZLEAD3 '}'

// Non-zero when the odd-byte flag is set in mHZState.
#define HZ_ODD_BYTE_STATE (mHZState & HZ_STATE_ODD_BYTE_FLAG)
// mHZState with the odd-byte flag masked off, i.e. the encoding mode alone.
#define HZ_ENCODING_STATE (mHZState & ~HZ_STATE_ODD_BYTE_FLAG)
michael@0 52
michael@0 53 using namespace mozilla;
michael@0 54
michael@0 55 nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
michael@0 56 {
michael@0 57 mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
michael@0 58 mRunLength = 0;
michael@0 59 mOddByte = 0;
michael@0 60 Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true);
michael@0 61 }
michael@0 62
michael@0 63 //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
michael@0 64 NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
michael@0 65 const char* aSrc,
michael@0 66 int32_t * aSrcLength,
michael@0 67 char16_t *aDest,
michael@0 68 int32_t * aDestLength)
michael@0 69 {
michael@0 70 int32_t i=0;
michael@0 71 int32_t iSrcLength = *aSrcLength;
michael@0 72 int32_t iDestlen = 0;
michael@0 73 *aSrcLength=0;
michael@0 74 nsresult res = NS_OK;
michael@0 75 char oddByte = mOddByte;
michael@0 76
michael@0 77 for (i=0; i<iSrcLength; i++) {
michael@0 78 if (iDestlen >= (*aDestLength)) {
michael@0 79 res = NS_OK_UDEC_MOREOUTPUT;
michael@0 80 break;
michael@0 81 }
michael@0 82
michael@0 83 char srcByte = *aSrc++;
michael@0 84 (*aSrcLength)++;
michael@0 85
michael@0 86 if (!HZ_ODD_BYTE_STATE) {
michael@0 87 if (srcByte == HZLEAD1 ||
michael@0 88 (HZ_ENCODING_STATE == HZ_STATE_GB &&
michael@0 89 (UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
michael@0 90 UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
michael@0 91 oddByte = srcByte;
michael@0 92 mHZState |= HZ_STATE_ODD_BYTE_FLAG;
michael@0 93 } else {
michael@0 94 *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
michael@0 95 CAST_CHAR_TO_UNICHAR(srcByte);
michael@0 96 iDestlen++;
michael@0 97 }
michael@0 98 } else {
michael@0 99 if (oddByte & 0x80) {
michael@0 100 // Accept legal 8-bit GB 2312-80 sequences in GB mode only
michael@0 101 NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
michael@0 102 "Invalid lead byte in ASCII mode");
michael@0 103 *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
michael@0 104 UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
michael@0 105 mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
michael@0 106 mRunLength++;
michael@0 107 iDestlen++;
michael@0 108 // otherwise, it is a 7-bit byte
michael@0 109 // The source will be an ASCII or a 7-bit HZ code depending on oddByte
michael@0 110 } else if (oddByte == HZLEAD1) { // if it is lead by '~'
michael@0 111 switch (srcByte) {
michael@0 112 case HZLEAD2:
michael@0 113 // we got a '~{'
michael@0 114 // we are switching to HZ state
michael@0 115 mHZState = HZ_STATE_GB;
michael@0 116 mRunLength = 0;
michael@0 117 break;
michael@0 118
michael@0 119 case HZLEAD3:
michael@0 120 // we got a '~}'
michael@0 121 // we are switching to ASCII state
michael@0 122 mHZState = HZ_STATE_ASCII;
michael@0 123 if (mRunLength == 0) {
michael@0 124 *aDest++ = UCS2_NO_MAPPING;
michael@0 125 iDestlen++;
michael@0 126 }
michael@0 127 mRunLength = 0;
michael@0 128 break;
michael@0 129
michael@0 130 case HZLEAD1:
michael@0 131 // we got a '~~', process like an ASCII, but no state change
michael@0 132 *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
michael@0 133 iDestlen++;
michael@0 134 mRunLength++;
michael@0 135 break;
michael@0 136
michael@0 137 default:
michael@0 138 // Undefined ESC sequence '~X': treat as an error if X is a
michael@0 139 // printable character or we are in ASCII mode, and resynchronize
michael@0 140 // on the second character.
michael@0 141 //
michael@0 142 // N.B. For compatibility with other implementations, we treat '~\n'
michael@0 143 // as an illegal sequence even though RFC1843 permits it, and for
michael@0 144 // the same reason we pass through control characters including '\n'
michael@0 145 // and ' ' even in GB mode.
michael@0 146 if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
michael@0 147 *aDest++ = UCS2_NO_MAPPING;
michael@0 148 iDestlen++;
michael@0 149 }
michael@0 150 aSrc--;
michael@0 151 (*aSrcLength)--;
michael@0 152 i--;
michael@0 153 break;
michael@0 154 }
michael@0 155 } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
michael@0 156 *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
michael@0 157 UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
michael@0 158 mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
michael@0 159 UCS2_NO_MAPPING;
michael@0 160 mRunLength++;
michael@0 161 iDestlen++;
michael@0 162 } else {
michael@0 163 NS_NOTREACHED("2-byte sequence that we don't know how to handle");
michael@0 164 *aDest++ = UCS2_NO_MAPPING;
michael@0 165 iDestlen++;
michael@0 166 }
michael@0 167 oddByte = 0;
michael@0 168 mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
michael@0 169 }
michael@0 170 } // for loop
michael@0 171 mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
michael@0 172 *aDestLength = iDestlen;
michael@0 173 return res;
michael@0 174 }
michael@0 175
michael@0 176

mercurial