Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 /**
6 * A character set converter from HZ to Unicode.
7 *
8 *
9 * @created 08/Sept/1999
10 * @author Yueheng Xu, Yueheng.Xu@intel.com
11 *
12 * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ
13 * encoded Chinese chars, as defined in RFC1843 available at
14 * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
15 * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
16 *
17 * Earlier versions of the converter said:
18 * "In an effort to match the similar extended capability of Microsoft
19 * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
20 * mixed in a HZ string.
21 * But this should not be a recommended practice for HTML authors."
22 * However, testing in current versions of IE shows that it only accepts
23 * 8-bit characters when the converter is in GB state, and when in ASCII
24 * state each single 8-bit character is converted to U+FFFD
25 *
26 * The conversion priority is as follows: first convert 8-bit GB code; then,
27 * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
28 * state ( default to ASCII state ) of the string, each 7-bit char is converted as an
29 * ASCII, or two 7-bit chars are converted into a Chinese character.
30 */
34 #include "nsHZToUnicode.h"
35 #include "gbku.h"
36 #include "mozilla/Telemetry.h"
38 //----------------------------------------------------------------------
39 // Class nsHZToUnicode [implementation]
41 //----------------------------------------------------------------------
42 // Subclassing of nsTablesDecoderSupport class [implementation]
44 #define HZ_STATE_GB 1
45 #define HZ_STATE_ASCII 2
46 #define HZ_STATE_ODD_BYTE_FLAG 0x80
47 #define HZLEAD1 '~'
48 #define HZLEAD2 '{'
49 #define HZLEAD3 '}'
50 #define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
51 #define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
53 using namespace mozilla;
55 nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
56 {
57 mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
58 mRunLength = 0;
59 mOddByte = 0;
60 Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true);
61 }
63 //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
64 NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
65 const char* aSrc,
66 int32_t * aSrcLength,
67 char16_t *aDest,
68 int32_t * aDestLength)
69 {
70 int32_t i=0;
71 int32_t iSrcLength = *aSrcLength;
72 int32_t iDestlen = 0;
73 *aSrcLength=0;
74 nsresult res = NS_OK;
75 char oddByte = mOddByte;
77 for (i=0; i<iSrcLength; i++) {
78 if (iDestlen >= (*aDestLength)) {
79 res = NS_OK_UDEC_MOREOUTPUT;
80 break;
81 }
83 char srcByte = *aSrc++;
84 (*aSrcLength)++;
86 if (!HZ_ODD_BYTE_STATE) {
87 if (srcByte == HZLEAD1 ||
88 (HZ_ENCODING_STATE == HZ_STATE_GB &&
89 (UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
90 UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
91 oddByte = srcByte;
92 mHZState |= HZ_STATE_ODD_BYTE_FLAG;
93 } else {
94 *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
95 CAST_CHAR_TO_UNICHAR(srcByte);
96 iDestlen++;
97 }
98 } else {
99 if (oddByte & 0x80) {
100 // Accept legal 8-bit GB 2312-80 sequences in GB mode only
101 NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
102 "Invalid lead byte in ASCII mode");
103 *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
104 UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
105 mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
106 mRunLength++;
107 iDestlen++;
108 // otherwise, it is a 7-bit byte
109 // The source will be an ASCII or a 7-bit HZ code depending on oddByte
110 } else if (oddByte == HZLEAD1) { // if it is lead by '~'
111 switch (srcByte) {
112 case HZLEAD2:
113 // we got a '~{'
114 // we are switching to HZ state
115 mHZState = HZ_STATE_GB;
116 mRunLength = 0;
117 break;
119 case HZLEAD3:
120 // we got a '~}'
121 // we are switching to ASCII state
122 mHZState = HZ_STATE_ASCII;
123 if (mRunLength == 0) {
124 *aDest++ = UCS2_NO_MAPPING;
125 iDestlen++;
126 }
127 mRunLength = 0;
128 break;
130 case HZLEAD1:
131 // we got a '~~', process like an ASCII, but no state change
132 *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
133 iDestlen++;
134 mRunLength++;
135 break;
137 default:
138 // Undefined ESC sequence '~X': treat as an error if X is a
139 // printable character or we are in ASCII mode, and resynchronize
140 // on the second character.
141 //
142 // N.B. For compatibility with other implementations, we treat '~\n'
143 // as an illegal sequence even though RFC1843 permits it, and for
144 // the same reason we pass through control characters including '\n'
145 // and ' ' even in GB mode.
146 if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
147 *aDest++ = UCS2_NO_MAPPING;
148 iDestlen++;
149 }
150 aSrc--;
151 (*aSrcLength)--;
152 i--;
153 break;
154 }
155 } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
156 *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
157 UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
158 mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
159 UCS2_NO_MAPPING;
160 mRunLength++;
161 iDestlen++;
162 } else {
163 NS_NOTREACHED("2-byte sequence that we don't know how to handle");
164 *aDest++ = UCS2_NO_MAPPING;
165 iDestlen++;
166 }
167 oddByte = 0;
168 mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
169 }
170 } // for loop
171 mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
172 *aDestLength = iDestlen;
173 return res;
174 }