|
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 /** |
|
6 * A character set converter from HZ to Unicode. |
|
7 * |
|
8 * |
|
9 * @created 08/Sept/1999 |
|
10 * @author Yueheng Xu, Yueheng.Xu@intel.com |
|
11 * |
|
12 * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ |
|
13 * encoded Chinese chars, as defined in RFC1843, available at |
|
14 * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html |
|
15 * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html. |
|
16 * |
|
17 * Earlier versions of the converter said: |
|
18 * "In an effort to match the similar extended capability of Microsoft |
|
19 * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars |
|
20 * mixed in a HZ string. |
|
21 * But this should not be a recommended practice for HTML authors." |
|
22 * However, testing in current versions of IE shows that it only accepts |
|
23 * 8-bit characters when the converter is in GB state, and when in ASCII |
|
24 * state each single 8-bit character is converted to U+FFFD |
|
25 * |
|
26 * The conversion priorities are as follows: first convert 8-bit GB code; then, |
|
27 * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current |
|
28 * state ( default to ASCII state ) of the string, each 7-bit char is converted as an |
|
29 * ASCII, or two 7-bit chars are converted into a Chinese character. |
|
30 */ |
|
31 |
|
32 |
|
33 |
|
34 #include "nsHZToUnicode.h" |
|
35 #include "gbku.h" |
|
36 #include "mozilla/Telemetry.h" |
|
37 |
|
38 //---------------------------------------------------------------------- |
|
39 // Class nsHZToUnicode [implementation] |
|
40 |
|
41 //---------------------------------------------------------------------- |
|
42 // Subclassing of nsBufferDecoderSupport class [implementation] |
|
43 |
|
// Encoding modes, stored in the bits of mHZState below the odd-byte flag.
#define HZ_STATE_GB    0x01
#define HZ_STATE_ASCII 0x02

// Bit of mHZState that is set while the first byte of a two-byte sequence
// (or a '~' escape lead) has been read and its partner is still awaited.
#define HZ_STATE_ODD_BYTE_FLAG 0x80

// Bytes that make up the escape sequences '~{', '~}' and '~~'.
#define HZLEAD1 '~'
#define HZLEAD2 '{'
#define HZLEAD3 '}'

// Views on mHZState: the pending-byte flag alone, and the encoding mode with
// that flag masked out. These must stay macros because they read the member
// mHZState of whichever object they are expanded in.
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
|
52 |
|
53 using namespace mozilla; |
|
54 |
|
55 nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1) |
|
56 { |
|
57 mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state |
|
58 mRunLength = 0; |
|
59 mOddByte = 0; |
|
60 Telemetry::Accumulate(Telemetry::DECODER_INSTANTIATED_HZ, true); |
|
61 } |
|
62 |
|
63 // Overrides the ConvertNoBuff() in nsUCvCnSupport.cpp. |
|
64 NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff( |
|
65 const char* aSrc, |
|
66 int32_t * aSrcLength, |
|
67 char16_t *aDest, |
|
68 int32_t * aDestLength) |
|
69 { |
|
70 int32_t i=0; |
|
71 int32_t iSrcLength = *aSrcLength; |
|
72 int32_t iDestlen = 0; |
|
73 *aSrcLength=0; |
|
74 nsresult res = NS_OK; |
|
75 char oddByte = mOddByte; |
|
76 |
|
77 for (i=0; i<iSrcLength; i++) { |
|
78 if (iDestlen >= (*aDestLength)) { |
|
79 res = NS_OK_UDEC_MOREOUTPUT; |
|
80 break; |
|
81 } |
|
82 |
|
83 char srcByte = *aSrc++; |
|
84 (*aSrcLength)++; |
|
85 |
|
86 if (!HZ_ODD_BYTE_STATE) { |
|
87 if (srcByte == HZLEAD1 || |
|
88 (HZ_ENCODING_STATE == HZ_STATE_GB && |
|
89 (UINT8_IN_RANGE(0x21, srcByte, 0x7E) || |
|
90 UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) { |
|
91 oddByte = srcByte; |
|
92 mHZState |= HZ_STATE_ODD_BYTE_FLAG; |
|
93 } else { |
|
94 *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING : |
|
95 CAST_CHAR_TO_UNICHAR(srcByte); |
|
96 iDestlen++; |
|
97 } |
|
98 } else { |
|
99 if (oddByte & 0x80) { |
|
100 // Accept legal 8-bit GB 2312-80 sequences in GB mode only |
|
101 NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB, |
|
102 "Invalid lead byte in ASCII mode"); |
|
103 *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) && |
|
104 UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ? |
|
105 mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING; |
|
106 mRunLength++; |
|
107 iDestlen++; |
|
108 // otherwise, it is a 7-bit byte |
|
109 // The source will be an ASCII or a 7-bit HZ code depending on oddByte |
|
110 } else if (oddByte == HZLEAD1) { // if it is lead by '~' |
|
111 switch (srcByte) { |
|
112 case HZLEAD2: |
|
113 // we got a '~{' |
|
114 // we are switching to HZ state |
|
115 mHZState = HZ_STATE_GB; |
|
116 mRunLength = 0; |
|
117 break; |
|
118 |
|
119 case HZLEAD3: |
|
120 // we got a '~}' |
|
121 // we are switching to ASCII state |
|
122 mHZState = HZ_STATE_ASCII; |
|
123 if (mRunLength == 0) { |
|
124 *aDest++ = UCS2_NO_MAPPING; |
|
125 iDestlen++; |
|
126 } |
|
127 mRunLength = 0; |
|
128 break; |
|
129 |
|
130 case HZLEAD1: |
|
131 // we got a '~~', process like an ASCII, but no state change |
|
132 *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte); |
|
133 iDestlen++; |
|
134 mRunLength++; |
|
135 break; |
|
136 |
|
137 default: |
|
138 // Undefined ESC sequence '~X': treat as an error if X is a |
|
139 // printable character or we are in ASCII mode, and resynchronize |
|
140 // on the second character. |
|
141 // |
|
142 // N.B. For compatibility with other implementations, we treat '~\n' |
|
143 // as an illegal sequence even though RFC1843 permits it, and for |
|
144 // the same reason we pass through control characters including '\n' |
|
145 // and ' ' even in GB mode. |
|
146 if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) { |
|
147 *aDest++ = UCS2_NO_MAPPING; |
|
148 iDestlen++; |
|
149 } |
|
150 aSrc--; |
|
151 (*aSrcLength)--; |
|
152 i--; |
|
153 break; |
|
154 } |
|
155 } else if (HZ_ENCODING_STATE == HZ_STATE_GB) { |
|
156 *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) && |
|
157 UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ? |
|
158 mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) : |
|
159 UCS2_NO_MAPPING; |
|
160 mRunLength++; |
|
161 iDestlen++; |
|
162 } else { |
|
163 NS_NOTREACHED("2-byte sequence that we don't know how to handle"); |
|
164 *aDest++ = UCS2_NO_MAPPING; |
|
165 iDestlen++; |
|
166 } |
|
167 oddByte = 0; |
|
168 mHZState &= ~HZ_STATE_ODD_BYTE_FLAG; |
|
169 } |
|
170 } // for loop |
|
171 mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0; |
|
172 *aDestLength = iDestlen; |
|
173 return res; |
|
174 } |
|
175 |
|
176 |