|
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 /** |
|
6 * A character set converter from GBK to Unicode. |
|
7 * |
|
8 * |
|
9 * @created 07/Sept/1999 |
|
10 * @author Yueheng Xu, Yueheng.Xu@intel.com |
|
11 */ |
|
12 |
|
13 #include "nsGBKToUnicode.h" |
|
14 #include "gbku.h" |
|
15 |
|
16 |
|
17 //------------------------------------------------------------ |
|
18 // nsGBKUnique2BytesToUnicode |
|
19 //------------------------------------------------------------ |
|
20 class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport |
|
21 { |
|
22 public: |
|
23 nsGBKUnique2BytesToUnicode(); |
|
24 virtual ~nsGBKUnique2BytesToUnicode() |
|
25 { } |
|
26 protected: |
|
27 }; |
|
28 |
|
29 static const uint16_t g_utGBKUnique2Bytes[] = { |
|
30 #include "gbkuniq2b.ut" |
|
31 }; |
|
32 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() |
|
33 : nsTableDecoderSupport(u2BytesCharset, nullptr, |
|
34 (uMappingTable*) &g_utGBKUnique2Bytes, 1) |
|
35 { |
|
36 } |
|
37 |
|
38 //------------------------------------------------------------ |
|
39 // nsGB18030Unique2BytesToUnicode |
|
40 //------------------------------------------------------------ |
|
41 class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport |
|
42 { |
|
43 public: |
|
44 nsGB18030Unique2BytesToUnicode(); |
|
45 virtual ~nsGB18030Unique2BytesToUnicode() |
|
46 { } |
|
47 protected: |
|
48 }; |
|
49 |
|
50 static const uint16_t g_utGB18030Unique2Bytes[] = { |
|
51 #include "gb18030uniq2b.ut" |
|
52 }; |
|
53 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() |
|
54 : nsTableDecoderSupport(u2BytesCharset, nullptr, |
|
55 (uMappingTable*) &g_utGB18030Unique2Bytes, 1) |
|
56 { |
|
57 } |
|
58 |
|
59 //------------------------------------------------------------ |
|
60 // nsGB18030Unique4BytesToUnicode |
|
61 //------------------------------------------------------------ |
|
62 class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport |
|
63 { |
|
64 public: |
|
65 nsGB18030Unique4BytesToUnicode(); |
|
66 virtual ~nsGB18030Unique4BytesToUnicode() |
|
67 { } |
|
68 protected: |
|
69 }; |
|
70 |
|
71 static const uint16_t g_utGB18030Unique4Bytes[] = { |
|
72 #include "gb180304bytes.ut" |
|
73 }; |
|
74 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() |
|
75 : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr, |
|
76 (uMappingTable*) &g_utGB18030Unique4Bytes, 1) |
|
77 { |
|
78 } |
|
79 |
|
80 |
|
81 //---------------------------------------------------------------------- |
|
82 // Class nsGBKToUnicode [implementation] |
|
83 |
|
84 //---------------------------------------------------------------------- |
|
85 // Subclassing of nsTablesDecoderSupport class [implementation] |
|
86 |
|
// Byte-range predicates used by the GBK / GB18030 decoders below. Every one
// of them forwards to UINT8_IN_RANGE(low, value, high), which is defined
// outside this file.

// Lead byte of any multi-byte sequence: 0x81..0xFE.
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Lead byte 0x90..0xFE: a four-byte sequence starting here is routed to
// DecodeToSurrogate() instead of the four-byte table decoder.
#define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Trail byte of a two-byte sequence: 0x40..0x7E or 0x80..0xFE.
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
  (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))

// Second byte of a four-byte sequence: 0x30..0x39.
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))

// Third byte of a four-byte sequence: 0x81..0xFE.
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Fourth byte of a four-byte sequence: 0x30..0x39. (The "FORTH" spelling is
// kept because the rest of the file refers to the macro by this name.)
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
|
99 |
|
100 NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc, |
|
101 int32_t * aSrcLength, |
|
102 char16_t *aDest, |
|
103 int32_t * aDestLength) |
|
104 { |
|
105 int32_t i=0; |
|
106 int32_t iSrcLength = (*aSrcLength); |
|
107 int32_t iDestlen = 0; |
|
108 nsresult rv=NS_OK; |
|
109 *aSrcLength = 0; |
|
110 |
|
111 for (i=0;i<iSrcLength;i++) |
|
112 { |
|
113 if ( iDestlen >= (*aDestLength) ) |
|
114 { |
|
115 rv = NS_OK_UDEC_MOREOUTPUT; |
|
116 break; |
|
117 } |
|
118 // The valid range for the 1st byte is [0x81,0xFE] |
|
119 if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc)) |
|
120 { |
|
121 if(i+1 >= iSrcLength) |
|
122 { |
|
123 rv = NS_OK_UDEC_MOREINPUT; |
|
124 break; |
|
125 } |
|
126 // To make sure, the second byte has to be checked as well. |
|
127 // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE] |
|
128 if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1])) |
|
129 { |
|
130 // Valid GBK code |
|
131 *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]); |
|
132 if(UCS2_NO_MAPPING == *aDest) |
|
133 { |
|
134 // We cannot map in the common mapping, let's call the |
|
135 // delegate 2 byte decoder to decode the gbk or gb18030 unique |
|
136 // 2 byte mapping |
|
137 if(! TryExtensionDecoder(aSrc, aDest)) |
|
138 { |
|
139 *aDest = UCS2_NO_MAPPING; |
|
140 } |
|
141 } |
|
142 aSrc += 2; |
|
143 i++; |
|
144 } |
|
145 else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) |
|
146 { |
|
147 // from the first 2 bytes, it looks like a 4 byte GB18030 |
|
148 if(i+3 >= iSrcLength) // make sure we got 4 bytes |
|
149 { |
|
150 rv = NS_OK_UDEC_MOREINPUT; |
|
151 break; |
|
152 } |
|
153 // 4 bytes patten |
|
154 // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39] |
|
155 // preset the |
|
156 |
|
157 if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) && |
|
158 LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) |
|
159 { |
|
160 if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) |
|
161 { |
|
162 // let's call the delegated 4 byte gb18030 converter to convert it |
|
163 if(! Try4BytesDecoder(aSrc, aDest)) |
|
164 *aDest = UCS2_NO_MAPPING; |
|
165 } else { |
|
166 // let's try supplement mapping |
|
167 if ( (iDestlen+1) < (*aDestLength) ) |
|
168 { |
|
169 if(DecodeToSurrogate(aSrc, aDest)) |
|
170 { |
|
171 // surrogte two char16_t |
|
172 iDestlen++; |
|
173 aDest++; |
|
174 } else { |
|
175 *aDest = UCS2_NO_MAPPING; |
|
176 } |
|
177 } else { |
|
178 if (*aDestLength < 2) { |
|
179 NS_ERROR("insufficient space in output buffer"); |
|
180 *aDest = UCS2_NO_MAPPING; |
|
181 } else { |
|
182 rv = NS_OK_UDEC_MOREOUTPUT; |
|
183 break; |
|
184 } |
|
185 } |
|
186 } |
|
187 aSrc += 4; |
|
188 i += 3; |
|
189 } else { |
|
190 *aDest = UCS2_NO_MAPPING; |
|
191 // If the third and fourth bytes are not in the legal ranges for |
|
192 // a four-byte sequnce, resynchronize on the second byte |
|
193 // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE, |
|
194 // 0x30-0x39) |
|
195 aSrc++; |
|
196 } |
|
197 } |
|
198 else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 ) |
|
199 { |
|
200 // stand-alone (not followed by a valid second byte) 0xA0 ! |
|
201 // treat it as valid a la Netscape 4.x |
|
202 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); |
|
203 aSrc++; |
|
204 } else { |
|
205 // Invalid GBK code point (second byte should be 0x40 or higher) |
|
206 *aDest = UCS2_NO_MAPPING; |
|
207 aSrc++; |
|
208 } |
|
209 } else { |
|
210 if(IS_ASCII(*aSrc)) |
|
211 { |
|
212 // The source is an ASCII |
|
213 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); |
|
214 aSrc++; |
|
215 } else { |
|
216 if(IS_GBK_EURO(*aSrc)) { |
|
217 *aDest = UCS2_EURO; |
|
218 } else { |
|
219 *aDest = UCS2_NO_MAPPING; |
|
220 } |
|
221 aSrc++; |
|
222 } |
|
223 } |
|
224 iDestlen++; |
|
225 aDest++; |
|
226 *aSrcLength = i+1; |
|
227 } |
|
228 *aDestLength = iDestlen; |
|
229 return rv; |
|
230 } |
|
231 |
|
232 |
|
233 void nsGBKToUnicode::CreateExtensionDecoder() |
|
234 { |
|
235 mExtensionDecoder = new nsGBKUnique2BytesToUnicode(); |
|
236 } |
|
237 void nsGBKToUnicode::Create4BytesDecoder() |
|
238 { |
|
239 m4BytesDecoder = nullptr; |
|
240 } |
|
241 void nsGB18030ToUnicode::CreateExtensionDecoder() |
|
242 { |
|
243 mExtensionDecoder = new nsGB18030Unique2BytesToUnicode(); |
|
244 } |
|
245 void nsGB18030ToUnicode::Create4BytesDecoder() |
|
246 { |
|
247 m4BytesDecoder = new nsGB18030Unique4BytesToUnicode(); |
|
248 } |
|
249 bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) |
|
250 { |
|
251 NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte"); |
|
252 NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte"); |
|
253 NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte"); |
|
254 NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte"); |
|
255 if(! FIRST_BYTE_IS_SURROGATE(aSrc[0])) |
|
256 return false; |
|
257 if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) |
|
258 return false; |
|
259 if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2])) |
|
260 return false; |
|
261 if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) |
|
262 return false; |
|
263 |
|
264 uint8_t a1 = (uint8_t) aSrc[0]; |
|
265 uint8_t a2 = (uint8_t) aSrc[1]; |
|
266 uint8_t a3 = (uint8_t) aSrc[2]; |
|
267 uint8_t a4 = (uint8_t) aSrc[3]; |
|
268 a1 -= (uint8_t)0x90; |
|
269 a2 -= (uint8_t)0x30; |
|
270 a3 -= (uint8_t)0x81; |
|
271 a4 -= (uint8_t)0x30; |
|
272 uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4; |
|
273 // idx == ucs4Codepoint - 0x10000 |
|
274 if (idx > 0x000FFFFF) |
|
275 return false; |
|
276 |
|
277 *aOut++ = 0xD800 | (idx >> 10); |
|
278 *aOut = 0xDC00 | (0x000003FF & idx); |
|
279 |
|
280 return true; |
|
281 } |
|
282 bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut) |
|
283 { |
|
284 if(!mExtensionDecoder) |
|
285 CreateExtensionDecoder(); |
|
286 NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter"); |
|
287 if(mExtensionDecoder) |
|
288 { |
|
289 nsresult res = mExtensionDecoder->Reset(); |
|
290 NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed"); |
|
291 int32_t len = 2; |
|
292 int32_t dstlen = 1; |
|
293 res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); |
|
294 NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), |
|
295 "some strange conversion result"); |
|
296 // if we failed, we then just use the 0xfffd |
|
297 // therefore, we ignore the res here. |
|
298 if(NS_SUCCEEDED(res)) |
|
299 return true; |
|
300 } |
|
301 return false; |
|
302 } |
|
303 bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) |
|
304 { |
|
305 return false; |
|
306 } |
|
307 bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut) |
|
308 { |
|
309 if(!m4BytesDecoder) |
|
310 Create4BytesDecoder(); |
|
311 if(m4BytesDecoder) |
|
312 { |
|
313 nsresult res = m4BytesDecoder->Reset(); |
|
314 NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed"); |
|
315 int32_t len = 4; |
|
316 int32_t dstlen = 1; |
|
317 res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); |
|
318 NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), |
|
319 "some strange conversion result"); |
|
320 // if we failed, we then just use the 0xfffd |
|
321 // therefore, we ignore the res here. |
|
322 if(NS_SUCCEEDED(res)) |
|
323 return true; |
|
324 } |
|
325 return false; |
|
326 } |