intl/uconv/ucvcn/nsGBKToUnicode.cpp

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:af091dd02cd1
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 /**
6 * A character set converter from GBK to Unicode.
7 *
8 *
9 * @created 07/Sept/1999
10 * @author Yueheng Xu, Yueheng.Xu@intel.com
11 */
12
13 #include "nsGBKToUnicode.h"
14 #include "gbku.h"
15
16
17 //------------------------------------------------------------
18 // nsGBKUnique2BytesToUnicode
19 //------------------------------------------------------------
20 class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport
21 {
22 public:
23 nsGBKUnique2BytesToUnicode();
24 virtual ~nsGBKUnique2BytesToUnicode()
25 { }
26 protected:
27 };
28
29 static const uint16_t g_utGBKUnique2Bytes[] = {
30 #include "gbkuniq2b.ut"
31 };
32 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()
33 : nsTableDecoderSupport(u2BytesCharset, nullptr,
34 (uMappingTable*) &g_utGBKUnique2Bytes, 1)
35 {
36 }
37
38 //------------------------------------------------------------
39 // nsGB18030Unique2BytesToUnicode
40 //------------------------------------------------------------
41 class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport
42 {
43 public:
44 nsGB18030Unique2BytesToUnicode();
45 virtual ~nsGB18030Unique2BytesToUnicode()
46 { }
47 protected:
48 };
49
50 static const uint16_t g_utGB18030Unique2Bytes[] = {
51 #include "gb18030uniq2b.ut"
52 };
53 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()
54 : nsTableDecoderSupport(u2BytesCharset, nullptr,
55 (uMappingTable*) &g_utGB18030Unique2Bytes, 1)
56 {
57 }
58
59 //------------------------------------------------------------
60 // nsGB18030Unique4BytesToUnicode
61 //------------------------------------------------------------
62 class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport
63 {
64 public:
65 nsGB18030Unique4BytesToUnicode();
66 virtual ~nsGB18030Unique4BytesToUnicode()
67 { }
68 protected:
69 };
70
71 static const uint16_t g_utGB18030Unique4Bytes[] = {
72 #include "gb180304bytes.ut"
73 };
74 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()
75 : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr,
76 (uMappingTable*) &g_utGB18030Unique4Bytes, 1)
77 {
78 }
79
80
81 //----------------------------------------------------------------------
82 // Class nsGBKToUnicode [implementation]
83
84 //----------------------------------------------------------------------
85 // Subclassing of nsTablesDecoderSupport class [implementation]
86
// Byte-range predicates shared by the GBK / GB18030 decoders in this file.
// Each one wraps UINT8_IN_RANGE(low, byte, high).

// First byte of any multi-byte (two- or four-byte) sequence: 0x81..0xFE.
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// First byte (0x90..0xFE) of a four-byte sequence that is handed to
// DecodeToSurrogate rather than to the four-byte table decoder.
#define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Second byte of a two-byte sequence: 0x40..0x7E or 0x80..0xFE.
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
  (UINT8_IN_RANGE(0x40, (c), 0x7E) || UINT8_IN_RANGE(0x80, (c), 0xFE))

// Four-byte pattern: [0x81-0xFE][0x30-0x39][0x81-0xFE][0x30-0x39].
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  (UINT8_IN_RANGE(0x81, (c), 0xFE))
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c)  (UINT8_IN_RANGE(0x30, (c), 0x39))
99
/**
 * Decodes a run of GBK / GB18030 bytes into char16_t code units.
 *
 * One source character (1, 2 or 4 bytes) is handled per loop iteration and
 * normally produces one char16_t; a four-byte sequence decoded through
 * DecodeToSurrogate produces two. Bytes that cannot be mapped produce
 * UCS2_NO_MAPPING.
 *
 * @param aSrc        source bytes
 * @param aSrcLength  in: number of bytes available at aSrc;
 *                    out: number of bytes consumed
 * @param aDest       destination buffer
 * @param aDestLength in: capacity of aDest in char16_t units;
 *                    out: number of char16_t units written
 * @return NS_OK when all input was consumed;
 *         NS_OK_UDEC_MOREOUTPUT when the destination is full while input
 *         remains (or a surrogate pair does not fit in the remaining space);
 *         NS_OK_UDEC_MOREINPUT when the input ends inside a multi-byte
 *         sequence.
 */
100 NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
101 int32_t * aSrcLength,
102 char16_t *aDest,
103 int32_t * aDestLength)
104 {
105 int32_t i=0;
106 int32_t iSrcLength = (*aSrcLength);
107 int32_t iDestlen = 0;
108 nsresult rv=NS_OK;
// Nothing consumed yet; updated at the bottom of every completed iteration.
109 *aSrcLength = 0;
110
// i indexes the first byte of the current character; branches that consume
// more than one byte advance i by the extra bytes themselves.
111 for (i=0;i<iSrcLength;i++)
112 {
113 if ( iDestlen >= (*aDestLength) )
114 {
115 rv = NS_OK_UDEC_MOREOUTPUT;
116 break;
117 }
118 // The valid range for the 1st byte is [0x81,0xFE]
119 if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
120 {
// A lead byte needs at least one more byte before it can be classified.
121 if(i+1 >= iSrcLength)
122 {
123 rv = NS_OK_UDEC_MOREINPUT;
124 break;
125 }
126 // To make sure, the second byte has to be checked as well.
127 // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
128 if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
129 {
130 // Valid GBK code
131 *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
132 if(UCS2_NO_MAPPING == *aDest)
133 {
134 // We cannot map in the common mapping, let's call the
135 // delegate 2 byte decoder to decode the gbk or gb18030 unique
136 // 2 byte mapping
137 if(! TryExtensionDecoder(aSrc, aDest))
138 {
139 *aDest = UCS2_NO_MAPPING;
140 }
141 }
142 aSrc += 2;
// The loop's own i++ covers the first byte; count the second one here.
143 i++;
144 }
145 else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
146 {
147 // from the first 2 bytes, it looks like a 4 byte GB18030
148 if(i+3 >= iSrcLength) // make sure we got 4 bytes
149 {
150 rv = NS_OK_UDEC_MOREINPUT;
151 break;
152 }
153 // 4 bytes pattern
154 // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
155 // the first two bytes are already validated; check the last two
156
157 if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
158 LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
159 {
160 if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
161 {
162 // let's call the delegated 4 byte gb18030 converter to convert it
163 if(! Try4BytesDecoder(aSrc, aDest))
164 *aDest = UCS2_NO_MAPPING;
165 } else {
166 // let's try supplement mapping
// A surrogate pair needs two free slots in the destination.
167 if ( (iDestlen+1) < (*aDestLength) )
168 {
169 if(DecodeToSurrogate(aSrc, aDest))
170 {
171 // surrogate pair: two char16_t were written; account for the first
// one here, the shared increment at the bottom of the loop covers
// the second
172 iDestlen++;
173 aDest++;
174 } else {
175 *aDest = UCS2_NO_MAPPING;
176 }
177 } else {
// If the whole destination buffer is smaller than two units, asking for
// more output would not help, so a single UCS2_NO_MAPPING is emitted
// and the four bytes are consumed.
178 if (*aDestLength < 2) {
179 NS_ERROR("insufficient space in output buffer");
180 *aDest = UCS2_NO_MAPPING;
181 } else {
// Leave the sequence unconsumed so it is decoded on the next call.
182 rv = NS_OK_UDEC_MOREOUTPUT;
183 break;
184 }
185 }
186 }
187 aSrc += 4;
// The loop's own i++ covers the first byte; count the other three here.
188 i += 3;
189 } else {
190 *aDest = UCS2_NO_MAPPING;
191 // If the third and fourth bytes are not in the legal ranges for
192 // a four-byte sequence, resynchronize on the second byte
193 // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
194 // 0x30-0x39)
195 aSrc++;
196 }
197 }
198 else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
199 {
200 // stand-alone (not followed by a valid second byte) 0xA0 !
201 // treat it as valid a la Netscape 4.x
202 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
203 aSrc++;
204 } else {
205 // Invalid GBK code point (second byte should be 0x40 or higher)
// Only the lead byte is consumed; the next iteration restarts on the
// second byte.
206 *aDest = UCS2_NO_MAPPING;
207 aSrc++;
208 }
209 } else {
210 if(IS_ASCII(*aSrc))
211 {
212 // The source is an ASCII
213 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
214 aSrc++;
215 } else {
// Single byte that is neither ASCII nor a legal lead byte: it is either
// the GBK euro byte or unmappable.
216 if(IS_GBK_EURO(*aSrc)) {
217 *aDest = UCS2_EURO;
218 } else {
219 *aDest = UCS2_NO_MAPPING;
220 }
221 aSrc++;
222 }
223 }
// Shared bookkeeping: one output unit was written by the branch above.
224 iDestlen++;
225 aDest++;
// i is now the index of the last byte consumed, so i+1 bytes are done.
226 *aSrcLength = i+1;
227 }
228 *aDestLength = iDestlen;
229 return rv;
230 }
231
232
233 void nsGBKToUnicode::CreateExtensionDecoder()
234 {
235 mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
236 }
237 void nsGBKToUnicode::Create4BytesDecoder()
238 {
239 m4BytesDecoder = nullptr;
240 }
241 void nsGB18030ToUnicode::CreateExtensionDecoder()
242 {
243 mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
244 }
245 void nsGB18030ToUnicode::Create4BytesDecoder()
246 {
247 m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
248 }
249 bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
250 {
251 NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte");
252 NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte");
253 NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte");
254 NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte");
255 if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
256 return false;
257 if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
258 return false;
259 if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
260 return false;
261 if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
262 return false;
263
264 uint8_t a1 = (uint8_t) aSrc[0];
265 uint8_t a2 = (uint8_t) aSrc[1];
266 uint8_t a3 = (uint8_t) aSrc[2];
267 uint8_t a4 = (uint8_t) aSrc[3];
268 a1 -= (uint8_t)0x90;
269 a2 -= (uint8_t)0x30;
270 a3 -= (uint8_t)0x81;
271 a4 -= (uint8_t)0x30;
272 uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
273 // idx == ucs4Codepoint - 0x10000
274 if (idx > 0x000FFFFF)
275 return false;
276
277 *aOut++ = 0xD800 | (idx >> 10);
278 *aOut = 0xDC00 | (0x000003FF & idx);
279
280 return true;
281 }
282 bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
283 {
284 if(!mExtensionDecoder)
285 CreateExtensionDecoder();
286 NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
287 if(mExtensionDecoder)
288 {
289 nsresult res = mExtensionDecoder->Reset();
290 NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
291 int32_t len = 2;
292 int32_t dstlen = 1;
293 res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen);
294 NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
295 "some strange conversion result");
296 // if we failed, we then just use the 0xfffd
297 // therefore, we ignore the res here.
298 if(NS_SUCCEEDED(res))
299 return true;
300 }
301 return false;
302 }
303 bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
304 {
305 return false;
306 }
307 bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
308 {
309 if(!m4BytesDecoder)
310 Create4BytesDecoder();
311 if(m4BytesDecoder)
312 {
313 nsresult res = m4BytesDecoder->Reset();
314 NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
315 int32_t len = 4;
316 int32_t dstlen = 1;
317 res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen);
318 NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
319 "some strange conversion result");
320 // if we failed, we then just use the 0xfffd
321 // therefore, we ignore the res here.
322 if(NS_SUCCEEDED(res))
323 return true;
324 }
325 return false;
326 }

mercurial