Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | /** |
michael@0 | 6 | * A character set converter from Unicode to GBK. |
michael@0 | 7 | * |
michael@0 | 8 | * |
michael@0 | 9 | * @created 08/Sept/1999 |
michael@0 | 10 | * @author Yueheng Xu, Yueheng.Xu@intel.com |
michael@0 | 11 | * Revision History |
michael@0 | 12 | * 04/Oct/1999. Yueheng Xu: used table gUnicodeToGBKTable[0x5200] to make |
michael@0 | 13 | * Unicode to GB mapping fast |
michael@0 | 14 | */ |
michael@0 | 15 | |
michael@0 | 16 | #include "nsUnicodeToGBK.h" |
michael@0 | 17 | #include "gbku.h" |
michael@0 | 18 | #include "uconvutil.h" |
michael@0 | 19 | #include "nsCharTraits.h" |
michael@0 | 20 | |
michael@0 | 21 | //------------------------------------------------------------- |
michael@0 | 22 | // Global table initialization function defined in gbku.h |
michael@0 | 23 | //------------------------------------------------------------- |
michael@0 | 24 | |
michael@0 | 25 | //----------------------------------------------------------------------- |
michael@0 | 26 | // Private class used by nsUnicodeToGB18030 and nsUnicodeToGB18030Font0 |
michael@0 | 27 | // nsUnicodeToGB18030Uniq2Bytes |
michael@0 | 28 | //----------------------------------------------------------------------- |
michael@0 | 29 | static const uint16_t g_uf_gb18030_2bytes[] = { |
michael@0 | 30 | #include "gb18030uniq2b.uf" |
michael@0 | 31 | }; |
michael@0 | 32 | class nsUnicodeToGB18030Uniq2Bytes : public nsTableEncoderSupport |
michael@0 | 33 | { |
michael@0 | 34 | public: |
michael@0 | 35 | nsUnicodeToGB18030Uniq2Bytes() |
michael@0 | 36 | : nsTableEncoderSupport(u2BytesCharset, |
michael@0 | 37 | (uMappingTable*) &g_uf_gb18030_2bytes, 2) {} |
michael@0 | 38 | protected: |
michael@0 | 39 | }; |
michael@0 | 40 | //----------------------------------------------------------------------- |
michael@0 | 41 | // Private class used by nsUnicodeToGB18030 |
michael@0 | 42 | // nsUnicodeTo4BytesGB18030 |
michael@0 | 43 | //----------------------------------------------------------------------- |
michael@0 | 44 | static const uint16_t g_uf_gb18030_4bytes[] = { |
michael@0 | 45 | #include "gb180304bytes.uf" |
michael@0 | 46 | }; |
michael@0 | 47 | class nsUnicodeTo4BytesGB18030 : public nsTableEncoderSupport |
michael@0 | 48 | { |
michael@0 | 49 | public: |
michael@0 | 50 | nsUnicodeTo4BytesGB18030() |
michael@0 | 51 | : nsTableEncoderSupport(u4BytesGB18030Charset, |
michael@0 | 52 | (uMappingTable*) &g_uf_gb18030_4bytes, 4) {} |
michael@0 | 53 | protected: |
michael@0 | 54 | }; |
michael@0 | 55 | //----------------------------------------------------------------------- |
michael@0 | 56 | // Private class used by nsUnicodeToGBK |
michael@0 | 57 | // nsUnicodeToGBKUniq2Bytes |
michael@0 | 58 | //----------------------------------------------------------------------- |
michael@0 | 59 | static const uint16_t g_uf_gbk_2bytes[] = { |
michael@0 | 60 | #include "gbkuniq2b.uf" |
michael@0 | 61 | }; |
michael@0 | 62 | class nsUnicodeToGBKUniq2Bytes : public nsTableEncoderSupport |
michael@0 | 63 | { |
michael@0 | 64 | public: |
michael@0 | 65 | nsUnicodeToGBKUniq2Bytes() |
michael@0 | 66 | : nsTableEncoderSupport(u2BytesCharset, |
michael@0 | 67 | (uMappingTable*) &g_uf_gbk_2bytes, 2) {} |
michael@0 | 68 | protected: |
michael@0 | 69 | }; |
michael@0 | 70 | //----------------------------------------------------------------------- |
michael@0 | 71 | // nsUnicodeToGB18030 |
michael@0 | 72 | //----------------------------------------------------------------------- |
michael@0 | 73 | void nsUnicodeToGB18030::CreateExtensionEncoder() |
michael@0 | 74 | { |
michael@0 | 75 | mExtensionEncoder = new nsUnicodeToGB18030Uniq2Bytes(); |
michael@0 | 76 | } |
michael@0 | 77 | void nsUnicodeToGB18030::Create4BytesEncoder() |
michael@0 | 78 | { |
michael@0 | 79 | m4BytesEncoder = new nsUnicodeTo4BytesGB18030(); |
michael@0 | 80 | } |
michael@0 | 81 | |
michael@0 | 82 | bool nsUnicodeToGB18030::EncodeSurrogate( |
michael@0 | 83 | char16_t aSurrogateHigh, |
michael@0 | 84 | char16_t aSurrogateLow, |
michael@0 | 85 | char* aOut) |
michael@0 | 86 | { |
michael@0 | 87 | if( NS_IS_HIGH_SURROGATE(aSurrogateHigh) && |
michael@0 | 88 | NS_IS_LOW_SURROGATE(aSurrogateLow) ) |
michael@0 | 89 | { |
michael@0 | 90 | // notice that idx does not include the 0x10000 |
michael@0 | 91 | uint32_t idx = ((aSurrogateHigh - (char16_t)0xD800) << 10 ) | |
michael@0 | 92 | (aSurrogateLow - (char16_t) 0xDC00); |
michael@0 | 93 | |
michael@0 | 94 | unsigned char *out = (unsigned char*) aOut; |
michael@0 | 95 | // notice this is from 0x90 for supplment planes |
michael@0 | 96 | out[0] = (idx / (10*126*10)) + 0x90; |
michael@0 | 97 | idx %= (10*126*10); |
michael@0 | 98 | out[1] = (idx / (10*126)) + 0x30; |
michael@0 | 99 | idx %= (10*126); |
michael@0 | 100 | out[2] = (idx / (10)) + 0x81; |
michael@0 | 101 | out[3] = (idx % 10) + 0x30; |
michael@0 | 102 | return true; |
michael@0 | 103 | } |
michael@0 | 104 | return false; |
michael@0 | 105 | } |
michael@0 | 106 | |
michael@0 | 107 | //---------------------------------------------------------------------- |
michael@0 | 108 | // Class nsUnicodeToGBK [implementation] |
michael@0 | 109 | |
michael@0 | 110 | nsUnicodeToGBK::nsUnicodeToGBK(uint32_t aMaxLength) : |
michael@0 | 111 | nsEncoderSupport(aMaxLength) |
michael@0 | 112 | { |
michael@0 | 113 | mExtensionEncoder = nullptr; |
michael@0 | 114 | m4BytesEncoder = nullptr; |
michael@0 | 115 | mSurrogateHigh = 0; |
michael@0 | 116 | } |
michael@0 | 117 | void nsUnicodeToGBK::CreateExtensionEncoder() |
michael@0 | 118 | { |
michael@0 | 119 | mExtensionEncoder = new nsUnicodeToGBKUniq2Bytes(); |
michael@0 | 120 | } |
michael@0 | 121 | void nsUnicodeToGBK::Create4BytesEncoder() |
michael@0 | 122 | { |
michael@0 | 123 | m4BytesEncoder = nullptr; |
michael@0 | 124 | } |
michael@0 | 125 | bool nsUnicodeToGBK::TryExtensionEncoder( |
michael@0 | 126 | char16_t aChar, |
michael@0 | 127 | char* aOut, |
michael@0 | 128 | int32_t *aOutLen |
michael@0 | 129 | ) |
michael@0 | 130 | { |
michael@0 | 131 | if( NS_IS_HIGH_SURROGATE(aChar) || |
michael@0 | 132 | NS_IS_LOW_SURROGATE(aChar) ) |
michael@0 | 133 | { |
michael@0 | 134 | // performance tune for surrogate characters |
michael@0 | 135 | return false; |
michael@0 | 136 | } |
michael@0 | 137 | if(! mExtensionEncoder ) |
michael@0 | 138 | CreateExtensionEncoder(); |
michael@0 | 139 | if(mExtensionEncoder) |
michael@0 | 140 | { |
michael@0 | 141 | int32_t len = 1; |
michael@0 | 142 | nsresult res = NS_OK; |
michael@0 | 143 | res = mExtensionEncoder->Convert(&aChar, &len, aOut, aOutLen); |
michael@0 | 144 | if(NS_SUCCEEDED(res) && (*aOutLen > 0)) |
michael@0 | 145 | return true; |
michael@0 | 146 | } |
michael@0 | 147 | return false; |
michael@0 | 148 | } |
michael@0 | 149 | |
michael@0 | 150 | bool nsUnicodeToGBK::Try4BytesEncoder( |
michael@0 | 151 | char16_t aChar, |
michael@0 | 152 | char* aOut, |
michael@0 | 153 | int32_t *aOutLen |
michael@0 | 154 | ) |
michael@0 | 155 | { |
michael@0 | 156 | if( NS_IS_HIGH_SURROGATE(aChar) || |
michael@0 | 157 | NS_IS_LOW_SURROGATE(aChar) ) |
michael@0 | 158 | { |
michael@0 | 159 | // performance tune for surrogate characters |
michael@0 | 160 | return false; |
michael@0 | 161 | } |
michael@0 | 162 | if(! m4BytesEncoder ) |
michael@0 | 163 | Create4BytesEncoder(); |
michael@0 | 164 | if(m4BytesEncoder) |
michael@0 | 165 | { |
michael@0 | 166 | int32_t len = 1; |
michael@0 | 167 | nsresult res = NS_OK; |
michael@0 | 168 | res = m4BytesEncoder->Convert(&aChar, &len, aOut, aOutLen); |
michael@0 | 169 | NS_ASSERTION(NS_FAILED(res) || ((1 == len) && (4 == *aOutLen)), |
michael@0 | 170 | "unexpect conversion length"); |
michael@0 | 171 | if(NS_SUCCEEDED(res) && (*aOutLen > 0)) |
michael@0 | 172 | return true; |
michael@0 | 173 | } |
michael@0 | 174 | return false; |
michael@0 | 175 | } |
michael@0 | 176 | bool nsUnicodeToGBK::EncodeSurrogate( |
michael@0 | 177 | char16_t aSurrogateHigh, |
michael@0 | 178 | char16_t aSurrogateLow, |
michael@0 | 179 | char* aOut) |
michael@0 | 180 | { |
michael@0 | 181 | return false; // GBK cannot encode Surrogate, let the subclass encode it. |
michael@0 | 182 | } |
michael@0 | 183 | |
michael@0 | 184 | NS_IMETHODIMP nsUnicodeToGBK::ConvertNoBuff( |
michael@0 | 185 | const char16_t * aSrc, |
michael@0 | 186 | int32_t * aSrcLength, |
michael@0 | 187 | char * aDest, |
michael@0 | 188 | int32_t * aDestLength) |
michael@0 | 189 | { |
michael@0 | 190 | int32_t iSrcLength = 0; |
michael@0 | 191 | int32_t iDestLength = 0; |
michael@0 | 192 | char16_t unicode; |
michael@0 | 193 | nsresult res = NS_OK; |
michael@0 | 194 | while (iSrcLength < *aSrcLength ) |
michael@0 | 195 | { |
michael@0 | 196 | unicode = *aSrc; |
michael@0 | 197 | //if unicode's hi byte has something, it is not ASCII, must be a GB |
michael@0 | 198 | if(IS_ASCII(unicode)) |
michael@0 | 199 | { |
michael@0 | 200 | // this is an ASCII |
michael@0 | 201 | *aDest = CAST_UNICHAR_TO_CHAR(*aSrc); |
michael@0 | 202 | aDest++; // increment 1 byte |
michael@0 | 203 | iDestLength +=1; |
michael@0 | 204 | } else { |
michael@0 | 205 | char byte1, byte2; |
michael@0 | 206 | if(mUtil.UnicodeToGBKChar( unicode, false, &byte1, &byte2)) |
michael@0 | 207 | { |
michael@0 | 208 | // make sure we still have 2 bytes for output first |
michael@0 | 209 | if(iDestLength+2 > *aDestLength) |
michael@0 | 210 | { |
michael@0 | 211 | res = NS_OK_UENC_MOREOUTPUT; |
michael@0 | 212 | break; |
michael@0 | 213 | } |
michael@0 | 214 | aDest[0] = byte1; |
michael@0 | 215 | aDest[1] = byte2; |
michael@0 | 216 | aDest += 2; // increment 2 bytes |
michael@0 | 217 | iDestLength +=2; |
michael@0 | 218 | } else { |
michael@0 | 219 | int32_t aOutLen = 2; |
michael@0 | 220 | // make sure we still have 2 bytes for output first |
michael@0 | 221 | if(iDestLength+2 > *aDestLength) |
michael@0 | 222 | { |
michael@0 | 223 | res = NS_OK_UENC_MOREOUTPUT; |
michael@0 | 224 | break; |
michael@0 | 225 | } |
michael@0 | 226 | // we cannot map in the common mapping. Let's try to |
michael@0 | 227 | // call the delegated 2 byte converter for the gbk or gb18030 |
michael@0 | 228 | // unique 2 byte mapping |
michael@0 | 229 | if(TryExtensionEncoder(unicode, aDest, &aOutLen)) |
michael@0 | 230 | { |
michael@0 | 231 | iDestLength += aOutLen; |
michael@0 | 232 | aDest += aOutLen; |
michael@0 | 233 | } else { |
michael@0 | 234 | // make sure we still have 4 bytes for output first |
michael@0 | 235 | if(iDestLength+4 > *aDestLength) |
michael@0 | 236 | { |
michael@0 | 237 | res = NS_OK_UENC_MOREOUTPUT; |
michael@0 | 238 | break; |
michael@0 | 239 | } |
michael@0 | 240 | // we still cannot map. Let's try to |
michael@0 | 241 | // call the delegated GB18030 4 byte converter |
michael@0 | 242 | aOutLen = 4; |
michael@0 | 243 | if( NS_IS_HIGH_SURROGATE(unicode) ) |
michael@0 | 244 | { |
michael@0 | 245 | if((iSrcLength+1) < *aSrcLength ) { |
michael@0 | 246 | if(EncodeSurrogate(aSrc[0],aSrc[1], aDest)) { |
michael@0 | 247 | // since we got a surrogate pair, we need to increment src. |
michael@0 | 248 | iSrcLength++ ; |
michael@0 | 249 | aSrc++; |
michael@0 | 250 | iDestLength += aOutLen; |
michael@0 | 251 | aDest += aOutLen; |
michael@0 | 252 | } else { |
michael@0 | 253 | // only get a high surrogate, but not a low surrogate |
michael@0 | 254 | res = NS_ERROR_UENC_NOMAPPING; |
michael@0 | 255 | iSrcLength++; // include length of the unmapped character |
michael@0 | 256 | break; |
michael@0 | 257 | } |
michael@0 | 258 | } else { |
michael@0 | 259 | mSurrogateHigh = aSrc[0]; |
michael@0 | 260 | break; // this will go to afterwhileloop |
michael@0 | 261 | } |
michael@0 | 262 | } else { |
michael@0 | 263 | if( NS_IS_LOW_SURROGATE(unicode) ) |
michael@0 | 264 | { |
michael@0 | 265 | if(NS_IS_HIGH_SURROGATE(mSurrogateHigh)) { |
michael@0 | 266 | if(EncodeSurrogate(mSurrogateHigh, aSrc[0], aDest)) { |
michael@0 | 267 | iDestLength += aOutLen; |
michael@0 | 268 | aDest += aOutLen; |
michael@0 | 269 | } else { |
michael@0 | 270 | // only get a high surrogate, but not a low surrogate |
michael@0 | 271 | res = NS_ERROR_UENC_NOMAPPING; |
michael@0 | 272 | iSrcLength++; // include length of the unmapped character |
michael@0 | 273 | break; |
michael@0 | 274 | } |
michael@0 | 275 | } else { |
michael@0 | 276 | // only get a low surrogate, but not a low surrogate |
michael@0 | 277 | res = NS_ERROR_UENC_NOMAPPING; |
michael@0 | 278 | iSrcLength++; // include length of the unmapped character |
michael@0 | 279 | break; |
michael@0 | 280 | } |
michael@0 | 281 | } else { |
michael@0 | 282 | if(Try4BytesEncoder(unicode, aDest, &aOutLen)) |
michael@0 | 283 | { |
michael@0 | 284 | NS_ASSERTION((aOutLen == 4), "we should always generate 4 bytes here"); |
michael@0 | 285 | iDestLength += aOutLen; |
michael@0 | 286 | aDest += aOutLen; |
michael@0 | 287 | } else { |
michael@0 | 288 | res = NS_ERROR_UENC_NOMAPPING; |
michael@0 | 289 | iSrcLength++; // include length of the unmapped character |
michael@0 | 290 | break; |
michael@0 | 291 | } |
michael@0 | 292 | } |
michael@0 | 293 | } |
michael@0 | 294 | } |
michael@0 | 295 | } |
michael@0 | 296 | } |
michael@0 | 297 | iSrcLength++ ; // Each unicode char just count as one in char16_t string; |
michael@0 | 298 | mSurrogateHigh = 0; |
michael@0 | 299 | aSrc++; |
michael@0 | 300 | if ( iDestLength >= (*aDestLength) && (iSrcLength < *aSrcLength) ) |
michael@0 | 301 | { |
michael@0 | 302 | res = NS_OK_UENC_MOREOUTPUT; |
michael@0 | 303 | break; |
michael@0 | 304 | } |
michael@0 | 305 | } |
michael@0 | 306 | //afterwhileloop: |
michael@0 | 307 | *aDestLength = iDestLength; |
michael@0 | 308 | *aSrcLength = iSrcLength; |
michael@0 | 309 | return res; |
michael@0 | 310 | } |