1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/uconv/src/nsUTF8ToUnicode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,362 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* vim: set ts=2 et sw=2 tw=80: */ 1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +#include "nsUCSupport.h" 1.11 +#include "nsUTF8ToUnicode.h" 1.12 +#include "mozilla/SSE.h" 1.13 +#include "nsCharTraits.h" 1.14 +#include <algorithm> 1.15 + 1.16 +#define UNICODE_BYTE_ORDER_MARK 0xFEFF 1.17 + 1.18 +static char16_t* EmitSurrogatePair(uint32_t ucs4, char16_t* aDest) 1.19 +{ 1.20 + NS_ASSERTION(ucs4 > 0xFFFF, "Should be a supplementary character"); 1.21 + ucs4 -= 0x00010000; 1.22 + *aDest++ = 0xD800 | (0x000003FF & (ucs4 >> 10)); 1.23 + *aDest++ = 0xDC00 | (0x000003FF & ucs4); 1.24 + return aDest; 1.25 +} 1.26 + 1.27 +//---------------------------------------------------------------------- 1.28 +// Class nsUTF8ToUnicode [implementation] 1.29 + 1.30 +nsUTF8ToUnicode::nsUTF8ToUnicode() 1.31 +: nsBasicDecoderSupport() 1.32 +{ 1.33 + Reset(); 1.34 +} 1.35 + 1.36 +//---------------------------------------------------------------------- 1.37 +// Subclassing of nsTableDecoderSupport class [implementation] 1.38 + 1.39 +/** 1.40 + * Normally the maximum length of the output of the UTF8 decoder in UTF16 1.41 + * code units is the same as the length of the input in UTF8 code units, 1.42 + * since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single 1.43 + * UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair. 1.44 + * 1.45 + * However, there is an edge case where the output can be longer than the 1.46 + * input: if the previous buffer ended with an incomplete multi-byte 1.47 + * sequence and this buffer does not begin with a valid continuation 1.48 + * byte, we will return NS_ERROR_ILLEGAL_INPUT and the caller may insert a 1.49 + * replacement character in the output buffer which corresponds to no 1.50 + * character in the input buffer. So in the worst case the destination 1.51 + * will need to be one code unit longer than the source. 1.52 + * See bug 301797. 1.53 + */ 1.54 +NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc, 1.55 + int32_t aSrcLength, 1.56 + int32_t * aDestLength) 1.57 +{ 1.58 + *aDestLength = aSrcLength + 1; 1.59 + return NS_OK; 1.60 +} 1.61 + 1.62 + 1.63 +//---------------------------------------------------------------------- 1.64 +// Subclassing of nsBasicDecoderSupport class [implementation] 1.65 + 1.66 +NS_IMETHODIMP nsUTF8ToUnicode::Reset() 1.67 +{ 1.68 + 1.69 + mUcs4 = 0; // cached Unicode character 1.70 + mState = 0; // cached expected number of octets after the current octet 1.71 + // until the beginning of the next UTF8 character sequence 1.72 + mBytes = 1; // cached expected number of octets in the current sequence 1.73 + mFirst = true; 1.74 + 1.75 + return NS_OK; 1.76 + 1.77 +} 1.78 + 1.79 +//---------------------------------------------------------------------- 1.80 +// Subclassing of nsBasicDecoderSupport class [implementation] 1.81 + 1.82 +// Fast ASCII -> UTF16 inner loop implementations 1.83 +// 1.84 +// Convert_ascii_run will update src and dst to the new values, and 1.85 +// len must be the maximum number ascii chars that it would be valid 1.86 +// to take from src and place into dst. (That is, the minimum of the 1.87 +// number of bytes left in src and the number of unichars available in 1.88 +// dst.) 1.89 + 1.90 +#if defined(__arm__) || defined(_M_ARM) 1.91 + 1.92 +// on ARM, do extra work to avoid byte/halfword reads/writes by 1.93 +// reading/writing a word at a time for as long as we can 1.94 +static inline void 1.95 +Convert_ascii_run (const char *&src, 1.96 + char16_t *&dst, 1.97 + int32_t len) 1.98 +{ 1.99 + const uint32_t *src32; 1.100 + uint32_t *dst32; 1.101 + 1.102 + // with some alignments, we'd never actually break out of the slow loop, so 1.103 + // check and do the faster slow loop 1.104 + if ((((NS_PTR_TO_UINT32(dst) & 3) == 0) && ((NS_PTR_TO_UINT32(src) & 1) == 0)) || 1.105 + (((NS_PTR_TO_UINT32(dst) & 3) == 2) && ((NS_PTR_TO_UINT32(src) & 1) == 1))) 1.106 + { 1.107 + while (((NS_PTR_TO_UINT32(src) & 3) || 1.108 + (NS_PTR_TO_UINT32(dst) & 3)) && 1.109 + len > 0) 1.110 + { 1.111 + if (*src & 0x80U) 1.112 + return; 1.113 + *dst++ = (char16_t) *src++; 1.114 + len--; 1.115 + } 1.116 + } else { 1.117 + goto finish; 1.118 + } 1.119 + 1.120 + // then go 4 bytes at a time 1.121 + src32 = (const uint32_t*) src; 1.122 + dst32 = (uint32_t*) dst; 1.123 + 1.124 + while (len > 4) { 1.125 + uint32_t in = *src32++; 1.126 + 1.127 + if (in & 0x80808080U) { 1.128 + src32--; 1.129 + break; 1.130 + } 1.131 + 1.132 + *dst32++ = ((in & 0x000000ff) >> 0) | ((in & 0x0000ff00) << 8); 1.133 + *dst32++ = ((in & 0x00ff0000) >> 16) | ((in & 0xff000000) >> 8); 1.134 + 1.135 + len -= 4; 1.136 + } 1.137 + 1.138 + src = (const char *) src32; 1.139 + dst = (char16_t *) dst32; 1.140 + 1.141 +finish: 1.142 + while (len-- > 0 && (*src & 0x80U) == 0) { 1.143 + *dst++ = (char16_t) *src++; 1.144 + } 1.145 +} 1.146 + 1.147 +#else 1.148 + 1.149 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.150 +namespace mozilla { 1.151 +namespace SSE2 { 1.152 + 1.153 +void Convert_ascii_run(const char *&src, char16_t *&dst, int32_t len); 1.154 + 1.155 +} 1.156 +} 1.157 +#endif 1.158 + 1.159 +static inline void 1.160 +Convert_ascii_run (const char *&src, 1.161 + char16_t *&dst, 1.162 + int32_t len) 1.163 +{ 1.164 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.165 + if (mozilla::supports_sse2()) { 1.166 + mozilla::SSE2::Convert_ascii_run(src, dst, len); 1.167 + return; 1.168 + } 1.169 +#endif 1.170 + 1.171 + while (len-- > 0 && (*src & 0x80U) == 0) { 1.172 + *dst++ = (char16_t) *src++; 1.173 + } 1.174 +} 1.175 + 1.176 +#endif 1.177 + 1.178 +NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, 1.179 + int32_t * aSrcLength, 1.180 + char16_t * aDest, 1.181 + int32_t * aDestLength) 1.182 +{ 1.183 + uint32_t aSrcLen = (uint32_t) (*aSrcLength); 1.184 + uint32_t aDestLen = (uint32_t) (*aDestLength); 1.185 + 1.186 + const char *in, *inend; 1.187 + inend = aSrc + aSrcLen; 1.188 + 1.189 + char16_t *out, *outend; 1.190 + outend = aDest + aDestLen; 1.191 + 1.192 + nsresult res = NS_OK; // conversion result 1.193 + 1.194 + out = aDest; 1.195 + if (mState == 0xFF) { 1.196 + // Emit supplementary character left over from previous iteration. It is 1.197 + // caller's responsibility to keep a sufficient buffer. 1.198 + if (aDestLen < 2) { 1.199 + *aSrcLength = *aDestLength = 0; 1.200 + return NS_OK_UDEC_MOREOUTPUT; 1.201 + } 1.202 + out = EmitSurrogatePair(mUcs4, out); 1.203 + mUcs4 = 0; 1.204 + mState = 0; 1.205 + mBytes = 1; 1.206 + mFirst = false; 1.207 + } 1.208 + 1.209 + // alias these locally for speed 1.210 + int32_t mUcs4 = this->mUcs4; 1.211 + uint8_t mState = this->mState; 1.212 + uint8_t mBytes = this->mBytes; 1.213 + bool mFirst = this->mFirst; 1.214 + 1.215 + // Set mFirst to false now so we don't have to every time through the ASCII 1.216 + // branch within the loop. 1.217 + if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc)))) 1.218 + mFirst = false; 1.219 + 1.220 + for (in = aSrc; ((in < inend) && (out < outend)); ++in) { 1.221 + uint8_t c = *in; 1.222 + if (0 == mState) { 1.223 + // When mState is zero we expect either a US-ASCII character or a 1.224 + // multi-octet sequence. 1.225 + if (c < 0x80) { // 00..7F 1.226 + int32_t max_loops = std::min(inend - in, outend - out); 1.227 + Convert_ascii_run(in, out, max_loops); 1.228 + --in; // match the rest of the cases 1.229 + mBytes = 1; 1.230 + } else if (c < 0xC2) { // C0/C1 1.231 + // Overlong 2 octet sequence 1.232 + if (mErrBehavior == kOnError_Signal) { 1.233 + res = NS_ERROR_ILLEGAL_INPUT; 1.234 + break; 1.235 + } 1.236 + *out++ = UCS2_REPLACEMENT_CHAR; 1.237 + mFirst = false; 1.238 + } else if (c < 0xE0) { // C2..DF 1.239 + // First octet of 2 octet sequence 1.240 + mUcs4 = c; 1.241 + mUcs4 = (mUcs4 & 0x1F) << 6; 1.242 + mState = 1; 1.243 + mBytes = 2; 1.244 + } else if (c < 0xF0) { // E0..EF 1.245 + // First octet of 3 octet sequence 1.246 + mUcs4 = c; 1.247 + mUcs4 = (mUcs4 & 0x0F) << 12; 1.248 + mState = 2; 1.249 + mBytes = 3; 1.250 + } else if (c < 0xF5) { // F0..F4 1.251 + // First octet of 4 octet sequence 1.252 + mUcs4 = c; 1.253 + mUcs4 = (mUcs4 & 0x07) << 18; 1.254 + mState = 3; 1.255 + mBytes = 4; 1.256 + } else { // F5..FF 1.257 + /* Current octet is neither in the US-ASCII range nor a legal first 1.258 + * octet of a multi-octet sequence. 1.259 + */ 1.260 + if (mErrBehavior == kOnError_Signal) { 1.261 + /* Return an error condition. Caller is responsible for flushing and 1.262 + * refilling the buffer and resetting state. 1.263 + */ 1.264 + res = NS_ERROR_ILLEGAL_INPUT; 1.265 + break; 1.266 + } 1.267 + *out++ = UCS2_REPLACEMENT_CHAR; 1.268 + mFirst = false; 1.269 + } 1.270 + } else { 1.271 + // When mState is non-zero, we expect a continuation of the multi-octet 1.272 + // sequence 1.273 + if (0x80 == (0xC0 & c)) { 1.274 + if (mState > 1) { 1.275 + // If we are here, all possibilities are: 1.276 + // mState == 2 && mBytes == 3 || 1.277 + // mState == 2 && mBytes == 4 || 1.278 + // mState == 3 && mBytes == 4 1.279 + if ((mBytes == 3 && ((!mUcs4 && c < 0xA0) || // E0 80..9F 1.280 + (mUcs4 == 0xD000 && c > 0x9F))) || // ED A0..BF 1.281 + (mState == 3 && ((!mUcs4 && c < 0x90) || // F0 80..8F 1.282 + (mUcs4 == 0x100000 && c > 0x8F)))) {// F4 90..BF 1.283 + // illegal sequences or sequences converted into illegal ranges. 1.284 + in--; 1.285 + if (mErrBehavior == kOnError_Signal) { 1.286 + res = NS_ERROR_ILLEGAL_INPUT; 1.287 + break; 1.288 + } 1.289 + *out++ = UCS2_REPLACEMENT_CHAR; 1.290 + mState = 0; 1.291 + mFirst = false; 1.292 + continue; 1.293 + } 1.294 + } 1.295 + 1.296 + // Legal continuation. 1.297 + uint32_t shift = (mState - 1) * 6; 1.298 + uint32_t tmp = c; 1.299 + tmp = (tmp & 0x0000003FL) << shift; 1.300 + mUcs4 |= tmp; 1.301 + 1.302 + if (0 == --mState) { 1.303 + /* End of the multi-octet sequence. mUcs4 now contains the final 1.304 + * Unicode codepoint to be output 1.305 + */ 1.306 + 1.307 + if (mUcs4 > 0xFFFF) { 1.308 + // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair 1.309 + if (out + 2 > outend) { 1.310 + // insufficient space left in the buffer. Keep mUcs4 for the 1.311 + // next iteration. 1.312 + mState = 0xFF; 1.313 + ++in; 1.314 + res = NS_OK_UDEC_MOREOUTPUT; 1.315 + break; 1.316 + } 1.317 + out = EmitSurrogatePair(mUcs4, out); 1.318 + } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) { 1.319 + // Don't output the BOM only if it is the first character 1.320 + *out++ = mUcs4; 1.321 + } 1.322 + //initialize UTF8 cache 1.323 + mUcs4 = 0; 1.324 + mState = 0; 1.325 + mBytes = 1; 1.326 + mFirst = false; 1.327 + } 1.328 + } else { 1.329 + /* ((0xC0 & c != 0x80) && (mState != 0)) 1.330 + * 1.331 + * Incomplete multi-octet sequence. Unconsume this 1.332 + * octet and return an error condition. Caller is responsible 1.333 + * for flushing and refilling the buffer and resetting state. 1.334 + */ 1.335 + in--; 1.336 + if (mErrBehavior == kOnError_Signal) { 1.337 + res = NS_ERROR_ILLEGAL_INPUT; 1.338 + break; 1.339 + } 1.340 + *out++ = UCS2_REPLACEMENT_CHAR; 1.341 + mState = 0; 1.342 + mFirst = false; 1.343 + } 1.344 + } 1.345 + } 1.346 + 1.347 + // output not finished, output buffer too short 1.348 + if ((NS_OK == res) && (in < inend) && (out >= outend)) 1.349 + res = NS_OK_UDEC_MOREOUTPUT; 1.350 + 1.351 + // last UCS4 is incomplete, make sure the caller 1.352 + // returns with properly aligned continuation of the buffer 1.353 + if ((NS_OK == res) && (mState != 0)) 1.354 + res = NS_OK_UDEC_MOREINPUT; 1.355 + 1.356 + *aSrcLength = in - aSrc; 1.357 + *aDestLength = out - aDest; 1.358 + 1.359 + this->mUcs4 = mUcs4; 1.360 + this->mState = mState; 1.361 + this->mBytes = mBytes; 1.362 + this->mFirst = mFirst; 1.363 + 1.364 + return(res); 1.365 +}