1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,350 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsUTF16ToUnicode.h" 1.10 +#include "nsCharTraits.h" 1.11 +#include "mozilla/Endian.h" 1.12 + 1.13 +enum { 1.14 + STATE_NORMAL = 0, 1.15 + STATE_HALF_CODE_POINT = 1, 1.16 + STATE_FIRST_CALL = 2, 1.17 + STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT, 1.18 + STATE_ODD_SURROGATE_PAIR = 4 1.19 +}; 1.20 + 1.21 +nsresult 1.22 +nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc, 1.23 + int32_t * aSrcLength, 1.24 + char16_t * aDest, 1.25 + int32_t * aDestLength, 1.26 + bool aSwapBytes) 1.27 +{ 1.28 + const char* src = aSrc; 1.29 + const char* srcEnd = aSrc + *aSrcLength; 1.30 + char16_t* dest = aDest; 1.31 + char16_t* destEnd = aDest + *aDestLength; 1.32 + char16_t oddHighSurrogate; 1.33 + 1.34 + switch(mState) { 1.35 + case STATE_FIRST_CALL: 1.36 + NS_ASSERTION(*aSrcLength > 1, "buffer too short"); 1.37 + src+=2; 1.38 + mState = STATE_NORMAL; 1.39 + break; 1.40 + 1.41 + case STATE_SECOND_BYTE: 1.42 + NS_ASSERTION(*aSrcLength > 0, "buffer too short"); 1.43 + src++; 1.44 + mState = STATE_NORMAL; 1.45 + break; 1.46 + 1.47 + case STATE_ODD_SURROGATE_PAIR: 1.48 + if (*aDestLength < 2) 1.49 + goto error; 1.50 + else { 1.51 + *dest++ = mOddHighSurrogate; 1.52 + *dest++ = mOddLowSurrogate; 1.53 + mOddHighSurrogate = mOddLowSurrogate = 0; 1.54 + mState = STATE_NORMAL; 1.55 + } 1.56 + break; 1.57 + 1.58 + case STATE_NORMAL: 1.59 + case STATE_HALF_CODE_POINT: 1.60 + default: 1.61 + break; 1.62 + } 1.63 + 1.64 + oddHighSurrogate = mOddHighSurrogate; 1.65 + 1.66 + if (src == srcEnd) { 1.67 + *aDestLength = dest - aDest; 1.68 + return (mState != STATE_NORMAL || oddHighSurrogate) ? 1.69 + NS_OK_UDEC_MOREINPUT : NS_OK; 1.70 + } 1.71 + 1.72 + const char* srcEvenEnd; 1.73 + 1.74 + char16_t u; 1.75 + if (mState == STATE_HALF_CODE_POINT) { 1.76 + if (dest == destEnd) 1.77 + goto error; 1.78 + 1.79 + // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the 1.80 + // previous run while the 2nd byte has to come from |*src|. 1.81 + mState = STATE_NORMAL; 1.82 +#if MOZ_BIG_ENDIAN 1.83 + u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte. 1.84 +#else 1.85 + u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte. 1.86 +#endif 1.87 + srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop 1.88 + goto have_codepoint; 1.89 + } else { 1.90 + srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop 1.91 + } 1.92 + 1.93 + while (src != srcEvenEnd) { 1.94 + if (dest == destEnd) 1.95 + goto error; 1.96 + 1.97 +#if !defined(__sparc__) && !defined(__arm__) 1.98 + u = *(const char16_t*)src; 1.99 +#else 1.100 + memcpy(&u, src, 2); 1.101 +#endif 1.102 + src += 2; 1.103 + 1.104 +have_codepoint: 1.105 + if (aSwapBytes) 1.106 + u = u << 8 | u >> 8; 1.107 + 1.108 + if (!IS_SURROGATE(u)) { 1.109 + if (oddHighSurrogate) { 1.110 + if (mErrBehavior == kOnError_Signal) { 1.111 + goto error2; 1.112 + } 1.113 + *dest++ = UCS2_REPLACEMENT_CHAR; 1.114 + if (dest == destEnd) 1.115 + goto error; 1.116 + oddHighSurrogate = 0; 1.117 + } 1.118 + *dest++ = u; 1.119 + } else if (NS_IS_HIGH_SURROGATE(u)) { 1.120 + if (oddHighSurrogate) { 1.121 + if (mErrBehavior == kOnError_Signal) { 1.122 + goto error2; 1.123 + } 1.124 + *dest++ = UCS2_REPLACEMENT_CHAR; 1.125 + if (dest == destEnd) 1.126 + goto error; 1.127 + } 1.128 + oddHighSurrogate = u; 1.129 + } 1.130 + else /* if (NS_IS_LOW_SURROGATE(u)) */ { 1.131 + if (oddHighSurrogate && *aDestLength > 1) { 1.132 + if (dest + 1 >= destEnd) { 1.133 + mOddLowSurrogate = u; 1.134 + mOddHighSurrogate = oddHighSurrogate; 1.135 + mState = STATE_ODD_SURROGATE_PAIR; 1.136 + goto error; 1.137 + } 1.138 + *dest++ = oddHighSurrogate; 1.139 + *dest++ = u; 1.140 + } else { 1.141 + if (mErrBehavior == kOnError_Signal) { 1.142 + goto error2; 1.143 + } 1.144 + *dest++ = UCS2_REPLACEMENT_CHAR; 1.145 + } 1.146 + oddHighSurrogate = 0; 1.147 + } 1.148 + } 1.149 + if (src != srcEnd) { 1.150 + // store the lead byte of a 16-bit unit for the next run. 1.151 + mOddByte = *src++; 1.152 + mState = STATE_HALF_CODE_POINT; 1.153 + } 1.154 + 1.155 + mOddHighSurrogate = oddHighSurrogate; 1.156 + 1.157 + *aDestLength = dest - aDest; 1.158 + *aSrcLength = src - aSrc; 1.159 + return (mState != STATE_NORMAL || oddHighSurrogate) ? 1.160 + NS_OK_UDEC_MOREINPUT : NS_OK; 1.161 + 1.162 +error: 1.163 + *aDestLength = dest - aDest; 1.164 + *aSrcLength = src - aSrc; 1.165 + return NS_OK_UDEC_MOREOUTPUT; 1.166 + 1.167 +error2: 1.168 + *aDestLength = dest - aDest; 1.169 + *aSrcLength = --src - aSrc; 1.170 + return NS_ERROR_ILLEGAL_INPUT; 1.171 +} 1.172 + 1.173 +NS_IMETHODIMP 1.174 +nsUTF16ToUnicodeBase::Reset() 1.175 +{ 1.176 + mState = STATE_FIRST_CALL; 1.177 + mOddByte = 0; 1.178 + mOddHighSurrogate = 0; 1.179 + mOddLowSurrogate = 0; 1.180 + return NS_OK; 1.181 +} 1.182 + 1.183 +NS_IMETHODIMP 1.184 +nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, 1.185 + int32_t * aDestLength) 1.186 +{ 1.187 + // the left-over data of the previous run have to be taken into account. 1.188 + *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2; 1.189 + if (mOddHighSurrogate) 1.190 + (*aDestLength)++; 1.191 + if (mOddLowSurrogate) 1.192 + (*aDestLength)++; 1.193 + return NS_OK; 1.194 +} 1.195 + 1.196 + 1.197 +NS_IMETHODIMP 1.198 +nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, 1.199 + char16_t * aDest, int32_t * aDestLength) 1.200 +{ 1.201 + switch (mState) { 1.202 + case STATE_FIRST_CALL: 1.203 + if (*aSrcLength < 2) { 1.204 + if (*aSrcLength < 1) { 1.205 + *aDestLength = 0; 1.206 + return NS_OK; 1.207 + } 1.208 + if (uint8_t(*aSrc) != 0xFE) { 1.209 + mState = STATE_NORMAL; 1.210 + break; 1.211 + } 1.212 + *aDestLength = 0; 1.213 + mState = STATE_SECOND_BYTE; 1.214 + return NS_OK_UDEC_MOREINPUT; 1.215 + } 1.216 +#if MOZ_LITTLE_ENDIAN 1.217 + // on LE machines, BE BOM is 0xFFFE 1.218 + if (0xFFFE != *((char16_t*)aSrc)) { 1.219 + mState = STATE_NORMAL; 1.220 + } 1.221 +#else 1.222 + if (0xFEFF != *((char16_t*)aSrc)) { 1.223 + mState = STATE_NORMAL; 1.224 + } 1.225 +#endif 1.226 + break; 1.227 + 1.228 + case STATE_SECOND_BYTE: 1.229 + if (*aSrcLength < 1) { 1.230 + *aDestLength = 0; 1.231 + return NS_OK_UDEC_MOREINPUT; 1.232 + } 1.233 + if (uint8_t(*aSrc) != 0xFF) { 1.234 + mOddByte = 0xFE; 1.235 + mState = STATE_HALF_CODE_POINT; 1.236 + } 1.237 + break; 1.238 + } 1.239 + 1.240 + return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, 1.241 + bool(MOZ_LITTLE_ENDIAN)); 1.242 +} 1.243 + 1.244 +NS_IMETHODIMP 1.245 +nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, 1.246 + char16_t * aDest, int32_t * aDestLength) 1.247 +{ 1.248 + switch (mState) { 1.249 + case STATE_FIRST_CALL: 1.250 + if (*aSrcLength < 2) { 1.251 + if (*aSrcLength < 1) { 1.252 + *aDestLength = 0; 1.253 + return NS_OK; 1.254 + } 1.255 + if (uint8_t(*aSrc) != 0xFF) { 1.256 + mState = STATE_NORMAL; 1.257 + break; 1.258 + } 1.259 + *aDestLength = 0; 1.260 + mState = STATE_SECOND_BYTE; 1.261 + return NS_OK_UDEC_MOREINPUT; 1.262 + } 1.263 +#if MOZ_BIG_ENDIAN 1.264 + // on BE machines, LE BOM is 0xFFFE 1.265 + if (0xFFFE != *((char16_t*)aSrc)) { 1.266 + mState = STATE_NORMAL; 1.267 + } 1.268 +#else 1.269 + if (0xFEFF != *((char16_t*)aSrc)) { 1.270 + mState = STATE_NORMAL; 1.271 + } 1.272 +#endif 1.273 + break; 1.274 + 1.275 + case STATE_SECOND_BYTE: 1.276 + if (*aSrcLength < 1) { 1.277 + *aDestLength = 0; 1.278 + return NS_OK_UDEC_MOREINPUT; 1.279 + } 1.280 + if (uint8_t(*aSrc) != 0xFE) { 1.281 + mOddByte = 0xFF; 1.282 + mState = STATE_HALF_CODE_POINT; 1.283 + } 1.284 + break; 1.285 + } 1.286 + 1.287 + return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, 1.288 + bool(MOZ_BIG_ENDIAN)); 1.289 +} 1.290 + 1.291 +NS_IMETHODIMP 1.292 +nsUTF16ToUnicode::Reset() 1.293 +{ 1.294 + mEndian = kUnknown; 1.295 + mFoundBOM = false; 1.296 + return nsUTF16ToUnicodeBase::Reset(); 1.297 +} 1.298 + 1.299 +NS_IMETHODIMP 1.300 +nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, 1.301 + char16_t * aDest, int32_t * aDestLength) 1.302 +{ 1.303 + if(STATE_FIRST_CALL == mState && *aSrcLength < 2) 1.304 + { 1.305 + nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT; 1.306 + *aSrcLength=0; 1.307 + *aDestLength=0; 1.308 + return res; 1.309 + } 1.310 + if(STATE_FIRST_CALL == mState) // first time called 1.311 + { 1.312 + // check if BOM (0xFEFF) is at the beginning, remove it if found, and 1.313 + // set mEndian accordingly. 1.314 + if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) { 1.315 + mEndian = kLittleEndian; 1.316 + mFoundBOM = true; 1.317 + } 1.318 + else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) { 1.319 + mEndian = kBigEndian; 1.320 + mFoundBOM = true; 1.321 + } 1.322 + // BOM is not found, but we can use a simple heuristic to determine 1.323 + // the endianness. Assume the first character is [U+0001, U+00FF]. 1.324 + // Not always valid, but it's very likely to hold for html/xml/css. 1.325 + else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00) 1.326 + mState = STATE_NORMAL; 1.327 + mEndian = kBigEndian; 1.328 + } 1.329 + else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00) 1.330 + mState = STATE_NORMAL; 1.331 + mEndian = kLittleEndian; 1.332 + } 1.333 + else { // Neither BOM nor 'plausible' byte patterns at the beginning. 1.334 + // Just assume it's BE (following Unicode standard) 1.335 + // and let the garbage show up in the browser. (security concern?) 1.336 + // (bug 246194) 1.337 + mState = STATE_NORMAL; 1.338 + mEndian = kBigEndian; 1.339 + } 1.340 + } 1.341 + 1.342 + nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, 1.343 +#if MOZ_BIG_ENDIAN 1.344 + (mEndian == kLittleEndian) 1.345 +#else 1.346 + (mEndian == kBigEndian) 1.347 +#endif 1.348 + ); 1.349 + 1.350 + // If BOM is not found and we're to return NS_OK, signal that BOM 1.351 + // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode| 1.352 + return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv; 1.353 +}