intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,350 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nsUTF16ToUnicode.h"
    1.10 +#include "nsCharTraits.h"
    1.11 +#include "mozilla/Endian.h"
    1.12 +
    1.13 +enum {
    1.14 +  STATE_NORMAL = 0,
    1.15 +  STATE_HALF_CODE_POINT = 1,
    1.16 +  STATE_FIRST_CALL = 2,
    1.17 +  STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
    1.18 +  STATE_ODD_SURROGATE_PAIR = 4
    1.19 +};
    1.20 +
    1.21 +nsresult
    1.22 +nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
    1.23 +                                            int32_t * aSrcLength,
    1.24 +                                            char16_t * aDest,
    1.25 +                                            int32_t * aDestLength,
    1.26 +                                            bool aSwapBytes)
    1.27 +{
    1.28 +  const char* src = aSrc;
    1.29 +  const char* srcEnd = aSrc + *aSrcLength;
    1.30 +  char16_t* dest = aDest;
    1.31 +  char16_t* destEnd = aDest + *aDestLength;
    1.32 +  char16_t oddHighSurrogate;
    1.33 +
    1.34 +  switch(mState) {
    1.35 +    case STATE_FIRST_CALL:
    1.36 +      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
    1.37 +      src+=2;
    1.38 +      mState = STATE_NORMAL;
    1.39 +      break;
    1.40 +
    1.41 +    case STATE_SECOND_BYTE:
    1.42 +      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
    1.43 +      src++;
    1.44 +      mState = STATE_NORMAL;
    1.45 +      break;
    1.46 +
    1.47 +    case STATE_ODD_SURROGATE_PAIR:
    1.48 +      if (*aDestLength < 2)
    1.49 +        goto error;
    1.50 +      else {
    1.51 +        *dest++ = mOddHighSurrogate;
    1.52 +        *dest++ = mOddLowSurrogate;
    1.53 +        mOddHighSurrogate = mOddLowSurrogate = 0;
    1.54 +        mState = STATE_NORMAL;
    1.55 +      }
    1.56 +      break;
    1.57 +
    1.58 +    case STATE_NORMAL:
    1.59 +    case STATE_HALF_CODE_POINT:
    1.60 +    default:
    1.61 +      break;
    1.62 +  }
    1.63 +
    1.64 +  oddHighSurrogate = mOddHighSurrogate;
    1.65 +
    1.66 +  if (src == srcEnd) {
    1.67 +    *aDestLength = dest - aDest;
    1.68 +    return (mState != STATE_NORMAL || oddHighSurrogate) ?
    1.69 +           NS_OK_UDEC_MOREINPUT : NS_OK;
    1.70 +  }
    1.71 +
    1.72 +  const char* srcEvenEnd;
    1.73 +
    1.74 +  char16_t u;
    1.75 +  if (mState == STATE_HALF_CODE_POINT) {
    1.76 +    if (dest == destEnd)
    1.77 +      goto error;
    1.78 +
    1.79 +    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
    1.80 +    // previous run while the 2nd byte has to come from |*src|.
    1.81 +    mState = STATE_NORMAL;
    1.82 +#if MOZ_BIG_ENDIAN
    1.83 +    u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
    1.84 +#else
    1.85 +    u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
    1.86 +#endif
    1.87 +    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    1.88 +    goto have_codepoint;
    1.89 +  } else {
    1.90 +    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    1.91 +  }
    1.92 +
    1.93 +  while (src != srcEvenEnd) {
    1.94 +    if (dest == destEnd)
    1.95 +      goto error;
    1.96 +
    1.97 +#if !defined(__sparc__) && !defined(__arm__)
    1.98 +    u = *(const char16_t*)src;
    1.99 +#else
   1.100 +    memcpy(&u, src, 2);
   1.101 +#endif
   1.102 +    src += 2;
   1.103 +
   1.104 +have_codepoint:
   1.105 +    if (aSwapBytes)
   1.106 +      u = u << 8 | u >> 8;
   1.107 +
   1.108 +    if (!IS_SURROGATE(u)) {
   1.109 +      if (oddHighSurrogate) {
   1.110 +        if (mErrBehavior == kOnError_Signal) {
   1.111 +          goto error2;
   1.112 +        }
   1.113 +        *dest++ = UCS2_REPLACEMENT_CHAR;
   1.114 +        if (dest == destEnd)
   1.115 +          goto error;
   1.116 +        oddHighSurrogate = 0;
   1.117 +      }
   1.118 +      *dest++ = u;
   1.119 +    } else if (NS_IS_HIGH_SURROGATE(u)) {
   1.120 +      if (oddHighSurrogate) {
   1.121 +        if (mErrBehavior == kOnError_Signal) {
   1.122 +          goto error2;
   1.123 +        }
   1.124 +        *dest++ = UCS2_REPLACEMENT_CHAR;
   1.125 +        if (dest == destEnd)
   1.126 +          goto error;
   1.127 +      }
   1.128 +      oddHighSurrogate = u;
   1.129 +    }
   1.130 +    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
   1.131 +      if (oddHighSurrogate && *aDestLength > 1) {
   1.132 +        if (dest + 1 >= destEnd) {
   1.133 +          mOddLowSurrogate = u;
   1.134 +          mOddHighSurrogate = oddHighSurrogate;
   1.135 +          mState = STATE_ODD_SURROGATE_PAIR;
   1.136 +          goto error;
   1.137 +        }
   1.138 +        *dest++ = oddHighSurrogate;
   1.139 +        *dest++ = u;
   1.140 +      } else {
   1.141 +        if (mErrBehavior == kOnError_Signal) {
   1.142 +          goto error2;
   1.143 +        }
   1.144 +        *dest++ = UCS2_REPLACEMENT_CHAR;
   1.145 +      }
   1.146 +      oddHighSurrogate = 0;
   1.147 +    }
   1.148 +  }
   1.149 +  if (src != srcEnd) {
   1.150 +    // store the lead byte of a 16-bit unit for the next run.
   1.151 +    mOddByte = *src++;
   1.152 +    mState = STATE_HALF_CODE_POINT;
   1.153 +  }
   1.154 +
   1.155 +  mOddHighSurrogate = oddHighSurrogate;
   1.156 +
   1.157 +  *aDestLength = dest - aDest;
   1.158 +  *aSrcLength =  src  - aSrc; 
   1.159 +  return (mState != STATE_NORMAL || oddHighSurrogate) ?
   1.160 +         NS_OK_UDEC_MOREINPUT : NS_OK;
   1.161 +
   1.162 +error:
   1.163 +  *aDestLength = dest - aDest;
   1.164 +  *aSrcLength =  src  - aSrc; 
   1.165 +  return  NS_OK_UDEC_MOREOUTPUT;
   1.166 +
   1.167 +error2:
   1.168 +  *aDestLength = dest - aDest;
   1.169 +  *aSrcLength = --src - aSrc; 
   1.170 +  return  NS_ERROR_ILLEGAL_INPUT;
   1.171 +}
   1.172 +
   1.173 +NS_IMETHODIMP
   1.174 +nsUTF16ToUnicodeBase::Reset()
   1.175 +{
   1.176 +  mState = STATE_FIRST_CALL;
   1.177 +  mOddByte = 0;
   1.178 +  mOddHighSurrogate = 0;
   1.179 +  mOddLowSurrogate = 0;
   1.180 +  return NS_OK;
   1.181 +}
   1.182 +
   1.183 +NS_IMETHODIMP
   1.184 +nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, 
   1.185 +                                   int32_t * aDestLength)
   1.186 +{
   1.187 +  // the left-over data of the previous run have to be taken into account.
   1.188 +  *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2;
   1.189 +  if (mOddHighSurrogate)
   1.190 +    (*aDestLength)++;
   1.191 +  if (mOddLowSurrogate)
   1.192 +    (*aDestLength)++;
   1.193 +  return NS_OK;
   1.194 +}
   1.195 +
   1.196 +
   1.197 +NS_IMETHODIMP
   1.198 +nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   1.199 +                            char16_t * aDest, int32_t * aDestLength)
   1.200 +{
   1.201 +  switch (mState) {
   1.202 +    case STATE_FIRST_CALL:
   1.203 +      if (*aSrcLength < 2) {
   1.204 +        if (*aSrcLength < 1) {
   1.205 +          *aDestLength = 0;
   1.206 +          return NS_OK;
   1.207 +        }
   1.208 +        if (uint8_t(*aSrc) != 0xFE) {
   1.209 +          mState = STATE_NORMAL;
   1.210 +          break;
   1.211 +        }
   1.212 +        *aDestLength = 0;
   1.213 +        mState = STATE_SECOND_BYTE;
   1.214 +        return NS_OK_UDEC_MOREINPUT;
   1.215 +      }
   1.216 +#if MOZ_LITTLE_ENDIAN
   1.217 +      // on LE machines, BE BOM is 0xFFFE
   1.218 +      if (0xFFFE != *((char16_t*)aSrc)) {
   1.219 +        mState = STATE_NORMAL;
   1.220 +      }
   1.221 +#else
   1.222 +      if (0xFEFF != *((char16_t*)aSrc)) {
   1.223 +        mState = STATE_NORMAL;
   1.224 +      }
   1.225 +#endif
   1.226 +      break;
   1.227 +
   1.228 +    case STATE_SECOND_BYTE:
   1.229 +      if (*aSrcLength < 1) {
   1.230 +        *aDestLength = 0;
   1.231 +        return NS_OK_UDEC_MOREINPUT;
   1.232 +      }
   1.233 +      if (uint8_t(*aSrc) != 0xFF) {
   1.234 +        mOddByte = 0xFE;
   1.235 +        mState = STATE_HALF_CODE_POINT;
   1.236 +      }
   1.237 +      break;
   1.238 +  }
   1.239 +
   1.240 +  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   1.241 +                               bool(MOZ_LITTLE_ENDIAN));
   1.242 +}
   1.243 +
   1.244 +NS_IMETHODIMP
   1.245 +nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   1.246 +                            char16_t * aDest, int32_t * aDestLength)
   1.247 +{
   1.248 +  switch (mState) {
   1.249 +    case STATE_FIRST_CALL:
   1.250 +      if (*aSrcLength < 2) {
   1.251 +        if (*aSrcLength < 1) {
   1.252 +          *aDestLength = 0;
   1.253 +          return NS_OK;
   1.254 +        }
   1.255 +        if (uint8_t(*aSrc) != 0xFF) {
   1.256 +          mState = STATE_NORMAL;
   1.257 +          break;
   1.258 +        }
   1.259 +        *aDestLength = 0;
   1.260 +        mState = STATE_SECOND_BYTE;
   1.261 +        return NS_OK_UDEC_MOREINPUT;
   1.262 +      }
   1.263 +#if MOZ_BIG_ENDIAN
   1.264 +      // on BE machines, LE BOM is 0xFFFE
   1.265 +      if (0xFFFE != *((char16_t*)aSrc)) {
   1.266 +        mState = STATE_NORMAL;
   1.267 +      }
   1.268 +#else
   1.269 +      if (0xFEFF != *((char16_t*)aSrc)) {
   1.270 +        mState = STATE_NORMAL;
   1.271 +      }
   1.272 +#endif
   1.273 +      break;
   1.274 +
   1.275 +    case STATE_SECOND_BYTE:
   1.276 +      if (*aSrcLength < 1) {
   1.277 +        *aDestLength = 0;
   1.278 +        return NS_OK_UDEC_MOREINPUT;
   1.279 +      }
   1.280 +      if (uint8_t(*aSrc) != 0xFE) {
   1.281 +        mOddByte = 0xFF;
   1.282 +        mState = STATE_HALF_CODE_POINT;
   1.283 +      }
   1.284 +      break;
   1.285 +  }
   1.286 +
   1.287 +  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   1.288 +                               bool(MOZ_BIG_ENDIAN));
   1.289 +}
   1.290 +
   1.291 +NS_IMETHODIMP
   1.292 +nsUTF16ToUnicode::Reset()
   1.293 +{
   1.294 +  mEndian = kUnknown;
   1.295 +  mFoundBOM = false;
   1.296 +  return nsUTF16ToUnicodeBase::Reset();
   1.297 +}
   1.298 +
   1.299 +NS_IMETHODIMP
   1.300 +nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   1.301 +                          char16_t * aDest, int32_t * aDestLength)
   1.302 +{
   1.303 +    if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
   1.304 +    {
   1.305 +      nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
   1.306 +      *aSrcLength=0;
   1.307 +      *aDestLength=0;
   1.308 +      return res;
   1.309 +    }
   1.310 +    if(STATE_FIRST_CALL == mState) // first time called
   1.311 +    {
   1.312 +      // check if BOM (0xFEFF) is at the beginning, remove it if found, and
   1.313 +      // set mEndian accordingly.
   1.314 +      if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
   1.315 +        mEndian = kLittleEndian;
   1.316 +        mFoundBOM = true;
   1.317 +      }
   1.318 +      else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
   1.319 +        mEndian = kBigEndian;
   1.320 +        mFoundBOM = true;
   1.321 +      }
   1.322 +      // BOM is not found, but we can use a simple heuristic to determine
   1.323 +      // the endianness. Assume the first character is [U+0001, U+00FF].
   1.324 +      // Not always valid, but it's very likely to hold for html/xml/css. 
   1.325 +      else if(!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
   1.326 +        mState = STATE_NORMAL;
   1.327 +        mEndian = kBigEndian;
   1.328 +      }
   1.329 +      else if(aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
   1.330 +        mState = STATE_NORMAL;
   1.331 +        mEndian = kLittleEndian;
   1.332 +      }
   1.333 +      else { // Neither BOM nor 'plausible' byte patterns at the beginning.
   1.334 +             // Just assume it's BE (following Unicode standard)
   1.335 +             // and let the garbage show up in the browser. (security concern?)
   1.336 +             // (bug 246194)
   1.337 +        mState = STATE_NORMAL;
   1.338 +        mEndian = kBigEndian;
   1.339 +      }
   1.340 +    }
   1.341 +    
   1.342 +    nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   1.343 +#if MOZ_BIG_ENDIAN
   1.344 +                                        (mEndian == kLittleEndian)
   1.345 +#else
   1.346 +                                        (mEndian == kBigEndian)
   1.347 +#endif
   1.348 +                                        );
   1.349 +
   1.350 +    // If BOM is not found and we're to return NS_OK, signal that BOM
   1.351 +    // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
   1.352 +    return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
   1.353 +}

mercurial