intl/unicharutil/src/nsSaveAsCharset.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/src/nsSaveAsCharset.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,365 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* vim: set ts=8 sts=2 et sw=2 tw=80: */
     1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +
    1.11 +#include "prmem.h"
    1.12 +#include "prprf.h"
    1.13 +#include "nsICharsetConverterManager.h"
    1.14 +#include "nsSaveAsCharset.h"
    1.15 +#include "nsWhitespaceTokenizer.h"
    1.16 +#include "nsServiceManagerUtils.h"
    1.17 +
    1.18 +//
    1.19 +// nsISupports methods
    1.20 +//
    // Expands to the nsISupports method implementations for nsSaveAsCharset,
    // with nsISaveAsCharset as the one interface listed.
    NS_IMPL_ISUPPORTS(nsSaveAsCharset, nsISaveAsCharset)
    1.22 +
    1.23 +//
    1.24 +// nsSaveAsCharset
    1.25 +//
    1.26 +nsSaveAsCharset::nsSaveAsCharset()
    1.27 +{
    1.28 +  mAttribute = attr_htmlTextDefault;
    1.29 +  mEntityVersion = 0;
    1.30 +  mCharsetListIndex = -1;
    1.31 +}
    1.32 +
    1.33 +nsSaveAsCharset::~nsSaveAsCharset()
    1.34 +{
    1.35 +}
    1.36 +
    1.37 +NS_IMETHODIMP
    1.38 +nsSaveAsCharset::Init(const char *charset, uint32_t attr, uint32_t entityVersion)
    1.39 +{
    1.40 +  nsresult rv = NS_OK;
    1.41 +
    1.42 +  mAttribute = attr;
    1.43 +  mEntityVersion = entityVersion;
    1.44 +
    1.45 +  rv = SetupCharsetList(charset);
    1.46 +  NS_ENSURE_SUCCESS(rv, rv);
    1.47 +
    1.48 +  // set up unicode encoder
    1.49 +  rv = SetupUnicodeEncoder(GetNextCharset());
    1.50 +  NS_ENSURE_SUCCESS(rv, rv);
    1.51 +
    1.52 +  // set up entity converter
    1.53 +  if (attr_EntityNone != MASK_ENTITY(mAttribute) && !mEntityConverter)
    1.54 +    mEntityConverter = do_CreateInstance(NS_ENTITYCONVERTER_CONTRACTID, &rv);
    1.55 +
    1.56 +  return rv;
    1.57 +}
    1.58 +
    1.59 +NS_IMETHODIMP
    1.60 +nsSaveAsCharset::Convert(const char16_t *inString, char **_retval)
    1.61 +{
    1.62 +  NS_ENSURE_ARG_POINTER(_retval);
    1.63 +  NS_ENSURE_ARG_POINTER(inString);
    1.64 +  if (0 == *inString)
    1.65 +    return NS_ERROR_ILLEGAL_VALUE;
    1.66 +  nsresult rv = NS_OK;
    1.67 +
    1.68 +  NS_ASSERTION(mEncoder, "need to call Init() before Convert()");
    1.69 +  NS_ENSURE_TRUE(mEncoder, NS_ERROR_FAILURE);
    1.70 +
    1.71 +  *_retval = nullptr;
    1.72 +
    1.73 +  // make sure to start from the first charset in the list
    1.74 +  if (mCharsetListIndex > 0) {
    1.75 +    mCharsetListIndex = -1;
    1.76 +    rv = SetupUnicodeEncoder(GetNextCharset());
    1.77 +    NS_ENSURE_SUCCESS(rv, rv);
    1.78 +  }
    1.79 +
    1.80 +  do {
    1.81 +    // fallback to the next charset in the list if the last conversion failed by an unmapped character
    1.82 +    if (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv) {
    1.83 +      const char * charset = GetNextCharset();
    1.84 +      if (!charset)
    1.85 +        break;
    1.86 +      rv = SetupUnicodeEncoder(charset);
    1.87 +      NS_ENSURE_SUCCESS(rv, rv);
    1.88 +      PR_FREEIF(*_retval);
    1.89 +    }
    1.90 +
    1.91 +    if (attr_EntityBeforeCharsetConv == MASK_ENTITY(mAttribute)) {
    1.92 +      NS_ASSERTION(mEntityConverter, "need to call Init() before Convert()");
    1.93 +      NS_ENSURE_TRUE(mEntityConverter, NS_ERROR_FAILURE);
    1.94 +      char16_t *entity = nullptr;
    1.95 +      // do the entity conversion first
    1.96 +      rv = mEntityConverter->ConvertToEntities(inString, mEntityVersion, &entity);
    1.97 +      if(NS_SUCCEEDED(rv)) {
    1.98 +        rv = DoCharsetConversion(entity, _retval);
    1.99 +        nsMemory::Free(entity);
   1.100 +      }
   1.101 +    }
   1.102 +    else
   1.103 +      rv = DoCharsetConversion(inString, _retval);
   1.104 +
   1.105 +  } while (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv);
   1.106 +
   1.107 +  return rv;
   1.108 +}
   1.109 +
   1.110 +NS_IMETHODIMP 
   1.111 +nsSaveAsCharset::GetCharset(char * *aCharset)
   1.112 +{
   1.113 +  NS_ENSURE_ARG(aCharset);
   1.114 +  NS_ASSERTION(mCharsetListIndex >= 0, "need to call Init() first");
   1.115 +  NS_ENSURE_TRUE(mCharsetListIndex >= 0, NS_ERROR_FAILURE);
   1.116 +
   1.117 +  const char* charset = mCharsetList[mCharsetListIndex].get();
   1.118 +  if (!charset) {
   1.119 +    *aCharset = nullptr;
   1.120 +    NS_ASSERTION(charset, "make sure to call Init() with non empty charset list");
   1.121 +    return NS_ERROR_FAILURE;
   1.122 +  }
   1.123 +
   1.124 +  *aCharset = strdup(charset);
   1.125 +  return (*aCharset) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
   1.126 +}
   1.127 +
   1.128 +/////////////////////////////////////////////////////////////////////////////////////////
   1.129 +
   1.130 +#define RESERVE_FALLBACK_BYTES 512
   1.131 +
   1.132 +// do the fallback, reallocate the buffer if necessary
   1.133 +// need to pass destination buffer info (size, current position and estimation of rest of the conversion)
   1.134 +NS_IMETHODIMP
   1.135 +nsSaveAsCharset::HandleFallBack(uint32_t character, char **outString, int32_t *bufferLength, 
   1.136 +                                int32_t *currentPos, int32_t estimatedLength)
   1.137 +{
   1.138 +  NS_ENSURE_ARG_POINTER(outString);
   1.139 +  NS_ENSURE_ARG_POINTER(bufferLength);
   1.140 +  NS_ENSURE_ARG_POINTER(currentPos);
   1.141 +
   1.142 +  char fallbackStr[256];
   1.143 +  nsresult rv = DoConversionFallBack(character, fallbackStr, 256);
   1.144 +  if (NS_SUCCEEDED(rv)) {
   1.145 +    int32_t tempLen = (int32_t) strlen(fallbackStr);
   1.146 +
   1.147 +    // reallocate if the buffer is not large enough
   1.148 +    if ((tempLen + estimatedLength) >= (*bufferLength - *currentPos)) {
   1.149 +      int32_t addLength = tempLen + RESERVE_FALLBACK_BYTES;
   1.150 +      // + 1 is for the terminating NUL, don't add that to bufferLength
   1.151 +      char *temp = (char *) PR_Realloc(*outString, *bufferLength + addLength + 1);
   1.152 +      if (temp) {
   1.153 +        // adjust length/pointer after realloc
   1.154 +        *bufferLength += addLength;
   1.155 +        *outString = temp;
   1.156 +      } else {
   1.157 +        *outString = nullptr;
   1.158 +        *bufferLength = 0;
   1.159 +        return NS_ERROR_OUT_OF_MEMORY;
   1.160 +      }
   1.161 +    }
   1.162 +    memcpy((*outString + *currentPos), fallbackStr, tempLen);
   1.163 +    *currentPos += tempLen;
   1.164 +  }
   1.165 +  return rv;
   1.166 +}
   1.167 +
   1.168 +NS_IMETHODIMP
   1.169 +nsSaveAsCharset::DoCharsetConversion(const char16_t *inString, char **outString)
   1.170 +{
   1.171 +  NS_ENSURE_ARG_POINTER(outString);
   1.172 +
   1.173 +  *outString = nullptr;
   1.174 +
   1.175 +  nsresult rv;
   1.176 +  int32_t inStringLength = NS_strlen(inString);       // original input string length
   1.177 +  int32_t bufferLength;                               // allocated buffer length
   1.178 +  int32_t srcLength = inStringLength;
   1.179 +  int32_t dstLength;
   1.180 +  int32_t pos1, pos2;
   1.181 +  nsresult saveResult = NS_OK;                         // to remember NS_ERROR_UENC_NOMAPPING
   1.182 +
   1.183 +  // estimate and allocate the target buffer (reserve extra memory for fallback)
   1.184 +  rv = mEncoder->GetMaxLength(inString, inStringLength, &dstLength);
   1.185 +  if (NS_FAILED(rv)) return rv;
   1.186 +
   1.187 +  bufferLength = dstLength + RESERVE_FALLBACK_BYTES; // extra bytes for fallback
   1.188 +  // + 1 is for the terminating NUL -- we don't add that to bufferLength so that
   1.189 +  // we can always write dstPtr[pos2] = '\0' even when the encoder filled the
   1.190 +  // buffer.
   1.191 +  char *dstPtr = (char *) PR_Malloc(bufferLength + 1);
   1.192 +  if (!dstPtr) {
   1.193 +    return NS_ERROR_OUT_OF_MEMORY;
   1.194 +  }
   1.195 +  
   1.196 +  for (pos1 = 0, pos2 = 0; pos1 < inStringLength;) {
   1.197 +    // convert from unicode
   1.198 +    dstLength = bufferLength - pos2;
   1.199 +    NS_ASSERTION(dstLength >= 0, "out of bounds write");
   1.200 +    rv = mEncoder->Convert(&inString[pos1], &srcLength, &dstPtr[pos2], &dstLength);
   1.201 +
   1.202 +    pos1 += srcLength ? srcLength : 1;
   1.203 +    pos2 += dstLength;
   1.204 +    dstPtr[pos2] = '\0';
   1.205 +
   1.206 +    // break: this is usually the case (no error) OR unrecoverable error
   1.207 +    if (NS_ERROR_UENC_NOMAPPING != rv) break;
   1.208 +
   1.209 +    // remember this happened and reset the result
   1.210 +    saveResult = rv;
   1.211 +    rv = NS_OK;
   1.212 +
   1.213 +    // finish encoder, give it a chance to write extra data like escape sequences
   1.214 +    dstLength = bufferLength - pos2;
   1.215 +    rv = mEncoder->Finish(&dstPtr[pos2], &dstLength);
   1.216 +    if (NS_SUCCEEDED(rv)) {
   1.217 +      pos2 += dstLength;
   1.218 +      dstPtr[pos2] = '\0';
   1.219 +    }
   1.220 +
   1.221 +    srcLength = inStringLength - pos1;
   1.222 +
   1.223 +    // do the fallback
   1.224 +    if (!ATTR_NO_FALLBACK(mAttribute)) {
   1.225 +      uint32_t unMappedChar;
   1.226 +      if (NS_IS_HIGH_SURROGATE(inString[pos1-1]) && 
   1.227 +          inStringLength > pos1 && NS_IS_LOW_SURROGATE(inString[pos1])) {
   1.228 +        unMappedChar = SURROGATE_TO_UCS4(inString[pos1-1], inString[pos1]);
   1.229 +        pos1++;
   1.230 +      } else {
   1.231 +        unMappedChar = inString[pos1-1];
   1.232 +      }
   1.233 +
   1.234 +      rv = mEncoder->GetMaxLength(inString+pos1, inStringLength-pos1, &dstLength);
   1.235 +      if (NS_FAILED(rv)) 
   1.236 +        break;
   1.237 +
   1.238 +      rv = HandleFallBack(unMappedChar, &dstPtr, &bufferLength, &pos2, dstLength);
   1.239 +      if (NS_FAILED(rv)) 
   1.240 +        break;
   1.241 +      dstPtr[pos2] = '\0';
   1.242 +    }
   1.243 +  }
   1.244 +
   1.245 +  if (NS_SUCCEEDED(rv)) {
   1.246 +    // finish encoder, give it a chance to write extra data like escape sequences
   1.247 +    dstLength = bufferLength - pos2;
   1.248 +    rv = mEncoder->Finish(&dstPtr[pos2], &dstLength);
   1.249 +    if (NS_SUCCEEDED(rv)) {
   1.250 +      pos2 += dstLength;
   1.251 +      dstPtr[pos2] = '\0';
   1.252 +    }
   1.253 +  }
   1.254 +
   1.255 +  if (NS_FAILED(rv)) {
   1.256 +    PR_FREEIF(dstPtr);
   1.257 +    return rv;
   1.258 +  }
   1.259 +
   1.260 +  *outString = dstPtr;      // set the result string
   1.261 +
   1.262 +  // set error code so that the caller can do own fall back
   1.263 +  if (NS_ERROR_UENC_NOMAPPING == saveResult) {
   1.264 +    rv = NS_ERROR_UENC_NOMAPPING;
   1.265 +  }
   1.266 +
   1.267 +  return rv;
   1.268 +}
   1.269 +
   1.270 +NS_IMETHODIMP
   1.271 +nsSaveAsCharset::DoConversionFallBack(uint32_t inUCS4, char *outString, int32_t bufferLength)
   1.272 +{
   1.273 +  NS_ENSURE_ARG_POINTER(outString);
   1.274 +
   1.275 +  *outString = '\0';
   1.276 +
   1.277 +  nsresult rv = NS_OK;
   1.278 +
   1.279 +  if (ATTR_NO_FALLBACK(mAttribute)) {
   1.280 +    return NS_OK;
   1.281 +  }
   1.282 +  if (attr_EntityAfterCharsetConv == MASK_ENTITY(mAttribute)) {
   1.283 +    char *entity = nullptr;
   1.284 +    rv = mEntityConverter->ConvertUTF32ToEntity(inUCS4, mEntityVersion, &entity);
   1.285 +    if (NS_SUCCEEDED(rv)) {
   1.286 +      if (!entity || (int32_t)strlen(entity) > bufferLength) {
   1.287 +        return NS_ERROR_OUT_OF_MEMORY;
   1.288 +      }
   1.289 +      PL_strcpy(outString, entity);
   1.290 +      nsMemory::Free(entity);
   1.291 +      return rv;
   1.292 +    }
   1.293 +  }
   1.294 +
   1.295 +  switch (MASK_FALLBACK(mAttribute)) {
   1.296 +  case attr_FallbackQuestionMark:
   1.297 +    if(bufferLength>=2) {
   1.298 +      *outString++='?';
   1.299 +      *outString='\0';
   1.300 +      rv = NS_OK;
   1.301 +    } else {
   1.302 +      rv = NS_ERROR_FAILURE;
   1.303 +    }
   1.304 +    break;
   1.305 +  case attr_FallbackEscapeU:
   1.306 +    if (inUCS4 & 0xff0000)
   1.307 +      rv = (PR_snprintf(outString, bufferLength, "\\u%.6x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
   1.308 +    else
   1.309 +      rv = (PR_snprintf(outString, bufferLength, "\\u%.4x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
   1.310 +    break;
   1.311 +  case attr_FallbackDecimalNCR:
   1.312 +    rv = ( PR_snprintf(outString, bufferLength, "&#%u;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
   1.313 +    break;
   1.314 +  case attr_FallbackHexNCR:
   1.315 +    rv = (PR_snprintf(outString, bufferLength, "&#x%x;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
   1.316 +    break;
   1.317 +  case attr_FallbackNone:
   1.318 +    rv = NS_OK;
   1.319 +    break;
   1.320 +  default:
   1.321 +    rv = NS_ERROR_ILLEGAL_VALUE;
   1.322 +    break;
   1.323 +  }
   1.324 +
   1.325 +	return rv;
   1.326 +}
   1.327 +
   1.328 +nsresult nsSaveAsCharset::SetupUnicodeEncoder(const char* charset)
   1.329 +{
   1.330 +  NS_ENSURE_ARG(charset);
   1.331 +  nsresult rv;
   1.332 +
   1.333 +  // set up unicode encoder
   1.334 +  nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
   1.335 +  NS_ENSURE_SUCCESS(rv, rv);
   1.336 +
   1.337 +  return ccm->GetUnicodeEncoder(charset, getter_AddRefs(mEncoder));
   1.338 +}
   1.339 +
   1.340 +nsresult nsSaveAsCharset::SetupCharsetList(const char *charsetList)
   1.341 +{
   1.342 +  NS_ENSURE_ARG(charsetList);
   1.343 +
   1.344 +  NS_ASSERTION(charsetList[0], "charsetList should not be empty");
   1.345 +  if (!charsetList[0])
   1.346 +    return NS_ERROR_INVALID_ARG;
   1.347 +
   1.348 +  if (mCharsetListIndex >= 0) {
   1.349 +    mCharsetList.Clear();
   1.350 +    mCharsetListIndex = -1;
   1.351 +  }
   1.352 +
   1.353 +  nsCWhitespaceTokenizer tokenizer = nsDependentCString(charsetList);
   1.354 +  while (tokenizer.hasMoreTokens()) {
   1.355 +    ParseString(tokenizer.nextToken(), ',', mCharsetList);
   1.356 +  }
   1.357 +
   1.358 +  return NS_OK;
   1.359 +}
   1.360 +
   1.361 +const char * nsSaveAsCharset::GetNextCharset()
   1.362 +{
   1.363 +  if ((mCharsetListIndex + 1) >= int32_t(mCharsetList.Length()))
   1.364 +    return nullptr;
   1.365 +
   1.366 +  // bump the index and return the next charset
   1.367 +  return mCharsetList[++mCharsetListIndex].get();
   1.368 +}

mercurial