1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/unicharutil/src/nsSaveAsCharset.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,365 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ 1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 + 1.11 +#include "prmem.h" 1.12 +#include "prprf.h" 1.13 +#include "nsICharsetConverterManager.h" 1.14 +#include "nsSaveAsCharset.h" 1.15 +#include "nsWhitespaceTokenizer.h" 1.16 +#include "nsServiceManagerUtils.h" 1.17 + 1.18 +// 1.19 +// nsISupports methods 1.20 +// 1.21 +NS_IMPL_ISUPPORTS(nsSaveAsCharset, nsISaveAsCharset) 1.22 + 1.23 +// 1.24 +// nsSaveAsCharset 1.25 +// 1.26 +nsSaveAsCharset::nsSaveAsCharset() 1.27 +{ 1.28 + mAttribute = attr_htmlTextDefault; 1.29 + mEntityVersion = 0; 1.30 + mCharsetListIndex = -1; 1.31 +} 1.32 + 1.33 +nsSaveAsCharset::~nsSaveAsCharset() 1.34 +{ 1.35 +} 1.36 + 1.37 +NS_IMETHODIMP 1.38 +nsSaveAsCharset::Init(const char *charset, uint32_t attr, uint32_t entityVersion) 1.39 +{ 1.40 + nsresult rv = NS_OK; 1.41 + 1.42 + mAttribute = attr; 1.43 + mEntityVersion = entityVersion; 1.44 + 1.45 + rv = SetupCharsetList(charset); 1.46 + NS_ENSURE_SUCCESS(rv, rv); 1.47 + 1.48 + // set up unicode encoder 1.49 + rv = SetupUnicodeEncoder(GetNextCharset()); 1.50 + NS_ENSURE_SUCCESS(rv, rv); 1.51 + 1.52 + // set up entity converter 1.53 + if (attr_EntityNone != MASK_ENTITY(mAttribute) && !mEntityConverter) 1.54 + mEntityConverter = do_CreateInstance(NS_ENTITYCONVERTER_CONTRACTID, &rv); 1.55 + 1.56 + return rv; 1.57 +} 1.58 + 1.59 +NS_IMETHODIMP 1.60 +nsSaveAsCharset::Convert(const char16_t *inString, char **_retval) 1.61 +{ 1.62 + NS_ENSURE_ARG_POINTER(_retval); 1.63 + NS_ENSURE_ARG_POINTER(inString); 1.64 + if (0 == *inString) 1.65 + return NS_ERROR_ILLEGAL_VALUE; 1.66 + nsresult rv = NS_OK; 1.67 + 1.68 + NS_ASSERTION(mEncoder, "need to call Init() before Convert()"); 1.69 + NS_ENSURE_TRUE(mEncoder, NS_ERROR_FAILURE); 1.70 + 1.71 + *_retval = nullptr; 1.72 + 1.73 + // make sure to start from the first charset in the list 1.74 + if (mCharsetListIndex > 0) { 1.75 + mCharsetListIndex = -1; 1.76 + rv = SetupUnicodeEncoder(GetNextCharset()); 1.77 + NS_ENSURE_SUCCESS(rv, rv); 1.78 + } 1.79 + 1.80 + do { 1.81 + // fallback to the next charset in the list if the last conversion failed by an unmapped character 1.82 + if (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv) { 1.83 + const char * charset = GetNextCharset(); 1.84 + if (!charset) 1.85 + break; 1.86 + rv = SetupUnicodeEncoder(charset); 1.87 + NS_ENSURE_SUCCESS(rv, rv); 1.88 + PR_FREEIF(*_retval); 1.89 + } 1.90 + 1.91 + if (attr_EntityBeforeCharsetConv == MASK_ENTITY(mAttribute)) { 1.92 + NS_ASSERTION(mEntityConverter, "need to call Init() before Convert()"); 1.93 + NS_ENSURE_TRUE(mEntityConverter, NS_ERROR_FAILURE); 1.94 + char16_t *entity = nullptr; 1.95 + // do the entity conversion first 1.96 + rv = mEntityConverter->ConvertToEntities(inString, mEntityVersion, &entity); 1.97 + if(NS_SUCCEEDED(rv)) { 1.98 + rv = DoCharsetConversion(entity, _retval); 1.99 + nsMemory::Free(entity); 1.100 + } 1.101 + } 1.102 + else 1.103 + rv = DoCharsetConversion(inString, _retval); 1.104 + 1.105 + } while (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv); 1.106 + 1.107 + return rv; 1.108 +} 1.109 + 1.110 +NS_IMETHODIMP 1.111 +nsSaveAsCharset::GetCharset(char * *aCharset) 1.112 +{ 1.113 + NS_ENSURE_ARG(aCharset); 1.114 + NS_ASSERTION(mCharsetListIndex >= 0, "need to call Init() first"); 1.115 + NS_ENSURE_TRUE(mCharsetListIndex >= 0, NS_ERROR_FAILURE); 1.116 + 1.117 + const char* charset = mCharsetList[mCharsetListIndex].get(); 1.118 + if (!charset) { 1.119 + *aCharset = nullptr; 1.120 + NS_ASSERTION(charset, "make sure to call Init() with non empty charset list"); 1.121 + return NS_ERROR_FAILURE; 1.122 + } 1.123 + 1.124 + *aCharset = strdup(charset); 1.125 + return (*aCharset) ? NS_OK : NS_ERROR_OUT_OF_MEMORY; 1.126 +} 1.127 + 1.128 +///////////////////////////////////////////////////////////////////////////////////////// 1.129 + 1.130 +#define RESERVE_FALLBACK_BYTES 512 1.131 + 1.132 +// do the fallback, reallocate the buffer if necessary 1.133 +// need to pass destination buffer info (size, current position and estimation of rest of the conversion) 1.134 +NS_IMETHODIMP 1.135 +nsSaveAsCharset::HandleFallBack(uint32_t character, char **outString, int32_t *bufferLength, 1.136 + int32_t *currentPos, int32_t estimatedLength) 1.137 +{ 1.138 + NS_ENSURE_ARG_POINTER(outString); 1.139 + NS_ENSURE_ARG_POINTER(bufferLength); 1.140 + NS_ENSURE_ARG_POINTER(currentPos); 1.141 + 1.142 + char fallbackStr[256]; 1.143 + nsresult rv = DoConversionFallBack(character, fallbackStr, 256); 1.144 + if (NS_SUCCEEDED(rv)) { 1.145 + int32_t tempLen = (int32_t) strlen(fallbackStr); 1.146 + 1.147 + // reallocate if the buffer is not large enough 1.148 + if ((tempLen + estimatedLength) >= (*bufferLength - *currentPos)) { 1.149 + int32_t addLength = tempLen + RESERVE_FALLBACK_BYTES; 1.150 + // + 1 is for the terminating NUL, don't add that to bufferLength 1.151 + char *temp = (char *) PR_Realloc(*outString, *bufferLength + addLength + 1); 1.152 + if (temp) { 1.153 + // adjust length/pointer after realloc 1.154 + *bufferLength += addLength; 1.155 + *outString = temp; 1.156 + } else { 1.157 + *outString = nullptr; 1.158 + *bufferLength = 0; 1.159 + return NS_ERROR_OUT_OF_MEMORY; 1.160 + } 1.161 + } 1.162 + memcpy((*outString + *currentPos), fallbackStr, tempLen); 1.163 + *currentPos += tempLen; 1.164 + } 1.165 + return rv; 1.166 +} 1.167 + 1.168 +NS_IMETHODIMP 1.169 +nsSaveAsCharset::DoCharsetConversion(const char16_t *inString, char **outString) 1.170 +{ 1.171 + NS_ENSURE_ARG_POINTER(outString); 1.172 + 1.173 + *outString = nullptr; 1.174 + 1.175 + nsresult rv; 1.176 + int32_t inStringLength = NS_strlen(inString); // original input string length 1.177 + int32_t bufferLength; // allocated buffer length 1.178 + int32_t srcLength = inStringLength; 1.179 + int32_t dstLength; 1.180 + int32_t pos1, pos2; 1.181 + nsresult saveResult = NS_OK; // to remember NS_ERROR_UENC_NOMAPPING 1.182 + 1.183 + // estimate and allocate the target buffer (reserve extra memory for fallback) 1.184 + rv = mEncoder->GetMaxLength(inString, inStringLength, &dstLength); 1.185 + if (NS_FAILED(rv)) return rv; 1.186 + 1.187 + bufferLength = dstLength + RESERVE_FALLBACK_BYTES; // extra bytes for fallback 1.188 + // + 1 is for the terminating NUL -- we don't add that to bufferLength so that 1.189 + // we can always write dstPtr[pos2] = '\0' even when the encoder filled the 1.190 + // buffer. 1.191 + char *dstPtr = (char *) PR_Malloc(bufferLength + 1); 1.192 + if (!dstPtr) { 1.193 + return NS_ERROR_OUT_OF_MEMORY; 1.194 + } 1.195 + 1.196 + for (pos1 = 0, pos2 = 0; pos1 < inStringLength;) { 1.197 + // convert from unicode 1.198 + dstLength = bufferLength - pos2; 1.199 + NS_ASSERTION(dstLength >= 0, "out of bounds write"); 1.200 + rv = mEncoder->Convert(&inString[pos1], &srcLength, &dstPtr[pos2], &dstLength); 1.201 + 1.202 + pos1 += srcLength ? srcLength : 1; 1.203 + pos2 += dstLength; 1.204 + dstPtr[pos2] = '\0'; 1.205 + 1.206 + // break: this is usually the case (no error) OR unrecoverable error 1.207 + if (NS_ERROR_UENC_NOMAPPING != rv) break; 1.208 + 1.209 + // remember this happened and reset the result 1.210 + saveResult = rv; 1.211 + rv = NS_OK; 1.212 + 1.213 + // finish encoder, give it a chance to write extra data like escape sequences 1.214 + dstLength = bufferLength - pos2; 1.215 + rv = mEncoder->Finish(&dstPtr[pos2], &dstLength); 1.216 + if (NS_SUCCEEDED(rv)) { 1.217 + pos2 += dstLength; 1.218 + dstPtr[pos2] = '\0'; 1.219 + } 1.220 + 1.221 + srcLength = inStringLength - pos1; 1.222 + 1.223 + // do the fallback 1.224 + if (!ATTR_NO_FALLBACK(mAttribute)) { 1.225 + uint32_t unMappedChar; 1.226 + if (NS_IS_HIGH_SURROGATE(inString[pos1-1]) && 1.227 + inStringLength > pos1 && NS_IS_LOW_SURROGATE(inString[pos1])) { 1.228 + unMappedChar = SURROGATE_TO_UCS4(inString[pos1-1], inString[pos1]); 1.229 + pos1++; 1.230 + } else { 1.231 + unMappedChar = inString[pos1-1]; 1.232 + } 1.233 + 1.234 + rv = mEncoder->GetMaxLength(inString+pos1, inStringLength-pos1, &dstLength); 1.235 + if (NS_FAILED(rv)) 1.236 + break; 1.237 + 1.238 + rv = HandleFallBack(unMappedChar, &dstPtr, &bufferLength, &pos2, dstLength); 1.239 + if (NS_FAILED(rv)) 1.240 + break; 1.241 + dstPtr[pos2] = '\0'; 1.242 + } 1.243 + } 1.244 + 1.245 + if (NS_SUCCEEDED(rv)) { 1.246 + // finish encoder, give it a chance to write extra data like escape sequences 1.247 + dstLength = bufferLength - pos2; 1.248 + rv = mEncoder->Finish(&dstPtr[pos2], &dstLength); 1.249 + if (NS_SUCCEEDED(rv)) { 1.250 + pos2 += dstLength; 1.251 + dstPtr[pos2] = '\0'; 1.252 + } 1.253 + } 1.254 + 1.255 + if (NS_FAILED(rv)) { 1.256 + PR_FREEIF(dstPtr); 1.257 + return rv; 1.258 + } 1.259 + 1.260 + *outString = dstPtr; // set the result string 1.261 + 1.262 + // set error code so that the caller can do own fall back 1.263 + if (NS_ERROR_UENC_NOMAPPING == saveResult) { 1.264 + rv = NS_ERROR_UENC_NOMAPPING; 1.265 + } 1.266 + 1.267 + return rv; 1.268 +} 1.269 + 1.270 +NS_IMETHODIMP 1.271 +nsSaveAsCharset::DoConversionFallBack(uint32_t inUCS4, char *outString, int32_t bufferLength) 1.272 +{ 1.273 + NS_ENSURE_ARG_POINTER(outString); 1.274 + 1.275 + *outString = '\0'; 1.276 + 1.277 + nsresult rv = NS_OK; 1.278 + 1.279 + if (ATTR_NO_FALLBACK(mAttribute)) { 1.280 + return NS_OK; 1.281 + } 1.282 + if (attr_EntityAfterCharsetConv == MASK_ENTITY(mAttribute)) { 1.283 + char *entity = nullptr; 1.284 + rv = mEntityConverter->ConvertUTF32ToEntity(inUCS4, mEntityVersion, &entity); 1.285 + if (NS_SUCCEEDED(rv)) { 1.286 + if (!entity || (int32_t)strlen(entity) > bufferLength) { 1.287 + return NS_ERROR_OUT_OF_MEMORY; 1.288 + } 1.289 + PL_strcpy(outString, entity); 1.290 + nsMemory::Free(entity); 1.291 + return rv; 1.292 + } 1.293 + } 1.294 + 1.295 + switch (MASK_FALLBACK(mAttribute)) { 1.296 + case attr_FallbackQuestionMark: 1.297 + if(bufferLength>=2) { 1.298 + *outString++='?'; 1.299 + *outString='\0'; 1.300 + rv = NS_OK; 1.301 + } else { 1.302 + rv = NS_ERROR_FAILURE; 1.303 + } 1.304 + break; 1.305 + case attr_FallbackEscapeU: 1.306 + if (inUCS4 & 0xff0000) 1.307 + rv = (PR_snprintf(outString, bufferLength, "\\u%.6x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE; 1.308 + else 1.309 + rv = (PR_snprintf(outString, bufferLength, "\\u%.4x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE; 1.310 + break; 1.311 + case attr_FallbackDecimalNCR: 1.312 + rv = ( PR_snprintf(outString, bufferLength, "&#%u;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE; 1.313 + break; 1.314 + case attr_FallbackHexNCR: 1.315 + rv = (PR_snprintf(outString, bufferLength, "&#x%x;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE; 1.316 + break; 1.317 + case attr_FallbackNone: 1.318 + rv = NS_OK; 1.319 + break; 1.320 + default: 1.321 + rv = NS_ERROR_ILLEGAL_VALUE; 1.322 + break; 1.323 + } 1.324 + 1.325 + return rv; 1.326 +} 1.327 + 1.328 +nsresult nsSaveAsCharset::SetupUnicodeEncoder(const char* charset) 1.329 +{ 1.330 + NS_ENSURE_ARG(charset); 1.331 + nsresult rv; 1.332 + 1.333 + // set up unicode encoder 1.334 + nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv); 1.335 + NS_ENSURE_SUCCESS(rv, rv); 1.336 + 1.337 + return ccm->GetUnicodeEncoder(charset, getter_AddRefs(mEncoder)); 1.338 +} 1.339 + 1.340 +nsresult nsSaveAsCharset::SetupCharsetList(const char *charsetList) 1.341 +{ 1.342 + NS_ENSURE_ARG(charsetList); 1.343 + 1.344 + NS_ASSERTION(charsetList[0], "charsetList should not be empty"); 1.345 + if (!charsetList[0]) 1.346 + return NS_ERROR_INVALID_ARG; 1.347 + 1.348 + if (mCharsetListIndex >= 0) { 1.349 + mCharsetList.Clear(); 1.350 + mCharsetListIndex = -1; 1.351 + } 1.352 + 1.353 + nsCWhitespaceTokenizer tokenizer = nsDependentCString(charsetList); 1.354 + while (tokenizer.hasMoreTokens()) { 1.355 + ParseString(tokenizer.nextToken(), ',', mCharsetList); 1.356 + } 1.357 + 1.358 + return NS_OK; 1.359 +} 1.360 + 1.361 +const char * nsSaveAsCharset::GetNextCharset() 1.362 +{ 1.363 + if ((mCharsetListIndex + 1) >= int32_t(mCharsetList.Length())) 1.364 + return nullptr; 1.365 + 1.366 + // bump the index and return the next charset 1.367 + return mCharsetList[++mCharsetListIndex].get(); 1.368 +}