1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unistr_cnv.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,425 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1999-2010, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: unistr_cnv.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:2 1.15 +* 1.16 +* created on: 2004aug19 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Character conversion functions moved here from unistr.cpp 1.20 +*/ 1.21 + 1.22 +#include "unicode/utypes.h" 1.23 + 1.24 +#if !UCONFIG_NO_CONVERSION 1.25 + 1.26 +#include "unicode/putil.h" 1.27 +#include "cstring.h" 1.28 +#include "cmemory.h" 1.29 +#include "unicode/ustring.h" 1.30 +#include "unicode/unistr.h" 1.31 +#include "unicode/ucnv.h" 1.32 +#include "ucnv_imp.h" 1.33 +#include "putilimp.h" 1.34 +#include "ustr_cnv.h" 1.35 +#include "ustr_imp.h" 1.36 + 1.37 +U_NAMESPACE_BEGIN 1.38 + 1.39 +//======================================== 1.40 +// Constructors 1.41 +//======================================== 1.42 + 1.43 +#if !U_CHARSET_IS_UTF8 1.44 + 1.45 +UnicodeString::UnicodeString(const char *codepageData) 1.46 + : fShortLength(0), 1.47 + fFlags(kShortString) 1.48 +{ 1.49 + if(codepageData != 0) { 1.50 + doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 1.51 + } 1.52 +} 1.53 + 1.54 +UnicodeString::UnicodeString(const char *codepageData, 1.55 + int32_t dataLength) 1.56 + : fShortLength(0), 1.57 + fFlags(kShortString) 1.58 +{ 1.59 + if(codepageData != 0) { 1.60 + doCodepageCreate(codepageData, dataLength, 0); 1.61 + } 1.62 +} 1.63 + 1.64 +// else see unistr.cpp 1.65 +#endif 1.66 + 1.67 +UnicodeString::UnicodeString(const char *codepageData, 1.68 + const char *codepage) 1.69 + : fShortLength(0), 1.70 + fFlags(kShortString) 1.71 +{ 1.72 + if(codepageData != 0) { 1.73 + doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 1.74 + } 1.75 +} 1.76 + 1.77 +UnicodeString::UnicodeString(const char *codepageData, 1.78 + int32_t dataLength, 1.79 + const char *codepage) 1.80 + : fShortLength(0), 1.81 + fFlags(kShortString) 1.82 +{ 1.83 + if(codepageData != 0) { 1.84 + doCodepageCreate(codepageData, dataLength, codepage); 1.85 + } 1.86 +} 1.87 + 1.88 +UnicodeString::UnicodeString(const char *src, int32_t srcLength, 1.89 + UConverter *cnv, 1.90 + UErrorCode &errorCode) 1.91 + : fShortLength(0), 1.92 + fFlags(kShortString) 1.93 +{ 1.94 + if(U_SUCCESS(errorCode)) { 1.95 + // check arguments 1.96 + if(src==NULL) { 1.97 + // treat as an empty string, do nothing more 1.98 + } else if(srcLength<-1) { 1.99 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.100 + } else { 1.101 + // get input length 1.102 + if(srcLength==-1) { 1.103 + srcLength=(int32_t)uprv_strlen(src); 1.104 + } 1.105 + if(srcLength>0) { 1.106 + if(cnv!=0) { 1.107 + // use the provided converter 1.108 + ucnv_resetToUnicode(cnv); 1.109 + doCodepageCreate(src, srcLength, cnv, errorCode); 1.110 + } else { 1.111 + // use the default converter 1.112 + cnv=u_getDefaultConverter(&errorCode); 1.113 + doCodepageCreate(src, srcLength, cnv, errorCode); 1.114 + u_releaseDefaultConverter(cnv); 1.115 + } 1.116 + } 1.117 + } 1.118 + 1.119 + if(U_FAILURE(errorCode)) { 1.120 + setToBogus(); 1.121 + } 1.122 + } 1.123 +} 1.124 + 1.125 +//======================================== 1.126 +// Codeset conversion 1.127 +//======================================== 1.128 + 1.129 +#if !U_CHARSET_IS_UTF8 1.130 + 1.131 +int32_t 1.132 +UnicodeString::extract(int32_t start, 1.133 + int32_t length, 1.134 + char *target, 1.135 + uint32_t dstSize) const { 1.136 + return extract(start, length, target, dstSize, 0); 1.137 +} 1.138 + 1.139 +// else see unistr.cpp 1.140 +#endif 1.141 + 1.142 +int32_t 1.143 +UnicodeString::extract(int32_t start, 1.144 + int32_t length, 1.145 + char *target, 1.146 + uint32_t dstSize, 1.147 + const char *codepage) const 1.148 +{ 1.149 + // if the arguments are illegal, then do nothing 1.150 + if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 1.151 + return 0; 1.152 + } 1.153 + 1.154 + // pin the indices to legal values 1.155 + pinIndices(start, length); 1.156 + 1.157 + // We need to cast dstSize to int32_t for all subsequent code. 1.158 + // I don't know why the API was defined with uint32_t but we are stuck with it. 1.159 + // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 1.160 + // as a limit in some functions, it may wrap around and yield a pointer 1.161 + // that compares less-than target. 1.162 + int32_t capacity; 1.163 + if(dstSize < 0x7fffffff) { 1.164 + // Assume that the capacity is real and a limit pointer won't wrap around. 1.165 + capacity = (int32_t)dstSize; 1.166 + } else { 1.167 + // Pin the capacity so that a limit pointer does not wrap around. 1.168 + char *targetLimit = (char *)U_MAX_PTR(target); 1.169 + // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff 1.170 + // greater than target and does not wrap around the top of the address space. 1.171 + capacity = (int32_t)(targetLimit - target); 1.172 + } 1.173 + 1.174 + // create the converter 1.175 + UConverter *converter; 1.176 + UErrorCode status = U_ZERO_ERROR; 1.177 + 1.178 + // just write the NUL if the string length is 0 1.179 + if(length == 0) { 1.180 + return u_terminateChars(target, capacity, 0, &status); 1.181 + } 1.182 + 1.183 + // if the codepage is the default, use our cache 1.184 + // if it is an empty string, then use the "invariant character" conversion 1.185 + if (codepage == 0) { 1.186 + const char *defaultName = ucnv_getDefaultName(); 1.187 + if(UCNV_FAST_IS_UTF8(defaultName)) { 1.188 + return toUTF8(start, length, target, capacity); 1.189 + } 1.190 + converter = u_getDefaultConverter(&status); 1.191 + } else if (*codepage == 0) { 1.192 + // use the "invariant characters" conversion 1.193 + int32_t destLength; 1.194 + if(length <= capacity) { 1.195 + destLength = length; 1.196 + } else { 1.197 + destLength = capacity; 1.198 + } 1.199 + u_UCharsToChars(getArrayStart() + start, target, destLength); 1.200 + return u_terminateChars(target, capacity, length, &status); 1.201 + } else { 1.202 + converter = ucnv_open(codepage, &status); 1.203 + } 1.204 + 1.205 + length = doExtract(start, length, target, capacity, converter, status); 1.206 + 1.207 + // close the converter 1.208 + if (codepage == 0) { 1.209 + u_releaseDefaultConverter(converter); 1.210 + } else { 1.211 + ucnv_close(converter); 1.212 + } 1.213 + 1.214 + return length; 1.215 +} 1.216 + 1.217 +int32_t 1.218 +UnicodeString::extract(char *dest, int32_t destCapacity, 1.219 + UConverter *cnv, 1.220 + UErrorCode &errorCode) const 1.221 +{ 1.222 + if(U_FAILURE(errorCode)) { 1.223 + return 0; 1.224 + } 1.225 + 1.226 + if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 1.227 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.228 + return 0; 1.229 + } 1.230 + 1.231 + // nothing to do? 1.232 + if(isEmpty()) { 1.233 + return u_terminateChars(dest, destCapacity, 0, &errorCode); 1.234 + } 1.235 + 1.236 + // get the converter 1.237 + UBool isDefaultConverter; 1.238 + if(cnv==0) { 1.239 + isDefaultConverter=TRUE; 1.240 + cnv=u_getDefaultConverter(&errorCode); 1.241 + if(U_FAILURE(errorCode)) { 1.242 + return 0; 1.243 + } 1.244 + } else { 1.245 + isDefaultConverter=FALSE; 1.246 + ucnv_resetFromUnicode(cnv); 1.247 + } 1.248 + 1.249 + // convert 1.250 + int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 1.251 + 1.252 + // release the converter 1.253 + if(isDefaultConverter) { 1.254 + u_releaseDefaultConverter(cnv); 1.255 + } 1.256 + 1.257 + return len; 1.258 +} 1.259 + 1.260 +int32_t 1.261 +UnicodeString::doExtract(int32_t start, int32_t length, 1.262 + char *dest, int32_t destCapacity, 1.263 + UConverter *cnv, 1.264 + UErrorCode &errorCode) const 1.265 +{ 1.266 + if(U_FAILURE(errorCode)) { 1.267 + if(destCapacity!=0) { 1.268 + *dest=0; 1.269 + } 1.270 + return 0; 1.271 + } 1.272 + 1.273 + const UChar *src=getArrayStart()+start, *srcLimit=src+length; 1.274 + char *originalDest=dest; 1.275 + const char *destLimit; 1.276 + 1.277 + if(destCapacity==0) { 1.278 + destLimit=dest=0; 1.279 + } else if(destCapacity==-1) { 1.280 + // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 1.281 + destLimit=(char*)U_MAX_PTR(dest); 1.282 + // for NUL-termination, translate into highest int32_t 1.283 + destCapacity=0x7fffffff; 1.284 + } else { 1.285 + destLimit=dest+destCapacity; 1.286 + } 1.287 + 1.288 + // perform the conversion 1.289 + ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 1.290 + length=(int32_t)(dest-originalDest); 1.291 + 1.292 + // if an overflow occurs, then get the preflighting length 1.293 + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 1.294 + char buffer[1024]; 1.295 + 1.296 + destLimit=buffer+sizeof(buffer); 1.297 + do { 1.298 + dest=buffer; 1.299 + errorCode=U_ZERO_ERROR; 1.300 + ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 1.301 + length+=(int32_t)(dest-buffer); 1.302 + } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 1.303 + } 1.304 + 1.305 + return u_terminateChars(originalDest, destCapacity, length, &errorCode); 1.306 +} 1.307 + 1.308 +void 1.309 +UnicodeString::doCodepageCreate(const char *codepageData, 1.310 + int32_t dataLength, 1.311 + const char *codepage) 1.312 +{ 1.313 + // if there's nothing to convert, do nothing 1.314 + if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 1.315 + return; 1.316 + } 1.317 + if(dataLength == -1) { 1.318 + dataLength = (int32_t)uprv_strlen(codepageData); 1.319 + } 1.320 + 1.321 + UErrorCode status = U_ZERO_ERROR; 1.322 + 1.323 + // create the converter 1.324 + // if the codepage is the default, use our cache 1.325 + // if it is an empty string, then use the "invariant character" conversion 1.326 + UConverter *converter; 1.327 + if (codepage == 0) { 1.328 + const char *defaultName = ucnv_getDefaultName(); 1.329 + if(UCNV_FAST_IS_UTF8(defaultName)) { 1.330 + setToUTF8(StringPiece(codepageData, dataLength)); 1.331 + return; 1.332 + } 1.333 + converter = u_getDefaultConverter(&status); 1.334 + } else if(*codepage == 0) { 1.335 + // use the "invariant characters" conversion 1.336 + if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 1.337 + u_charsToUChars(codepageData, getArrayStart(), dataLength); 1.338 + setLength(dataLength); 1.339 + } else { 1.340 + setToBogus(); 1.341 + } 1.342 + return; 1.343 + } else { 1.344 + converter = ucnv_open(codepage, &status); 1.345 + } 1.346 + 1.347 + // if we failed, set the appropriate flags and return 1.348 + if(U_FAILURE(status)) { 1.349 + setToBogus(); 1.350 + return; 1.351 + } 1.352 + 1.353 + // perform the conversion 1.354 + doCodepageCreate(codepageData, dataLength, converter, status); 1.355 + if(U_FAILURE(status)) { 1.356 + setToBogus(); 1.357 + } 1.358 + 1.359 + // close the converter 1.360 + if(codepage == 0) { 1.361 + u_releaseDefaultConverter(converter); 1.362 + } else { 1.363 + ucnv_close(converter); 1.364 + } 1.365 +} 1.366 + 1.367 +void 1.368 +UnicodeString::doCodepageCreate(const char *codepageData, 1.369 + int32_t dataLength, 1.370 + UConverter *converter, 1.371 + UErrorCode &status) 1.372 +{ 1.373 + if(U_FAILURE(status)) { 1.374 + return; 1.375 + } 1.376 + 1.377 + // set up the conversion parameters 1.378 + const char *mySource = codepageData; 1.379 + const char *mySourceEnd = mySource + dataLength; 1.380 + UChar *array, *myTarget; 1.381 + 1.382 + // estimate the size needed: 1.383 + int32_t arraySize; 1.384 + if(dataLength <= US_STACKBUF_SIZE) { 1.385 + // try to use the stack buffer 1.386 + arraySize = US_STACKBUF_SIZE; 1.387 + } else { 1.388 + // 1.25 UChar's per source byte should cover most cases 1.389 + arraySize = dataLength + (dataLength >> 2); 1.390 + } 1.391 + 1.392 + // we do not care about the current contents 1.393 + UBool doCopyArray = FALSE; 1.394 + for(;;) { 1.395 + if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 1.396 + setToBogus(); 1.397 + break; 1.398 + } 1.399 + 1.400 + // perform the conversion 1.401 + array = getArrayStart(); 1.402 + myTarget = array + length(); 1.403 + ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 1.404 + &mySource, mySourceEnd, 0, TRUE, &status); 1.405 + 1.406 + // update the conversion parameters 1.407 + setLength((int32_t)(myTarget - array)); 1.408 + 1.409 + // allocate more space and copy data, if needed 1.410 + if(status == U_BUFFER_OVERFLOW_ERROR) { 1.411 + // reset the error code 1.412 + status = U_ZERO_ERROR; 1.413 + 1.414 + // keep the previous conversion results 1.415 + doCopyArray = TRUE; 1.416 + 1.417 + // estimate the new size needed, larger than before 1.418 + // try 2 UChar's per remaining source byte 1.419 + arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 1.420 + } else { 1.421 + break; 1.422 + } 1.423 + } 1.424 +} 1.425 + 1.426 +U_NAMESPACE_END 1.427 + 1.428 +#endif