intl/icu/source/common/unistr_cnv.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 1999-2010, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: unistr_cnv.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:2
michael@0 12 *
michael@0 13 * created on: 2004aug19
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * Character conversion functions moved here from unistr.cpp
michael@0 17 */
michael@0 18
michael@0 19 #include "unicode/utypes.h"
michael@0 20
michael@0 21 #if !UCONFIG_NO_CONVERSION
michael@0 22
michael@0 23 #include "unicode/putil.h"
michael@0 24 #include "cstring.h"
michael@0 25 #include "cmemory.h"
michael@0 26 #include "unicode/ustring.h"
michael@0 27 #include "unicode/unistr.h"
michael@0 28 #include "unicode/ucnv.h"
michael@0 29 #include "ucnv_imp.h"
michael@0 30 #include "putilimp.h"
michael@0 31 #include "ustr_cnv.h"
michael@0 32 #include "ustr_imp.h"
michael@0 33
michael@0 34 U_NAMESPACE_BEGIN
michael@0 35
michael@0 36 //========================================
michael@0 37 // Constructors
michael@0 38 //========================================
michael@0 39
michael@0 40 #if !U_CHARSET_IS_UTF8
michael@0 41
michael@0 42 UnicodeString::UnicodeString(const char *codepageData)
michael@0 43 : fShortLength(0),
michael@0 44 fFlags(kShortString)
michael@0 45 {
michael@0 46 if(codepageData != 0) {
michael@0 47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
michael@0 48 }
michael@0 49 }
michael@0 50
michael@0 51 UnicodeString::UnicodeString(const char *codepageData,
michael@0 52 int32_t dataLength)
michael@0 53 : fShortLength(0),
michael@0 54 fFlags(kShortString)
michael@0 55 {
michael@0 56 if(codepageData != 0) {
michael@0 57 doCodepageCreate(codepageData, dataLength, 0);
michael@0 58 }
michael@0 59 }
michael@0 60
michael@0 61 // else see unistr.cpp
michael@0 62 #endif
michael@0 63
michael@0 64 UnicodeString::UnicodeString(const char *codepageData,
michael@0 65 const char *codepage)
michael@0 66 : fShortLength(0),
michael@0 67 fFlags(kShortString)
michael@0 68 {
michael@0 69 if(codepageData != 0) {
michael@0 70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
michael@0 71 }
michael@0 72 }
michael@0 73
michael@0 74 UnicodeString::UnicodeString(const char *codepageData,
michael@0 75 int32_t dataLength,
michael@0 76 const char *codepage)
michael@0 77 : fShortLength(0),
michael@0 78 fFlags(kShortString)
michael@0 79 {
michael@0 80 if(codepageData != 0) {
michael@0 81 doCodepageCreate(codepageData, dataLength, codepage);
michael@0 82 }
michael@0 83 }
michael@0 84
michael@0 85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
michael@0 86 UConverter *cnv,
michael@0 87 UErrorCode &errorCode)
michael@0 88 : fShortLength(0),
michael@0 89 fFlags(kShortString)
michael@0 90 {
michael@0 91 if(U_SUCCESS(errorCode)) {
michael@0 92 // check arguments
michael@0 93 if(src==NULL) {
michael@0 94 // treat as an empty string, do nothing more
michael@0 95 } else if(srcLength<-1) {
michael@0 96 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 97 } else {
michael@0 98 // get input length
michael@0 99 if(srcLength==-1) {
michael@0 100 srcLength=(int32_t)uprv_strlen(src);
michael@0 101 }
michael@0 102 if(srcLength>0) {
michael@0 103 if(cnv!=0) {
michael@0 104 // use the provided converter
michael@0 105 ucnv_resetToUnicode(cnv);
michael@0 106 doCodepageCreate(src, srcLength, cnv, errorCode);
michael@0 107 } else {
michael@0 108 // use the default converter
michael@0 109 cnv=u_getDefaultConverter(&errorCode);
michael@0 110 doCodepageCreate(src, srcLength, cnv, errorCode);
michael@0 111 u_releaseDefaultConverter(cnv);
michael@0 112 }
michael@0 113 }
michael@0 114 }
michael@0 115
michael@0 116 if(U_FAILURE(errorCode)) {
michael@0 117 setToBogus();
michael@0 118 }
michael@0 119 }
michael@0 120 }
michael@0 121
michael@0 122 //========================================
michael@0 123 // Codeset conversion
michael@0 124 //========================================
michael@0 125
michael@0 126 #if !U_CHARSET_IS_UTF8
michael@0 127
michael@0 128 int32_t
michael@0 129 UnicodeString::extract(int32_t start,
michael@0 130 int32_t length,
michael@0 131 char *target,
michael@0 132 uint32_t dstSize) const {
michael@0 133 return extract(start, length, target, dstSize, 0);
michael@0 134 }
michael@0 135
michael@0 136 // else see unistr.cpp
michael@0 137 #endif
michael@0 138
michael@0 139 int32_t
michael@0 140 UnicodeString::extract(int32_t start,
michael@0 141 int32_t length,
michael@0 142 char *target,
michael@0 143 uint32_t dstSize,
michael@0 144 const char *codepage) const
michael@0 145 {
michael@0 146 // if the arguments are illegal, then do nothing
michael@0 147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
michael@0 148 return 0;
michael@0 149 }
michael@0 150
michael@0 151 // pin the indices to legal values
michael@0 152 pinIndices(start, length);
michael@0 153
michael@0 154 // We need to cast dstSize to int32_t for all subsequent code.
michael@0 155 // I don't know why the API was defined with uint32_t but we are stuck with it.
michael@0 156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
michael@0 157 // as a limit in some functions, it may wrap around and yield a pointer
michael@0 158 // that compares less-than target.
michael@0 159 int32_t capacity;
michael@0 160 if(dstSize < 0x7fffffff) {
michael@0 161 // Assume that the capacity is real and a limit pointer won't wrap around.
michael@0 162 capacity = (int32_t)dstSize;
michael@0 163 } else {
michael@0 164 // Pin the capacity so that a limit pointer does not wrap around.
michael@0 165 char *targetLimit = (char *)U_MAX_PTR(target);
michael@0 166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
michael@0 167 // greater than target and does not wrap around the top of the address space.
michael@0 168 capacity = (int32_t)(targetLimit - target);
michael@0 169 }
michael@0 170
michael@0 171 // create the converter
michael@0 172 UConverter *converter;
michael@0 173 UErrorCode status = U_ZERO_ERROR;
michael@0 174
michael@0 175 // just write the NUL if the string length is 0
michael@0 176 if(length == 0) {
michael@0 177 return u_terminateChars(target, capacity, 0, &status);
michael@0 178 }
michael@0 179
michael@0 180 // if the codepage is the default, use our cache
michael@0 181 // if it is an empty string, then use the "invariant character" conversion
michael@0 182 if (codepage == 0) {
michael@0 183 const char *defaultName = ucnv_getDefaultName();
michael@0 184 if(UCNV_FAST_IS_UTF8(defaultName)) {
michael@0 185 return toUTF8(start, length, target, capacity);
michael@0 186 }
michael@0 187 converter = u_getDefaultConverter(&status);
michael@0 188 } else if (*codepage == 0) {
michael@0 189 // use the "invariant characters" conversion
michael@0 190 int32_t destLength;
michael@0 191 if(length <= capacity) {
michael@0 192 destLength = length;
michael@0 193 } else {
michael@0 194 destLength = capacity;
michael@0 195 }
michael@0 196 u_UCharsToChars(getArrayStart() + start, target, destLength);
michael@0 197 return u_terminateChars(target, capacity, length, &status);
michael@0 198 } else {
michael@0 199 converter = ucnv_open(codepage, &status);
michael@0 200 }
michael@0 201
michael@0 202 length = doExtract(start, length, target, capacity, converter, status);
michael@0 203
michael@0 204 // close the converter
michael@0 205 if (codepage == 0) {
michael@0 206 u_releaseDefaultConverter(converter);
michael@0 207 } else {
michael@0 208 ucnv_close(converter);
michael@0 209 }
michael@0 210
michael@0 211 return length;
michael@0 212 }
michael@0 213
michael@0 214 int32_t
michael@0 215 UnicodeString::extract(char *dest, int32_t destCapacity,
michael@0 216 UConverter *cnv,
michael@0 217 UErrorCode &errorCode) const
michael@0 218 {
michael@0 219 if(U_FAILURE(errorCode)) {
michael@0 220 return 0;
michael@0 221 }
michael@0 222
michael@0 223 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
michael@0 224 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 225 return 0;
michael@0 226 }
michael@0 227
michael@0 228 // nothing to do?
michael@0 229 if(isEmpty()) {
michael@0 230 return u_terminateChars(dest, destCapacity, 0, &errorCode);
michael@0 231 }
michael@0 232
michael@0 233 // get the converter
michael@0 234 UBool isDefaultConverter;
michael@0 235 if(cnv==0) {
michael@0 236 isDefaultConverter=TRUE;
michael@0 237 cnv=u_getDefaultConverter(&errorCode);
michael@0 238 if(U_FAILURE(errorCode)) {
michael@0 239 return 0;
michael@0 240 }
michael@0 241 } else {
michael@0 242 isDefaultConverter=FALSE;
michael@0 243 ucnv_resetFromUnicode(cnv);
michael@0 244 }
michael@0 245
michael@0 246 // convert
michael@0 247 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
michael@0 248
michael@0 249 // release the converter
michael@0 250 if(isDefaultConverter) {
michael@0 251 u_releaseDefaultConverter(cnv);
michael@0 252 }
michael@0 253
michael@0 254 return len;
michael@0 255 }
michael@0 256
michael@0 257 int32_t
michael@0 258 UnicodeString::doExtract(int32_t start, int32_t length,
michael@0 259 char *dest, int32_t destCapacity,
michael@0 260 UConverter *cnv,
michael@0 261 UErrorCode &errorCode) const
michael@0 262 {
michael@0 263 if(U_FAILURE(errorCode)) {
michael@0 264 if(destCapacity!=0) {
michael@0 265 *dest=0;
michael@0 266 }
michael@0 267 return 0;
michael@0 268 }
michael@0 269
michael@0 270 const UChar *src=getArrayStart()+start, *srcLimit=src+length;
michael@0 271 char *originalDest=dest;
michael@0 272 const char *destLimit;
michael@0 273
michael@0 274 if(destCapacity==0) {
michael@0 275 destLimit=dest=0;
michael@0 276 } else if(destCapacity==-1) {
michael@0 277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
michael@0 278 destLimit=(char*)U_MAX_PTR(dest);
michael@0 279 // for NUL-termination, translate into highest int32_t
michael@0 280 destCapacity=0x7fffffff;
michael@0 281 } else {
michael@0 282 destLimit=dest+destCapacity;
michael@0 283 }
michael@0 284
michael@0 285 // perform the conversion
michael@0 286 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
michael@0 287 length=(int32_t)(dest-originalDest);
michael@0 288
michael@0 289 // if an overflow occurs, then get the preflighting length
michael@0 290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0 291 char buffer[1024];
michael@0 292
michael@0 293 destLimit=buffer+sizeof(buffer);
michael@0 294 do {
michael@0 295 dest=buffer;
michael@0 296 errorCode=U_ZERO_ERROR;
michael@0 297 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
michael@0 298 length+=(int32_t)(dest-buffer);
michael@0 299 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
michael@0 300 }
michael@0 301
michael@0 302 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
michael@0 303 }
michael@0 304
michael@0 305 void
michael@0 306 UnicodeString::doCodepageCreate(const char *codepageData,
michael@0 307 int32_t dataLength,
michael@0 308 const char *codepage)
michael@0 309 {
michael@0 310 // if there's nothing to convert, do nothing
michael@0 311 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
michael@0 312 return;
michael@0 313 }
michael@0 314 if(dataLength == -1) {
michael@0 315 dataLength = (int32_t)uprv_strlen(codepageData);
michael@0 316 }
michael@0 317
michael@0 318 UErrorCode status = U_ZERO_ERROR;
michael@0 319
michael@0 320 // create the converter
michael@0 321 // if the codepage is the default, use our cache
michael@0 322 // if it is an empty string, then use the "invariant character" conversion
michael@0 323 UConverter *converter;
michael@0 324 if (codepage == 0) {
michael@0 325 const char *defaultName = ucnv_getDefaultName();
michael@0 326 if(UCNV_FAST_IS_UTF8(defaultName)) {
michael@0 327 setToUTF8(StringPiece(codepageData, dataLength));
michael@0 328 return;
michael@0 329 }
michael@0 330 converter = u_getDefaultConverter(&status);
michael@0 331 } else if(*codepage == 0) {
michael@0 332 // use the "invariant characters" conversion
michael@0 333 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
michael@0 334 u_charsToUChars(codepageData, getArrayStart(), dataLength);
michael@0 335 setLength(dataLength);
michael@0 336 } else {
michael@0 337 setToBogus();
michael@0 338 }
michael@0 339 return;
michael@0 340 } else {
michael@0 341 converter = ucnv_open(codepage, &status);
michael@0 342 }
michael@0 343
michael@0 344 // if we failed, set the appropriate flags and return
michael@0 345 if(U_FAILURE(status)) {
michael@0 346 setToBogus();
michael@0 347 return;
michael@0 348 }
michael@0 349
michael@0 350 // perform the conversion
michael@0 351 doCodepageCreate(codepageData, dataLength, converter, status);
michael@0 352 if(U_FAILURE(status)) {
michael@0 353 setToBogus();
michael@0 354 }
michael@0 355
michael@0 356 // close the converter
michael@0 357 if(codepage == 0) {
michael@0 358 u_releaseDefaultConverter(converter);
michael@0 359 } else {
michael@0 360 ucnv_close(converter);
michael@0 361 }
michael@0 362 }
michael@0 363
michael@0 364 void
michael@0 365 UnicodeString::doCodepageCreate(const char *codepageData,
michael@0 366 int32_t dataLength,
michael@0 367 UConverter *converter,
michael@0 368 UErrorCode &status)
michael@0 369 {
michael@0 370 if(U_FAILURE(status)) {
michael@0 371 return;
michael@0 372 }
michael@0 373
michael@0 374 // set up the conversion parameters
michael@0 375 const char *mySource = codepageData;
michael@0 376 const char *mySourceEnd = mySource + dataLength;
michael@0 377 UChar *array, *myTarget;
michael@0 378
michael@0 379 // estimate the size needed:
michael@0 380 int32_t arraySize;
michael@0 381 if(dataLength <= US_STACKBUF_SIZE) {
michael@0 382 // try to use the stack buffer
michael@0 383 arraySize = US_STACKBUF_SIZE;
michael@0 384 } else {
michael@0 385 // 1.25 UChar's per source byte should cover most cases
michael@0 386 arraySize = dataLength + (dataLength >> 2);
michael@0 387 }
michael@0 388
michael@0 389 // we do not care about the current contents
michael@0 390 UBool doCopyArray = FALSE;
michael@0 391 for(;;) {
michael@0 392 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
michael@0 393 setToBogus();
michael@0 394 break;
michael@0 395 }
michael@0 396
michael@0 397 // perform the conversion
michael@0 398 array = getArrayStart();
michael@0 399 myTarget = array + length();
michael@0 400 ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
michael@0 401 &mySource, mySourceEnd, 0, TRUE, &status);
michael@0 402
michael@0 403 // update the conversion parameters
michael@0 404 setLength((int32_t)(myTarget - array));
michael@0 405
michael@0 406 // allocate more space and copy data, if needed
michael@0 407 if(status == U_BUFFER_OVERFLOW_ERROR) {
michael@0 408 // reset the error code
michael@0 409 status = U_ZERO_ERROR;
michael@0 410
michael@0 411 // keep the previous conversion results
michael@0 412 doCopyArray = TRUE;
michael@0 413
michael@0 414 // estimate the new size needed, larger than before
michael@0 415 // try 2 UChar's per remaining source byte
michael@0 416 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
michael@0 417 } else {
michael@0 418 break;
michael@0 419 }
michael@0 420 }
michael@0 421 }
michael@0 422
michael@0 423 U_NAMESPACE_END
michael@0 424
michael@0 425 #endif

mercurial