michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 1999-2010, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: unistr_cnv.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:2 michael@0: * michael@0: * created on: 2004aug19 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Character conversion functions moved here from unistr.cpp michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "unicode/putil.h" michael@0: #include "cstring.h" michael@0: #include "cmemory.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/ucnv.h" michael@0: #include "ucnv_imp.h" michael@0: #include "putilimp.h" michael@0: #include "ustr_cnv.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: //======================================== michael@0: // Constructors michael@0: //======================================== michael@0: michael@0: #if !U_CHARSET_IS_UTF8 michael@0: michael@0: UnicodeString::UnicodeString(const char *codepageData) michael@0: : fShortLength(0), michael@0: fFlags(kShortString) michael@0: { michael@0: if(codepageData != 0) { michael@0: doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); michael@0: } michael@0: } michael@0: michael@0: UnicodeString::UnicodeString(const char *codepageData, michael@0: int32_t dataLength) michael@0: : fShortLength(0), michael@0: fFlags(kShortString) michael@0: { michael@0: if(codepageData != 0) { michael@0: doCodepageCreate(codepageData, dataLength, 0); michael@0: } michael@0: } michael@0: michael@0: // else see unistr.cpp michael@0: #endif michael@0: michael@0: UnicodeString::UnicodeString(const char *codepageData, michael@0: const char *codepage) michael@0: : fShortLength(0), michael@0: fFlags(kShortString) michael@0: { michael@0: if(codepageData != 0) { michael@0: doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); michael@0: } michael@0: } michael@0: michael@0: UnicodeString::UnicodeString(const char *codepageData, michael@0: int32_t dataLength, michael@0: const char *codepage) michael@0: : fShortLength(0), michael@0: fFlags(kShortString) michael@0: { michael@0: if(codepageData != 0) { michael@0: doCodepageCreate(codepageData, dataLength, codepage); michael@0: } michael@0: } michael@0: michael@0: UnicodeString::UnicodeString(const char *src, int32_t srcLength, michael@0: UConverter *cnv, michael@0: UErrorCode &errorCode) michael@0: : fShortLength(0), michael@0: fFlags(kShortString) michael@0: { michael@0: if(U_SUCCESS(errorCode)) { michael@0: // check arguments michael@0: if(src==NULL) { michael@0: // treat as an empty string, do nothing more michael@0: } else if(srcLength<-1) { michael@0: errorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } else { michael@0: // get input length michael@0: if(srcLength==-1) { michael@0: srcLength=(int32_t)uprv_strlen(src); michael@0: } michael@0: if(srcLength>0) { michael@0: if(cnv!=0) { michael@0: // use the provided converter michael@0: ucnv_resetToUnicode(cnv); michael@0: doCodepageCreate(src, srcLength, cnv, errorCode); michael@0: } else { michael@0: // use the default converter michael@0: cnv=u_getDefaultConverter(&errorCode); michael@0: doCodepageCreate(src, srcLength, cnv, errorCode); michael@0: u_releaseDefaultConverter(cnv); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(U_FAILURE(errorCode)) { michael@0: setToBogus(); michael@0: } michael@0: } michael@0: } michael@0: michael@0: //======================================== michael@0: // Codeset conversion michael@0: //======================================== michael@0: michael@0: #if !U_CHARSET_IS_UTF8 michael@0: michael@0: int32_t michael@0: UnicodeString::extract(int32_t start, michael@0: int32_t length, michael@0: char *target, michael@0: uint32_t dstSize) const { michael@0: return extract(start, length, target, dstSize, 0); michael@0: } michael@0: michael@0: // else see unistr.cpp michael@0: #endif michael@0: michael@0: int32_t michael@0: UnicodeString::extract(int32_t start, michael@0: int32_t length, michael@0: char *target, michael@0: uint32_t dstSize, michael@0: const char *codepage) const michael@0: { michael@0: // if the arguments are illegal, then do nothing michael@0: if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { michael@0: return 0; michael@0: } michael@0: michael@0: // pin the indices to legal values michael@0: pinIndices(start, length); michael@0: michael@0: // We need to cast dstSize to int32_t for all subsequent code. michael@0: // I don't know why the API was defined with uint32_t but we are stuck with it. michael@0: // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize michael@0: // as a limit in some functions, it may wrap around and yield a pointer michael@0: // that compares less-than target. michael@0: int32_t capacity; michael@0: if(dstSize < 0x7fffffff) { michael@0: // Assume that the capacity is real and a limit pointer won't wrap around. michael@0: capacity = (int32_t)dstSize; michael@0: } else { michael@0: // Pin the capacity so that a limit pointer does not wrap around. michael@0: char *targetLimit = (char *)U_MAX_PTR(target); michael@0: // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff michael@0: // greater than target and does not wrap around the top of the address space. michael@0: capacity = (int32_t)(targetLimit - target); michael@0: } michael@0: michael@0: // create the converter michael@0: UConverter *converter; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: michael@0: // just write the NUL if the string length is 0 michael@0: if(length == 0) { michael@0: return u_terminateChars(target, capacity, 0, &status); michael@0: } michael@0: michael@0: // if the codepage is the default, use our cache michael@0: // if it is an empty string, then use the "invariant character" conversion michael@0: if (codepage == 0) { michael@0: const char *defaultName = ucnv_getDefaultName(); michael@0: if(UCNV_FAST_IS_UTF8(defaultName)) { michael@0: return toUTF8(start, length, target, capacity); michael@0: } michael@0: converter = u_getDefaultConverter(&status); michael@0: } else if (*codepage == 0) { michael@0: // use the "invariant characters" conversion michael@0: int32_t destLength; michael@0: if(length <= capacity) { michael@0: destLength = length; michael@0: } else { michael@0: destLength = capacity; michael@0: } michael@0: u_UCharsToChars(getArrayStart() + start, target, destLength); michael@0: return u_terminateChars(target, capacity, length, &status); michael@0: } else { michael@0: converter = ucnv_open(codepage, &status); michael@0: } michael@0: michael@0: length = doExtract(start, length, target, capacity, converter, status); michael@0: michael@0: // close the converter michael@0: if (codepage == 0) { michael@0: u_releaseDefaultConverter(converter); michael@0: } else { michael@0: ucnv_close(converter); michael@0: } michael@0: michael@0: return length; michael@0: } michael@0: michael@0: int32_t michael@0: UnicodeString::extract(char *dest, int32_t destCapacity, michael@0: UConverter *cnv, michael@0: UErrorCode &errorCode) const michael@0: { michael@0: if(U_FAILURE(errorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { michael@0: errorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // nothing to do? michael@0: if(isEmpty()) { michael@0: return u_terminateChars(dest, destCapacity, 0, &errorCode); michael@0: } michael@0: michael@0: // get the converter michael@0: UBool isDefaultConverter; michael@0: if(cnv==0) { michael@0: isDefaultConverter=TRUE; michael@0: cnv=u_getDefaultConverter(&errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return 0; michael@0: } michael@0: } else { michael@0: isDefaultConverter=FALSE; michael@0: ucnv_resetFromUnicode(cnv); michael@0: } michael@0: michael@0: // convert michael@0: int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); michael@0: michael@0: // release the converter michael@0: if(isDefaultConverter) { michael@0: u_releaseDefaultConverter(cnv); michael@0: } michael@0: michael@0: return len; michael@0: } michael@0: michael@0: int32_t michael@0: UnicodeString::doExtract(int32_t start, int32_t length, michael@0: char *dest, int32_t destCapacity, michael@0: UConverter *cnv, michael@0: UErrorCode &errorCode) const michael@0: { michael@0: if(U_FAILURE(errorCode)) { michael@0: if(destCapacity!=0) { michael@0: *dest=0; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: const UChar *src=getArrayStart()+start, *srcLimit=src+length; michael@0: char *originalDest=dest; michael@0: const char *destLimit; michael@0: michael@0: if(destCapacity==0) { michael@0: destLimit=dest=0; michael@0: } else if(destCapacity==-1) { michael@0: // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. michael@0: destLimit=(char*)U_MAX_PTR(dest); michael@0: // for NUL-termination, translate into highest int32_t michael@0: destCapacity=0x7fffffff; michael@0: } else { michael@0: destLimit=dest+destCapacity; michael@0: } michael@0: michael@0: // perform the conversion michael@0: ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); michael@0: length=(int32_t)(dest-originalDest); michael@0: michael@0: // if an overflow occurs, then get the preflighting length michael@0: if(errorCode==U_BUFFER_OVERFLOW_ERROR) { michael@0: char buffer[1024]; michael@0: michael@0: destLimit=buffer+sizeof(buffer); michael@0: do { michael@0: dest=buffer; michael@0: errorCode=U_ZERO_ERROR; michael@0: ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); michael@0: length+=(int32_t)(dest-buffer); michael@0: } while(errorCode==U_BUFFER_OVERFLOW_ERROR); michael@0: } michael@0: michael@0: return u_terminateChars(originalDest, destCapacity, length, &errorCode); michael@0: } michael@0: michael@0: void michael@0: UnicodeString::doCodepageCreate(const char *codepageData, michael@0: int32_t dataLength, michael@0: const char *codepage) michael@0: { michael@0: // if there's nothing to convert, do nothing michael@0: if(codepageData == 0 || dataLength == 0 || dataLength < -1) { michael@0: return; michael@0: } michael@0: if(dataLength == -1) { michael@0: dataLength = (int32_t)uprv_strlen(codepageData); michael@0: } michael@0: michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: michael@0: // create the converter michael@0: // if the codepage is the default, use our cache michael@0: // if it is an empty string, then use the "invariant character" conversion michael@0: UConverter *converter; michael@0: if (codepage == 0) { michael@0: const char *defaultName = ucnv_getDefaultName(); michael@0: if(UCNV_FAST_IS_UTF8(defaultName)) { michael@0: setToUTF8(StringPiece(codepageData, dataLength)); michael@0: return; michael@0: } michael@0: converter = u_getDefaultConverter(&status); michael@0: } else if(*codepage == 0) { michael@0: // use the "invariant characters" conversion michael@0: if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { michael@0: u_charsToUChars(codepageData, getArrayStart(), dataLength); michael@0: setLength(dataLength); michael@0: } else { michael@0: setToBogus(); michael@0: } michael@0: return; michael@0: } else { michael@0: converter = ucnv_open(codepage, &status); michael@0: } michael@0: michael@0: // if we failed, set the appropriate flags and return michael@0: if(U_FAILURE(status)) { michael@0: setToBogus(); michael@0: return; michael@0: } michael@0: michael@0: // perform the conversion michael@0: doCodepageCreate(codepageData, dataLength, converter, status); michael@0: if(U_FAILURE(status)) { michael@0: setToBogus(); michael@0: } michael@0: michael@0: // close the converter michael@0: if(codepage == 0) { michael@0: u_releaseDefaultConverter(converter); michael@0: } else { michael@0: ucnv_close(converter); michael@0: } michael@0: } michael@0: michael@0: void michael@0: UnicodeString::doCodepageCreate(const char *codepageData, michael@0: int32_t dataLength, michael@0: UConverter *converter, michael@0: UErrorCode &status) michael@0: { michael@0: if(U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: // set up the conversion parameters michael@0: const char *mySource = codepageData; michael@0: const char *mySourceEnd = mySource + dataLength; michael@0: UChar *array, *myTarget; michael@0: michael@0: // estimate the size needed: michael@0: int32_t arraySize; michael@0: if(dataLength <= US_STACKBUF_SIZE) { michael@0: // try to use the stack buffer michael@0: arraySize = US_STACKBUF_SIZE; michael@0: } else { michael@0: // 1.25 UChar's per source byte should cover most cases michael@0: arraySize = dataLength + (dataLength >> 2); michael@0: } michael@0: michael@0: // we do not care about the current contents michael@0: UBool doCopyArray = FALSE; michael@0: for(;;) { michael@0: if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { michael@0: setToBogus(); michael@0: break; michael@0: } michael@0: michael@0: // perform the conversion michael@0: array = getArrayStart(); michael@0: myTarget = array + length(); michael@0: ucnv_toUnicode(converter, &myTarget, array + getCapacity(), michael@0: &mySource, mySourceEnd, 0, TRUE, &status); michael@0: michael@0: // update the conversion parameters michael@0: setLength((int32_t)(myTarget - array)); michael@0: michael@0: // allocate more space and copy data, if needed michael@0: if(status == U_BUFFER_OVERFLOW_ERROR) { michael@0: // reset the error code michael@0: status = U_ZERO_ERROR; michael@0: michael@0: // keep the previous conversion results michael@0: doCopyArray = TRUE; michael@0: michael@0: // estimate the new size needed, larger than before michael@0: // try 2 UChar's per remaining source byte michael@0: arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif