michael@0: /*
michael@0: *******************************************************************************
michael@0: *
michael@0: *   Copyright (C) 1999-2010, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: *
michael@0: *******************************************************************************
michael@0: *   file name:  unistr_cnv.cpp
michael@0: *   encoding:   US-ASCII
michael@0: *   tab size:   8 (not used)
michael@0: *   indentation:2
michael@0: *
michael@0: *   created on: 2004aug19
michael@0: *   created by: Markus W. Scherer
michael@0: *
michael@0: *   Character conversion functions moved here from unistr.cpp
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_CONVERSION
michael@0: 
michael@0: #include "unicode/putil.h"
michael@0: #include "cstring.h"
michael@0: #include "cmemory.h"
michael@0: #include "unicode/ustring.h"
michael@0: #include "unicode/unistr.h"
michael@0: #include "unicode/ucnv.h"
michael@0: #include "ucnv_imp.h"
michael@0: #include "putilimp.h"
michael@0: #include "ustr_cnv.h"
michael@0: #include "ustr_imp.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: //========================================
michael@0: // Constructors
michael@0: //========================================
michael@0: 
michael@0: #if !U_CHARSET_IS_UTF8
michael@0: 
michael@0: UnicodeString::UnicodeString(const char *codepageData)
michael@0:   : fShortLength(0),
michael@0:     fFlags(kShortString)
michael@0: {
michael@0:     if(codepageData != 0) {
michael@0:         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
michael@0:     }
michael@0: }
michael@0: 
michael@0: UnicodeString::UnicodeString(const char *codepageData,
michael@0:                              int32_t dataLength)
michael@0:   : fShortLength(0),
michael@0:     fFlags(kShortString)
michael@0: {
michael@0:     if(codepageData != 0) {
michael@0:         doCodepageCreate(codepageData, dataLength, 0);
michael@0:     }
michael@0: }
michael@0: 
michael@0: // else see unistr.cpp
michael@0: #endif
michael@0: 
michael@0: UnicodeString::UnicodeString(const char *codepageData,
michael@0:                              const char *codepage)
michael@0:   : fShortLength(0),
michael@0:     fFlags(kShortString)
michael@0: {
michael@0:     if(codepageData != 0) {
michael@0:         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
michael@0:     }
michael@0: }
michael@0: 
michael@0: UnicodeString::UnicodeString(const char *codepageData,
michael@0:                              int32_t dataLength,
michael@0:                              const char *codepage)
michael@0:   : fShortLength(0),
michael@0:     fFlags(kShortString)
michael@0: {
michael@0:     if(codepageData != 0) {
michael@0:         doCodepageCreate(codepageData, dataLength, codepage);
michael@0:     }
michael@0: }
michael@0: 
michael@0: UnicodeString::UnicodeString(const char *src, int32_t srcLength,
michael@0:                              UConverter *cnv,
michael@0:                              UErrorCode &errorCode)
michael@0:   : fShortLength(0),
michael@0:     fFlags(kShortString)
michael@0: {
michael@0:     if(U_SUCCESS(errorCode)) {
michael@0:         // check arguments
michael@0:         if(src==NULL) {
michael@0:             // treat as an empty string, do nothing more
michael@0:         } else if(srcLength<-1) {
michael@0:             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         } else {
michael@0:             // get input length
michael@0:             if(srcLength==-1) {
michael@0:                 srcLength=(int32_t)uprv_strlen(src);
michael@0:             }
michael@0:             if(srcLength>0) {
michael@0:                 if(cnv!=0) {
michael@0:                     // use the provided converter
michael@0:                     ucnv_resetToUnicode(cnv);
michael@0:                     doCodepageCreate(src, srcLength, cnv, errorCode);
michael@0:                 } else {
michael@0:                     // use the default converter
michael@0:                     cnv=u_getDefaultConverter(&errorCode);
michael@0:                     doCodepageCreate(src, srcLength, cnv, errorCode);
michael@0:                     u_releaseDefaultConverter(cnv);
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         if(U_FAILURE(errorCode)) {
michael@0:             setToBogus();
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: //========================================
michael@0: // Codeset conversion
michael@0: //========================================
michael@0: 
michael@0: #if !U_CHARSET_IS_UTF8
michael@0: 
michael@0: int32_t
michael@0: UnicodeString::extract(int32_t start,
michael@0:                        int32_t length,
michael@0:                        char *target,
michael@0:                        uint32_t dstSize) const {
michael@0:     return extract(start, length, target, dstSize, 0);
michael@0: }
michael@0: 
michael@0: // else see unistr.cpp
michael@0: #endif
michael@0: 
michael@0: int32_t
michael@0: UnicodeString::extract(int32_t start,
michael@0:                        int32_t length,
michael@0:                        char *target,
michael@0:                        uint32_t dstSize,
michael@0:                        const char *codepage) const
michael@0: {
michael@0:     // if the arguments are illegal, then do nothing
michael@0:     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // pin the indices to legal values
michael@0:     pinIndices(start, length);
michael@0: 
michael@0:     // We need to cast dstSize to int32_t for all subsequent code.
michael@0:     // I don't know why the API was defined with uint32_t but we are stuck with it.
michael@0:     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
michael@0:     // as a limit in some functions, it may wrap around and yield a pointer
michael@0:     // that compares less-than target.
michael@0:     int32_t capacity;
michael@0:     if(dstSize < 0x7fffffff) {
michael@0:         // Assume that the capacity is real and a limit pointer won't wrap around.
michael@0:         capacity = (int32_t)dstSize;
michael@0:     } else {
michael@0:         // Pin the capacity so that a limit pointer does not wrap around.
michael@0:         char *targetLimit = (char *)U_MAX_PTR(target);
michael@0:         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
michael@0:         // greater than target and does not wrap around the top of the address space.
michael@0:         capacity = (int32_t)(targetLimit - target);
michael@0:     }
michael@0: 
michael@0:     // create the converter
michael@0:     UConverter *converter;
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0: 
michael@0:     // just write the NUL if the string length is 0
michael@0:     if(length == 0) {
michael@0:         return u_terminateChars(target, capacity, 0, &status);
michael@0:     }
michael@0: 
michael@0:     // if the codepage is the default, use our cache
michael@0:     // if it is an empty string, then use the "invariant character" conversion
michael@0:     if (codepage == 0) {
michael@0:         const char *defaultName = ucnv_getDefaultName();
michael@0:         if(UCNV_FAST_IS_UTF8(defaultName)) {
michael@0:             return toUTF8(start, length, target, capacity);
michael@0:         }
michael@0:         converter = u_getDefaultConverter(&status);
michael@0:     } else if (*codepage == 0) {
michael@0:         // use the "invariant characters" conversion
michael@0:         int32_t destLength;
michael@0:         if(length <= capacity) {
michael@0:             destLength = length;
michael@0:         } else {
michael@0:             destLength = capacity;
michael@0:         }
michael@0:         u_UCharsToChars(getArrayStart() + start, target, destLength);
michael@0:         return u_terminateChars(target, capacity, length, &status);
michael@0:     } else {
michael@0:         converter = ucnv_open(codepage, &status);
michael@0:     }
michael@0: 
michael@0:     length = doExtract(start, length, target, capacity, converter, status);
michael@0: 
michael@0:     // close the converter
michael@0:     if (codepage == 0) {
michael@0:         u_releaseDefaultConverter(converter);
michael@0:     } else {
michael@0:         ucnv_close(converter);
michael@0:     }
michael@0: 
michael@0:     return length;
michael@0: }
michael@0: 
michael@0: int32_t
michael@0: UnicodeString::extract(char *dest, int32_t destCapacity,
michael@0:                        UConverter *cnv,
michael@0:                        UErrorCode &errorCode) const
michael@0: {
michael@0:     if(U_FAILURE(errorCode)) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
michael@0:         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // nothing to do?
michael@0:     if(isEmpty()) {
michael@0:         return u_terminateChars(dest, destCapacity, 0, &errorCode);
michael@0:     }
michael@0: 
michael@0:     // get the converter
michael@0:     UBool isDefaultConverter;
michael@0:     if(cnv==0) {
michael@0:         isDefaultConverter=TRUE;
michael@0:         cnv=u_getDefaultConverter(&errorCode);
michael@0:         if(U_FAILURE(errorCode)) {
michael@0:             return 0;
michael@0:         }
michael@0:     } else {
michael@0:         isDefaultConverter=FALSE;
michael@0:         ucnv_resetFromUnicode(cnv);
michael@0:     }
michael@0: 
michael@0:     // convert
michael@0:     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
michael@0: 
michael@0:     // release the converter
michael@0:     if(isDefaultConverter) {
michael@0:         u_releaseDefaultConverter(cnv);
michael@0:     }
michael@0: 
michael@0:     return len;
michael@0: }
michael@0: 
michael@0: int32_t
michael@0: UnicodeString::doExtract(int32_t start, int32_t length,
michael@0:                          char *dest, int32_t destCapacity,
michael@0:                          UConverter *cnv,
michael@0:                          UErrorCode &errorCode) const
michael@0: {
michael@0:     if(U_FAILURE(errorCode)) {
michael@0:         if(destCapacity!=0) {
michael@0:             *dest=0;
michael@0:         }
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
michael@0:     char *originalDest=dest;
michael@0:     const char *destLimit;
michael@0: 
michael@0:     if(destCapacity==0) {
michael@0:         destLimit=dest=0;
michael@0:     } else if(destCapacity==-1) {
michael@0:         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
michael@0:         destLimit=(char*)U_MAX_PTR(dest);
michael@0:         // for NUL-termination, translate into highest int32_t
michael@0:         destCapacity=0x7fffffff;
michael@0:     } else {
michael@0:         destLimit=dest+destCapacity;
michael@0:     }
michael@0: 
michael@0:     // perform the conversion
michael@0:     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
michael@0:     length=(int32_t)(dest-originalDest);
michael@0: 
michael@0:     // if an overflow occurs, then get the preflighting length
michael@0:     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0:         char buffer[1024];
michael@0: 
michael@0:         destLimit=buffer+sizeof(buffer);
michael@0:         do {
michael@0:             dest=buffer;
michael@0:             errorCode=U_ZERO_ERROR;
michael@0:             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
michael@0:             length+=(int32_t)(dest-buffer);
michael@0:         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
michael@0:     }
michael@0: 
michael@0:     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
michael@0: }
michael@0: 
michael@0: void
michael@0: UnicodeString::doCodepageCreate(const char *codepageData,
michael@0:                                 int32_t dataLength,
michael@0:                                 const char *codepage)
michael@0: {
michael@0:     // if there's nothing to convert, do nothing
michael@0:     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
michael@0:         return;
michael@0:     }
michael@0:     if(dataLength == -1) {
michael@0:         dataLength = (int32_t)uprv_strlen(codepageData);
michael@0:     }
michael@0: 
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0: 
michael@0:     // create the converter
michael@0:     // if the codepage is the default, use our cache
michael@0:     // if it is an empty string, then use the "invariant character" conversion
michael@0:     UConverter *converter;
michael@0:     if (codepage == 0) {
michael@0:         const char *defaultName = ucnv_getDefaultName();
michael@0:         if(UCNV_FAST_IS_UTF8(defaultName)) {
michael@0:             setToUTF8(StringPiece(codepageData, dataLength));
michael@0:             return;
michael@0:         }
michael@0:         converter = u_getDefaultConverter(&status);
michael@0:     } else if(*codepage == 0) {
michael@0:         // use the "invariant characters" conversion
michael@0:         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
michael@0:             u_charsToUChars(codepageData, getArrayStart(), dataLength);
michael@0:             setLength(dataLength);
michael@0:         } else {
michael@0:             setToBogus();
michael@0:         }
michael@0:         return;
michael@0:     } else {
michael@0:         converter = ucnv_open(codepage, &status);
michael@0:     }
michael@0: 
michael@0:     // if we failed, set the appropriate flags and return
michael@0:     if(U_FAILURE(status)) {
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     // perform the conversion
michael@0:     doCodepageCreate(codepageData, dataLength, converter, status);
michael@0:     if(U_FAILURE(status)) {
michael@0:         setToBogus();
michael@0:     }
michael@0: 
michael@0:     // close the converter
michael@0:     if(codepage == 0) {
michael@0:         u_releaseDefaultConverter(converter);
michael@0:     } else {
michael@0:         ucnv_close(converter);
michael@0:     }
michael@0: }
michael@0: 
michael@0: void
michael@0: UnicodeString::doCodepageCreate(const char *codepageData,
michael@0:                                 int32_t dataLength,
michael@0:                                 UConverter *converter,
michael@0:                                 UErrorCode &status)
michael@0: {
michael@0:     if(U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     // set up the conversion parameters
michael@0:     const char *mySource     = codepageData;
michael@0:     const char *mySourceEnd  = mySource + dataLength;
michael@0:     UChar *array, *myTarget;
michael@0: 
michael@0:     // estimate the size needed:
michael@0:     int32_t arraySize;
michael@0:     if(dataLength <= US_STACKBUF_SIZE) {
michael@0:         // try to use the stack buffer
michael@0:         arraySize = US_STACKBUF_SIZE;
michael@0:     } else {
michael@0:         // 1.25 UChar's per source byte should cover most cases
michael@0:         arraySize = dataLength + (dataLength >> 2);
michael@0:     }
michael@0: 
michael@0:     // we do not care about the current contents
michael@0:     UBool doCopyArray = FALSE;
michael@0:     for(;;) {
michael@0:         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
michael@0:             setToBogus();
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         // perform the conversion
michael@0:         array = getArrayStart();
michael@0:         myTarget = array + length();
michael@0:         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
michael@0:             &mySource, mySourceEnd, 0, TRUE, &status);
michael@0: 
michael@0:         // update the conversion parameters
michael@0:         setLength((int32_t)(myTarget - array));
michael@0: 
michael@0:         // allocate more space and copy data, if needed
michael@0:         if(status == U_BUFFER_OVERFLOW_ERROR) {
michael@0:             // reset the error code
michael@0:             status = U_ZERO_ERROR;
michael@0: 
michael@0:             // keep the previous conversion results
michael@0:             doCopyArray = TRUE;
michael@0: 
michael@0:             // estimate the new size needed, larger than before
michael@0:             // try 2 UChar's per remaining source byte
michael@0:             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
michael@0:         } else {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: #endif