The Tor Browser: intl/icu/source/common/unistr_cnv.cpp

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*

     2 *******************************************************************************

3 *

     4 *   Copyright (C) 1999-2010, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 *******************************************************************************

     8 *   file name:  unistr_cnv.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:2

    12 *

    13 *   created on: 2004aug19

    14 *   created by: Markus W. Scherer

    15 *

    16 *   Character conversion functions moved here from unistr.cpp

    17 */

    19 #include "unicode/utypes.h"

    21 #if !UCONFIG_NO_CONVERSION

    23 #include "unicode/putil.h"

    24 #include "cstring.h"

    25 #include "cmemory.h"

    26 #include "unicode/ustring.h"

    27 #include "unicode/unistr.h"

    28 #include "unicode/ucnv.h"

    29 #include "ucnv_imp.h"

    30 #include "putilimp.h"

    31 #include "ustr_cnv.h"

    32 #include "ustr_imp.h"

    34 U_NAMESPACE_BEGIN

    36 //========================================

    37 // Constructors

    38 //========================================

    40 #if !U_CHARSET_IS_UTF8

    42 UnicodeString::UnicodeString(const char *codepageData)

    43   : fShortLength(0),

    44     fFlags(kShortString)

    45 {

    46     if(codepageData != 0) {

    47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);

    48     }

    49 }

    51 UnicodeString::UnicodeString(const char *codepageData,

    52                              int32_t dataLength)

    53   : fShortLength(0),

    54     fFlags(kShortString)

    55 {

    56     if(codepageData != 0) {

    57         doCodepageCreate(codepageData, dataLength, 0);

    58     }

    59 }

    61 // else see unistr.cpp

    62 #endif

    64 UnicodeString::UnicodeString(const char *codepageData,

    65                              const char *codepage)

    66   : fShortLength(0),

    67     fFlags(kShortString)

    68 {

    69     if(codepageData != 0) {

    70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);

    71     }

    72 }

    74 UnicodeString::UnicodeString(const char *codepageData,

    75                              int32_t dataLength,

    76                              const char *codepage)

    77   : fShortLength(0),

    78     fFlags(kShortString)

    79 {

    80     if(codepageData != 0) {

    81         doCodepageCreate(codepageData, dataLength, codepage);

    82     }

    83 }

    85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,

    86                              UConverter *cnv,

    87                              UErrorCode &errorCode)

    88   : fShortLength(0),

    89     fFlags(kShortString)

    90 {

    91     if(U_SUCCESS(errorCode)) {

    92         // check arguments

    93         if(src==NULL) {

    94             // treat as an empty string, do nothing more

    95         } else if(srcLength<-1) {

    96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;

    97         } else {

    98             // get input length

    99             if(srcLength==-1) {

   100                 srcLength=(int32_t)uprv_strlen(src);

   101             }

   102             if(srcLength>0) {

   103                 if(cnv!=0) {

   104                     // use the provided converter

   105                     ucnv_resetToUnicode(cnv);

   106                     doCodepageCreate(src, srcLength, cnv, errorCode);

   107                 } else {

   108                     // use the default converter

   109                     cnv=u_getDefaultConverter(&errorCode);

   110                     doCodepageCreate(src, srcLength, cnv, errorCode);

   111                     u_releaseDefaultConverter(cnv);

   112                 }

   113             }

   114         }

   116         if(U_FAILURE(errorCode)) {

   117             setToBogus();

   118         }

   119     }

   120 }

   122 //========================================

   123 // Codeset conversion

   124 //========================================

   126 #if !U_CHARSET_IS_UTF8

   128 int32_t

   129 UnicodeString::extract(int32_t start,

   130                        int32_t length,

   131                        char *target,

   132                        uint32_t dstSize) const {

   133     return extract(start, length, target, dstSize, 0);

   134 }

   136 // else see unistr.cpp

   137 #endif

   139 int32_t

   140 UnicodeString::extract(int32_t start,

   141                        int32_t length,

   142                        char *target,

   143                        uint32_t dstSize,

   144                        const char *codepage) const

   145 {

   146     // if the arguments are illegal, then do nothing

   147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {

   148         return 0;

   149     }

   151     // pin the indices to legal values

   152     pinIndices(start, length);

   154     // We need to cast dstSize to int32_t for all subsequent code.

   155     // I don't know why the API was defined with uint32_t but we are stuck with it.

   156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize

   157     // as a limit in some functions, it may wrap around and yield a pointer

   158     // that compares less-than target.

   159     int32_t capacity;

   160     if(dstSize < 0x7fffffff) {

   161         // Assume that the capacity is real and a limit pointer won't wrap around.

   162         capacity = (int32_t)dstSize;

   163     } else {

   164         // Pin the capacity so that a limit pointer does not wrap around.

   165         char *targetLimit = (char *)U_MAX_PTR(target);

   166         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff

   167         // greater than target and does not wrap around the top of the address space.

   168         capacity = (int32_t)(targetLimit - target);

   169     }

   171     // create the converter

   172     UConverter *converter;

   173     UErrorCode status = U_ZERO_ERROR;

   175     // just write the NUL if the string length is 0

   176     if(length == 0) {

   177         return u_terminateChars(target, capacity, 0, &status);

   178     }

   180     // if the codepage is the default, use our cache

   181     // if it is an empty string, then use the "invariant character" conversion

   182     if (codepage == 0) {

   183         const char *defaultName = ucnv_getDefaultName();

   184         if(UCNV_FAST_IS_UTF8(defaultName)) {

   185             return toUTF8(start, length, target, capacity);

   186         }

   187         converter = u_getDefaultConverter(&status);

   188     } else if (*codepage == 0) {

   189         // use the "invariant characters" conversion

   190         int32_t destLength;

   191         if(length <= capacity) {

   192             destLength = length;

   193         } else {

   194             destLength = capacity;

   195         }

   196         u_UCharsToChars(getArrayStart() + start, target, destLength);

   197         return u_terminateChars(target, capacity, length, &status);

   198     } else {

   199         converter = ucnv_open(codepage, &status);

   200     }

   202     length = doExtract(start, length, target, capacity, converter, status);

   204     // close the converter

   205     if (codepage == 0) {

   206         u_releaseDefaultConverter(converter);

   207     } else {

   208         ucnv_close(converter);

   209     }

   211     return length;

   212 }

   214 int32_t

   215 UnicodeString::extract(char *dest, int32_t destCapacity,

   216                        UConverter *cnv,

   217                        UErrorCode &errorCode) const

   218 {

   219     if(U_FAILURE(errorCode)) {

   220         return 0;

   221     }

   223     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {

   224         errorCode=U_ILLEGAL_ARGUMENT_ERROR;

   225         return 0;

   226     }

   228     // nothing to do?

   229     if(isEmpty()) {

   230         return u_terminateChars(dest, destCapacity, 0, &errorCode);

   231     }

   233     // get the converter

   234     UBool isDefaultConverter;

   235     if(cnv==0) {

   236         isDefaultConverter=TRUE;

   237         cnv=u_getDefaultConverter(&errorCode);

   238         if(U_FAILURE(errorCode)) {

   239             return 0;

   240         }

   241     } else {

   242         isDefaultConverter=FALSE;

   243         ucnv_resetFromUnicode(cnv);

   244     }

   246     // convert

   247     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);

   249     // release the converter

   250     if(isDefaultConverter) {

   251         u_releaseDefaultConverter(cnv);

   252     }

   254     return len;

   255 }

   257 int32_t

   258 UnicodeString::doExtract(int32_t start, int32_t length,

   259                          char *dest, int32_t destCapacity,

   260                          UConverter *cnv,

   261                          UErrorCode &errorCode) const

   262 {

   263     if(U_FAILURE(errorCode)) {

   264         if(destCapacity!=0) {

   265             *dest=0;

   266         }

   267         return 0;

   268     }

   270     const UChar *src=getArrayStart()+start, *srcLimit=src+length;

   271     char *originalDest=dest;

   272     const char *destLimit;

   274     if(destCapacity==0) {

   275         destLimit=dest=0;

   276     } else if(destCapacity==-1) {

   277         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.

   278         destLimit=(char*)U_MAX_PTR(dest);

   279         // for NUL-termination, translate into highest int32_t

   280         destCapacity=0x7fffffff;

   281     } else {

   282         destLimit=dest+destCapacity;

   283     }

   285     // perform the conversion

   286     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);

   287     length=(int32_t)(dest-originalDest);

   289     // if an overflow occurs, then get the preflighting length

   290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

   291         char buffer[1024];

   293         destLimit=buffer+sizeof(buffer);

   294         do {

   295             dest=buffer;

   296             errorCode=U_ZERO_ERROR;

   297             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);

   298             length+=(int32_t)(dest-buffer);

   299         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);

   300     }

   302     return u_terminateChars(originalDest, destCapacity, length, &errorCode);

   303 }

   305 void

   306 UnicodeString::doCodepageCreate(const char *codepageData,

   307                                 int32_t dataLength,

   308                                 const char *codepage)

   309 {

   310     // if there's nothing to convert, do nothing

   311     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {

   312         return;

   313     }

   314     if(dataLength == -1) {

   315         dataLength = (int32_t)uprv_strlen(codepageData);

   316     }

   318     UErrorCode status = U_ZERO_ERROR;

   320     // create the converter

   321     // if the codepage is the default, use our cache

   322     // if it is an empty string, then use the "invariant character" conversion

   323     UConverter *converter;

   324     if (codepage == 0) {

   325         const char *defaultName = ucnv_getDefaultName();

   326         if(UCNV_FAST_IS_UTF8(defaultName)) {

   327             setToUTF8(StringPiece(codepageData, dataLength));

   328             return;

   329         }

   330         converter = u_getDefaultConverter(&status);

   331     } else if(*codepage == 0) {

   332         // use the "invariant characters" conversion

   333         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {

   334             u_charsToUChars(codepageData, getArrayStart(), dataLength);

   335             setLength(dataLength);

   336         } else {

   337             setToBogus();

   338         }

   339         return;

   340     } else {

   341         converter = ucnv_open(codepage, &status);

   342     }

   344     // if we failed, set the appropriate flags and return

   345     if(U_FAILURE(status)) {

   346         setToBogus();

   347         return;

   348     }

   350     // perform the conversion

   351     doCodepageCreate(codepageData, dataLength, converter, status);

   352     if(U_FAILURE(status)) {

   353         setToBogus();

   354     }

   356     // close the converter

   357     if(codepage == 0) {

   358         u_releaseDefaultConverter(converter);

   359     } else {

   360         ucnv_close(converter);

   361     }

   362 }

   364 void

   365 UnicodeString::doCodepageCreate(const char *codepageData,

   366                                 int32_t dataLength,

   367                                 UConverter *converter,

   368                                 UErrorCode &status)

   369 {

   370     if(U_FAILURE(status)) {

   371         return;

   372     }

   374     // set up the conversion parameters

   375     const char *mySource     = codepageData;

   376     const char *mySourceEnd  = mySource + dataLength;

   377     UChar *array, *myTarget;

   379     // estimate the size needed:

   380     int32_t arraySize;

   381     if(dataLength <= US_STACKBUF_SIZE) {

   382         // try to use the stack buffer

   383         arraySize = US_STACKBUF_SIZE;

   384     } else {

   385         // 1.25 UChar's per source byte should cover most cases

   386         arraySize = dataLength + (dataLength >> 2);

   387     }

   389     // we do not care about the current contents

   390     UBool doCopyArray = FALSE;

   391     for(;;) {

   392         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {

   393             setToBogus();

   394             break;

   395         }

   397         // perform the conversion

   398         array = getArrayStart();

   399         myTarget = array + length();

   400         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),

   401             &mySource, mySourceEnd, 0, TRUE, &status);

   403         // update the conversion parameters

   404         setLength((int32_t)(myTarget - array));

   406         // allocate more space and copy data, if needed

   407         if(status == U_BUFFER_OVERFLOW_ERROR) {

   408             // reset the error code

   409             status = U_ZERO_ERROR;

   411             // keep the previous conversion results

   412             doCopyArray = TRUE;

   414             // estimate the new size needed, larger than before

   415             // try 2 UChar's per remaining source byte

   416             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));

   417         } else {

   418             break;

   419         }

   420     }

   421 }

   423 U_NAMESPACE_END

   425 #endif

The Tor Browser / file revision

intl/icu/source/common/unistr_cnv.cpp@6474c204b198

intl/icu/source/common/unistr_cnv.cpp