intl/icu/source/common/unistr_cnv.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 1999-2010, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  unistr_cnv.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:2
    12 *
    13 *   created on: 2004aug19
    14 *   created by: Markus W. Scherer
    15 *
    16 *   Character conversion functions moved here from unistr.cpp
    17 */
    19 #include "unicode/utypes.h"
    21 #if !UCONFIG_NO_CONVERSION
    23 #include "unicode/putil.h"
    24 #include "cstring.h"
    25 #include "cmemory.h"
    26 #include "unicode/ustring.h"
    27 #include "unicode/unistr.h"
    28 #include "unicode/ucnv.h"
    29 #include "ucnv_imp.h"
    30 #include "putilimp.h"
    31 #include "ustr_cnv.h"
    32 #include "ustr_imp.h"
    34 U_NAMESPACE_BEGIN
    36 //========================================
    37 // Constructors
    38 //========================================
    40 #if !U_CHARSET_IS_UTF8
    42 UnicodeString::UnicodeString(const char *codepageData)
    43   : fShortLength(0),
    44     fFlags(kShortString)
    45 {
    46     if(codepageData != 0) {
    47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
    48     }
    49 }
    51 UnicodeString::UnicodeString(const char *codepageData,
    52                              int32_t dataLength)
    53   : fShortLength(0),
    54     fFlags(kShortString)
    55 {
    56     if(codepageData != 0) {
    57         doCodepageCreate(codepageData, dataLength, 0);
    58     }
    59 }
    61 // else see unistr.cpp
    62 #endif
    64 UnicodeString::UnicodeString(const char *codepageData,
    65                              const char *codepage)
    66   : fShortLength(0),
    67     fFlags(kShortString)
    68 {
    69     if(codepageData != 0) {
    70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
    71     }
    72 }
    74 UnicodeString::UnicodeString(const char *codepageData,
    75                              int32_t dataLength,
    76                              const char *codepage)
    77   : fShortLength(0),
    78     fFlags(kShortString)
    79 {
    80     if(codepageData != 0) {
    81         doCodepageCreate(codepageData, dataLength, codepage);
    82     }
    83 }
    85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
    86                              UConverter *cnv,
    87                              UErrorCode &errorCode)
    88   : fShortLength(0),
    89     fFlags(kShortString)
    90 {
    91     if(U_SUCCESS(errorCode)) {
    92         // check arguments
    93         if(src==NULL) {
    94             // treat as an empty string, do nothing more
    95         } else if(srcLength<-1) {
    96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    97         } else {
    98             // get input length
    99             if(srcLength==-1) {
   100                 srcLength=(int32_t)uprv_strlen(src);
   101             }
   102             if(srcLength>0) {
   103                 if(cnv!=0) {
   104                     // use the provided converter
   105                     ucnv_resetToUnicode(cnv);
   106                     doCodepageCreate(src, srcLength, cnv, errorCode);
   107                 } else {
   108                     // use the default converter
   109                     cnv=u_getDefaultConverter(&errorCode);
   110                     doCodepageCreate(src, srcLength, cnv, errorCode);
   111                     u_releaseDefaultConverter(cnv);
   112                 }
   113             }
   114         }
   116         if(U_FAILURE(errorCode)) {
   117             setToBogus();
   118         }
   119     }
   120 }
   122 //========================================
   123 // Codeset conversion
   124 //========================================
   126 #if !U_CHARSET_IS_UTF8
   128 int32_t
   129 UnicodeString::extract(int32_t start,
   130                        int32_t length,
   131                        char *target,
   132                        uint32_t dstSize) const {
   133     return extract(start, length, target, dstSize, 0);
   134 }
   136 // else see unistr.cpp
   137 #endif
   139 int32_t
   140 UnicodeString::extract(int32_t start,
   141                        int32_t length,
   142                        char *target,
   143                        uint32_t dstSize,
   144                        const char *codepage) const
   145 {
   146     // if the arguments are illegal, then do nothing
   147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
   148         return 0;
   149     }
   151     // pin the indices to legal values
   152     pinIndices(start, length);
   154     // We need to cast dstSize to int32_t for all subsequent code.
   155     // I don't know why the API was defined with uint32_t but we are stuck with it.
   156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
   157     // as a limit in some functions, it may wrap around and yield a pointer
   158     // that compares less-than target.
   159     int32_t capacity;
   160     if(dstSize < 0x7fffffff) {
   161         // Assume that the capacity is real and a limit pointer won't wrap around.
   162         capacity = (int32_t)dstSize;
   163     } else {
   164         // Pin the capacity so that a limit pointer does not wrap around.
   165         char *targetLimit = (char *)U_MAX_PTR(target);
   166         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
   167         // greater than target and does not wrap around the top of the address space.
   168         capacity = (int32_t)(targetLimit - target);
   169     }
   171     // create the converter
   172     UConverter *converter;
   173     UErrorCode status = U_ZERO_ERROR;
   175     // just write the NUL if the string length is 0
   176     if(length == 0) {
   177         return u_terminateChars(target, capacity, 0, &status);
   178     }
   180     // if the codepage is the default, use our cache
   181     // if it is an empty string, then use the "invariant character" conversion
   182     if (codepage == 0) {
   183         const char *defaultName = ucnv_getDefaultName();
   184         if(UCNV_FAST_IS_UTF8(defaultName)) {
   185             return toUTF8(start, length, target, capacity);
   186         }
   187         converter = u_getDefaultConverter(&status);
   188     } else if (*codepage == 0) {
   189         // use the "invariant characters" conversion
   190         int32_t destLength;
   191         if(length <= capacity) {
   192             destLength = length;
   193         } else {
   194             destLength = capacity;
   195         }
   196         u_UCharsToChars(getArrayStart() + start, target, destLength);
   197         return u_terminateChars(target, capacity, length, &status);
   198     } else {
   199         converter = ucnv_open(codepage, &status);
   200     }
   202     length = doExtract(start, length, target, capacity, converter, status);
   204     // close the converter
   205     if (codepage == 0) {
   206         u_releaseDefaultConverter(converter);
   207     } else {
   208         ucnv_close(converter);
   209     }
   211     return length;
   212 }
   214 int32_t
   215 UnicodeString::extract(char *dest, int32_t destCapacity,
   216                        UConverter *cnv,
   217                        UErrorCode &errorCode) const
   218 {
   219     if(U_FAILURE(errorCode)) {
   220         return 0;
   221     }
   223     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
   224         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   225         return 0;
   226     }
   228     // nothing to do?
   229     if(isEmpty()) {
   230         return u_terminateChars(dest, destCapacity, 0, &errorCode);
   231     }
   233     // get the converter
   234     UBool isDefaultConverter;
   235     if(cnv==0) {
   236         isDefaultConverter=TRUE;
   237         cnv=u_getDefaultConverter(&errorCode);
   238         if(U_FAILURE(errorCode)) {
   239             return 0;
   240         }
   241     } else {
   242         isDefaultConverter=FALSE;
   243         ucnv_resetFromUnicode(cnv);
   244     }
   246     // convert
   247     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
   249     // release the converter
   250     if(isDefaultConverter) {
   251         u_releaseDefaultConverter(cnv);
   252     }
   254     return len;
   255 }
   257 int32_t
   258 UnicodeString::doExtract(int32_t start, int32_t length,
   259                          char *dest, int32_t destCapacity,
   260                          UConverter *cnv,
   261                          UErrorCode &errorCode) const
   262 {
   263     if(U_FAILURE(errorCode)) {
   264         if(destCapacity!=0) {
   265             *dest=0;
   266         }
   267         return 0;
   268     }
   270     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
   271     char *originalDest=dest;
   272     const char *destLimit;
   274     if(destCapacity==0) {
   275         destLimit=dest=0;
   276     } else if(destCapacity==-1) {
   277         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
   278         destLimit=(char*)U_MAX_PTR(dest);
   279         // for NUL-termination, translate into highest int32_t
   280         destCapacity=0x7fffffff;
   281     } else {
   282         destLimit=dest+destCapacity;
   283     }
   285     // perform the conversion
   286     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
   287     length=(int32_t)(dest-originalDest);
   289     // if an overflow occurs, then get the preflighting length
   290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
   291         char buffer[1024];
   293         destLimit=buffer+sizeof(buffer);
   294         do {
   295             dest=buffer;
   296             errorCode=U_ZERO_ERROR;
   297             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
   298             length+=(int32_t)(dest-buffer);
   299         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
   300     }
   302     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
   303 }
   305 void
   306 UnicodeString::doCodepageCreate(const char *codepageData,
   307                                 int32_t dataLength,
   308                                 const char *codepage)
   309 {
   310     // if there's nothing to convert, do nothing
   311     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
   312         return;
   313     }
   314     if(dataLength == -1) {
   315         dataLength = (int32_t)uprv_strlen(codepageData);
   316     }
   318     UErrorCode status = U_ZERO_ERROR;
   320     // create the converter
   321     // if the codepage is the default, use our cache
   322     // if it is an empty string, then use the "invariant character" conversion
   323     UConverter *converter;
   324     if (codepage == 0) {
   325         const char *defaultName = ucnv_getDefaultName();
   326         if(UCNV_FAST_IS_UTF8(defaultName)) {
   327             setToUTF8(StringPiece(codepageData, dataLength));
   328             return;
   329         }
   330         converter = u_getDefaultConverter(&status);
   331     } else if(*codepage == 0) {
   332         // use the "invariant characters" conversion
   333         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
   334             u_charsToUChars(codepageData, getArrayStart(), dataLength);
   335             setLength(dataLength);
   336         } else {
   337             setToBogus();
   338         }
   339         return;
   340     } else {
   341         converter = ucnv_open(codepage, &status);
   342     }
   344     // if we failed, set the appropriate flags and return
   345     if(U_FAILURE(status)) {
   346         setToBogus();
   347         return;
   348     }
   350     // perform the conversion
   351     doCodepageCreate(codepageData, dataLength, converter, status);
   352     if(U_FAILURE(status)) {
   353         setToBogus();
   354     }
   356     // close the converter
   357     if(codepage == 0) {
   358         u_releaseDefaultConverter(converter);
   359     } else {
   360         ucnv_close(converter);
   361     }
   362 }
   364 void
   365 UnicodeString::doCodepageCreate(const char *codepageData,
   366                                 int32_t dataLength,
   367                                 UConverter *converter,
   368                                 UErrorCode &status)
   369 {
   370     if(U_FAILURE(status)) {
   371         return;
   372     }
   374     // set up the conversion parameters
   375     const char *mySource     = codepageData;
   376     const char *mySourceEnd  = mySource + dataLength;
   377     UChar *array, *myTarget;
   379     // estimate the size needed:
   380     int32_t arraySize;
   381     if(dataLength <= US_STACKBUF_SIZE) {
   382         // try to use the stack buffer
   383         arraySize = US_STACKBUF_SIZE;
   384     } else {
   385         // 1.25 UChar's per source byte should cover most cases
   386         arraySize = dataLength + (dataLength >> 2);
   387     }
   389     // we do not care about the current contents
   390     UBool doCopyArray = FALSE;
   391     for(;;) {
   392         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
   393             setToBogus();
   394             break;
   395         }
   397         // perform the conversion
   398         array = getArrayStart();
   399         myTarget = array + length();
   400         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
   401             &mySource, mySourceEnd, 0, TRUE, &status);
   403         // update the conversion parameters
   404         setLength((int32_t)(myTarget - array));
   406         // allocate more space and copy data, if needed
   407         if(status == U_BUFFER_OVERFLOW_ERROR) {
   408             // reset the error code
   409             status = U_ZERO_ERROR;
   411             // keep the previous conversion results
   412             doCopyArray = TRUE;
   414             // estimate the new size needed, larger than before
   415             // try 2 UChar's per remaining source byte
   416             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
   417         } else {
   418             break;
   419         }
   420     }
   421 }
   423 U_NAMESPACE_END
   425 #endif

mercurial