intl/icu/source/common/ucnvsel.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2008-2011, International Business Machines
     5 *   Corporation, Google and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 */
     9 // Author : eldawy@google.com (Mohamed Eldawy)
    10 // ucnvsel.cpp
    11 //
    12 // Purpose: To generate a list of encodings capable of handling
    13 // a given Unicode text
    14 //
    15 // Started 09-April-2008
    17 /**
    18  * \file
    19  *
    20  * This is an implementation of an encoding selector.
    21  * The goal is, given a unicode string, find the encodings
    22  * this string can be mapped to. To make processing faster
    23  * a trie is built when you call ucnvsel_open() that
    24  * stores all encodings a codepoint can map to
    25  */
    27 #include "unicode/ucnvsel.h"
    29 #if !UCONFIG_NO_CONVERSION
    31 #include <string.h>
    33 #include "unicode/uchar.h"
    34 #include "unicode/uniset.h"
    35 #include "unicode/ucnv.h"
    36 #include "unicode/ustring.h"
    37 #include "unicode/uchriter.h"
    38 #include "utrie2.h"
    39 #include "propsvec.h"
    40 #include "uassert.h"
    41 #include "ucmndata.h"
    42 #include "uenumimp.h"
    43 #include "cmemory.h"
    44 #include "cstring.h"
    46 U_NAMESPACE_USE
    48 struct UConverterSelector {
    49   UTrie2 *trie;              // 16 bit trie containing offsets into pv
    50   uint32_t* pv;              // table of bits!
    51   int32_t pvCount;
    52   char** encodings;          // which encodings did user ask to use?
    53   int32_t encodingsCount;
    54   int32_t encodingStrLength;
    55   uint8_t* swapped;
    56   UBool ownPv, ownEncodingStrings;
    57 };
    59 static void generateSelectorData(UConverterSelector* result,
    60                                  UPropsVectors *upvec,
    61                                  const USet* excludedCodePoints,
    62                                  const UConverterUnicodeSet whichSet,
    63                                  UErrorCode* status) {
    64   if (U_FAILURE(*status)) {
    65     return;
    66   }
    68   int32_t columns = (result->encodingsCount+31)/32;
    70   // set errorValue to all-ones
    71   for (int32_t col = 0; col < columns; col++) {
    72     upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
    73                    col, ~0, ~0, status);
    74   }
    76   for (int32_t i = 0; i < result->encodingsCount; ++i) {
    77     uint32_t mask;
    78     uint32_t column;
    79     int32_t item_count;
    80     int32_t j;
    81     UConverter* test_converter = ucnv_open(result->encodings[i], status);
    82     if (U_FAILURE(*status)) {
    83       return;
    84     }
    85     USet* unicode_point_set;
    86     unicode_point_set = uset_open(1, 0);  // empty set
    88     ucnv_getUnicodeSet(test_converter, unicode_point_set,
    89                        whichSet, status);
    90     if (U_FAILURE(*status)) {
    91       ucnv_close(test_converter);
    92       return;
    93     }
    95     column = i / 32;
    96     mask = 1 << (i%32);
    97     // now iterate over intervals on set i!
    98     item_count = uset_getItemCount(unicode_point_set);
   100     for (j = 0; j < item_count; ++j) {
   101       UChar32 start_char;
   102       UChar32 end_char;
   103       UErrorCode smallStatus = U_ZERO_ERROR;
   104       uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
   105                    &smallStatus);
   106       if (U_FAILURE(smallStatus)) {
   107         // this will be reached for the converters that fill the set with
   108         // strings. Those should be ignored by our system
   109       } else {
   110         upvec_setValue(upvec, start_char, end_char, column, ~0, mask,
   111                        status);
   112       }
   113     }
   114     ucnv_close(test_converter);
   115     uset_close(unicode_point_set);
   116     if (U_FAILURE(*status)) {
   117       return;
   118     }
   119   }
   121   // handle excluded encodings! Simply set their values to all 1's in the upvec
   122   if (excludedCodePoints) {
   123     int32_t item_count = uset_getItemCount(excludedCodePoints);
   124     for (int32_t j = 0; j < item_count; ++j) {
   125       UChar32 start_char;
   126       UChar32 end_char;
   128       uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
   129                    status);
   130       for (int32_t col = 0; col < columns; col++) {
   131         upvec_setValue(upvec, start_char, end_char, col, ~0, ~0,
   132                       status);
   133       }
   134     }
   135   }
   137   // alright. Now, let's put things in the same exact form you'd get when you
   138   // unserialize things.
   139   result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
   140   result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
   141   result->pvCount *= columns;  // number of uint32_t = rows * columns
   142   result->ownPv = TRUE;
   143 }
   145 /* open a selector. If converterListSize is 0, build for all converters.
   146    If excludedCodePoints is NULL, don't exclude any codepoints */
   147 U_CAPI UConverterSelector* U_EXPORT2
   148 ucnvsel_open(const char* const*  converterList, int32_t converterListSize,
   149              const USet* excludedCodePoints,
   150              const UConverterUnicodeSet whichSet, UErrorCode* status) {
   151   // check if already failed
   152   if (U_FAILURE(*status)) {
   153     return NULL;
   154   }
   155   // ensure args make sense!
   156   if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
   157     *status = U_ILLEGAL_ARGUMENT_ERROR;
   158     return NULL;
   159   }
   161   // allocate a new converter
   162   LocalUConverterSelectorPointer newSelector(
   163     (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
   164   if (newSelector.isNull()) {
   165     *status = U_MEMORY_ALLOCATION_ERROR;
   166     return NULL;
   167   }
   168   uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));
   170   if (converterListSize == 0) {
   171     converterList = NULL;
   172     converterListSize = ucnv_countAvailable();
   173   }
   174   newSelector->encodings =
   175     (char**)uprv_malloc(converterListSize * sizeof(char*));
   176   if (!newSelector->encodings) {
   177     *status = U_MEMORY_ALLOCATION_ERROR;
   178     return NULL;
   179   }
   180   newSelector->encodings[0] = NULL;  // now we can call ucnvsel_close()
   182   // make a backup copy of the list of converters
   183   int32_t totalSize = 0;
   184   int32_t i;
   185   for (i = 0; i < converterListSize; i++) {
   186     totalSize +=
   187       (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1;
   188   }
   189   // 4-align the totalSize to 4-align the size of the serialized form
   190   int32_t encodingStrPadding = totalSize & 3;
   191   if (encodingStrPadding != 0) {
   192     encodingStrPadding = 4 - encodingStrPadding;
   193   }
   194   newSelector->encodingStrLength = totalSize += encodingStrPadding;
   195   char* allStrings = (char*) uprv_malloc(totalSize);
   196   if (!allStrings) {
   197     *status = U_MEMORY_ALLOCATION_ERROR;
   198     return NULL;
   199   }
   201   for (i = 0; i < converterListSize; i++) {
   202     newSelector->encodings[i] = allStrings;
   203     uprv_strcpy(newSelector->encodings[i],
   204                 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i));
   205     allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
   206   }
   207   while (encodingStrPadding > 0) {
   208     *allStrings++ = 0;
   209     --encodingStrPadding;
   210   }
   212   newSelector->ownEncodingStrings = TRUE;
   213   newSelector->encodingsCount = converterListSize;
   214   UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
   215   generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
   216   upvec_close(upvec);
   218   if (U_FAILURE(*status)) {
   219     return NULL;
   220   }
   222   return newSelector.orphan();
   223 }
   225 /* close opened selector */
   226 U_CAPI void U_EXPORT2
   227 ucnvsel_close(UConverterSelector *sel) {
   228   if (!sel) {
   229     return;
   230   }
   231   if (sel->ownEncodingStrings) {
   232     uprv_free(sel->encodings[0]);
   233   }
   234   uprv_free(sel->encodings);
   235   if (sel->ownPv) {
   236     uprv_free(sel->pv);
   237   }
   238   utrie2_close(sel->trie);
   239   uprv_free(sel->swapped);
   240   uprv_free(sel);
   241 }
   243 static const UDataInfo dataInfo = {
   244   sizeof(UDataInfo),
   245   0,
   247   U_IS_BIG_ENDIAN,
   248   U_CHARSET_FAMILY,
   249   U_SIZEOF_UCHAR,
   250   0,
   252   { 0x43, 0x53, 0x65, 0x6c },   /* dataFormat="CSel" */
   253   { 1, 0, 0, 0 },               /* formatVersion */
   254   { 0, 0, 0, 0 }                /* dataVersion */
   255 };
   257 enum {
   258   UCNVSEL_INDEX_TRIE_SIZE,      // trie size in bytes
   259   UCNVSEL_INDEX_PV_COUNT,       // number of uint32_t in the bit vectors
   260   UCNVSEL_INDEX_NAMES_COUNT,    // number of encoding names
   261   UCNVSEL_INDEX_NAMES_LENGTH,   // number of encoding name bytes including padding
   262   UCNVSEL_INDEX_SIZE = 15,      // bytes following the DataHeader
   263   UCNVSEL_INDEX_COUNT = 16
   264 };
   266 /*
   267  * Serialized form of a UConverterSelector, formatVersion 1:
   268  *
   269  * The serialized form begins with a standard ICU DataHeader with a UDataInfo
   270  * as the template above.
   271  * This is followed by:
   272  *   int32_t indexes[UCNVSEL_INDEX_COUNT];          // see index entry constants above
   273  *   serialized UTrie2;                             // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
   274  *   uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]];  // bit vectors
   275  *   char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]];  // NUL-terminated strings + padding
   276  */
   278 /* serialize a selector */
   279 U_CAPI int32_t U_EXPORT2
   280 ucnvsel_serialize(const UConverterSelector* sel,
   281                   void* buffer, int32_t bufferCapacity, UErrorCode* status) {
   282   // check if already failed
   283   if (U_FAILURE(*status)) {
   284     return 0;
   285   }
   286   // ensure args make sense!
   287   uint8_t *p = (uint8_t *)buffer;
   288   if (bufferCapacity < 0 ||
   289       (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
   290   ) {
   291     *status = U_ILLEGAL_ARGUMENT_ERROR;
   292     return 0;
   293   }
   294   // add up the size of the serialized form
   295   int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status);
   296   if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
   297     return 0;
   298   }
   299   *status = U_ZERO_ERROR;
   301   DataHeader header;
   302   uprv_memset(&header, 0, sizeof(header));
   303   header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
   304   header.dataHeader.magic1 = 0xda;
   305   header.dataHeader.magic2 = 0x27;
   306   uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));
   308   int32_t indexes[UCNVSEL_INDEX_COUNT] = {
   309     serializedTrieSize,
   310     sel->pvCount,
   311     sel->encodingsCount,
   312     sel->encodingStrLength
   313   };
   315   int32_t totalSize =
   316     header.dataHeader.headerSize +
   317     (int32_t)sizeof(indexes) +
   318     serializedTrieSize +
   319     sel->pvCount * 4 +
   320     sel->encodingStrLength;
   321   indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
   322   if (totalSize > bufferCapacity) {
   323     *status = U_BUFFER_OVERFLOW_ERROR;
   324     return totalSize;
   325   }
   326   // ok, save!
   327   int32_t length = header.dataHeader.headerSize;
   328   uprv_memcpy(p, &header, sizeof(header));
   329   uprv_memset(p + sizeof(header), 0, length - sizeof(header));
   330   p += length;
   332   length = (int32_t)sizeof(indexes);
   333   uprv_memcpy(p, indexes, length);
   334   p += length;
   336   utrie2_serialize(sel->trie, p, serializedTrieSize, status);
   337   p += serializedTrieSize;
   339   length = sel->pvCount * 4;
   340   uprv_memcpy(p, sel->pv, length);
   341   p += length;
   343   uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
   344   p += sel->encodingStrLength;
   346   return totalSize;
   347 }
   349 /**
   350  * swap a selector into the desired Endianness and Asciiness of
   351  * the system. Just as FYI, selectors are always saved in the format
   352  * of the system that created them. They are only converted if used
   353  * on another system. In other words, selectors created on different
   354  * system can be different even if the params are identical (endianness
   355  * and Asciiness differences only)
   356  *
   357  * @param ds pointer to data swapper containing swapping info
   358  * @param inData pointer to incoming data
   359  * @param length length of inData in bytes
   360  * @param outData pointer to output data. Capacity should
   361  *                be at least equal to capacity of inData
   362  * @param status an in/out ICU UErrorCode
   363  * @return 0 on failure, number of bytes swapped on success
   364  *         number of bytes swapped can be smaller than length
   365  */
   366 static int32_t
   367 ucnvsel_swap(const UDataSwapper *ds,
   368              const void *inData, int32_t length,
   369              void *outData, UErrorCode *status) {
   370   /* udata_swapDataHeader checks the arguments */
   371   int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status);
   372   if(U_FAILURE(*status)) {
   373     return 0;
   374   }
   376   /* check data format and format version */
   377   const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
   378   if(!(
   379     pInfo->dataFormat[0] == 0x43 &&  /* dataFormat="CSel" */
   380     pInfo->dataFormat[1] == 0x53 &&
   381     pInfo->dataFormat[2] == 0x65 &&
   382     pInfo->dataFormat[3] == 0x6c
   383   )) {
   384     udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n",
   385                      pInfo->dataFormat[0], pInfo->dataFormat[1],
   386                      pInfo->dataFormat[2], pInfo->dataFormat[3]);
   387     *status = U_INVALID_FORMAT_ERROR;
   388     return 0;
   389   }
   390   if(pInfo->formatVersion[0] != 1) {
   391     udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n",
   392                      pInfo->formatVersion[0]);
   393     *status = U_UNSUPPORTED_ERROR;
   394     return 0;
   395   }
   397   if(length >= 0) {
   398     length -= headerSize;
   399     if(length < 16*4) {
   400       udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n",
   401                        length);
   402       *status = U_INDEX_OUTOFBOUNDS_ERROR;
   403       return 0;
   404     }
   405   }
   407   const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
   408   uint8_t *outBytes = (uint8_t *)outData + headerSize;
   410   /* read the indexes */
   411   const int32_t *inIndexes = (const int32_t *)inBytes;
   412   int32_t indexes[16];
   413   int32_t i;
   414   for(i = 0; i < 16; ++i) {
   415     indexes[i] = udata_readInt32(ds, inIndexes[i]);
   416   }
   418   /* get the total length of the data */
   419   int32_t size = indexes[UCNVSEL_INDEX_SIZE];
   420   if(length >= 0) {
   421     if(length < size) {
   422       udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n",
   423                        length);
   424       *status = U_INDEX_OUTOFBOUNDS_ERROR;
   425       return 0;
   426     }
   428     /* copy the data for inaccessible bytes */
   429     if(inBytes != outBytes) {
   430       uprv_memcpy(outBytes, inBytes, size);
   431     }
   433     int32_t offset = 0, count;
   435     /* swap the int32_t indexes[] */
   436     count = UCNVSEL_INDEX_COUNT*4;
   437     ds->swapArray32(ds, inBytes, count, outBytes, status);
   438     offset += count;
   440     /* swap the UTrie2 */
   441     count = indexes[UCNVSEL_INDEX_TRIE_SIZE];
   442     utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status);
   443     offset += count;
   445     /* swap the uint32_t pv[] */
   446     count = indexes[UCNVSEL_INDEX_PV_COUNT]*4;
   447     ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status);
   448     offset += count;
   450     /* swap the encoding names */
   451     count = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
   452     ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status);
   453     offset += count;
   455     U_ASSERT(offset == size);
   456   }
   458   return headerSize + size;
   459 }
   461 /* unserialize a selector */
   462 U_CAPI UConverterSelector* U_EXPORT2
   463 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
   464   // check if already failed
   465   if (U_FAILURE(*status)) {
   466     return NULL;
   467   }
   468   // ensure args make sense!
   469   const uint8_t *p = (const uint8_t *)buffer;
   470   if (length <= 0 ||
   471       (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
   472   ) {
   473     *status = U_ILLEGAL_ARGUMENT_ERROR;
   474     return NULL;
   475   }
   476   // header
   477   if (length < 32) {
   478     // not even enough space for a minimal header
   479     *status = U_INDEX_OUTOFBOUNDS_ERROR;
   480     return NULL;
   481   }
   482   const DataHeader *pHeader = (const DataHeader *)p;
   483   if (!(
   484     pHeader->dataHeader.magic1==0xda &&
   485     pHeader->dataHeader.magic2==0x27 &&
   486     pHeader->info.dataFormat[0] == 0x43 &&
   487     pHeader->info.dataFormat[1] == 0x53 &&
   488     pHeader->info.dataFormat[2] == 0x65 &&
   489     pHeader->info.dataFormat[3] == 0x6c
   490   )) {
   491     /* header not valid or dataFormat not recognized */
   492     *status = U_INVALID_FORMAT_ERROR;
   493     return NULL;
   494   }
   495   if (pHeader->info.formatVersion[0] != 1) {
   496     *status = U_UNSUPPORTED_ERROR;
   497     return NULL;
   498   }
   499   uint8_t* swapped = NULL;
   500   if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
   501       pHeader->info.charsetFamily != U_CHARSET_FAMILY
   502   ) {
   503     // swap the data
   504     UDataSwapper *ds =
   505       udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
   506     int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status);
   507     if (U_FAILURE(*status)) {
   508       udata_closeSwapper(ds);
   509       return NULL;
   510     }
   511     if (length < totalSize) {
   512       udata_closeSwapper(ds);
   513       *status = U_INDEX_OUTOFBOUNDS_ERROR;
   514       return NULL;
   515     }
   516     swapped = (uint8_t*)uprv_malloc(totalSize);
   517     if (swapped == NULL) {
   518       udata_closeSwapper(ds);
   519       *status = U_MEMORY_ALLOCATION_ERROR;
   520       return NULL;
   521     }
   522     ucnvsel_swap(ds, p, length, swapped, status);
   523     udata_closeSwapper(ds);
   524     if (U_FAILURE(*status)) {
   525       uprv_free(swapped);
   526       return NULL;
   527     }
   528     p = swapped;
   529     pHeader = (const DataHeader *)p;
   530   }
   531   if (length < (pHeader->dataHeader.headerSize + 16 * 4)) {
   532     // not even enough space for the header and the indexes
   533     uprv_free(swapped);
   534     *status = U_INDEX_OUTOFBOUNDS_ERROR;
   535     return NULL;
   536   }
   537   p += pHeader->dataHeader.headerSize;
   538   length -= pHeader->dataHeader.headerSize;
   539   // indexes
   540   const int32_t *indexes = (const int32_t *)p;
   541   if (length < indexes[UCNVSEL_INDEX_SIZE]) {
   542     uprv_free(swapped);
   543     *status = U_INDEX_OUTOFBOUNDS_ERROR;
   544     return NULL;
   545   }
   546   p += UCNVSEL_INDEX_COUNT * 4;
   547   // create and populate the selector object
   548   UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
   549   char **encodings =
   550     (char **)uprv_malloc(
   551       indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
   552   if (sel == NULL || encodings == NULL) {
   553     uprv_free(swapped);
   554     uprv_free(sel);
   555     uprv_free(encodings);
   556     *status = U_MEMORY_ALLOCATION_ERROR;
   557     return NULL;
   558   }
   559   uprv_memset(sel, 0, sizeof(UConverterSelector));
   560   sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
   561   sel->encodings = encodings;
   562   sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
   563   sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
   564   sel->swapped = swapped;
   565   // trie
   566   sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
   567                                         p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL,
   568                                         status);
   569   p += indexes[UCNVSEL_INDEX_TRIE_SIZE];
   570   if (U_FAILURE(*status)) {
   571     ucnvsel_close(sel);
   572     return NULL;
   573   }
   574   // bit vectors
   575   sel->pv = (uint32_t *)p;
   576   p += sel->pvCount * 4;
   577   // encoding names
   578   char* s = (char*)p;
   579   for (int32_t i = 0; i < sel->encodingsCount; ++i) {
   580     sel->encodings[i] = s;
   581     s += uprv_strlen(s) + 1;
   582   }
   583   p += sel->encodingStrLength;
   585   return sel;
   586 }
   588 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just
   589 // iterate over the selected encodings
   590 struct Enumerator {
   591   int16_t* index;
   592   int16_t length;
   593   int16_t cur;
   594   const UConverterSelector* sel;
   595 };
   597 U_CDECL_BEGIN
   599 static void U_CALLCONV
   600 ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
   601   uprv_free(((Enumerator*)(enumerator->context))->index);
   602   uprv_free(enumerator->context);
   603   uprv_free(enumerator);
   604 }
   607 static int32_t U_CALLCONV
   608 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
   609   // check if already failed
   610   if (U_FAILURE(*status)) {
   611     return 0;
   612   }
   613   return ((Enumerator*)(enumerator->context))->length;
   614 }
   617 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
   618                                                  int32_t* resultLength,
   619                                                  UErrorCode* status) {
   620   // check if already failed
   621   if (U_FAILURE(*status)) {
   622     return NULL;
   623   }
   625   int16_t cur = ((Enumerator*)(enumerator->context))->cur;
   626   const UConverterSelector* sel;
   627   const char* result;
   628   if (cur >= ((Enumerator*)(enumerator->context))->length) {
   629     return NULL;
   630   }
   631   sel = ((Enumerator*)(enumerator->context))->sel;
   632   result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ];
   633   ((Enumerator*)(enumerator->context))->cur++;
   634   if (resultLength) {
   635     *resultLength = (int32_t)uprv_strlen(result);
   636   }
   637   return result;
   638 }
   640 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
   641                                            UErrorCode* status) {
   642   // check if already failed
   643   if (U_FAILURE(*status)) {
   644     return ;
   645   }
   646   ((Enumerator*)(enumerator->context))->cur = 0;
   647 }
   649 U_CDECL_END
   652 static const UEnumeration defaultEncodings = {
   653   NULL,
   654     NULL,
   655     ucnvsel_close_selector_iterator,
   656     ucnvsel_count_encodings,
   657     uenum_unextDefault,
   658     ucnvsel_next_encoding, 
   659     ucnvsel_reset_iterator
   660 };
   663 // internal fn to intersect two sets of masks
   664 // returns whether the mask has reduced to all zeros
   665 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
   666   int32_t i;
   667   uint32_t oredDest = 0;
   668   for (i = 0 ; i < len ; ++i) {
   669     oredDest |= (dest[i] &= source1[i]);
   670   }
   671   return oredDest == 0;
   672 }
   674 // internal fn to count how many 1's are there in a mask
   675 // algorithm taken from  http://graphics.stanford.edu/~seander/bithacks.html
   676 static int16_t countOnes(uint32_t* mask, int32_t len) {
   677   int32_t i, totalOnes = 0;
   678   for (i = 0 ; i < len ; ++i) {
   679     uint32_t ent = mask[i];
   680     for (; ent; totalOnes++)
   681     {
   682       ent &= ent - 1; // clear the least significant bit set
   683     }
   684   }
   685   return totalOnes;
   686 }
   689 /* internal function! */
   690 static UEnumeration *selectForMask(const UConverterSelector* sel,
   691                                    uint32_t *mask, UErrorCode *status) {
   692   // this is the context we will use. Store a table of indices to which
   693   // encodings are legit.
   694   struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator));
   695   if (result == NULL) {
   696     uprv_free(mask);
   697     *status = U_MEMORY_ALLOCATION_ERROR;
   698     return NULL;
   699   }
   700   result->index = NULL;  // this will be allocated later!
   701   result->length = result->cur = 0;
   702   result->sel = sel;
   704   UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
   705   if (en == NULL) {
   706     // TODO(markus): Combine Enumerator and UEnumeration into one struct.
   707     uprv_free(mask);
   708     uprv_free(result);
   709     *status = U_MEMORY_ALLOCATION_ERROR;
   710     return NULL;
   711   }
   712   memcpy(en, &defaultEncodings, sizeof(UEnumeration));
   713   en->context = result;
   715   int32_t columns = (sel->encodingsCount+31)/32;
   716   int16_t numOnes = countOnes(mask, columns);
   717   // now, we know the exact space we need for index
   718   if (numOnes > 0) {
   719     result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t));
   721     int32_t i, j;
   722     int16_t k = 0;
   723     for (j = 0 ; j < columns; j++) {
   724       uint32_t v = mask[j];
   725       for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) {
   726         if ((v & 1) != 0) {
   727           result->index[result->length++] = k;
   728         }
   729         v >>= 1;
   730       }
   731     }
   732   } //otherwise, index will remain NULL (and will never be touched by
   733     //the enumerator code anyway)
   734   uprv_free(mask);
   735   return en;
   736 }
   738 /* check a string against the selector - UTF16 version */
   739 U_CAPI UEnumeration * U_EXPORT2
   740 ucnvsel_selectForString(const UConverterSelector* sel,
   741                         const UChar *s, int32_t length, UErrorCode *status) {
   742   // check if already failed
   743   if (U_FAILURE(*status)) {
   744     return NULL;
   745   }
   746   // ensure args make sense!
   747   if (sel == NULL || (s == NULL && length != 0)) {
   748     *status = U_ILLEGAL_ARGUMENT_ERROR;
   749     return NULL;
   750   }
   752   int32_t columns = (sel->encodingsCount+31)/32;
   753   uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
   754   if (mask == NULL) {
   755     *status = U_MEMORY_ALLOCATION_ERROR;
   756     return NULL;
   757   }
   758   uprv_memset(mask, ~0, columns *4);
   760   if(s!=NULL) {
   761     const UChar *limit;
   762     if (length >= 0) {
   763       limit = s + length;
   764     } else {
   765       limit = NULL;
   766     }
   768     while (limit == NULL ? *s != 0 : s != limit) {
   769       UChar32 c;
   770       uint16_t pvIndex;
   771       UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex);
   772       if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
   773         break;
   774       }
   775     }
   776   }
   777   return selectForMask(sel, mask, status);
   778 }
   780 /* check a string against the selector - UTF8 version */
   781 U_CAPI UEnumeration * U_EXPORT2
   782 ucnvsel_selectForUTF8(const UConverterSelector* sel,
   783                       const char *s, int32_t length, UErrorCode *status) {
   784   // check if already failed
   785   if (U_FAILURE(*status)) {
   786     return NULL;
   787   }
   788   // ensure args make sense!
   789   if (sel == NULL || (s == NULL && length != 0)) {
   790     *status = U_ILLEGAL_ARGUMENT_ERROR;
   791     return NULL;
   792   }
   794   int32_t columns = (sel->encodingsCount+31)/32;
   795   uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
   796   if (mask == NULL) {
   797     *status = U_MEMORY_ALLOCATION_ERROR;
   798     return NULL;
   799   }
   800   uprv_memset(mask, ~0, columns *4);
   802   if (length < 0) {
   803     length = (int32_t)uprv_strlen(s);
   804   }
   806   if(s!=NULL) {
   807     const char *limit = s + length;
   809     while (s != limit) {
   810       uint16_t pvIndex;
   811       UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex);
   812       if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
   813         break;
   814       }
   815     }
   816   }
   817   return selectForMask(sel, mask, status);
   818 }
   820 #endif  // !UCONFIG_NO_CONVERSION

mercurial