intl/icu/source/common/ucnvsel.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnvsel.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,820 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2008-2011, International Business Machines
     1.8 +*   Corporation, Google and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*/
    1.12 +// Author : eldawy@google.com (Mohamed Eldawy)
    1.13 +// ucnvsel.cpp
    1.14 +//
    1.15 +// Purpose: To generate a list of encodings capable of handling
    1.16 +// a given Unicode text
    1.17 +//
    1.18 +// Started 09-April-2008
    1.19 +
    1.20 +/**
    1.21 + * \file
    1.22 + *
    1.23 + * This is an implementation of an encoding selector.
    1.24 + * The goal is, given a unicode string, find the encodings
    1.25 + * this string can be mapped to. To make processing faster
    1.26 + * a trie is built when you call ucnvsel_open() that
    1.27 + * stores all encodings a codepoint can map to
    1.28 + */
    1.29 +
    1.30 +#include "unicode/ucnvsel.h"
    1.31 +
    1.32 +#if !UCONFIG_NO_CONVERSION
    1.33 +
    1.34 +#include <string.h>
    1.35 +
    1.36 +#include "unicode/uchar.h"
    1.37 +#include "unicode/uniset.h"
    1.38 +#include "unicode/ucnv.h"
    1.39 +#include "unicode/ustring.h"
    1.40 +#include "unicode/uchriter.h"
    1.41 +#include "utrie2.h"
    1.42 +#include "propsvec.h"
    1.43 +#include "uassert.h"
    1.44 +#include "ucmndata.h"
    1.45 +#include "uenumimp.h"
    1.46 +#include "cmemory.h"
    1.47 +#include "cstring.h"
    1.48 +
    1.49 +U_NAMESPACE_USE
    1.50 +
    1.51 +struct UConverterSelector {
    1.52 +  UTrie2 *trie;              // 16 bit trie containing offsets into pv
    1.53 +  uint32_t* pv;              // table of bits!
    1.54 +  int32_t pvCount;
    1.55 +  char** encodings;          // which encodings did user ask to use?
    1.56 +  int32_t encodingsCount;
    1.57 +  int32_t encodingStrLength;
    1.58 +  uint8_t* swapped;
    1.59 +  UBool ownPv, ownEncodingStrings;
    1.60 +};
    1.61 +
    1.62 +static void generateSelectorData(UConverterSelector* result,
    1.63 +                                 UPropsVectors *upvec,
    1.64 +                                 const USet* excludedCodePoints,
    1.65 +                                 const UConverterUnicodeSet whichSet,
    1.66 +                                 UErrorCode* status) {
    1.67 +  if (U_FAILURE(*status)) {
    1.68 +    return;
    1.69 +  }
    1.70 +
    1.71 +  int32_t columns = (result->encodingsCount+31)/32;
    1.72 +
    1.73 +  // set errorValue to all-ones
    1.74 +  for (int32_t col = 0; col < columns; col++) {
    1.75 +    upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
    1.76 +                   col, ~0, ~0, status);
    1.77 +  }
    1.78 +
    1.79 +  for (int32_t i = 0; i < result->encodingsCount; ++i) {
    1.80 +    uint32_t mask;
    1.81 +    uint32_t column;
    1.82 +    int32_t item_count;
    1.83 +    int32_t j;
    1.84 +    UConverter* test_converter = ucnv_open(result->encodings[i], status);
    1.85 +    if (U_FAILURE(*status)) {
    1.86 +      return;
    1.87 +    }
    1.88 +    USet* unicode_point_set;
    1.89 +    unicode_point_set = uset_open(1, 0);  // empty set
    1.90 +
    1.91 +    ucnv_getUnicodeSet(test_converter, unicode_point_set,
    1.92 +                       whichSet, status);
    1.93 +    if (U_FAILURE(*status)) {
    1.94 +      ucnv_close(test_converter);
    1.95 +      return;
    1.96 +    }
    1.97 +
    1.98 +    column = i / 32;
    1.99 +    mask = 1 << (i%32);
   1.100 +    // now iterate over intervals on set i!
   1.101 +    item_count = uset_getItemCount(unicode_point_set);
   1.102 +
   1.103 +    for (j = 0; j < item_count; ++j) {
   1.104 +      UChar32 start_char;
   1.105 +      UChar32 end_char;
   1.106 +      UErrorCode smallStatus = U_ZERO_ERROR;
   1.107 +      uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
   1.108 +                   &smallStatus);
   1.109 +      if (U_FAILURE(smallStatus)) {
   1.110 +        // this will be reached for the converters that fill the set with
   1.111 +        // strings. Those should be ignored by our system
   1.112 +      } else {
   1.113 +        upvec_setValue(upvec, start_char, end_char, column, ~0, mask,
   1.114 +                       status);
   1.115 +      }
   1.116 +    }
   1.117 +    ucnv_close(test_converter);
   1.118 +    uset_close(unicode_point_set);
   1.119 +    if (U_FAILURE(*status)) {
   1.120 +      return;
   1.121 +    }
   1.122 +  }
   1.123 +
   1.124 +  // handle excluded encodings! Simply set their values to all 1's in the upvec
   1.125 +  if (excludedCodePoints) {
   1.126 +    int32_t item_count = uset_getItemCount(excludedCodePoints);
   1.127 +    for (int32_t j = 0; j < item_count; ++j) {
   1.128 +      UChar32 start_char;
   1.129 +      UChar32 end_char;
   1.130 +
   1.131 +      uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
   1.132 +                   status);
   1.133 +      for (int32_t col = 0; col < columns; col++) {
   1.134 +        upvec_setValue(upvec, start_char, end_char, col, ~0, ~0,
   1.135 +                      status);
   1.136 +      }
   1.137 +    }
   1.138 +  }
   1.139 +
   1.140 +  // alright. Now, let's put things in the same exact form you'd get when you
   1.141 +  // unserialize things.
   1.142 +  result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
   1.143 +  result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
   1.144 +  result->pvCount *= columns;  // number of uint32_t = rows * columns
   1.145 +  result->ownPv = TRUE;
   1.146 +}
   1.147 +
   1.148 +/* open a selector. If converterListSize is 0, build for all converters.
   1.149 +   If excludedCodePoints is NULL, don't exclude any codepoints */
   1.150 +U_CAPI UConverterSelector* U_EXPORT2
   1.151 +ucnvsel_open(const char* const*  converterList, int32_t converterListSize,
   1.152 +             const USet* excludedCodePoints,
   1.153 +             const UConverterUnicodeSet whichSet, UErrorCode* status) {
   1.154 +  // check if already failed
   1.155 +  if (U_FAILURE(*status)) {
   1.156 +    return NULL;
   1.157 +  }
   1.158 +  // ensure args make sense!
   1.159 +  if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
   1.160 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.161 +    return NULL;
   1.162 +  }
   1.163 +
   1.164 +  // allocate a new converter
   1.165 +  LocalUConverterSelectorPointer newSelector(
   1.166 +    (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
   1.167 +  if (newSelector.isNull()) {
   1.168 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.169 +    return NULL;
   1.170 +  }
   1.171 +  uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));
   1.172 +
   1.173 +  if (converterListSize == 0) {
   1.174 +    converterList = NULL;
   1.175 +    converterListSize = ucnv_countAvailable();
   1.176 +  }
   1.177 +  newSelector->encodings =
   1.178 +    (char**)uprv_malloc(converterListSize * sizeof(char*));
   1.179 +  if (!newSelector->encodings) {
   1.180 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.181 +    return NULL;
   1.182 +  }
   1.183 +  newSelector->encodings[0] = NULL;  // now we can call ucnvsel_close()
   1.184 +
   1.185 +  // make a backup copy of the list of converters
   1.186 +  int32_t totalSize = 0;
   1.187 +  int32_t i;
   1.188 +  for (i = 0; i < converterListSize; i++) {
   1.189 +    totalSize +=
   1.190 +      (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1;
   1.191 +  }
   1.192 +  // 4-align the totalSize to 4-align the size of the serialized form
   1.193 +  int32_t encodingStrPadding = totalSize & 3;
   1.194 +  if (encodingStrPadding != 0) {
   1.195 +    encodingStrPadding = 4 - encodingStrPadding;
   1.196 +  }
   1.197 +  newSelector->encodingStrLength = totalSize += encodingStrPadding;
   1.198 +  char* allStrings = (char*) uprv_malloc(totalSize);
   1.199 +  if (!allStrings) {
   1.200 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.201 +    return NULL;
   1.202 +  }
   1.203 +
   1.204 +  for (i = 0; i < converterListSize; i++) {
   1.205 +    newSelector->encodings[i] = allStrings;
   1.206 +    uprv_strcpy(newSelector->encodings[i],
   1.207 +                converterList != NULL ? converterList[i] : ucnv_getAvailableName(i));
   1.208 +    allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
   1.209 +  }
   1.210 +  while (encodingStrPadding > 0) {
   1.211 +    *allStrings++ = 0;
   1.212 +    --encodingStrPadding;
   1.213 +  }
   1.214 +
   1.215 +  newSelector->ownEncodingStrings = TRUE;
   1.216 +  newSelector->encodingsCount = converterListSize;
   1.217 +  UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
   1.218 +  generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
   1.219 +  upvec_close(upvec);
   1.220 +
   1.221 +  if (U_FAILURE(*status)) {
   1.222 +    return NULL;
   1.223 +  }
   1.224 +
   1.225 +  return newSelector.orphan();
   1.226 +}
   1.227 +
   1.228 +/* close opened selector */
   1.229 +U_CAPI void U_EXPORT2
   1.230 +ucnvsel_close(UConverterSelector *sel) {
   1.231 +  if (!sel) {
   1.232 +    return;
   1.233 +  }
   1.234 +  if (sel->ownEncodingStrings) {
   1.235 +    uprv_free(sel->encodings[0]);
   1.236 +  }
   1.237 +  uprv_free(sel->encodings);
   1.238 +  if (sel->ownPv) {
   1.239 +    uprv_free(sel->pv);
   1.240 +  }
   1.241 +  utrie2_close(sel->trie);
   1.242 +  uprv_free(sel->swapped);
   1.243 +  uprv_free(sel);
   1.244 +}
   1.245 +
   1.246 +static const UDataInfo dataInfo = {
   1.247 +  sizeof(UDataInfo),
   1.248 +  0,
   1.249 +
   1.250 +  U_IS_BIG_ENDIAN,
   1.251 +  U_CHARSET_FAMILY,
   1.252 +  U_SIZEOF_UCHAR,
   1.253 +  0,
   1.254 +
   1.255 +  { 0x43, 0x53, 0x65, 0x6c },   /* dataFormat="CSel" */
   1.256 +  { 1, 0, 0, 0 },               /* formatVersion */
   1.257 +  { 0, 0, 0, 0 }                /* dataVersion */
   1.258 +};
   1.259 +
   1.260 +enum {
   1.261 +  UCNVSEL_INDEX_TRIE_SIZE,      // trie size in bytes
   1.262 +  UCNVSEL_INDEX_PV_COUNT,       // number of uint32_t in the bit vectors
   1.263 +  UCNVSEL_INDEX_NAMES_COUNT,    // number of encoding names
   1.264 +  UCNVSEL_INDEX_NAMES_LENGTH,   // number of encoding name bytes including padding
   1.265 +  UCNVSEL_INDEX_SIZE = 15,      // bytes following the DataHeader
   1.266 +  UCNVSEL_INDEX_COUNT = 16
   1.267 +};
   1.268 +
/*
 * Serialized form of a UConverterSelector, formatVersion 1:
 *
 * The serialized form begins with a standard ICU DataHeader with a UDataInfo
 * as the template above.
 * This is followed by:
 *   int32_t indexes[UCNVSEL_INDEX_COUNT];          // see index entry constants above
 *   serialized UTrie2;                             // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
 *   uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]];  // bit vectors
 *   char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]];  // NUL-terminated strings + padding
 */
   1.280 +
   1.281 +/* serialize a selector */
   1.282 +U_CAPI int32_t U_EXPORT2
   1.283 +ucnvsel_serialize(const UConverterSelector* sel,
   1.284 +                  void* buffer, int32_t bufferCapacity, UErrorCode* status) {
   1.285 +  // check if already failed
   1.286 +  if (U_FAILURE(*status)) {
   1.287 +    return 0;
   1.288 +  }
   1.289 +  // ensure args make sense!
   1.290 +  uint8_t *p = (uint8_t *)buffer;
   1.291 +  if (bufferCapacity < 0 ||
   1.292 +      (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
   1.293 +  ) {
   1.294 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.295 +    return 0;
   1.296 +  }
   1.297 +  // add up the size of the serialized form
   1.298 +  int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status);
   1.299 +  if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
   1.300 +    return 0;
   1.301 +  }
   1.302 +  *status = U_ZERO_ERROR;
   1.303 +
   1.304 +  DataHeader header;
   1.305 +  uprv_memset(&header, 0, sizeof(header));
   1.306 +  header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
   1.307 +  header.dataHeader.magic1 = 0xda;
   1.308 +  header.dataHeader.magic2 = 0x27;
   1.309 +  uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));
   1.310 +
   1.311 +  int32_t indexes[UCNVSEL_INDEX_COUNT] = {
   1.312 +    serializedTrieSize,
   1.313 +    sel->pvCount,
   1.314 +    sel->encodingsCount,
   1.315 +    sel->encodingStrLength
   1.316 +  };
   1.317 +
   1.318 +  int32_t totalSize =
   1.319 +    header.dataHeader.headerSize +
   1.320 +    (int32_t)sizeof(indexes) +
   1.321 +    serializedTrieSize +
   1.322 +    sel->pvCount * 4 +
   1.323 +    sel->encodingStrLength;
   1.324 +  indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
   1.325 +  if (totalSize > bufferCapacity) {
   1.326 +    *status = U_BUFFER_OVERFLOW_ERROR;
   1.327 +    return totalSize;
   1.328 +  }
   1.329 +  // ok, save!
   1.330 +  int32_t length = header.dataHeader.headerSize;
   1.331 +  uprv_memcpy(p, &header, sizeof(header));
   1.332 +  uprv_memset(p + sizeof(header), 0, length - sizeof(header));
   1.333 +  p += length;
   1.334 +
   1.335 +  length = (int32_t)sizeof(indexes);
   1.336 +  uprv_memcpy(p, indexes, length);
   1.337 +  p += length;
   1.338 +
   1.339 +  utrie2_serialize(sel->trie, p, serializedTrieSize, status);
   1.340 +  p += serializedTrieSize;
   1.341 +
   1.342 +  length = sel->pvCount * 4;
   1.343 +  uprv_memcpy(p, sel->pv, length);
   1.344 +  p += length;
   1.345 +
   1.346 +  uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
   1.347 +  p += sel->encodingStrLength;
   1.348 +
   1.349 +  return totalSize;
   1.350 +}
   1.351 +
   1.352 +/**
   1.353 + * swap a selector into the desired Endianness and Asciiness of
   1.354 + * the system. Just as FYI, selectors are always saved in the format
   1.355 + * of the system that created them. They are only converted if used
   1.356 + * on another system. In other words, selectors created on different
   1.357 + * system can be different even if the params are identical (endianness
   1.358 + * and Asciiness differences only)
   1.359 + *
   1.360 + * @param ds pointer to data swapper containing swapping info
   1.361 + * @param inData pointer to incoming data
   1.362 + * @param length length of inData in bytes
   1.363 + * @param outData pointer to output data. Capacity should
   1.364 + *                be at least equal to capacity of inData
   1.365 + * @param status an in/out ICU UErrorCode
   1.366 + * @return 0 on failure, number of bytes swapped on success
   1.367 + *         number of bytes swapped can be smaller than length
   1.368 + */
   1.369 +static int32_t
   1.370 +ucnvsel_swap(const UDataSwapper *ds,
   1.371 +             const void *inData, int32_t length,
   1.372 +             void *outData, UErrorCode *status) {
   1.373 +  /* udata_swapDataHeader checks the arguments */
   1.374 +  int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status);
   1.375 +  if(U_FAILURE(*status)) {
   1.376 +    return 0;
   1.377 +  }
   1.378 +
   1.379 +  /* check data format and format version */
   1.380 +  const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
   1.381 +  if(!(
   1.382 +    pInfo->dataFormat[0] == 0x43 &&  /* dataFormat="CSel" */
   1.383 +    pInfo->dataFormat[1] == 0x53 &&
   1.384 +    pInfo->dataFormat[2] == 0x65 &&
   1.385 +    pInfo->dataFormat[3] == 0x6c
   1.386 +  )) {
   1.387 +    udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n",
   1.388 +                     pInfo->dataFormat[0], pInfo->dataFormat[1],
   1.389 +                     pInfo->dataFormat[2], pInfo->dataFormat[3]);
   1.390 +    *status = U_INVALID_FORMAT_ERROR;
   1.391 +    return 0;
   1.392 +  }
   1.393 +  if(pInfo->formatVersion[0] != 1) {
   1.394 +    udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n",
   1.395 +                     pInfo->formatVersion[0]);
   1.396 +    *status = U_UNSUPPORTED_ERROR;
   1.397 +    return 0;
   1.398 +  }
   1.399 +
   1.400 +  if(length >= 0) {
   1.401 +    length -= headerSize;
   1.402 +    if(length < 16*4) {
   1.403 +      udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n",
   1.404 +                       length);
   1.405 +      *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.406 +      return 0;
   1.407 +    }
   1.408 +  }
   1.409 +
   1.410 +  const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
   1.411 +  uint8_t *outBytes = (uint8_t *)outData + headerSize;
   1.412 +
   1.413 +  /* read the indexes */
   1.414 +  const int32_t *inIndexes = (const int32_t *)inBytes;
   1.415 +  int32_t indexes[16];
   1.416 +  int32_t i;
   1.417 +  for(i = 0; i < 16; ++i) {
   1.418 +    indexes[i] = udata_readInt32(ds, inIndexes[i]);
   1.419 +  }
   1.420 +
   1.421 +  /* get the total length of the data */
   1.422 +  int32_t size = indexes[UCNVSEL_INDEX_SIZE];
   1.423 +  if(length >= 0) {
   1.424 +    if(length < size) {
   1.425 +      udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n",
   1.426 +                       length);
   1.427 +      *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.428 +      return 0;
   1.429 +    }
   1.430 +
   1.431 +    /* copy the data for inaccessible bytes */
   1.432 +    if(inBytes != outBytes) {
   1.433 +      uprv_memcpy(outBytes, inBytes, size);
   1.434 +    }
   1.435 +
   1.436 +    int32_t offset = 0, count;
   1.437 +
   1.438 +    /* swap the int32_t indexes[] */
   1.439 +    count = UCNVSEL_INDEX_COUNT*4;
   1.440 +    ds->swapArray32(ds, inBytes, count, outBytes, status);
   1.441 +    offset += count;
   1.442 +
   1.443 +    /* swap the UTrie2 */
   1.444 +    count = indexes[UCNVSEL_INDEX_TRIE_SIZE];
   1.445 +    utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status);
   1.446 +    offset += count;
   1.447 +
   1.448 +    /* swap the uint32_t pv[] */
   1.449 +    count = indexes[UCNVSEL_INDEX_PV_COUNT]*4;
   1.450 +    ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status);
   1.451 +    offset += count;
   1.452 +
   1.453 +    /* swap the encoding names */
   1.454 +    count = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
   1.455 +    ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status);
   1.456 +    offset += count;
   1.457 +
   1.458 +    U_ASSERT(offset == size);
   1.459 +  }
   1.460 +
   1.461 +  return headerSize + size;
   1.462 +}
   1.463 +
   1.464 +/* unserialize a selector */
   1.465 +U_CAPI UConverterSelector* U_EXPORT2
   1.466 +ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
   1.467 +  // check if already failed
   1.468 +  if (U_FAILURE(*status)) {
   1.469 +    return NULL;
   1.470 +  }
   1.471 +  // ensure args make sense!
   1.472 +  const uint8_t *p = (const uint8_t *)buffer;
   1.473 +  if (length <= 0 ||
   1.474 +      (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
   1.475 +  ) {
   1.476 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.477 +    return NULL;
   1.478 +  }
   1.479 +  // header
   1.480 +  if (length < 32) {
   1.481 +    // not even enough space for a minimal header
   1.482 +    *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.483 +    return NULL;
   1.484 +  }
   1.485 +  const DataHeader *pHeader = (const DataHeader *)p;
   1.486 +  if (!(
   1.487 +    pHeader->dataHeader.magic1==0xda &&
   1.488 +    pHeader->dataHeader.magic2==0x27 &&
   1.489 +    pHeader->info.dataFormat[0] == 0x43 &&
   1.490 +    pHeader->info.dataFormat[1] == 0x53 &&
   1.491 +    pHeader->info.dataFormat[2] == 0x65 &&
   1.492 +    pHeader->info.dataFormat[3] == 0x6c
   1.493 +  )) {
   1.494 +    /* header not valid or dataFormat not recognized */
   1.495 +    *status = U_INVALID_FORMAT_ERROR;
   1.496 +    return NULL;
   1.497 +  }
   1.498 +  if (pHeader->info.formatVersion[0] != 1) {
   1.499 +    *status = U_UNSUPPORTED_ERROR;
   1.500 +    return NULL;
   1.501 +  }
   1.502 +  uint8_t* swapped = NULL;
   1.503 +  if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
   1.504 +      pHeader->info.charsetFamily != U_CHARSET_FAMILY
   1.505 +  ) {
   1.506 +    // swap the data
   1.507 +    UDataSwapper *ds =
   1.508 +      udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
   1.509 +    int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status);
   1.510 +    if (U_FAILURE(*status)) {
   1.511 +      udata_closeSwapper(ds);
   1.512 +      return NULL;
   1.513 +    }
   1.514 +    if (length < totalSize) {
   1.515 +      udata_closeSwapper(ds);
   1.516 +      *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.517 +      return NULL;
   1.518 +    }
   1.519 +    swapped = (uint8_t*)uprv_malloc(totalSize);
   1.520 +    if (swapped == NULL) {
   1.521 +      udata_closeSwapper(ds);
   1.522 +      *status = U_MEMORY_ALLOCATION_ERROR;
   1.523 +      return NULL;
   1.524 +    }
   1.525 +    ucnvsel_swap(ds, p, length, swapped, status);
   1.526 +    udata_closeSwapper(ds);
   1.527 +    if (U_FAILURE(*status)) {
   1.528 +      uprv_free(swapped);
   1.529 +      return NULL;
   1.530 +    }
   1.531 +    p = swapped;
   1.532 +    pHeader = (const DataHeader *)p;
   1.533 +  }
   1.534 +  if (length < (pHeader->dataHeader.headerSize + 16 * 4)) {
   1.535 +    // not even enough space for the header and the indexes
   1.536 +    uprv_free(swapped);
   1.537 +    *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.538 +    return NULL;
   1.539 +  }
   1.540 +  p += pHeader->dataHeader.headerSize;
   1.541 +  length -= pHeader->dataHeader.headerSize;
   1.542 +  // indexes
   1.543 +  const int32_t *indexes = (const int32_t *)p;
   1.544 +  if (length < indexes[UCNVSEL_INDEX_SIZE]) {
   1.545 +    uprv_free(swapped);
   1.546 +    *status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.547 +    return NULL;
   1.548 +  }
   1.549 +  p += UCNVSEL_INDEX_COUNT * 4;
   1.550 +  // create and populate the selector object
   1.551 +  UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
   1.552 +  char **encodings =
   1.553 +    (char **)uprv_malloc(
   1.554 +      indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
   1.555 +  if (sel == NULL || encodings == NULL) {
   1.556 +    uprv_free(swapped);
   1.557 +    uprv_free(sel);
   1.558 +    uprv_free(encodings);
   1.559 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.560 +    return NULL;
   1.561 +  }
   1.562 +  uprv_memset(sel, 0, sizeof(UConverterSelector));
   1.563 +  sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
   1.564 +  sel->encodings = encodings;
   1.565 +  sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
   1.566 +  sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
   1.567 +  sel->swapped = swapped;
   1.568 +  // trie
   1.569 +  sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
   1.570 +                                        p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL,
   1.571 +                                        status);
   1.572 +  p += indexes[UCNVSEL_INDEX_TRIE_SIZE];
   1.573 +  if (U_FAILURE(*status)) {
   1.574 +    ucnvsel_close(sel);
   1.575 +    return NULL;
   1.576 +  }
   1.577 +  // bit vectors
   1.578 +  sel->pv = (uint32_t *)p;
   1.579 +  p += sel->pvCount * 4;
   1.580 +  // encoding names
   1.581 +  char* s = (char*)p;
   1.582 +  for (int32_t i = 0; i < sel->encodingsCount; ++i) {
   1.583 +    sel->encodings[i] = s;
   1.584 +    s += uprv_strlen(s) + 1;
   1.585 +  }
   1.586 +  p += sel->encodingStrLength;
   1.587 +
   1.588 +  return sel;
   1.589 +}
   1.590 +
   1.591 +// a bunch of functions for the enumeration thingie! Nothing fancy here. Just
   1.592 +// iterate over the selected encodings
   1.593 +struct Enumerator {
   1.594 +  int16_t* index;
   1.595 +  int16_t length;
   1.596 +  int16_t cur;
   1.597 +  const UConverterSelector* sel;
   1.598 +};
   1.599 +
   1.600 +U_CDECL_BEGIN
   1.601 +
   1.602 +static void U_CALLCONV
   1.603 +ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
   1.604 +  uprv_free(((Enumerator*)(enumerator->context))->index);
   1.605 +  uprv_free(enumerator->context);
   1.606 +  uprv_free(enumerator);
   1.607 +}
   1.608 +
   1.609 +
   1.610 +static int32_t U_CALLCONV
   1.611 +ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
   1.612 +  // check if already failed
   1.613 +  if (U_FAILURE(*status)) {
   1.614 +    return 0;
   1.615 +  }
   1.616 +  return ((Enumerator*)(enumerator->context))->length;
   1.617 +}
   1.618 +
   1.619 +
   1.620 +static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
   1.621 +                                                 int32_t* resultLength,
   1.622 +                                                 UErrorCode* status) {
   1.623 +  // check if already failed
   1.624 +  if (U_FAILURE(*status)) {
   1.625 +    return NULL;
   1.626 +  }
   1.627 +
   1.628 +  int16_t cur = ((Enumerator*)(enumerator->context))->cur;
   1.629 +  const UConverterSelector* sel;
   1.630 +  const char* result;
   1.631 +  if (cur >= ((Enumerator*)(enumerator->context))->length) {
   1.632 +    return NULL;
   1.633 +  }
   1.634 +  sel = ((Enumerator*)(enumerator->context))->sel;
   1.635 +  result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ];
   1.636 +  ((Enumerator*)(enumerator->context))->cur++;
   1.637 +  if (resultLength) {
   1.638 +    *resultLength = (int32_t)uprv_strlen(result);
   1.639 +  }
   1.640 +  return result;
   1.641 +}
   1.642 +
   1.643 +static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
   1.644 +                                           UErrorCode* status) {
   1.645 +  // check if already failed
   1.646 +  if (U_FAILURE(*status)) {
   1.647 +    return ;
   1.648 +  }
   1.649 +  ((Enumerator*)(enumerator->context))->cur = 0;
   1.650 +}
   1.651 +
   1.652 +U_CDECL_END
   1.653 +
   1.654 +
   1.655 +static const UEnumeration defaultEncodings = {
   1.656 +  NULL,
   1.657 +    NULL,
   1.658 +    ucnvsel_close_selector_iterator,
   1.659 +    ucnvsel_count_encodings,
   1.660 +    uenum_unextDefault,
   1.661 +    ucnvsel_next_encoding, 
   1.662 +    ucnvsel_reset_iterator
   1.663 +};
   1.664 +
   1.665 +
   1.666 +// internal fn to intersect two sets of masks
   1.667 +// returns whether the mask has reduced to all zeros
   1.668 +static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
   1.669 +  int32_t i;
   1.670 +  uint32_t oredDest = 0;
   1.671 +  for (i = 0 ; i < len ; ++i) {
   1.672 +    oredDest |= (dest[i] &= source1[i]);
   1.673 +  }
   1.674 +  return oredDest == 0;
   1.675 +}
   1.676 +
   1.677 +// internal fn to count how many 1's are there in a mask
   1.678 +// algorithm taken from  http://graphics.stanford.edu/~seander/bithacks.html
   1.679 +static int16_t countOnes(uint32_t* mask, int32_t len) {
   1.680 +  int32_t i, totalOnes = 0;
   1.681 +  for (i = 0 ; i < len ; ++i) {
   1.682 +    uint32_t ent = mask[i];
   1.683 +    for (; ent; totalOnes++)
   1.684 +    {
   1.685 +      ent &= ent - 1; // clear the least significant bit set
   1.686 +    }
   1.687 +  }
   1.688 +  return totalOnes;
   1.689 +}
   1.690 +
   1.691 +
   1.692 +/* internal function! */
   1.693 +static UEnumeration *selectForMask(const UConverterSelector* sel,
   1.694 +                                   uint32_t *mask, UErrorCode *status) {
   1.695 +  // this is the context we will use. Store a table of indices to which
   1.696 +  // encodings are legit.
   1.697 +  struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator));
   1.698 +  if (result == NULL) {
   1.699 +    uprv_free(mask);
   1.700 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.701 +    return NULL;
   1.702 +  }
   1.703 +  result->index = NULL;  // this will be allocated later!
   1.704 +  result->length = result->cur = 0;
   1.705 +  result->sel = sel;
   1.706 +
   1.707 +  UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
   1.708 +  if (en == NULL) {
   1.709 +    // TODO(markus): Combine Enumerator and UEnumeration into one struct.
   1.710 +    uprv_free(mask);
   1.711 +    uprv_free(result);
   1.712 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.713 +    return NULL;
   1.714 +  }
   1.715 +  memcpy(en, &defaultEncodings, sizeof(UEnumeration));
   1.716 +  en->context = result;
   1.717 +
   1.718 +  int32_t columns = (sel->encodingsCount+31)/32;
   1.719 +  int16_t numOnes = countOnes(mask, columns);
   1.720 +  // now, we know the exact space we need for index
   1.721 +  if (numOnes > 0) {
   1.722 +    result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t));
   1.723 +
   1.724 +    int32_t i, j;
   1.725 +    int16_t k = 0;
   1.726 +    for (j = 0 ; j < columns; j++) {
   1.727 +      uint32_t v = mask[j];
   1.728 +      for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) {
   1.729 +        if ((v & 1) != 0) {
   1.730 +          result->index[result->length++] = k;
   1.731 +        }
   1.732 +        v >>= 1;
   1.733 +      }
   1.734 +    }
   1.735 +  } //otherwise, index will remain NULL (and will never be touched by
   1.736 +    //the enumerator code anyway)
   1.737 +  uprv_free(mask);
   1.738 +  return en;
   1.739 +}
   1.740 +
   1.741 +/* check a string against the selector - UTF16 version */
   1.742 +U_CAPI UEnumeration * U_EXPORT2
   1.743 +ucnvsel_selectForString(const UConverterSelector* sel,
   1.744 +                        const UChar *s, int32_t length, UErrorCode *status) {
   1.745 +  // check if already failed
   1.746 +  if (U_FAILURE(*status)) {
   1.747 +    return NULL;
   1.748 +  }
   1.749 +  // ensure args make sense!
   1.750 +  if (sel == NULL || (s == NULL && length != 0)) {
   1.751 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.752 +    return NULL;
   1.753 +  }
   1.754 +
   1.755 +  int32_t columns = (sel->encodingsCount+31)/32;
   1.756 +  uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
   1.757 +  if (mask == NULL) {
   1.758 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.759 +    return NULL;
   1.760 +  }
   1.761 +  uprv_memset(mask, ~0, columns *4);
   1.762 +
   1.763 +  if(s!=NULL) {
   1.764 +    const UChar *limit;
   1.765 +    if (length >= 0) {
   1.766 +      limit = s + length;
   1.767 +    } else {
   1.768 +      limit = NULL;
   1.769 +    }
   1.770 +    
   1.771 +    while (limit == NULL ? *s != 0 : s != limit) {
   1.772 +      UChar32 c;
   1.773 +      uint16_t pvIndex;
   1.774 +      UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex);
   1.775 +      if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
   1.776 +        break;
   1.777 +      }
   1.778 +    }
   1.779 +  }
   1.780 +  return selectForMask(sel, mask, status);
   1.781 +}
   1.782 +
   1.783 +/* check a string against the selector - UTF8 version */
   1.784 +U_CAPI UEnumeration * U_EXPORT2
   1.785 +ucnvsel_selectForUTF8(const UConverterSelector* sel,
   1.786 +                      const char *s, int32_t length, UErrorCode *status) {
   1.787 +  // check if already failed
   1.788 +  if (U_FAILURE(*status)) {
   1.789 +    return NULL;
   1.790 +  }
   1.791 +  // ensure args make sense!
   1.792 +  if (sel == NULL || (s == NULL && length != 0)) {
   1.793 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.794 +    return NULL;
   1.795 +  }
   1.796 +
   1.797 +  int32_t columns = (sel->encodingsCount+31)/32;
   1.798 +  uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
   1.799 +  if (mask == NULL) {
   1.800 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.801 +    return NULL;
   1.802 +  }
   1.803 +  uprv_memset(mask, ~0, columns *4);
   1.804 +
   1.805 +  if (length < 0) {
   1.806 +    length = (int32_t)uprv_strlen(s);
   1.807 +  }
   1.808 +
   1.809 +  if(s!=NULL) {
   1.810 +    const char *limit = s + length;
   1.811 +    
   1.812 +    while (s != limit) {
   1.813 +      uint16_t pvIndex;
   1.814 +      UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex);
   1.815 +      if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
   1.816 +        break;
   1.817 +      }
   1.818 +    }
   1.819 +  }
   1.820 +  return selectForMask(sel, mask, status);
   1.821 +}
   1.822 +
   1.823 +#endif  // !UCONFIG_NO_CONVERSION

mercurial