intl/icu/source/common/ucnvsel.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2008-2011, International Business Machines
michael@0 5 * Corporation, Google and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 */
michael@0 9 // Author : eldawy@google.com (Mohamed Eldawy)
michael@0 10 // ucnvsel.cpp
michael@0 11 //
michael@0 12 // Purpose: To generate a list of encodings capable of handling
michael@0 13 // a given Unicode text
michael@0 14 //
michael@0 15 // Started 09-April-2008
michael@0 16
michael@0 17 /**
michael@0 18 * \file
michael@0 19 *
michael@0 20 * This is an implementation of an encoding selector.
michael@0 21 * The goal is, given a unicode string, find the encodings
michael@0 22 * this string can be mapped to. To make processing faster
michael@0 23 * a trie is built when you call ucnvsel_open() that
michael@0 24 * stores all encodings a codepoint can map to
michael@0 25 */
michael@0 26
michael@0 27 #include "unicode/ucnvsel.h"
michael@0 28
michael@0 29 #if !UCONFIG_NO_CONVERSION
michael@0 30
michael@0 31 #include <string.h>
michael@0 32
michael@0 33 #include "unicode/uchar.h"
michael@0 34 #include "unicode/uniset.h"
michael@0 35 #include "unicode/ucnv.h"
michael@0 36 #include "unicode/ustring.h"
michael@0 37 #include "unicode/uchriter.h"
michael@0 38 #include "utrie2.h"
michael@0 39 #include "propsvec.h"
michael@0 40 #include "uassert.h"
michael@0 41 #include "ucmndata.h"
michael@0 42 #include "uenumimp.h"
michael@0 43 #include "cmemory.h"
michael@0 44 #include "cstring.h"
michael@0 45
michael@0 46 U_NAMESPACE_USE
michael@0 47
michael@0 48 struct UConverterSelector {
michael@0 49 UTrie2 *trie; // 16 bit trie containing offsets into pv
michael@0 50 uint32_t* pv; // table of bits!
michael@0 51 int32_t pvCount;
michael@0 52 char** encodings; // which encodings did user ask to use?
michael@0 53 int32_t encodingsCount;
michael@0 54 int32_t encodingStrLength;
michael@0 55 uint8_t* swapped;
michael@0 56 UBool ownPv, ownEncodingStrings;
michael@0 57 };
michael@0 58
michael@0 59 static void generateSelectorData(UConverterSelector* result,
michael@0 60 UPropsVectors *upvec,
michael@0 61 const USet* excludedCodePoints,
michael@0 62 const UConverterUnicodeSet whichSet,
michael@0 63 UErrorCode* status) {
michael@0 64 if (U_FAILURE(*status)) {
michael@0 65 return;
michael@0 66 }
michael@0 67
michael@0 68 int32_t columns = (result->encodingsCount+31)/32;
michael@0 69
michael@0 70 // set errorValue to all-ones
michael@0 71 for (int32_t col = 0; col < columns; col++) {
michael@0 72 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
michael@0 73 col, ~0, ~0, status);
michael@0 74 }
michael@0 75
michael@0 76 for (int32_t i = 0; i < result->encodingsCount; ++i) {
michael@0 77 uint32_t mask;
michael@0 78 uint32_t column;
michael@0 79 int32_t item_count;
michael@0 80 int32_t j;
michael@0 81 UConverter* test_converter = ucnv_open(result->encodings[i], status);
michael@0 82 if (U_FAILURE(*status)) {
michael@0 83 return;
michael@0 84 }
michael@0 85 USet* unicode_point_set;
michael@0 86 unicode_point_set = uset_open(1, 0); // empty set
michael@0 87
michael@0 88 ucnv_getUnicodeSet(test_converter, unicode_point_set,
michael@0 89 whichSet, status);
michael@0 90 if (U_FAILURE(*status)) {
michael@0 91 ucnv_close(test_converter);
michael@0 92 return;
michael@0 93 }
michael@0 94
michael@0 95 column = i / 32;
michael@0 96 mask = 1 << (i%32);
michael@0 97 // now iterate over intervals on set i!
michael@0 98 item_count = uset_getItemCount(unicode_point_set);
michael@0 99
michael@0 100 for (j = 0; j < item_count; ++j) {
michael@0 101 UChar32 start_char;
michael@0 102 UChar32 end_char;
michael@0 103 UErrorCode smallStatus = U_ZERO_ERROR;
michael@0 104 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
michael@0 105 &smallStatus);
michael@0 106 if (U_FAILURE(smallStatus)) {
michael@0 107 // this will be reached for the converters that fill the set with
michael@0 108 // strings. Those should be ignored by our system
michael@0 109 } else {
michael@0 110 upvec_setValue(upvec, start_char, end_char, column, ~0, mask,
michael@0 111 status);
michael@0 112 }
michael@0 113 }
michael@0 114 ucnv_close(test_converter);
michael@0 115 uset_close(unicode_point_set);
michael@0 116 if (U_FAILURE(*status)) {
michael@0 117 return;
michael@0 118 }
michael@0 119 }
michael@0 120
michael@0 121 // handle excluded encodings! Simply set their values to all 1's in the upvec
michael@0 122 if (excludedCodePoints) {
michael@0 123 int32_t item_count = uset_getItemCount(excludedCodePoints);
michael@0 124 for (int32_t j = 0; j < item_count; ++j) {
michael@0 125 UChar32 start_char;
michael@0 126 UChar32 end_char;
michael@0 127
michael@0 128 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
michael@0 129 status);
michael@0 130 for (int32_t col = 0; col < columns; col++) {
michael@0 131 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0,
michael@0 132 status);
michael@0 133 }
michael@0 134 }
michael@0 135 }
michael@0 136
michael@0 137 // alright. Now, let's put things in the same exact form you'd get when you
michael@0 138 // unserialize things.
michael@0 139 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
michael@0 140 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
michael@0 141 result->pvCount *= columns; // number of uint32_t = rows * columns
michael@0 142 result->ownPv = TRUE;
michael@0 143 }
michael@0 144
michael@0 145 /* open a selector. If converterListSize is 0, build for all converters.
michael@0 146 If excludedCodePoints is NULL, don't exclude any codepoints */
michael@0 147 U_CAPI UConverterSelector* U_EXPORT2
michael@0 148 ucnvsel_open(const char* const* converterList, int32_t converterListSize,
michael@0 149 const USet* excludedCodePoints,
michael@0 150 const UConverterUnicodeSet whichSet, UErrorCode* status) {
michael@0 151 // check if already failed
michael@0 152 if (U_FAILURE(*status)) {
michael@0 153 return NULL;
michael@0 154 }
michael@0 155 // ensure args make sense!
michael@0 156 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
michael@0 157 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 158 return NULL;
michael@0 159 }
michael@0 160
michael@0 161 // allocate a new converter
michael@0 162 LocalUConverterSelectorPointer newSelector(
michael@0 163 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
michael@0 164 if (newSelector.isNull()) {
michael@0 165 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 166 return NULL;
michael@0 167 }
michael@0 168 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));
michael@0 169
michael@0 170 if (converterListSize == 0) {
michael@0 171 converterList = NULL;
michael@0 172 converterListSize = ucnv_countAvailable();
michael@0 173 }
michael@0 174 newSelector->encodings =
michael@0 175 (char**)uprv_malloc(converterListSize * sizeof(char*));
michael@0 176 if (!newSelector->encodings) {
michael@0 177 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 178 return NULL;
michael@0 179 }
michael@0 180 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close()
michael@0 181
michael@0 182 // make a backup copy of the list of converters
michael@0 183 int32_t totalSize = 0;
michael@0 184 int32_t i;
michael@0 185 for (i = 0; i < converterListSize; i++) {
michael@0 186 totalSize +=
michael@0 187 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1;
michael@0 188 }
michael@0 189 // 4-align the totalSize to 4-align the size of the serialized form
michael@0 190 int32_t encodingStrPadding = totalSize & 3;
michael@0 191 if (encodingStrPadding != 0) {
michael@0 192 encodingStrPadding = 4 - encodingStrPadding;
michael@0 193 }
michael@0 194 newSelector->encodingStrLength = totalSize += encodingStrPadding;
michael@0 195 char* allStrings = (char*) uprv_malloc(totalSize);
michael@0 196 if (!allStrings) {
michael@0 197 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 198 return NULL;
michael@0 199 }
michael@0 200
michael@0 201 for (i = 0; i < converterListSize; i++) {
michael@0 202 newSelector->encodings[i] = allStrings;
michael@0 203 uprv_strcpy(newSelector->encodings[i],
michael@0 204 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i));
michael@0 205 allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
michael@0 206 }
michael@0 207 while (encodingStrPadding > 0) {
michael@0 208 *allStrings++ = 0;
michael@0 209 --encodingStrPadding;
michael@0 210 }
michael@0 211
michael@0 212 newSelector->ownEncodingStrings = TRUE;
michael@0 213 newSelector->encodingsCount = converterListSize;
michael@0 214 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
michael@0 215 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
michael@0 216 upvec_close(upvec);
michael@0 217
michael@0 218 if (U_FAILURE(*status)) {
michael@0 219 return NULL;
michael@0 220 }
michael@0 221
michael@0 222 return newSelector.orphan();
michael@0 223 }
michael@0 224
michael@0 225 /* close opened selector */
michael@0 226 U_CAPI void U_EXPORT2
michael@0 227 ucnvsel_close(UConverterSelector *sel) {
michael@0 228 if (!sel) {
michael@0 229 return;
michael@0 230 }
michael@0 231 if (sel->ownEncodingStrings) {
michael@0 232 uprv_free(sel->encodings[0]);
michael@0 233 }
michael@0 234 uprv_free(sel->encodings);
michael@0 235 if (sel->ownPv) {
michael@0 236 uprv_free(sel->pv);
michael@0 237 }
michael@0 238 utrie2_close(sel->trie);
michael@0 239 uprv_free(sel->swapped);
michael@0 240 uprv_free(sel);
michael@0 241 }
michael@0 242
michael@0 243 static const UDataInfo dataInfo = {
michael@0 244 sizeof(UDataInfo),
michael@0 245 0,
michael@0 246
michael@0 247 U_IS_BIG_ENDIAN,
michael@0 248 U_CHARSET_FAMILY,
michael@0 249 U_SIZEOF_UCHAR,
michael@0 250 0,
michael@0 251
michael@0 252 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */
michael@0 253 { 1, 0, 0, 0 }, /* formatVersion */
michael@0 254 { 0, 0, 0, 0 } /* dataVersion */
michael@0 255 };
michael@0 256
// Positions in the int32_t indexes[] array of the serialized form.
enum {
    // trie size in bytes
    UCNVSEL_INDEX_TRIE_SIZE = 0,
    // number of uint32_t in the bit vectors
    UCNVSEL_INDEX_PV_COUNT = 1,
    // number of encoding names
    UCNVSEL_INDEX_NAMES_COUNT = 2,
    // number of encoding name bytes including padding
    UCNVSEL_INDEX_NAMES_LENGTH = 3,
    // bytes following the DataHeader
    UCNVSEL_INDEX_SIZE = 15,
    // length of the indexes[] array
    UCNVSEL_INDEX_COUNT = 16
};
michael@0 265
michael@0 266 /*
michael@0 267 * Serialized form of a UConverterSelector, formatVersion 1:
michael@0 268 *
michael@0 269 * The serialized form begins with a standard ICU DataHeader with a UDataInfo
michael@0 270 * as the template above.
michael@0 271 * This is followed by:
michael@0 272 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above
michael@0 273 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
michael@0 274 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors
michael@0 275 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding
michael@0 276 */
michael@0 277
michael@0 278 /* serialize a selector */
michael@0 279 U_CAPI int32_t U_EXPORT2
michael@0 280 ucnvsel_serialize(const UConverterSelector* sel,
michael@0 281 void* buffer, int32_t bufferCapacity, UErrorCode* status) {
michael@0 282 // check if already failed
michael@0 283 if (U_FAILURE(*status)) {
michael@0 284 return 0;
michael@0 285 }
michael@0 286 // ensure args make sense!
michael@0 287 uint8_t *p = (uint8_t *)buffer;
michael@0 288 if (bufferCapacity < 0 ||
michael@0 289 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
michael@0 290 ) {
michael@0 291 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 292 return 0;
michael@0 293 }
michael@0 294 // add up the size of the serialized form
michael@0 295 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status);
michael@0 296 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
michael@0 297 return 0;
michael@0 298 }
michael@0 299 *status = U_ZERO_ERROR;
michael@0 300
michael@0 301 DataHeader header;
michael@0 302 uprv_memset(&header, 0, sizeof(header));
michael@0 303 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
michael@0 304 header.dataHeader.magic1 = 0xda;
michael@0 305 header.dataHeader.magic2 = 0x27;
michael@0 306 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));
michael@0 307
michael@0 308 int32_t indexes[UCNVSEL_INDEX_COUNT] = {
michael@0 309 serializedTrieSize,
michael@0 310 sel->pvCount,
michael@0 311 sel->encodingsCount,
michael@0 312 sel->encodingStrLength
michael@0 313 };
michael@0 314
michael@0 315 int32_t totalSize =
michael@0 316 header.dataHeader.headerSize +
michael@0 317 (int32_t)sizeof(indexes) +
michael@0 318 serializedTrieSize +
michael@0 319 sel->pvCount * 4 +
michael@0 320 sel->encodingStrLength;
michael@0 321 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
michael@0 322 if (totalSize > bufferCapacity) {
michael@0 323 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 324 return totalSize;
michael@0 325 }
michael@0 326 // ok, save!
michael@0 327 int32_t length = header.dataHeader.headerSize;
michael@0 328 uprv_memcpy(p, &header, sizeof(header));
michael@0 329 uprv_memset(p + sizeof(header), 0, length - sizeof(header));
michael@0 330 p += length;
michael@0 331
michael@0 332 length = (int32_t)sizeof(indexes);
michael@0 333 uprv_memcpy(p, indexes, length);
michael@0 334 p += length;
michael@0 335
michael@0 336 utrie2_serialize(sel->trie, p, serializedTrieSize, status);
michael@0 337 p += serializedTrieSize;
michael@0 338
michael@0 339 length = sel->pvCount * 4;
michael@0 340 uprv_memcpy(p, sel->pv, length);
michael@0 341 p += length;
michael@0 342
michael@0 343 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
michael@0 344 p += sel->encodingStrLength;
michael@0 345
michael@0 346 return totalSize;
michael@0 347 }
michael@0 348
michael@0 349 /**
michael@0 350 * swap a selector into the desired Endianness and Asciiness of
michael@0 351 * the system. Just as FYI, selectors are always saved in the format
michael@0 352 * of the system that created them. They are only converted if used
michael@0 353 * on another system. In other words, selectors created on different
michael@0 354 * system can be different even if the params are identical (endianness
michael@0 355 * and Asciiness differences only)
michael@0 356 *
michael@0 357 * @param ds pointer to data swapper containing swapping info
michael@0 358 * @param inData pointer to incoming data
michael@0 359 * @param length length of inData in bytes
michael@0 360 * @param outData pointer to output data. Capacity should
michael@0 361 * be at least equal to capacity of inData
michael@0 362 * @param status an in/out ICU UErrorCode
michael@0 363 * @return 0 on failure, number of bytes swapped on success
michael@0 364 * number of bytes swapped can be smaller than length
michael@0 365 */
michael@0 366 static int32_t
michael@0 367 ucnvsel_swap(const UDataSwapper *ds,
michael@0 368 const void *inData, int32_t length,
michael@0 369 void *outData, UErrorCode *status) {
michael@0 370 /* udata_swapDataHeader checks the arguments */
michael@0 371 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status);
michael@0 372 if(U_FAILURE(*status)) {
michael@0 373 return 0;
michael@0 374 }
michael@0 375
michael@0 376 /* check data format and format version */
michael@0 377 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
michael@0 378 if(!(
michael@0 379 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */
michael@0 380 pInfo->dataFormat[1] == 0x53 &&
michael@0 381 pInfo->dataFormat[2] == 0x65 &&
michael@0 382 pInfo->dataFormat[3] == 0x6c
michael@0 383 )) {
michael@0 384 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n",
michael@0 385 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 386 pInfo->dataFormat[2], pInfo->dataFormat[3]);
michael@0 387 *status = U_INVALID_FORMAT_ERROR;
michael@0 388 return 0;
michael@0 389 }
michael@0 390 if(pInfo->formatVersion[0] != 1) {
michael@0 391 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n",
michael@0 392 pInfo->formatVersion[0]);
michael@0 393 *status = U_UNSUPPORTED_ERROR;
michael@0 394 return 0;
michael@0 395 }
michael@0 396
michael@0 397 if(length >= 0) {
michael@0 398 length -= headerSize;
michael@0 399 if(length < 16*4) {
michael@0 400 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n",
michael@0 401 length);
michael@0 402 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 403 return 0;
michael@0 404 }
michael@0 405 }
michael@0 406
michael@0 407 const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
michael@0 408 uint8_t *outBytes = (uint8_t *)outData + headerSize;
michael@0 409
michael@0 410 /* read the indexes */
michael@0 411 const int32_t *inIndexes = (const int32_t *)inBytes;
michael@0 412 int32_t indexes[16];
michael@0 413 int32_t i;
michael@0 414 for(i = 0; i < 16; ++i) {
michael@0 415 indexes[i] = udata_readInt32(ds, inIndexes[i]);
michael@0 416 }
michael@0 417
michael@0 418 /* get the total length of the data */
michael@0 419 int32_t size = indexes[UCNVSEL_INDEX_SIZE];
michael@0 420 if(length >= 0) {
michael@0 421 if(length < size) {
michael@0 422 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n",
michael@0 423 length);
michael@0 424 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 425 return 0;
michael@0 426 }
michael@0 427
michael@0 428 /* copy the data for inaccessible bytes */
michael@0 429 if(inBytes != outBytes) {
michael@0 430 uprv_memcpy(outBytes, inBytes, size);
michael@0 431 }
michael@0 432
michael@0 433 int32_t offset = 0, count;
michael@0 434
michael@0 435 /* swap the int32_t indexes[] */
michael@0 436 count = UCNVSEL_INDEX_COUNT*4;
michael@0 437 ds->swapArray32(ds, inBytes, count, outBytes, status);
michael@0 438 offset += count;
michael@0 439
michael@0 440 /* swap the UTrie2 */
michael@0 441 count = indexes[UCNVSEL_INDEX_TRIE_SIZE];
michael@0 442 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status);
michael@0 443 offset += count;
michael@0 444
michael@0 445 /* swap the uint32_t pv[] */
michael@0 446 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4;
michael@0 447 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status);
michael@0 448 offset += count;
michael@0 449
michael@0 450 /* swap the encoding names */
michael@0 451 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
michael@0 452 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status);
michael@0 453 offset += count;
michael@0 454
michael@0 455 U_ASSERT(offset == size);
michael@0 456 }
michael@0 457
michael@0 458 return headerSize + size;
michael@0 459 }
michael@0 460
michael@0 461 /* unserialize a selector */
michael@0 462 U_CAPI UConverterSelector* U_EXPORT2
michael@0 463 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
michael@0 464 // check if already failed
michael@0 465 if (U_FAILURE(*status)) {
michael@0 466 return NULL;
michael@0 467 }
michael@0 468 // ensure args make sense!
michael@0 469 const uint8_t *p = (const uint8_t *)buffer;
michael@0 470 if (length <= 0 ||
michael@0 471 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
michael@0 472 ) {
michael@0 473 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 474 return NULL;
michael@0 475 }
michael@0 476 // header
michael@0 477 if (length < 32) {
michael@0 478 // not even enough space for a minimal header
michael@0 479 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 480 return NULL;
michael@0 481 }
michael@0 482 const DataHeader *pHeader = (const DataHeader *)p;
michael@0 483 if (!(
michael@0 484 pHeader->dataHeader.magic1==0xda &&
michael@0 485 pHeader->dataHeader.magic2==0x27 &&
michael@0 486 pHeader->info.dataFormat[0] == 0x43 &&
michael@0 487 pHeader->info.dataFormat[1] == 0x53 &&
michael@0 488 pHeader->info.dataFormat[2] == 0x65 &&
michael@0 489 pHeader->info.dataFormat[3] == 0x6c
michael@0 490 )) {
michael@0 491 /* header not valid or dataFormat not recognized */
michael@0 492 *status = U_INVALID_FORMAT_ERROR;
michael@0 493 return NULL;
michael@0 494 }
michael@0 495 if (pHeader->info.formatVersion[0] != 1) {
michael@0 496 *status = U_UNSUPPORTED_ERROR;
michael@0 497 return NULL;
michael@0 498 }
michael@0 499 uint8_t* swapped = NULL;
michael@0 500 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
michael@0 501 pHeader->info.charsetFamily != U_CHARSET_FAMILY
michael@0 502 ) {
michael@0 503 // swap the data
michael@0 504 UDataSwapper *ds =
michael@0 505 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
michael@0 506 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status);
michael@0 507 if (U_FAILURE(*status)) {
michael@0 508 udata_closeSwapper(ds);
michael@0 509 return NULL;
michael@0 510 }
michael@0 511 if (length < totalSize) {
michael@0 512 udata_closeSwapper(ds);
michael@0 513 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 514 return NULL;
michael@0 515 }
michael@0 516 swapped = (uint8_t*)uprv_malloc(totalSize);
michael@0 517 if (swapped == NULL) {
michael@0 518 udata_closeSwapper(ds);
michael@0 519 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 520 return NULL;
michael@0 521 }
michael@0 522 ucnvsel_swap(ds, p, length, swapped, status);
michael@0 523 udata_closeSwapper(ds);
michael@0 524 if (U_FAILURE(*status)) {
michael@0 525 uprv_free(swapped);
michael@0 526 return NULL;
michael@0 527 }
michael@0 528 p = swapped;
michael@0 529 pHeader = (const DataHeader *)p;
michael@0 530 }
michael@0 531 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) {
michael@0 532 // not even enough space for the header and the indexes
michael@0 533 uprv_free(swapped);
michael@0 534 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 535 return NULL;
michael@0 536 }
michael@0 537 p += pHeader->dataHeader.headerSize;
michael@0 538 length -= pHeader->dataHeader.headerSize;
michael@0 539 // indexes
michael@0 540 const int32_t *indexes = (const int32_t *)p;
michael@0 541 if (length < indexes[UCNVSEL_INDEX_SIZE]) {
michael@0 542 uprv_free(swapped);
michael@0 543 *status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 544 return NULL;
michael@0 545 }
michael@0 546 p += UCNVSEL_INDEX_COUNT * 4;
michael@0 547 // create and populate the selector object
michael@0 548 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
michael@0 549 char **encodings =
michael@0 550 (char **)uprv_malloc(
michael@0 551 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
michael@0 552 if (sel == NULL || encodings == NULL) {
michael@0 553 uprv_free(swapped);
michael@0 554 uprv_free(sel);
michael@0 555 uprv_free(encodings);
michael@0 556 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 557 return NULL;
michael@0 558 }
michael@0 559 uprv_memset(sel, 0, sizeof(UConverterSelector));
michael@0 560 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
michael@0 561 sel->encodings = encodings;
michael@0 562 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
michael@0 563 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
michael@0 564 sel->swapped = swapped;
michael@0 565 // trie
michael@0 566 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
michael@0 567 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL,
michael@0 568 status);
michael@0 569 p += indexes[UCNVSEL_INDEX_TRIE_SIZE];
michael@0 570 if (U_FAILURE(*status)) {
michael@0 571 ucnvsel_close(sel);
michael@0 572 return NULL;
michael@0 573 }
michael@0 574 // bit vectors
michael@0 575 sel->pv = (uint32_t *)p;
michael@0 576 p += sel->pvCount * 4;
michael@0 577 // encoding names
michael@0 578 char* s = (char*)p;
michael@0 579 for (int32_t i = 0; i < sel->encodingsCount; ++i) {
michael@0 580 sel->encodings[i] = s;
michael@0 581 s += uprv_strlen(s) + 1;
michael@0 582 }
michael@0 583 p += sel->encodingStrLength;
michael@0 584
michael@0 585 return sel;
michael@0 586 }
michael@0 587
michael@0 588 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just
michael@0 589 // iterate over the selected encodings
michael@0 590 struct Enumerator {
michael@0 591 int16_t* index;
michael@0 592 int16_t length;
michael@0 593 int16_t cur;
michael@0 594 const UConverterSelector* sel;
michael@0 595 };
michael@0 596
michael@0 597 U_CDECL_BEGIN
michael@0 598
michael@0 599 static void U_CALLCONV
michael@0 600 ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
michael@0 601 uprv_free(((Enumerator*)(enumerator->context))->index);
michael@0 602 uprv_free(enumerator->context);
michael@0 603 uprv_free(enumerator);
michael@0 604 }
michael@0 605
michael@0 606
michael@0 607 static int32_t U_CALLCONV
michael@0 608 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
michael@0 609 // check if already failed
michael@0 610 if (U_FAILURE(*status)) {
michael@0 611 return 0;
michael@0 612 }
michael@0 613 return ((Enumerator*)(enumerator->context))->length;
michael@0 614 }
michael@0 615
michael@0 616
michael@0 617 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
michael@0 618 int32_t* resultLength,
michael@0 619 UErrorCode* status) {
michael@0 620 // check if already failed
michael@0 621 if (U_FAILURE(*status)) {
michael@0 622 return NULL;
michael@0 623 }
michael@0 624
michael@0 625 int16_t cur = ((Enumerator*)(enumerator->context))->cur;
michael@0 626 const UConverterSelector* sel;
michael@0 627 const char* result;
michael@0 628 if (cur >= ((Enumerator*)(enumerator->context))->length) {
michael@0 629 return NULL;
michael@0 630 }
michael@0 631 sel = ((Enumerator*)(enumerator->context))->sel;
michael@0 632 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ];
michael@0 633 ((Enumerator*)(enumerator->context))->cur++;
michael@0 634 if (resultLength) {
michael@0 635 *resultLength = (int32_t)uprv_strlen(result);
michael@0 636 }
michael@0 637 return result;
michael@0 638 }
michael@0 639
michael@0 640 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
michael@0 641 UErrorCode* status) {
michael@0 642 // check if already failed
michael@0 643 if (U_FAILURE(*status)) {
michael@0 644 return ;
michael@0 645 }
michael@0 646 ((Enumerator*)(enumerator->context))->cur = 0;
michael@0 647 }
michael@0 648
michael@0 649 U_CDECL_END
michael@0 650
michael@0 651
michael@0 652 static const UEnumeration defaultEncodings = {
michael@0 653 NULL,
michael@0 654 NULL,
michael@0 655 ucnvsel_close_selector_iterator,
michael@0 656 ucnvsel_count_encodings,
michael@0 657 uenum_unextDefault,
michael@0 658 ucnvsel_next_encoding,
michael@0 659 ucnvsel_reset_iterator
michael@0 660 };
michael@0 661
michael@0 662
michael@0 663 // internal fn to intersect two sets of masks
michael@0 664 // returns whether the mask has reduced to all zeros
michael@0 665 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
michael@0 666 int32_t i;
michael@0 667 uint32_t oredDest = 0;
michael@0 668 for (i = 0 ; i < len ; ++i) {
michael@0 669 oredDest |= (dest[i] &= source1[i]);
michael@0 670 }
michael@0 671 return oredDest == 0;
michael@0 672 }
michael@0 673
// internal fn to count how many 1 bits there are in a mask of len words
// algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
static int16_t countOnes(uint32_t* mask, int32_t len) {
    int32_t bitsSet = 0;
    for (int32_t word = 0; word < len; ++word) {
        uint32_t remaining = mask[word];
        while (remaining != 0) {
            remaining &= remaining - 1; // clear the least significant bit set
            ++bitsSet;
        }
    }
    return (int16_t)bitsSet;
}
michael@0 687
michael@0 688
michael@0 689 /* internal function! */
michael@0 690 static UEnumeration *selectForMask(const UConverterSelector* sel,
michael@0 691 uint32_t *mask, UErrorCode *status) {
michael@0 692 // this is the context we will use. Store a table of indices to which
michael@0 693 // encodings are legit.
michael@0 694 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator));
michael@0 695 if (result == NULL) {
michael@0 696 uprv_free(mask);
michael@0 697 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 698 return NULL;
michael@0 699 }
michael@0 700 result->index = NULL; // this will be allocated later!
michael@0 701 result->length = result->cur = 0;
michael@0 702 result->sel = sel;
michael@0 703
michael@0 704 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
michael@0 705 if (en == NULL) {
michael@0 706 // TODO(markus): Combine Enumerator and UEnumeration into one struct.
michael@0 707 uprv_free(mask);
michael@0 708 uprv_free(result);
michael@0 709 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 710 return NULL;
michael@0 711 }
michael@0 712 memcpy(en, &defaultEncodings, sizeof(UEnumeration));
michael@0 713 en->context = result;
michael@0 714
michael@0 715 int32_t columns = (sel->encodingsCount+31)/32;
michael@0 716 int16_t numOnes = countOnes(mask, columns);
michael@0 717 // now, we know the exact space we need for index
michael@0 718 if (numOnes > 0) {
michael@0 719 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t));
michael@0 720
michael@0 721 int32_t i, j;
michael@0 722 int16_t k = 0;
michael@0 723 for (j = 0 ; j < columns; j++) {
michael@0 724 uint32_t v = mask[j];
michael@0 725 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) {
michael@0 726 if ((v & 1) != 0) {
michael@0 727 result->index[result->length++] = k;
michael@0 728 }
michael@0 729 v >>= 1;
michael@0 730 }
michael@0 731 }
michael@0 732 } //otherwise, index will remain NULL (and will never be touched by
michael@0 733 //the enumerator code anyway)
michael@0 734 uprv_free(mask);
michael@0 735 return en;
michael@0 736 }
michael@0 737
michael@0 738 /* check a string against the selector - UTF16 version */
michael@0 739 U_CAPI UEnumeration * U_EXPORT2
michael@0 740 ucnvsel_selectForString(const UConverterSelector* sel,
michael@0 741 const UChar *s, int32_t length, UErrorCode *status) {
michael@0 742 // check if already failed
michael@0 743 if (U_FAILURE(*status)) {
michael@0 744 return NULL;
michael@0 745 }
michael@0 746 // ensure args make sense!
michael@0 747 if (sel == NULL || (s == NULL && length != 0)) {
michael@0 748 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 749 return NULL;
michael@0 750 }
michael@0 751
michael@0 752 int32_t columns = (sel->encodingsCount+31)/32;
michael@0 753 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
michael@0 754 if (mask == NULL) {
michael@0 755 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 756 return NULL;
michael@0 757 }
michael@0 758 uprv_memset(mask, ~0, columns *4);
michael@0 759
michael@0 760 if(s!=NULL) {
michael@0 761 const UChar *limit;
michael@0 762 if (length >= 0) {
michael@0 763 limit = s + length;
michael@0 764 } else {
michael@0 765 limit = NULL;
michael@0 766 }
michael@0 767
michael@0 768 while (limit == NULL ? *s != 0 : s != limit) {
michael@0 769 UChar32 c;
michael@0 770 uint16_t pvIndex;
michael@0 771 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex);
michael@0 772 if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
michael@0 773 break;
michael@0 774 }
michael@0 775 }
michael@0 776 }
michael@0 777 return selectForMask(sel, mask, status);
michael@0 778 }
michael@0 779
michael@0 780 /* check a string against the selector - UTF8 version */
michael@0 781 U_CAPI UEnumeration * U_EXPORT2
michael@0 782 ucnvsel_selectForUTF8(const UConverterSelector* sel,
michael@0 783 const char *s, int32_t length, UErrorCode *status) {
michael@0 784 // check if already failed
michael@0 785 if (U_FAILURE(*status)) {
michael@0 786 return NULL;
michael@0 787 }
michael@0 788 // ensure args make sense!
michael@0 789 if (sel == NULL || (s == NULL && length != 0)) {
michael@0 790 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 791 return NULL;
michael@0 792 }
michael@0 793
michael@0 794 int32_t columns = (sel->encodingsCount+31)/32;
michael@0 795 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
michael@0 796 if (mask == NULL) {
michael@0 797 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 798 return NULL;
michael@0 799 }
michael@0 800 uprv_memset(mask, ~0, columns *4);
michael@0 801
michael@0 802 if (length < 0) {
michael@0 803 length = (int32_t)uprv_strlen(s);
michael@0 804 }
michael@0 805
michael@0 806 if(s!=NULL) {
michael@0 807 const char *limit = s + length;
michael@0 808
michael@0 809 while (s != limit) {
michael@0 810 uint16_t pvIndex;
michael@0 811 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex);
michael@0 812 if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
michael@0 813 break;
michael@0 814 }
michael@0 815 }
michael@0 816 }
michael@0 817 return selectForMask(sel, mask, status);
michael@0 818 }
michael@0 819
michael@0 820 #endif // !UCONFIG_NO_CONVERSION

mercurial