1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnvsel.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,820 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2008-2011, International Business Machines 1.8 +* Corporation, Google and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +*/ 1.12 +// Author : eldawy@google.com (Mohamed Eldawy) 1.13 +// ucnvsel.cpp 1.14 +// 1.15 +// Purpose: To generate a list of encodings capable of handling 1.16 +// a given Unicode text 1.17 +// 1.18 +// Started 09-April-2008 1.19 + 1.20 +/** 1.21 + * \file 1.22 + * 1.23 + * This is an implementation of an encoding selector. 1.24 + * The goal is, given a unicode string, find the encodings 1.25 + * this string can be mapped to. To make processing faster 1.26 + * a trie is built when you call ucnvsel_open() that 1.27 + * stores all encodings a codepoint can map to 1.28 + */ 1.29 + 1.30 +#include "unicode/ucnvsel.h" 1.31 + 1.32 +#if !UCONFIG_NO_CONVERSION 1.33 + 1.34 +#include <string.h> 1.35 + 1.36 +#include "unicode/uchar.h" 1.37 +#include "unicode/uniset.h" 1.38 +#include "unicode/ucnv.h" 1.39 +#include "unicode/ustring.h" 1.40 +#include "unicode/uchriter.h" 1.41 +#include "utrie2.h" 1.42 +#include "propsvec.h" 1.43 +#include "uassert.h" 1.44 +#include "ucmndata.h" 1.45 +#include "uenumimp.h" 1.46 +#include "cmemory.h" 1.47 +#include "cstring.h" 1.48 + 1.49 +U_NAMESPACE_USE 1.50 + 1.51 +struct UConverterSelector { 1.52 + UTrie2 *trie; // 16 bit trie containing offsets into pv 1.53 + uint32_t* pv; // table of bits! 1.54 + int32_t pvCount; 1.55 + char** encodings; // which encodings did user ask to use? 1.56 + int32_t encodingsCount; 1.57 + int32_t encodingStrLength; 1.58 + uint8_t* swapped; 1.59 + UBool ownPv, ownEncodingStrings; 1.60 +}; 1.61 + 1.62 +static void generateSelectorData(UConverterSelector* result, 1.63 + UPropsVectors *upvec, 1.64 + const USet* excludedCodePoints, 1.65 + const UConverterUnicodeSet whichSet, 1.66 + UErrorCode* status) { 1.67 + if (U_FAILURE(*status)) { 1.68 + return; 1.69 + } 1.70 + 1.71 + int32_t columns = (result->encodingsCount+31)/32; 1.72 + 1.73 + // set errorValue to all-ones 1.74 + for (int32_t col = 0; col < columns; col++) { 1.75 + upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 1.76 + col, ~0, ~0, status); 1.77 + } 1.78 + 1.79 + for (int32_t i = 0; i < result->encodingsCount; ++i) { 1.80 + uint32_t mask; 1.81 + uint32_t column; 1.82 + int32_t item_count; 1.83 + int32_t j; 1.84 + UConverter* test_converter = ucnv_open(result->encodings[i], status); 1.85 + if (U_FAILURE(*status)) { 1.86 + return; 1.87 + } 1.88 + USet* unicode_point_set; 1.89 + unicode_point_set = uset_open(1, 0); // empty set 1.90 + 1.91 + ucnv_getUnicodeSet(test_converter, unicode_point_set, 1.92 + whichSet, status); 1.93 + if (U_FAILURE(*status)) { 1.94 + ucnv_close(test_converter); 1.95 + return; 1.96 + } 1.97 + 1.98 + column = i / 32; 1.99 + mask = 1 << (i%32); 1.100 + // now iterate over intervals on set i! 1.101 + item_count = uset_getItemCount(unicode_point_set); 1.102 + 1.103 + for (j = 0; j < item_count; ++j) { 1.104 + UChar32 start_char; 1.105 + UChar32 end_char; 1.106 + UErrorCode smallStatus = U_ZERO_ERROR; 1.107 + uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 1.108 + &smallStatus); 1.109 + if (U_FAILURE(smallStatus)) { 1.110 + // this will be reached for the converters that fill the set with 1.111 + // strings. Those should be ignored by our system 1.112 + } else { 1.113 + upvec_setValue(upvec, start_char, end_char, column, ~0, mask, 1.114 + status); 1.115 + } 1.116 + } 1.117 + ucnv_close(test_converter); 1.118 + uset_close(unicode_point_set); 1.119 + if (U_FAILURE(*status)) { 1.120 + return; 1.121 + } 1.122 + } 1.123 + 1.124 + // handle excluded encodings! Simply set their values to all 1's in the upvec 1.125 + if (excludedCodePoints) { 1.126 + int32_t item_count = uset_getItemCount(excludedCodePoints); 1.127 + for (int32_t j = 0; j < item_count; ++j) { 1.128 + UChar32 start_char; 1.129 + UChar32 end_char; 1.130 + 1.131 + uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 1.132 + status); 1.133 + for (int32_t col = 0; col < columns; col++) { 1.134 + upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, 1.135 + status); 1.136 + } 1.137 + } 1.138 + } 1.139 + 1.140 + // alright. Now, let's put things in the same exact form you'd get when you 1.141 + // unserialize things. 1.142 + result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 1.143 + result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 1.144 + result->pvCount *= columns; // number of uint32_t = rows * columns 1.145 + result->ownPv = TRUE; 1.146 +} 1.147 + 1.148 +/* open a selector. If converterListSize is 0, build for all converters. 1.149 + If excludedCodePoints is NULL, don't exclude any codepoints */ 1.150 +U_CAPI UConverterSelector* U_EXPORT2 1.151 +ucnvsel_open(const char* const* converterList, int32_t converterListSize, 1.152 + const USet* excludedCodePoints, 1.153 + const UConverterUnicodeSet whichSet, UErrorCode* status) { 1.154 + // check if already failed 1.155 + if (U_FAILURE(*status)) { 1.156 + return NULL; 1.157 + } 1.158 + // ensure args make sense! 1.159 + if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 1.160 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.161 + return NULL; 1.162 + } 1.163 + 1.164 + // allocate a new converter 1.165 + LocalUConverterSelectorPointer newSelector( 1.166 + (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); 1.167 + if (newSelector.isNull()) { 1.168 + *status = U_MEMORY_ALLOCATION_ERROR; 1.169 + return NULL; 1.170 + } 1.171 + uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); 1.172 + 1.173 + if (converterListSize == 0) { 1.174 + converterList = NULL; 1.175 + converterListSize = ucnv_countAvailable(); 1.176 + } 1.177 + newSelector->encodings = 1.178 + (char**)uprv_malloc(converterListSize * sizeof(char*)); 1.179 + if (!newSelector->encodings) { 1.180 + *status = U_MEMORY_ALLOCATION_ERROR; 1.181 + return NULL; 1.182 + } 1.183 + newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 1.184 + 1.185 + // make a backup copy of the list of converters 1.186 + int32_t totalSize = 0; 1.187 + int32_t i; 1.188 + for (i = 0; i < converterListSize; i++) { 1.189 + totalSize += 1.190 + (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 1.191 + } 1.192 + // 4-align the totalSize to 4-align the size of the serialized form 1.193 + int32_t encodingStrPadding = totalSize & 3; 1.194 + if (encodingStrPadding != 0) { 1.195 + encodingStrPadding = 4 - encodingStrPadding; 1.196 + } 1.197 + newSelector->encodingStrLength = totalSize += encodingStrPadding; 1.198 + char* allStrings = (char*) uprv_malloc(totalSize); 1.199 + if (!allStrings) { 1.200 + *status = U_MEMORY_ALLOCATION_ERROR; 1.201 + return NULL; 1.202 + } 1.203 + 1.204 + for (i = 0; i < converterListSize; i++) { 1.205 + newSelector->encodings[i] = allStrings; 1.206 + uprv_strcpy(newSelector->encodings[i], 1.207 + converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 1.208 + allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 1.209 + } 1.210 + while (encodingStrPadding > 0) { 1.211 + *allStrings++ = 0; 1.212 + --encodingStrPadding; 1.213 + } 1.214 + 1.215 + newSelector->ownEncodingStrings = TRUE; 1.216 + newSelector->encodingsCount = converterListSize; 1.217 + UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 1.218 + generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); 1.219 + upvec_close(upvec); 1.220 + 1.221 + if (U_FAILURE(*status)) { 1.222 + return NULL; 1.223 + } 1.224 + 1.225 + return newSelector.orphan(); 1.226 +} 1.227 + 1.228 +/* close opened selector */ 1.229 +U_CAPI void U_EXPORT2 1.230 +ucnvsel_close(UConverterSelector *sel) { 1.231 + if (!sel) { 1.232 + return; 1.233 + } 1.234 + if (sel->ownEncodingStrings) { 1.235 + uprv_free(sel->encodings[0]); 1.236 + } 1.237 + uprv_free(sel->encodings); 1.238 + if (sel->ownPv) { 1.239 + uprv_free(sel->pv); 1.240 + } 1.241 + utrie2_close(sel->trie); 1.242 + uprv_free(sel->swapped); 1.243 + uprv_free(sel); 1.244 +} 1.245 + 1.246 +static const UDataInfo dataInfo = { 1.247 + sizeof(UDataInfo), 1.248 + 0, 1.249 + 1.250 + U_IS_BIG_ENDIAN, 1.251 + U_CHARSET_FAMILY, 1.252 + U_SIZEOF_UCHAR, 1.253 + 0, 1.254 + 1.255 + { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 1.256 + { 1, 0, 0, 0 }, /* formatVersion */ 1.257 + { 0, 0, 0, 0 } /* dataVersion */ 1.258 +}; 1.259 + 1.260 +enum { 1.261 + UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 1.262 + UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 1.263 + UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 1.264 + UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 1.265 + UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 1.266 + UCNVSEL_INDEX_COUNT = 16 1.267 +}; 1.268 + 1.269 +/* 1.270 + * Serialized form of a UConverterSelector, formatVersion 1: 1.271 + * 1.272 + * The serialized form begins with a standard ICU DataHeader with a UDataInfo 1.273 + * as the template above. 1.274 + * This is followed by: 1.275 + * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 1.276 + * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 1.277 + * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 1.278 + * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 1.279 + */ 1.280 + 1.281 +/* serialize a selector */ 1.282 +U_CAPI int32_t U_EXPORT2 1.283 +ucnvsel_serialize(const UConverterSelector* sel, 1.284 + void* buffer, int32_t bufferCapacity, UErrorCode* status) { 1.285 + // check if already failed 1.286 + if (U_FAILURE(*status)) { 1.287 + return 0; 1.288 + } 1.289 + // ensure args make sense! 1.290 + uint8_t *p = (uint8_t *)buffer; 1.291 + if (bufferCapacity < 0 || 1.292 + (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 1.293 + ) { 1.294 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.295 + return 0; 1.296 + } 1.297 + // add up the size of the serialized form 1.298 + int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 1.299 + if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 1.300 + return 0; 1.301 + } 1.302 + *status = U_ZERO_ERROR; 1.303 + 1.304 + DataHeader header; 1.305 + uprv_memset(&header, 0, sizeof(header)); 1.306 + header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 1.307 + header.dataHeader.magic1 = 0xda; 1.308 + header.dataHeader.magic2 = 0x27; 1.309 + uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 1.310 + 1.311 + int32_t indexes[UCNVSEL_INDEX_COUNT] = { 1.312 + serializedTrieSize, 1.313 + sel->pvCount, 1.314 + sel->encodingsCount, 1.315 + sel->encodingStrLength 1.316 + }; 1.317 + 1.318 + int32_t totalSize = 1.319 + header.dataHeader.headerSize + 1.320 + (int32_t)sizeof(indexes) + 1.321 + serializedTrieSize + 1.322 + sel->pvCount * 4 + 1.323 + sel->encodingStrLength; 1.324 + indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 1.325 + if (totalSize > bufferCapacity) { 1.326 + *status = U_BUFFER_OVERFLOW_ERROR; 1.327 + return totalSize; 1.328 + } 1.329 + // ok, save! 1.330 + int32_t length = header.dataHeader.headerSize; 1.331 + uprv_memcpy(p, &header, sizeof(header)); 1.332 + uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 1.333 + p += length; 1.334 + 1.335 + length = (int32_t)sizeof(indexes); 1.336 + uprv_memcpy(p, indexes, length); 1.337 + p += length; 1.338 + 1.339 + utrie2_serialize(sel->trie, p, serializedTrieSize, status); 1.340 + p += serializedTrieSize; 1.341 + 1.342 + length = sel->pvCount * 4; 1.343 + uprv_memcpy(p, sel->pv, length); 1.344 + p += length; 1.345 + 1.346 + uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 1.347 + p += sel->encodingStrLength; 1.348 + 1.349 + return totalSize; 1.350 +} 1.351 + 1.352 +/** 1.353 + * swap a selector into the desired Endianness and Asciiness of 1.354 + * the system. Just as FYI, selectors are always saved in the format 1.355 + * of the system that created them. They are only converted if used 1.356 + * on another system. In other words, selectors created on different 1.357 + * system can be different even if the params are identical (endianness 1.358 + * and Asciiness differences only) 1.359 + * 1.360 + * @param ds pointer to data swapper containing swapping info 1.361 + * @param inData pointer to incoming data 1.362 + * @param length length of inData in bytes 1.363 + * @param outData pointer to output data. Capacity should 1.364 + * be at least equal to capacity of inData 1.365 + * @param status an in/out ICU UErrorCode 1.366 + * @return 0 on failure, number of bytes swapped on success 1.367 + * number of bytes swapped can be smaller than length 1.368 + */ 1.369 +static int32_t 1.370 +ucnvsel_swap(const UDataSwapper *ds, 1.371 + const void *inData, int32_t length, 1.372 + void *outData, UErrorCode *status) { 1.373 + /* udata_swapDataHeader checks the arguments */ 1.374 + int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 1.375 + if(U_FAILURE(*status)) { 1.376 + return 0; 1.377 + } 1.378 + 1.379 + /* check data format and format version */ 1.380 + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 1.381 + if(!( 1.382 + pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 1.383 + pInfo->dataFormat[1] == 0x53 && 1.384 + pInfo->dataFormat[2] == 0x65 && 1.385 + pInfo->dataFormat[3] == 0x6c 1.386 + )) { 1.387 + udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 1.388 + pInfo->dataFormat[0], pInfo->dataFormat[1], 1.389 + pInfo->dataFormat[2], pInfo->dataFormat[3]); 1.390 + *status = U_INVALID_FORMAT_ERROR; 1.391 + return 0; 1.392 + } 1.393 + if(pInfo->formatVersion[0] != 1) { 1.394 + udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 1.395 + pInfo->formatVersion[0]); 1.396 + *status = U_UNSUPPORTED_ERROR; 1.397 + return 0; 1.398 + } 1.399 + 1.400 + if(length >= 0) { 1.401 + length -= headerSize; 1.402 + if(length < 16*4) { 1.403 + udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 1.404 + length); 1.405 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.406 + return 0; 1.407 + } 1.408 + } 1.409 + 1.410 + const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 1.411 + uint8_t *outBytes = (uint8_t *)outData + headerSize; 1.412 + 1.413 + /* read the indexes */ 1.414 + const int32_t *inIndexes = (const int32_t *)inBytes; 1.415 + int32_t indexes[16]; 1.416 + int32_t i; 1.417 + for(i = 0; i < 16; ++i) { 1.418 + indexes[i] = udata_readInt32(ds, inIndexes[i]); 1.419 + } 1.420 + 1.421 + /* get the total length of the data */ 1.422 + int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 1.423 + if(length >= 0) { 1.424 + if(length < size) { 1.425 + udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 1.426 + length); 1.427 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.428 + return 0; 1.429 + } 1.430 + 1.431 + /* copy the data for inaccessible bytes */ 1.432 + if(inBytes != outBytes) { 1.433 + uprv_memcpy(outBytes, inBytes, size); 1.434 + } 1.435 + 1.436 + int32_t offset = 0, count; 1.437 + 1.438 + /* swap the int32_t indexes[] */ 1.439 + count = UCNVSEL_INDEX_COUNT*4; 1.440 + ds->swapArray32(ds, inBytes, count, outBytes, status); 1.441 + offset += count; 1.442 + 1.443 + /* swap the UTrie2 */ 1.444 + count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 1.445 + utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 1.446 + offset += count; 1.447 + 1.448 + /* swap the uint32_t pv[] */ 1.449 + count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 1.450 + ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 1.451 + offset += count; 1.452 + 1.453 + /* swap the encoding names */ 1.454 + count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 1.455 + ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 1.456 + offset += count; 1.457 + 1.458 + U_ASSERT(offset == size); 1.459 + } 1.460 + 1.461 + return headerSize + size; 1.462 +} 1.463 + 1.464 +/* unserialize a selector */ 1.465 +U_CAPI UConverterSelector* U_EXPORT2 1.466 +ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 1.467 + // check if already failed 1.468 + if (U_FAILURE(*status)) { 1.469 + return NULL; 1.470 + } 1.471 + // ensure args make sense! 1.472 + const uint8_t *p = (const uint8_t *)buffer; 1.473 + if (length <= 0 || 1.474 + (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 1.475 + ) { 1.476 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.477 + return NULL; 1.478 + } 1.479 + // header 1.480 + if (length < 32) { 1.481 + // not even enough space for a minimal header 1.482 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.483 + return NULL; 1.484 + } 1.485 + const DataHeader *pHeader = (const DataHeader *)p; 1.486 + if (!( 1.487 + pHeader->dataHeader.magic1==0xda && 1.488 + pHeader->dataHeader.magic2==0x27 && 1.489 + pHeader->info.dataFormat[0] == 0x43 && 1.490 + pHeader->info.dataFormat[1] == 0x53 && 1.491 + pHeader->info.dataFormat[2] == 0x65 && 1.492 + pHeader->info.dataFormat[3] == 0x6c 1.493 + )) { 1.494 + /* header not valid or dataFormat not recognized */ 1.495 + *status = U_INVALID_FORMAT_ERROR; 1.496 + return NULL; 1.497 + } 1.498 + if (pHeader->info.formatVersion[0] != 1) { 1.499 + *status = U_UNSUPPORTED_ERROR; 1.500 + return NULL; 1.501 + } 1.502 + uint8_t* swapped = NULL; 1.503 + if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 1.504 + pHeader->info.charsetFamily != U_CHARSET_FAMILY 1.505 + ) { 1.506 + // swap the data 1.507 + UDataSwapper *ds = 1.508 + udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 1.509 + int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 1.510 + if (U_FAILURE(*status)) { 1.511 + udata_closeSwapper(ds); 1.512 + return NULL; 1.513 + } 1.514 + if (length < totalSize) { 1.515 + udata_closeSwapper(ds); 1.516 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.517 + return NULL; 1.518 + } 1.519 + swapped = (uint8_t*)uprv_malloc(totalSize); 1.520 + if (swapped == NULL) { 1.521 + udata_closeSwapper(ds); 1.522 + *status = U_MEMORY_ALLOCATION_ERROR; 1.523 + return NULL; 1.524 + } 1.525 + ucnvsel_swap(ds, p, length, swapped, status); 1.526 + udata_closeSwapper(ds); 1.527 + if (U_FAILURE(*status)) { 1.528 + uprv_free(swapped); 1.529 + return NULL; 1.530 + } 1.531 + p = swapped; 1.532 + pHeader = (const DataHeader *)p; 1.533 + } 1.534 + if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 1.535 + // not even enough space for the header and the indexes 1.536 + uprv_free(swapped); 1.537 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.538 + return NULL; 1.539 + } 1.540 + p += pHeader->dataHeader.headerSize; 1.541 + length -= pHeader->dataHeader.headerSize; 1.542 + // indexes 1.543 + const int32_t *indexes = (const int32_t *)p; 1.544 + if (length < indexes[UCNVSEL_INDEX_SIZE]) { 1.545 + uprv_free(swapped); 1.546 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.547 + return NULL; 1.548 + } 1.549 + p += UCNVSEL_INDEX_COUNT * 4; 1.550 + // create and populate the selector object 1.551 + UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 1.552 + char **encodings = 1.553 + (char **)uprv_malloc( 1.554 + indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 1.555 + if (sel == NULL || encodings == NULL) { 1.556 + uprv_free(swapped); 1.557 + uprv_free(sel); 1.558 + uprv_free(encodings); 1.559 + *status = U_MEMORY_ALLOCATION_ERROR; 1.560 + return NULL; 1.561 + } 1.562 + uprv_memset(sel, 0, sizeof(UConverterSelector)); 1.563 + sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 1.564 + sel->encodings = encodings; 1.565 + sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 1.566 + sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 1.567 + sel->swapped = swapped; 1.568 + // trie 1.569 + sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 1.570 + p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 1.571 + status); 1.572 + p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 1.573 + if (U_FAILURE(*status)) { 1.574 + ucnvsel_close(sel); 1.575 + return NULL; 1.576 + } 1.577 + // bit vectors 1.578 + sel->pv = (uint32_t *)p; 1.579 + p += sel->pvCount * 4; 1.580 + // encoding names 1.581 + char* s = (char*)p; 1.582 + for (int32_t i = 0; i < sel->encodingsCount; ++i) { 1.583 + sel->encodings[i] = s; 1.584 + s += uprv_strlen(s) + 1; 1.585 + } 1.586 + p += sel->encodingStrLength; 1.587 + 1.588 + return sel; 1.589 +} 1.590 + 1.591 +// a bunch of functions for the enumeration thingie! Nothing fancy here. Just 1.592 +// iterate over the selected encodings 1.593 +struct Enumerator { 1.594 + int16_t* index; 1.595 + int16_t length; 1.596 + int16_t cur; 1.597 + const UConverterSelector* sel; 1.598 +}; 1.599 + 1.600 +U_CDECL_BEGIN 1.601 + 1.602 +static void U_CALLCONV 1.603 +ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 1.604 + uprv_free(((Enumerator*)(enumerator->context))->index); 1.605 + uprv_free(enumerator->context); 1.606 + uprv_free(enumerator); 1.607 +} 1.608 + 1.609 + 1.610 +static int32_t U_CALLCONV 1.611 +ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 1.612 + // check if already failed 1.613 + if (U_FAILURE(*status)) { 1.614 + return 0; 1.615 + } 1.616 + return ((Enumerator*)(enumerator->context))->length; 1.617 +} 1.618 + 1.619 + 1.620 +static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 1.621 + int32_t* resultLength, 1.622 + UErrorCode* status) { 1.623 + // check if already failed 1.624 + if (U_FAILURE(*status)) { 1.625 + return NULL; 1.626 + } 1.627 + 1.628 + int16_t cur = ((Enumerator*)(enumerator->context))->cur; 1.629 + const UConverterSelector* sel; 1.630 + const char* result; 1.631 + if (cur >= ((Enumerator*)(enumerator->context))->length) { 1.632 + return NULL; 1.633 + } 1.634 + sel = ((Enumerator*)(enumerator->context))->sel; 1.635 + result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 1.636 + ((Enumerator*)(enumerator->context))->cur++; 1.637 + if (resultLength) { 1.638 + *resultLength = (int32_t)uprv_strlen(result); 1.639 + } 1.640 + return result; 1.641 +} 1.642 + 1.643 +static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 1.644 + UErrorCode* status) { 1.645 + // check if already failed 1.646 + if (U_FAILURE(*status)) { 1.647 + return ; 1.648 + } 1.649 + ((Enumerator*)(enumerator->context))->cur = 0; 1.650 +} 1.651 + 1.652 +U_CDECL_END 1.653 + 1.654 + 1.655 +static const UEnumeration defaultEncodings = { 1.656 + NULL, 1.657 + NULL, 1.658 + ucnvsel_close_selector_iterator, 1.659 + ucnvsel_count_encodings, 1.660 + uenum_unextDefault, 1.661 + ucnvsel_next_encoding, 1.662 + ucnvsel_reset_iterator 1.663 +}; 1.664 + 1.665 + 1.666 +// internal fn to intersect two sets of masks 1.667 +// returns whether the mask has reduced to all zeros 1.668 +static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 1.669 + int32_t i; 1.670 + uint32_t oredDest = 0; 1.671 + for (i = 0 ; i < len ; ++i) { 1.672 + oredDest |= (dest[i] &= source1[i]); 1.673 + } 1.674 + return oredDest == 0; 1.675 +} 1.676 + 1.677 +// internal fn to count how many 1's are there in a mask 1.678 +// algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 1.679 +static int16_t countOnes(uint32_t* mask, int32_t len) { 1.680 + int32_t i, totalOnes = 0; 1.681 + for (i = 0 ; i < len ; ++i) { 1.682 + uint32_t ent = mask[i]; 1.683 + for (; ent; totalOnes++) 1.684 + { 1.685 + ent &= ent - 1; // clear the least significant bit set 1.686 + } 1.687 + } 1.688 + return totalOnes; 1.689 +} 1.690 + 1.691 + 1.692 +/* internal function! */ 1.693 +static UEnumeration *selectForMask(const UConverterSelector* sel, 1.694 + uint32_t *mask, UErrorCode *status) { 1.695 + // this is the context we will use. Store a table of indices to which 1.696 + // encodings are legit. 1.697 + struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 1.698 + if (result == NULL) { 1.699 + uprv_free(mask); 1.700 + *status = U_MEMORY_ALLOCATION_ERROR; 1.701 + return NULL; 1.702 + } 1.703 + result->index = NULL; // this will be allocated later! 1.704 + result->length = result->cur = 0; 1.705 + result->sel = sel; 1.706 + 1.707 + UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 1.708 + if (en == NULL) { 1.709 + // TODO(markus): Combine Enumerator and UEnumeration into one struct. 1.710 + uprv_free(mask); 1.711 + uprv_free(result); 1.712 + *status = U_MEMORY_ALLOCATION_ERROR; 1.713 + return NULL; 1.714 + } 1.715 + memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 1.716 + en->context = result; 1.717 + 1.718 + int32_t columns = (sel->encodingsCount+31)/32; 1.719 + int16_t numOnes = countOnes(mask, columns); 1.720 + // now, we know the exact space we need for index 1.721 + if (numOnes > 0) { 1.722 + result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 1.723 + 1.724 + int32_t i, j; 1.725 + int16_t k = 0; 1.726 + for (j = 0 ; j < columns; j++) { 1.727 + uint32_t v = mask[j]; 1.728 + for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 1.729 + if ((v & 1) != 0) { 1.730 + result->index[result->length++] = k; 1.731 + } 1.732 + v >>= 1; 1.733 + } 1.734 + } 1.735 + } //otherwise, index will remain NULL (and will never be touched by 1.736 + //the enumerator code anyway) 1.737 + uprv_free(mask); 1.738 + return en; 1.739 +} 1.740 + 1.741 +/* check a string against the selector - UTF16 version */ 1.742 +U_CAPI UEnumeration * U_EXPORT2 1.743 +ucnvsel_selectForString(const UConverterSelector* sel, 1.744 + const UChar *s, int32_t length, UErrorCode *status) { 1.745 + // check if already failed 1.746 + if (U_FAILURE(*status)) { 1.747 + return NULL; 1.748 + } 1.749 + // ensure args make sense! 1.750 + if (sel == NULL || (s == NULL && length != 0)) { 1.751 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.752 + return NULL; 1.753 + } 1.754 + 1.755 + int32_t columns = (sel->encodingsCount+31)/32; 1.756 + uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 1.757 + if (mask == NULL) { 1.758 + *status = U_MEMORY_ALLOCATION_ERROR; 1.759 + return NULL; 1.760 + } 1.761 + uprv_memset(mask, ~0, columns *4); 1.762 + 1.763 + if(s!=NULL) { 1.764 + const UChar *limit; 1.765 + if (length >= 0) { 1.766 + limit = s + length; 1.767 + } else { 1.768 + limit = NULL; 1.769 + } 1.770 + 1.771 + while (limit == NULL ? *s != 0 : s != limit) { 1.772 + UChar32 c; 1.773 + uint16_t pvIndex; 1.774 + UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 1.775 + if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 1.776 + break; 1.777 + } 1.778 + } 1.779 + } 1.780 + return selectForMask(sel, mask, status); 1.781 +} 1.782 + 1.783 +/* check a string against the selector - UTF8 version */ 1.784 +U_CAPI UEnumeration * U_EXPORT2 1.785 +ucnvsel_selectForUTF8(const UConverterSelector* sel, 1.786 + const char *s, int32_t length, UErrorCode *status) { 1.787 + // check if already failed 1.788 + if (U_FAILURE(*status)) { 1.789 + return NULL; 1.790 + } 1.791 + // ensure args make sense! 1.792 + if (sel == NULL || (s == NULL && length != 0)) { 1.793 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.794 + return NULL; 1.795 + } 1.796 + 1.797 + int32_t columns = (sel->encodingsCount+31)/32; 1.798 + uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 1.799 + if (mask == NULL) { 1.800 + *status = U_MEMORY_ALLOCATION_ERROR; 1.801 + return NULL; 1.802 + } 1.803 + uprv_memset(mask, ~0, columns *4); 1.804 + 1.805 + if (length < 0) { 1.806 + length = (int32_t)uprv_strlen(s); 1.807 + } 1.808 + 1.809 + if(s!=NULL) { 1.810 + const char *limit = s + length; 1.811 + 1.812 + while (s != limit) { 1.813 + uint16_t pvIndex; 1.814 + UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 1.815 + if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 1.816 + break; 1.817 + } 1.818 + } 1.819 + } 1.820 + return selectForMask(sel, mask, status); 1.821 +} 1.822 + 1.823 +#endif // !UCONFIG_NO_CONVERSION