michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2008-2011, International Business Machines michael@0: * Corporation, Google and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: */ michael@0: // Author : eldawy@google.com (Mohamed Eldawy) michael@0: // ucnvsel.cpp michael@0: // michael@0: // Purpose: To generate a list of encodings capable of handling michael@0: // a given Unicode text michael@0: // michael@0: // Started 09-April-2008 michael@0: michael@0: /** michael@0: * \file michael@0: * michael@0: * This is an implementation of an encoding selector. michael@0: * The goal is, given a unicode string, find the encodings michael@0: * this string can be mapped to. To make processing faster michael@0: * a trie is built when you call ucnvsel_open() that michael@0: * stores all encodings a codepoint can map to michael@0: */ michael@0: michael@0: #include "unicode/ucnvsel.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include michael@0: michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/ucnv.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/uchriter.h" michael@0: #include "utrie2.h" michael@0: #include "propsvec.h" michael@0: #include "uassert.h" michael@0: #include "ucmndata.h" michael@0: #include "uenumimp.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: struct UConverterSelector { michael@0: UTrie2 *trie; // 16 bit trie containing offsets into pv michael@0: uint32_t* pv; // table of bits! michael@0: int32_t pvCount; michael@0: char** encodings; // which encodings did user ask to use? michael@0: int32_t encodingsCount; michael@0: int32_t encodingStrLength; michael@0: uint8_t* swapped; michael@0: UBool ownPv, ownEncodingStrings; michael@0: }; michael@0: michael@0: static void generateSelectorData(UConverterSelector* result, michael@0: UPropsVectors *upvec, michael@0: const USet* excludedCodePoints, michael@0: const UConverterUnicodeSet whichSet, michael@0: UErrorCode* status) { michael@0: if (U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: michael@0: int32_t columns = (result->encodingsCount+31)/32; michael@0: michael@0: // set errorValue to all-ones michael@0: for (int32_t col = 0; col < columns; col++) { michael@0: upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, michael@0: col, ~0, ~0, status); michael@0: } michael@0: michael@0: for (int32_t i = 0; i < result->encodingsCount; ++i) { michael@0: uint32_t mask; michael@0: uint32_t column; michael@0: int32_t item_count; michael@0: int32_t j; michael@0: UConverter* test_converter = ucnv_open(result->encodings[i], status); michael@0: if (U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: USet* unicode_point_set; michael@0: unicode_point_set = uset_open(1, 0); // empty set michael@0: michael@0: ucnv_getUnicodeSet(test_converter, unicode_point_set, michael@0: whichSet, status); michael@0: if (U_FAILURE(*status)) { michael@0: ucnv_close(test_converter); michael@0: return; michael@0: } michael@0: michael@0: column = i / 32; michael@0: mask = 1 << (i%32); michael@0: // now iterate over intervals on set i! michael@0: item_count = uset_getItemCount(unicode_point_set); michael@0: michael@0: for (j = 0; j < item_count; ++j) { michael@0: UChar32 start_char; michael@0: UChar32 end_char; michael@0: UErrorCode smallStatus = U_ZERO_ERROR; michael@0: uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, michael@0: &smallStatus); michael@0: if (U_FAILURE(smallStatus)) { michael@0: // this will be reached for the converters that fill the set with michael@0: // strings. Those should be ignored by our system michael@0: } else { michael@0: upvec_setValue(upvec, start_char, end_char, column, ~0, mask, michael@0: status); michael@0: } michael@0: } michael@0: ucnv_close(test_converter); michael@0: uset_close(unicode_point_set); michael@0: if (U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: // handle excluded encodings! Simply set their values to all 1's in the upvec michael@0: if (excludedCodePoints) { michael@0: int32_t item_count = uset_getItemCount(excludedCodePoints); michael@0: for (int32_t j = 0; j < item_count; ++j) { michael@0: UChar32 start_char; michael@0: UChar32 end_char; michael@0: michael@0: uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, michael@0: status); michael@0: for (int32_t col = 0; col < columns; col++) { michael@0: upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, michael@0: status); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // alright. Now, let's put things in the same exact form you'd get when you michael@0: // unserialize things. michael@0: result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); michael@0: result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); michael@0: result->pvCount *= columns; // number of uint32_t = rows * columns michael@0: result->ownPv = TRUE; michael@0: } michael@0: michael@0: /* open a selector. If converterListSize is 0, build for all converters. michael@0: If excludedCodePoints is NULL, don't exclude any codepoints */ michael@0: U_CAPI UConverterSelector* U_EXPORT2 michael@0: ucnvsel_open(const char* const* converterList, int32_t converterListSize, michael@0: const USet* excludedCodePoints, michael@0: const UConverterUnicodeSet whichSet, UErrorCode* status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: // ensure args make sense! michael@0: if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: // allocate a new converter michael@0: LocalUConverterSelectorPointer newSelector( michael@0: (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); michael@0: if (newSelector.isNull()) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); michael@0: michael@0: if (converterListSize == 0) { michael@0: converterList = NULL; michael@0: converterListSize = ucnv_countAvailable(); michael@0: } michael@0: newSelector->encodings = michael@0: (char**)uprv_malloc(converterListSize * sizeof(char*)); michael@0: if (!newSelector->encodings) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() michael@0: michael@0: // make a backup copy of the list of converters michael@0: int32_t totalSize = 0; michael@0: int32_t i; michael@0: for (i = 0; i < converterListSize; i++) { michael@0: totalSize += michael@0: (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; michael@0: } michael@0: // 4-align the totalSize to 4-align the size of the serialized form michael@0: int32_t encodingStrPadding = totalSize & 3; michael@0: if (encodingStrPadding != 0) { michael@0: encodingStrPadding = 4 - encodingStrPadding; michael@0: } michael@0: newSelector->encodingStrLength = totalSize += encodingStrPadding; michael@0: char* allStrings = (char*) uprv_malloc(totalSize); michael@0: if (!allStrings) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: for (i = 0; i < converterListSize; i++) { michael@0: newSelector->encodings[i] = allStrings; michael@0: uprv_strcpy(newSelector->encodings[i], michael@0: converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); michael@0: allStrings += uprv_strlen(newSelector->encodings[i]) + 1; michael@0: } michael@0: while (encodingStrPadding > 0) { michael@0: *allStrings++ = 0; michael@0: --encodingStrPadding; michael@0: } michael@0: michael@0: newSelector->ownEncodingStrings = TRUE; michael@0: newSelector->encodingsCount = converterListSize; michael@0: UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); michael@0: generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); michael@0: upvec_close(upvec); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: michael@0: return newSelector.orphan(); michael@0: } michael@0: michael@0: /* close opened selector */ michael@0: U_CAPI void U_EXPORT2 michael@0: ucnvsel_close(UConverterSelector *sel) { michael@0: if (!sel) { michael@0: return; michael@0: } michael@0: if (sel->ownEncodingStrings) { michael@0: uprv_free(sel->encodings[0]); michael@0: } michael@0: uprv_free(sel->encodings); michael@0: if (sel->ownPv) { michael@0: uprv_free(sel->pv); michael@0: } michael@0: utrie2_close(sel->trie); michael@0: uprv_free(sel->swapped); michael@0: uprv_free(sel); michael@0: } michael@0: michael@0: static const UDataInfo dataInfo = { michael@0: sizeof(UDataInfo), michael@0: 0, michael@0: michael@0: U_IS_BIG_ENDIAN, michael@0: U_CHARSET_FAMILY, michael@0: U_SIZEOF_UCHAR, michael@0: 0, michael@0: michael@0: { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ michael@0: { 1, 0, 0, 0 }, /* formatVersion */ michael@0: { 0, 0, 0, 0 } /* dataVersion */ michael@0: }; michael@0: michael@0: enum { michael@0: UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes michael@0: UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors michael@0: UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names michael@0: UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding michael@0: UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader michael@0: UCNVSEL_INDEX_COUNT = 16 michael@0: }; michael@0: michael@0: /* michael@0: * Serialized form of a UConverterSelector, formatVersion 1: michael@0: * michael@0: * The serialized form begins with a standard ICU DataHeader with a UDataInfo michael@0: * as the template above. michael@0: * This is followed by: michael@0: * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above michael@0: * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes michael@0: * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors michael@0: * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding michael@0: */ michael@0: michael@0: /* serialize a selector */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: ucnvsel_serialize(const UConverterSelector* sel, michael@0: void* buffer, int32_t bufferCapacity, UErrorCode* status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: // ensure args make sense! michael@0: uint8_t *p = (uint8_t *)buffer; michael@0: if (bufferCapacity < 0 || michael@0: (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) michael@0: ) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: // add up the size of the serialized form michael@0: int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); michael@0: if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: *status = U_ZERO_ERROR; michael@0: michael@0: DataHeader header; michael@0: uprv_memset(&header, 0, sizeof(header)); michael@0: header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); michael@0: header.dataHeader.magic1 = 0xda; michael@0: header.dataHeader.magic2 = 0x27; michael@0: uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); michael@0: michael@0: int32_t indexes[UCNVSEL_INDEX_COUNT] = { michael@0: serializedTrieSize, michael@0: sel->pvCount, michael@0: sel->encodingsCount, michael@0: sel->encodingStrLength michael@0: }; michael@0: michael@0: int32_t totalSize = michael@0: header.dataHeader.headerSize + michael@0: (int32_t)sizeof(indexes) + michael@0: serializedTrieSize + michael@0: sel->pvCount * 4 + michael@0: sel->encodingStrLength; michael@0: indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; michael@0: if (totalSize > bufferCapacity) { michael@0: *status = U_BUFFER_OVERFLOW_ERROR; michael@0: return totalSize; michael@0: } michael@0: // ok, save! michael@0: int32_t length = header.dataHeader.headerSize; michael@0: uprv_memcpy(p, &header, sizeof(header)); michael@0: uprv_memset(p + sizeof(header), 0, length - sizeof(header)); michael@0: p += length; michael@0: michael@0: length = (int32_t)sizeof(indexes); michael@0: uprv_memcpy(p, indexes, length); michael@0: p += length; michael@0: michael@0: utrie2_serialize(sel->trie, p, serializedTrieSize, status); michael@0: p += serializedTrieSize; michael@0: michael@0: length = sel->pvCount * 4; michael@0: uprv_memcpy(p, sel->pv, length); michael@0: p += length; michael@0: michael@0: uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); michael@0: p += sel->encodingStrLength; michael@0: michael@0: return totalSize; michael@0: } michael@0: michael@0: /** michael@0: * swap a selector into the desired Endianness and Asciiness of michael@0: * the system. Just as FYI, selectors are always saved in the format michael@0: * of the system that created them. They are only converted if used michael@0: * on another system. In other words, selectors created on different michael@0: * system can be different even if the params are identical (endianness michael@0: * and Asciiness differences only) michael@0: * michael@0: * @param ds pointer to data swapper containing swapping info michael@0: * @param inData pointer to incoming data michael@0: * @param length length of inData in bytes michael@0: * @param outData pointer to output data. Capacity should michael@0: * be at least equal to capacity of inData michael@0: * @param status an in/out ICU UErrorCode michael@0: * @return 0 on failure, number of bytes swapped on success michael@0: * number of bytes swapped can be smaller than length michael@0: */ michael@0: static int32_t michael@0: ucnvsel_swap(const UDataSwapper *ds, michael@0: const void *inData, int32_t length, michael@0: void *outData, UErrorCode *status) { michael@0: /* udata_swapDataHeader checks the arguments */ michael@0: int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: /* check data format and format version */ michael@0: const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); michael@0: if(!( michael@0: pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ michael@0: pInfo->dataFormat[1] == 0x53 && michael@0: pInfo->dataFormat[2] == 0x65 && michael@0: pInfo->dataFormat[3] == 0x6c michael@0: )) { michael@0: udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], michael@0: pInfo->dataFormat[2], pInfo->dataFormat[3]); michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: if(pInfo->formatVersion[0] != 1) { michael@0: udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", michael@0: pInfo->formatVersion[0]); michael@0: *status = U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if(length >= 0) { michael@0: length -= headerSize; michael@0: if(length < 16*4) { michael@0: udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", michael@0: length); michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: const uint8_t *inBytes = (const uint8_t *)inData + headerSize; michael@0: uint8_t *outBytes = (uint8_t *)outData + headerSize; michael@0: michael@0: /* read the indexes */ michael@0: const int32_t *inIndexes = (const int32_t *)inBytes; michael@0: int32_t indexes[16]; michael@0: int32_t i; michael@0: for(i = 0; i < 16; ++i) { michael@0: indexes[i] = udata_readInt32(ds, inIndexes[i]); michael@0: } michael@0: michael@0: /* get the total length of the data */ michael@0: int32_t size = indexes[UCNVSEL_INDEX_SIZE]; michael@0: if(length >= 0) { michael@0: if(length < size) { michael@0: udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", michael@0: length); michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* copy the data for inaccessible bytes */ michael@0: if(inBytes != outBytes) { michael@0: uprv_memcpy(outBytes, inBytes, size); michael@0: } michael@0: michael@0: int32_t offset = 0, count; michael@0: michael@0: /* swap the int32_t indexes[] */ michael@0: count = UCNVSEL_INDEX_COUNT*4; michael@0: ds->swapArray32(ds, inBytes, count, outBytes, status); michael@0: offset += count; michael@0: michael@0: /* swap the UTrie2 */ michael@0: count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; michael@0: utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); michael@0: offset += count; michael@0: michael@0: /* swap the uint32_t pv[] */ michael@0: count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; michael@0: ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); michael@0: offset += count; michael@0: michael@0: /* swap the encoding names */ michael@0: count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; michael@0: ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); michael@0: offset += count; michael@0: michael@0: U_ASSERT(offset == size); michael@0: } michael@0: michael@0: return headerSize + size; michael@0: } michael@0: michael@0: /* unserialize a selector */ michael@0: U_CAPI UConverterSelector* U_EXPORT2 michael@0: ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: // ensure args make sense! michael@0: const uint8_t *p = (const uint8_t *)buffer; michael@0: if (length <= 0 || michael@0: (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) michael@0: ) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: // header michael@0: if (length < 32) { michael@0: // not even enough space for a minimal header michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return NULL; michael@0: } michael@0: const DataHeader *pHeader = (const DataHeader *)p; michael@0: if (!( michael@0: pHeader->dataHeader.magic1==0xda && michael@0: pHeader->dataHeader.magic2==0x27 && michael@0: pHeader->info.dataFormat[0] == 0x43 && michael@0: pHeader->info.dataFormat[1] == 0x53 && michael@0: pHeader->info.dataFormat[2] == 0x65 && michael@0: pHeader->info.dataFormat[3] == 0x6c michael@0: )) { michael@0: /* header not valid or dataFormat not recognized */ michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: return NULL; michael@0: } michael@0: if (pHeader->info.formatVersion[0] != 1) { michael@0: *status = U_UNSUPPORTED_ERROR; michael@0: return NULL; michael@0: } michael@0: uint8_t* swapped = NULL; michael@0: if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || michael@0: pHeader->info.charsetFamily != U_CHARSET_FAMILY michael@0: ) { michael@0: // swap the data michael@0: UDataSwapper *ds = michael@0: udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); michael@0: int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); michael@0: if (U_FAILURE(*status)) { michael@0: udata_closeSwapper(ds); michael@0: return NULL; michael@0: } michael@0: if (length < totalSize) { michael@0: udata_closeSwapper(ds); michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return NULL; michael@0: } michael@0: swapped = (uint8_t*)uprv_malloc(totalSize); michael@0: if (swapped == NULL) { michael@0: udata_closeSwapper(ds); michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: ucnvsel_swap(ds, p, length, swapped, status); michael@0: udata_closeSwapper(ds); michael@0: if (U_FAILURE(*status)) { michael@0: uprv_free(swapped); michael@0: return NULL; michael@0: } michael@0: p = swapped; michael@0: pHeader = (const DataHeader *)p; michael@0: } michael@0: if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { michael@0: // not even enough space for the header and the indexes michael@0: uprv_free(swapped); michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return NULL; michael@0: } michael@0: p += pHeader->dataHeader.headerSize; michael@0: length -= pHeader->dataHeader.headerSize; michael@0: // indexes michael@0: const int32_t *indexes = (const int32_t *)p; michael@0: if (length < indexes[UCNVSEL_INDEX_SIZE]) { michael@0: uprv_free(swapped); michael@0: *status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return NULL; michael@0: } michael@0: p += UCNVSEL_INDEX_COUNT * 4; michael@0: // create and populate the selector object michael@0: UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); michael@0: char **encodings = michael@0: (char **)uprv_malloc( michael@0: indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); michael@0: if (sel == NULL || encodings == NULL) { michael@0: uprv_free(swapped); michael@0: uprv_free(sel); michael@0: uprv_free(encodings); michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(sel, 0, sizeof(UConverterSelector)); michael@0: sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; michael@0: sel->encodings = encodings; michael@0: sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; michael@0: sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; michael@0: sel->swapped = swapped; michael@0: // trie michael@0: sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, michael@0: p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, michael@0: status); michael@0: p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; michael@0: if (U_FAILURE(*status)) { michael@0: ucnvsel_close(sel); michael@0: return NULL; michael@0: } michael@0: // bit vectors michael@0: sel->pv = (uint32_t *)p; michael@0: p += sel->pvCount * 4; michael@0: // encoding names michael@0: char* s = (char*)p; michael@0: for (int32_t i = 0; i < sel->encodingsCount; ++i) { michael@0: sel->encodings[i] = s; michael@0: s += uprv_strlen(s) + 1; michael@0: } michael@0: p += sel->encodingStrLength; michael@0: michael@0: return sel; michael@0: } michael@0: michael@0: // a bunch of functions for the enumeration thingie! Nothing fancy here. Just michael@0: // iterate over the selected encodings michael@0: struct Enumerator { michael@0: int16_t* index; michael@0: int16_t length; michael@0: int16_t cur; michael@0: const UConverterSelector* sel; michael@0: }; michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: static void U_CALLCONV michael@0: ucnvsel_close_selector_iterator(UEnumeration *enumerator) { michael@0: uprv_free(((Enumerator*)(enumerator->context))->index); michael@0: uprv_free(enumerator->context); michael@0: uprv_free(enumerator); michael@0: } michael@0: michael@0: michael@0: static int32_t U_CALLCONV michael@0: ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: return ((Enumerator*)(enumerator->context))->length; michael@0: } michael@0: michael@0: michael@0: static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, michael@0: int32_t* resultLength, michael@0: UErrorCode* status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: michael@0: int16_t cur = ((Enumerator*)(enumerator->context))->cur; michael@0: const UConverterSelector* sel; michael@0: const char* result; michael@0: if (cur >= ((Enumerator*)(enumerator->context))->length) { michael@0: return NULL; michael@0: } michael@0: sel = ((Enumerator*)(enumerator->context))->sel; michael@0: result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; michael@0: ((Enumerator*)(enumerator->context))->cur++; michael@0: if (resultLength) { michael@0: *resultLength = (int32_t)uprv_strlen(result); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, michael@0: UErrorCode* status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return ; michael@0: } michael@0: ((Enumerator*)(enumerator->context))->cur = 0; michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: michael@0: static const UEnumeration defaultEncodings = { michael@0: NULL, michael@0: NULL, michael@0: ucnvsel_close_selector_iterator, michael@0: ucnvsel_count_encodings, michael@0: uenum_unextDefault, michael@0: ucnvsel_next_encoding, michael@0: ucnvsel_reset_iterator michael@0: }; michael@0: michael@0: michael@0: // internal fn to intersect two sets of masks michael@0: // returns whether the mask has reduced to all zeros michael@0: static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { michael@0: int32_t i; michael@0: uint32_t oredDest = 0; michael@0: for (i = 0 ; i < len ; ++i) { michael@0: oredDest |= (dest[i] &= source1[i]); michael@0: } michael@0: return oredDest == 0; michael@0: } michael@0: michael@0: // internal fn to count how many 1's are there in a mask michael@0: // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html michael@0: static int16_t countOnes(uint32_t* mask, int32_t len) { michael@0: int32_t i, totalOnes = 0; michael@0: for (i = 0 ; i < len ; ++i) { michael@0: uint32_t ent = mask[i]; michael@0: for (; ent; totalOnes++) michael@0: { michael@0: ent &= ent - 1; // clear the least significant bit set michael@0: } michael@0: } michael@0: return totalOnes; michael@0: } michael@0: michael@0: michael@0: /* internal function! */ michael@0: static UEnumeration *selectForMask(const UConverterSelector* sel, michael@0: uint32_t *mask, UErrorCode *status) { michael@0: // this is the context we will use. Store a table of indices to which michael@0: // encodings are legit. michael@0: struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); michael@0: if (result == NULL) { michael@0: uprv_free(mask); michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: result->index = NULL; // this will be allocated later! michael@0: result->length = result->cur = 0; michael@0: result->sel = sel; michael@0: michael@0: UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); michael@0: if (en == NULL) { michael@0: // TODO(markus): Combine Enumerator and UEnumeration into one struct. michael@0: uprv_free(mask); michael@0: uprv_free(result); michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: memcpy(en, &defaultEncodings, sizeof(UEnumeration)); michael@0: en->context = result; michael@0: michael@0: int32_t columns = (sel->encodingsCount+31)/32; michael@0: int16_t numOnes = countOnes(mask, columns); michael@0: // now, we know the exact space we need for index michael@0: if (numOnes > 0) { michael@0: result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); michael@0: michael@0: int32_t i, j; michael@0: int16_t k = 0; michael@0: for (j = 0 ; j < columns; j++) { michael@0: uint32_t v = mask[j]; michael@0: for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { michael@0: if ((v & 1) != 0) { michael@0: result->index[result->length++] = k; michael@0: } michael@0: v >>= 1; michael@0: } michael@0: } michael@0: } //otherwise, index will remain NULL (and will never be touched by michael@0: //the enumerator code anyway) michael@0: uprv_free(mask); michael@0: return en; michael@0: } michael@0: michael@0: /* check a string against the selector - UTF16 version */ michael@0: U_CAPI UEnumeration * U_EXPORT2 michael@0: ucnvsel_selectForString(const UConverterSelector* sel, michael@0: const UChar *s, int32_t length, UErrorCode *status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: // ensure args make sense! michael@0: if (sel == NULL || (s == NULL && length != 0)) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: int32_t columns = (sel->encodingsCount+31)/32; michael@0: uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); michael@0: if (mask == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(mask, ~0, columns *4); michael@0: michael@0: if(s!=NULL) { michael@0: const UChar *limit; michael@0: if (length >= 0) { michael@0: limit = s + length; michael@0: } else { michael@0: limit = NULL; michael@0: } michael@0: michael@0: while (limit == NULL ? *s != 0 : s != limit) { michael@0: UChar32 c; michael@0: uint16_t pvIndex; michael@0: UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); michael@0: if (intersectMasks(mask, sel->pv+pvIndex, columns)) { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: return selectForMask(sel, mask, status); michael@0: } michael@0: michael@0: /* check a string against the selector - UTF8 version */ michael@0: U_CAPI UEnumeration * U_EXPORT2 michael@0: ucnvsel_selectForUTF8(const UConverterSelector* sel, michael@0: const char *s, int32_t length, UErrorCode *status) { michael@0: // check if already failed michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: // ensure args make sense! michael@0: if (sel == NULL || (s == NULL && length != 0)) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: int32_t columns = (sel->encodingsCount+31)/32; michael@0: uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); michael@0: if (mask == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(mask, ~0, columns *4); michael@0: michael@0: if (length < 0) { michael@0: length = (int32_t)uprv_strlen(s); michael@0: } michael@0: michael@0: if(s!=NULL) { michael@0: const char *limit = s + length; michael@0: michael@0: while (s != limit) { michael@0: uint16_t pvIndex; michael@0: UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); michael@0: if (intersectMasks(mask, sel->pv+pvIndex, columns)) { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: return selectForMask(sel, mask, status); michael@0: } michael@0: michael@0: #endif // !UCONFIG_NO_CONVERSION