Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2008-2011, International Business Machines |
michael@0 | 5 | * Corporation, Google and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | */ |
michael@0 | 9 | // Author : eldawy@google.com (Mohamed Eldawy) |
michael@0 | 10 | // ucnvsel.cpp |
michael@0 | 11 | // |
michael@0 | 12 | // Purpose: To generate a list of encodings capable of handling |
michael@0 | 13 | // a given Unicode text |
michael@0 | 14 | // |
michael@0 | 15 | // Started 09-April-2008 |
michael@0 | 16 | |
michael@0 | 17 | /** |
michael@0 | 18 | * \file |
michael@0 | 19 | * |
michael@0 | 20 | * This is an implementation of an encoding selector. |
michael@0 | 21 | * The goal is, given a unicode string, find the encodings |
michael@0 | 22 | * this string can be mapped to. To make processing faster |
michael@0 | 23 | * a trie is built when you call ucnvsel_open() that |
michael@0 | 24 | * stores all encodings a codepoint can map to |
michael@0 | 25 | */ |
michael@0 | 26 | |
michael@0 | 27 | #include "unicode/ucnvsel.h" |
michael@0 | 28 | |
michael@0 | 29 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 30 | |
michael@0 | 31 | #include <string.h> |
michael@0 | 32 | |
michael@0 | 33 | #include "unicode/uchar.h" |
michael@0 | 34 | #include "unicode/uniset.h" |
michael@0 | 35 | #include "unicode/ucnv.h" |
michael@0 | 36 | #include "unicode/ustring.h" |
michael@0 | 37 | #include "unicode/uchriter.h" |
michael@0 | 38 | #include "utrie2.h" |
michael@0 | 39 | #include "propsvec.h" |
michael@0 | 40 | #include "uassert.h" |
michael@0 | 41 | #include "ucmndata.h" |
michael@0 | 42 | #include "uenumimp.h" |
michael@0 | 43 | #include "cmemory.h" |
michael@0 | 44 | #include "cstring.h" |
michael@0 | 45 | |
michael@0 | 46 | U_NAMESPACE_USE |
michael@0 | 47 | |
michael@0 | 48 | struct UConverterSelector { |
michael@0 | 49 | UTrie2 *trie; // 16 bit trie containing offsets into pv |
michael@0 | 50 | uint32_t* pv; // table of bits! |
michael@0 | 51 | int32_t pvCount; |
michael@0 | 52 | char** encodings; // which encodings did user ask to use? |
michael@0 | 53 | int32_t encodingsCount; |
michael@0 | 54 | int32_t encodingStrLength; |
michael@0 | 55 | uint8_t* swapped; |
michael@0 | 56 | UBool ownPv, ownEncodingStrings; |
michael@0 | 57 | }; |
michael@0 | 58 | |
michael@0 | 59 | static void generateSelectorData(UConverterSelector* result, |
michael@0 | 60 | UPropsVectors *upvec, |
michael@0 | 61 | const USet* excludedCodePoints, |
michael@0 | 62 | const UConverterUnicodeSet whichSet, |
michael@0 | 63 | UErrorCode* status) { |
michael@0 | 64 | if (U_FAILURE(*status)) { |
michael@0 | 65 | return; |
michael@0 | 66 | } |
michael@0 | 67 | |
michael@0 | 68 | int32_t columns = (result->encodingsCount+31)/32; |
michael@0 | 69 | |
michael@0 | 70 | // set errorValue to all-ones |
michael@0 | 71 | for (int32_t col = 0; col < columns; col++) { |
michael@0 | 72 | upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, |
michael@0 | 73 | col, ~0, ~0, status); |
michael@0 | 74 | } |
michael@0 | 75 | |
michael@0 | 76 | for (int32_t i = 0; i < result->encodingsCount; ++i) { |
michael@0 | 77 | uint32_t mask; |
michael@0 | 78 | uint32_t column; |
michael@0 | 79 | int32_t item_count; |
michael@0 | 80 | int32_t j; |
michael@0 | 81 | UConverter* test_converter = ucnv_open(result->encodings[i], status); |
michael@0 | 82 | if (U_FAILURE(*status)) { |
michael@0 | 83 | return; |
michael@0 | 84 | } |
michael@0 | 85 | USet* unicode_point_set; |
michael@0 | 86 | unicode_point_set = uset_open(1, 0); // empty set |
michael@0 | 87 | |
michael@0 | 88 | ucnv_getUnicodeSet(test_converter, unicode_point_set, |
michael@0 | 89 | whichSet, status); |
michael@0 | 90 | if (U_FAILURE(*status)) { |
michael@0 | 91 | ucnv_close(test_converter); |
michael@0 | 92 | return; |
michael@0 | 93 | } |
michael@0 | 94 | |
michael@0 | 95 | column = i / 32; |
michael@0 | 96 | mask = 1 << (i%32); |
michael@0 | 97 | // now iterate over intervals on set i! |
michael@0 | 98 | item_count = uset_getItemCount(unicode_point_set); |
michael@0 | 99 | |
michael@0 | 100 | for (j = 0; j < item_count; ++j) { |
michael@0 | 101 | UChar32 start_char; |
michael@0 | 102 | UChar32 end_char; |
michael@0 | 103 | UErrorCode smallStatus = U_ZERO_ERROR; |
michael@0 | 104 | uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, |
michael@0 | 105 | &smallStatus); |
michael@0 | 106 | if (U_FAILURE(smallStatus)) { |
michael@0 | 107 | // this will be reached for the converters that fill the set with |
michael@0 | 108 | // strings. Those should be ignored by our system |
michael@0 | 109 | } else { |
michael@0 | 110 | upvec_setValue(upvec, start_char, end_char, column, ~0, mask, |
michael@0 | 111 | status); |
michael@0 | 112 | } |
michael@0 | 113 | } |
michael@0 | 114 | ucnv_close(test_converter); |
michael@0 | 115 | uset_close(unicode_point_set); |
michael@0 | 116 | if (U_FAILURE(*status)) { |
michael@0 | 117 | return; |
michael@0 | 118 | } |
michael@0 | 119 | } |
michael@0 | 120 | |
michael@0 | 121 | // handle excluded encodings! Simply set their values to all 1's in the upvec |
michael@0 | 122 | if (excludedCodePoints) { |
michael@0 | 123 | int32_t item_count = uset_getItemCount(excludedCodePoints); |
michael@0 | 124 | for (int32_t j = 0; j < item_count; ++j) { |
michael@0 | 125 | UChar32 start_char; |
michael@0 | 126 | UChar32 end_char; |
michael@0 | 127 | |
michael@0 | 128 | uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, |
michael@0 | 129 | status); |
michael@0 | 130 | for (int32_t col = 0; col < columns; col++) { |
michael@0 | 131 | upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, |
michael@0 | 132 | status); |
michael@0 | 133 | } |
michael@0 | 134 | } |
michael@0 | 135 | } |
michael@0 | 136 | |
michael@0 | 137 | // alright. Now, let's put things in the same exact form you'd get when you |
michael@0 | 138 | // unserialize things. |
michael@0 | 139 | result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); |
michael@0 | 140 | result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); |
michael@0 | 141 | result->pvCount *= columns; // number of uint32_t = rows * columns |
michael@0 | 142 | result->ownPv = TRUE; |
michael@0 | 143 | } |
michael@0 | 144 | |
michael@0 | 145 | /* open a selector. If converterListSize is 0, build for all converters. |
michael@0 | 146 | If excludedCodePoints is NULL, don't exclude any codepoints */ |
michael@0 | 147 | U_CAPI UConverterSelector* U_EXPORT2 |
michael@0 | 148 | ucnvsel_open(const char* const* converterList, int32_t converterListSize, |
michael@0 | 149 | const USet* excludedCodePoints, |
michael@0 | 150 | const UConverterUnicodeSet whichSet, UErrorCode* status) { |
michael@0 | 151 | // check if already failed |
michael@0 | 152 | if (U_FAILURE(*status)) { |
michael@0 | 153 | return NULL; |
michael@0 | 154 | } |
michael@0 | 155 | // ensure args make sense! |
michael@0 | 156 | if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { |
michael@0 | 157 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 158 | return NULL; |
michael@0 | 159 | } |
michael@0 | 160 | |
michael@0 | 161 | // allocate a new converter |
michael@0 | 162 | LocalUConverterSelectorPointer newSelector( |
michael@0 | 163 | (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); |
michael@0 | 164 | if (newSelector.isNull()) { |
michael@0 | 165 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 166 | return NULL; |
michael@0 | 167 | } |
michael@0 | 168 | uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); |
michael@0 | 169 | |
michael@0 | 170 | if (converterListSize == 0) { |
michael@0 | 171 | converterList = NULL; |
michael@0 | 172 | converterListSize = ucnv_countAvailable(); |
michael@0 | 173 | } |
michael@0 | 174 | newSelector->encodings = |
michael@0 | 175 | (char**)uprv_malloc(converterListSize * sizeof(char*)); |
michael@0 | 176 | if (!newSelector->encodings) { |
michael@0 | 177 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 178 | return NULL; |
michael@0 | 179 | } |
michael@0 | 180 | newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() |
michael@0 | 181 | |
michael@0 | 182 | // make a backup copy of the list of converters |
michael@0 | 183 | int32_t totalSize = 0; |
michael@0 | 184 | int32_t i; |
michael@0 | 185 | for (i = 0; i < converterListSize; i++) { |
michael@0 | 186 | totalSize += |
michael@0 | 187 | (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; |
michael@0 | 188 | } |
michael@0 | 189 | // 4-align the totalSize to 4-align the size of the serialized form |
michael@0 | 190 | int32_t encodingStrPadding = totalSize & 3; |
michael@0 | 191 | if (encodingStrPadding != 0) { |
michael@0 | 192 | encodingStrPadding = 4 - encodingStrPadding; |
michael@0 | 193 | } |
michael@0 | 194 | newSelector->encodingStrLength = totalSize += encodingStrPadding; |
michael@0 | 195 | char* allStrings = (char*) uprv_malloc(totalSize); |
michael@0 | 196 | if (!allStrings) { |
michael@0 | 197 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 198 | return NULL; |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | for (i = 0; i < converterListSize; i++) { |
michael@0 | 202 | newSelector->encodings[i] = allStrings; |
michael@0 | 203 | uprv_strcpy(newSelector->encodings[i], |
michael@0 | 204 | converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); |
michael@0 | 205 | allStrings += uprv_strlen(newSelector->encodings[i]) + 1; |
michael@0 | 206 | } |
michael@0 | 207 | while (encodingStrPadding > 0) { |
michael@0 | 208 | *allStrings++ = 0; |
michael@0 | 209 | --encodingStrPadding; |
michael@0 | 210 | } |
michael@0 | 211 | |
michael@0 | 212 | newSelector->ownEncodingStrings = TRUE; |
michael@0 | 213 | newSelector->encodingsCount = converterListSize; |
michael@0 | 214 | UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); |
michael@0 | 215 | generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); |
michael@0 | 216 | upvec_close(upvec); |
michael@0 | 217 | |
michael@0 | 218 | if (U_FAILURE(*status)) { |
michael@0 | 219 | return NULL; |
michael@0 | 220 | } |
michael@0 | 221 | |
michael@0 | 222 | return newSelector.orphan(); |
michael@0 | 223 | } |
michael@0 | 224 | |
michael@0 | 225 | /* close opened selector */ |
michael@0 | 226 | U_CAPI void U_EXPORT2 |
michael@0 | 227 | ucnvsel_close(UConverterSelector *sel) { |
michael@0 | 228 | if (!sel) { |
michael@0 | 229 | return; |
michael@0 | 230 | } |
michael@0 | 231 | if (sel->ownEncodingStrings) { |
michael@0 | 232 | uprv_free(sel->encodings[0]); |
michael@0 | 233 | } |
michael@0 | 234 | uprv_free(sel->encodings); |
michael@0 | 235 | if (sel->ownPv) { |
michael@0 | 236 | uprv_free(sel->pv); |
michael@0 | 237 | } |
michael@0 | 238 | utrie2_close(sel->trie); |
michael@0 | 239 | uprv_free(sel->swapped); |
michael@0 | 240 | uprv_free(sel); |
michael@0 | 241 | } |
michael@0 | 242 | |
michael@0 | 243 | static const UDataInfo dataInfo = { |
michael@0 | 244 | sizeof(UDataInfo), |
michael@0 | 245 | 0, |
michael@0 | 246 | |
michael@0 | 247 | U_IS_BIG_ENDIAN, |
michael@0 | 248 | U_CHARSET_FAMILY, |
michael@0 | 249 | U_SIZEOF_UCHAR, |
michael@0 | 250 | 0, |
michael@0 | 251 | |
michael@0 | 252 | { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ |
michael@0 | 253 | { 1, 0, 0, 0 }, /* formatVersion */ |
michael@0 | 254 | { 0, 0, 0, 0 } /* dataVersion */ |
michael@0 | 255 | }; |
michael@0 | 256 | |
// Positions inside the int32_t indexes[] array of the serialized form.
enum {
  UCNVSEL_INDEX_TRIE_SIZE = 0,     // trie size in bytes
  UCNVSEL_INDEX_PV_COUNT = 1,      // number of uint32_t in the bit vectors
  UCNVSEL_INDEX_NAMES_COUNT = 2,   // number of encoding names
  UCNVSEL_INDEX_NAMES_LENGTH = 3,  // number of encoding name bytes including padding
  UCNVSEL_INDEX_SIZE = 15,         // bytes following the DataHeader
  UCNVSEL_INDEX_COUNT = 16         // number of slots in indexes[]
};
michael@0 | 265 | |
michael@0 | 266 | /* |
michael@0 | 267 | * Serialized form of a UConverterSelector, formatVersion 1: |
michael@0 | 268 | * |
michael@0 | 269 | * The serialized form begins with a standard ICU DataHeader with a UDataInfo |
michael@0 | 270 | * as the template above. |
michael@0 | 271 | * This is followed by: |
michael@0 | 272 | * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above |
michael@0 | 273 | * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes |
michael@0 | 274 | * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors |
michael@0 | 275 | * char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding |
michael@0 | 276 | */ |
michael@0 | 277 | |
michael@0 | 278 | /* serialize a selector */ |
michael@0 | 279 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 280 | ucnvsel_serialize(const UConverterSelector* sel, |
michael@0 | 281 | void* buffer, int32_t bufferCapacity, UErrorCode* status) { |
michael@0 | 282 | // check if already failed |
michael@0 | 283 | if (U_FAILURE(*status)) { |
michael@0 | 284 | return 0; |
michael@0 | 285 | } |
michael@0 | 286 | // ensure args make sense! |
michael@0 | 287 | uint8_t *p = (uint8_t *)buffer; |
michael@0 | 288 | if (bufferCapacity < 0 || |
michael@0 | 289 | (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) |
michael@0 | 290 | ) { |
michael@0 | 291 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 292 | return 0; |
michael@0 | 293 | } |
michael@0 | 294 | // add up the size of the serialized form |
michael@0 | 295 | int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); |
michael@0 | 296 | if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { |
michael@0 | 297 | return 0; |
michael@0 | 298 | } |
michael@0 | 299 | *status = U_ZERO_ERROR; |
michael@0 | 300 | |
michael@0 | 301 | DataHeader header; |
michael@0 | 302 | uprv_memset(&header, 0, sizeof(header)); |
michael@0 | 303 | header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); |
michael@0 | 304 | header.dataHeader.magic1 = 0xda; |
michael@0 | 305 | header.dataHeader.magic2 = 0x27; |
michael@0 | 306 | uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); |
michael@0 | 307 | |
michael@0 | 308 | int32_t indexes[UCNVSEL_INDEX_COUNT] = { |
michael@0 | 309 | serializedTrieSize, |
michael@0 | 310 | sel->pvCount, |
michael@0 | 311 | sel->encodingsCount, |
michael@0 | 312 | sel->encodingStrLength |
michael@0 | 313 | }; |
michael@0 | 314 | |
michael@0 | 315 | int32_t totalSize = |
michael@0 | 316 | header.dataHeader.headerSize + |
michael@0 | 317 | (int32_t)sizeof(indexes) + |
michael@0 | 318 | serializedTrieSize + |
michael@0 | 319 | sel->pvCount * 4 + |
michael@0 | 320 | sel->encodingStrLength; |
michael@0 | 321 | indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; |
michael@0 | 322 | if (totalSize > bufferCapacity) { |
michael@0 | 323 | *status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 324 | return totalSize; |
michael@0 | 325 | } |
michael@0 | 326 | // ok, save! |
michael@0 | 327 | int32_t length = header.dataHeader.headerSize; |
michael@0 | 328 | uprv_memcpy(p, &header, sizeof(header)); |
michael@0 | 329 | uprv_memset(p + sizeof(header), 0, length - sizeof(header)); |
michael@0 | 330 | p += length; |
michael@0 | 331 | |
michael@0 | 332 | length = (int32_t)sizeof(indexes); |
michael@0 | 333 | uprv_memcpy(p, indexes, length); |
michael@0 | 334 | p += length; |
michael@0 | 335 | |
michael@0 | 336 | utrie2_serialize(sel->trie, p, serializedTrieSize, status); |
michael@0 | 337 | p += serializedTrieSize; |
michael@0 | 338 | |
michael@0 | 339 | length = sel->pvCount * 4; |
michael@0 | 340 | uprv_memcpy(p, sel->pv, length); |
michael@0 | 341 | p += length; |
michael@0 | 342 | |
michael@0 | 343 | uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); |
michael@0 | 344 | p += sel->encodingStrLength; |
michael@0 | 345 | |
michael@0 | 346 | return totalSize; |
michael@0 | 347 | } |
michael@0 | 348 | |
michael@0 | 349 | /** |
michael@0 | 350 | * swap a selector into the desired Endianness and Asciiness of |
michael@0 | 351 | * the system. Just as FYI, selectors are always saved in the format |
michael@0 | 352 | * of the system that created them. They are only converted if used |
michael@0 | 353 | * on another system. In other words, selectors created on different |
michael@0 | 354 | * system can be different even if the params are identical (endianness |
michael@0 | 355 | * and Asciiness differences only) |
michael@0 | 356 | * |
michael@0 | 357 | * @param ds pointer to data swapper containing swapping info |
michael@0 | 358 | * @param inData pointer to incoming data |
michael@0 | 359 | * @param length length of inData in bytes |
michael@0 | 360 | * @param outData pointer to output data. Capacity should |
michael@0 | 361 | * be at least equal to capacity of inData |
michael@0 | 362 | * @param status an in/out ICU UErrorCode |
michael@0 | 363 | * @return 0 on failure, number of bytes swapped on success |
michael@0 | 364 | * number of bytes swapped can be smaller than length |
michael@0 | 365 | */ |
michael@0 | 366 | static int32_t |
michael@0 | 367 | ucnvsel_swap(const UDataSwapper *ds, |
michael@0 | 368 | const void *inData, int32_t length, |
michael@0 | 369 | void *outData, UErrorCode *status) { |
michael@0 | 370 | /* udata_swapDataHeader checks the arguments */ |
michael@0 | 371 | int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); |
michael@0 | 372 | if(U_FAILURE(*status)) { |
michael@0 | 373 | return 0; |
michael@0 | 374 | } |
michael@0 | 375 | |
michael@0 | 376 | /* check data format and format version */ |
michael@0 | 377 | const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); |
michael@0 | 378 | if(!( |
michael@0 | 379 | pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ |
michael@0 | 380 | pInfo->dataFormat[1] == 0x53 && |
michael@0 | 381 | pInfo->dataFormat[2] == 0x65 && |
michael@0 | 382 | pInfo->dataFormat[3] == 0x6c |
michael@0 | 383 | )) { |
michael@0 | 384 | udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", |
michael@0 | 385 | pInfo->dataFormat[0], pInfo->dataFormat[1], |
michael@0 | 386 | pInfo->dataFormat[2], pInfo->dataFormat[3]); |
michael@0 | 387 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 388 | return 0; |
michael@0 | 389 | } |
michael@0 | 390 | if(pInfo->formatVersion[0] != 1) { |
michael@0 | 391 | udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", |
michael@0 | 392 | pInfo->formatVersion[0]); |
michael@0 | 393 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 394 | return 0; |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | if(length >= 0) { |
michael@0 | 398 | length -= headerSize; |
michael@0 | 399 | if(length < 16*4) { |
michael@0 | 400 | udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", |
michael@0 | 401 | length); |
michael@0 | 402 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 403 | return 0; |
michael@0 | 404 | } |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | const uint8_t *inBytes = (const uint8_t *)inData + headerSize; |
michael@0 | 408 | uint8_t *outBytes = (uint8_t *)outData + headerSize; |
michael@0 | 409 | |
michael@0 | 410 | /* read the indexes */ |
michael@0 | 411 | const int32_t *inIndexes = (const int32_t *)inBytes; |
michael@0 | 412 | int32_t indexes[16]; |
michael@0 | 413 | int32_t i; |
michael@0 | 414 | for(i = 0; i < 16; ++i) { |
michael@0 | 415 | indexes[i] = udata_readInt32(ds, inIndexes[i]); |
michael@0 | 416 | } |
michael@0 | 417 | |
michael@0 | 418 | /* get the total length of the data */ |
michael@0 | 419 | int32_t size = indexes[UCNVSEL_INDEX_SIZE]; |
michael@0 | 420 | if(length >= 0) { |
michael@0 | 421 | if(length < size) { |
michael@0 | 422 | udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", |
michael@0 | 423 | length); |
michael@0 | 424 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 425 | return 0; |
michael@0 | 426 | } |
michael@0 | 427 | |
michael@0 | 428 | /* copy the data for inaccessible bytes */ |
michael@0 | 429 | if(inBytes != outBytes) { |
michael@0 | 430 | uprv_memcpy(outBytes, inBytes, size); |
michael@0 | 431 | } |
michael@0 | 432 | |
michael@0 | 433 | int32_t offset = 0, count; |
michael@0 | 434 | |
michael@0 | 435 | /* swap the int32_t indexes[] */ |
michael@0 | 436 | count = UCNVSEL_INDEX_COUNT*4; |
michael@0 | 437 | ds->swapArray32(ds, inBytes, count, outBytes, status); |
michael@0 | 438 | offset += count; |
michael@0 | 439 | |
michael@0 | 440 | /* swap the UTrie2 */ |
michael@0 | 441 | count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; |
michael@0 | 442 | utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); |
michael@0 | 443 | offset += count; |
michael@0 | 444 | |
michael@0 | 445 | /* swap the uint32_t pv[] */ |
michael@0 | 446 | count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; |
michael@0 | 447 | ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); |
michael@0 | 448 | offset += count; |
michael@0 | 449 | |
michael@0 | 450 | /* swap the encoding names */ |
michael@0 | 451 | count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; |
michael@0 | 452 | ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); |
michael@0 | 453 | offset += count; |
michael@0 | 454 | |
michael@0 | 455 | U_ASSERT(offset == size); |
michael@0 | 456 | } |
michael@0 | 457 | |
michael@0 | 458 | return headerSize + size; |
michael@0 | 459 | } |
michael@0 | 460 | |
michael@0 | 461 | /* unserialize a selector */ |
michael@0 | 462 | U_CAPI UConverterSelector* U_EXPORT2 |
michael@0 | 463 | ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { |
michael@0 | 464 | // check if already failed |
michael@0 | 465 | if (U_FAILURE(*status)) { |
michael@0 | 466 | return NULL; |
michael@0 | 467 | } |
michael@0 | 468 | // ensure args make sense! |
michael@0 | 469 | const uint8_t *p = (const uint8_t *)buffer; |
michael@0 | 470 | if (length <= 0 || |
michael@0 | 471 | (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) |
michael@0 | 472 | ) { |
michael@0 | 473 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 474 | return NULL; |
michael@0 | 475 | } |
michael@0 | 476 | // header |
michael@0 | 477 | if (length < 32) { |
michael@0 | 478 | // not even enough space for a minimal header |
michael@0 | 479 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 480 | return NULL; |
michael@0 | 481 | } |
michael@0 | 482 | const DataHeader *pHeader = (const DataHeader *)p; |
michael@0 | 483 | if (!( |
michael@0 | 484 | pHeader->dataHeader.magic1==0xda && |
michael@0 | 485 | pHeader->dataHeader.magic2==0x27 && |
michael@0 | 486 | pHeader->info.dataFormat[0] == 0x43 && |
michael@0 | 487 | pHeader->info.dataFormat[1] == 0x53 && |
michael@0 | 488 | pHeader->info.dataFormat[2] == 0x65 && |
michael@0 | 489 | pHeader->info.dataFormat[3] == 0x6c |
michael@0 | 490 | )) { |
michael@0 | 491 | /* header not valid or dataFormat not recognized */ |
michael@0 | 492 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 493 | return NULL; |
michael@0 | 494 | } |
michael@0 | 495 | if (pHeader->info.formatVersion[0] != 1) { |
michael@0 | 496 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 497 | return NULL; |
michael@0 | 498 | } |
michael@0 | 499 | uint8_t* swapped = NULL; |
michael@0 | 500 | if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || |
michael@0 | 501 | pHeader->info.charsetFamily != U_CHARSET_FAMILY |
michael@0 | 502 | ) { |
michael@0 | 503 | // swap the data |
michael@0 | 504 | UDataSwapper *ds = |
michael@0 | 505 | udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); |
michael@0 | 506 | int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); |
michael@0 | 507 | if (U_FAILURE(*status)) { |
michael@0 | 508 | udata_closeSwapper(ds); |
michael@0 | 509 | return NULL; |
michael@0 | 510 | } |
michael@0 | 511 | if (length < totalSize) { |
michael@0 | 512 | udata_closeSwapper(ds); |
michael@0 | 513 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 514 | return NULL; |
michael@0 | 515 | } |
michael@0 | 516 | swapped = (uint8_t*)uprv_malloc(totalSize); |
michael@0 | 517 | if (swapped == NULL) { |
michael@0 | 518 | udata_closeSwapper(ds); |
michael@0 | 519 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 520 | return NULL; |
michael@0 | 521 | } |
michael@0 | 522 | ucnvsel_swap(ds, p, length, swapped, status); |
michael@0 | 523 | udata_closeSwapper(ds); |
michael@0 | 524 | if (U_FAILURE(*status)) { |
michael@0 | 525 | uprv_free(swapped); |
michael@0 | 526 | return NULL; |
michael@0 | 527 | } |
michael@0 | 528 | p = swapped; |
michael@0 | 529 | pHeader = (const DataHeader *)p; |
michael@0 | 530 | } |
michael@0 | 531 | if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { |
michael@0 | 532 | // not even enough space for the header and the indexes |
michael@0 | 533 | uprv_free(swapped); |
michael@0 | 534 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 535 | return NULL; |
michael@0 | 536 | } |
michael@0 | 537 | p += pHeader->dataHeader.headerSize; |
michael@0 | 538 | length -= pHeader->dataHeader.headerSize; |
michael@0 | 539 | // indexes |
michael@0 | 540 | const int32_t *indexes = (const int32_t *)p; |
michael@0 | 541 | if (length < indexes[UCNVSEL_INDEX_SIZE]) { |
michael@0 | 542 | uprv_free(swapped); |
michael@0 | 543 | *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 544 | return NULL; |
michael@0 | 545 | } |
michael@0 | 546 | p += UCNVSEL_INDEX_COUNT * 4; |
michael@0 | 547 | // create and populate the selector object |
michael@0 | 548 | UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); |
michael@0 | 549 | char **encodings = |
michael@0 | 550 | (char **)uprv_malloc( |
michael@0 | 551 | indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); |
michael@0 | 552 | if (sel == NULL || encodings == NULL) { |
michael@0 | 553 | uprv_free(swapped); |
michael@0 | 554 | uprv_free(sel); |
michael@0 | 555 | uprv_free(encodings); |
michael@0 | 556 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 557 | return NULL; |
michael@0 | 558 | } |
michael@0 | 559 | uprv_memset(sel, 0, sizeof(UConverterSelector)); |
michael@0 | 560 | sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; |
michael@0 | 561 | sel->encodings = encodings; |
michael@0 | 562 | sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; |
michael@0 | 563 | sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; |
michael@0 | 564 | sel->swapped = swapped; |
michael@0 | 565 | // trie |
michael@0 | 566 | sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
michael@0 | 567 | p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, |
michael@0 | 568 | status); |
michael@0 | 569 | p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; |
michael@0 | 570 | if (U_FAILURE(*status)) { |
michael@0 | 571 | ucnvsel_close(sel); |
michael@0 | 572 | return NULL; |
michael@0 | 573 | } |
michael@0 | 574 | // bit vectors |
michael@0 | 575 | sel->pv = (uint32_t *)p; |
michael@0 | 576 | p += sel->pvCount * 4; |
michael@0 | 577 | // encoding names |
michael@0 | 578 | char* s = (char*)p; |
michael@0 | 579 | for (int32_t i = 0; i < sel->encodingsCount; ++i) { |
michael@0 | 580 | sel->encodings[i] = s; |
michael@0 | 581 | s += uprv_strlen(s) + 1; |
michael@0 | 582 | } |
michael@0 | 583 | p += sel->encodingStrLength; |
michael@0 | 584 | |
michael@0 | 585 | return sel; |
michael@0 | 586 | } |
michael@0 | 587 | |
michael@0 | 588 | // a bunch of functions for the enumeration thingie! Nothing fancy here. Just |
michael@0 | 589 | // iterate over the selected encodings |
michael@0 | 590 | struct Enumerator { |
michael@0 | 591 | int16_t* index; |
michael@0 | 592 | int16_t length; |
michael@0 | 593 | int16_t cur; |
michael@0 | 594 | const UConverterSelector* sel; |
michael@0 | 595 | }; |
michael@0 | 596 | |
michael@0 | 597 | U_CDECL_BEGIN |
michael@0 | 598 | |
michael@0 | 599 | static void U_CALLCONV |
michael@0 | 600 | ucnvsel_close_selector_iterator(UEnumeration *enumerator) { |
michael@0 | 601 | uprv_free(((Enumerator*)(enumerator->context))->index); |
michael@0 | 602 | uprv_free(enumerator->context); |
michael@0 | 603 | uprv_free(enumerator); |
michael@0 | 604 | } |
michael@0 | 605 | |
michael@0 | 606 | |
michael@0 | 607 | static int32_t U_CALLCONV |
michael@0 | 608 | ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { |
michael@0 | 609 | // check if already failed |
michael@0 | 610 | if (U_FAILURE(*status)) { |
michael@0 | 611 | return 0; |
michael@0 | 612 | } |
michael@0 | 613 | return ((Enumerator*)(enumerator->context))->length; |
michael@0 | 614 | } |
michael@0 | 615 | |
michael@0 | 616 | |
michael@0 | 617 | static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, |
michael@0 | 618 | int32_t* resultLength, |
michael@0 | 619 | UErrorCode* status) { |
michael@0 | 620 | // check if already failed |
michael@0 | 621 | if (U_FAILURE(*status)) { |
michael@0 | 622 | return NULL; |
michael@0 | 623 | } |
michael@0 | 624 | |
michael@0 | 625 | int16_t cur = ((Enumerator*)(enumerator->context))->cur; |
michael@0 | 626 | const UConverterSelector* sel; |
michael@0 | 627 | const char* result; |
michael@0 | 628 | if (cur >= ((Enumerator*)(enumerator->context))->length) { |
michael@0 | 629 | return NULL; |
michael@0 | 630 | } |
michael@0 | 631 | sel = ((Enumerator*)(enumerator->context))->sel; |
michael@0 | 632 | result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; |
michael@0 | 633 | ((Enumerator*)(enumerator->context))->cur++; |
michael@0 | 634 | if (resultLength) { |
michael@0 | 635 | *resultLength = (int32_t)uprv_strlen(result); |
michael@0 | 636 | } |
michael@0 | 637 | return result; |
michael@0 | 638 | } |
michael@0 | 639 | |
michael@0 | 640 | static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, |
michael@0 | 641 | UErrorCode* status) { |
michael@0 | 642 | // check if already failed |
michael@0 | 643 | if (U_FAILURE(*status)) { |
michael@0 | 644 | return ; |
michael@0 | 645 | } |
michael@0 | 646 | ((Enumerator*)(enumerator->context))->cur = 0; |
michael@0 | 647 | } |
michael@0 | 648 | |
michael@0 | 649 | U_CDECL_END |
michael@0 | 650 | |
michael@0 | 651 | |
michael@0 | 652 | static const UEnumeration defaultEncodings = { |
michael@0 | 653 | NULL, |
michael@0 | 654 | NULL, |
michael@0 | 655 | ucnvsel_close_selector_iterator, |
michael@0 | 656 | ucnvsel_count_encodings, |
michael@0 | 657 | uenum_unextDefault, |
michael@0 | 658 | ucnvsel_next_encoding, |
michael@0 | 659 | ucnvsel_reset_iterator |
michael@0 | 660 | }; |
michael@0 | 661 | |
michael@0 | 662 | |
michael@0 | 663 | // internal fn to intersect two sets of masks |
michael@0 | 664 | // returns whether the mask has reduced to all zeros |
michael@0 | 665 | static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { |
michael@0 | 666 | int32_t i; |
michael@0 | 667 | uint32_t oredDest = 0; |
michael@0 | 668 | for (i = 0 ; i < len ; ++i) { |
michael@0 | 669 | oredDest |= (dest[i] &= source1[i]); |
michael@0 | 670 | } |
michael@0 | 671 | return oredDest == 0; |
michael@0 | 672 | } |
michael@0 | 673 | |
// Internal helper: counts the 1 bits in the first len words of mask.
// Bit-clearing trick from http://graphics.stanford.edu/~seander/bithacks.html
static int16_t countOnes(uint32_t* mask, int32_t len) {
  int32_t ones = 0;
  for (int32_t word = 0; word < len; ++word) {
    uint32_t bits = mask[word];
    while (bits != 0) {
      bits &= bits - 1;  // drop the lowest set bit
      ++ones;
    }
  }
  return (int16_t)ones;
}
michael@0 | 687 | |
michael@0 | 688 | |
michael@0 | 689 | /* internal function! */ |
michael@0 | 690 | static UEnumeration *selectForMask(const UConverterSelector* sel, |
michael@0 | 691 | uint32_t *mask, UErrorCode *status) { |
michael@0 | 692 | // this is the context we will use. Store a table of indices to which |
michael@0 | 693 | // encodings are legit. |
michael@0 | 694 | struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); |
michael@0 | 695 | if (result == NULL) { |
michael@0 | 696 | uprv_free(mask); |
michael@0 | 697 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 698 | return NULL; |
michael@0 | 699 | } |
michael@0 | 700 | result->index = NULL; // this will be allocated later! |
michael@0 | 701 | result->length = result->cur = 0; |
michael@0 | 702 | result->sel = sel; |
michael@0 | 703 | |
michael@0 | 704 | UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); |
michael@0 | 705 | if (en == NULL) { |
michael@0 | 706 | // TODO(markus): Combine Enumerator and UEnumeration into one struct. |
michael@0 | 707 | uprv_free(mask); |
michael@0 | 708 | uprv_free(result); |
michael@0 | 709 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 710 | return NULL; |
michael@0 | 711 | } |
michael@0 | 712 | memcpy(en, &defaultEncodings, sizeof(UEnumeration)); |
michael@0 | 713 | en->context = result; |
michael@0 | 714 | |
michael@0 | 715 | int32_t columns = (sel->encodingsCount+31)/32; |
michael@0 | 716 | int16_t numOnes = countOnes(mask, columns); |
michael@0 | 717 | // now, we know the exact space we need for index |
michael@0 | 718 | if (numOnes > 0) { |
michael@0 | 719 | result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); |
michael@0 | 720 | |
michael@0 | 721 | int32_t i, j; |
michael@0 | 722 | int16_t k = 0; |
michael@0 | 723 | for (j = 0 ; j < columns; j++) { |
michael@0 | 724 | uint32_t v = mask[j]; |
michael@0 | 725 | for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { |
michael@0 | 726 | if ((v & 1) != 0) { |
michael@0 | 727 | result->index[result->length++] = k; |
michael@0 | 728 | } |
michael@0 | 729 | v >>= 1; |
michael@0 | 730 | } |
michael@0 | 731 | } |
michael@0 | 732 | } //otherwise, index will remain NULL (and will never be touched by |
michael@0 | 733 | //the enumerator code anyway) |
michael@0 | 734 | uprv_free(mask); |
michael@0 | 735 | return en; |
michael@0 | 736 | } |
michael@0 | 737 | |
michael@0 | 738 | /* check a string against the selector - UTF16 version */ |
michael@0 | 739 | U_CAPI UEnumeration * U_EXPORT2 |
michael@0 | 740 | ucnvsel_selectForString(const UConverterSelector* sel, |
michael@0 | 741 | const UChar *s, int32_t length, UErrorCode *status) { |
michael@0 | 742 | // check if already failed |
michael@0 | 743 | if (U_FAILURE(*status)) { |
michael@0 | 744 | return NULL; |
michael@0 | 745 | } |
michael@0 | 746 | // ensure args make sense! |
michael@0 | 747 | if (sel == NULL || (s == NULL && length != 0)) { |
michael@0 | 748 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 749 | return NULL; |
michael@0 | 750 | } |
michael@0 | 751 | |
michael@0 | 752 | int32_t columns = (sel->encodingsCount+31)/32; |
michael@0 | 753 | uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); |
michael@0 | 754 | if (mask == NULL) { |
michael@0 | 755 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 756 | return NULL; |
michael@0 | 757 | } |
michael@0 | 758 | uprv_memset(mask, ~0, columns *4); |
michael@0 | 759 | |
michael@0 | 760 | if(s!=NULL) { |
michael@0 | 761 | const UChar *limit; |
michael@0 | 762 | if (length >= 0) { |
michael@0 | 763 | limit = s + length; |
michael@0 | 764 | } else { |
michael@0 | 765 | limit = NULL; |
michael@0 | 766 | } |
michael@0 | 767 | |
michael@0 | 768 | while (limit == NULL ? *s != 0 : s != limit) { |
michael@0 | 769 | UChar32 c; |
michael@0 | 770 | uint16_t pvIndex; |
michael@0 | 771 | UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); |
michael@0 | 772 | if (intersectMasks(mask, sel->pv+pvIndex, columns)) { |
michael@0 | 773 | break; |
michael@0 | 774 | } |
michael@0 | 775 | } |
michael@0 | 776 | } |
michael@0 | 777 | return selectForMask(sel, mask, status); |
michael@0 | 778 | } |
michael@0 | 779 | |
michael@0 | 780 | /* check a string against the selector - UTF8 version */ |
michael@0 | 781 | U_CAPI UEnumeration * U_EXPORT2 |
michael@0 | 782 | ucnvsel_selectForUTF8(const UConverterSelector* sel, |
michael@0 | 783 | const char *s, int32_t length, UErrorCode *status) { |
michael@0 | 784 | // check if already failed |
michael@0 | 785 | if (U_FAILURE(*status)) { |
michael@0 | 786 | return NULL; |
michael@0 | 787 | } |
michael@0 | 788 | // ensure args make sense! |
michael@0 | 789 | if (sel == NULL || (s == NULL && length != 0)) { |
michael@0 | 790 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 791 | return NULL; |
michael@0 | 792 | } |
michael@0 | 793 | |
michael@0 | 794 | int32_t columns = (sel->encodingsCount+31)/32; |
michael@0 | 795 | uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); |
michael@0 | 796 | if (mask == NULL) { |
michael@0 | 797 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 798 | return NULL; |
michael@0 | 799 | } |
michael@0 | 800 | uprv_memset(mask, ~0, columns *4); |
michael@0 | 801 | |
michael@0 | 802 | if (length < 0) { |
michael@0 | 803 | length = (int32_t)uprv_strlen(s); |
michael@0 | 804 | } |
michael@0 | 805 | |
michael@0 | 806 | if(s!=NULL) { |
michael@0 | 807 | const char *limit = s + length; |
michael@0 | 808 | |
michael@0 | 809 | while (s != limit) { |
michael@0 | 810 | uint16_t pvIndex; |
michael@0 | 811 | UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); |
michael@0 | 812 | if (intersectMasks(mask, sel->pv+pvIndex, columns)) { |
michael@0 | 813 | break; |
michael@0 | 814 | } |
michael@0 | 815 | } |
michael@0 | 816 | } |
michael@0 | 817 | return selectForMask(sel, mask, status); |
michael@0 | 818 | } |
michael@0 | 819 | |
michael@0 | 820 | #endif // !UCONFIG_NO_CONVERSION |