|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2008-2011, International Business Machines |
|
5 * Corporation, Google and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 */ |
|
9 // Author : eldawy@google.com (Mohamed Eldawy) |
|
10 // ucnvsel.cpp |
|
11 // |
|
12 // Purpose: To generate a list of encodings capable of handling |
|
13 // a given Unicode text |
|
14 // |
|
15 // Started 09-April-2008 |
|
16 |
|
17 /** |
|
18 * \file |
|
19 * |
|
20 * This is an implementation of an encoding selector. |
|
21 * The goal is, given a unicode string, find the encodings |
|
22 * this string can be mapped to. To make processing faster |
|
23 * a trie is built when you call ucnvsel_open() that |
|
24 * stores all encodings a codepoint can map to |
|
25 */ |
|
26 |
|
27 #include "unicode/ucnvsel.h" |
|
28 |
|
29 #if !UCONFIG_NO_CONVERSION |
|
30 |
|
31 #include <string.h> |
|
32 |
|
33 #include "unicode/uchar.h" |
|
34 #include "unicode/uniset.h" |
|
35 #include "unicode/ucnv.h" |
|
36 #include "unicode/ustring.h" |
|
37 #include "unicode/uchriter.h" |
|
38 #include "utrie2.h" |
|
39 #include "propsvec.h" |
|
40 #include "uassert.h" |
|
41 #include "ucmndata.h" |
|
42 #include "uenumimp.h" |
|
43 #include "cmemory.h" |
|
44 #include "cstring.h" |
|
45 |
|
46 U_NAMESPACE_USE |
|
47 |
|
48 struct UConverterSelector { |
|
49 UTrie2 *trie; // 16 bit trie containing offsets into pv |
|
50 uint32_t* pv; // table of bits! |
|
51 int32_t pvCount; |
|
52 char** encodings; // which encodings did user ask to use? |
|
53 int32_t encodingsCount; |
|
54 int32_t encodingStrLength; |
|
55 uint8_t* swapped; |
|
56 UBool ownPv, ownEncodingStrings; |
|
57 }; |
|
58 |
|
59 static void generateSelectorData(UConverterSelector* result, |
|
60 UPropsVectors *upvec, |
|
61 const USet* excludedCodePoints, |
|
62 const UConverterUnicodeSet whichSet, |
|
63 UErrorCode* status) { |
|
64 if (U_FAILURE(*status)) { |
|
65 return; |
|
66 } |
|
67 |
|
68 int32_t columns = (result->encodingsCount+31)/32; |
|
69 |
|
70 // set errorValue to all-ones |
|
71 for (int32_t col = 0; col < columns; col++) { |
|
72 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, |
|
73 col, ~0, ~0, status); |
|
74 } |
|
75 |
|
76 for (int32_t i = 0; i < result->encodingsCount; ++i) { |
|
77 uint32_t mask; |
|
78 uint32_t column; |
|
79 int32_t item_count; |
|
80 int32_t j; |
|
81 UConverter* test_converter = ucnv_open(result->encodings[i], status); |
|
82 if (U_FAILURE(*status)) { |
|
83 return; |
|
84 } |
|
85 USet* unicode_point_set; |
|
86 unicode_point_set = uset_open(1, 0); // empty set |
|
87 |
|
88 ucnv_getUnicodeSet(test_converter, unicode_point_set, |
|
89 whichSet, status); |
|
90 if (U_FAILURE(*status)) { |
|
91 ucnv_close(test_converter); |
|
92 return; |
|
93 } |
|
94 |
|
95 column = i / 32; |
|
96 mask = 1 << (i%32); |
|
97 // now iterate over intervals on set i! |
|
98 item_count = uset_getItemCount(unicode_point_set); |
|
99 |
|
100 for (j = 0; j < item_count; ++j) { |
|
101 UChar32 start_char; |
|
102 UChar32 end_char; |
|
103 UErrorCode smallStatus = U_ZERO_ERROR; |
|
104 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, |
|
105 &smallStatus); |
|
106 if (U_FAILURE(smallStatus)) { |
|
107 // this will be reached for the converters that fill the set with |
|
108 // strings. Those should be ignored by our system |
|
109 } else { |
|
110 upvec_setValue(upvec, start_char, end_char, column, ~0, mask, |
|
111 status); |
|
112 } |
|
113 } |
|
114 ucnv_close(test_converter); |
|
115 uset_close(unicode_point_set); |
|
116 if (U_FAILURE(*status)) { |
|
117 return; |
|
118 } |
|
119 } |
|
120 |
|
121 // handle excluded encodings! Simply set their values to all 1's in the upvec |
|
122 if (excludedCodePoints) { |
|
123 int32_t item_count = uset_getItemCount(excludedCodePoints); |
|
124 for (int32_t j = 0; j < item_count; ++j) { |
|
125 UChar32 start_char; |
|
126 UChar32 end_char; |
|
127 |
|
128 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, |
|
129 status); |
|
130 for (int32_t col = 0; col < columns; col++) { |
|
131 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, |
|
132 status); |
|
133 } |
|
134 } |
|
135 } |
|
136 |
|
137 // alright. Now, let's put things in the same exact form you'd get when you |
|
138 // unserialize things. |
|
139 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); |
|
140 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); |
|
141 result->pvCount *= columns; // number of uint32_t = rows * columns |
|
142 result->ownPv = TRUE; |
|
143 } |
|
144 |
|
145 /* open a selector. If converterListSize is 0, build for all converters. |
|
146 If excludedCodePoints is NULL, don't exclude any codepoints */ |
|
147 U_CAPI UConverterSelector* U_EXPORT2 |
|
148 ucnvsel_open(const char* const* converterList, int32_t converterListSize, |
|
149 const USet* excludedCodePoints, |
|
150 const UConverterUnicodeSet whichSet, UErrorCode* status) { |
|
151 // check if already failed |
|
152 if (U_FAILURE(*status)) { |
|
153 return NULL; |
|
154 } |
|
155 // ensure args make sense! |
|
156 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { |
|
157 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
158 return NULL; |
|
159 } |
|
160 |
|
161 // allocate a new converter |
|
162 LocalUConverterSelectorPointer newSelector( |
|
163 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); |
|
164 if (newSelector.isNull()) { |
|
165 *status = U_MEMORY_ALLOCATION_ERROR; |
|
166 return NULL; |
|
167 } |
|
168 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); |
|
169 |
|
170 if (converterListSize == 0) { |
|
171 converterList = NULL; |
|
172 converterListSize = ucnv_countAvailable(); |
|
173 } |
|
174 newSelector->encodings = |
|
175 (char**)uprv_malloc(converterListSize * sizeof(char*)); |
|
176 if (!newSelector->encodings) { |
|
177 *status = U_MEMORY_ALLOCATION_ERROR; |
|
178 return NULL; |
|
179 } |
|
180 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() |
|
181 |
|
182 // make a backup copy of the list of converters |
|
183 int32_t totalSize = 0; |
|
184 int32_t i; |
|
185 for (i = 0; i < converterListSize; i++) { |
|
186 totalSize += |
|
187 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; |
|
188 } |
|
189 // 4-align the totalSize to 4-align the size of the serialized form |
|
190 int32_t encodingStrPadding = totalSize & 3; |
|
191 if (encodingStrPadding != 0) { |
|
192 encodingStrPadding = 4 - encodingStrPadding; |
|
193 } |
|
194 newSelector->encodingStrLength = totalSize += encodingStrPadding; |
|
195 char* allStrings = (char*) uprv_malloc(totalSize); |
|
196 if (!allStrings) { |
|
197 *status = U_MEMORY_ALLOCATION_ERROR; |
|
198 return NULL; |
|
199 } |
|
200 |
|
201 for (i = 0; i < converterListSize; i++) { |
|
202 newSelector->encodings[i] = allStrings; |
|
203 uprv_strcpy(newSelector->encodings[i], |
|
204 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); |
|
205 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; |
|
206 } |
|
207 while (encodingStrPadding > 0) { |
|
208 *allStrings++ = 0; |
|
209 --encodingStrPadding; |
|
210 } |
|
211 |
|
212 newSelector->ownEncodingStrings = TRUE; |
|
213 newSelector->encodingsCount = converterListSize; |
|
214 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); |
|
215 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); |
|
216 upvec_close(upvec); |
|
217 |
|
218 if (U_FAILURE(*status)) { |
|
219 return NULL; |
|
220 } |
|
221 |
|
222 return newSelector.orphan(); |
|
223 } |
|
224 |
|
225 /* close opened selector */ |
|
226 U_CAPI void U_EXPORT2 |
|
227 ucnvsel_close(UConverterSelector *sel) { |
|
228 if (!sel) { |
|
229 return; |
|
230 } |
|
231 if (sel->ownEncodingStrings) { |
|
232 uprv_free(sel->encodings[0]); |
|
233 } |
|
234 uprv_free(sel->encodings); |
|
235 if (sel->ownPv) { |
|
236 uprv_free(sel->pv); |
|
237 } |
|
238 utrie2_close(sel->trie); |
|
239 uprv_free(sel->swapped); |
|
240 uprv_free(sel); |
|
241 } |
|
242 |
|
243 static const UDataInfo dataInfo = { |
|
244 sizeof(UDataInfo), |
|
245 0, |
|
246 |
|
247 U_IS_BIG_ENDIAN, |
|
248 U_CHARSET_FAMILY, |
|
249 U_SIZEOF_UCHAR, |
|
250 0, |
|
251 |
|
252 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ |
|
253 { 1, 0, 0, 0 }, /* formatVersion */ |
|
254 { 0, 0, 0, 0 } /* dataVersion */ |
|
255 }; |
|
256 |
|
/**
 * Positions of the entries in the int32_t indexes[] array that follows the
 * DataHeader of a serialized selector. Slots 4..14 are not assigned here.
 */
enum {
  UCNVSEL_INDEX_TRIE_SIZE = 0,     // trie size in bytes
  UCNVSEL_INDEX_PV_COUNT = 1,      // number of uint32_t in the bit vectors
  UCNVSEL_INDEX_NAMES_COUNT = 2,   // number of encoding names
  UCNVSEL_INDEX_NAMES_LENGTH = 3,  // number of encoding name bytes including padding
  UCNVSEL_INDEX_SIZE = 15,         // bytes following the DataHeader
  UCNVSEL_INDEX_COUNT = 16         // number of slots in indexes[]
};
|
265 |
|
/*
 * Serialized form of a UConverterSelector, formatVersion 1:
 *
 * The serialized form begins with a standard ICU DataHeader with a UDataInfo
 * as the template above.
 * This is followed by:
 *   int32_t indexes[UCNVSEL_INDEX_COUNT];          // see index entry constants above
 *   serialized UTrie2;                             // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
 *   uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]];  // bit vectors
 *   char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]];  // NUL-terminated strings + padding
 */
|
277 |
|
278 /* serialize a selector */ |
|
279 U_CAPI int32_t U_EXPORT2 |
|
280 ucnvsel_serialize(const UConverterSelector* sel, |
|
281 void* buffer, int32_t bufferCapacity, UErrorCode* status) { |
|
282 // check if already failed |
|
283 if (U_FAILURE(*status)) { |
|
284 return 0; |
|
285 } |
|
286 // ensure args make sense! |
|
287 uint8_t *p = (uint8_t *)buffer; |
|
288 if (bufferCapacity < 0 || |
|
289 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) |
|
290 ) { |
|
291 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
292 return 0; |
|
293 } |
|
294 // add up the size of the serialized form |
|
295 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); |
|
296 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { |
|
297 return 0; |
|
298 } |
|
299 *status = U_ZERO_ERROR; |
|
300 |
|
301 DataHeader header; |
|
302 uprv_memset(&header, 0, sizeof(header)); |
|
303 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); |
|
304 header.dataHeader.magic1 = 0xda; |
|
305 header.dataHeader.magic2 = 0x27; |
|
306 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); |
|
307 |
|
308 int32_t indexes[UCNVSEL_INDEX_COUNT] = { |
|
309 serializedTrieSize, |
|
310 sel->pvCount, |
|
311 sel->encodingsCount, |
|
312 sel->encodingStrLength |
|
313 }; |
|
314 |
|
315 int32_t totalSize = |
|
316 header.dataHeader.headerSize + |
|
317 (int32_t)sizeof(indexes) + |
|
318 serializedTrieSize + |
|
319 sel->pvCount * 4 + |
|
320 sel->encodingStrLength; |
|
321 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; |
|
322 if (totalSize > bufferCapacity) { |
|
323 *status = U_BUFFER_OVERFLOW_ERROR; |
|
324 return totalSize; |
|
325 } |
|
326 // ok, save! |
|
327 int32_t length = header.dataHeader.headerSize; |
|
328 uprv_memcpy(p, &header, sizeof(header)); |
|
329 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); |
|
330 p += length; |
|
331 |
|
332 length = (int32_t)sizeof(indexes); |
|
333 uprv_memcpy(p, indexes, length); |
|
334 p += length; |
|
335 |
|
336 utrie2_serialize(sel->trie, p, serializedTrieSize, status); |
|
337 p += serializedTrieSize; |
|
338 |
|
339 length = sel->pvCount * 4; |
|
340 uprv_memcpy(p, sel->pv, length); |
|
341 p += length; |
|
342 |
|
343 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); |
|
344 p += sel->encodingStrLength; |
|
345 |
|
346 return totalSize; |
|
347 } |
|
348 |
|
349 /** |
|
350 * swap a selector into the desired Endianness and Asciiness of |
|
351 * the system. Just as FYI, selectors are always saved in the format |
|
352 * of the system that created them. They are only converted if used |
|
353 * on another system. In other words, selectors created on different |
|
354 * system can be different even if the params are identical (endianness |
|
355 * and Asciiness differences only) |
|
356 * |
|
357 * @param ds pointer to data swapper containing swapping info |
|
358 * @param inData pointer to incoming data |
|
359 * @param length length of inData in bytes |
|
360 * @param outData pointer to output data. Capacity should |
|
361 * be at least equal to capacity of inData |
|
362 * @param status an in/out ICU UErrorCode |
|
363 * @return 0 on failure, number of bytes swapped on success |
|
364 * number of bytes swapped can be smaller than length |
|
365 */ |
|
366 static int32_t |
|
367 ucnvsel_swap(const UDataSwapper *ds, |
|
368 const void *inData, int32_t length, |
|
369 void *outData, UErrorCode *status) { |
|
370 /* udata_swapDataHeader checks the arguments */ |
|
371 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); |
|
372 if(U_FAILURE(*status)) { |
|
373 return 0; |
|
374 } |
|
375 |
|
376 /* check data format and format version */ |
|
377 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); |
|
378 if(!( |
|
379 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ |
|
380 pInfo->dataFormat[1] == 0x53 && |
|
381 pInfo->dataFormat[2] == 0x65 && |
|
382 pInfo->dataFormat[3] == 0x6c |
|
383 )) { |
|
384 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", |
|
385 pInfo->dataFormat[0], pInfo->dataFormat[1], |
|
386 pInfo->dataFormat[2], pInfo->dataFormat[3]); |
|
387 *status = U_INVALID_FORMAT_ERROR; |
|
388 return 0; |
|
389 } |
|
390 if(pInfo->formatVersion[0] != 1) { |
|
391 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", |
|
392 pInfo->formatVersion[0]); |
|
393 *status = U_UNSUPPORTED_ERROR; |
|
394 return 0; |
|
395 } |
|
396 |
|
397 if(length >= 0) { |
|
398 length -= headerSize; |
|
399 if(length < 16*4) { |
|
400 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", |
|
401 length); |
|
402 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
403 return 0; |
|
404 } |
|
405 } |
|
406 |
|
407 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; |
|
408 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
|
409 |
|
410 /* read the indexes */ |
|
411 const int32_t *inIndexes = (const int32_t *)inBytes; |
|
412 int32_t indexes[16]; |
|
413 int32_t i; |
|
414 for(i = 0; i < 16; ++i) { |
|
415 indexes[i] = udata_readInt32(ds, inIndexes[i]); |
|
416 } |
|
417 |
|
418 /* get the total length of the data */ |
|
419 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; |
|
420 if(length >= 0) { |
|
421 if(length < size) { |
|
422 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", |
|
423 length); |
|
424 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
425 return 0; |
|
426 } |
|
427 |
|
428 /* copy the data for inaccessible bytes */ |
|
429 if(inBytes != outBytes) { |
|
430 uprv_memcpy(outBytes, inBytes, size); |
|
431 } |
|
432 |
|
433 int32_t offset = 0, count; |
|
434 |
|
435 /* swap the int32_t indexes[] */ |
|
436 count = UCNVSEL_INDEX_COUNT*4; |
|
437 ds->swapArray32(ds, inBytes, count, outBytes, status); |
|
438 offset += count; |
|
439 |
|
440 /* swap the UTrie2 */ |
|
441 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; |
|
442 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); |
|
443 offset += count; |
|
444 |
|
445 /* swap the uint32_t pv[] */ |
|
446 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; |
|
447 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); |
|
448 offset += count; |
|
449 |
|
450 /* swap the encoding names */ |
|
451 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; |
|
452 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); |
|
453 offset += count; |
|
454 |
|
455 U_ASSERT(offset == size); |
|
456 } |
|
457 |
|
458 return headerSize + size; |
|
459 } |
|
460 |
|
461 /* unserialize a selector */ |
|
462 U_CAPI UConverterSelector* U_EXPORT2 |
|
463 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { |
|
464 // check if already failed |
|
465 if (U_FAILURE(*status)) { |
|
466 return NULL; |
|
467 } |
|
468 // ensure args make sense! |
|
469 const uint8_t *p = (const uint8_t *)buffer; |
|
470 if (length <= 0 || |
|
471 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) |
|
472 ) { |
|
473 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
474 return NULL; |
|
475 } |
|
476 // header |
|
477 if (length < 32) { |
|
478 // not even enough space for a minimal header |
|
479 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
480 return NULL; |
|
481 } |
|
482 const DataHeader *pHeader = (const DataHeader *)p; |
|
483 if (!( |
|
484 pHeader->dataHeader.magic1==0xda && |
|
485 pHeader->dataHeader.magic2==0x27 && |
|
486 pHeader->info.dataFormat[0] == 0x43 && |
|
487 pHeader->info.dataFormat[1] == 0x53 && |
|
488 pHeader->info.dataFormat[2] == 0x65 && |
|
489 pHeader->info.dataFormat[3] == 0x6c |
|
490 )) { |
|
491 /* header not valid or dataFormat not recognized */ |
|
492 *status = U_INVALID_FORMAT_ERROR; |
|
493 return NULL; |
|
494 } |
|
495 if (pHeader->info.formatVersion[0] != 1) { |
|
496 *status = U_UNSUPPORTED_ERROR; |
|
497 return NULL; |
|
498 } |
|
499 uint8_t* swapped = NULL; |
|
500 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || |
|
501 pHeader->info.charsetFamily != U_CHARSET_FAMILY |
|
502 ) { |
|
503 // swap the data |
|
504 UDataSwapper *ds = |
|
505 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); |
|
506 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); |
|
507 if (U_FAILURE(*status)) { |
|
508 udata_closeSwapper(ds); |
|
509 return NULL; |
|
510 } |
|
511 if (length < totalSize) { |
|
512 udata_closeSwapper(ds); |
|
513 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
514 return NULL; |
|
515 } |
|
516 swapped = (uint8_t*)uprv_malloc(totalSize); |
|
517 if (swapped == NULL) { |
|
518 udata_closeSwapper(ds); |
|
519 *status = U_MEMORY_ALLOCATION_ERROR; |
|
520 return NULL; |
|
521 } |
|
522 ucnvsel_swap(ds, p, length, swapped, status); |
|
523 udata_closeSwapper(ds); |
|
524 if (U_FAILURE(*status)) { |
|
525 uprv_free(swapped); |
|
526 return NULL; |
|
527 } |
|
528 p = swapped; |
|
529 pHeader = (const DataHeader *)p; |
|
530 } |
|
531 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { |
|
532 // not even enough space for the header and the indexes |
|
533 uprv_free(swapped); |
|
534 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
535 return NULL; |
|
536 } |
|
537 p += pHeader->dataHeader.headerSize; |
|
538 length -= pHeader->dataHeader.headerSize; |
|
539 // indexes |
|
540 const int32_t *indexes = (const int32_t *)p; |
|
541 if (length < indexes[UCNVSEL_INDEX_SIZE]) { |
|
542 uprv_free(swapped); |
|
543 *status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
544 return NULL; |
|
545 } |
|
546 p += UCNVSEL_INDEX_COUNT * 4; |
|
547 // create and populate the selector object |
|
548 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); |
|
549 char **encodings = |
|
550 (char **)uprv_malloc( |
|
551 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); |
|
552 if (sel == NULL || encodings == NULL) { |
|
553 uprv_free(swapped); |
|
554 uprv_free(sel); |
|
555 uprv_free(encodings); |
|
556 *status = U_MEMORY_ALLOCATION_ERROR; |
|
557 return NULL; |
|
558 } |
|
559 uprv_memset(sel, 0, sizeof(UConverterSelector)); |
|
560 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; |
|
561 sel->encodings = encodings; |
|
562 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; |
|
563 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; |
|
564 sel->swapped = swapped; |
|
565 // trie |
|
566 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
|
567 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, |
|
568 status); |
|
569 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; |
|
570 if (U_FAILURE(*status)) { |
|
571 ucnvsel_close(sel); |
|
572 return NULL; |
|
573 } |
|
574 // bit vectors |
|
575 sel->pv = (uint32_t *)p; |
|
576 p += sel->pvCount * 4; |
|
577 // encoding names |
|
578 char* s = (char*)p; |
|
579 for (int32_t i = 0; i < sel->encodingsCount; ++i) { |
|
580 sel->encodings[i] = s; |
|
581 s += uprv_strlen(s) + 1; |
|
582 } |
|
583 p += sel->encodingStrLength; |
|
584 |
|
585 return sel; |
|
586 } |
|
587 |
|
588 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just |
|
589 // iterate over the selected encodings |
|
590 struct Enumerator { |
|
591 int16_t* index; |
|
592 int16_t length; |
|
593 int16_t cur; |
|
594 const UConverterSelector* sel; |
|
595 }; |
|
596 |
|
597 U_CDECL_BEGIN |
|
598 |
|
599 static void U_CALLCONV |
|
600 ucnvsel_close_selector_iterator(UEnumeration *enumerator) { |
|
601 uprv_free(((Enumerator*)(enumerator->context))->index); |
|
602 uprv_free(enumerator->context); |
|
603 uprv_free(enumerator); |
|
604 } |
|
605 |
|
606 |
|
607 static int32_t U_CALLCONV |
|
608 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { |
|
609 // check if already failed |
|
610 if (U_FAILURE(*status)) { |
|
611 return 0; |
|
612 } |
|
613 return ((Enumerator*)(enumerator->context))->length; |
|
614 } |
|
615 |
|
616 |
|
617 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, |
|
618 int32_t* resultLength, |
|
619 UErrorCode* status) { |
|
620 // check if already failed |
|
621 if (U_FAILURE(*status)) { |
|
622 return NULL; |
|
623 } |
|
624 |
|
625 int16_t cur = ((Enumerator*)(enumerator->context))->cur; |
|
626 const UConverterSelector* sel; |
|
627 const char* result; |
|
628 if (cur >= ((Enumerator*)(enumerator->context))->length) { |
|
629 return NULL; |
|
630 } |
|
631 sel = ((Enumerator*)(enumerator->context))->sel; |
|
632 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; |
|
633 ((Enumerator*)(enumerator->context))->cur++; |
|
634 if (resultLength) { |
|
635 *resultLength = (int32_t)uprv_strlen(result); |
|
636 } |
|
637 return result; |
|
638 } |
|
639 |
|
640 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, |
|
641 UErrorCode* status) { |
|
642 // check if already failed |
|
643 if (U_FAILURE(*status)) { |
|
644 return ; |
|
645 } |
|
646 ((Enumerator*)(enumerator->context))->cur = 0; |
|
647 } |
|
648 |
|
649 U_CDECL_END |
|
650 |
|
651 |
|
652 static const UEnumeration defaultEncodings = { |
|
653 NULL, |
|
654 NULL, |
|
655 ucnvsel_close_selector_iterator, |
|
656 ucnvsel_count_encodings, |
|
657 uenum_unextDefault, |
|
658 ucnvsel_next_encoding, |
|
659 ucnvsel_reset_iterator |
|
660 }; |
|
661 |
|
662 |
|
663 // internal fn to intersect two sets of masks |
|
664 // returns whether the mask has reduced to all zeros |
|
665 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { |
|
666 int32_t i; |
|
667 uint32_t oredDest = 0; |
|
668 for (i = 0 ; i < len ; ++i) { |
|
669 oredDest |= (dest[i] &= source1[i]); |
|
670 } |
|
671 return oredDest == 0; |
|
672 } |
|
673 |
|
// Internal function: returns the number of 1 bits in the len words of mask.
// Algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html :
// each step clears the lowest bit that is set, so the inner loop runs once
// per 1 bit.
static int16_t countOnes(uint32_t* mask, int32_t len) {
  int32_t bitsSet = 0;
  for (int32_t word = 0; word < len; ++word) {
    uint32_t bits = mask[word];
    while (bits != 0) {
      bits &= bits - 1;  // clear the least significant bit set
      ++bitsSet;
    }
  }
  return (int16_t)bitsSet;
}
|
687 |
|
688 |
|
689 /* internal function! */ |
|
690 static UEnumeration *selectForMask(const UConverterSelector* sel, |
|
691 uint32_t *mask, UErrorCode *status) { |
|
692 // this is the context we will use. Store a table of indices to which |
|
693 // encodings are legit. |
|
694 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); |
|
695 if (result == NULL) { |
|
696 uprv_free(mask); |
|
697 *status = U_MEMORY_ALLOCATION_ERROR; |
|
698 return NULL; |
|
699 } |
|
700 result->index = NULL; // this will be allocated later! |
|
701 result->length = result->cur = 0; |
|
702 result->sel = sel; |
|
703 |
|
704 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); |
|
705 if (en == NULL) { |
|
706 // TODO(markus): Combine Enumerator and UEnumeration into one struct. |
|
707 uprv_free(mask); |
|
708 uprv_free(result); |
|
709 *status = U_MEMORY_ALLOCATION_ERROR; |
|
710 return NULL; |
|
711 } |
|
712 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); |
|
713 en->context = result; |
|
714 |
|
715 int32_t columns = (sel->encodingsCount+31)/32; |
|
716 int16_t numOnes = countOnes(mask, columns); |
|
717 // now, we know the exact space we need for index |
|
718 if (numOnes > 0) { |
|
719 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); |
|
720 |
|
721 int32_t i, j; |
|
722 int16_t k = 0; |
|
723 for (j = 0 ; j < columns; j++) { |
|
724 uint32_t v = mask[j]; |
|
725 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { |
|
726 if ((v & 1) != 0) { |
|
727 result->index[result->length++] = k; |
|
728 } |
|
729 v >>= 1; |
|
730 } |
|
731 } |
|
732 } //otherwise, index will remain NULL (and will never be touched by |
|
733 //the enumerator code anyway) |
|
734 uprv_free(mask); |
|
735 return en; |
|
736 } |
|
737 |
|
738 /* check a string against the selector - UTF16 version */ |
|
739 U_CAPI UEnumeration * U_EXPORT2 |
|
740 ucnvsel_selectForString(const UConverterSelector* sel, |
|
741 const UChar *s, int32_t length, UErrorCode *status) { |
|
742 // check if already failed |
|
743 if (U_FAILURE(*status)) { |
|
744 return NULL; |
|
745 } |
|
746 // ensure args make sense! |
|
747 if (sel == NULL || (s == NULL && length != 0)) { |
|
748 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
749 return NULL; |
|
750 } |
|
751 |
|
752 int32_t columns = (sel->encodingsCount+31)/32; |
|
753 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); |
|
754 if (mask == NULL) { |
|
755 *status = U_MEMORY_ALLOCATION_ERROR; |
|
756 return NULL; |
|
757 } |
|
758 uprv_memset(mask, ~0, columns *4); |
|
759 |
|
760 if(s!=NULL) { |
|
761 const UChar *limit; |
|
762 if (length >= 0) { |
|
763 limit = s + length; |
|
764 } else { |
|
765 limit = NULL; |
|
766 } |
|
767 |
|
768 while (limit == NULL ? *s != 0 : s != limit) { |
|
769 UChar32 c; |
|
770 uint16_t pvIndex; |
|
771 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); |
|
772 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { |
|
773 break; |
|
774 } |
|
775 } |
|
776 } |
|
777 return selectForMask(sel, mask, status); |
|
778 } |
|
779 |
|
780 /* check a string against the selector - UTF8 version */ |
|
781 U_CAPI UEnumeration * U_EXPORT2 |
|
782 ucnvsel_selectForUTF8(const UConverterSelector* sel, |
|
783 const char *s, int32_t length, UErrorCode *status) { |
|
784 // check if already failed |
|
785 if (U_FAILURE(*status)) { |
|
786 return NULL; |
|
787 } |
|
788 // ensure args make sense! |
|
789 if (sel == NULL || (s == NULL && length != 0)) { |
|
790 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
791 return NULL; |
|
792 } |
|
793 |
|
794 int32_t columns = (sel->encodingsCount+31)/32; |
|
795 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); |
|
796 if (mask == NULL) { |
|
797 *status = U_MEMORY_ALLOCATION_ERROR; |
|
798 return NULL; |
|
799 } |
|
800 uprv_memset(mask, ~0, columns *4); |
|
801 |
|
802 if (length < 0) { |
|
803 length = (int32_t)uprv_strlen(s); |
|
804 } |
|
805 |
|
806 if(s!=NULL) { |
|
807 const char *limit = s + length; |
|
808 |
|
809 while (s != limit) { |
|
810 uint16_t pvIndex; |
|
811 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); |
|
812 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { |
|
813 break; |
|
814 } |
|
815 } |
|
816 } |
|
817 return selectForMask(sel, mask, status); |
|
818 } |
|
819 |
|
820 #endif // !UCONFIG_NO_CONVERSION |