intl/icu/source/common/ucnv_io.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 1999-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 *
michael@0 9 *
michael@0 10 * ucnv_io.cpp:
michael@0 11 * initializes global variables and defines functions pertaining to converter
michael@0 12 * name resolution aspect of the conversion code.
michael@0 13 *
michael@0 14 * new implementation:
michael@0 15 *
michael@0 16 * created on: 1999nov22
michael@0 17 * created by: Markus W. Scherer
michael@0 18 *
michael@0 19 * Use the binary cnvalias.icu (created from convrtrs.txt) to work
michael@0 20 * with aliases for converter names.
michael@0 21 *
michael@0 22 * Date Name Description
michael@0 23 * 11/22/1999 markus Created
michael@0 24 * 06/28/2002 grhoten Major overhaul of the converter alias design.
michael@0 25 * Now an alias can map to different converters
michael@0 26 * depending on the specified standard.
michael@0 27 *******************************************************************************
michael@0 28 */
michael@0 29
michael@0 30 #include "unicode/utypes.h"
michael@0 31
michael@0 32 #if !UCONFIG_NO_CONVERSION
michael@0 33
michael@0 34 #include "unicode/ucnv.h"
michael@0 35 #include "unicode/udata.h"
michael@0 36
michael@0 37 #include "umutex.h"
michael@0 38 #include "uarrsort.h"
michael@0 39 #include "uassert.h"
michael@0 40 #include "udataswp.h"
michael@0 41 #include "cstring.h"
michael@0 42 #include "cmemory.h"
michael@0 43 #include "ucnv_io.h"
michael@0 44 #include "uenumimp.h"
michael@0 45 #include "ucln_cmn.h"
michael@0 46
michael@0 47 /* Format of cnvalias.icu -----------------------------------------------------
michael@0 48 *
michael@0 49 * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
michael@0 50 * This binary form contains several tables. All indexes are to uint16_t
michael@0 51 * units, and not to the bytes (uint8_t units). Addressing everything on
michael@0 52 * 16-bit boundaries allows us to store more information with small index
michael@0 53 * numbers, which are also 16-bit in size. The majority of the table (except
michael@0 54 * the string table) are 16-bit numbers.
michael@0 55 *
michael@0 56 * First there is the size of the Table of Contents (TOC). The TOC
michael@0 57 * entries contain the size of each section. In order to find the offset
michael@0 58 * you just need to sum up the previous offsets.
michael@0 59 * The TOC length and entries are an array of uint32_t values.
michael@0 60 * The first section after the TOC starts immediately after the TOC.
michael@0 61 *
michael@0 62 * 1) This section contains a list of converters. This list contains indexes
michael@0 63 * into the string table for the converter name. The index of this list is
michael@0 64 * also used by other sections, which are mentioned later on.
michael@0 65 * This list is not sorted.
michael@0 66 *
michael@0 67 * 2) This section contains a list of tags. This list contains indexes
michael@0 68 * into the string table for the tag name. The index of this list is
michael@0 69 * also used by other sections, which are mentioned later on.
michael@0 70 * This list is in priority order of standards.
michael@0 71 *
michael@0 72 * 3) This section contains a list of sorted unique aliases. This
michael@0 73 * list contains indexes into the string table for the alias name. The
michael@0 74 * index of this list is also used by other sections, like the 4th section.
michael@0 75 * The index for the 3rd and 4th section is used to get the
michael@0 76 * alias -> converter name mapping. Section 3 and 4 form a two column table.
michael@0 77 * Some of the most significant bits of each index may contain other
michael@0 78 * information (see findConverter for details).
michael@0 79 *
michael@0 80 * 4) This section contains a list of mapped converter names. Consider this
michael@0 81 * as a table that maps the 3rd section to the 1st section. This list contains
michael@0 82 * indexes into the 1st section. The index of this list is the same index in
michael@0 83 * the 3rd section. There is also some extra information in the high bits of
michael@0 84 * each converter index in this table. Currently it's only used to say that
michael@0 85 * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
michael@0 86 * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
michael@0 87 * the predigested form of the 5th section so that an alias lookup can be fast.
michael@0 88 *
michael@0 89 * 5) This section contains a 2D array with indexes to the 6th section. This
michael@0 90 * section is the full form of all alias mappings. The column index is the
michael@0 91 * index into the converter list (column header). The row index is the index
michael@0 92 * to the tag list (row header). This 2D array is the top part of a 3D array. The
michael@0 93 * third dimension is in the 6th section.
michael@0 94 *
michael@0 95 * 6) This is a blob of variable-length arrays. Each array starts with a size,
michael@0 96 * and is followed by indexes to alias names in the string table. This is
michael@0 97 * the third dimension to the section 5. No other section should be referencing
michael@0 98 * this section.
michael@0 99 *
michael@0 100 * 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its
michael@0 101 * presence indicates that a section 9 exists. UConverterAliasOptions specifies
michael@0 102 * what type of string normalization is used among other potential things in the
michael@0 103 * future.
michael@0 104 *
michael@0 105 * 8) This is the string table. All strings are indexed on an even address.
michael@0 106 * There are two reasons for this. First many chip architectures locate strings
michael@0 107 * faster on even address boundaries. Second, since all indexes are 16-bit
michael@0 108 * numbers, this string table can be 128KB in size instead of 64KB when we
michael@0 109 * only have strings starting on an even address.
michael@0 110 *
michael@0 111 * 9) When present this is a set of prenormalized strings from section 8. This
michael@0 112 * table contains normalized strings with the dashes and spaces stripped out,
michael@0 113 * and all strings lowercased. In the future, the options in section 7 may state
michael@0 114 * other types of normalization.
michael@0 115 *
michael@0 116 * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
michael@0 117 * has a unique alias among all converters. That same alias can
michael@0 118 * be mentioned in other standards on different converters,
michael@0 119 * but only one alias per tag can be unique.
michael@0 120 *
michael@0 121 *
michael@0 122 * Converter Names (Usually in TR22 form)
michael@0 123 * -------------------------------------------.
michael@0 124 * T / /|
michael@0 125 * a / / |
michael@0 126 * g / / |
michael@0 127 * s / / |
michael@0 128 * / / |
michael@0 129 * ------------------------------------------/ |
michael@0 130 * A | | |
michael@0 131 * l | | |
michael@0 132 * i | | /
michael@0 133 * a | | /
michael@0 134 * s | | /
michael@0 135 * e | | /
michael@0 136 * s | |/
michael@0 137 * -------------------------------------------
michael@0 138 *
michael@0 139 *
michael@0 140 *
michael@0 141 * Here is what it really looks like. It's like swiss cheese.
michael@0 142 * There are holes. Some converters aren't recognized by
michael@0 143 * a standard, or they are really old converters that the
michael@0 144 * standard doesn't recognize anymore.
michael@0 145 *
michael@0 146 * Converter Names (Usually in TR22 form)
michael@0 147 * -------------------------------------------.
michael@0 148 * T /##########################################/|
michael@0 149 * a / # # /#
michael@0 150 * g / # ## ## ### # ### ### ### #/
michael@0 151 * s / # ##### #### ## ## #/#
michael@0 152 * / ### # # ## # # # ### # # #/##
michael@0 153 * ------------------------------------------/# #
michael@0 154 * A |### # # ## # # # ### # # #|# #
michael@0 155 * l |# # # # # ## # #|# #
michael@0 156 * i |# # # # # # #|#
michael@0 157 * a |# #|#
michael@0 158 * s | #|#
michael@0 159 * e
michael@0 160 * s
michael@0 161 *
michael@0 162 */
michael@0 163
/**
 * Iteration state used by the UEnumeration API.
 */
struct UAliasContext {
    uint32_t listOffset; /* presumably an offset into gMainTable.taggedAliasLists -- TODO confirm against the enumeration code */
    uint32_t listIdx;    /* presumably the position of the next item within that list -- TODO confirm */
};
typedef struct UAliasContext UAliasContext;
michael@0 171
michael@0 172 static const char DATA_NAME[] = "cnvalias";
michael@0 173 static const char DATA_TYPE[] = "icu";
michael@0 174
michael@0 175 static UDataMemory *gAliasData=NULL;
michael@0 176 static icu::UInitOnce gAliasDataInitOnce = U_INITONCE_INITIALIZER;
michael@0 177
/*
 * Indexes into the uint32_t table of contents at the start of cnvalias.icu.
 * Entry 0 is the TOC length; the following entries are the section sizes.
 */
enum {
    tocLengthIndex = 0,
    converterListIndex,         /* 1 */
    tagListIndex,               /* 2 */
    aliasListIndex,             /* 3 */
    untaggedConvArrayIndex,     /* 4 */
    taggedAliasArrayIndex,      /* 5 */
    taggedAliasListsIndex,      /* 6 */
    tableOptionsIndex,          /* 7 */
    stringTableIndex,           /* 8 */
    normalizedStringTableIndex, /* 9 */
    offsetsCount,               /* 10: length of the swapper's temporary offsets[] */
    minTocLength = 8            /* min. tocLength in the file, does not count the tocLengthIndex! */
};
michael@0 192
michael@0 193 static const UConverterAliasOptions defaultTableOptions = {
michael@0 194 UCNV_IO_UNNORMALIZED,
michael@0 195 0 /* containsCnvOptionInfo */
michael@0 196 };
michael@0 197 static UConverterAlias gMainTable;
michael@0 198
/*
 * Turn a string-table index into a C string pointer.
 * The index is added to a uint16_t-based table pointer, i.e. it counts
 * 16-bit units, not bytes.
 *
 * The whole expansion is parenthesized so that an operator applied to the
 * macro result (for example GET_STRING(i)[0]) acts on the resulting
 * const char * and not on the inner pointer expression.
 */
#define GET_STRING(idx) ((const char *)(gMainTable.stringTable + (idx)))
#define GET_NORMALIZED_STRING(idx) ((const char *)(gMainTable.normalizedStringTable + (idx)))
michael@0 201
michael@0 202 static UBool U_CALLCONV
michael@0 203 isAcceptable(void * /*context*/,
michael@0 204 const char * /*type*/, const char * /*name*/,
michael@0 205 const UDataInfo *pInfo) {
michael@0 206 return (UBool)(
michael@0 207 pInfo->size>=20 &&
michael@0 208 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
michael@0 209 pInfo->charsetFamily==U_CHARSET_FAMILY &&
michael@0 210 pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */
michael@0 211 pInfo->dataFormat[1]==0x76 &&
michael@0 212 pInfo->dataFormat[2]==0x41 &&
michael@0 213 pInfo->dataFormat[3]==0x6c &&
michael@0 214 pInfo->formatVersion[0]==3);
michael@0 215 }
michael@0 216
michael@0 217 static UBool U_CALLCONV ucnv_io_cleanup(void)
michael@0 218 {
michael@0 219 if (gAliasData) {
michael@0 220 udata_close(gAliasData);
michael@0 221 gAliasData = NULL;
michael@0 222 }
michael@0 223 gAliasDataInitOnce.reset();
michael@0 224
michael@0 225 uprv_memset(&gMainTable, 0, sizeof(gMainTable));
michael@0 226
michael@0 227 return TRUE; /* Everything was cleaned up */
michael@0 228 }
michael@0 229
michael@0 230 static void U_CALLCONV initAliasData(UErrorCode &errCode) {
michael@0 231 UDataMemory *data;
michael@0 232 const uint16_t *table;
michael@0 233 const uint32_t *sectionSizes;
michael@0 234 uint32_t tableStart;
michael@0 235 uint32_t currOffset;
michael@0 236
michael@0 237 ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup);
michael@0 238
michael@0 239 U_ASSERT(gAliasData == NULL);
michael@0 240 data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errCode);
michael@0 241 if(U_FAILURE(errCode)) {
michael@0 242 return;
michael@0 243 }
michael@0 244
michael@0 245 sectionSizes = (const uint32_t *)udata_getMemory(data);
michael@0 246 table = (const uint16_t *)sectionSizes;
michael@0 247
michael@0 248 tableStart = sectionSizes[0];
michael@0 249 if (tableStart < minTocLength) {
michael@0 250 errCode = U_INVALID_FORMAT_ERROR;
michael@0 251 udata_close(data);
michael@0 252 return;
michael@0 253 }
michael@0 254 gAliasData = data;
michael@0 255
michael@0 256 gMainTable.converterListSize = sectionSizes[1];
michael@0 257 gMainTable.tagListSize = sectionSizes[2];
michael@0 258 gMainTable.aliasListSize = sectionSizes[3];
michael@0 259 gMainTable.untaggedConvArraySize = sectionSizes[4];
michael@0 260 gMainTable.taggedAliasArraySize = sectionSizes[5];
michael@0 261 gMainTable.taggedAliasListsSize = sectionSizes[6];
michael@0 262 gMainTable.optionTableSize = sectionSizes[7];
michael@0 263 gMainTable.stringTableSize = sectionSizes[8];
michael@0 264
michael@0 265 if (tableStart > 8) {
michael@0 266 gMainTable.normalizedStringTableSize = sectionSizes[9];
michael@0 267 }
michael@0 268
michael@0 269 currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t));
michael@0 270 gMainTable.converterList = table + currOffset;
michael@0 271
michael@0 272 currOffset += gMainTable.converterListSize;
michael@0 273 gMainTable.tagList = table + currOffset;
michael@0 274
michael@0 275 currOffset += gMainTable.tagListSize;
michael@0 276 gMainTable.aliasList = table + currOffset;
michael@0 277
michael@0 278 currOffset += gMainTable.aliasListSize;
michael@0 279 gMainTable.untaggedConvArray = table + currOffset;
michael@0 280
michael@0 281 currOffset += gMainTable.untaggedConvArraySize;
michael@0 282 gMainTable.taggedAliasArray = table + currOffset;
michael@0 283
michael@0 284 /* aliasLists is a 1's based array, but it has a padding character */
michael@0 285 currOffset += gMainTable.taggedAliasArraySize;
michael@0 286 gMainTable.taggedAliasLists = table + currOffset;
michael@0 287
michael@0 288 currOffset += gMainTable.taggedAliasListsSize;
michael@0 289 if (gMainTable.optionTableSize > 0
michael@0 290 && ((const UConverterAliasOptions *)(table + currOffset))->stringNormalizationType < UCNV_IO_NORM_TYPE_COUNT)
michael@0 291 {
michael@0 292 /* Faster table */
michael@0 293 gMainTable.optionTable = (const UConverterAliasOptions *)(table + currOffset);
michael@0 294 }
michael@0 295 else {
michael@0 296 /* Smaller table, or I can't handle this normalization mode!
michael@0 297 Use the original slower table lookup. */
michael@0 298 gMainTable.optionTable = &defaultTableOptions;
michael@0 299 }
michael@0 300
michael@0 301 currOffset += gMainTable.optionTableSize;
michael@0 302 gMainTable.stringTable = table + currOffset;
michael@0 303
michael@0 304 currOffset += gMainTable.stringTableSize;
michael@0 305 gMainTable.normalizedStringTable = ((gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED)
michael@0 306 ? gMainTable.stringTable : (table + currOffset));
michael@0 307 }
michael@0 308
michael@0 309
michael@0 310 static UBool
michael@0 311 haveAliasData(UErrorCode *pErrorCode) {
michael@0 312 umtx_initOnce(gAliasDataInitOnce, &initAliasData, *pErrorCode);
michael@0 313 return U_SUCCESS(*pErrorCode);
michael@0 314 }
michael@0 315
michael@0 316 static inline UBool
michael@0 317 isAlias(const char *alias, UErrorCode *pErrorCode) {
michael@0 318 if(alias==NULL) {
michael@0 319 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 320 return FALSE;
michael@0 321 }
michael@0 322 return (UBool)(*alias!=0);
michael@0 323 }
michael@0 324
michael@0 325 static uint32_t getTagNumber(const char *tagname) {
michael@0 326 if (gMainTable.tagList) {
michael@0 327 uint32_t tagNum;
michael@0 328 for (tagNum = 0; tagNum < gMainTable.tagListSize; tagNum++) {
michael@0 329 if (!uprv_stricmp(GET_STRING(gMainTable.tagList[tagNum]), tagname)) {
michael@0 330 return tagNum;
michael@0 331 }
michael@0 332 }
michael@0 333 }
michael@0 334
michael@0 335 return UINT32_MAX;
michael@0 336 }
michael@0 337
/* character types relevant for ucnv_compareNames() */
enum {
    UIGNORE,    /* 0: neither letter nor digit; skipped when comparing names */
    ZERO,       /* 1: the digit zero */
    NONZERO,    /* 2: a digit one through nine */
    MINLETTER   /* any values from here on are lowercase letter mappings */
};

/*
 * character types for ASCII 00..7F
 * Letters (either case) map to their lowercase ASCII code, digits map to
 * ZERO/NONZERO, everything else maps to 0 (UIGNORE).
 */
static const uint8_t asciiTypes[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0
};

/*
 * Classify one ASCII-family char: bytes with the high bit set are UIGNORE,
 * all others are looked up in asciiTypes[].
 * The argument is parenthesized at every use so that an expression argument
 * is cast as a whole before it is used as the table index.
 * Note: the argument is evaluated twice; do not pass expressions with side effects.
 */
#define GET_ASCII_TYPE(c) ((int8_t)(c) >= 0 ? asciiTypes[(uint8_t)(c)] : (uint8_t)UIGNORE)
michael@0 359
michael@0 360 /* character types for EBCDIC 80..FF */
michael@0 361 static const uint8_t ebcdicTypes[128] = {
michael@0 362 0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0,
michael@0 363 0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0,
michael@0 364 0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0,
michael@0 365 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
michael@0 366 0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0, 0, 0, 0, 0, 0,
michael@0 367 0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0, 0, 0, 0, 0, 0,
michael@0 368 0, 0, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0, 0, 0, 0, 0, 0,
michael@0 369 ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0
michael@0 370 };
michael@0 371
michael@0 372 #define GET_EBCDIC_TYPE(c) ((int8_t)(c) < 0 ? ebcdicTypes[(c)&0x7f] : (uint8_t)UIGNORE)
michael@0 373
/* GET_CHAR_TYPE classifies a char using the table for the build's charset family. */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define GET_CHAR_TYPE(c) GET_ASCII_TYPE(c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define GET_CHAR_TYPE(c) GET_EBCDIC_TYPE(c)
#else
#error U_CHARSET_FAMILY is not valid
#endif
michael@0 381
michael@0 382 /* @see ucnv_compareNames */
michael@0 383 U_CFUNC char * U_EXPORT2
michael@0 384 ucnv_io_stripASCIIForCompare(char *dst, const char *name) {
michael@0 385 char *dstItr = dst;
michael@0 386 uint8_t type, nextType;
michael@0 387 char c1;
michael@0 388 UBool afterDigit = FALSE;
michael@0 389
michael@0 390 while ((c1 = *name++) != 0) {
michael@0 391 type = GET_ASCII_TYPE(c1);
michael@0 392 switch (type) {
michael@0 393 case UIGNORE:
michael@0 394 afterDigit = FALSE;
michael@0 395 continue; /* ignore all but letters and digits */
michael@0 396 case ZERO:
michael@0 397 if (!afterDigit) {
michael@0 398 nextType = GET_ASCII_TYPE(*name);
michael@0 399 if (nextType == ZERO || nextType == NONZERO) {
michael@0 400 continue; /* ignore leading zero before another digit */
michael@0 401 }
michael@0 402 }
michael@0 403 break;
michael@0 404 case NONZERO:
michael@0 405 afterDigit = TRUE;
michael@0 406 break;
michael@0 407 default:
michael@0 408 c1 = (char)type; /* lowercased letter */
michael@0 409 afterDigit = FALSE;
michael@0 410 break;
michael@0 411 }
michael@0 412 *dstItr++ = c1;
michael@0 413 }
michael@0 414 *dstItr = 0;
michael@0 415 return dst;
michael@0 416 }
michael@0 417
michael@0 418 U_CFUNC char * U_EXPORT2
michael@0 419 ucnv_io_stripEBCDICForCompare(char *dst, const char *name) {
michael@0 420 char *dstItr = dst;
michael@0 421 uint8_t type, nextType;
michael@0 422 char c1;
michael@0 423 UBool afterDigit = FALSE;
michael@0 424
michael@0 425 while ((c1 = *name++) != 0) {
michael@0 426 type = GET_EBCDIC_TYPE(c1);
michael@0 427 switch (type) {
michael@0 428 case UIGNORE:
michael@0 429 afterDigit = FALSE;
michael@0 430 continue; /* ignore all but letters and digits */
michael@0 431 case ZERO:
michael@0 432 if (!afterDigit) {
michael@0 433 nextType = GET_EBCDIC_TYPE(*name);
michael@0 434 if (nextType == ZERO || nextType == NONZERO) {
michael@0 435 continue; /* ignore leading zero before another digit */
michael@0 436 }
michael@0 437 }
michael@0 438 break;
michael@0 439 case NONZERO:
michael@0 440 afterDigit = TRUE;
michael@0 441 break;
michael@0 442 default:
michael@0 443 c1 = (char)type; /* lowercased letter */
michael@0 444 afterDigit = FALSE;
michael@0 445 break;
michael@0 446 }
michael@0 447 *dstItr++ = c1;
michael@0 448 }
michael@0 449 *dstItr = 0;
michael@0 450 return dst;
michael@0 451 }
michael@0 452
michael@0 453 /**
michael@0 454 * Do a fuzzy compare of two converter/alias names.
michael@0 455 * The comparison is case-insensitive, ignores leading zeroes if they are not
michael@0 456 * followed by further digits, and ignores all but letters and digits.
michael@0 457 * Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent.
michael@0 458 * See section 1.4, Charset Alias Matching in Unicode Technical Standard #22
michael@0 459 * at http://www.unicode.org/reports/tr22/
michael@0 460 *
michael@0 461 * This is a symmetrical (commutative) operation; order of arguments
michael@0 462 * is insignificant. This is an important property for sorting the
michael@0 463 * list (when the list is preprocessed into binary form) and for
michael@0 464 * performing binary searches on it at run time.
michael@0 465 *
michael@0 466 * @param name1 a converter name or alias, zero-terminated
michael@0 467 * @param name2 a converter name or alias, zero-terminated
michael@0 468 * @return 0 if the names match, or a negative value if the name1
michael@0 469 * lexically precedes name2, or a positive value if the name1
michael@0 470 * lexically follows name2.
michael@0 471 *
michael@0 472 * @see ucnv_io_stripForCompare
michael@0 473 */
michael@0 474 U_CAPI int U_EXPORT2
michael@0 475 ucnv_compareNames(const char *name1, const char *name2) {
michael@0 476 int rc;
michael@0 477 uint8_t type, nextType;
michael@0 478 char c1, c2;
michael@0 479 UBool afterDigit1 = FALSE, afterDigit2 = FALSE;
michael@0 480
michael@0 481 for (;;) {
michael@0 482 while ((c1 = *name1++) != 0) {
michael@0 483 type = GET_CHAR_TYPE(c1);
michael@0 484 switch (type) {
michael@0 485 case UIGNORE:
michael@0 486 afterDigit1 = FALSE;
michael@0 487 continue; /* ignore all but letters and digits */
michael@0 488 case ZERO:
michael@0 489 if (!afterDigit1) {
michael@0 490 nextType = GET_CHAR_TYPE(*name1);
michael@0 491 if (nextType == ZERO || nextType == NONZERO) {
michael@0 492 continue; /* ignore leading zero before another digit */
michael@0 493 }
michael@0 494 }
michael@0 495 break;
michael@0 496 case NONZERO:
michael@0 497 afterDigit1 = TRUE;
michael@0 498 break;
michael@0 499 default:
michael@0 500 c1 = (char)type; /* lowercased letter */
michael@0 501 afterDigit1 = FALSE;
michael@0 502 break;
michael@0 503 }
michael@0 504 break; /* deliver c1 */
michael@0 505 }
michael@0 506 while ((c2 = *name2++) != 0) {
michael@0 507 type = GET_CHAR_TYPE(c2);
michael@0 508 switch (type) {
michael@0 509 case UIGNORE:
michael@0 510 afterDigit2 = FALSE;
michael@0 511 continue; /* ignore all but letters and digits */
michael@0 512 case ZERO:
michael@0 513 if (!afterDigit2) {
michael@0 514 nextType = GET_CHAR_TYPE(*name2);
michael@0 515 if (nextType == ZERO || nextType == NONZERO) {
michael@0 516 continue; /* ignore leading zero before another digit */
michael@0 517 }
michael@0 518 }
michael@0 519 break;
michael@0 520 case NONZERO:
michael@0 521 afterDigit2 = TRUE;
michael@0 522 break;
michael@0 523 default:
michael@0 524 c2 = (char)type; /* lowercased letter */
michael@0 525 afterDigit2 = FALSE;
michael@0 526 break;
michael@0 527 }
michael@0 528 break; /* deliver c2 */
michael@0 529 }
michael@0 530
michael@0 531 /* If we reach the ends of both strings then they match */
michael@0 532 if ((c1|c2)==0) {
michael@0 533 return 0;
michael@0 534 }
michael@0 535
michael@0 536 /* Case-insensitive comparison */
michael@0 537 rc = (int)(unsigned char)c1 - (int)(unsigned char)c2;
michael@0 538 if (rc != 0) {
michael@0 539 return rc;
michael@0 540 }
michael@0 541 }
michael@0 542 }
michael@0 543
michael@0 544 /*
michael@0 545 * search for an alias
michael@0 546 * return the converter number index for gConverterList
michael@0 547 */
michael@0 548 static inline uint32_t
michael@0 549 findConverter(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) {
michael@0 550 uint32_t mid, start, limit;
michael@0 551 uint32_t lastMid;
michael@0 552 int result;
michael@0 553 int isUnnormalized = (gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED);
michael@0 554 char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH];
michael@0 555
michael@0 556 if (!isUnnormalized) {
michael@0 557 if (uprv_strlen(alias) >= UCNV_MAX_CONVERTER_NAME_LENGTH) {
michael@0 558 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
michael@0 559 return UINT32_MAX;
michael@0 560 }
michael@0 561
michael@0 562 /* Lower case and remove ignoreable characters. */
michael@0 563 ucnv_io_stripForCompare(strippedName, alias);
michael@0 564 alias = strippedName;
michael@0 565 }
michael@0 566
michael@0 567 /* do a binary search for the alias */
michael@0 568 start = 0;
michael@0 569 limit = gMainTable.untaggedConvArraySize;
michael@0 570 mid = limit;
michael@0 571 lastMid = UINT32_MAX;
michael@0 572
michael@0 573 for (;;) {
michael@0 574 mid = (uint32_t)((start + limit) / 2);
michael@0 575 if (lastMid == mid) { /* Have we moved? */
michael@0 576 break; /* We haven't moved, and it wasn't found. */
michael@0 577 }
michael@0 578 lastMid = mid;
michael@0 579 if (isUnnormalized) {
michael@0 580 result = ucnv_compareNames(alias, GET_STRING(gMainTable.aliasList[mid]));
michael@0 581 }
michael@0 582 else {
michael@0 583 result = uprv_strcmp(alias, GET_NORMALIZED_STRING(gMainTable.aliasList[mid]));
michael@0 584 }
michael@0 585
michael@0 586 if (result < 0) {
michael@0 587 limit = mid;
michael@0 588 } else if (result > 0) {
michael@0 589 start = mid;
michael@0 590 } else {
michael@0 591 /* Since the gencnval tool folds duplicates into one entry,
michael@0 592 * this alias in gAliasList is unique, but different standards
michael@0 593 * may map an alias to different converters.
michael@0 594 */
michael@0 595 if (gMainTable.untaggedConvArray[mid] & UCNV_AMBIGUOUS_ALIAS_MAP_BIT) {
michael@0 596 *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING;
michael@0 597 }
michael@0 598 /* State whether the canonical converter name contains an option.
michael@0 599 This information is contained in this list in order to maintain backward & forward compatibility. */
michael@0 600 if (containsOption) {
michael@0 601 UBool containsCnvOptionInfo = (UBool)gMainTable.optionTable->containsCnvOptionInfo;
michael@0 602 *containsOption = (UBool)((containsCnvOptionInfo
michael@0 603 && ((gMainTable.untaggedConvArray[mid] & UCNV_CONTAINS_OPTION_BIT) != 0))
michael@0 604 || !containsCnvOptionInfo);
michael@0 605 }
michael@0 606 return gMainTable.untaggedConvArray[mid] & UCNV_CONVERTER_INDEX_MASK;
michael@0 607 }
michael@0 608 }
michael@0 609
michael@0 610 return UINT32_MAX;
michael@0 611 }
michael@0 612
michael@0 613 /*
michael@0 614 * Is this alias in this list?
michael@0 615 * alias and listOffset should be non-NULL.
michael@0 616 */
michael@0 617 static inline UBool
michael@0 618 isAliasInList(const char *alias, uint32_t listOffset) {
michael@0 619 if (listOffset) {
michael@0 620 uint32_t currAlias;
michael@0 621 uint32_t listCount = gMainTable.taggedAliasLists[listOffset];
michael@0 622 /* +1 to skip listCount */
michael@0 623 const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1;
michael@0 624 for (currAlias = 0; currAlias < listCount; currAlias++) {
michael@0 625 if (currList[currAlias]
michael@0 626 && ucnv_compareNames(alias, GET_STRING(currList[currAlias]))==0)
michael@0 627 {
michael@0 628 return TRUE;
michael@0 629 }
michael@0 630 }
michael@0 631 }
michael@0 632 return FALSE;
michael@0 633 }
michael@0 634
michael@0 635 /*
michael@0 636 * Search for an standard name of an alias (what is the default name
michael@0 637 * that this standard uses?)
michael@0 638 * return the listOffset for gTaggedAliasLists. If it's 0,
michael@0 639 * the it couldn't be found, but the parameters are valid.
michael@0 640 */
michael@0 641 static uint32_t
michael@0 642 findTaggedAliasListsOffset(const char *alias, const char *standard, UErrorCode *pErrorCode) {
michael@0 643 uint32_t idx;
michael@0 644 uint32_t listOffset;
michael@0 645 uint32_t convNum;
michael@0 646 UErrorCode myErr = U_ZERO_ERROR;
michael@0 647 uint32_t tagNum = getTagNumber(standard);
michael@0 648
michael@0 649 /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
michael@0 650 convNum = findConverter(alias, NULL, &myErr);
michael@0 651 if (myErr != U_ZERO_ERROR) {
michael@0 652 *pErrorCode = myErr;
michael@0 653 }
michael@0 654
michael@0 655 if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) {
michael@0 656 listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum];
michael@0 657 if (listOffset && gMainTable.taggedAliasLists[listOffset + 1]) {
michael@0 658 return listOffset;
michael@0 659 }
michael@0 660 if (myErr == U_AMBIGUOUS_ALIAS_WARNING) {
michael@0 661 /* Uh Oh! They used an ambiguous alias.
michael@0 662 We have to search the whole swiss cheese starting
michael@0 663 at the highest standard affinity.
michael@0 664 This may take a while.
michael@0 665 */
michael@0 666 for (idx = 0; idx < gMainTable.taggedAliasArraySize; idx++) {
michael@0 667 listOffset = gMainTable.taggedAliasArray[idx];
michael@0 668 if (listOffset && isAliasInList(alias, listOffset)) {
michael@0 669 uint32_t currTagNum = idx/gMainTable.converterListSize;
michael@0 670 uint32_t currConvNum = (idx - currTagNum*gMainTable.converterListSize);
michael@0 671 uint32_t tempListOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + currConvNum];
michael@0 672 if (tempListOffset && gMainTable.taggedAliasLists[tempListOffset + 1]) {
michael@0 673 return tempListOffset;
michael@0 674 }
michael@0 675 /* else keep on looking */
michael@0 676 /* We could speed this up by starting on the next row
michael@0 677 because an alias is unique per row, right now.
michael@0 678 This would change if alias versioning appears. */
michael@0 679 }
michael@0 680 }
michael@0 681 /* The standard doesn't know about the alias */
michael@0 682 }
michael@0 683 /* else no default name */
michael@0 684 return 0;
michael@0 685 }
michael@0 686 /* else converter or tag not found */
michael@0 687
michael@0 688 return UINT32_MAX;
michael@0 689 }
michael@0 690
michael@0 691 /* Return the canonical name */
michael@0 692 static uint32_t
michael@0 693 findTaggedConverterNum(const char *alias, const char *standard, UErrorCode *pErrorCode) {
michael@0 694 uint32_t idx;
michael@0 695 uint32_t listOffset;
michael@0 696 uint32_t convNum;
michael@0 697 UErrorCode myErr = U_ZERO_ERROR;
michael@0 698 uint32_t tagNum = getTagNumber(standard);
michael@0 699
michael@0 700 /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
michael@0 701 convNum = findConverter(alias, NULL, &myErr);
michael@0 702 if (myErr != U_ZERO_ERROR) {
michael@0 703 *pErrorCode = myErr;
michael@0 704 }
michael@0 705
michael@0 706 if (tagNum < (gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) && convNum < gMainTable.converterListSize) {
michael@0 707 listOffset = gMainTable.taggedAliasArray[tagNum*gMainTable.converterListSize + convNum];
michael@0 708 if (listOffset && isAliasInList(alias, listOffset)) {
michael@0 709 return convNum;
michael@0 710 }
michael@0 711 if (myErr == U_AMBIGUOUS_ALIAS_WARNING) {
michael@0 712 /* Uh Oh! They used an ambiguous alias.
michael@0 713 We have to search one slice of the swiss cheese.
michael@0 714 We search only in the requested tag, not the whole thing.
michael@0 715 This may take a while.
michael@0 716 */
michael@0 717 uint32_t convStart = (tagNum)*gMainTable.converterListSize;
michael@0 718 uint32_t convLimit = (tagNum+1)*gMainTable.converterListSize;
michael@0 719 for (idx = convStart; idx < convLimit; idx++) {
michael@0 720 listOffset = gMainTable.taggedAliasArray[idx];
michael@0 721 if (listOffset && isAliasInList(alias, listOffset)) {
michael@0 722 return idx-convStart;
michael@0 723 }
michael@0 724 }
michael@0 725 /* The standard doesn't know about the alias */
michael@0 726 }
michael@0 727 /* else no canonical name */
michael@0 728 }
michael@0 729 /* else converter or tag not found */
michael@0 730
michael@0 731 return UINT32_MAX;
michael@0 732 }
michael@0 733
michael@0 734
michael@0 735
michael@0 736 U_CFUNC const char *
michael@0 737 ucnv_io_getConverterName(const char *alias, UBool *containsOption, UErrorCode *pErrorCode) {
michael@0 738 const char *aliasTmp = alias;
michael@0 739 int32_t i = 0;
michael@0 740 for (i = 0; i < 2; i++) {
michael@0 741 if (i == 1) {
michael@0 742 /*
michael@0 743 * After the first unsuccess converter lookup, check to see if
michael@0 744 * the name begins with 'x-'. If it does, strip it off and try
michael@0 745 * again. This behaviour is similar to how ICU4J does it.
michael@0 746 */
michael@0 747 if (aliasTmp[0] == 'x' || aliasTmp[1] == '-') {
michael@0 748 aliasTmp = aliasTmp+2;
michael@0 749 } else {
michael@0 750 break;
michael@0 751 }
michael@0 752 }
michael@0 753 if(haveAliasData(pErrorCode) && isAlias(aliasTmp, pErrorCode)) {
michael@0 754 uint32_t convNum = findConverter(aliasTmp, containsOption, pErrorCode);
michael@0 755 if (convNum < gMainTable.converterListSize) {
michael@0 756 return GET_STRING(gMainTable.converterList[convNum]);
michael@0 757 }
michael@0 758 /* else converter not found */
michael@0 759 } else {
michael@0 760 break;
michael@0 761 }
michael@0 762 }
michael@0 763
michael@0 764 return NULL;
michael@0 765 }
michael@0 766
michael@0 767 static int32_t U_CALLCONV
michael@0 768 ucnv_io_countStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) {
michael@0 769 int32_t value = 0;
michael@0 770 UAliasContext *myContext = (UAliasContext *)(enumerator->context);
michael@0 771 uint32_t listOffset = myContext->listOffset;
michael@0 772
michael@0 773 if (listOffset) {
michael@0 774 value = gMainTable.taggedAliasLists[listOffset];
michael@0 775 }
michael@0 776 return value;
michael@0 777 }
michael@0 778
michael@0 779 static const char* U_CALLCONV
michael@0 780 ucnv_io_nextStandardAliases(UEnumeration *enumerator,
michael@0 781 int32_t* resultLength,
michael@0 782 UErrorCode * /*pErrorCode*/)
michael@0 783 {
michael@0 784 UAliasContext *myContext = (UAliasContext *)(enumerator->context);
michael@0 785 uint32_t listOffset = myContext->listOffset;
michael@0 786
michael@0 787 if (listOffset) {
michael@0 788 uint32_t listCount = gMainTable.taggedAliasLists[listOffset];
michael@0 789 const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1;
michael@0 790
michael@0 791 if (myContext->listIdx < listCount) {
michael@0 792 const char *myStr = GET_STRING(currList[myContext->listIdx++]);
michael@0 793 if (resultLength) {
michael@0 794 *resultLength = (int32_t)uprv_strlen(myStr);
michael@0 795 }
michael@0 796 return myStr;
michael@0 797 }
michael@0 798 }
michael@0 799 /* Either we accessed a zero length list, or we enumerated too far. */
michael@0 800 if (resultLength) {
michael@0 801 *resultLength = 0;
michael@0 802 }
michael@0 803 return NULL;
michael@0 804 }
michael@0 805
michael@0 806 static void U_CALLCONV
michael@0 807 ucnv_io_resetStandardAliases(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) {
michael@0 808 ((UAliasContext *)(enumerator->context))->listIdx = 0;
michael@0 809 }
michael@0 810
michael@0 811 static void U_CALLCONV
michael@0 812 ucnv_io_closeUEnumeration(UEnumeration *enumerator) {
michael@0 813 uprv_free(enumerator->context);
michael@0 814 uprv_free(enumerator);
michael@0 815 }
michael@0 816
michael@0 817 /* Enumerate the aliases for the specified converter and standard tag */
michael@0 818 static const UEnumeration gEnumAliases = {
michael@0 819 NULL,
michael@0 820 NULL,
michael@0 821 ucnv_io_closeUEnumeration,
michael@0 822 ucnv_io_countStandardAliases,
michael@0 823 uenum_unextDefault,
michael@0 824 ucnv_io_nextStandardAliases,
michael@0 825 ucnv_io_resetStandardAliases
michael@0 826 };
michael@0 827
michael@0 828 U_CAPI UEnumeration * U_EXPORT2
michael@0 829 ucnv_openStandardNames(const char *convName,
michael@0 830 const char *standard,
michael@0 831 UErrorCode *pErrorCode)
michael@0 832 {
michael@0 833 UEnumeration *myEnum = NULL;
michael@0 834 if (haveAliasData(pErrorCode) && isAlias(convName, pErrorCode)) {
michael@0 835 uint32_t listOffset = findTaggedAliasListsOffset(convName, standard, pErrorCode);
michael@0 836
michael@0 837 /* When listOffset == 0, we want to acknowledge that the
michael@0 838 converter name and standard are okay, but there
michael@0 839 is nothing to enumerate. */
michael@0 840 if (listOffset < gMainTable.taggedAliasListsSize) {
michael@0 841 UAliasContext *myContext;
michael@0 842
michael@0 843 myEnum = static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration)));
michael@0 844 if (myEnum == NULL) {
michael@0 845 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 846 return NULL;
michael@0 847 }
michael@0 848 uprv_memcpy(myEnum, &gEnumAliases, sizeof(UEnumeration));
michael@0 849 myContext = static_cast<UAliasContext *>(uprv_malloc(sizeof(UAliasContext)));
michael@0 850 if (myContext == NULL) {
michael@0 851 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 852 uprv_free(myEnum);
michael@0 853 return NULL;
michael@0 854 }
michael@0 855 myContext->listOffset = listOffset;
michael@0 856 myContext->listIdx = 0;
michael@0 857 myEnum->context = myContext;
michael@0 858 }
michael@0 859 /* else converter or tag not found */
michael@0 860 }
michael@0 861 return myEnum;
michael@0 862 }
michael@0 863
michael@0 864 static uint16_t
michael@0 865 ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) {
michael@0 866 if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
michael@0 867 uint32_t convNum = findConverter(alias, NULL, pErrorCode);
michael@0 868 if (convNum < gMainTable.converterListSize) {
michael@0 869 /* tagListNum - 1 is the ALL tag */
michael@0 870 int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum];
michael@0 871
michael@0 872 if (listOffset) {
michael@0 873 return gMainTable.taggedAliasLists[listOffset];
michael@0 874 }
michael@0 875 /* else this shouldn't happen. internal program error */
michael@0 876 }
michael@0 877 /* else converter not found */
michael@0 878 }
michael@0 879 return 0;
michael@0 880 }
michael@0 881
michael@0 882 static uint16_t
michael@0 883 ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) {
michael@0 884 if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
michael@0 885 uint32_t currAlias;
michael@0 886 uint32_t convNum = findConverter(alias, NULL, pErrorCode);
michael@0 887 if (convNum < gMainTable.converterListSize) {
michael@0 888 /* tagListNum - 1 is the ALL tag */
michael@0 889 int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum];
michael@0 890
michael@0 891 if (listOffset) {
michael@0 892 uint32_t listCount = gMainTable.taggedAliasLists[listOffset];
michael@0 893 /* +1 to skip listCount */
michael@0 894 const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1;
michael@0 895
michael@0 896 for (currAlias = start; currAlias < listCount; currAlias++) {
michael@0 897 aliases[currAlias] = GET_STRING(currList[currAlias]);
michael@0 898 }
michael@0 899 }
michael@0 900 /* else this shouldn't happen. internal program error */
michael@0 901 }
michael@0 902 /* else converter not found */
michael@0 903 }
michael@0 904 return 0;
michael@0 905 }
michael@0 906
michael@0 907 static const char *
michael@0 908 ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) {
michael@0 909 if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
michael@0 910 uint32_t convNum = findConverter(alias, NULL, pErrorCode);
michael@0 911 if (convNum < gMainTable.converterListSize) {
michael@0 912 /* tagListNum - 1 is the ALL tag */
michael@0 913 int32_t listOffset = gMainTable.taggedAliasArray[(gMainTable.tagListSize - 1)*gMainTable.converterListSize + convNum];
michael@0 914
michael@0 915 if (listOffset) {
michael@0 916 uint32_t listCount = gMainTable.taggedAliasLists[listOffset];
michael@0 917 /* +1 to skip listCount */
michael@0 918 const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1;
michael@0 919
michael@0 920 if (n < listCount) {
michael@0 921 return GET_STRING(currList[n]);
michael@0 922 }
michael@0 923 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 924 }
michael@0 925 /* else this shouldn't happen. internal program error */
michael@0 926 }
michael@0 927 /* else converter not found */
michael@0 928 }
michael@0 929 return NULL;
michael@0 930 }
michael@0 931
michael@0 932 static uint16_t
michael@0 933 ucnv_io_countStandards(UErrorCode *pErrorCode) {
michael@0 934 if (haveAliasData(pErrorCode)) {
michael@0 935 /* Don't include the empty list */
michael@0 936 return (uint16_t)(gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS);
michael@0 937 }
michael@0 938
michael@0 939 return 0;
michael@0 940 }
michael@0 941
michael@0 942 U_CAPI const char * U_EXPORT2
michael@0 943 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) {
michael@0 944 if (haveAliasData(pErrorCode)) {
michael@0 945 if (n < gMainTable.tagListSize - UCNV_NUM_HIDDEN_TAGS) {
michael@0 946 return GET_STRING(gMainTable.tagList[n]);
michael@0 947 }
michael@0 948 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 949 }
michael@0 950
michael@0 951 return NULL;
michael@0 952 }
michael@0 953
michael@0 954 U_CAPI const char * U_EXPORT2
michael@0 955 ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) {
michael@0 956 if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
michael@0 957 uint32_t listOffset = findTaggedAliasListsOffset(alias, standard, pErrorCode);
michael@0 958
michael@0 959 if (0 < listOffset && listOffset < gMainTable.taggedAliasListsSize) {
michael@0 960 const uint16_t *currList = gMainTable.taggedAliasLists + listOffset + 1;
michael@0 961
michael@0 962 /* Get the preferred name from this list */
michael@0 963 if (currList[0]) {
michael@0 964 return GET_STRING(currList[0]);
michael@0 965 }
michael@0 966 /* else someone screwed up the alias table. */
michael@0 967 /* *pErrorCode = U_INVALID_FORMAT_ERROR */
michael@0 968 }
michael@0 969 }
michael@0 970
michael@0 971 return NULL;
michael@0 972 }
michael@0 973
michael@0 974 U_CAPI uint16_t U_EXPORT2
michael@0 975 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode)
michael@0 976 {
michael@0 977 return ucnv_io_countAliases(alias, pErrorCode);
michael@0 978 }
michael@0 979
michael@0 980
michael@0 981 U_CAPI const char* U_EXPORT2
michael@0 982 ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode)
michael@0 983 {
michael@0 984 return ucnv_io_getAlias(alias, n, pErrorCode);
michael@0 985 }
michael@0 986
michael@0 987 U_CAPI void U_EXPORT2
michael@0 988 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode)
michael@0 989 {
michael@0 990 ucnv_io_getAliases(alias, 0, aliases, pErrorCode);
michael@0 991 }
michael@0 992
michael@0 993 U_CAPI uint16_t U_EXPORT2
michael@0 994 ucnv_countStandards(void)
michael@0 995 {
michael@0 996 UErrorCode err = U_ZERO_ERROR;
michael@0 997 return ucnv_io_countStandards(&err);
michael@0 998 }
michael@0 999
michael@0 1000 U_CAPI const char * U_EXPORT2
michael@0 1001 ucnv_getCanonicalName(const char *alias, const char *standard, UErrorCode *pErrorCode) {
michael@0 1002 if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
michael@0 1003 uint32_t convNum = findTaggedConverterNum(alias, standard, pErrorCode);
michael@0 1004
michael@0 1005 if (convNum < gMainTable.converterListSize) {
michael@0 1006 return GET_STRING(gMainTable.converterList[convNum]);
michael@0 1007 }
michael@0 1008 }
michael@0 1009
michael@0 1010 return NULL;
michael@0 1011 }
michael@0 1012
michael@0 1013 static int32_t U_CALLCONV
michael@0 1014 ucnv_io_countAllConverters(UEnumeration * /*enumerator*/, UErrorCode * /*pErrorCode*/) {
michael@0 1015 return gMainTable.converterListSize;
michael@0 1016 }
michael@0 1017
michael@0 1018 static const char* U_CALLCONV
michael@0 1019 ucnv_io_nextAllConverters(UEnumeration *enumerator,
michael@0 1020 int32_t* resultLength,
michael@0 1021 UErrorCode * /*pErrorCode*/)
michael@0 1022 {
michael@0 1023 uint16_t *myContext = (uint16_t *)(enumerator->context);
michael@0 1024
michael@0 1025 if (*myContext < gMainTable.converterListSize) {
michael@0 1026 const char *myStr = GET_STRING(gMainTable.converterList[(*myContext)++]);
michael@0 1027 if (resultLength) {
michael@0 1028 *resultLength = (int32_t)uprv_strlen(myStr);
michael@0 1029 }
michael@0 1030 return myStr;
michael@0 1031 }
michael@0 1032 /* Either we accessed a zero length list, or we enumerated too far. */
michael@0 1033 if (resultLength) {
michael@0 1034 *resultLength = 0;
michael@0 1035 }
michael@0 1036 return NULL;
michael@0 1037 }
michael@0 1038
michael@0 1039 static void U_CALLCONV
michael@0 1040 ucnv_io_resetAllConverters(UEnumeration *enumerator, UErrorCode * /*pErrorCode*/) {
michael@0 1041 *((uint16_t *)(enumerator->context)) = 0;
michael@0 1042 }
michael@0 1043
michael@0 1044 static const UEnumeration gEnumAllConverters = {
michael@0 1045 NULL,
michael@0 1046 NULL,
michael@0 1047 ucnv_io_closeUEnumeration,
michael@0 1048 ucnv_io_countAllConverters,
michael@0 1049 uenum_unextDefault,
michael@0 1050 ucnv_io_nextAllConverters,
michael@0 1051 ucnv_io_resetAllConverters
michael@0 1052 };
michael@0 1053
michael@0 1054 U_CAPI UEnumeration * U_EXPORT2
michael@0 1055 ucnv_openAllNames(UErrorCode *pErrorCode) {
michael@0 1056 UEnumeration *myEnum = NULL;
michael@0 1057 if (haveAliasData(pErrorCode)) {
michael@0 1058 uint16_t *myContext;
michael@0 1059
michael@0 1060 myEnum = static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration)));
michael@0 1061 if (myEnum == NULL) {
michael@0 1062 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 1063 return NULL;
michael@0 1064 }
michael@0 1065 uprv_memcpy(myEnum, &gEnumAllConverters, sizeof(UEnumeration));
michael@0 1066 myContext = static_cast<uint16_t *>(uprv_malloc(sizeof(uint16_t)));
michael@0 1067 if (myContext == NULL) {
michael@0 1068 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
michael@0 1069 uprv_free(myEnum);
michael@0 1070 return NULL;
michael@0 1071 }
michael@0 1072 *myContext = 0;
michael@0 1073 myEnum->context = myContext;
michael@0 1074 }
michael@0 1075 return myEnum;
michael@0 1076 }
michael@0 1077
michael@0 1078 U_CFUNC uint16_t
michael@0 1079 ucnv_io_countKnownConverters(UErrorCode *pErrorCode) {
michael@0 1080 if (haveAliasData(pErrorCode)) {
michael@0 1081 return (uint16_t)gMainTable.converterListSize;
michael@0 1082 }
michael@0 1083 return 0;
michael@0 1084 }
michael@0 1085
michael@0 1086 /* alias table swapping ----------------------------------------------------- */
michael@0 1087
michael@0 1088 typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name);
michael@0 1089
michael@0 1090 /*
michael@0 1091 * row of a temporary array
michael@0 1092 *
michael@0 1093 * gets platform-endian charset string indexes and sorting indexes;
michael@0 1094 * after sorting this array by strings, the actual arrays are permutated
michael@0 1095 * according to the sorting indexes
michael@0 1096 */
michael@0 1097 typedef struct TempRow {
michael@0 1098 uint16_t strIndex, sortIndex;
michael@0 1099 } TempRow;
michael@0 1100
michael@0 1101 typedef struct TempAliasTable {
michael@0 1102 const char *chars;
michael@0 1103 TempRow *rows;
michael@0 1104 uint16_t *resort;
michael@0 1105 StripForCompareFn *stripForCompare;
michael@0 1106 } TempAliasTable;
michael@0 1107
michael@0 1108 enum {
michael@0 1109 STACK_ROW_CAPACITY=500
michael@0 1110 };
michael@0 1111
michael@0 1112 static int32_t
michael@0 1113 io_compareRows(const void *context, const void *left, const void *right) {
michael@0 1114 char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH],
michael@0 1115 strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH];
michael@0 1116
michael@0 1117 TempAliasTable *tempTable=(TempAliasTable *)context;
michael@0 1118 const char *chars=tempTable->chars;
michael@0 1119
michael@0 1120 return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const TempRow *)left)->strIndex),
michael@0 1121 tempTable->stripForCompare(strippedRight, chars+2*((const TempRow *)right)->strIndex));
michael@0 1122 }
michael@0 1123
/*
 * ucnv_swapAliases: swap a converter alias table (dataFormat "CvAl",
 * format version 3) using the swapper ds.
 *
 * ds          swapper supplying readUInt16/readUInt32, swapArray16,
 *             swapArray32, swapInvChars and the inCharset/outCharset values
 * inData      input data, starting with the data header
 * length      number of input bytes; when negative, the length checks and
 *             all writes to outData are skipped and only the size is computed
 * outData     output data; the code handles the case where the alias list
 *             pointers of input and output are identical (in-place swap)
 * pErrorCode  in/out ICU error code
 *
 * Returns headerSize + 2*topOffset (header bytes plus two bytes per 16-bit
 * unit of table data), or 0 after any failure.
 *
 * Failures set by this function:
 *   U_UNSUPPORTED_ERROR        data format or format version does not match
 *   U_INDEX_OUTOFBOUNDS_ERROR  fewer bytes than the table needs
 *   U_INVALID_FORMAT_ERROR     table-of-contents length outside
 *                              [minTocLength, offsetsCount)
 *   U_MEMORY_ALLOCATION_ERROR  the temporary sorting table could not be allocated
 */
michael@0 1124 U_CAPI int32_t U_EXPORT2
michael@0 1125 ucnv_swapAliases(const UDataSwapper *ds,
michael@0 1126 const void *inData, int32_t length, void *outData,
michael@0 1127 UErrorCode *pErrorCode) {
michael@0 1128 const UDataInfo *pInfo;
michael@0 1129 int32_t headerSize;
michael@0 1130
michael@0 1131 const uint16_t *inTable;
michael@0 1132 const uint32_t *inSectionSizes;
/* toc[] receives the values read from the table of contents (entry 0 is its length) */
michael@0 1133 uint32_t toc[offsetsCount];
michael@0 1134 uint32_t offsets[offsetsCount]; /* 16-bit-addressed offsets from inTable/outTable */
michael@0 1135 uint32_t i, count, tocLength, topOffset;
michael@0 1136
/* stack storage for the sort; used only when count<=STACK_ROW_CAPACITY */
michael@0 1137 TempRow rows[STACK_ROW_CAPACITY];
michael@0 1138 uint16_t resort[STACK_ROW_CAPACITY];
michael@0 1139 TempAliasTable tempTable;
michael@0 1140
michael@0 1141 /* udata_swapDataHeader checks the arguments */
michael@0 1142 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0 1143 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 1144 return 0;
michael@0 1145 }
michael@0 1146
michael@0 1147 /* check data format and format version */
/* the UDataInfo is read at byte offset 4 from the start of inData */
michael@0 1148 pInfo=(const UDataInfo *)((const char *)inData+4);
michael@0 1149 if(!(
michael@0 1150 pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */
michael@0 1151 pInfo->dataFormat[1]==0x76 &&
michael@0 1152 pInfo->dataFormat[2]==0x41 &&
michael@0 1153 pInfo->dataFormat[3]==0x6c &&
michael@0 1154 pInfo->formatVersion[0]==3
michael@0 1155 )) {
michael@0 1156 udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n",
michael@0 1157 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 1158 pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0 1159 pInfo->formatVersion[0]);
michael@0 1160 *pErrorCode=U_UNSUPPORTED_ERROR;
michael@0 1161 return 0;
michael@0 1162 }
michael@0 1163
michael@0 1164 /* an alias table must contain at least the table of contents array */
/* 4 bytes per 32-bit entry: the length entry plus minTocLength section entries */
michael@0 1165 if(length>=0 && (length-headerSize)<4*(1+minTocLength)) {
michael@0 1166 udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
michael@0 1167 length-headerSize);
michael@0 1168 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1169 return 0;
michael@0 1170 }
michael@0 1171
/* the same memory is viewed as 32-bit toc entries and as 16-bit table units */
michael@0 1172 inSectionSizes=(const uint32_t *)((const char *)inData+headerSize);
michael@0 1173 inTable=(const uint16_t *)inSectionSizes;
michael@0 1174 uprv_memset(toc, 0, sizeof(toc));
michael@0 1175 toc[tocLengthIndex]=tocLength=ds->readUInt32(inSectionSizes[tocLengthIndex]);
/* tocLength must be a valid index into toc[]/offsets[], hence strictly below offsetsCount */
michael@0 1176 if(tocLength<minTocLength || offsetsCount<=tocLength) {
michael@0 1177 udata_printError(ds, "ucnv_swapAliases(): table of contents contains unsupported number of sections (%u sections)\n", tocLength);
michael@0 1178 *pErrorCode=U_INVALID_FORMAT_ERROR;
michael@0 1179 return 0;
michael@0 1180 }
michael@0 1181
michael@0 1182 /* read the known part of the table of contents */
/*
 * NOTE(review): only 1+minTocLength entries were length-checked above; entries
 * up to tocLength are read here before the topOffset check further down.
 * Confirm that callers always provide enough input.
 */
michael@0 1183 for(i=converterListIndex; i<=tocLength; ++i) {
michael@0 1184 toc[i]=ds->readUInt32(inSectionSizes[i]);
michael@0 1185 }
michael@0 1186
michael@0 1187 /* compute offsets */
michael@0 1188 uprv_memset(offsets, 0, sizeof(offsets));
michael@0 1189 offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */
/* each later section starts where the previous one ends */
michael@0 1190 for(i=tagListIndex; i<=tocLength; ++i) {
michael@0 1191 offsets[i]=offsets[i-1]+toc[i-1];
michael@0 1192 }
michael@0 1193
michael@0 1194 /* compute the overall size of the after-header data, in numbers of 16-bit units */
/* i==tocLength+1 after the loop, so i-1 is the last section */
michael@0 1195 topOffset=offsets[i-1]+toc[i-1];
michael@0 1196
/* everything that reads table contents or writes outData happens only for length>=0 */
michael@0 1197 if(length>=0) {
michael@0 1198 uint16_t *outTable;
michael@0 1199 const uint16_t *p, *p2;
michael@0 1200 uint16_t *q, *q2;
michael@0 1201 uint16_t oldIndex;
michael@0 1202
michael@0 1203 if((length-headerSize)<(2*(int32_t)topOffset)) {
michael@0 1204 udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
michael@0 1205 length-headerSize);
michael@0 1206 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1207 return 0;
michael@0 1208 }
michael@0 1209
michael@0 1210 outTable=(uint16_t *)((char *)outData+headerSize);
michael@0 1211
michael@0 1212 /* swap the entire table of contents */
michael@0 1213 ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode);
michael@0 1214
michael@0 1215 /* swap unnormalized strings & normalized strings */
/* both string sections are handled as one run starting at the string table offset */
michael@0 1216 ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)(toc[stringTableIndex]+toc[normalizedStringTableIndex]),
michael@0 1217 outTable+offsets[stringTableIndex], pErrorCode);
michael@0 1218 if(U_FAILURE(*pErrorCode)) {
michael@0 1219 udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed\n");
michael@0 1220 return 0;
michael@0 1221 }
michael@0 1222
michael@0 1223 if(ds->inCharset==ds->outCharset) {
michael@0 1224 /* no need to sort, just swap all 16-bit values together */
michael@0 1225 ds->swapArray16(ds,
michael@0 1226 inTable+offsets[converterListIndex],
michael@0 1227 2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]),
michael@0 1228 outTable+offsets[converterListIndex],
michael@0 1229 pErrorCode);
michael@0 1230 } else {
michael@0 1231 /* allocate the temporary table for sorting */
michael@0 1232 count=toc[aliasListIndex];
michael@0 1233
michael@0 1234 tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */
michael@0 1235
michael@0 1236 if(count<=STACK_ROW_CAPACITY) {
michael@0 1237 tempTable.rows=rows;
michael@0 1238 tempTable.resort=resort;
michael@0 1239 } else {
/* one allocation: count TempRow items followed by count 2-byte resort slots */
michael@0 1240 tempTable.rows=(TempRow *)uprv_malloc(count*sizeof(TempRow)+count*2);
michael@0 1241 if(tempTable.rows==NULL) {
michael@0 1242 udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n",
michael@0 1243 count);
michael@0 1244 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 1245 return 0;
michael@0 1246 }
michael@0 1247 tempTable.resort=(uint16_t *)(tempTable.rows+count);
michael@0 1248 }
michael@0 1249
/* pick the name-stripping function that matches the output charset family */
michael@0 1250 if(ds->outCharset==U_ASCII_FAMILY) {
michael@0 1251 tempTable.stripForCompare=ucnv_io_stripASCIIForCompare;
michael@0 1252 } else /* U_EBCDIC_FAMILY */ {
michael@0 1253 tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare;
michael@0 1254 }
michael@0 1255
michael@0 1256 /*
michael@0 1257 * Sort unique aliases+mapped names.
michael@0 1258 *
michael@0 1259 * We need to sort the list again by outCharset strings because they
michael@0 1260 * sort differently for different charset families.
michael@0 1261 * First we set up a temporary table with the string indexes and
michael@0 1262 * sorting indexes and sort that.
michael@0 1263 * Then we permutate and copy/swap the actual values.
michael@0 1264 */
/* p/q: alias list in input/output; p2/q2: untagged converter array in input/output */
michael@0 1265 p=inTable+offsets[aliasListIndex];
michael@0 1266 q=outTable+offsets[aliasListIndex];
michael@0 1267
michael@0 1268 p2=inTable+offsets[untaggedConvArrayIndex];
michael@0 1269 q2=outTable+offsets[untaggedConvArrayIndex];
michael@0 1270
/* sortIndex records each row's position before sorting */
michael@0 1271 for(i=0; i<count; ++i) {
michael@0 1272 tempTable.rows[i].strIndex=ds->readUInt16(p[i]);
michael@0 1273 tempTable.rows[i].sortIndex=(uint16_t)i;
michael@0 1274 }
michael@0 1275
michael@0 1276 uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(TempRow),
michael@0 1277 io_compareRows, &tempTable,
michael@0 1278 FALSE, pErrorCode);
michael@0 1279
michael@0 1280 if(U_SUCCESS(*pErrorCode)) {
michael@0 1281 /* copy/swap/permutate items */
michael@0 1282 if(p!=q) {
/* separate buffers: item at old position oldIndex goes straight to new position i */
michael@0 1283 for(i=0; i<count; ++i) {
michael@0 1284 oldIndex=tempTable.rows[i].sortIndex;
michael@0 1285 ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode);
michael@0 1286 ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode);
michael@0 1287 }
michael@0 1288 } else {
michael@0 1289 /*
michael@0 1290 * If we swap in-place, then the permutation must use another
michael@0 1291 * temporary array (tempTable.resort)
michael@0 1292 * before the results are copied to the outBundle.
michael@0 1293 */
michael@0 1294 uint16_t *r=tempTable.resort;
michael@0 1295
/* first the alias list: permute into r, then copy r back over q */
michael@0 1296 for(i=0; i<count; ++i) {
michael@0 1297 oldIndex=tempTable.rows[i].sortIndex;
michael@0 1298 ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode);
michael@0 1299 }
michael@0 1300 uprv_memcpy(q, r, 2*count);
michael@0 1301
/* then the untagged converter array, reusing r */
michael@0 1302 for(i=0; i<count; ++i) {
michael@0 1303 oldIndex=tempTable.rows[i].sortIndex;
michael@0 1304 ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode);
michael@0 1305 }
michael@0 1306 uprv_memcpy(q2, r, 2*count);
michael@0 1307 }
michael@0 1308 }
michael@0 1309
/* release the sorting table only if it came from uprv_malloc() */
michael@0 1310 if(tempTable.rows!=rows) {
michael@0 1311 uprv_free(tempTable.rows);
michael@0 1312 }
michael@0 1313
/* checked after the free above so the heap table is released on failure too */
michael@0 1314 if(U_FAILURE(*pErrorCode)) {
michael@0 1315 udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed\n",
michael@0 1316 count);
michael@0 1317 return 0;
michael@0 1318 }
michael@0 1319
michael@0 1320 /* swap remaining 16-bit values */
/* first the units from the converter list up to the alias list ... */
michael@0 1321 ds->swapArray16(ds,
michael@0 1322 inTable+offsets[converterListIndex],
michael@0 1323 2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]),
michael@0 1324 outTable+offsets[converterListIndex],
michael@0 1325 pErrorCode);
/* ... then the units from the tagged alias array up to the string table */
michael@0 1326 ds->swapArray16(ds,
michael@0 1327 inTable+offsets[taggedAliasArrayIndex],
michael@0 1328 2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]),
michael@0 1329 outTable+offsets[taggedAliasArrayIndex],
michael@0 1330 pErrorCode);
michael@0 1331 }
michael@0 1332 }
michael@0 1333
/* total size in bytes: header plus two bytes per 16-bit unit */
michael@0 1334 return headerSize+2*(int32_t)topOffset;
michael@0 1335 }
michael@0 1336
michael@0 1337 #endif
michael@0 1338
michael@0 1339
michael@0 1340 /*
michael@0 1341 * Hey, Emacs, please set the following:
michael@0 1342 *
michael@0 1343 * Local Variables:
michael@0 1344 * indent-tabs-mode: nil
michael@0 1345 * End:
michael@0 1346 *
michael@0 1347 */

mercurial