intl/icu/source/i18n/ucol_sit.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2004-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: ucol_sit.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * Modification history
michael@0 12 * Date Name Comments
michael@0 13 * 03/12/2004 weiv Creation
michael@0 14 */
michael@0 15
michael@0 16 #include "unicode/ustring.h"
michael@0 17 #include "unicode/udata.h"
michael@0 18
michael@0 19 #include "utracimp.h"
michael@0 20 #include "ucol_imp.h"
michael@0 21 #include "ucol_tok.h"
michael@0 22 #include "cmemory.h"
michael@0 23 #include "cstring.h"
michael@0 24 #include "uresimp.h"
michael@0 25 #include "unicode/coll.h"
michael@0 26
michael@0 27 #ifdef UCOL_TRACE_SIT
michael@0 28 # include <stdio.h>
michael@0 29 #endif
michael@0 30
michael@0 31 #if !UCONFIG_NO_COLLATION
michael@0 32
/* Identifiers for every item that may appear in a collator short string.
 * The first group (up to UCOL_SIT_LOCELEMENT_MAX) doubles as the index
 * into CollatorSpec::locElements.
 */
enum OptionsList {
    /* --- locale elements --- */
    UCOL_SIT_LANGUAGE       = 0,
    UCOL_SIT_SCRIPT         = 1,
    UCOL_SIT_REGION         = 2,
    UCOL_SIT_VARIANT        = 3,
    UCOL_SIT_KEYWORD        = 4,
    UCOL_SIT_PROVIDER       = 5,
    /* the last element that's part of LocElements */
    UCOL_SIT_LOCELEMENT_MAX = UCOL_SIT_PROVIDER,

    /* --- everything else; values continue after the locale elements --- */
    UCOL_SIT_BCP47          = UCOL_SIT_LOCELEMENT_MAX + 1,
    UCOL_SIT_STRENGTH,
    UCOL_SIT_CASE_LEVEL,
    UCOL_SIT_CASE_FIRST,
    UCOL_SIT_NUMERIC_COLLATION,
    UCOL_SIT_ALTERNATE_HANDLING,
    UCOL_SIT_NORMALIZATION_MODE,
    UCOL_SIT_FRENCH_COLLATION,
    UCOL_SIT_HIRAGANA_QUATERNARY,
    UCOL_SIT_VARIABLE_TOP,
    UCOL_SIT_VARIABLE_TOP_VALUE,

    /* number of distinct items; used to size the parse tables */
    UCOL_SIT_ITEMS_COUNT
};
michael@0 55
/* Option starter characters: the single letter that introduces each
 * option inside a short string, grouped by what the option configures.
 */

/* locale elements */
static const char languageArg       = 'L';
static const char scriptArg         = 'Z';
static const char regionArg         = 'R';
static const char variantArg        = 'V';
static const char keywordArg        = 'K';
static const char providerArg       = 'P';
static const char RFC3066Arg        = 'X';

/* collation attributes */
static const char alternateHArg     = 'A';
static const char caseFirstArg      = 'C';
static const char numericCollArg    = 'D';
static const char caseLevelArg      = 'E';
static const char frenchCollArg     = 'F';
static const char hiraganaQArg      = 'H';
static const char normArg           = 'N';
static const char strengthArg       = 'S';

/* variable top, either as a string ('T') or as a value ('B') */
static const char variableTopArg    = 'T';
static const char variableTopValArg = 'B';

/* keyword prefixes appended to the locale ID when it is assembled */
static const char collationKeyword[] = "@collation=";
static const char providerKeyword[]  = "@sp=";
michael@0 77
michael@0 78
michael@0 79 static const int32_t locElementCount = UCOL_SIT_LOCELEMENT_MAX+1;
michael@0 80 static const int32_t locElementCapacity = 32;
michael@0 81 static const int32_t loc3066Capacity = 256;
michael@0 82 static const int32_t locProviderCapacity = 10;
michael@0 83 static const int32_t internalBufferSize = 512;
michael@0 84
michael@0 85 /* structure containing specification of a collator. Initialized
michael@0 86 * from a short string. Also used to construct a short string from a
michael@0 87 * collator instance
michael@0 88 */
michael@0 89 struct CollatorSpec {
michael@0 90 char locElements[locElementCount][locElementCapacity];
michael@0 91 char locale[loc3066Capacity];
michael@0 92 char provider[locProviderCapacity];
michael@0 93 UColAttributeValue options[UCOL_ATTRIBUTE_COUNT];
michael@0 94 uint32_t variableTopValue;
michael@0 95 UChar variableTopString[locElementCapacity];
michael@0 96 int32_t variableTopStringLen;
michael@0 97 UBool variableTopSet;
michael@0 98 struct {
michael@0 99 const char *start;
michael@0 100 int32_t len;
michael@0 101 } entries[UCOL_SIT_ITEMS_COUNT];
michael@0 102 };
michael@0 103
michael@0 104
michael@0 105 /* structure for converting between character attribute
michael@0 106 * representation and real collation attribute value.
michael@0 107 */
michael@0 108 struct AttributeConversion {
michael@0 109 char letter;
michael@0 110 UColAttributeValue value;
michael@0 111 };
michael@0 112
michael@0 113 static const AttributeConversion conversions[12] = {
michael@0 114 { '1', UCOL_PRIMARY },
michael@0 115 { '2', UCOL_SECONDARY },
michael@0 116 { '3', UCOL_TERTIARY },
michael@0 117 { '4', UCOL_QUATERNARY },
michael@0 118 { 'D', UCOL_DEFAULT },
michael@0 119 { 'I', UCOL_IDENTICAL },
michael@0 120 { 'L', UCOL_LOWER_FIRST },
michael@0 121 { 'N', UCOL_NON_IGNORABLE },
michael@0 122 { 'O', UCOL_ON },
michael@0 123 { 'S', UCOL_SHIFTED },
michael@0 124 { 'U', UCOL_UPPER_FIRST },
michael@0 125 { 'X', UCOL_OFF }
michael@0 126 };
michael@0 127
michael@0 128
michael@0 129 static char
michael@0 130 ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) {
michael@0 131 uint32_t i = 0;
michael@0 132 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
michael@0 133 if(conversions[i].value == value) {
michael@0 134 return conversions[i].letter;
michael@0 135 }
michael@0 136 }
michael@0 137 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 138 #ifdef UCOL_TRACE_SIT
michael@0 139 fprintf(stderr, "%s:%d: unknown UColAttributeValue %d: %s\n", __FILE__, __LINE__, value, u_errorName(*status));
michael@0 140 #endif
michael@0 141 return 0;
michael@0 142 }
michael@0 143
michael@0 144 static UColAttributeValue
michael@0 145 ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) {
michael@0 146 uint32_t i = 0;
michael@0 147 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
michael@0 148 if(conversions[i].letter == letter) {
michael@0 149 return conversions[i].value;
michael@0 150 }
michael@0 151 }
michael@0 152 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 153 #ifdef UCOL_TRACE_SIT
michael@0 154 fprintf(stderr, "%s:%d: unknown letter %c: %s\n", __FILE__, __LINE__, letter, u_errorName(*status));
michael@0 155 #endif
michael@0 156 return UCOL_DEFAULT;
michael@0 157 }
michael@0 158
michael@0 159 /* function prototype for functions used to parse a short string */
michael@0 160 U_CDECL_BEGIN
michael@0 161 typedef const char* U_CALLCONV
michael@0 162 ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string,
michael@0 163 UErrorCode *status);
michael@0 164 U_CDECL_END
michael@0 165
michael@0 166 U_CDECL_BEGIN
michael@0 167 static const char* U_CALLCONV
michael@0 168 _processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string,
michael@0 169 UErrorCode *status)
michael@0 170 {
michael@0 171 int32_t len = 0;
michael@0 172 do {
michael@0 173 if(value == UCOL_SIT_LANGUAGE || value == UCOL_SIT_KEYWORD || value == UCOL_SIT_PROVIDER) {
michael@0 174 spec->locElements[value][len++] = uprv_tolower(*string);
michael@0 175 } else {
michael@0 176 spec->locElements[value][len++] = *string;
michael@0 177 }
michael@0 178 } while(*(++string) != '_' && *string && len < locElementCapacity);
michael@0 179 if(len >= locElementCapacity) {
michael@0 180 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 181 return string;
michael@0 182 }
michael@0 183 // don't skip the underscore at the end
michael@0 184 return string;
michael@0 185 }
michael@0 186 U_CDECL_END
michael@0 187
michael@0 188 U_CDECL_BEGIN
michael@0 189 static const char* U_CALLCONV
michael@0 190 _processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string,
michael@0 191 UErrorCode *status)
michael@0 192 {
michael@0 193 char terminator = *string;
michael@0 194 string++;
michael@0 195 const char *end = uprv_strchr(string+1, terminator);
michael@0 196 if(end == NULL || end - string >= loc3066Capacity) {
michael@0 197 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 198 return string;
michael@0 199 } else {
michael@0 200 uprv_strncpy(spec->locale, string, end-string);
michael@0 201 return end+1;
michael@0 202 }
michael@0 203 }
michael@0 204
michael@0 205 U_CDECL_END
michael@0 206
michael@0 207 U_CDECL_BEGIN
michael@0 208 static const char* U_CALLCONV
michael@0 209 _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string,
michael@0 210 UErrorCode *status)
michael@0 211 {
michael@0 212 spec->options[option] = ucol_sit_letterToAttributeValue(*string, status);
michael@0 213 if((*(++string) != '_' && *string) || U_FAILURE(*status)) {
michael@0 214 #ifdef UCOL_TRACE_SIT
michael@0 215 fprintf(stderr, "%s:%d: unknown collator option at '%s': %s\n", __FILE__, __LINE__, string, u_errorName(*status));
michael@0 216 #endif
michael@0 217 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 218 }
michael@0 219 return string;
michael@0 220 }
michael@0 221 U_CDECL_END
michael@0 222
michael@0 223
michael@0 224 static UChar
michael@0 225 readHexCodeUnit(const char **string, UErrorCode *status)
michael@0 226 {
michael@0 227 UChar result = 0;
michael@0 228 int32_t value = 0;
michael@0 229 char c;
michael@0 230 int32_t noDigits = 0;
michael@0 231 while((c = **string) != 0 && noDigits < 4) {
michael@0 232 if( c >= '0' && c <= '9') {
michael@0 233 value = c - '0';
michael@0 234 } else if ( c >= 'a' && c <= 'f') {
michael@0 235 value = c - 'a' + 10;
michael@0 236 } else if ( c >= 'A' && c <= 'F') {
michael@0 237 value = c - 'A' + 10;
michael@0 238 } else {
michael@0 239 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 240 #ifdef UCOL_TRACE_SIT
michael@0 241 fprintf(stderr, "%s:%d: Bad hex char at '%s': %s\n", __FILE__, __LINE__, *string, u_errorName(*status));
michael@0 242 #endif
michael@0 243 return 0;
michael@0 244 }
michael@0 245 result = (result << 4) | (UChar)value;
michael@0 246 noDigits++;
michael@0 247 (*string)++;
michael@0 248 }
michael@0 249 // if the string was terminated before we read 4 digits, set an error
michael@0 250 if(noDigits < 4) {
michael@0 251 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 252 #ifdef UCOL_TRACE_SIT
michael@0 253 fprintf(stderr, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__, __LINE__, noDigits,*string, u_errorName(*status));
michael@0 254 #endif
michael@0 255 }
michael@0 256 return result;
michael@0 257 }
michael@0 258
michael@0 259 U_CDECL_BEGIN
michael@0 260 static const char* U_CALLCONV
michael@0 261 _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status)
michael@0 262 {
michael@0 263 // get four digits
michael@0 264 int32_t i = 0;
michael@0 265 if(!value1) {
michael@0 266 while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') {
michael@0 267 spec->variableTopString[i++] = readHexCodeUnit(&string, status);
michael@0 268 }
michael@0 269 spec->variableTopStringLen = i;
michael@0 270 if(i == locElementCapacity && *string != 0 && *string != '_') {
michael@0 271 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 272 }
michael@0 273 } else {
michael@0 274 spec->variableTopValue = readHexCodeUnit(&string, status);
michael@0 275 }
michael@0 276 if(U_SUCCESS(*status)) {
michael@0 277 spec->variableTopSet = TRUE;
michael@0 278 }
michael@0 279 return string;
michael@0 280 }
michael@0 281 U_CDECL_END
michael@0 282
michael@0 283
michael@0 284 /* Table for parsing short strings */
michael@0 285 struct ShortStringOptions {
michael@0 286 char optionStart;
michael@0 287 ActionFunction *action;
michael@0 288 uint32_t attr;
michael@0 289 };
michael@0 290
michael@0 291 static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] =
michael@0 292 {
michael@0 293 /* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D
michael@0 294 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 },
michael@0 295 /* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D
michael@0 296 /* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D
michael@0 297 /* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D
michael@0 298 /* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D
michael@0 299 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D
michael@0 300 /* 04 KEYWORD */ {keywordArg, _processLocaleElement, UCOL_SIT_KEYWORD }, // keyword
michael@0 301 /* 00 LANGUAGE */ {languageArg, _processLocaleElement, UCOL_SIT_LANGUAGE }, // language
michael@0 302 /* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D
michael@0 303 /* 02 REGION */ {regionArg, _processLocaleElement, UCOL_SIT_REGION }, // region
michael@0 304 /* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D
michael@0 305 /* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 },
michael@0 306 /* 03 VARIANT */ {variantArg, _processLocaleElement, UCOL_SIT_VARIANT }, // variant
michael@0 307 /* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name
michael@0 308 /* 01 SCRIPT */ {scriptArg, _processLocaleElement, UCOL_SIT_SCRIPT }, // script
michael@0 309 /* PROVIDER */ {providerArg, _processLocaleElement, UCOL_SIT_PROVIDER }
michael@0 310 };
michael@0 311
michael@0 312
michael@0 313 static
michael@0 314 const char* ucol_sit_readOption(const char *start, CollatorSpec *spec,
michael@0 315 UErrorCode *status)
michael@0 316 {
michael@0 317 int32_t i = 0;
michael@0 318
michael@0 319 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
michael@0 320 if(*start == options[i].optionStart) {
michael@0 321 spec->entries[i].start = start;
michael@0 322 const char* end = options[i].action(spec, options[i].attr, start+1, status);
michael@0 323 spec->entries[i].len = (int32_t)(end - start);
michael@0 324 return end;
michael@0 325 }
michael@0 326 }
michael@0 327 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 328 #ifdef UCOL_TRACE_SIT
michael@0 329 fprintf(stderr, "%s:%d: Unknown option at '%s': %s\n", __FILE__, __LINE__, start, u_errorName(*status));
michael@0 330 #endif
michael@0 331 return start;
michael@0 332 }
michael@0 333
michael@0 334 static
michael@0 335 void ucol_sit_initCollatorSpecs(CollatorSpec *spec)
michael@0 336 {
michael@0 337 // reset everything
michael@0 338 uprv_memset(spec, 0, sizeof(CollatorSpec));
michael@0 339 // set collation options to default
michael@0 340 int32_t i = 0;
michael@0 341 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
michael@0 342 spec->options[i] = UCOL_DEFAULT;
michael@0 343 }
michael@0 344 }
michael@0 345
michael@0 346 static const char*
michael@0 347 ucol_sit_readSpecs(CollatorSpec *s, const char *string,
michael@0 348 UParseError *parseError, UErrorCode *status)
michael@0 349 {
michael@0 350 const char *definition = string;
michael@0 351 while(U_SUCCESS(*status) && *string) {
michael@0 352 string = ucol_sit_readOption(string, s, status);
michael@0 353 // advance over '_'
michael@0 354 while(*string && *string == '_') {
michael@0 355 string++;
michael@0 356 }
michael@0 357 }
michael@0 358 if(U_FAILURE(*status)) {
michael@0 359 parseError->offset = (int32_t)(string - definition);
michael@0 360 }
michael@0 361 return string;
michael@0 362 }
michael@0 363
michael@0 364 static
michael@0 365 int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status)
michael@0 366 {
michael@0 367 int32_t i = 0, j = 0;
michael@0 368 int32_t len = 0;
michael@0 369 char optName;
michael@0 370 if(U_SUCCESS(*status)) {
michael@0 371 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
michael@0 372 if(s->entries[i].start) {
michael@0 373 if(len) {
michael@0 374 if(len < capacity) {
michael@0 375 uprv_strcat(destination, "_");
michael@0 376 }
michael@0 377 len++;
michael@0 378 }
michael@0 379 optName = *(s->entries[i].start);
michael@0 380 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) {
michael@0 381 for(j = 0; j < s->entries[i].len; j++) {
michael@0 382 if(len + j < capacity) {
michael@0 383 destination[len+j] = uprv_toupper(*(s->entries[i].start+j));
michael@0 384 }
michael@0 385 }
michael@0 386 len += s->entries[i].len;
michael@0 387 } else {
michael@0 388 len += s->entries[i].len;
michael@0 389 if(len < capacity) {
michael@0 390 uprv_strncat(destination,s->entries[i].start, s->entries[i].len);
michael@0 391 }
michael@0 392 }
michael@0 393 }
michael@0 394 }
michael@0 395 return len;
michael@0 396 } else {
michael@0 397 return 0;
michael@0 398 }
michael@0 399 }
michael@0 400
michael@0 401 static void
michael@0 402 ucol_sit_calculateWholeLocale(CollatorSpec *s) {
michael@0 403 // put the locale together, unless we have a done
michael@0 404 // locale
michael@0 405 if(s->locale[0] == 0) {
michael@0 406 // first the language
michael@0 407 uprv_strcat(s->locale, s->locElements[UCOL_SIT_LANGUAGE]);
michael@0 408 // then the script, if present
michael@0 409 if(*(s->locElements[UCOL_SIT_SCRIPT])) {
michael@0 410 uprv_strcat(s->locale, "_");
michael@0 411 uprv_strcat(s->locale, s->locElements[UCOL_SIT_SCRIPT]);
michael@0 412 }
michael@0 413 // then the region, if present
michael@0 414 if(*(s->locElements[UCOL_SIT_REGION])) {
michael@0 415 uprv_strcat(s->locale, "_");
michael@0 416 uprv_strcat(s->locale, s->locElements[UCOL_SIT_REGION]);
michael@0 417 } else if(*(s->locElements[UCOL_SIT_VARIANT])) { // if there is a variant, we need an underscore
michael@0 418 uprv_strcat(s->locale, "_");
michael@0 419 }
michael@0 420 // add variant, if there
michael@0 421 if(*(s->locElements[UCOL_SIT_VARIANT])) {
michael@0 422 uprv_strcat(s->locale, "_");
michael@0 423 uprv_strcat(s->locale, s->locElements[UCOL_SIT_VARIANT]);
michael@0 424 }
michael@0 425
michael@0 426 // if there is a collation keyword, add that too
michael@0 427 if(*(s->locElements[UCOL_SIT_KEYWORD])) {
michael@0 428 uprv_strcat(s->locale, collationKeyword);
michael@0 429 uprv_strcat(s->locale, s->locElements[UCOL_SIT_KEYWORD]);
michael@0 430 }
michael@0 431
michael@0 432 // if there is a provider keyword, add that too
michael@0 433 if(*(s->locElements[UCOL_SIT_PROVIDER])) {
michael@0 434 uprv_strcat(s->locale, providerKeyword);
michael@0 435 uprv_strcat(s->locale, s->locElements[UCOL_SIT_PROVIDER]);
michael@0 436 }
michael@0 437 }
michael@0 438 }
michael@0 439
michael@0 440
michael@0 441 U_CAPI void U_EXPORT2
michael@0 442 ucol_prepareShortStringOpen( const char *definition,
michael@0 443 UBool,
michael@0 444 UParseError *parseError,
michael@0 445 UErrorCode *status)
michael@0 446 {
michael@0 447 if(U_FAILURE(*status)) return;
michael@0 448
michael@0 449 UParseError internalParseError;
michael@0 450
michael@0 451 if(!parseError) {
michael@0 452 parseError = &internalParseError;
michael@0 453 }
michael@0 454 parseError->line = 0;
michael@0 455 parseError->offset = 0;
michael@0 456 parseError->preContext[0] = 0;
michael@0 457 parseError->postContext[0] = 0;
michael@0 458
michael@0 459
michael@0 460 // first we want to pick stuff out of short string.
michael@0 461 // we'll end up with an UCA version, locale and a bunch of
michael@0 462 // settings
michael@0 463
michael@0 464 // analyse the string in order to get everything we need.
michael@0 465 CollatorSpec s;
michael@0 466 ucol_sit_initCollatorSpecs(&s);
michael@0 467 ucol_sit_readSpecs(&s, definition, parseError, status);
michael@0 468 ucol_sit_calculateWholeLocale(&s);
michael@0 469
michael@0 470 char buffer[internalBufferSize];
michael@0 471 uprv_memset(buffer, 0, internalBufferSize);
michael@0 472 uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
michael@0 473
michael@0 474 UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status);
michael@0 475 /* we try to find stuff from keyword */
michael@0 476 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
michael@0 477 UResourceBundle *collElem = NULL;
michael@0 478 char keyBuffer[256];
michael@0 479 // if there is a keyword, we pick it up and try to get elements
michael@0 480 if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) {
michael@0 481 // no keyword. we try to find the default setting, which will give us the keyword value
michael@0 482 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
michael@0 483 if(U_SUCCESS(*status)) {
michael@0 484 int32_t defaultKeyLen = 0;
michael@0 485 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
michael@0 486 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
michael@0 487 keyBuffer[defaultKeyLen] = 0;
michael@0 488 } else {
michael@0 489 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 490 return;
michael@0 491 }
michael@0 492 ures_close(defaultColl);
michael@0 493 }
michael@0 494 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
michael@0 495 ures_close(collElem);
michael@0 496 ures_close(collations);
michael@0 497 ures_close(b);
michael@0 498 }
michael@0 499
michael@0 500
michael@0 501 U_CAPI UCollator* U_EXPORT2
michael@0 502 ucol_openFromShortString( const char *definition,
michael@0 503 UBool forceDefaults,
michael@0 504 UParseError *parseError,
michael@0 505 UErrorCode *status)
michael@0 506 {
michael@0 507 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING);
michael@0 508 UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition);
michael@0 509
michael@0 510 if(U_FAILURE(*status)) return 0;
michael@0 511
michael@0 512 UParseError internalParseError;
michael@0 513
michael@0 514 if(!parseError) {
michael@0 515 parseError = &internalParseError;
michael@0 516 }
michael@0 517 parseError->line = 0;
michael@0 518 parseError->offset = 0;
michael@0 519 parseError->preContext[0] = 0;
michael@0 520 parseError->postContext[0] = 0;
michael@0 521
michael@0 522
michael@0 523 // first we want to pick stuff out of short string.
michael@0 524 // we'll end up with an UCA version, locale and a bunch of
michael@0 525 // settings
michael@0 526
michael@0 527 // analyse the string in order to get everything we need.
michael@0 528 const char *string = definition;
michael@0 529 CollatorSpec s;
michael@0 530 ucol_sit_initCollatorSpecs(&s);
michael@0 531 string = ucol_sit_readSpecs(&s, definition, parseError, status);
michael@0 532 ucol_sit_calculateWholeLocale(&s);
michael@0 533
michael@0 534 char buffer[internalBufferSize];
michael@0 535 uprv_memset(buffer, 0, internalBufferSize);
michael@0 536 uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
michael@0 537
michael@0 538 UCollator *result = ucol_open(buffer, status);
michael@0 539 int32_t i = 0;
michael@0 540
michael@0 541 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
michael@0 542 if(s.options[i] != UCOL_DEFAULT) {
michael@0 543 if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) {
michael@0 544 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status);
michael@0 545 }
michael@0 546
michael@0 547 if(U_FAILURE(*status)) {
michael@0 548 parseError->offset = (int32_t)(string - definition);
michael@0 549 ucol_close(result);
michael@0 550 return NULL;
michael@0 551 }
michael@0 552
michael@0 553 }
michael@0 554 }
michael@0 555 if(s.variableTopSet) {
michael@0 556 if(s.variableTopString[0]) {
michael@0 557 ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status);
michael@0 558 } else { // we set by value, using 'B'
michael@0 559 ucol_restoreVariableTop(result, s.variableTopValue, status);
michael@0 560 }
michael@0 561 }
michael@0 562
michael@0 563
michael@0 564 if(U_FAILURE(*status)) { // here it can only be a bogus value
michael@0 565 ucol_close(result);
michael@0 566 result = NULL;
michael@0 567 }
michael@0 568
michael@0 569 UTRACE_EXIT_PTR_STATUS(result, *status);
michael@0 570 return result;
michael@0 571 }
michael@0 572
michael@0 573
michael@0 574 static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg)
michael@0 575 {
michael@0 576 if(len) {
michael@0 577 if(*resultSize) {
michael@0 578 if(*resultSize < capacity) {
michael@0 579 uprv_strcat(result, "_");
michael@0 580 }
michael@0 581 (*resultSize)++;
michael@0 582 }
michael@0 583 *resultSize += len + 1;
michael@0 584 if(*resultSize < capacity) {
michael@0 585 uprv_strncat(result, &arg, 1);
michael@0 586 uprv_strncat(result, src, len);
michael@0 587 }
michael@0 588 }
michael@0 589 }
michael@0 590
michael@0 591 U_CAPI int32_t U_EXPORT2
michael@0 592 ucol_getShortDefinitionString(const UCollator *coll,
michael@0 593 const char *locale,
michael@0 594 char *dst,
michael@0 595 int32_t capacity,
michael@0 596 UErrorCode *status)
michael@0 597 {
michael@0 598 if(U_FAILURE(*status)) return 0;
michael@0 599 if(coll->delegate != NULL) {
michael@0 600 return ((icu::Collator*)coll->delegate)->internalGetShortDefinitionString(locale,dst,capacity,*status);
michael@0 601 }
michael@0 602 char buffer[internalBufferSize];
michael@0 603 uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
michael@0 604 int32_t resultSize = 0;
michael@0 605 char tempbuff[internalBufferSize];
michael@0 606 char locBuff[internalBufferSize];
michael@0 607 uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
michael@0 608 int32_t elementSize = 0;
michael@0 609 UBool isAvailable = 0;
michael@0 610 CollatorSpec s;
michael@0 611 ucol_sit_initCollatorSpecs(&s);
michael@0 612
michael@0 613 if(!locale) {
michael@0 614 locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status);
michael@0 615 }
michael@0 616 elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status);
michael@0 617
michael@0 618 if(elementSize) {
michael@0 619 // we should probably canonicalize here...
michael@0 620 elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
michael@0 621 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
michael@0 622 elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
michael@0 623 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
michael@0 624 elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
michael@0 625 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
michael@0 626 elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
michael@0 627 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
michael@0 628 elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
michael@0 629 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
michael@0 630 }
michael@0 631
michael@0 632 int32_t i = 0;
michael@0 633 UColAttributeValue attribute = UCOL_DEFAULT;
michael@0 634 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
michael@0 635 if(options[i].action == _processCollatorOption) {
michael@0 636 attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status);
michael@0 637 if(attribute != UCOL_DEFAULT) {
michael@0 638 char letter = ucol_sit_attributeValueToLetter(attribute, status);
michael@0 639 appendShortStringElement(&letter, 1,
michael@0 640 buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
michael@0 641 }
michael@0 642 }
michael@0 643 }
michael@0 644 if(coll->variableTopValueisDefault == FALSE) {
michael@0 645 //s.variableTopValue = ucol_getVariableTop(coll, status);
michael@0 646 elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16);
michael@0 647 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg);
michael@0 648 }
michael@0 649
michael@0 650 UParseError parseError;
michael@0 651 return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status);
michael@0 652 }
michael@0 653
michael@0 654 U_CAPI int32_t U_EXPORT2
michael@0 655 ucol_normalizeShortDefinitionString(const char *definition,
michael@0 656 char *destination,
michael@0 657 int32_t capacity,
michael@0 658 UParseError *parseError,
michael@0 659 UErrorCode *status)
michael@0 660 {
michael@0 661
michael@0 662 if(U_FAILURE(*status)) {
michael@0 663 return 0;
michael@0 664 }
michael@0 665
michael@0 666 if(destination) {
michael@0 667 uprv_memset(destination, 0, capacity*sizeof(char));
michael@0 668 }
michael@0 669
michael@0 670 UParseError pe;
michael@0 671 if(!parseError) {
michael@0 672 parseError = &pe;
michael@0 673 }
michael@0 674
michael@0 675 // validate
michael@0 676 CollatorSpec s;
michael@0 677 ucol_sit_initCollatorSpecs(&s);
michael@0 678 ucol_sit_readSpecs(&s, definition, parseError, status);
michael@0 679 return ucol_sit_dumpSpecs(&s, destination, capacity, status);
michael@0 680 }
michael@0 681
michael@0 682 U_CAPI UColAttributeValue U_EXPORT2
michael@0 683 ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status)
michael@0 684 {
michael@0 685 if(U_FAILURE(*status) || coll == NULL) {
michael@0 686 return UCOL_DEFAULT;
michael@0 687 }
michael@0 688 switch(attr) {
michael@0 689 case UCOL_NUMERIC_COLLATION:
michael@0 690 return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation;
michael@0 691 case UCOL_HIRAGANA_QUATERNARY_MODE:
michael@0 692 return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ;
michael@0 693 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
michael@0 694 return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation;
michael@0 695 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
michael@0 696 return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling;
michael@0 697 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
michael@0 698 return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst;
michael@0 699 case UCOL_CASE_LEVEL: /* do we have an extra case level */
michael@0 700 return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel;
michael@0 701 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
michael@0 702 return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode;
michael@0 703 case UCOL_STRENGTH: /* attribute for strength */
michael@0 704 return coll->strengthisDefault?UCOL_DEFAULT:coll->strength;
michael@0 705 case UCOL_ATTRIBUTE_COUNT:
michael@0 706 default:
michael@0 707 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 708 #ifdef UCOL_TRACE_SIT
michael@0 709 fprintf(stderr, "%s:%d: Unknown attr value '%d': %s\n", __FILE__, __LINE__, (int)attr, u_errorName(*status));
michael@0 710 #endif
michael@0 711 break;
michael@0 712 }
michael@0 713 return UCOL_DEFAULT;
michael@0 714 }
michael@0 715
michael@0 716
michael@0 717 struct contContext {
michael@0 718 const UCollator *coll;
michael@0 719 USet *conts;
michael@0 720 USet *expansions;
michael@0 721 USet *removedContractions;
michael@0 722 UBool addPrefixes;
michael@0 723 UErrorCode *status;
michael@0 724 };
michael@0 725
michael@0 726
michael@0 727
michael@0 728 static void
michael@0 729 addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
michael@0 730 uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
michael@0 731 {
michael@0 732 const UCollator *coll = context->coll;
michael@0 733 USet *contractions = context->conts;
michael@0 734 USet *expansions = context->expansions;
michael@0 735 UBool addPrefixes = context->addPrefixes;
michael@0 736
michael@0 737 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
michael@0 738 uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 739 // we might have a contraction that ends from previous level
michael@0 740 if(newCE != UCOL_NOT_FOUND) {
michael@0 741 if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
michael@0 742 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
michael@0 743 }
michael@0 744 if(contractions && rightIndex-leftIndex > 1) {
michael@0 745 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
michael@0 746 if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
michael@0 747 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
michael@0 748 }
michael@0 749 }
michael@0 750 }
michael@0 751
michael@0 752 UCharOffset++;
michael@0 753 // check whether we're doing contraction or prefix
michael@0 754 if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
michael@0 755 if(leftIndex == 0) {
michael@0 756 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 757 return;
michael@0 758 }
michael@0 759 --leftIndex;
michael@0 760 while(*UCharOffset != 0xFFFF) {
michael@0 761 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 762 buffer[leftIndex] = *UCharOffset;
michael@0 763 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
michael@0 764 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
michael@0 765 } else {
michael@0 766 if(contractions) {
michael@0 767 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
michael@0 768 }
michael@0 769 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
michael@0 770 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
michael@0 771 }
michael@0 772 }
michael@0 773 UCharOffset++;
michael@0 774 }
michael@0 775 } else if(getCETag(CE) == CONTRACTION_TAG) {
michael@0 776 if(rightIndex == bufLen-1) {
michael@0 777 *status = U_INTERNAL_PROGRAM_ERROR;
michael@0 778 return;
michael@0 779 }
michael@0 780 while(*UCharOffset != 0xFFFF) {
michael@0 781 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
michael@0 782 buffer[rightIndex] = *UCharOffset;
michael@0 783 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
michael@0 784 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
michael@0 785 } else {
michael@0 786 if(contractions) {
michael@0 787 uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
michael@0 788 }
michael@0 789 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
michael@0 790 uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
michael@0 791 }
michael@0 792 }
michael@0 793 UCharOffset++;
michael@0 794 }
michael@0 795 }
michael@0 796
michael@0 797 }
michael@0 798
michael@0 799 U_CDECL_BEGIN
michael@0 800 static UBool U_CALLCONV
michael@0 801 _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
michael@0 802 {
michael@0 803 UErrorCode *status = ((contContext *)context)->status;
michael@0 804 USet *expansions = ((contContext *)context)->expansions;
michael@0 805 USet *removed = ((contContext *)context)->removedContractions;
michael@0 806 UBool addPrefixes = ((contContext *)context)->addPrefixes;
michael@0 807 UChar contraction[internalBufferSize];
michael@0 808 if(isSpecial(CE)) {
michael@0 809 if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
michael@0 810 while(start < limit && U_SUCCESS(*status)) {
michael@0 811 // if there are suppressed contractions, we don't
michael@0 812 // want to add them.
michael@0 813 if(removed && uset_contains(removed, start)) {
michael@0 814 start++;
michael@0 815 continue;
michael@0 816 }
michael@0 817 // we start our contraction from middle, since we don't know if it
michael@0 818 // will grow toward right or left
michael@0 819 contraction[internalBufferSize/2] = (UChar)start;
michael@0 820 addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
michael@0 821 start++;
michael@0 822 }
michael@0 823 } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
michael@0 824 while(start < limit && U_SUCCESS(*status)) {
michael@0 825 uset_add(expansions, start++);
michael@0 826 }
michael@0 827 }
michael@0 828 }
michael@0 829 if(U_FAILURE(*status)) {
michael@0 830 return FALSE;
michael@0 831 } else {
michael@0 832 return TRUE;
michael@0 833 }
michael@0 834 }
michael@0 835
michael@0 836 U_CDECL_END
michael@0 837
michael@0 838
michael@0 839
michael@0 840 /**
michael@0 841 * Get a set containing the contractions defined by the collator. The set includes
michael@0 842 * both the UCA contractions and the contractions defined by the collator
michael@0 843 * @param coll collator
michael@0 844 * @param conts the set to hold the result
michael@0 845 * @param status to hold the error code
michael@0 846 * @return the size of the contraction set
michael@0 847 */
michael@0 848 U_CAPI int32_t U_EXPORT2
michael@0 849 ucol_getContractions( const UCollator *coll,
michael@0 850 USet *contractions,
michael@0 851 UErrorCode *status)
michael@0 852 {
michael@0 853 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
michael@0 854 return uset_getItemCount(contractions);
michael@0 855 }
michael@0 856
michael@0 857 /**
michael@0 858 * Get a set containing the expansions defined by the collator. The set includes
michael@0 859 * both the UCA expansions and the expansions defined by the tailoring
michael@0 860 * @param coll collator
michael@0 861 * @param conts the set to hold the result
michael@0 862 * @param addPrefixes add the prefix contextual elements to contractions
michael@0 863 * @param status to hold the error code
michael@0 864 *
michael@0 865 * @draft ICU 3.4
michael@0 866 */
michael@0 867 U_CAPI void U_EXPORT2
michael@0 868 ucol_getContractionsAndExpansions( const UCollator *coll,
michael@0 869 USet *contractions,
michael@0 870 USet *expansions,
michael@0 871 UBool addPrefixes,
michael@0 872 UErrorCode *status)
michael@0 873 {
michael@0 874 if(U_FAILURE(*status)) {
michael@0 875 return;
michael@0 876 }
michael@0 877 if(coll == NULL) {
michael@0 878 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 879 return;
michael@0 880 }
michael@0 881
michael@0 882 if(contractions) {
michael@0 883 uset_clear(contractions);
michael@0 884 }
michael@0 885 if(expansions) {
michael@0 886 uset_clear(expansions);
michael@0 887 }
michael@0 888 int32_t rulesLen = 0;
michael@0 889 const UChar* rules = ucol_getRules(coll, &rulesLen);
michael@0 890 UColTokenParser src;
michael@0 891 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA,
michael@0 892 ucol_tok_getRulesFromBundle, NULL, status);
michael@0 893
michael@0 894 contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
michael@0 895
michael@0 896 // Add the UCA contractions
michael@0 897 c.coll = coll->UCA;
michael@0 898 utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c);
michael@0 899
michael@0 900 // This is collator specific. Add contractions from a collator
michael@0 901 c.coll = coll;
michael@0 902 c.removedContractions = NULL;
michael@0 903 utrie_enum(&coll->mapping, NULL, _processSpecials, &c);
michael@0 904 ucol_tok_closeTokenList(&src);
michael@0 905 }
michael@0 906
michael@0 907 U_CAPI int32_t U_EXPORT2
michael@0 908 ucol_getUnsafeSet( const UCollator *coll,
michael@0 909 USet *unsafe,
michael@0 910 UErrorCode *status)
michael@0 911 {
michael@0 912 UChar buffer[internalBufferSize];
michael@0 913 int32_t len = 0;
michael@0 914
michael@0 915 uset_clear(unsafe);
michael@0 916
michael@0 917 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
michael@0 918 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
michael@0 919 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
michael@0 920
michael@0 921 // add chars that fail the fcd check
michael@0 922 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
michael@0 923
michael@0 924 // add Thai/Lao prevowels
michael@0 925 uset_addRange(unsafe, 0xe40, 0xe44);
michael@0 926 uset_addRange(unsafe, 0xec0, 0xec4);
michael@0 927 // add lead/trail surrogates
michael@0 928 uset_addRange(unsafe, 0xd800, 0xdfff);
michael@0 929
michael@0 930 USet *contractions = uset_open(0,0);
michael@0 931
michael@0 932 int32_t i = 0, j = 0;
michael@0 933 int32_t contsSize = ucol_getContractions(coll, contractions, status);
michael@0 934 UChar32 c = 0;
michael@0 935 // Contraction set consists only of strings
michael@0 936 // to get unsafe code points, we need to
michael@0 937 // break the strings apart and add them to the unsafe set
michael@0 938 for(i = 0; i < contsSize; i++) {
michael@0 939 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
michael@0 940 if(len > 0) {
michael@0 941 j = 0;
michael@0 942 while(j < len) {
michael@0 943 U16_NEXT(buffer, j, len, c);
michael@0 944 if(j < len) {
michael@0 945 uset_add(unsafe, c);
michael@0 946 }
michael@0 947 }
michael@0 948 }
michael@0 949 }
michael@0 950
michael@0 951 uset_close(contractions);
michael@0 952
michael@0 953 return uset_size(unsafe);
michael@0 954 }
michael@0 955 #endif

mercurial