1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol_sit.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,955 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2004-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: ucol_sit.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* Modification history 1.15 +* Date Name Comments 1.16 +* 03/12/2004 weiv Creation 1.17 +*/ 1.18 + 1.19 +#include "unicode/ustring.h" 1.20 +#include "unicode/udata.h" 1.21 + 1.22 +#include "utracimp.h" 1.23 +#include "ucol_imp.h" 1.24 +#include "ucol_tok.h" 1.25 +#include "cmemory.h" 1.26 +#include "cstring.h" 1.27 +#include "uresimp.h" 1.28 +#include "unicode/coll.h" 1.29 + 1.30 +#ifdef UCOL_TRACE_SIT 1.31 +# include <stdio.h> 1.32 +#endif 1.33 + 1.34 +#if !UCONFIG_NO_COLLATION 1.35 + 1.36 +enum OptionsList { 1.37 + UCOL_SIT_LANGUAGE = 0, 1.38 + UCOL_SIT_SCRIPT = 1, 1.39 + UCOL_SIT_REGION = 2, 1.40 + UCOL_SIT_VARIANT = 3, 1.41 + UCOL_SIT_KEYWORD = 4, 1.42 + UCOL_SIT_PROVIDER = 5, 1.43 + UCOL_SIT_LOCELEMENT_MAX = UCOL_SIT_PROVIDER, /* the last element that's part of LocElements */ 1.44 + 1.45 + UCOL_SIT_BCP47, 1.46 + UCOL_SIT_STRENGTH, 1.47 + UCOL_SIT_CASE_LEVEL, 1.48 + UCOL_SIT_CASE_FIRST, 1.49 + UCOL_SIT_NUMERIC_COLLATION, 1.50 + UCOL_SIT_ALTERNATE_HANDLING, 1.51 + UCOL_SIT_NORMALIZATION_MODE, 1.52 + UCOL_SIT_FRENCH_COLLATION, 1.53 + UCOL_SIT_HIRAGANA_QUATERNARY, 1.54 + UCOL_SIT_VARIABLE_TOP, 1.55 + UCOL_SIT_VARIABLE_TOP_VALUE, 1.56 + UCOL_SIT_ITEMS_COUNT 1.57 +}; 1.58 + 1.59 +/* option starters chars. */ 1.60 +static const char alternateHArg = 'A'; 1.61 +static const char variableTopValArg = 'B'; 1.62 +static const char caseFirstArg = 'C'; 1.63 +static const char numericCollArg = 'D'; 1.64 +static const char caseLevelArg = 'E'; 1.65 +static const char frenchCollArg = 'F'; 1.66 +static const char hiraganaQArg = 'H'; 1.67 +static const char keywordArg = 'K'; 1.68 +static const char languageArg = 'L'; 1.69 +static const char normArg = 'N'; 1.70 +static const char providerArg = 'P'; 1.71 +static const char regionArg = 'R'; 1.72 +static const char strengthArg = 'S'; 1.73 +static const char variableTopArg = 'T'; 1.74 +static const char variantArg = 'V'; 1.75 +static const char RFC3066Arg = 'X'; 1.76 +static const char scriptArg = 'Z'; 1.77 + 1.78 +static const char collationKeyword[] = "@collation="; 1.79 +static const char providerKeyword[] = "@sp="; 1.80 + 1.81 + 1.82 +static const int32_t locElementCount = UCOL_SIT_LOCELEMENT_MAX+1; 1.83 +static const int32_t locElementCapacity = 32; 1.84 +static const int32_t loc3066Capacity = 256; 1.85 +static const int32_t locProviderCapacity = 10; 1.86 +static const int32_t internalBufferSize = 512; 1.87 + 1.88 +/* structure containing specification of a collator. Initialized 1.89 + * from a short string. Also used to construct a short string from a 1.90 + * collator instance 1.91 + */ 1.92 +struct CollatorSpec { 1.93 + char locElements[locElementCount][locElementCapacity]; 1.94 + char locale[loc3066Capacity]; 1.95 + char provider[locProviderCapacity]; 1.96 + UColAttributeValue options[UCOL_ATTRIBUTE_COUNT]; 1.97 + uint32_t variableTopValue; 1.98 + UChar variableTopString[locElementCapacity]; 1.99 + int32_t variableTopStringLen; 1.100 + UBool variableTopSet; 1.101 + struct { 1.102 + const char *start; 1.103 + int32_t len; 1.104 + } entries[UCOL_SIT_ITEMS_COUNT]; 1.105 +}; 1.106 + 1.107 + 1.108 +/* structure for converting between character attribute 1.109 + * representation and real collation attribute value. 1.110 + */ 1.111 +struct AttributeConversion { 1.112 + char letter; 1.113 + UColAttributeValue value; 1.114 +}; 1.115 + 1.116 +static const AttributeConversion conversions[12] = { 1.117 + { '1', UCOL_PRIMARY }, 1.118 + { '2', UCOL_SECONDARY }, 1.119 + { '3', UCOL_TERTIARY }, 1.120 + { '4', UCOL_QUATERNARY }, 1.121 + { 'D', UCOL_DEFAULT }, 1.122 + { 'I', UCOL_IDENTICAL }, 1.123 + { 'L', UCOL_LOWER_FIRST }, 1.124 + { 'N', UCOL_NON_IGNORABLE }, 1.125 + { 'O', UCOL_ON }, 1.126 + { 'S', UCOL_SHIFTED }, 1.127 + { 'U', UCOL_UPPER_FIRST }, 1.128 + { 'X', UCOL_OFF } 1.129 +}; 1.130 + 1.131 + 1.132 +static char 1.133 +ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { 1.134 + uint32_t i = 0; 1.135 + for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 1.136 + if(conversions[i].value == value) { 1.137 + return conversions[i].letter; 1.138 + } 1.139 + } 1.140 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.141 +#ifdef UCOL_TRACE_SIT 1.142 + fprintf(stderr, "%s:%d: unknown UColAttributeValue %d: %s\n", __FILE__, __LINE__, value, u_errorName(*status)); 1.143 +#endif 1.144 + return 0; 1.145 +} 1.146 + 1.147 +static UColAttributeValue 1.148 +ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { 1.149 + uint32_t i = 0; 1.150 + for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 1.151 + if(conversions[i].letter == letter) { 1.152 + return conversions[i].value; 1.153 + } 1.154 + } 1.155 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.156 +#ifdef UCOL_TRACE_SIT 1.157 + fprintf(stderr, "%s:%d: unknown letter %c: %s\n", __FILE__, __LINE__, letter, u_errorName(*status)); 1.158 +#endif 1.159 + return UCOL_DEFAULT; 1.160 +} 1.161 + 1.162 +/* function prototype for functions used to parse a short string */ 1.163 +U_CDECL_BEGIN 1.164 +typedef const char* U_CALLCONV 1.165 +ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string, 1.166 + UErrorCode *status); 1.167 +U_CDECL_END 1.168 + 1.169 +U_CDECL_BEGIN 1.170 +static const char* U_CALLCONV 1.171 +_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, 1.172 + UErrorCode *status) 1.173 +{ 1.174 + int32_t len = 0; 1.175 + do { 1.176 + if(value == UCOL_SIT_LANGUAGE || value == UCOL_SIT_KEYWORD || value == UCOL_SIT_PROVIDER) { 1.177 + spec->locElements[value][len++] = uprv_tolower(*string); 1.178 + } else { 1.179 + spec->locElements[value][len++] = *string; 1.180 + } 1.181 + } while(*(++string) != '_' && *string && len < locElementCapacity); 1.182 + if(len >= locElementCapacity) { 1.183 + *status = U_BUFFER_OVERFLOW_ERROR; 1.184 + return string; 1.185 + } 1.186 + // don't skip the underscore at the end 1.187 + return string; 1.188 +} 1.189 +U_CDECL_END 1.190 + 1.191 +U_CDECL_BEGIN 1.192 +static const char* U_CALLCONV 1.193 +_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, 1.194 + UErrorCode *status) 1.195 +{ 1.196 + char terminator = *string; 1.197 + string++; 1.198 + const char *end = uprv_strchr(string+1, terminator); 1.199 + if(end == NULL || end - string >= loc3066Capacity) { 1.200 + *status = U_BUFFER_OVERFLOW_ERROR; 1.201 + return string; 1.202 + } else { 1.203 + uprv_strncpy(spec->locale, string, end-string); 1.204 + return end+1; 1.205 + } 1.206 +} 1.207 + 1.208 +U_CDECL_END 1.209 + 1.210 +U_CDECL_BEGIN 1.211 +static const char* U_CALLCONV 1.212 +_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, 1.213 + UErrorCode *status) 1.214 +{ 1.215 + spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); 1.216 + if((*(++string) != '_' && *string) || U_FAILURE(*status)) { 1.217 +#ifdef UCOL_TRACE_SIT 1.218 + fprintf(stderr, "%s:%d: unknown collator option at '%s': %s\n", __FILE__, __LINE__, string, u_errorName(*status)); 1.219 +#endif 1.220 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.221 + } 1.222 + return string; 1.223 +} 1.224 +U_CDECL_END 1.225 + 1.226 + 1.227 +static UChar 1.228 +readHexCodeUnit(const char **string, UErrorCode *status) 1.229 +{ 1.230 + UChar result = 0; 1.231 + int32_t value = 0; 1.232 + char c; 1.233 + int32_t noDigits = 0; 1.234 + while((c = **string) != 0 && noDigits < 4) { 1.235 + if( c >= '0' && c <= '9') { 1.236 + value = c - '0'; 1.237 + } else if ( c >= 'a' && c <= 'f') { 1.238 + value = c - 'a' + 10; 1.239 + } else if ( c >= 'A' && c <= 'F') { 1.240 + value = c - 'A' + 10; 1.241 + } else { 1.242 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.243 +#ifdef UCOL_TRACE_SIT 1.244 + fprintf(stderr, "%s:%d: Bad hex char at '%s': %s\n", __FILE__, __LINE__, *string, u_errorName(*status)); 1.245 +#endif 1.246 + return 0; 1.247 + } 1.248 + result = (result << 4) | (UChar)value; 1.249 + noDigits++; 1.250 + (*string)++; 1.251 + } 1.252 + // if the string was terminated before we read 4 digits, set an error 1.253 + if(noDigits < 4) { 1.254 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.255 +#ifdef UCOL_TRACE_SIT 1.256 + fprintf(stderr, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__, __LINE__, noDigits,*string, u_errorName(*status)); 1.257 +#endif 1.258 + } 1.259 + return result; 1.260 +} 1.261 + 1.262 +U_CDECL_BEGIN 1.263 +static const char* U_CALLCONV 1.264 +_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) 1.265 +{ 1.266 + // get four digits 1.267 + int32_t i = 0; 1.268 + if(!value1) { 1.269 + while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') { 1.270 + spec->variableTopString[i++] = readHexCodeUnit(&string, status); 1.271 + } 1.272 + spec->variableTopStringLen = i; 1.273 + if(i == locElementCapacity && *string != 0 && *string != '_') { 1.274 + *status = U_BUFFER_OVERFLOW_ERROR; 1.275 + } 1.276 + } else { 1.277 + spec->variableTopValue = readHexCodeUnit(&string, status); 1.278 + } 1.279 + if(U_SUCCESS(*status)) { 1.280 + spec->variableTopSet = TRUE; 1.281 + } 1.282 + return string; 1.283 +} 1.284 +U_CDECL_END 1.285 + 1.286 + 1.287 +/* Table for parsing short strings */ 1.288 +struct ShortStringOptions { 1.289 + char optionStart; 1.290 + ActionFunction *action; 1.291 + uint32_t attr; 1.292 +}; 1.293 + 1.294 +static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = 1.295 +{ 1.296 +/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D 1.297 +/* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, 1.298 +/* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D 1.299 +/* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D 1.300 +/* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D 1.301 +/* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D 1.302 +/* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D 1.303 +/* 04 KEYWORD */ {keywordArg, _processLocaleElement, UCOL_SIT_KEYWORD }, // keyword 1.304 +/* 00 LANGUAGE */ {languageArg, _processLocaleElement, UCOL_SIT_LANGUAGE }, // language 1.305 +/* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D 1.306 +/* 02 REGION */ {regionArg, _processLocaleElement, UCOL_SIT_REGION }, // region 1.307 +/* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D 1.308 +/* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, 1.309 +/* 03 VARIANT */ {variantArg, _processLocaleElement, UCOL_SIT_VARIANT }, // variant 1.310 +/* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name 1.311 +/* 01 SCRIPT */ {scriptArg, _processLocaleElement, UCOL_SIT_SCRIPT }, // script 1.312 +/* PROVIDER */ {providerArg, _processLocaleElement, UCOL_SIT_PROVIDER } 1.313 +}; 1.314 + 1.315 + 1.316 +static 1.317 +const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, 1.318 + UErrorCode *status) 1.319 +{ 1.320 + int32_t i = 0; 1.321 + 1.322 + for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 1.323 + if(*start == options[i].optionStart) { 1.324 + spec->entries[i].start = start; 1.325 + const char* end = options[i].action(spec, options[i].attr, start+1, status); 1.326 + spec->entries[i].len = (int32_t)(end - start); 1.327 + return end; 1.328 + } 1.329 + } 1.330 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.331 +#ifdef UCOL_TRACE_SIT 1.332 + fprintf(stderr, "%s:%d: Unknown option at '%s': %s\n", __FILE__, __LINE__, start, u_errorName(*status)); 1.333 +#endif 1.334 + return start; 1.335 +} 1.336 + 1.337 +static 1.338 +void ucol_sit_initCollatorSpecs(CollatorSpec *spec) 1.339 +{ 1.340 + // reset everything 1.341 + uprv_memset(spec, 0, sizeof(CollatorSpec)); 1.342 + // set collation options to default 1.343 + int32_t i = 0; 1.344 + for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 1.345 + spec->options[i] = UCOL_DEFAULT; 1.346 + } 1.347 +} 1.348 + 1.349 +static const char* 1.350 +ucol_sit_readSpecs(CollatorSpec *s, const char *string, 1.351 + UParseError *parseError, UErrorCode *status) 1.352 +{ 1.353 + const char *definition = string; 1.354 + while(U_SUCCESS(*status) && *string) { 1.355 + string = ucol_sit_readOption(string, s, status); 1.356 + // advance over '_' 1.357 + while(*string && *string == '_') { 1.358 + string++; 1.359 + } 1.360 + } 1.361 + if(U_FAILURE(*status)) { 1.362 + parseError->offset = (int32_t)(string - definition); 1.363 + } 1.364 + return string; 1.365 +} 1.366 + 1.367 +static 1.368 +int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status) 1.369 +{ 1.370 + int32_t i = 0, j = 0; 1.371 + int32_t len = 0; 1.372 + char optName; 1.373 + if(U_SUCCESS(*status)) { 1.374 + for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 1.375 + if(s->entries[i].start) { 1.376 + if(len) { 1.377 + if(len < capacity) { 1.378 + uprv_strcat(destination, "_"); 1.379 + } 1.380 + len++; 1.381 + } 1.382 + optName = *(s->entries[i].start); 1.383 + if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { 1.384 + for(j = 0; j < s->entries[i].len; j++) { 1.385 + if(len + j < capacity) { 1.386 + destination[len+j] = uprv_toupper(*(s->entries[i].start+j)); 1.387 + } 1.388 + } 1.389 + len += s->entries[i].len; 1.390 + } else { 1.391 + len += s->entries[i].len; 1.392 + if(len < capacity) { 1.393 + uprv_strncat(destination,s->entries[i].start, s->entries[i].len); 1.394 + } 1.395 + } 1.396 + } 1.397 + } 1.398 + return len; 1.399 + } else { 1.400 + return 0; 1.401 + } 1.402 +} 1.403 + 1.404 +static void 1.405 +ucol_sit_calculateWholeLocale(CollatorSpec *s) { 1.406 + // put the locale together, unless we have a done 1.407 + // locale 1.408 + if(s->locale[0] == 0) { 1.409 + // first the language 1.410 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_LANGUAGE]); 1.411 + // then the script, if present 1.412 + if(*(s->locElements[UCOL_SIT_SCRIPT])) { 1.413 + uprv_strcat(s->locale, "_"); 1.414 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_SCRIPT]); 1.415 + } 1.416 + // then the region, if present 1.417 + if(*(s->locElements[UCOL_SIT_REGION])) { 1.418 + uprv_strcat(s->locale, "_"); 1.419 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_REGION]); 1.420 + } else if(*(s->locElements[UCOL_SIT_VARIANT])) { // if there is a variant, we need an underscore 1.421 + uprv_strcat(s->locale, "_"); 1.422 + } 1.423 + // add variant, if there 1.424 + if(*(s->locElements[UCOL_SIT_VARIANT])) { 1.425 + uprv_strcat(s->locale, "_"); 1.426 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_VARIANT]); 1.427 + } 1.428 + 1.429 + // if there is a collation keyword, add that too 1.430 + if(*(s->locElements[UCOL_SIT_KEYWORD])) { 1.431 + uprv_strcat(s->locale, collationKeyword); 1.432 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_KEYWORD]); 1.433 + } 1.434 + 1.435 + // if there is a provider keyword, add that too 1.436 + if(*(s->locElements[UCOL_SIT_PROVIDER])) { 1.437 + uprv_strcat(s->locale, providerKeyword); 1.438 + uprv_strcat(s->locale, s->locElements[UCOL_SIT_PROVIDER]); 1.439 + } 1.440 + } 1.441 +} 1.442 + 1.443 + 1.444 +U_CAPI void U_EXPORT2 1.445 +ucol_prepareShortStringOpen( const char *definition, 1.446 + UBool, 1.447 + UParseError *parseError, 1.448 + UErrorCode *status) 1.449 +{ 1.450 + if(U_FAILURE(*status)) return; 1.451 + 1.452 + UParseError internalParseError; 1.453 + 1.454 + if(!parseError) { 1.455 + parseError = &internalParseError; 1.456 + } 1.457 + parseError->line = 0; 1.458 + parseError->offset = 0; 1.459 + parseError->preContext[0] = 0; 1.460 + parseError->postContext[0] = 0; 1.461 + 1.462 + 1.463 + // first we want to pick stuff out of short string. 1.464 + // we'll end up with an UCA version, locale and a bunch of 1.465 + // settings 1.466 + 1.467 + // analyse the string in order to get everything we need. 1.468 + CollatorSpec s; 1.469 + ucol_sit_initCollatorSpecs(&s); 1.470 + ucol_sit_readSpecs(&s, definition, parseError, status); 1.471 + ucol_sit_calculateWholeLocale(&s); 1.472 + 1.473 + char buffer[internalBufferSize]; 1.474 + uprv_memset(buffer, 0, internalBufferSize); 1.475 + uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 1.476 + 1.477 + UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status); 1.478 + /* we try to find stuff from keyword */ 1.479 + UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); 1.480 + UResourceBundle *collElem = NULL; 1.481 + char keyBuffer[256]; 1.482 + // if there is a keyword, we pick it up and try to get elements 1.483 + if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) { 1.484 + // no keyword. we try to find the default setting, which will give us the keyword value 1.485 + UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); 1.486 + if(U_SUCCESS(*status)) { 1.487 + int32_t defaultKeyLen = 0; 1.488 + const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); 1.489 + u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); 1.490 + keyBuffer[defaultKeyLen] = 0; 1.491 + } else { 1.492 + *status = U_INTERNAL_PROGRAM_ERROR; 1.493 + return; 1.494 + } 1.495 + ures_close(defaultColl); 1.496 + } 1.497 + collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); 1.498 + ures_close(collElem); 1.499 + ures_close(collations); 1.500 + ures_close(b); 1.501 +} 1.502 + 1.503 + 1.504 +U_CAPI UCollator* U_EXPORT2 1.505 +ucol_openFromShortString( const char *definition, 1.506 + UBool forceDefaults, 1.507 + UParseError *parseError, 1.508 + UErrorCode *status) 1.509 +{ 1.510 + UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING); 1.511 + UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition); 1.512 + 1.513 + if(U_FAILURE(*status)) return 0; 1.514 + 1.515 + UParseError internalParseError; 1.516 + 1.517 + if(!parseError) { 1.518 + parseError = &internalParseError; 1.519 + } 1.520 + parseError->line = 0; 1.521 + parseError->offset = 0; 1.522 + parseError->preContext[0] = 0; 1.523 + parseError->postContext[0] = 0; 1.524 + 1.525 + 1.526 + // first we want to pick stuff out of short string. 1.527 + // we'll end up with an UCA version, locale and a bunch of 1.528 + // settings 1.529 + 1.530 + // analyse the string in order to get everything we need. 1.531 + const char *string = definition; 1.532 + CollatorSpec s; 1.533 + ucol_sit_initCollatorSpecs(&s); 1.534 + string = ucol_sit_readSpecs(&s, definition, parseError, status); 1.535 + ucol_sit_calculateWholeLocale(&s); 1.536 + 1.537 + char buffer[internalBufferSize]; 1.538 + uprv_memset(buffer, 0, internalBufferSize); 1.539 + uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 1.540 + 1.541 + UCollator *result = ucol_open(buffer, status); 1.542 + int32_t i = 0; 1.543 + 1.544 + for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 1.545 + if(s.options[i] != UCOL_DEFAULT) { 1.546 + if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) { 1.547 + ucol_setAttribute(result, (UColAttribute)i, s.options[i], status); 1.548 + } 1.549 + 1.550 + if(U_FAILURE(*status)) { 1.551 + parseError->offset = (int32_t)(string - definition); 1.552 + ucol_close(result); 1.553 + return NULL; 1.554 + } 1.555 + 1.556 + } 1.557 + } 1.558 + if(s.variableTopSet) { 1.559 + if(s.variableTopString[0]) { 1.560 + ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status); 1.561 + } else { // we set by value, using 'B' 1.562 + ucol_restoreVariableTop(result, s.variableTopValue, status); 1.563 + } 1.564 + } 1.565 + 1.566 + 1.567 + if(U_FAILURE(*status)) { // here it can only be a bogus value 1.568 + ucol_close(result); 1.569 + result = NULL; 1.570 + } 1.571 + 1.572 + UTRACE_EXIT_PTR_STATUS(result, *status); 1.573 + return result; 1.574 +} 1.575 + 1.576 + 1.577 +static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) 1.578 +{ 1.579 + if(len) { 1.580 + if(*resultSize) { 1.581 + if(*resultSize < capacity) { 1.582 + uprv_strcat(result, "_"); 1.583 + } 1.584 + (*resultSize)++; 1.585 + } 1.586 + *resultSize += len + 1; 1.587 + if(*resultSize < capacity) { 1.588 + uprv_strncat(result, &arg, 1); 1.589 + uprv_strncat(result, src, len); 1.590 + } 1.591 + } 1.592 +} 1.593 + 1.594 +U_CAPI int32_t U_EXPORT2 1.595 +ucol_getShortDefinitionString(const UCollator *coll, 1.596 + const char *locale, 1.597 + char *dst, 1.598 + int32_t capacity, 1.599 + UErrorCode *status) 1.600 +{ 1.601 + if(U_FAILURE(*status)) return 0; 1.602 + if(coll->delegate != NULL) { 1.603 + return ((icu::Collator*)coll->delegate)->internalGetShortDefinitionString(locale,dst,capacity,*status); 1.604 + } 1.605 + char buffer[internalBufferSize]; 1.606 + uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 1.607 + int32_t resultSize = 0; 1.608 + char tempbuff[internalBufferSize]; 1.609 + char locBuff[internalBufferSize]; 1.610 + uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 1.611 + int32_t elementSize = 0; 1.612 + UBool isAvailable = 0; 1.613 + CollatorSpec s; 1.614 + ucol_sit_initCollatorSpecs(&s); 1.615 + 1.616 + if(!locale) { 1.617 + locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status); 1.618 + } 1.619 + elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); 1.620 + 1.621 + if(elementSize) { 1.622 + // we should probably canonicalize here... 1.623 + elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); 1.624 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg); 1.625 + elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); 1.626 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg); 1.627 + elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); 1.628 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg); 1.629 + elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); 1.630 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg); 1.631 + elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); 1.632 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg); 1.633 + } 1.634 + 1.635 + int32_t i = 0; 1.636 + UColAttributeValue attribute = UCOL_DEFAULT; 1.637 + for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 1.638 + if(options[i].action == _processCollatorOption) { 1.639 + attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); 1.640 + if(attribute != UCOL_DEFAULT) { 1.641 + char letter = ucol_sit_attributeValueToLetter(attribute, status); 1.642 + appendShortStringElement(&letter, 1, 1.643 + buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart); 1.644 + } 1.645 + } 1.646 + } 1.647 + if(coll->variableTopValueisDefault == FALSE) { 1.648 + //s.variableTopValue = ucol_getVariableTop(coll, status); 1.649 + elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); 1.650 + appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); 1.651 + } 1.652 + 1.653 + UParseError parseError; 1.654 + return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); 1.655 +} 1.656 + 1.657 +U_CAPI int32_t U_EXPORT2 1.658 +ucol_normalizeShortDefinitionString(const char *definition, 1.659 + char *destination, 1.660 + int32_t capacity, 1.661 + UParseError *parseError, 1.662 + UErrorCode *status) 1.663 +{ 1.664 + 1.665 + if(U_FAILURE(*status)) { 1.666 + return 0; 1.667 + } 1.668 + 1.669 + if(destination) { 1.670 + uprv_memset(destination, 0, capacity*sizeof(char)); 1.671 + } 1.672 + 1.673 + UParseError pe; 1.674 + if(!parseError) { 1.675 + parseError = &pe; 1.676 + } 1.677 + 1.678 + // validate 1.679 + CollatorSpec s; 1.680 + ucol_sit_initCollatorSpecs(&s); 1.681 + ucol_sit_readSpecs(&s, definition, parseError, status); 1.682 + return ucol_sit_dumpSpecs(&s, destination, capacity, status); 1.683 +} 1.684 + 1.685 +U_CAPI UColAttributeValue U_EXPORT2 1.686 +ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) 1.687 +{ 1.688 + if(U_FAILURE(*status) || coll == NULL) { 1.689 + return UCOL_DEFAULT; 1.690 + } 1.691 + switch(attr) { 1.692 + case UCOL_NUMERIC_COLLATION: 1.693 + return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; 1.694 + case UCOL_HIRAGANA_QUATERNARY_MODE: 1.695 + return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; 1.696 + case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 1.697 + return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; 1.698 + case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 1.699 + return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; 1.700 + case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 1.701 + return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; 1.702 + case UCOL_CASE_LEVEL: /* do we have an extra case level */ 1.703 + return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; 1.704 + case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 1.705 + return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; 1.706 + case UCOL_STRENGTH: /* attribute for strength */ 1.707 + return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; 1.708 + case UCOL_ATTRIBUTE_COUNT: 1.709 + default: 1.710 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.711 +#ifdef UCOL_TRACE_SIT 1.712 + fprintf(stderr, "%s:%d: Unknown attr value '%d': %s\n", __FILE__, __LINE__, (int)attr, u_errorName(*status)); 1.713 +#endif 1.714 + break; 1.715 + } 1.716 + return UCOL_DEFAULT; 1.717 +} 1.718 + 1.719 + 1.720 +struct contContext { 1.721 + const UCollator *coll; 1.722 + USet *conts; 1.723 + USet *expansions; 1.724 + USet *removedContractions; 1.725 + UBool addPrefixes; 1.726 + UErrorCode *status; 1.727 +}; 1.728 + 1.729 + 1.730 + 1.731 +static void 1.732 +addSpecial(contContext *context, UChar *buffer, int32_t bufLen, 1.733 + uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) 1.734 +{ 1.735 + const UCollator *coll = context->coll; 1.736 + USet *contractions = context->conts; 1.737 + USet *expansions = context->expansions; 1.738 + UBool addPrefixes = context->addPrefixes; 1.739 + 1.740 + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 1.741 + uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.742 + // we might have a contraction that ends from previous level 1.743 + if(newCE != UCOL_NOT_FOUND) { 1.744 + if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { 1.745 + addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 1.746 + } 1.747 + if(contractions && rightIndex-leftIndex > 1) { 1.748 + uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 1.749 + if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { 1.750 + uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 1.751 + } 1.752 + } 1.753 + } 1.754 + 1.755 + UCharOffset++; 1.756 + // check whether we're doing contraction or prefix 1.757 + if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { 1.758 + if(leftIndex == 0) { 1.759 + *status = U_INTERNAL_PROGRAM_ERROR; 1.760 + return; 1.761 + } 1.762 + --leftIndex; 1.763 + while(*UCharOffset != 0xFFFF) { 1.764 + newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.765 + buffer[leftIndex] = *UCharOffset; 1.766 + if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 1.767 + addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 1.768 + } else { 1.769 + if(contractions) { 1.770 + uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 1.771 + } 1.772 + if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 1.773 + uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 1.774 + } 1.775 + } 1.776 + UCharOffset++; 1.777 + } 1.778 + } else if(getCETag(CE) == CONTRACTION_TAG) { 1.779 + if(rightIndex == bufLen-1) { 1.780 + *status = U_INTERNAL_PROGRAM_ERROR; 1.781 + return; 1.782 + } 1.783 + while(*UCharOffset != 0xFFFF) { 1.784 + newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 1.785 + buffer[rightIndex] = *UCharOffset; 1.786 + if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 1.787 + addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); 1.788 + } else { 1.789 + if(contractions) { 1.790 + uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); 1.791 + } 1.792 + if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 1.793 + uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); 1.794 + } 1.795 + } 1.796 + UCharOffset++; 1.797 + } 1.798 + } 1.799 + 1.800 +} 1.801 + 1.802 +U_CDECL_BEGIN 1.803 +static UBool U_CALLCONV 1.804 +_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) 1.805 +{ 1.806 + UErrorCode *status = ((contContext *)context)->status; 1.807 + USet *expansions = ((contContext *)context)->expansions; 1.808 + USet *removed = ((contContext *)context)->removedContractions; 1.809 + UBool addPrefixes = ((contContext *)context)->addPrefixes; 1.810 + UChar contraction[internalBufferSize]; 1.811 + if(isSpecial(CE)) { 1.812 + if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { 1.813 + while(start < limit && U_SUCCESS(*status)) { 1.814 + // if there are suppressed contractions, we don't 1.815 + // want to add them. 1.816 + if(removed && uset_contains(removed, start)) { 1.817 + start++; 1.818 + continue; 1.819 + } 1.820 + // we start our contraction from middle, since we don't know if it 1.821 + // will grow toward right or left 1.822 + contraction[internalBufferSize/2] = (UChar)start; 1.823 + addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); 1.824 + start++; 1.825 + } 1.826 + } else if(expansions && getCETag(CE) == EXPANSION_TAG) { 1.827 + while(start < limit && U_SUCCESS(*status)) { 1.828 + uset_add(expansions, start++); 1.829 + } 1.830 + } 1.831 + } 1.832 + if(U_FAILURE(*status)) { 1.833 + return FALSE; 1.834 + } else { 1.835 + return TRUE; 1.836 + } 1.837 +} 1.838 + 1.839 +U_CDECL_END 1.840 + 1.841 + 1.842 + 1.843 +/** 1.844 + * Get a set containing the contractions defined by the collator. The set includes 1.845 + * both the UCA contractions and the contractions defined by the collator 1.846 + * @param coll collator 1.847 + * @param conts the set to hold the result 1.848 + * @param status to hold the error code 1.849 + * @return the size of the contraction set 1.850 + */ 1.851 +U_CAPI int32_t U_EXPORT2 1.852 +ucol_getContractions( const UCollator *coll, 1.853 + USet *contractions, 1.854 + UErrorCode *status) 1.855 +{ 1.856 + ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1.857 + return uset_getItemCount(contractions); 1.858 +} 1.859 + 1.860 +/** 1.861 + * Get a set containing the expansions defined by the collator. The set includes 1.862 + * both the UCA expansions and the expansions defined by the tailoring 1.863 + * @param coll collator 1.864 + * @param conts the set to hold the result 1.865 + * @param addPrefixes add the prefix contextual elements to contractions 1.866 + * @param status to hold the error code 1.867 + * 1.868 + * @draft ICU 3.4 1.869 + */ 1.870 +U_CAPI void U_EXPORT2 1.871 +ucol_getContractionsAndExpansions( const UCollator *coll, 1.872 + USet *contractions, 1.873 + USet *expansions, 1.874 + UBool addPrefixes, 1.875 + UErrorCode *status) 1.876 +{ 1.877 + if(U_FAILURE(*status)) { 1.878 + return; 1.879 + } 1.880 + if(coll == NULL) { 1.881 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.882 + return; 1.883 + } 1.884 + 1.885 + if(contractions) { 1.886 + uset_clear(contractions); 1.887 + } 1.888 + if(expansions) { 1.889 + uset_clear(expansions); 1.890 + } 1.891 + int32_t rulesLen = 0; 1.892 + const UChar* rules = ucol_getRules(coll, &rulesLen); 1.893 + UColTokenParser src; 1.894 + ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, 1.895 + ucol_tok_getRulesFromBundle, NULL, status); 1.896 + 1.897 + contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; 1.898 + 1.899 + // Add the UCA contractions 1.900 + c.coll = coll->UCA; 1.901 + utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); 1.902 + 1.903 + // This is collator specific. Add contractions from a collator 1.904 + c.coll = coll; 1.905 + c.removedContractions = NULL; 1.906 + utrie_enum(&coll->mapping, NULL, _processSpecials, &c); 1.907 + ucol_tok_closeTokenList(&src); 1.908 +} 1.909 + 1.910 +U_CAPI int32_t U_EXPORT2 1.911 +ucol_getUnsafeSet( const UCollator *coll, 1.912 + USet *unsafe, 1.913 + UErrorCode *status) 1.914 +{ 1.915 + UChar buffer[internalBufferSize]; 1.916 + int32_t len = 0; 1.917 + 1.918 + uset_clear(unsafe); 1.919 + 1.920 + // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1.921 + static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1.922 + 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1.923 + 1.924 + // add chars that fail the fcd check 1.925 + uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1.926 + 1.927 + // add Thai/Lao prevowels 1.928 + uset_addRange(unsafe, 0xe40, 0xe44); 1.929 + uset_addRange(unsafe, 0xec0, 0xec4); 1.930 + // add lead/trail surrogates 1.931 + uset_addRange(unsafe, 0xd800, 0xdfff); 1.932 + 1.933 + USet *contractions = uset_open(0,0); 1.934 + 1.935 + int32_t i = 0, j = 0; 1.936 + int32_t contsSize = ucol_getContractions(coll, contractions, status); 1.937 + UChar32 c = 0; 1.938 + // Contraction set consists only of strings 1.939 + // to get unsafe code points, we need to 1.940 + // break the strings apart and add them to the unsafe set 1.941 + for(i = 0; i < contsSize; i++) { 1.942 + len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1.943 + if(len > 0) { 1.944 + j = 0; 1.945 + while(j < len) { 1.946 + U16_NEXT(buffer, j, len, c); 1.947 + if(j < len) { 1.948 + uset_add(unsafe, c); 1.949 + } 1.950 + } 1.951 + } 1.952 + } 1.953 + 1.954 + uset_close(contractions); 1.955 + 1.956 + return uset_size(unsafe); 1.957 +} 1.958 +#endif