1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/dumpce/dumpce.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1597 @@ 1.4 +/******************************************************************** 1.5 + * COPYRIGHT: 1.6 + * Copyright (C) 2001-2011 IBM, Inc. All Rights Reserved. 1.7 + * 1.8 + ********************************************************************/ 1.9 +/******************************************************************************** 1.10 +* 1.11 +* File dumpce.cpp 1.12 +* 1.13 +* Modification History: 1.14 +* Name Date Description 1.15 +* synwee May 31 2001 Creation 1.16 +* 1.17 +********************************************************************************* 1.18 +*/ 1.19 + 1.20 +/** 1.21 +* This program outputs the collation elements used for a requested tailoring. 1.22 +* 1.23 +* Usage: 1.24 +* dumpce options... please check main function. 1.25 +*/ 1.26 +#include <unicode/utypes.h> 1.27 +#include <unicode/ucol.h> 1.28 +#include <unicode/uloc.h> 1.29 +#include <unicode/ucoleitr.h> 1.30 +#include <unicode/uchar.h> 1.31 +#include <unicode/uscript.h> 1.32 +#include <unicode/utf16.h> 1.33 +#include <unicode/putil.h> 1.34 +#include <unicode/ustring.h> 1.35 +#include <stdio.h> 1.36 +#include <stdlib.h> 1.37 +#include <string.h> 1.38 +#include <time.h> 1.39 +#include "ucol_tok.h" 1.40 +#include "cstring.h" 1.41 +#include "uoptions.h" 1.42 +#include "ucol_imp.h" 1.43 +#include <unicode/ures.h> 1.44 +#include <unicode/uniset.h> 1.45 +#include <unicode/usetiter.h> 1.46 + 1.47 +/** 1.48 +* Command line option variables. 1.49 +* These global variables are set according to the options specified on the 1.50 +* command line by the user. 1.51 +*/ 1.52 +static UOption options[]={ 1.53 + /* 00 */ UOPTION_HELP_H, 1.54 + /* 01 */ UOPTION_HELP_QUESTION_MARK, 1.55 + /* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0}, 1.56 + /* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0}, 1.57 + /* 04 */ UOPTION_DESTDIR, 1.58 + /* 05 */ UOPTION_SOURCEDIR, 1.59 + /* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0}, 1.60 + /* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0}, 1.61 + /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0}, 1.62 + /* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0}, 1.63 + /* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0}, 1.64 + /* 11 */ UOPTION_VERBOSE, 1.65 + /* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0} 1.66 +}; 1.67 + 1.68 +/** 1.69 +* Collator used in this program 1.70 +*/ 1.71 +static UCollator *COLLATOR_; 1.72 +/** 1.73 +* Output strea, used in this program 1.74 +*/ 1.75 +static FILE *OUTPUT_; 1.76 + 1.77 +static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = { 1.78 + UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, 1.79 + UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, 1.80 +}; 1.81 + 1.82 +typedef struct { 1.83 + int value; 1.84 + char *name; 1.85 +} EnumNameValuePair; 1.86 + 1.87 +static const EnumNameValuePair ATTRIBUTE_NAME_[] = { 1.88 + {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"}, 1.89 + {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"}, 1.90 + {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"}, 1.91 + {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"}, 1.92 + {UCOL_NORMALIZATION_MODE, 1.93 + "UCOL_NORMALIZATION_MODE|UCOL_DECOMPOSITION_MODE"}, 1.94 + {UCOL_STRENGTH, "UCOL_STRENGTH"}, 1.95 + {UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"}, 1.96 + {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"}, 1.97 + NULL 1.98 +}; 1.99 + 1.100 +static const EnumNameValuePair ATTRIBUTE_VALUE_[] = { 1.101 + {UCOL_PRIMARY, "UCOL_PRIMARY"}, 1.102 + {UCOL_SECONDARY, "UCOL_SECONDARY"}, 1.103 + {UCOL_TERTIARY, "UCOL_TERTIARY|UCOL_DEFAULT_STRENGTH"}, 1.104 + {UCOL_QUATERNARY, "UCOL_QUATERNARY"}, 1.105 + {UCOL_IDENTICAL, "UCOL_IDENTICAL"}, 1.106 + {UCOL_OFF, "UCOL_OFF"}, 1.107 + {UCOL_ON, "UCOL_ON"}, 1.108 + {UCOL_SHIFTED, "UCOL_SHIFTED"}, 1.109 + {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"}, 1.110 + {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"}, 1.111 + {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"}, 1.112 + NULL 1.113 +}; 1.114 + 1.115 +typedef struct { 1.116 + UChar ch[32]; 1.117 + int count; // number of codepoint 1.118 + UBool tailored; 1.119 +} ScriptElement; 1.120 + 1.121 +/** 1.122 +* Writes the hexadecimal of a null-terminated array of codepoints into a 1.123 +* file 1.124 +* @param f UFILE instance to store 1.125 +* @param c codepoints array 1.126 +*/ 1.127 +void serialize(FILE *f, const UChar *c) 1.128 +{ 1.129 + UChar cp = *(c ++); 1.130 + 1.131 + fprintf(f, " %04x", cp); 1.132 + 1.133 + while (*c != 0) { 1.134 + cp = *(c ++); 1.135 + fprintf(f, " %04x", cp); 1.136 + } 1.137 +} 1.138 + 1.139 +/** 1.140 +* Writes the hexadecimal of a non-null-terminated array of codepoints into a 1.141 +* file 1.142 +* @param f UFILE instance to store 1.143 +* @param c codepoints array 1.144 +* @param l codepoints array length 1.145 +*/ 1.146 +void serialize(FILE *f, const UChar *c, int l) 1.147 +{ 1.148 + int count = 1; 1.149 + UChar cp = *(c ++); 1.150 + 1.151 + fprintf(f, " %04x", cp); 1.152 + 1.153 + while (count < l) { 1.154 + cp = *(c ++); 1.155 + fprintf(f, " %04x", cp); 1.156 + count ++; 1.157 + } 1.158 +} 1.159 + 1.160 +/** 1.161 +* Sets the iterator to the argument string and outputs the collation elements. 1.162 +* @param f file output stream 1.163 +* @param iter collation element iterator 1.164 +*/ 1.165 +void serialize(FILE *f, UCollationElements *iter) { 1.166 + const UChar *codepoint = iter->iteratordata_.string; 1.167 + // unlikely that sortkeys will be over this size 1.168 + uint8_t sortkey[64]; 1.169 + uint8_t *psortkey = sortkey; 1.170 + int sortkeylength = 0; 1.171 + 1.172 + if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) { 1.173 + serialize(f, codepoint, iter->iteratordata_.endp - codepoint); 1.174 + sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, 1.175 + iter->iteratordata_.endp - codepoint, sortkey, 64); 1.176 + } 1.177 + else { 1.178 + serialize(f, codepoint); 1.179 + sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, 1.180 + -1, sortkey, 64); 1.181 + } 1.182 + if (options[11].doesOccur) { 1.183 + serialize(stdout, codepoint); 1.184 + fprintf(stdout, "\n"); 1.185 + } 1.186 + 1.187 + fprintf(f, "; "); 1.188 + 1.189 + UErrorCode error = U_ZERO_ERROR; 1.190 + uint32_t ce = ucol_next(iter, &error); 1.191 + if (U_FAILURE(error)) { 1.192 + fprintf(f, "Error retrieving collation elements\n"); 1.193 + return; 1.194 + } 1.195 + 1.196 + while (TRUE) { 1.197 + fprintf(f, "["); 1.198 + if (UCOL_PRIMARYORDER(ce) != 0) { 1.199 + fprintf(f, "%04x", UCOL_PRIMARYORDER(ce)); 1.200 + } 1.201 + fprintf(f, ","); 1.202 + if (UCOL_SECONDARYORDER(ce) != 0) { 1.203 + fprintf(f, " %02x", UCOL_SECONDARYORDER(ce)); 1.204 + } 1.205 + fprintf(f, ","); 1.206 + if (UCOL_TERTIARYORDER(ce) != 0) { 1.207 + fprintf(f, " %02x", UCOL_TERTIARYORDER(ce)); 1.208 + } 1.209 + fprintf(f, "] "); 1.210 + 1.211 + ce = ucol_next(iter, &error); 1.212 + if (ce == UCOL_NULLORDER) { 1.213 + break; 1.214 + } 1.215 + if (U_FAILURE(error)) { 1.216 + fprintf(stdout, "Error retrieving collation elements"); 1.217 + return; 1.218 + } 1.219 + } 1.220 + 1.221 + if (sortkeylength > 64) { 1.222 + fprintf(f, "Sortkey exceeds pre-allocated size"); 1.223 + } 1.224 + 1.225 + fprintf(f, "["); 1.226 + while (TRUE) { 1.227 + fprintf(f, "%02x", *psortkey); 1.228 + psortkey ++; 1.229 + if ((*psortkey) == 0) { 1.230 + break; 1.231 + } 1.232 + fprintf(f, " "); 1.233 + } 1.234 + fprintf(f, "]\n"); 1.235 +} 1.236 + 1.237 +/** 1.238 +* Serializes the contraction within the given argument rule 1.239 +* @param f file output stream 1.240 +* @param r rule 1.241 +* @param rlen rule length 1.242 +* @param contractionsonly flag to indicate if only contractions are to be 1.243 +* output or all collation elements 1.244 +* @param iter iterator to iterate over collation elements 1.245 +*/ 1.246 +void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly, 1.247 + UCollationElements *iter) { 1.248 + const UChar *current = NULL; 1.249 + uint32_t strength = 0; 1.250 + uint32_t chOffset = 0; 1.251 + uint32_t chLen = 0; 1.252 + uint32_t exOffset = 0; 1.253 + uint32_t exLen = 0; 1.254 + uint32_t prefixOffset = 0; 1.255 + uint32_t prefixLen = 0; 1.256 + uint8_t specs = 0; 1.257 + UBool rstart = TRUE; 1.258 + UColTokenParser src; 1.259 + UColOptionSet opts; 1.260 + UParseError parseError; 1.261 + UErrorCode error = U_ZERO_ERROR; 1.262 + 1.263 + src.opts = &opts; 1.264 + 1.265 + src.source = rule; 1.266 + src.current = rule; 1.267 + src.end = rule + rlen; 1.268 + src.extraCurrent = src.end; 1.269 + src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1.270 + 1.271 + 1.272 + while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, 1.273 + &error)) != NULL) { 1.274 + chOffset = src.parsedToken.charsOffset; 1.275 + chLen = src.parsedToken.charsLen; 1.276 + // contractions handled here 1.277 + if (!contractiononly || chLen > 1) { 1.278 + ucol_setText(iter, rule + chOffset, chLen, &error); 1.279 + if (U_FAILURE(error)) { 1.280 + fprintf(stdout, "Error setting text in iterator\n"); 1.281 + return; 1.282 + } 1.283 + serialize(f, iter); 1.284 + } 1.285 + rstart = FALSE; 1.286 + } 1.287 +} 1.288 + 1.289 +/** 1.290 +* Prints the attribute values in the argument collator into the output stream 1.291 +* @param collator 1.292 +*/ 1.293 +void outputAttribute(UCollator *collator, UErrorCode *error) 1.294 +{ 1.295 + UColAttribute attribute = UCOL_FRENCH_COLLATION; 1.296 + while (attribute < UCOL_ATTRIBUTE_COUNT) { 1.297 + int count = 0; 1.298 + while (TRUE) { 1.299 + // getting attribute name 1.300 + if (ATTRIBUTE_NAME_[count].value == attribute) { 1.301 + fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name); 1.302 + break; 1.303 + } 1.304 + count ++; 1.305 + } 1.306 + count = 0; 1.307 + int attributeval = ucol_getAttribute(collator, attribute, error); 1.308 + if (U_FAILURE(*error)) { 1.309 + fprintf(stdout, "Failure in reading collator attribute\n"); 1.310 + return; 1.311 + } 1.312 + while (TRUE) { 1.313 + // getting attribute value 1.314 + if (ATTRIBUTE_VALUE_[count].value == attributeval) { 1.315 + fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name); 1.316 + break; 1.317 + } 1.318 + count ++; 1.319 + } 1.320 + attribute = (UColAttribute)(attribute + 1); 1.321 + } 1.322 +} 1.323 + 1.324 +/** 1.325 +* Prints the normalization mode in the argument collator into the output stream 1.326 +* @param collator 1.327 +*/ 1.328 +void outputNormalization(UCollator *collator) 1.329 +{ 1.330 + UErrorCode status = U_ZERO_ERROR; 1.331 + int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status); 1.332 + int count = 0; 1.333 + while (TRUE) { 1.334 + // getting attribute name 1.335 + if (ATTRIBUTE_VALUE_[count].value == normmode) { 1.336 + break; 1.337 + } 1.338 + count ++; 1.339 + } 1.340 + fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n", 1.341 + ATTRIBUTE_VALUE_[count].name); 1.342 +} 1.343 + 1.344 +/** 1.345 +* Output the collation element belonging to the locale into a file 1.346 +* @param locale string 1.347 +* @param fullrules flag to indicate if only tailored collation elements are to 1.348 +* be output or all collation elements 1.349 +*/ 1.350 +void serialize(const char *locale, UBool tailoredonly) { 1.351 + UErrorCode error = U_ZERO_ERROR; 1.352 + UChar str[128]; 1.353 + int strlen = 0; 1.354 + 1.355 + fprintf(OUTPUT_, "# This file contains the serialized collation elements\n"); 1.356 + fprintf(OUTPUT_, "# as of the collation version indicated below.\n"); 1.357 + fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n"); 1.358 + fprintf(OUTPUT_, "# where xxxx are codepoints in hexadecimals,\n"); 1.359 + fprintf(OUTPUT_, "# yyyyyyyy are the corresponding\n"); 1.360 + fprintf(OUTPUT_, "# collation elements in hexadecimals\n"); 1.361 + fprintf(OUTPUT_, "# and zz are the sortkey values in hexadecimals\n"); 1.362 + 1.363 + fprintf(OUTPUT_, "\n# Collator information\n"); 1.364 + 1.365 + fprintf(OUTPUT_, "\nLocale: %s\n", locale); 1.366 + fprintf(stdout, "Locale: %s\n", locale); 1.367 + UVersionInfo version; 1.368 + ucol_getVersion(COLLATOR_, version); 1.369 + fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n", 1.370 + version[0], version[1], version[2], version[3]); 1.371 + outputAttribute(COLLATOR_, &error); 1.372 + outputNormalization(COLLATOR_); 1.373 + 1.374 + UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen, 1.375 + &error); 1.376 + if (U_FAILURE(error)) { 1.377 + fprintf(stdout, "Error creating iterator\n"); 1.378 + return; 1.379 + } 1.380 + 1.381 + if (!tailoredonly) { 1.382 + fprintf(OUTPUT_, "\n# Range of unicode characters\n\n"); 1.383 + UChar32 codepoint = 0; 1.384 + while (codepoint <= UCHAR_MAX_VALUE) { 1.385 + if (u_isdefined(codepoint)) { 1.386 + strlen = 0; 1.387 + UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint); 1.388 + str[strlen] = 0; 1.389 + ucol_setText(iter, str, strlen, &error); 1.390 + if (U_FAILURE(error)) { 1.391 + fprintf(stdout, "Error setting text in iterator\n"); 1.392 + return; 1.393 + } 1.394 + serialize(OUTPUT_, iter); 1.395 + } 1.396 + codepoint ++; 1.397 + } 1.398 + } 1.399 + 1.400 + UChar ucarules[0x10000]; 1.401 + UChar *rules; 1.402 + int32_t rulelength = 0; 1.403 + rules = ucarules; 1.404 + 1.405 + if (tailoredonly) { 1.406 + int32_t rulelength = 0; 1.407 + const UChar *temp = ucol_getRules(COLLATOR_, &rulelength); 1.408 + if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { 1.409 + rules = (UChar *)malloc(sizeof(UChar) * 1.410 + (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); 1.411 + } 1.412 + memcpy(rules, temp, rulelength * sizeof(UChar)); 1.413 + rules[rulelength] = 0; 1.414 + fprintf(OUTPUT_, "\n# Tailorings\n\n"); 1.415 + serialize(OUTPUT_, rules, rulelength, FALSE, iter); 1.416 + if (rules != ucarules) { 1.417 + free(rules); 1.418 + } 1.419 + } 1.420 + else { 1.421 + rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules, 1.422 + 0x10000); 1.423 + if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { 1.424 + rules = (UChar *)malloc(sizeof(UChar) * 1.425 + (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); 1.426 + rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules, 1.427 + rulelength); 1.428 + } 1.429 + fprintf(OUTPUT_, "\n# Contractions\n\n"); 1.430 + serialize(OUTPUT_, rules, rulelength, TRUE, iter); 1.431 + if (rules != ucarules) { 1.432 + free(rules); 1.433 + } 1.434 + } 1.435 + 1.436 + ucol_closeElements(iter); 1.437 +} 1.438 + 1.439 +/** 1.440 +* Sets the collator with the attribute values 1.441 +* @param collator 1.442 +* @param error status 1.443 +*/ 1.444 +void setAttributes(UCollator *collator, UErrorCode *error) 1.445 +{ 1.446 + int count = 0; 1.447 + while (count < UCOL_ATTRIBUTE_COUNT) { 1.448 + if (ATTRIBUTE_[count] != UCOL_DEFAULT) { 1.449 + ucol_setAttribute(collator, (UColAttribute)count, 1.450 + ATTRIBUTE_[count], error); 1.451 + if (U_FAILURE(*error)) { 1.452 + return; 1.453 + } 1.454 + } 1.455 + count ++; 1.456 + } 1.457 +} 1.458 + 1.459 +/** 1.460 +* Appends directory path with an ending seperator if necessary. 1.461 +* @param path with enough space to append one seperator 1.462 +* @return new directory path length 1.463 +*/ 1.464 +int appendDirSeparator(char *dir) 1.465 +{ 1.466 + int dirlength = strlen(dir); 1.467 + char dirending = dir[dirlength - 1]; 1.468 + if (dirending != U_FILE_SEP_CHAR) { 1.469 + dir[dirlength] = U_FILE_SEP_CHAR; 1.470 + dir[dirlength + 1] = 0; 1.471 + return dirlength + 1; 1.472 + } 1.473 + return dirlength; 1.474 +} 1.475 + 1.476 +/** 1.477 +* Output the collation element into a file 1.478 +*/ 1.479 +void serialize() { 1.480 + char filename[128]; 1.481 + int dirlength = 0; 1.482 + 1.483 + if (options[4].doesOccur) { 1.484 + strcpy(filename, options[4].value); 1.485 + dirlength = appendDirSeparator(filename); 1.486 + } 1.487 + 1.488 + if (options[2].doesOccur) { 1.489 + const char *locale = (char *)options[2].value; 1.490 + int32_t localeindex = 0; 1.491 + 1.492 + if (strcmp(locale, "all") == 0) { 1.493 + if (options[4].doesOccur) { 1.494 + strcat(filename, "UCA.txt"); 1.495 + OUTPUT_ = fopen(filename, "w"); 1.496 + if (OUTPUT_ == NULL) { 1.497 + fprintf(stdout, "Cannot open file:%s\n", filename); 1.498 + return; 1.499 + } 1.500 + } 1.501 + fprintf(stdout, "UCA\n"); 1.502 + UErrorCode error = U_ZERO_ERROR; 1.503 + COLLATOR_ = ucol_open("en_US", &error); 1.504 + if (U_FAILURE(error)) { 1.505 + fprintf(stdout, "Collator creation failed:"); 1.506 + fprintf(stdout, u_errorName(error)); 1.507 + goto CLOSEUCA; 1.508 + return; 1.509 + } 1.510 + setAttributes(COLLATOR_, &error); 1.511 + if (U_FAILURE(error)) { 1.512 + fprintf(stdout, "Collator attribute setting failed:"); 1.513 + fprintf(stdout, u_errorName(error)); 1.514 + goto CLOSEUCA; 1.515 + return; 1.516 + } 1.517 + 1.518 + serialize("UCA", FALSE); 1.519 +CLOSEUCA : 1.520 + if (options[4].doesOccur) { 1.521 + filename[dirlength] = 0; 1.522 + fclose(OUTPUT_); 1.523 + } 1.524 + ucol_close(COLLATOR_); 1.525 + localeindex = ucol_countAvailable() - 1; 1.526 + fprintf(stdout, "Number of locales: %d\n", localeindex + 1); 1.527 + locale = ucol_getAvailable(localeindex); 1.528 + } 1.529 + 1.530 + while (TRUE) { 1.531 + UErrorCode error = U_ZERO_ERROR; 1.532 + COLLATOR_ = ucol_open(locale, &error); 1.533 + if (U_FAILURE(error)) { 1.534 + fprintf(stdout, "Collator creation failed:"); 1.535 + fprintf(stdout, u_errorName(error)); 1.536 + goto CLOSETAILOR; 1.537 + return; 1.538 + } 1.539 + setAttributes(COLLATOR_, &error); 1.540 + if (U_FAILURE(error)) { 1.541 + fprintf(stdout, "Collator attribute setting failed:"); 1.542 + fprintf(stdout, u_errorName(error)); 1.543 + goto CLOSETAILOR; 1.544 + return; 1.545 + } 1.546 + 1.547 + if (options[4].doesOccur) { 1.548 + strcat(filename, locale); 1.549 + strcat(filename, ".txt"); 1.550 + OUTPUT_ = fopen(filename, "w"); 1.551 + if (OUTPUT_ == NULL) { 1.552 + fprintf(stdout, "Cannot open file:%s\n", filename); 1.553 + return; 1.554 + } 1.555 + } 1.556 + 1.557 + if (options[3].doesOccur) { 1.558 + serialize(locale, TRUE); 1.559 + } 1.560 + 1.561 + ucol_close(COLLATOR_); 1.562 + 1.563 +CLOSETAILOR : 1.564 + if (options[4].doesOccur) { 1.565 + filename[dirlength] = 0; 1.566 + fclose(OUTPUT_); 1.567 + } 1.568 + 1.569 + localeindex --; 1.570 + if (localeindex < 0) { 1.571 + break; 1.572 + } 1.573 + locale = ucol_getAvailable(localeindex); 1.574 + } 1.575 + } 1.576 + 1.577 + if (options[7].doesOccur) { 1.578 + char inputfilename[128] = ""; 1.579 + // rules are to be used 1.580 + if (options[5].doesOccur) { 1.581 + strcpy(inputfilename, options[5].value); 1.582 + appendDirSeparator(inputfilename); 1.583 + } 1.584 + strcat(inputfilename, options[7].value); 1.585 + FILE *input = fopen(inputfilename, "r"); 1.586 + if (input == NULL) { 1.587 + fprintf(stdout, "Cannot open file:%s\n", filename); 1.588 + return; 1.589 + } 1.590 + 1.591 + char s[1024]; 1.592 + UChar rule[1024]; 1.593 + UChar *prule = rule; 1.594 + int size = 1024; 1.595 + // synwee TODO: make this part dynamic 1.596 + while (fscanf(input, "%[^\n]s", s) != EOF) { 1.597 + size -= u_unescape(s, prule, size); 1.598 + prule = prule + u_strlen(prule); 1.599 + } 1.600 + fclose(input); 1.601 + 1.602 + if (options[4].doesOccur) { 1.603 + strcat(filename, "Rules.txt"); 1.604 + OUTPUT_ = fopen(filename, "w"); 1.605 + if (OUTPUT_ == NULL) { 1.606 + fprintf(stdout, "Cannot open file:%s\n", filename); 1.607 + return; 1.608 + } 1.609 + } 1.610 + 1.611 + fprintf(stdout, "Rules\n"); 1.612 + UErrorCode error = U_ZERO_ERROR; 1.613 + UParseError parseError; 1.614 + COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 1.615 + UCOL_DEFAULT_STRENGTH, &parseError, &error); 1.616 + if (U_FAILURE(error)) { 1.617 + fprintf(stdout, "Collator creation failed:"); 1.618 + fprintf(stdout, u_errorName(error)); 1.619 + goto CLOSERULES; 1.620 + return; 1.621 + } 1.622 + setAttributes(COLLATOR_, &error); 1.623 + if (U_FAILURE(error)) { 1.624 + fprintf(stdout, "Collator attribute setting failed:"); 1.625 + fprintf(stdout, u_errorName(error)); 1.626 + goto CLOSERULES; 1.627 + return; 1.628 + } 1.629 + 1.630 + serialize("Rule-based", TRUE); 1.631 + ucol_close(COLLATOR_); 1.632 + 1.633 +CLOSERULES : 1.634 + if (options[4].doesOccur) { 1.635 + filename[dirlength] = 0; 1.636 + fclose(OUTPUT_); 1.637 + } 1.638 + } 1.639 +} 1.640 + 1.641 +/** 1.642 +* Parse for enum values. 1.643 +* Note this only works for positive enum values. 1.644 +* @param enumarray array containing names of the enum values in string and 1.645 +* their corresponding value. 1.646 +* declared enum value. 1.647 +* @param str string to be parsed 1.648 +* @return corresponding integer enum value or -1 if value is not found. 1.649 +*/ 1.650 +int parseEnums(const EnumNameValuePair enumarray[], const char *str) 1.651 +{ 1.652 + const char *enumname = enumarray[0].name; 1.653 + int result = atoi(str); 1.654 + if (result == 0 && str[0] != '0') { 1.655 + while (strcmp(enumname, str) != 0) { 1.656 + // checking for multiple enum names sharing the same values 1.657 + enumname = strstr(enumname, str); 1.658 + if (enumname != NULL) { 1.659 + int size = strchr(enumname, '|') - enumname; 1.660 + if (size < 0) { 1.661 + size = strlen(enumname); 1.662 + } 1.663 + if (size == (int)strlen(str)) { 1.664 + return enumarray[result].value; 1.665 + } 1.666 + } 1.667 + result ++; 1.668 + if (&(enumarray[result]) == NULL) { 1.669 + return -1; 1.670 + } 1.671 + enumname = enumarray[result].name; 1.672 + } 1.673 + } 1.674 + return -1; 1.675 +} 1.676 + 1.677 +/** 1.678 +* Parser for attribute name value pair 1.679 +*/ 1.680 +void parseAttributes() { 1.681 + char str[32]; 1.682 + const char *pname = options[6].value; 1.683 + const char *pend = options[6].value + strlen(options[6].value); 1.684 + const char *pvalue; 1.685 + 1.686 + while (pname < pend) { 1.687 + pvalue = strchr(pname, '='); 1.688 + if (pvalue == NULL) { 1.689 + fprintf(stdout, 1.690 + "No matching value found for attribute argument %s\n", 1.691 + pname); 1.692 + return; 1.693 + } 1.694 + int count = pvalue - pname; 1.695 + strncpy(str, pname, count); 1.696 + str[count] = 0; 1.697 + 1.698 + int name = parseEnums(ATTRIBUTE_NAME_, str); 1.699 + if (name == -1) { 1.700 + fprintf(stdout, "Attribute name not found: %s\n", str); 1.701 + return; 1.702 + } 1.703 + 1.704 + pvalue ++; 1.705 + // getting corresponding enum value 1.706 + pname = strchr(pvalue, ','); 1.707 + if (pname == NULL) { 1.708 + pname = pend; 1.709 + } 1.710 + count = pname - pvalue; 1.711 + strncpy(str, pvalue, count); 1.712 + str[count] = 0; 1.713 + int value = parseEnums(ATTRIBUTE_VALUE_, str); 1.714 + if (value == -1) { 1.715 + fprintf(stdout, "Attribute value not found: %s\n", str); 1.716 + return; 1.717 + } 1.718 + ATTRIBUTE_[name] = (UColAttributeValue)value; 1.719 + pname ++; 1.720 + } 1.721 +} 1.722 + 1.723 +/** 1.724 +* Checks if the locale argument is a base language 1.725 +* @param locale to be checked 1.726 +* @return TRUE if it is a base language 1.727 +*/ 1.728 +inline UBool checkLocaleForLanguage(const char *locale) 1.729 +{ 1.730 + return strlen(locale) <= 2; 1.731 +} 1.732 + 1.733 +/** 1.734 +* Converts a UChar array into its string form "xxxx xxxx" 1.735 +* @param ch array of UChar characters 1.736 +* @param count number of UChar characters 1.737 +*/ 1.738 +void outputUChar(UChar ch[], int count) 1.739 +{ 1.740 + for (int i = 0; i < count; i ++) { 1.741 + fprintf(OUTPUT_, "%04X ", ch[i]); 1.742 + } 1.743 +} 1.744 + 1.745 +/** 1.746 +* If it is a primary difference returns -1 or 1. 1.747 +* If it is a secondary difference returns -2 or 2. 1.748 +* If it is a tertiary difference returns -3 or 3. 1.749 +* If equals returns 0. 1.750 +*/ 1.751 +int compareSortKey(const void *elem1, const void *elem2) 1.752 +{ 1.753 + // compare the 2 script element sort key 1.754 + UChar *ch1 = ((ScriptElement *)elem1)->ch; 1.755 + UChar *ch2 = ((ScriptElement *)elem2)->ch; 1.756 + int size1 = ((ScriptElement *)elem1)->count; 1.757 + int size2 = ((ScriptElement *)elem2)->count; 1.758 + UErrorCode error = U_ZERO_ERROR; 1.759 + 1.760 + ucol_setStrength(COLLATOR_, UCOL_PRIMARY); 1.761 + int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); 1.762 + if (result == 0) { 1.763 + ucol_setStrength(COLLATOR_, UCOL_SECONDARY); 1.764 + result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); 1.765 + if (result == 0) { 1.766 + ucol_setStrength(COLLATOR_, UCOL_TERTIARY); 1.767 + result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); 1.768 + if (result < 0) { 1.769 + return -3; 1.770 + } 1.771 + if (result > 0) { 1.772 + return 3; 1.773 + } 1.774 + } 1.775 + if (result < 0) { 1.776 + return -2; 1.777 + } 1.778 + if (result > 0) { 1.779 + return 2; 1.780 + } 1.781 + } 1.782 + return result; 1.783 +} 1.784 + 1.785 +/** 1.786 +* Output serialized script elements 1.787 +* @param element the element to output 1.788 +* @param compare the comparison with the previous element 1.789 +* @param expansion flags TRUE if element has an expansion 1.790 +*/ 1.791 +void outputScriptElem(ScriptElement &element, int compare, UBool expansion) 1.792 +{ 1.793 + switch (compare) { 1.794 + case 0: 1.795 + if (expansion) { 1.796 + fprintf(OUTPUT_, "<tr><td class='eq' title='["); 1.797 + } 1.798 + else { 1.799 + fprintf(OUTPUT_, "<tr><td class='q' title='["); 1.800 + } 1.801 + break; 1.802 + case -1: 1.803 + if (expansion) { 1.804 + fprintf(OUTPUT_, "<tr><td class='ep' title='["); 1.805 + } 1.806 + else { 1.807 + fprintf(OUTPUT_, "<tr><td class='p' title='["); 1.808 + } 1.809 + break; 1.810 + case -2: 1.811 + if (expansion) { 1.812 + fprintf(OUTPUT_, "<tr><td class='es' title='["); 1.813 + } 1.814 + else { 1.815 + fprintf(OUTPUT_, "<tr><td class='s' title='["); 1.816 + } 1.817 + break; 1.818 + default: 1.819 + if (expansion) { 1.820 + fprintf(OUTPUT_, "<tr><td class='et' title='["); 1.821 + } 1.822 + else { 1.823 + fprintf(OUTPUT_, "<tr><td class='t' title='["); 1.824 + } 1.825 + } 1.826 + 1.827 + uint8_t sortkey[32]; 1.828 + ucol_setStrength(COLLATOR_, UCOL_TERTIARY); 1.829 + ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32); 1.830 + int i = 0; 1.831 + while (sortkey[i] != 0) { 1.832 + if (sortkey[i] == 1) { 1.833 + fprintf(OUTPUT_, " | "); 1.834 + } 1.835 + else { 1.836 + fprintf(OUTPUT_, "%02x", sortkey[i]); 1.837 + } 1.838 + 1.839 + i ++; 1.840 + } 1.841 + 1.842 + fprintf(OUTPUT_, "]'>"); 1.843 + 1.844 + UErrorCode error = U_ZERO_ERROR; 1.845 + char utf8[64]; 1.846 + UChar nfc[32]; 1.847 + int32_t length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc, 1.848 + 32, &error); 1.849 + if (U_FAILURE(error)) { 1.850 + fprintf(stdout, "Error normalizing contractions to NFC\n"); 1.851 + } 1.852 + u_strToUTF8(utf8, 64, &length, nfc, length, &error); 1.853 + if (U_FAILURE(error)) { 1.854 + fprintf(stdout, "Error converting UChar to utf8\n"); 1.855 + return; 1.856 + } 1.857 + 1.858 + fprintf(OUTPUT_, "%s<br>", utf8); 1.859 + fprintf(OUTPUT_, "<tt>"); 1.860 + outputUChar(element.ch, element.count); 1.861 + 1.862 + if (compare == 0) { 1.863 + fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td> </td><td>Q</td><td>"); 1.864 + } 1.865 + else if (compare == -1) { 1.866 + fprintf(OUTPUT_, "</tt></td><td>P</td><td> </td><td> </td><td> </td><td>"); 1.867 + } 1.868 + else if (compare == -2) { 1.869 + fprintf(OUTPUT_, "</tt></td><td> </td><td>S</td><td> </td><td> </td><td>"); 1.870 + } 1.871 + else if (compare == -3) { 1.872 + fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td>T</td><td> </td><td>"); 1.873 + } 1.874 + 1.875 + i = 0; 1.876 + while (i < element.count) { 1.877 + char str[128]; 1.878 + UChar32 codepoint; 1.879 + U16_NEXT(element.ch, i, element.count, codepoint); 1.880 + int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128, 1.881 + &error); 1.882 + if (U_FAILURE(error)) { 1.883 + fprintf(stdout, "Error getting character name\n"); 1.884 + return; 1.885 + } 1.886 + if (element.tailored) { 1.887 + fprintf(OUTPUT_, "<b>"); 1.888 + } 1.889 + fprintf(OUTPUT_, "%s", str); 1.890 + if (element.tailored) { 1.891 + fprintf(OUTPUT_, " *</b>"); 1.892 + } 1.893 + if (i < element.count) { 1.894 + fprintf(OUTPUT_, "<br>\n"); 1.895 + } 1.896 + } 1.897 + 1.898 + fprintf(OUTPUT_, "</td></tr>\n"); 1.899 +} 1.900 + 1.901 +/** 1.902 +* Checks if codepoint belongs to scripts 1.903 +* @param script list 1.904 +* @param scriptcount number of scripts 1.905 +* @param codepoint to test 1.906 +* @return TRUE if codepoint belongs to scripts 1.907 +*/ 1.908 +UBool checkInScripts(UScriptCode script[], int scriptcount, 1.909 + UChar32 codepoint) 1.910 +{ 1.911 + UErrorCode error = U_ZERO_ERROR; 1.912 + for (int i = 0; i < scriptcount; i ++) { 1.913 + if (script[i] == USCRIPT_HAN && options[10].doesOccur) { 1.914 + if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) || 1.915 + (codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) { 1.916 + // reduce han 1.917 + return TRUE; 1.918 + } 1.919 + } 1.920 + else if (uscript_getScript(codepoint, &error) == script[i]) { 1.921 + return TRUE; 1.922 + } 1.923 + if (U_FAILURE(error)) { 1.924 + fprintf(stdout, "Error checking character in scripts\n"); 1.925 + return FALSE; 1.926 + } 1.927 + } 1.928 + return FALSE; 1.929 +} 1.930 + 1.931 +/** 1.932 +* Checks if the set of codepoints belongs to the script 1.933 +* @param script list 1.934 +* @param scriptcount number of scripts 1.935 +* @param scriptelem 1.936 +* @return TRUE if all codepoints belongs to the script 1.937 +*/ 1.938 +inline UBool checkInScripts(UScriptCode script[], int scriptcount, 1.939 + ScriptElement scriptelem) 1.940 +{ 1.941 + int i = 0; 1.942 + while (i < scriptelem.count) { 1.943 + UChar32 codepoint; 1.944 + U16_NEXT(scriptelem.ch, i, scriptelem.count, codepoint); 1.945 + UErrorCode error = U_ZERO_ERROR; 1.946 + if (checkInScripts(script, scriptcount, codepoint)) { 1.947 + return TRUE; 1.948 + } 1.949 + } 1.950 + return FALSE; 1.951 +} 1.952 + 1.953 +/** 1.954 +* Gets the script elements and contractions belonging to the script 1.955 +* @param elems output list 1.956 +* @param locale locale 1.957 +* @return number of script elements 1.958 +* Add by Richard 1.959 +*/ 1.960 +int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) { 1.961 + UErrorCode error = U_ZERO_ERROR; 1.962 + UChar32 codepoint = 0; 1.963 + 1.964 + UResourceBundle* ures = ures_open(NULL, locale, &error); 1.965 + if (U_FAILURE(error)) { 1.966 + fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale); 1.967 + return -1; 1.968 + } 1.969 + int32_t length; 1.970 + const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error); 1.971 + 1.972 + if (U_FAILURE(error)) { 1.973 + fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n"); 1.974 + return -1; 1.975 + } 1.976 + 1.977 + UChar* upperChars = new UChar[length * 2]; 1.978 + if (upperChars == 0) { 1.979 + fprintf(stdout, "Memory error\n"); 1.980 + return -1; 1.981 + } 1.982 + 1.983 + int32_t destLength = u_strToUpper(upperChars, length * 2, exemplarChars, -1, locale, &error); 1.984 + if (U_FAILURE(error)) { 1.985 + fprintf(stdout, "Error when u_strToUpper() \n"); 1.986 + return -1; 1.987 + } 1.988 + 1.989 + UChar* pattern = new UChar[length + destLength + 10]; 1.990 + UChar left[2] = {0x005b, 0x0}; 1.991 + UChar right[2] = {0x005d, 0x0}; 1.992 + pattern = u_strcpy(pattern, left); 1.993 + pattern = u_strcat(pattern, exemplarChars); 1.994 + pattern = u_strcat(pattern, upperChars); 1.995 + pattern = u_strcat(pattern, right); 1.996 + 1.997 + UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error); 1.998 + if (U_FAILURE(error)) { 1.999 + fprintf(stdout, "Can not open USet \n"); 1.1000 + return -1; 1.1001 + } 1.1002 + 1.1003 + UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset); 1.1004 + 1.1005 + int32_t count = 0; 1.1006 + 1.1007 + while (usetiter -> next()) { 1.1008 + if (usetiter -> isString()) { 1.1009 + UnicodeString strItem = usetiter -> getString(); 1.1010 + 1.1011 + scriptelem[count].count = 0; 1.1012 + for (int i = 0; i < strItem.length(); i++) { 1.1013 + codepoint = strItem.char32At(i); 1.1014 + UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint); 1.1015 + scriptelem[count].tailored = FALSE; 1.1016 + } 1.1017 + } else { 1.1018 + codepoint = usetiter -> getCodepoint(); 1.1019 + scriptelem[count].count = 0; 1.1020 + UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint); 1.1021 + scriptelem[count].tailored = FALSE; 1.1022 + } 1.1023 + 1.1024 + count++; 1.1025 + } 1.1026 + delete []pattern; 1.1027 + 1.1028 + return count; 1.1029 +} 1.1030 + 1.1031 +/** 1.1032 +* Gets the script elements and contractions belonging to the script 1.1033 +* @param script list 1.1034 +* @param scriptcount number of scripts 1.1035 +* @param elems output list 1.1036 +* @return number of script elements 1.1037 +*/ 1.1038 +int getScriptElements(UScriptCode script[], int scriptcount, 1.1039 + ScriptElement scriptelem[]) 1.1040 +{ 1.1041 + UErrorCode error = U_ZERO_ERROR; 1.1042 + UChar32 codepoint = 0; 1.1043 + int count = 0; 1.1044 + while (codepoint <= UCHAR_MAX_VALUE) { 1.1045 + if (checkInScripts(script, scriptcount, codepoint)) { 1.1046 + scriptelem[count].count = 0; 1.1047 + UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, 1.1048 + scriptelem[count].count, codepoint); 1.1049 + scriptelem[count].tailored = FALSE; 1.1050 + count ++; 1.1051 + } 1.1052 + if (U_FAILURE(error)) { 1.1053 + fprintf(stdout, "Error determining codepoint in script\n"); 1.1054 + return -1; 1.1055 + } 1.1056 + codepoint ++; 1.1057 + } 1.1058 + 1.1059 + const UChar *current = NULL; 1.1060 + uint32_t strength = 0; 1.1061 + uint32_t chOffset = 0; 1.1062 + uint32_t chLen = 0; 1.1063 + uint32_t exOffset = 0; 1.1064 + uint32_t exLen = 0; 1.1065 + uint32_t prefixOffset = 0; 1.1066 + uint32_t prefixLen = 0; 1.1067 + uint8_t specs = 0; 1.1068 + UBool rstart = TRUE; 1.1069 + UColTokenParser src; 1.1070 + UColOptionSet opts; 1.1071 + UParseError parseError; 1.1072 + 1.1073 + int32_t rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0); 1.1074 + src.source = (UChar *)malloc(sizeof(UChar) * 1.1075 + (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); 1.1076 + rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source, 1.1077 + rulelength); 1.1078 + src.current = src.source; 1.1079 + src.end = src.source + rulelength; 1.1080 + src.extraCurrent = src.end; 1.1081 + src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1.1082 + src.opts = &opts; 1.1083 + 1.1084 + /* 1.1085 + ucol_tok_parseNextToken(&src, &strength, &chOffset, 1.1086 + &chLen, &exOffset, &exLen, 1.1087 + &prefixOffset, &prefixLen, 1.1088 + &specs, rstart, &parseError, 1.1089 + &error) 1.1090 + */ 1.1091 + while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, 1.1092 + &error)) != NULL) { 1.1093 + // contractions handled here 1.1094 + if (chLen > 1) { 1.1095 + u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen); 1.1096 + scriptelem[count].count = chLen; 1.1097 + if (checkInScripts(script, scriptcount, scriptelem[count])) { 1.1098 + scriptelem[count].tailored = FALSE; 1.1099 + count ++; 1.1100 + } 1.1101 + } 1.1102 + rstart = FALSE; 1.1103 + } 1.1104 + if (U_FAILURE(error)) { 1.1105 + fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error)); 1.1106 + } 1.1107 + // rule might have been reallocated, so delete this instead 1.1108 + free(src.source); 1.1109 + return count; 1.1110 +} 1.1111 + 1.1112 +int compareCodepoints(const void *elem1, const void *elem2) 1.1113 +{ 1.1114 + UChar *ch1 = ((ScriptElement *)elem1)->ch; // key 1.1115 + UChar *ch2 = ((ScriptElement *)elem2)->ch; 1.1116 + ch1[((ScriptElement *)elem1)->count] = 0; 1.1117 + ch2[((ScriptElement *)elem2)->count] = 0; 1.1118 + 1.1119 + // compare the 2 codepoints 1.1120 + return u_strcmp(ch1, ch2); 1.1121 +} 1.1122 + 1.1123 +UBool hasSubNFD(ScriptElement &se, ScriptElement &key) 1.1124 +{ 1.1125 + UChar *ch1 = se.ch; 1.1126 + UChar *ch2 = key.ch; // key 1.1127 + ch1[se.count] = 0; 1.1128 + ch2[key.count] = 0; 1.1129 + 1.1130 + // compare the 2 codepoints 1.1131 + if (u_strstr(ch1, ch2) != NULL) { 1.1132 + return TRUE; 1.1133 + } 1.1134 + 1.1135 + // check the decomposition 1.1136 + UChar norm[32]; 1.1137 + UErrorCode error = U_ZERO_ERROR; 1.1138 + int size = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32, 1.1139 + &error); 1.1140 + if (U_FAILURE(error)) { 1.1141 + fprintf(stdout, "Error normalizing\n"); 1.1142 + } 1.1143 + if (u_strstr(norm, ch2) != NULL) { 1.1144 + return TRUE; 1.1145 + } 1.1146 + return FALSE; 1.1147 +} 1.1148 + 1.1149 +/** 1.1150 +* Marks tailored elements 1.1151 +* @param script list 1.1152 +* @param scriptcount number of scripts 1.1153 +* @param scriptelem script element list 1.1154 +* @param scriptelemlength size of the script element list 1.1155 +*/ 1.1156 +void markTailored(UScriptCode script[], int scriptcount, 1.1157 + ScriptElement scriptelem[], int scriptelemlength) 1.1158 +{ 1.1159 + int32_t rulelength; 1.1160 + const UChar *rule = ucol_getRules(COLLATOR_, &rulelength); 1.1161 + 1.1162 + const UChar *current = NULL; 1.1163 + uint32_t strength = 0; 1.1164 + uint32_t chOffset = 0; 1.1165 + uint32_t chLen = 0; 1.1166 + uint32_t exOffset = 0; 1.1167 + uint32_t exLen = 0; 1.1168 + uint32_t prefixOffset = 0; 1.1169 + uint32_t prefixLen = 0; 1.1170 + uint8_t specs = 0; 1.1171 + UBool rstart = TRUE; 1.1172 + UColTokenParser src; 1.1173 + UColOptionSet opts; 1.1174 + UParseError parseError; 1.1175 + 1.1176 + src.opts = &opts; 1.1177 + src.source = (UChar *)malloc( 1.1178 + (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1.1179 + memcpy(src.source, rule, rulelength * sizeof(UChar)); 1.1180 + src.current = src.source; 1.1181 + src.end = (UChar *)src.source + rulelength; 1.1182 + src.extraCurrent = src.end; 1.1183 + src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1.1184 + 1.1185 + UErrorCode error = U_ZERO_ERROR; 1.1186 + 1.1187 + while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, 1.1188 + &error)) != NULL) { 1.1189 + if (chLen >= 1 && strength != UCOL_TOK_RESET) { 1.1190 + // skipping the reset characters and non useful stuff. 1.1191 + ScriptElement se; 1.1192 + u_strncpy(se.ch, src.source + chOffset, chLen); 1.1193 + se.count = chLen; 1.1194 + 1.1195 + if (checkInScripts(script, scriptcount, se)) { 1.1196 + /* 1.1197 + ScriptElement *tse = (ScriptElement *)bsearch(&se, scriptelem, 1.1198 + scriptelemlength, 1.1199 + sizeof(ScriptElement), 1.1200 + compareCodepoints); 1.1201 + */ 1.1202 + for (int i = 0; i < scriptelemlength; i ++) { 1.1203 + if (!scriptelem[i].tailored && 1.1204 + hasSubNFD(scriptelem[i], se)) { 1.1205 + scriptelem[i].tailored = TRUE; 1.1206 + } 1.1207 + } 1.1208 + } 1.1209 + } 1.1210 + rstart = FALSE; 1.1211 + } 1.1212 + free(src.source); 1.1213 + if (U_FAILURE(error)) { 1.1214 + fprintf(stdout, "Error parsing rules\n"); 1.1215 + } 1.1216 +} 1.1217 + 1.1218 +/** 1.1219 +* Checks if the collation iterator has more than 1 collation element 1.1220 +* @parem coleiter collation element iterator 1.1221 +* @return TRUE if collation iterator has more than 1 collation element 1.1222 +*/ 1.1223 +UBool hasExpansions(UCollationElements *coleiter) 1.1224 +{ 1.1225 + UErrorCode error = U_ZERO_ERROR; 1.1226 + int32_t ce = ucol_next(coleiter, &error); 1.1227 + int count = 0; 1.1228 + 1.1229 + if (U_FAILURE(error)) { 1.1230 + fprintf(stdout, "Error getting next collation element\n"); 1.1231 + } 1.1232 + while (ce != UCOL_NULLORDER) { 1.1233 + if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) { 1.1234 + count ++; 1.1235 + if (count == 2) { 1.1236 + return TRUE; 1.1237 + } 1.1238 + } 1.1239 + ce = ucol_next(coleiter, &error); 1.1240 + if (U_FAILURE(error)) { 1.1241 + fprintf(stdout, "Error getting next collation element\n"); 1.1242 + } 1.1243 + } 1.1244 + return FALSE; 1.1245 +} 1.1246 + 1.1247 +/** 1.1248 +* Prints the footer for index.html 1.1249 +* @param file output file 1.1250 +*/ 1.1251 +void outputHTMLFooter() 1.1252 +{ 1.1253 + fprintf(OUTPUT_, "</table>\n"); 1.1254 + fprintf(OUTPUT_, "</body>\n"); 1.1255 + fprintf(OUTPUT_, "</html>\n"); 1.1256 +} 1.1257 + 1.1258 +/** 1.1259 +* Serialize the codepoints from start to end into an html file. 1.1260 +* Arranging them into ascending collation order. 1.1261 +* @param script code list 1.1262 +* @param scriptcount number of scripts 1.1263 +*/ 1.1264 +//void serializeScripts(UScriptCode script[], int scriptcount) 1.1265 +//Richard 1.1266 +void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL) 1.1267 +{ 1.1268 + UErrorCode error = U_ZERO_ERROR; 1.1269 + 1.1270 + ScriptElement *scriptelem = 1.1271 + (ScriptElement *)malloc(sizeof(ScriptElement) * 0x20000); 1.1272 + if (scriptelem == NULL) { 1.1273 + fprintf(stdout, "Memory error\n"); 1.1274 + return; 1.1275 + } 1.1276 + int count = 0; 1.1277 + if(locale) { 1.1278 + count = getScriptElementsFromExemplars(scriptelem, locale); 1.1279 + } else { 1.1280 + count = getScriptElements(script, scriptcount, scriptelem); 1.1281 + } 1.1282 + 1.1283 + // Sort script elements using Quicksort algorithm: 1.1284 + qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints); 1.1285 + markTailored(script, scriptcount, scriptelem, count); 1.1286 + // Sort script elements using Quicksort algorithm: 1.1287 + qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey); 1.1288 + 1.1289 + UCollationElements* coleiter = ucol_openElements(COLLATOR_, 1.1290 + scriptelem[0].ch, 1.1291 + scriptelem[0].count, 1.1292 + &error); 1.1293 + if (U_FAILURE(error)) { 1.1294 + fprintf(stdout, "Error creating collation element iterator\n"); 1.1295 + return; 1.1296 + } 1.1297 + 1.1298 + outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter)); 1.1299 + for (int i = 0; i < count - 1; i ++) { 1.1300 + ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count, 1.1301 + &error); 1.1302 + if (U_FAILURE(error)) { 1.1303 + fprintf(stdout, "Error setting text in collation element iterator\n"); 1.1304 + return; 1.1305 + } 1.1306 + outputScriptElem(scriptelem[i + 1], 1.1307 + compareSortKey(scriptelem + i, scriptelem + i + 1), 1.1308 + hasExpansions(coleiter)); 1.1309 + } 1.1310 + free(scriptelem); 1.1311 + outputHTMLFooter(); 1.1312 +} 1.1313 + 1.1314 +/** 1.1315 +* Prints the header for the html 1.1316 +* @param locale name 1.1317 +* @param script 1.1318 +* @param scriptcount number of scripts 1.1319 +*/ 1.1320 +void outputHTMLHeader(const char *locale, UScriptCode script[], 1.1321 + int scriptcount) 1.1322 +{ 1.1323 + fprintf(OUTPUT_, "<html>\n"); 1.1324 + fprintf(OUTPUT_, "<head>\n"); 1.1325 + fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n"); 1.1326 + fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n"); 1.1327 + fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/css\">\n"); 1.1328 + fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n"); 1.1329 + fprintf(OUTPUT_, "<base target=\"main\">\n"); 1.1330 + fprintf(OUTPUT_, "</head>\n"); 1.1331 + 1.1332 + fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n"); 1.1333 + fprintf(OUTPUT_, "<!--\n"); 1.1334 + fprintf(OUTPUT_, "This file contains sorted characters in ascending order according to the locale stated\n"); 1.1335 + fprintf(OUTPUT_, "If the character is in red, it is tailored in the collation rules.\n"); 1.1336 + fprintf(OUTPUT_, "Background colours have certain meanings:\n"); 1.1337 + fprintf(OUTPUT_, "White - equals the previous character\n"); 1.1338 + fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n"); 1.1339 + fprintf(OUTPUT_, "blue - secondary greater than the previous character\n"); 1.1340 + fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\n"); 1.1341 + fprintf(OUTPUT_, "--!>\n"); 1.1342 + 1.1343 + fprintf(OUTPUT_, "<table border=0>\n"); 1.1344 + UChar displayname[64]; 1.1345 + UErrorCode error = U_ZERO_ERROR; 1.1346 + int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error); 1.1347 + char utf8displayname[128]; 1.1348 + if (U_FAILURE(error)) { 1.1349 + utf8displayname[0] = 0; 1.1350 + } 1.1351 + else { 1.1352 + int32_t utf8size = 0; 1.1353 + u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error); 1.1354 + } 1.1355 + 1.1356 + fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", utf8displayname); 1.1357 + fprintf(OUTPUT_, "<tr><th>Script(s)</th>"); 1.1358 + fprintf(OUTPUT_, "<td class='noborder'>"); 1.1359 + for (int i = 0; i < scriptcount; i ++) { 1.1360 + fprintf(OUTPUT_, "%s", uscript_getName(script[i])); 1.1361 + if (i + 1 != scriptcount) { 1.1362 + fprintf(OUTPUT_, ", "); 1.1363 + } 1.1364 + } 1.1365 + fprintf(OUTPUT_, "</td></tr>\n"); 1.1366 + 1.1367 + fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/source/data/coll/%s.txt\">%s.txt</a></td></tr>\n", locale, locale); 1.1368 + 1.1369 + UVersionInfo version; 1.1370 + ucol_getVersion(COLLATOR_, version); 1.1371 + fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d.%d</td></tr>\n", 1.1372 + version[0], version[1], version[2], version[3]); 1.1373 + 1.1374 + UColAttribute attr = UCOL_FRENCH_COLLATION; 1.1375 + while (attr < UCOL_ATTRIBUTE_COUNT) { 1.1376 + UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error); 1.1377 + if (U_FAILURE(error)) { 1.1378 + fprintf(stdout, "Error getting attribute\n"); 1.1379 + return; 1.1380 + } 1.1381 + if (value != UCOL_DEFAULT) { 1.1382 + if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) { 1.1383 + fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='noborder'>on, code %d</td></tr>\n", value); 1.1384 + } 1.1385 + if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) { 1.1386 + fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='noborder'>shifted, code%d</td></tr>\n", value); 1.1387 + } 1.1388 + if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) { 1.1389 + fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on, code %d</td></tr>\n", value); 1.1390 + } 1.1391 + if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) { 1.1392 + fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on, code %d</td></tr>\n", value); 1.1393 + } 1.1394 + if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) { 1.1395 + fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'>on, code %d</td></tr>\n", value); 1.1396 + } 1.1397 + if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) { 1.1398 + fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value); 1.1399 + } 1.1400 + if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) { 1.1401 + fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='noborder'>on, code %d</td></tr>\n", value); 1.1402 + } 1.1403 + } 1.1404 + attr = (UColAttribute)(attr + 1); 1.1405 + } 1.1406 + 1.1407 + // Get UNIX-style time and display as number and string. 1.1408 + time_t ltime; 1.1409 + time( <ime ); 1.1410 + fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></tr>", ctime(<ime)); 1.1411 + 1.1412 + fprintf(OUTPUT_, "</table>\n"); 1.1413 + 1.1414 + fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n"); 1.1415 + fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>Submit a bug</a></p>\n"); 1.1416 + fprintf(OUTPUT_, "\n<table>\n"); 1.1417 + fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>Q</th><th>Name</th></tr>\n"); 1.1418 +} 1.1419 + 1.1420 +/** 1.1421 +* Prints the header for index.html 1.1422 +* @param file output file 1.1423 +*/ 1.1424 +void outputListHTMLHeader(FILE *file) 1.1425 +{ 1.1426 + fprintf(file, "<html>\n"); 1.1427 + fprintf(file, "<head>\n"); 1.1428 + fprintf(file, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n"); 1.1429 + fprintf(file, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n"); 1.1430 + fprintf(file, "<title>ICU Collation Charts</title>\n"); 1.1431 + fprintf(file, "<base target=\"main\">\n"); 1.1432 + fprintf(file, "</head>\n"); 1.1433 + fprintf(file, "<body bgcolor=#FFFFFF>\n"); 1.1434 + fprintf(file, "<h2 align=center>ICU Collation Charts</h2>\n"); 1.1435 + fprintf(file, "<p align=center>\n"); 1.1436 + fprintf(file, "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>"); 1.1437 +} 1.1438 + 1.1439 +/** 1.1440 +* Prints the footer for index.html 1.1441 +* @param file output file 1.1442 +*/ 1.1443 +void outputListHTMLFooter(FILE *file) 1.1444 +{ 1.1445 + fprintf(file, "</p>\n"); 1.1446 + //fprintf(file, "<center><image src=http://oss.software.ibm.com/icu/images/w24.gif></center>\n"); 1.1447 + fprintf(file, "</body>\n"); 1.1448 + fprintf(file, "</html>\n"); 1.1449 +} 1.1450 + 1.1451 +/** 1.1452 +* Gets all scripts and serialize their codepoints into an html file. 1.1453 +*/ 1.1454 +void serializeScripts() { 1.1455 + char filename[128]; 1.1456 + int dirlength = 0; 1.1457 + 1.1458 + if (options[4].doesOccur) { 1.1459 + strcpy(filename, options[4].value); 1.1460 + dirlength = appendDirSeparator(filename); 1.1461 + } else { 1.1462 + filename[0] = 0; 1.1463 + } 1.1464 + 1.1465 + const char *locale; 1.1466 + int32_t localelist = 0; 1.1467 + int32_t localesize; 1.1468 + 1.1469 + localesize = ucol_countAvailable(); 1.1470 + locale = ucol_getAvailable(localelist); 1.1471 + 1.1472 + strcat(filename, "list.html"); 1.1473 + FILE *list = fopen(filename, "w"); 1.1474 + filename[dirlength] = 0; 1.1475 + if (list == NULL) { 1.1476 + fprintf(stdout, "Cannot open file: %s\n", filename); 1.1477 + return; 1.1478 + } 1.1479 + 1.1480 + outputListHTMLHeader(list); 1.1481 + fprintf(list, "<blockquote>\n"); 1.1482 + while (TRUE) { 1.1483 + UErrorCode error = U_ZERO_ERROR; 1.1484 + COLLATOR_ = ucol_open(locale, &error); 1.1485 + if (U_FAILURE(error)) { 1.1486 + fprintf(stdout, "Collator creation failed:"); 1.1487 + fprintf(stdout, u_errorName(error)); 1.1488 + break; 1.1489 + } 1.1490 + if ((error != U_USING_FALLBACK_WARNING && // not tailored 1.1491 + error != U_USING_DEFAULT_WARNING) || 1.1492 + checkLocaleForLanguage(locale)) { 1.1493 + fprintf(list, "<a href=%s.html>%s</a> ", locale, locale); 1.1494 + setAttributes(COLLATOR_, &error); 1.1495 + if (U_FAILURE(error)) { 1.1496 + fprintf(stdout, "Collator attribute setting failed:"); 1.1497 + fprintf(stdout, u_errorName(error)); 1.1498 + break; 1.1499 + } 1.1500 + 1.1501 + UScriptCode scriptcode[32]; 1.1502 + uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32, 1.1503 + &error); 1.1504 + if (U_FAILURE(error)) { 1.1505 + fprintf(stdout, "Error getting lcale scripts\n"); 1.1506 + break; 1.1507 + } 1.1508 + 1.1509 + strcat(filename, locale); 1.1510 + strcat(filename, ".html"); 1.1511 + OUTPUT_ = fopen(filename, "w"); 1.1512 + if (OUTPUT_ == NULL) { 1.1513 + fprintf(stdout, "Cannot open file:%s\n", filename); 1.1514 + break; 1.1515 + } 1.1516 + outputHTMLHeader(locale, scriptcode, scriptcount); 1.1517 + fprintf(stdout, "%s\n", locale); 1.1518 + 1.1519 + if(options[12].doesOccur) { 1.1520 + // use whole scripts 1.1521 + serializeScripts(scriptcode, scriptcount); 1.1522 + } else { 1.1523 + // use exemplar chars 1.1524 + serializeScripts(scriptcode, scriptcount, locale); 1.1525 + } 1.1526 + fclose(OUTPUT_); 1.1527 + } 1.1528 + ucol_close(COLLATOR_); 1.1529 + 1.1530 + filename[dirlength] = 0; 1.1531 + localelist ++; 1.1532 + if (localelist == localesize) { 1.1533 + break; 1.1534 + } 1.1535 + locale = ucol_getAvailable(localelist); 1.1536 + } 1.1537 + fprintf(list, "<br><a href=help.html>help</a><br>"); 1.1538 + fprintf(list, "</blockquote>\n"); 1.1539 + outputListHTMLFooter(list); 1.1540 + fclose(list); 1.1541 +} 1.1542 + 1.1543 +/** 1.1544 +* Main -- process command line, read in and pre-process the test file, 1.1545 +* call other functions to do the actual tests. 1.1546 +*/ 1.1547 +int main(int argc, char *argv[]) { 1.1548 + 1.1549 + argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), 1.1550 + options); 1.1551 + 1.1552 + // error handling, printing usage message 1.1553 + if (argc < 0) { 1.1554 + fprintf(stdout, "error in command line argument: "); 1.1555 + fprintf(stdout, argv[-argc]); 1.1556 + fprintf(stdout, "\n"); 1.1557 + } 1.1558 + if (argc < 0 || options[0].doesOccur || options[1].doesOccur) { 1.1559 + fprintf(stdout, "Usage: dumpce options...\n" 1.1560 + "--help\n" 1.1561 + " Display this message.\n" 1.1562 + "--locale name|all\n" 1.1563 + " ICU locale to use. Default is en_US\n" 1.1564 + "--serialize\n" 1.1565 + " Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n" 1.1566 + "--destdir dir_name\n" 1.1567 + " Path for outputing the serialized collation elements. Defaults to stdout if no defined\n" 1.1568 + "--sourcedir dir_name\n" 1.1569 + " Path for the input rule file for collation\n" 1.1570 + "--attribute name=value,name=value...\n" 1.1571 + " Pairs of attribute names and values for setting\n" 1.1572 + "--rule filename\n" 1.1573 + " Name of file containing the collation rules.\n" 1.1574 + "--normalizaton mode\n" 1.1575 + " UNormalizationMode mode to be used.\n" 1.1576 + "--scripts\n" 1.1577 + " Codepoints from all scripts are sorted and serialized.\n" 1.1578 + "--reducehan\n" 1.1579 + " Only 200 Han script characters will be displayed with the use of --scripts.\n" 1.1580 + "--wholescripts\n" 1.1581 + " Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n"); 1.1582 + 1.1583 + fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n"); 1.1584 + fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n"); 1.1585 + return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 1.1586 + } 1.1587 + 1.1588 + OUTPUT_ = stdout; 1.1589 + if (options[6].doesOccur) { 1.1590 + fprintf(stdout, "attributes %s\n", options[6].value); 1.1591 + parseAttributes(); 1.1592 + } 1.1593 + if (options[3].doesOccur) { 1.1594 + serialize(); 1.1595 + } 1.1596 + if (options[9].doesOccur) { 1.1597 + serializeScripts(); 1.1598 + } 1.1599 + return 0; 1.1600 +}