Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /******************************************************************** |
michael@0 | 2 | * COPYRIGHT: |
michael@0 | 3 | * Copyright (C) 2001-2011 IBM, Inc. All Rights Reserved. |
michael@0 | 4 | * |
michael@0 | 5 | ********************************************************************/ |
michael@0 | 6 | /******************************************************************************** |
michael@0 | 7 | * |
michael@0 | 8 | * File dumpce.cpp |
michael@0 | 9 | * |
michael@0 | 10 | * Modification History: |
michael@0 | 11 | * Name Date Description |
michael@0 | 12 | * synwee May 31 2001 Creation |
michael@0 | 13 | * |
michael@0 | 14 | ********************************************************************************* |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | /** |
michael@0 | 18 | * This program outputs the collation elements used for a requested tailoring. |
michael@0 | 19 | * |
michael@0 | 20 | * Usage: |
michael@0 | 21 | * dumpce options... please check main function. |
michael@0 | 22 | */ |
michael@0 | 23 | #include <unicode/utypes.h> |
michael@0 | 24 | #include <unicode/ucol.h> |
michael@0 | 25 | #include <unicode/uloc.h> |
michael@0 | 26 | #include <unicode/ucoleitr.h> |
michael@0 | 27 | #include <unicode/uchar.h> |
michael@0 | 28 | #include <unicode/uscript.h> |
michael@0 | 29 | #include <unicode/utf16.h> |
michael@0 | 30 | #include <unicode/putil.h> |
michael@0 | 31 | #include <unicode/ustring.h> |
michael@0 | 32 | #include <stdio.h> |
michael@0 | 33 | #include <stdlib.h> |
michael@0 | 34 | #include <string.h> |
michael@0 | 35 | #include <time.h> |
michael@0 | 36 | #include "ucol_tok.h" |
michael@0 | 37 | #include "cstring.h" |
michael@0 | 38 | #include "uoptions.h" |
michael@0 | 39 | #include "ucol_imp.h" |
michael@0 | 40 | #include <unicode/ures.h> |
michael@0 | 41 | #include <unicode/uniset.h> |
michael@0 | 42 | #include <unicode/usetiter.h> |
michael@0 | 43 | |
michael@0 | 44 | /** |
michael@0 | 45 | * Command line option variables. |
michael@0 | 46 | * These global variables are set according to the options specified on the |
michael@0 | 47 | * command line by the user. |
michael@0 | 48 | */ |
michael@0 | 49 | static UOption options[]={ |
michael@0 | 50 | /* 00 */ UOPTION_HELP_H, |
michael@0 | 51 | /* 01 */ UOPTION_HELP_QUESTION_MARK, |
michael@0 | 52 | /* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0}, |
michael@0 | 53 | /* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0}, |
michael@0 | 54 | /* 04 */ UOPTION_DESTDIR, |
michael@0 | 55 | /* 05 */ UOPTION_SOURCEDIR, |
michael@0 | 56 | /* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0}, |
michael@0 | 57 | /* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0}, |
michael@0 | 58 | /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0}, |
michael@0 | 59 | /* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0}, |
michael@0 | 60 | /* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0}, |
michael@0 | 61 | /* 11 */ UOPTION_VERBOSE, |
michael@0 | 62 | /* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0} |
michael@0 | 63 | }; |
michael@0 | 64 | |
michael@0 | 65 | /** |
michael@0 | 66 | * Collator used in this program |
michael@0 | 67 | */ |
michael@0 | 68 | static UCollator *COLLATOR_; |
/**
 * Output stream shared by the functions of this program.
 */
static FILE *OUTPUT_ = NULL;
michael@0 | 73 | |
michael@0 | 74 | static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = { |
michael@0 | 75 | UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, |
michael@0 | 76 | UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, |
michael@0 | 77 | }; |
michael@0 | 78 | |
/**
 * Associates an enum value with its printable name.
 * A name may hold several aliases for the same value separated by '|'.
 * The tables built from this type end with an entry whose name is NULL.
 */
typedef struct {
    int value;
    // the tables are initialised from string literals, which must not be
    // reachable through a pointer to non-const char
    const char *name;
} EnumNameValuePair;
michael@0 | 83 | |
michael@0 | 84 | static const EnumNameValuePair ATTRIBUTE_NAME_[] = { |
michael@0 | 85 | {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"}, |
michael@0 | 86 | {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"}, |
michael@0 | 87 | {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"}, |
michael@0 | 88 | {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"}, |
michael@0 | 89 | {UCOL_NORMALIZATION_MODE, |
michael@0 | 90 | "UCOL_NORMALIZATION_MODE|UCOL_DECOMPOSITION_MODE"}, |
michael@0 | 91 | {UCOL_STRENGTH, "UCOL_STRENGTH"}, |
michael@0 | 92 | {UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"}, |
michael@0 | 93 | {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"}, |
michael@0 | 94 | NULL |
michael@0 | 95 | }; |
michael@0 | 96 | |
michael@0 | 97 | static const EnumNameValuePair ATTRIBUTE_VALUE_[] = { |
michael@0 | 98 | {UCOL_PRIMARY, "UCOL_PRIMARY"}, |
michael@0 | 99 | {UCOL_SECONDARY, "UCOL_SECONDARY"}, |
michael@0 | 100 | {UCOL_TERTIARY, "UCOL_TERTIARY|UCOL_DEFAULT_STRENGTH"}, |
michael@0 | 101 | {UCOL_QUATERNARY, "UCOL_QUATERNARY"}, |
michael@0 | 102 | {UCOL_IDENTICAL, "UCOL_IDENTICAL"}, |
michael@0 | 103 | {UCOL_OFF, "UCOL_OFF"}, |
michael@0 | 104 | {UCOL_ON, "UCOL_ON"}, |
michael@0 | 105 | {UCOL_SHIFTED, "UCOL_SHIFTED"}, |
michael@0 | 106 | {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"}, |
michael@0 | 107 | {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"}, |
michael@0 | 108 | {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"}, |
michael@0 | 109 | NULL |
michael@0 | 110 | }; |
michael@0 | 111 | |
michael@0 | 112 | typedef struct { |
michael@0 | 113 | UChar ch[32]; |
michael@0 | 114 | int count; // number of codepoint |
michael@0 | 115 | UBool tailored; |
michael@0 | 116 | } ScriptElement; |
michael@0 | 117 | |
michael@0 | 118 | /** |
michael@0 | 119 | * Writes the hexadecimal of a null-terminated array of codepoints into a |
michael@0 | 120 | * file |
michael@0 | 121 | * @param f UFILE instance to store |
michael@0 | 122 | * @param c codepoints array |
michael@0 | 123 | */ |
michael@0 | 124 | void serialize(FILE *f, const UChar *c) |
michael@0 | 125 | { |
michael@0 | 126 | UChar cp = *(c ++); |
michael@0 | 127 | |
michael@0 | 128 | fprintf(f, " %04x", cp); |
michael@0 | 129 | |
michael@0 | 130 | while (*c != 0) { |
michael@0 | 131 | cp = *(c ++); |
michael@0 | 132 | fprintf(f, " %04x", cp); |
michael@0 | 133 | } |
michael@0 | 134 | } |
michael@0 | 135 | |
michael@0 | 136 | /** |
michael@0 | 137 | * Writes the hexadecimal of a non-null-terminated array of codepoints into a |
michael@0 | 138 | * file |
michael@0 | 139 | * @param f UFILE instance to store |
michael@0 | 140 | * @param c codepoints array |
michael@0 | 141 | * @param l codepoints array length |
michael@0 | 142 | */ |
michael@0 | 143 | void serialize(FILE *f, const UChar *c, int l) |
michael@0 | 144 | { |
michael@0 | 145 | int count = 1; |
michael@0 | 146 | UChar cp = *(c ++); |
michael@0 | 147 | |
michael@0 | 148 | fprintf(f, " %04x", cp); |
michael@0 | 149 | |
michael@0 | 150 | while (count < l) { |
michael@0 | 151 | cp = *(c ++); |
michael@0 | 152 | fprintf(f, " %04x", cp); |
michael@0 | 153 | count ++; |
michael@0 | 154 | } |
michael@0 | 155 | } |
michael@0 | 156 | |
michael@0 | 157 | /** |
michael@0 | 158 | * Sets the iterator to the argument string and outputs the collation elements. |
michael@0 | 159 | * @param f file output stream |
michael@0 | 160 | * @param iter collation element iterator |
michael@0 | 161 | */ |
michael@0 | 162 | void serialize(FILE *f, UCollationElements *iter) { |
michael@0 | 163 | const UChar *codepoint = iter->iteratordata_.string; |
michael@0 | 164 | // unlikely that sortkeys will be over this size |
michael@0 | 165 | uint8_t sortkey[64]; |
michael@0 | 166 | uint8_t *psortkey = sortkey; |
michael@0 | 167 | int sortkeylength = 0; |
michael@0 | 168 | |
michael@0 | 169 | if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) { |
michael@0 | 170 | serialize(f, codepoint, iter->iteratordata_.endp - codepoint); |
michael@0 | 171 | sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, |
michael@0 | 172 | iter->iteratordata_.endp - codepoint, sortkey, 64); |
michael@0 | 173 | } |
michael@0 | 174 | else { |
michael@0 | 175 | serialize(f, codepoint); |
michael@0 | 176 | sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, |
michael@0 | 177 | -1, sortkey, 64); |
michael@0 | 178 | } |
michael@0 | 179 | if (options[11].doesOccur) { |
michael@0 | 180 | serialize(stdout, codepoint); |
michael@0 | 181 | fprintf(stdout, "\n"); |
michael@0 | 182 | } |
michael@0 | 183 | |
michael@0 | 184 | fprintf(f, "; "); |
michael@0 | 185 | |
michael@0 | 186 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 187 | uint32_t ce = ucol_next(iter, &error); |
michael@0 | 188 | if (U_FAILURE(error)) { |
michael@0 | 189 | fprintf(f, "Error retrieving collation elements\n"); |
michael@0 | 190 | return; |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | while (TRUE) { |
michael@0 | 194 | fprintf(f, "["); |
michael@0 | 195 | if (UCOL_PRIMARYORDER(ce) != 0) { |
michael@0 | 196 | fprintf(f, "%04x", UCOL_PRIMARYORDER(ce)); |
michael@0 | 197 | } |
michael@0 | 198 | fprintf(f, ","); |
michael@0 | 199 | if (UCOL_SECONDARYORDER(ce) != 0) { |
michael@0 | 200 | fprintf(f, " %02x", UCOL_SECONDARYORDER(ce)); |
michael@0 | 201 | } |
michael@0 | 202 | fprintf(f, ","); |
michael@0 | 203 | if (UCOL_TERTIARYORDER(ce) != 0) { |
michael@0 | 204 | fprintf(f, " %02x", UCOL_TERTIARYORDER(ce)); |
michael@0 | 205 | } |
michael@0 | 206 | fprintf(f, "] "); |
michael@0 | 207 | |
michael@0 | 208 | ce = ucol_next(iter, &error); |
michael@0 | 209 | if (ce == UCOL_NULLORDER) { |
michael@0 | 210 | break; |
michael@0 | 211 | } |
michael@0 | 212 | if (U_FAILURE(error)) { |
michael@0 | 213 | fprintf(stdout, "Error retrieving collation elements"); |
michael@0 | 214 | return; |
michael@0 | 215 | } |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | if (sortkeylength > 64) { |
michael@0 | 219 | fprintf(f, "Sortkey exceeds pre-allocated size"); |
michael@0 | 220 | } |
michael@0 | 221 | |
michael@0 | 222 | fprintf(f, "["); |
michael@0 | 223 | while (TRUE) { |
michael@0 | 224 | fprintf(f, "%02x", *psortkey); |
michael@0 | 225 | psortkey ++; |
michael@0 | 226 | if ((*psortkey) == 0) { |
michael@0 | 227 | break; |
michael@0 | 228 | } |
michael@0 | 229 | fprintf(f, " "); |
michael@0 | 230 | } |
michael@0 | 231 | fprintf(f, "]\n"); |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | /** |
michael@0 | 235 | * Serializes the contraction within the given argument rule |
michael@0 | 236 | * @param f file output stream |
michael@0 | 237 | * @param r rule |
michael@0 | 238 | * @param rlen rule length |
michael@0 | 239 | * @param contractionsonly flag to indicate if only contractions are to be |
michael@0 | 240 | * output or all collation elements |
michael@0 | 241 | * @param iter iterator to iterate over collation elements |
michael@0 | 242 | */ |
michael@0 | 243 | void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly, |
michael@0 | 244 | UCollationElements *iter) { |
michael@0 | 245 | const UChar *current = NULL; |
michael@0 | 246 | uint32_t strength = 0; |
michael@0 | 247 | uint32_t chOffset = 0; |
michael@0 | 248 | uint32_t chLen = 0; |
michael@0 | 249 | uint32_t exOffset = 0; |
michael@0 | 250 | uint32_t exLen = 0; |
michael@0 | 251 | uint32_t prefixOffset = 0; |
michael@0 | 252 | uint32_t prefixLen = 0; |
michael@0 | 253 | uint8_t specs = 0; |
michael@0 | 254 | UBool rstart = TRUE; |
michael@0 | 255 | UColTokenParser src; |
michael@0 | 256 | UColOptionSet opts; |
michael@0 | 257 | UParseError parseError; |
michael@0 | 258 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 259 | |
michael@0 | 260 | src.opts = &opts; |
michael@0 | 261 | |
michael@0 | 262 | src.source = rule; |
michael@0 | 263 | src.current = rule; |
michael@0 | 264 | src.end = rule + rlen; |
michael@0 | 265 | src.extraCurrent = src.end; |
michael@0 | 266 | src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
michael@0 | 267 | |
michael@0 | 268 | |
michael@0 | 269 | while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, |
michael@0 | 270 | &error)) != NULL) { |
michael@0 | 271 | chOffset = src.parsedToken.charsOffset; |
michael@0 | 272 | chLen = src.parsedToken.charsLen; |
michael@0 | 273 | // contractions handled here |
michael@0 | 274 | if (!contractiononly || chLen > 1) { |
michael@0 | 275 | ucol_setText(iter, rule + chOffset, chLen, &error); |
michael@0 | 276 | if (U_FAILURE(error)) { |
michael@0 | 277 | fprintf(stdout, "Error setting text in iterator\n"); |
michael@0 | 278 | return; |
michael@0 | 279 | } |
michael@0 | 280 | serialize(f, iter); |
michael@0 | 281 | } |
michael@0 | 282 | rstart = FALSE; |
michael@0 | 283 | } |
michael@0 | 284 | } |
michael@0 | 285 | |
michael@0 | 286 | /** |
michael@0 | 287 | * Prints the attribute values in the argument collator into the output stream |
michael@0 | 288 | * @param collator |
michael@0 | 289 | */ |
michael@0 | 290 | void outputAttribute(UCollator *collator, UErrorCode *error) |
michael@0 | 291 | { |
michael@0 | 292 | UColAttribute attribute = UCOL_FRENCH_COLLATION; |
michael@0 | 293 | while (attribute < UCOL_ATTRIBUTE_COUNT) { |
michael@0 | 294 | int count = 0; |
michael@0 | 295 | while (TRUE) { |
michael@0 | 296 | // getting attribute name |
michael@0 | 297 | if (ATTRIBUTE_NAME_[count].value == attribute) { |
michael@0 | 298 | fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name); |
michael@0 | 299 | break; |
michael@0 | 300 | } |
michael@0 | 301 | count ++; |
michael@0 | 302 | } |
michael@0 | 303 | count = 0; |
michael@0 | 304 | int attributeval = ucol_getAttribute(collator, attribute, error); |
michael@0 | 305 | if (U_FAILURE(*error)) { |
michael@0 | 306 | fprintf(stdout, "Failure in reading collator attribute\n"); |
michael@0 | 307 | return; |
michael@0 | 308 | } |
michael@0 | 309 | while (TRUE) { |
michael@0 | 310 | // getting attribute value |
michael@0 | 311 | if (ATTRIBUTE_VALUE_[count].value == attributeval) { |
michael@0 | 312 | fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name); |
michael@0 | 313 | break; |
michael@0 | 314 | } |
michael@0 | 315 | count ++; |
michael@0 | 316 | } |
michael@0 | 317 | attribute = (UColAttribute)(attribute + 1); |
michael@0 | 318 | } |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | /** |
michael@0 | 322 | * Prints the normalization mode in the argument collator into the output stream |
michael@0 | 323 | * @param collator |
michael@0 | 324 | */ |
michael@0 | 325 | void outputNormalization(UCollator *collator) |
michael@0 | 326 | { |
michael@0 | 327 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 328 | int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status); |
michael@0 | 329 | int count = 0; |
michael@0 | 330 | while (TRUE) { |
michael@0 | 331 | // getting attribute name |
michael@0 | 332 | if (ATTRIBUTE_VALUE_[count].value == normmode) { |
michael@0 | 333 | break; |
michael@0 | 334 | } |
michael@0 | 335 | count ++; |
michael@0 | 336 | } |
michael@0 | 337 | fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n", |
michael@0 | 338 | ATTRIBUTE_VALUE_[count].name); |
michael@0 | 339 | } |
michael@0 | 340 | |
michael@0 | 341 | /** |
michael@0 | 342 | * Output the collation element belonging to the locale into a file |
michael@0 | 343 | * @param locale string |
michael@0 | 344 | * @param fullrules flag to indicate if only tailored collation elements are to |
michael@0 | 345 | * be output or all collation elements |
michael@0 | 346 | */ |
michael@0 | 347 | void serialize(const char *locale, UBool tailoredonly) { |
michael@0 | 348 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 349 | UChar str[128]; |
michael@0 | 350 | int strlen = 0; |
michael@0 | 351 | |
michael@0 | 352 | fprintf(OUTPUT_, "# This file contains the serialized collation elements\n"); |
michael@0 | 353 | fprintf(OUTPUT_, "# as of the collation version indicated below.\n"); |
michael@0 | 354 | fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n"); |
michael@0 | 355 | fprintf(OUTPUT_, "# where xxxx are codepoints in hexadecimals,\n"); |
michael@0 | 356 | fprintf(OUTPUT_, "# yyyyyyyy are the corresponding\n"); |
michael@0 | 357 | fprintf(OUTPUT_, "# collation elements in hexadecimals\n"); |
michael@0 | 358 | fprintf(OUTPUT_, "# and zz are the sortkey values in hexadecimals\n"); |
michael@0 | 359 | |
michael@0 | 360 | fprintf(OUTPUT_, "\n# Collator information\n"); |
michael@0 | 361 | |
michael@0 | 362 | fprintf(OUTPUT_, "\nLocale: %s\n", locale); |
michael@0 | 363 | fprintf(stdout, "Locale: %s\n", locale); |
michael@0 | 364 | UVersionInfo version; |
michael@0 | 365 | ucol_getVersion(COLLATOR_, version); |
michael@0 | 366 | fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n", |
michael@0 | 367 | version[0], version[1], version[2], version[3]); |
michael@0 | 368 | outputAttribute(COLLATOR_, &error); |
michael@0 | 369 | outputNormalization(COLLATOR_); |
michael@0 | 370 | |
michael@0 | 371 | UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen, |
michael@0 | 372 | &error); |
michael@0 | 373 | if (U_FAILURE(error)) { |
michael@0 | 374 | fprintf(stdout, "Error creating iterator\n"); |
michael@0 | 375 | return; |
michael@0 | 376 | } |
michael@0 | 377 | |
michael@0 | 378 | if (!tailoredonly) { |
michael@0 | 379 | fprintf(OUTPUT_, "\n# Range of unicode characters\n\n"); |
michael@0 | 380 | UChar32 codepoint = 0; |
michael@0 | 381 | while (codepoint <= UCHAR_MAX_VALUE) { |
michael@0 | 382 | if (u_isdefined(codepoint)) { |
michael@0 | 383 | strlen = 0; |
michael@0 | 384 | UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint); |
michael@0 | 385 | str[strlen] = 0; |
michael@0 | 386 | ucol_setText(iter, str, strlen, &error); |
michael@0 | 387 | if (U_FAILURE(error)) { |
michael@0 | 388 | fprintf(stdout, "Error setting text in iterator\n"); |
michael@0 | 389 | return; |
michael@0 | 390 | } |
michael@0 | 391 | serialize(OUTPUT_, iter); |
michael@0 | 392 | } |
michael@0 | 393 | codepoint ++; |
michael@0 | 394 | } |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | UChar ucarules[0x10000]; |
michael@0 | 398 | UChar *rules; |
michael@0 | 399 | int32_t rulelength = 0; |
michael@0 | 400 | rules = ucarules; |
michael@0 | 401 | |
michael@0 | 402 | if (tailoredonly) { |
michael@0 | 403 | int32_t rulelength = 0; |
michael@0 | 404 | const UChar *temp = ucol_getRules(COLLATOR_, &rulelength); |
michael@0 | 405 | if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { |
michael@0 | 406 | rules = (UChar *)malloc(sizeof(UChar) * |
michael@0 | 407 | (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); |
michael@0 | 408 | } |
michael@0 | 409 | memcpy(rules, temp, rulelength * sizeof(UChar)); |
michael@0 | 410 | rules[rulelength] = 0; |
michael@0 | 411 | fprintf(OUTPUT_, "\n# Tailorings\n\n"); |
michael@0 | 412 | serialize(OUTPUT_, rules, rulelength, FALSE, iter); |
michael@0 | 413 | if (rules != ucarules) { |
michael@0 | 414 | free(rules); |
michael@0 | 415 | } |
michael@0 | 416 | } |
michael@0 | 417 | else { |
michael@0 | 418 | rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules, |
michael@0 | 419 | 0x10000); |
michael@0 | 420 | if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { |
michael@0 | 421 | rules = (UChar *)malloc(sizeof(UChar) * |
michael@0 | 422 | (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); |
michael@0 | 423 | rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules, |
michael@0 | 424 | rulelength); |
michael@0 | 425 | } |
michael@0 | 426 | fprintf(OUTPUT_, "\n# Contractions\n\n"); |
michael@0 | 427 | serialize(OUTPUT_, rules, rulelength, TRUE, iter); |
michael@0 | 428 | if (rules != ucarules) { |
michael@0 | 429 | free(rules); |
michael@0 | 430 | } |
michael@0 | 431 | } |
michael@0 | 432 | |
michael@0 | 433 | ucol_closeElements(iter); |
michael@0 | 434 | } |
michael@0 | 435 | |
michael@0 | 436 | /** |
michael@0 | 437 | * Sets the collator with the attribute values |
michael@0 | 438 | * @param collator |
michael@0 | 439 | * @param error status |
michael@0 | 440 | */ |
michael@0 | 441 | void setAttributes(UCollator *collator, UErrorCode *error) |
michael@0 | 442 | { |
michael@0 | 443 | int count = 0; |
michael@0 | 444 | while (count < UCOL_ATTRIBUTE_COUNT) { |
michael@0 | 445 | if (ATTRIBUTE_[count] != UCOL_DEFAULT) { |
michael@0 | 446 | ucol_setAttribute(collator, (UColAttribute)count, |
michael@0 | 447 | ATTRIBUTE_[count], error); |
michael@0 | 448 | if (U_FAILURE(*error)) { |
michael@0 | 449 | return; |
michael@0 | 450 | } |
michael@0 | 451 | } |
michael@0 | 452 | count ++; |
michael@0 | 453 | } |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | /** |
michael@0 | 457 | * Appends directory path with an ending seperator if necessary. |
michael@0 | 458 | * @param path with enough space to append one seperator |
michael@0 | 459 | * @return new directory path length |
michael@0 | 460 | */ |
michael@0 | 461 | int appendDirSeparator(char *dir) |
michael@0 | 462 | { |
michael@0 | 463 | int dirlength = strlen(dir); |
michael@0 | 464 | char dirending = dir[dirlength - 1]; |
michael@0 | 465 | if (dirending != U_FILE_SEP_CHAR) { |
michael@0 | 466 | dir[dirlength] = U_FILE_SEP_CHAR; |
michael@0 | 467 | dir[dirlength + 1] = 0; |
michael@0 | 468 | return dirlength + 1; |
michael@0 | 469 | } |
michael@0 | 470 | return dirlength; |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | /** |
michael@0 | 474 | * Output the collation element into a file |
michael@0 | 475 | */ |
michael@0 | 476 | void serialize() { |
michael@0 | 477 | char filename[128]; |
michael@0 | 478 | int dirlength = 0; |
michael@0 | 479 | |
michael@0 | 480 | if (options[4].doesOccur) { |
michael@0 | 481 | strcpy(filename, options[4].value); |
michael@0 | 482 | dirlength = appendDirSeparator(filename); |
michael@0 | 483 | } |
michael@0 | 484 | |
michael@0 | 485 | if (options[2].doesOccur) { |
michael@0 | 486 | const char *locale = (char *)options[2].value; |
michael@0 | 487 | int32_t localeindex = 0; |
michael@0 | 488 | |
michael@0 | 489 | if (strcmp(locale, "all") == 0) { |
michael@0 | 490 | if (options[4].doesOccur) { |
michael@0 | 491 | strcat(filename, "UCA.txt"); |
michael@0 | 492 | OUTPUT_ = fopen(filename, "w"); |
michael@0 | 493 | if (OUTPUT_ == NULL) { |
michael@0 | 494 | fprintf(stdout, "Cannot open file:%s\n", filename); |
michael@0 | 495 | return; |
michael@0 | 496 | } |
michael@0 | 497 | } |
michael@0 | 498 | fprintf(stdout, "UCA\n"); |
michael@0 | 499 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 500 | COLLATOR_ = ucol_open("en_US", &error); |
michael@0 | 501 | if (U_FAILURE(error)) { |
michael@0 | 502 | fprintf(stdout, "Collator creation failed:"); |
michael@0 | 503 | fprintf(stdout, u_errorName(error)); |
michael@0 | 504 | goto CLOSEUCA; |
michael@0 | 505 | return; |
michael@0 | 506 | } |
michael@0 | 507 | setAttributes(COLLATOR_, &error); |
michael@0 | 508 | if (U_FAILURE(error)) { |
michael@0 | 509 | fprintf(stdout, "Collator attribute setting failed:"); |
michael@0 | 510 | fprintf(stdout, u_errorName(error)); |
michael@0 | 511 | goto CLOSEUCA; |
michael@0 | 512 | return; |
michael@0 | 513 | } |
michael@0 | 514 | |
michael@0 | 515 | serialize("UCA", FALSE); |
michael@0 | 516 | CLOSEUCA : |
michael@0 | 517 | if (options[4].doesOccur) { |
michael@0 | 518 | filename[dirlength] = 0; |
michael@0 | 519 | fclose(OUTPUT_); |
michael@0 | 520 | } |
michael@0 | 521 | ucol_close(COLLATOR_); |
michael@0 | 522 | localeindex = ucol_countAvailable() - 1; |
michael@0 | 523 | fprintf(stdout, "Number of locales: %d\n", localeindex + 1); |
michael@0 | 524 | locale = ucol_getAvailable(localeindex); |
michael@0 | 525 | } |
michael@0 | 526 | |
michael@0 | 527 | while (TRUE) { |
michael@0 | 528 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 529 | COLLATOR_ = ucol_open(locale, &error); |
michael@0 | 530 | if (U_FAILURE(error)) { |
michael@0 | 531 | fprintf(stdout, "Collator creation failed:"); |
michael@0 | 532 | fprintf(stdout, u_errorName(error)); |
michael@0 | 533 | goto CLOSETAILOR; |
michael@0 | 534 | return; |
michael@0 | 535 | } |
michael@0 | 536 | setAttributes(COLLATOR_, &error); |
michael@0 | 537 | if (U_FAILURE(error)) { |
michael@0 | 538 | fprintf(stdout, "Collator attribute setting failed:"); |
michael@0 | 539 | fprintf(stdout, u_errorName(error)); |
michael@0 | 540 | goto CLOSETAILOR; |
michael@0 | 541 | return; |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | if (options[4].doesOccur) { |
michael@0 | 545 | strcat(filename, locale); |
michael@0 | 546 | strcat(filename, ".txt"); |
michael@0 | 547 | OUTPUT_ = fopen(filename, "w"); |
michael@0 | 548 | if (OUTPUT_ == NULL) { |
michael@0 | 549 | fprintf(stdout, "Cannot open file:%s\n", filename); |
michael@0 | 550 | return; |
michael@0 | 551 | } |
michael@0 | 552 | } |
michael@0 | 553 | |
michael@0 | 554 | if (options[3].doesOccur) { |
michael@0 | 555 | serialize(locale, TRUE); |
michael@0 | 556 | } |
michael@0 | 557 | |
michael@0 | 558 | ucol_close(COLLATOR_); |
michael@0 | 559 | |
michael@0 | 560 | CLOSETAILOR : |
michael@0 | 561 | if (options[4].doesOccur) { |
michael@0 | 562 | filename[dirlength] = 0; |
michael@0 | 563 | fclose(OUTPUT_); |
michael@0 | 564 | } |
michael@0 | 565 | |
michael@0 | 566 | localeindex --; |
michael@0 | 567 | if (localeindex < 0) { |
michael@0 | 568 | break; |
michael@0 | 569 | } |
michael@0 | 570 | locale = ucol_getAvailable(localeindex); |
michael@0 | 571 | } |
michael@0 | 572 | } |
michael@0 | 573 | |
michael@0 | 574 | if (options[7].doesOccur) { |
michael@0 | 575 | char inputfilename[128] = ""; |
michael@0 | 576 | // rules are to be used |
michael@0 | 577 | if (options[5].doesOccur) { |
michael@0 | 578 | strcpy(inputfilename, options[5].value); |
michael@0 | 579 | appendDirSeparator(inputfilename); |
michael@0 | 580 | } |
michael@0 | 581 | strcat(inputfilename, options[7].value); |
michael@0 | 582 | FILE *input = fopen(inputfilename, "r"); |
michael@0 | 583 | if (input == NULL) { |
michael@0 | 584 | fprintf(stdout, "Cannot open file:%s\n", filename); |
michael@0 | 585 | return; |
michael@0 | 586 | } |
michael@0 | 587 | |
michael@0 | 588 | char s[1024]; |
michael@0 | 589 | UChar rule[1024]; |
michael@0 | 590 | UChar *prule = rule; |
michael@0 | 591 | int size = 1024; |
michael@0 | 592 | // synwee TODO: make this part dynamic |
michael@0 | 593 | while (fscanf(input, "%[^\n]s", s) != EOF) { |
michael@0 | 594 | size -= u_unescape(s, prule, size); |
michael@0 | 595 | prule = prule + u_strlen(prule); |
michael@0 | 596 | } |
michael@0 | 597 | fclose(input); |
michael@0 | 598 | |
michael@0 | 599 | if (options[4].doesOccur) { |
michael@0 | 600 | strcat(filename, "Rules.txt"); |
michael@0 | 601 | OUTPUT_ = fopen(filename, "w"); |
michael@0 | 602 | if (OUTPUT_ == NULL) { |
michael@0 | 603 | fprintf(stdout, "Cannot open file:%s\n", filename); |
michael@0 | 604 | return; |
michael@0 | 605 | } |
michael@0 | 606 | } |
michael@0 | 607 | |
michael@0 | 608 | fprintf(stdout, "Rules\n"); |
michael@0 | 609 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 610 | UParseError parseError; |
michael@0 | 611 | COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, |
michael@0 | 612 | UCOL_DEFAULT_STRENGTH, &parseError, &error); |
michael@0 | 613 | if (U_FAILURE(error)) { |
michael@0 | 614 | fprintf(stdout, "Collator creation failed:"); |
michael@0 | 615 | fprintf(stdout, u_errorName(error)); |
michael@0 | 616 | goto CLOSERULES; |
michael@0 | 617 | return; |
michael@0 | 618 | } |
michael@0 | 619 | setAttributes(COLLATOR_, &error); |
michael@0 | 620 | if (U_FAILURE(error)) { |
michael@0 | 621 | fprintf(stdout, "Collator attribute setting failed:"); |
michael@0 | 622 | fprintf(stdout, u_errorName(error)); |
michael@0 | 623 | goto CLOSERULES; |
michael@0 | 624 | return; |
michael@0 | 625 | } |
michael@0 | 626 | |
michael@0 | 627 | serialize("Rule-based", TRUE); |
michael@0 | 628 | ucol_close(COLLATOR_); |
michael@0 | 629 | |
michael@0 | 630 | CLOSERULES : |
michael@0 | 631 | if (options[4].doesOccur) { |
michael@0 | 632 | filename[dirlength] = 0; |
michael@0 | 633 | fclose(OUTPUT_); |
michael@0 | 634 | } |
michael@0 | 635 | } |
michael@0 | 636 | } |
michael@0 | 637 | |
michael@0 | 638 | /** |
michael@0 | 639 | * Parse for enum values. |
michael@0 | 640 | * Note this only works for positive enum values. |
michael@0 | 641 | * @param enumarray array containing names of the enum values in string and |
michael@0 | 642 | * their corresponding value. |
michael@0 | 643 | * declared enum value. |
michael@0 | 644 | * @param str string to be parsed |
michael@0 | 645 | * @return corresponding integer enum value or -1 if value is not found. |
michael@0 | 646 | */ |
michael@0 | 647 | int parseEnums(const EnumNameValuePair enumarray[], const char *str) |
michael@0 | 648 | { |
michael@0 | 649 | const char *enumname = enumarray[0].name; |
michael@0 | 650 | int result = atoi(str); |
michael@0 | 651 | if (result == 0 && str[0] != '0') { |
michael@0 | 652 | while (strcmp(enumname, str) != 0) { |
michael@0 | 653 | // checking for multiple enum names sharing the same values |
michael@0 | 654 | enumname = strstr(enumname, str); |
michael@0 | 655 | if (enumname != NULL) { |
michael@0 | 656 | int size = strchr(enumname, '|') - enumname; |
michael@0 | 657 | if (size < 0) { |
michael@0 | 658 | size = strlen(enumname); |
michael@0 | 659 | } |
michael@0 | 660 | if (size == (int)strlen(str)) { |
michael@0 | 661 | return enumarray[result].value; |
michael@0 | 662 | } |
michael@0 | 663 | } |
michael@0 | 664 | result ++; |
michael@0 | 665 | if (&(enumarray[result]) == NULL) { |
michael@0 | 666 | return -1; |
michael@0 | 667 | } |
michael@0 | 668 | enumname = enumarray[result].name; |
michael@0 | 669 | } |
michael@0 | 670 | } |
michael@0 | 671 | return -1; |
michael@0 | 672 | } |
michael@0 | 673 | |
michael@0 | 674 | /** |
michael@0 | 675 | * Parser for attribute name value pair |
michael@0 | 676 | */ |
michael@0 | 677 | void parseAttributes() { |
michael@0 | 678 | char str[32]; |
michael@0 | 679 | const char *pname = options[6].value; |
michael@0 | 680 | const char *pend = options[6].value + strlen(options[6].value); |
michael@0 | 681 | const char *pvalue; |
michael@0 | 682 | |
michael@0 | 683 | while (pname < pend) { |
michael@0 | 684 | pvalue = strchr(pname, '='); |
michael@0 | 685 | if (pvalue == NULL) { |
michael@0 | 686 | fprintf(stdout, |
michael@0 | 687 | "No matching value found for attribute argument %s\n", |
michael@0 | 688 | pname); |
michael@0 | 689 | return; |
michael@0 | 690 | } |
michael@0 | 691 | int count = pvalue - pname; |
michael@0 | 692 | strncpy(str, pname, count); |
michael@0 | 693 | str[count] = 0; |
michael@0 | 694 | |
michael@0 | 695 | int name = parseEnums(ATTRIBUTE_NAME_, str); |
michael@0 | 696 | if (name == -1) { |
michael@0 | 697 | fprintf(stdout, "Attribute name not found: %s\n", str); |
michael@0 | 698 | return; |
michael@0 | 699 | } |
michael@0 | 700 | |
michael@0 | 701 | pvalue ++; |
michael@0 | 702 | // getting corresponding enum value |
michael@0 | 703 | pname = strchr(pvalue, ','); |
michael@0 | 704 | if (pname == NULL) { |
michael@0 | 705 | pname = pend; |
michael@0 | 706 | } |
michael@0 | 707 | count = pname - pvalue; |
michael@0 | 708 | strncpy(str, pvalue, count); |
michael@0 | 709 | str[count] = 0; |
michael@0 | 710 | int value = parseEnums(ATTRIBUTE_VALUE_, str); |
michael@0 | 711 | if (value == -1) { |
michael@0 | 712 | fprintf(stdout, "Attribute value not found: %s\n", str); |
michael@0 | 713 | return; |
michael@0 | 714 | } |
michael@0 | 715 | ATTRIBUTE_[name] = (UColAttributeValue)value; |
michael@0 | 716 | pname ++; |
michael@0 | 717 | } |
michael@0 | 718 | } |
michael@0 | 719 | |
michael@0 | 720 | /** |
michael@0 | 721 | * Checks if the locale argument is a base language |
michael@0 | 722 | * @param locale to be checked |
michael@0 | 723 | * @return TRUE if it is a base language |
michael@0 | 724 | */ |
michael@0 | 725 | inline UBool checkLocaleForLanguage(const char *locale) |
michael@0 | 726 | { |
michael@0 | 727 | return strlen(locale) <= 2; |
michael@0 | 728 | } |
michael@0 | 729 | |
michael@0 | 730 | /** |
michael@0 | 731 | * Converts a UChar array into its string form "xxxx xxxx" |
michael@0 | 732 | * @param ch array of UChar characters |
michael@0 | 733 | * @param count number of UChar characters |
michael@0 | 734 | */ |
michael@0 | 735 | void outputUChar(UChar ch[], int count) |
michael@0 | 736 | { |
michael@0 | 737 | for (int i = 0; i < count; i ++) { |
michael@0 | 738 | fprintf(OUTPUT_, "%04X ", ch[i]); |
michael@0 | 739 | } |
michael@0 | 740 | } |
michael@0 | 741 | |
michael@0 | 742 | /** |
michael@0 | 743 | * If it is a primary difference returns -1 or 1. |
michael@0 | 744 | * If it is a secondary difference returns -2 or 2. |
michael@0 | 745 | * If it is a tertiary difference returns -3 or 3. |
michael@0 | 746 | * If equals returns 0. |
michael@0 | 747 | */ |
michael@0 | 748 | int compareSortKey(const void *elem1, const void *elem2) |
michael@0 | 749 | { |
michael@0 | 750 | // compare the 2 script element sort key |
michael@0 | 751 | UChar *ch1 = ((ScriptElement *)elem1)->ch; |
michael@0 | 752 | UChar *ch2 = ((ScriptElement *)elem2)->ch; |
michael@0 | 753 | int size1 = ((ScriptElement *)elem1)->count; |
michael@0 | 754 | int size2 = ((ScriptElement *)elem2)->count; |
michael@0 | 755 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 756 | |
michael@0 | 757 | ucol_setStrength(COLLATOR_, UCOL_PRIMARY); |
michael@0 | 758 | int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); |
michael@0 | 759 | if (result == 0) { |
michael@0 | 760 | ucol_setStrength(COLLATOR_, UCOL_SECONDARY); |
michael@0 | 761 | result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); |
michael@0 | 762 | if (result == 0) { |
michael@0 | 763 | ucol_setStrength(COLLATOR_, UCOL_TERTIARY); |
michael@0 | 764 | result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); |
michael@0 | 765 | if (result < 0) { |
michael@0 | 766 | return -3; |
michael@0 | 767 | } |
michael@0 | 768 | if (result > 0) { |
michael@0 | 769 | return 3; |
michael@0 | 770 | } |
michael@0 | 771 | } |
michael@0 | 772 | if (result < 0) { |
michael@0 | 773 | return -2; |
michael@0 | 774 | } |
michael@0 | 775 | if (result > 0) { |
michael@0 | 776 | return 2; |
michael@0 | 777 | } |
michael@0 | 778 | } |
michael@0 | 779 | return result; |
michael@0 | 780 | } |
michael@0 | 781 | |
michael@0 | 782 | /** |
michael@0 | 783 | * Output serialized script elements |
michael@0 | 784 | * @param element the element to output |
michael@0 | 785 | * @param compare the comparison with the previous element |
michael@0 | 786 | * @param expansion flags TRUE if element has an expansion |
michael@0 | 787 | */ |
michael@0 | 788 | void outputScriptElem(ScriptElement &element, int compare, UBool expansion) |
michael@0 | 789 | { |
michael@0 | 790 | switch (compare) { |
michael@0 | 791 | case 0: |
michael@0 | 792 | if (expansion) { |
michael@0 | 793 | fprintf(OUTPUT_, "<tr><td class='eq' title='["); |
michael@0 | 794 | } |
michael@0 | 795 | else { |
michael@0 | 796 | fprintf(OUTPUT_, "<tr><td class='q' title='["); |
michael@0 | 797 | } |
michael@0 | 798 | break; |
michael@0 | 799 | case -1: |
michael@0 | 800 | if (expansion) { |
michael@0 | 801 | fprintf(OUTPUT_, "<tr><td class='ep' title='["); |
michael@0 | 802 | } |
michael@0 | 803 | else { |
michael@0 | 804 | fprintf(OUTPUT_, "<tr><td class='p' title='["); |
michael@0 | 805 | } |
michael@0 | 806 | break; |
michael@0 | 807 | case -2: |
michael@0 | 808 | if (expansion) { |
michael@0 | 809 | fprintf(OUTPUT_, "<tr><td class='es' title='["); |
michael@0 | 810 | } |
michael@0 | 811 | else { |
michael@0 | 812 | fprintf(OUTPUT_, "<tr><td class='s' title='["); |
michael@0 | 813 | } |
michael@0 | 814 | break; |
michael@0 | 815 | default: |
michael@0 | 816 | if (expansion) { |
michael@0 | 817 | fprintf(OUTPUT_, "<tr><td class='et' title='["); |
michael@0 | 818 | } |
michael@0 | 819 | else { |
michael@0 | 820 | fprintf(OUTPUT_, "<tr><td class='t' title='["); |
michael@0 | 821 | } |
michael@0 | 822 | } |
michael@0 | 823 | |
michael@0 | 824 | uint8_t sortkey[32]; |
michael@0 | 825 | ucol_setStrength(COLLATOR_, UCOL_TERTIARY); |
michael@0 | 826 | ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32); |
michael@0 | 827 | int i = 0; |
michael@0 | 828 | while (sortkey[i] != 0) { |
michael@0 | 829 | if (sortkey[i] == 1) { |
michael@0 | 830 | fprintf(OUTPUT_, " | "); |
michael@0 | 831 | } |
michael@0 | 832 | else { |
michael@0 | 833 | fprintf(OUTPUT_, "%02x", sortkey[i]); |
michael@0 | 834 | } |
michael@0 | 835 | |
michael@0 | 836 | i ++; |
michael@0 | 837 | } |
michael@0 | 838 | |
michael@0 | 839 | fprintf(OUTPUT_, "]'>"); |
michael@0 | 840 | |
michael@0 | 841 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 842 | char utf8[64]; |
michael@0 | 843 | UChar nfc[32]; |
michael@0 | 844 | int32_t length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc, |
michael@0 | 845 | 32, &error); |
michael@0 | 846 | if (U_FAILURE(error)) { |
michael@0 | 847 | fprintf(stdout, "Error normalizing contractions to NFC\n"); |
michael@0 | 848 | } |
michael@0 | 849 | u_strToUTF8(utf8, 64, &length, nfc, length, &error); |
michael@0 | 850 | if (U_FAILURE(error)) { |
michael@0 | 851 | fprintf(stdout, "Error converting UChar to utf8\n"); |
michael@0 | 852 | return; |
michael@0 | 853 | } |
michael@0 | 854 | |
michael@0 | 855 | fprintf(OUTPUT_, "%s<br>", utf8); |
michael@0 | 856 | fprintf(OUTPUT_, "<tt>"); |
michael@0 | 857 | outputUChar(element.ch, element.count); |
michael@0 | 858 | |
michael@0 | 859 | if (compare == 0) { |
michael@0 | 860 | fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td> </td><td>Q</td><td>"); |
michael@0 | 861 | } |
michael@0 | 862 | else if (compare == -1) { |
michael@0 | 863 | fprintf(OUTPUT_, "</tt></td><td>P</td><td> </td><td> </td><td> </td><td>"); |
michael@0 | 864 | } |
michael@0 | 865 | else if (compare == -2) { |
michael@0 | 866 | fprintf(OUTPUT_, "</tt></td><td> </td><td>S</td><td> </td><td> </td><td>"); |
michael@0 | 867 | } |
michael@0 | 868 | else if (compare == -3) { |
michael@0 | 869 | fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td>T</td><td> </td><td>"); |
michael@0 | 870 | } |
michael@0 | 871 | |
michael@0 | 872 | i = 0; |
michael@0 | 873 | while (i < element.count) { |
michael@0 | 874 | char str[128]; |
michael@0 | 875 | UChar32 codepoint; |
michael@0 | 876 | U16_NEXT(element.ch, i, element.count, codepoint); |
michael@0 | 877 | int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128, |
michael@0 | 878 | &error); |
michael@0 | 879 | if (U_FAILURE(error)) { |
michael@0 | 880 | fprintf(stdout, "Error getting character name\n"); |
michael@0 | 881 | return; |
michael@0 | 882 | } |
michael@0 | 883 | if (element.tailored) { |
michael@0 | 884 | fprintf(OUTPUT_, "<b>"); |
michael@0 | 885 | } |
michael@0 | 886 | fprintf(OUTPUT_, "%s", str); |
michael@0 | 887 | if (element.tailored) { |
michael@0 | 888 | fprintf(OUTPUT_, " *</b>"); |
michael@0 | 889 | } |
michael@0 | 890 | if (i < element.count) { |
michael@0 | 891 | fprintf(OUTPUT_, "<br>\n"); |
michael@0 | 892 | } |
michael@0 | 893 | } |
michael@0 | 894 | |
michael@0 | 895 | fprintf(OUTPUT_, "</td></tr>\n"); |
michael@0 | 896 | } |
michael@0 | 897 | |
michael@0 | 898 | /** |
michael@0 | 899 | * Checks if codepoint belongs to scripts |
michael@0 | 900 | * @param script list |
michael@0 | 901 | * @param scriptcount number of scripts |
michael@0 | 902 | * @param codepoint to test |
michael@0 | 903 | * @return TRUE if codepoint belongs to scripts |
michael@0 | 904 | */ |
michael@0 | 905 | UBool checkInScripts(UScriptCode script[], int scriptcount, |
michael@0 | 906 | UChar32 codepoint) |
michael@0 | 907 | { |
michael@0 | 908 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 909 | for (int i = 0; i < scriptcount; i ++) { |
michael@0 | 910 | if (script[i] == USCRIPT_HAN && options[10].doesOccur) { |
michael@0 | 911 | if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) || |
michael@0 | 912 | (codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) { |
michael@0 | 913 | // reduce han |
michael@0 | 914 | return TRUE; |
michael@0 | 915 | } |
michael@0 | 916 | } |
michael@0 | 917 | else if (uscript_getScript(codepoint, &error) == script[i]) { |
michael@0 | 918 | return TRUE; |
michael@0 | 919 | } |
michael@0 | 920 | if (U_FAILURE(error)) { |
michael@0 | 921 | fprintf(stdout, "Error checking character in scripts\n"); |
michael@0 | 922 | return FALSE; |
michael@0 | 923 | } |
michael@0 | 924 | } |
michael@0 | 925 | return FALSE; |
michael@0 | 926 | } |
michael@0 | 927 | |
michael@0 | 928 | /** |
michael@0 | 929 | * Checks if the set of codepoints belongs to the script |
michael@0 | 930 | * @param script list |
michael@0 | 931 | * @param scriptcount number of scripts |
michael@0 | 932 | * @param scriptelem |
michael@0 | 933 | * @return TRUE if all codepoints belongs to the script |
michael@0 | 934 | */ |
michael@0 | 935 | inline UBool checkInScripts(UScriptCode script[], int scriptcount, |
michael@0 | 936 | ScriptElement scriptelem) |
michael@0 | 937 | { |
michael@0 | 938 | int i = 0; |
michael@0 | 939 | while (i < scriptelem.count) { |
michael@0 | 940 | UChar32 codepoint; |
michael@0 | 941 | U16_NEXT(scriptelem.ch, i, scriptelem.count, codepoint); |
michael@0 | 942 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 943 | if (checkInScripts(script, scriptcount, codepoint)) { |
michael@0 | 944 | return TRUE; |
michael@0 | 945 | } |
michael@0 | 946 | } |
michael@0 | 947 | return FALSE; |
michael@0 | 948 | } |
michael@0 | 949 | |
michael@0 | 950 | /** |
michael@0 | 951 | * Gets the script elements and contractions belonging to the script |
michael@0 | 952 | * @param elems output list |
michael@0 | 953 | * @param locale locale |
michael@0 | 954 | * @return number of script elements |
michael@0 | 955 | * Add by Richard |
michael@0 | 956 | */ |
michael@0 | 957 | int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) { |
michael@0 | 958 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 959 | UChar32 codepoint = 0; |
michael@0 | 960 | |
michael@0 | 961 | UResourceBundle* ures = ures_open(NULL, locale, &error); |
michael@0 | 962 | if (U_FAILURE(error)) { |
michael@0 | 963 | fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale); |
michael@0 | 964 | return -1; |
michael@0 | 965 | } |
michael@0 | 966 | int32_t length; |
michael@0 | 967 | const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error); |
michael@0 | 968 | |
michael@0 | 969 | if (U_FAILURE(error)) { |
michael@0 | 970 | fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n"); |
michael@0 | 971 | return -1; |
michael@0 | 972 | } |
michael@0 | 973 | |
michael@0 | 974 | UChar* upperChars = new UChar[length * 2]; |
michael@0 | 975 | if (upperChars == 0) { |
michael@0 | 976 | fprintf(stdout, "Memory error\n"); |
michael@0 | 977 | return -1; |
michael@0 | 978 | } |
michael@0 | 979 | |
michael@0 | 980 | int32_t destLength = u_strToUpper(upperChars, length * 2, exemplarChars, -1, locale, &error); |
michael@0 | 981 | if (U_FAILURE(error)) { |
michael@0 | 982 | fprintf(stdout, "Error when u_strToUpper() \n"); |
michael@0 | 983 | return -1; |
michael@0 | 984 | } |
michael@0 | 985 | |
michael@0 | 986 | UChar* pattern = new UChar[length + destLength + 10]; |
michael@0 | 987 | UChar left[2] = {0x005b, 0x0}; |
michael@0 | 988 | UChar right[2] = {0x005d, 0x0}; |
michael@0 | 989 | pattern = u_strcpy(pattern, left); |
michael@0 | 990 | pattern = u_strcat(pattern, exemplarChars); |
michael@0 | 991 | pattern = u_strcat(pattern, upperChars); |
michael@0 | 992 | pattern = u_strcat(pattern, right); |
michael@0 | 993 | |
michael@0 | 994 | UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error); |
michael@0 | 995 | if (U_FAILURE(error)) { |
michael@0 | 996 | fprintf(stdout, "Can not open USet \n"); |
michael@0 | 997 | return -1; |
michael@0 | 998 | } |
michael@0 | 999 | |
michael@0 | 1000 | UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset); |
michael@0 | 1001 | |
michael@0 | 1002 | int32_t count = 0; |
michael@0 | 1003 | |
michael@0 | 1004 | while (usetiter -> next()) { |
michael@0 | 1005 | if (usetiter -> isString()) { |
michael@0 | 1006 | UnicodeString strItem = usetiter -> getString(); |
michael@0 | 1007 | |
michael@0 | 1008 | scriptelem[count].count = 0; |
michael@0 | 1009 | for (int i = 0; i < strItem.length(); i++) { |
michael@0 | 1010 | codepoint = strItem.char32At(i); |
michael@0 | 1011 | UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint); |
michael@0 | 1012 | scriptelem[count].tailored = FALSE; |
michael@0 | 1013 | } |
michael@0 | 1014 | } else { |
michael@0 | 1015 | codepoint = usetiter -> getCodepoint(); |
michael@0 | 1016 | scriptelem[count].count = 0; |
michael@0 | 1017 | UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint); |
michael@0 | 1018 | scriptelem[count].tailored = FALSE; |
michael@0 | 1019 | } |
michael@0 | 1020 | |
michael@0 | 1021 | count++; |
michael@0 | 1022 | } |
michael@0 | 1023 | delete []pattern; |
michael@0 | 1024 | |
michael@0 | 1025 | return count; |
michael@0 | 1026 | } |
michael@0 | 1027 | |
michael@0 | 1028 | /** |
michael@0 | 1029 | * Gets the script elements and contractions belonging to the script |
michael@0 | 1030 | * @param script list |
michael@0 | 1031 | * @param scriptcount number of scripts |
michael@0 | 1032 | * @param elems output list |
michael@0 | 1033 | * @return number of script elements |
michael@0 | 1034 | */ |
michael@0 | 1035 | int getScriptElements(UScriptCode script[], int scriptcount, |
michael@0 | 1036 | ScriptElement scriptelem[]) |
michael@0 | 1037 | { |
michael@0 | 1038 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1039 | UChar32 codepoint = 0; |
michael@0 | 1040 | int count = 0; |
michael@0 | 1041 | while (codepoint <= UCHAR_MAX_VALUE) { |
michael@0 | 1042 | if (checkInScripts(script, scriptcount, codepoint)) { |
michael@0 | 1043 | scriptelem[count].count = 0; |
michael@0 | 1044 | UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, |
michael@0 | 1045 | scriptelem[count].count, codepoint); |
michael@0 | 1046 | scriptelem[count].tailored = FALSE; |
michael@0 | 1047 | count ++; |
michael@0 | 1048 | } |
michael@0 | 1049 | if (U_FAILURE(error)) { |
michael@0 | 1050 | fprintf(stdout, "Error determining codepoint in script\n"); |
michael@0 | 1051 | return -1; |
michael@0 | 1052 | } |
michael@0 | 1053 | codepoint ++; |
michael@0 | 1054 | } |
michael@0 | 1055 | |
michael@0 | 1056 | const UChar *current = NULL; |
michael@0 | 1057 | uint32_t strength = 0; |
michael@0 | 1058 | uint32_t chOffset = 0; |
michael@0 | 1059 | uint32_t chLen = 0; |
michael@0 | 1060 | uint32_t exOffset = 0; |
michael@0 | 1061 | uint32_t exLen = 0; |
michael@0 | 1062 | uint32_t prefixOffset = 0; |
michael@0 | 1063 | uint32_t prefixLen = 0; |
michael@0 | 1064 | uint8_t specs = 0; |
michael@0 | 1065 | UBool rstart = TRUE; |
michael@0 | 1066 | UColTokenParser src; |
michael@0 | 1067 | UColOptionSet opts; |
michael@0 | 1068 | UParseError parseError; |
michael@0 | 1069 | |
michael@0 | 1070 | int32_t rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0); |
michael@0 | 1071 | src.source = (UChar *)malloc(sizeof(UChar) * |
michael@0 | 1072 | (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); |
michael@0 | 1073 | rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source, |
michael@0 | 1074 | rulelength); |
michael@0 | 1075 | src.current = src.source; |
michael@0 | 1076 | src.end = src.source + rulelength; |
michael@0 | 1077 | src.extraCurrent = src.end; |
michael@0 | 1078 | src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
michael@0 | 1079 | src.opts = &opts; |
michael@0 | 1080 | |
michael@0 | 1081 | /* |
michael@0 | 1082 | ucol_tok_parseNextToken(&src, &strength, &chOffset, |
michael@0 | 1083 | &chLen, &exOffset, &exLen, |
michael@0 | 1084 | &prefixOffset, &prefixLen, |
michael@0 | 1085 | &specs, rstart, &parseError, |
michael@0 | 1086 | &error) |
michael@0 | 1087 | */ |
michael@0 | 1088 | while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, |
michael@0 | 1089 | &error)) != NULL) { |
michael@0 | 1090 | // contractions handled here |
michael@0 | 1091 | if (chLen > 1) { |
michael@0 | 1092 | u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen); |
michael@0 | 1093 | scriptelem[count].count = chLen; |
michael@0 | 1094 | if (checkInScripts(script, scriptcount, scriptelem[count])) { |
michael@0 | 1095 | scriptelem[count].tailored = FALSE; |
michael@0 | 1096 | count ++; |
michael@0 | 1097 | } |
michael@0 | 1098 | } |
michael@0 | 1099 | rstart = FALSE; |
michael@0 | 1100 | } |
michael@0 | 1101 | if (U_FAILURE(error)) { |
michael@0 | 1102 | fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error)); |
michael@0 | 1103 | } |
michael@0 | 1104 | // rule might have been reallocated, so delete this instead |
michael@0 | 1105 | free(src.source); |
michael@0 | 1106 | return count; |
michael@0 | 1107 | } |
michael@0 | 1108 | |
michael@0 | 1109 | int compareCodepoints(const void *elem1, const void *elem2) |
michael@0 | 1110 | { |
michael@0 | 1111 | UChar *ch1 = ((ScriptElement *)elem1)->ch; // key |
michael@0 | 1112 | UChar *ch2 = ((ScriptElement *)elem2)->ch; |
michael@0 | 1113 | ch1[((ScriptElement *)elem1)->count] = 0; |
michael@0 | 1114 | ch2[((ScriptElement *)elem2)->count] = 0; |
michael@0 | 1115 | |
michael@0 | 1116 | // compare the 2 codepoints |
michael@0 | 1117 | return u_strcmp(ch1, ch2); |
michael@0 | 1118 | } |
michael@0 | 1119 | |
michael@0 | 1120 | UBool hasSubNFD(ScriptElement &se, ScriptElement &key) |
michael@0 | 1121 | { |
michael@0 | 1122 | UChar *ch1 = se.ch; |
michael@0 | 1123 | UChar *ch2 = key.ch; // key |
michael@0 | 1124 | ch1[se.count] = 0; |
michael@0 | 1125 | ch2[key.count] = 0; |
michael@0 | 1126 | |
michael@0 | 1127 | // compare the 2 codepoints |
michael@0 | 1128 | if (u_strstr(ch1, ch2) != NULL) { |
michael@0 | 1129 | return TRUE; |
michael@0 | 1130 | } |
michael@0 | 1131 | |
michael@0 | 1132 | // check the decomposition |
michael@0 | 1133 | UChar norm[32]; |
michael@0 | 1134 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1135 | int size = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32, |
michael@0 | 1136 | &error); |
michael@0 | 1137 | if (U_FAILURE(error)) { |
michael@0 | 1138 | fprintf(stdout, "Error normalizing\n"); |
michael@0 | 1139 | } |
michael@0 | 1140 | if (u_strstr(norm, ch2) != NULL) { |
michael@0 | 1141 | return TRUE; |
michael@0 | 1142 | } |
michael@0 | 1143 | return FALSE; |
michael@0 | 1144 | } |
michael@0 | 1145 | |
michael@0 | 1146 | /** |
michael@0 | 1147 | * Marks tailored elements |
michael@0 | 1148 | * @param script list |
michael@0 | 1149 | * @param scriptcount number of scripts |
michael@0 | 1150 | * @param scriptelem script element list |
michael@0 | 1151 | * @param scriptelemlength size of the script element list |
michael@0 | 1152 | */ |
michael@0 | 1153 | void markTailored(UScriptCode script[], int scriptcount, |
michael@0 | 1154 | ScriptElement scriptelem[], int scriptelemlength) |
michael@0 | 1155 | { |
michael@0 | 1156 | int32_t rulelength; |
michael@0 | 1157 | const UChar *rule = ucol_getRules(COLLATOR_, &rulelength); |
michael@0 | 1158 | |
michael@0 | 1159 | const UChar *current = NULL; |
michael@0 | 1160 | uint32_t strength = 0; |
michael@0 | 1161 | uint32_t chOffset = 0; |
michael@0 | 1162 | uint32_t chLen = 0; |
michael@0 | 1163 | uint32_t exOffset = 0; |
michael@0 | 1164 | uint32_t exLen = 0; |
michael@0 | 1165 | uint32_t prefixOffset = 0; |
michael@0 | 1166 | uint32_t prefixLen = 0; |
michael@0 | 1167 | uint8_t specs = 0; |
michael@0 | 1168 | UBool rstart = TRUE; |
michael@0 | 1169 | UColTokenParser src; |
michael@0 | 1170 | UColOptionSet opts; |
michael@0 | 1171 | UParseError parseError; |
michael@0 | 1172 | |
michael@0 | 1173 | src.opts = &opts; |
michael@0 | 1174 | src.source = (UChar *)malloc( |
michael@0 | 1175 | (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); |
michael@0 | 1176 | memcpy(src.source, rule, rulelength * sizeof(UChar)); |
michael@0 | 1177 | src.current = src.source; |
michael@0 | 1178 | src.end = (UChar *)src.source + rulelength; |
michael@0 | 1179 | src.extraCurrent = src.end; |
michael@0 | 1180 | src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
michael@0 | 1181 | |
michael@0 | 1182 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1183 | |
michael@0 | 1184 | while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, |
michael@0 | 1185 | &error)) != NULL) { |
michael@0 | 1186 | if (chLen >= 1 && strength != UCOL_TOK_RESET) { |
michael@0 | 1187 | // skipping the reset characters and non useful stuff. |
michael@0 | 1188 | ScriptElement se; |
michael@0 | 1189 | u_strncpy(se.ch, src.source + chOffset, chLen); |
michael@0 | 1190 | se.count = chLen; |
michael@0 | 1191 | |
michael@0 | 1192 | if (checkInScripts(script, scriptcount, se)) { |
michael@0 | 1193 | /* |
michael@0 | 1194 | ScriptElement *tse = (ScriptElement *)bsearch(&se, scriptelem, |
michael@0 | 1195 | scriptelemlength, |
michael@0 | 1196 | sizeof(ScriptElement), |
michael@0 | 1197 | compareCodepoints); |
michael@0 | 1198 | */ |
michael@0 | 1199 | for (int i = 0; i < scriptelemlength; i ++) { |
michael@0 | 1200 | if (!scriptelem[i].tailored && |
michael@0 | 1201 | hasSubNFD(scriptelem[i], se)) { |
michael@0 | 1202 | scriptelem[i].tailored = TRUE; |
michael@0 | 1203 | } |
michael@0 | 1204 | } |
michael@0 | 1205 | } |
michael@0 | 1206 | } |
michael@0 | 1207 | rstart = FALSE; |
michael@0 | 1208 | } |
michael@0 | 1209 | free(src.source); |
michael@0 | 1210 | if (U_FAILURE(error)) { |
michael@0 | 1211 | fprintf(stdout, "Error parsing rules\n"); |
michael@0 | 1212 | } |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | /** |
michael@0 | 1216 | * Checks if the collation iterator has more than 1 collation element |
michael@0 | 1217 | * @parem coleiter collation element iterator |
michael@0 | 1218 | * @return TRUE if collation iterator has more than 1 collation element |
michael@0 | 1219 | */ |
michael@0 | 1220 | UBool hasExpansions(UCollationElements *coleiter) |
michael@0 | 1221 | { |
michael@0 | 1222 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1223 | int32_t ce = ucol_next(coleiter, &error); |
michael@0 | 1224 | int count = 0; |
michael@0 | 1225 | |
michael@0 | 1226 | if (U_FAILURE(error)) { |
michael@0 | 1227 | fprintf(stdout, "Error getting next collation element\n"); |
michael@0 | 1228 | } |
michael@0 | 1229 | while (ce != UCOL_NULLORDER) { |
michael@0 | 1230 | if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) { |
michael@0 | 1231 | count ++; |
michael@0 | 1232 | if (count == 2) { |
michael@0 | 1233 | return TRUE; |
michael@0 | 1234 | } |
michael@0 | 1235 | } |
michael@0 | 1236 | ce = ucol_next(coleiter, &error); |
michael@0 | 1237 | if (U_FAILURE(error)) { |
michael@0 | 1238 | fprintf(stdout, "Error getting next collation element\n"); |
michael@0 | 1239 | } |
michael@0 | 1240 | } |
michael@0 | 1241 | return FALSE; |
michael@0 | 1242 | } |
michael@0 | 1243 | |
michael@0 | 1244 | /** |
michael@0 | 1245 | * Prints the footer for index.html |
michael@0 | 1246 | * @param file output file |
michael@0 | 1247 | */ |
michael@0 | 1248 | void outputHTMLFooter() |
michael@0 | 1249 | { |
michael@0 | 1250 | fprintf(OUTPUT_, "</table>\n"); |
michael@0 | 1251 | fprintf(OUTPUT_, "</body>\n"); |
michael@0 | 1252 | fprintf(OUTPUT_, "</html>\n"); |
michael@0 | 1253 | } |
michael@0 | 1254 | |
michael@0 | 1255 | /** |
michael@0 | 1256 | * Serialize the codepoints from start to end into an html file. |
michael@0 | 1257 | * Arranging them into ascending collation order. |
michael@0 | 1258 | * @param script code list |
michael@0 | 1259 | * @param scriptcount number of scripts |
michael@0 | 1260 | */ |
michael@0 | 1261 | //void serializeScripts(UScriptCode script[], int scriptcount) |
michael@0 | 1262 | //Richard |
michael@0 | 1263 | void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL) |
michael@0 | 1264 | { |
michael@0 | 1265 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1266 | |
michael@0 | 1267 | ScriptElement *scriptelem = |
michael@0 | 1268 | (ScriptElement *)malloc(sizeof(ScriptElement) * 0x20000); |
michael@0 | 1269 | if (scriptelem == NULL) { |
michael@0 | 1270 | fprintf(stdout, "Memory error\n"); |
michael@0 | 1271 | return; |
michael@0 | 1272 | } |
michael@0 | 1273 | int count = 0; |
michael@0 | 1274 | if(locale) { |
michael@0 | 1275 | count = getScriptElementsFromExemplars(scriptelem, locale); |
michael@0 | 1276 | } else { |
michael@0 | 1277 | count = getScriptElements(script, scriptcount, scriptelem); |
michael@0 | 1278 | } |
michael@0 | 1279 | |
michael@0 | 1280 | // Sort script elements using Quicksort algorithm: |
michael@0 | 1281 | qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints); |
michael@0 | 1282 | markTailored(script, scriptcount, scriptelem, count); |
michael@0 | 1283 | // Sort script elements using Quicksort algorithm: |
michael@0 | 1284 | qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey); |
michael@0 | 1285 | |
michael@0 | 1286 | UCollationElements* coleiter = ucol_openElements(COLLATOR_, |
michael@0 | 1287 | scriptelem[0].ch, |
michael@0 | 1288 | scriptelem[0].count, |
michael@0 | 1289 | &error); |
michael@0 | 1290 | if (U_FAILURE(error)) { |
michael@0 | 1291 | fprintf(stdout, "Error creating collation element iterator\n"); |
michael@0 | 1292 | return; |
michael@0 | 1293 | } |
michael@0 | 1294 | |
michael@0 | 1295 | outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter)); |
michael@0 | 1296 | for (int i = 0; i < count - 1; i ++) { |
michael@0 | 1297 | ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count, |
michael@0 | 1298 | &error); |
michael@0 | 1299 | if (U_FAILURE(error)) { |
michael@0 | 1300 | fprintf(stdout, "Error setting text in collation element iterator\n"); |
michael@0 | 1301 | return; |
michael@0 | 1302 | } |
michael@0 | 1303 | outputScriptElem(scriptelem[i + 1], |
michael@0 | 1304 | compareSortKey(scriptelem + i, scriptelem + i + 1), |
michael@0 | 1305 | hasExpansions(coleiter)); |
michael@0 | 1306 | } |
michael@0 | 1307 | free(scriptelem); |
michael@0 | 1308 | outputHTMLFooter(); |
michael@0 | 1309 | } |
michael@0 | 1310 | |
michael@0 | 1311 | /** |
michael@0 | 1312 | * Prints the header for the html |
michael@0 | 1313 | * @param locale name |
michael@0 | 1314 | * @param script |
michael@0 | 1315 | * @param scriptcount number of scripts |
michael@0 | 1316 | */ |
michael@0 | 1317 | void outputHTMLHeader(const char *locale, UScriptCode script[], |
michael@0 | 1318 | int scriptcount) |
michael@0 | 1319 | { |
michael@0 | 1320 | fprintf(OUTPUT_, "<html>\n"); |
michael@0 | 1321 | fprintf(OUTPUT_, "<head>\n"); |
michael@0 | 1322 | fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n"); |
michael@0 | 1323 | fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n"); |
michael@0 | 1324 | fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/css\">\n"); |
michael@0 | 1325 | fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n"); |
michael@0 | 1326 | fprintf(OUTPUT_, "<base target=\"main\">\n"); |
michael@0 | 1327 | fprintf(OUTPUT_, "</head>\n"); |
michael@0 | 1328 | |
michael@0 | 1329 | fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n"); |
michael@0 | 1330 | fprintf(OUTPUT_, "<!--\n"); |
michael@0 | 1331 | fprintf(OUTPUT_, "This file contains sorted characters in ascending order according to the locale stated\n"); |
michael@0 | 1332 | fprintf(OUTPUT_, "If the character is in red, it is tailored in the collation rules.\n"); |
michael@0 | 1333 | fprintf(OUTPUT_, "Background colours have certain meanings:\n"); |
michael@0 | 1334 | fprintf(OUTPUT_, "White - equals the previous character\n"); |
michael@0 | 1335 | fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n"); |
michael@0 | 1336 | fprintf(OUTPUT_, "blue - secondary greater than the previous character\n"); |
michael@0 | 1337 | fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\n"); |
michael@0 | 1338 | fprintf(OUTPUT_, "--!>\n"); |
michael@0 | 1339 | |
michael@0 | 1340 | fprintf(OUTPUT_, "<table border=0>\n"); |
michael@0 | 1341 | UChar displayname[64]; |
michael@0 | 1342 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1343 | int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error); |
michael@0 | 1344 | char utf8displayname[128]; |
michael@0 | 1345 | if (U_FAILURE(error)) { |
michael@0 | 1346 | utf8displayname[0] = 0; |
michael@0 | 1347 | } |
michael@0 | 1348 | else { |
michael@0 | 1349 | int32_t utf8size = 0; |
michael@0 | 1350 | u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error); |
michael@0 | 1351 | } |
michael@0 | 1352 | |
michael@0 | 1353 | fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", utf8displayname); |
michael@0 | 1354 | fprintf(OUTPUT_, "<tr><th>Script(s)</th>"); |
michael@0 | 1355 | fprintf(OUTPUT_, "<td class='noborder'>"); |
michael@0 | 1356 | for (int i = 0; i < scriptcount; i ++) { |
michael@0 | 1357 | fprintf(OUTPUT_, "%s", uscript_getName(script[i])); |
michael@0 | 1358 | if (i + 1 != scriptcount) { |
michael@0 | 1359 | fprintf(OUTPUT_, ", "); |
michael@0 | 1360 | } |
michael@0 | 1361 | } |
michael@0 | 1362 | fprintf(OUTPUT_, "</td></tr>\n"); |
michael@0 | 1363 | |
michael@0 | 1364 | fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/source/data/coll/%s.txt\">%s.txt</a></td></tr>\n", locale, locale); |
michael@0 | 1365 | |
michael@0 | 1366 | UVersionInfo version; |
michael@0 | 1367 | ucol_getVersion(COLLATOR_, version); |
michael@0 | 1368 | fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d.%d</td></tr>\n", |
michael@0 | 1369 | version[0], version[1], version[2], version[3]); |
michael@0 | 1370 | |
michael@0 | 1371 | UColAttribute attr = UCOL_FRENCH_COLLATION; |
michael@0 | 1372 | while (attr < UCOL_ATTRIBUTE_COUNT) { |
michael@0 | 1373 | UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error); |
michael@0 | 1374 | if (U_FAILURE(error)) { |
michael@0 | 1375 | fprintf(stdout, "Error getting attribute\n"); |
michael@0 | 1376 | return; |
michael@0 | 1377 | } |
michael@0 | 1378 | if (value != UCOL_DEFAULT) { |
michael@0 | 1379 | if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) { |
michael@0 | 1380 | fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='noborder'>on, code %d</td></tr>\n", value); |
michael@0 | 1381 | } |
michael@0 | 1382 | if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) { |
michael@0 | 1383 | fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='noborder'>shifted, code%d</td></tr>\n", value); |
michael@0 | 1384 | } |
michael@0 | 1385 | if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) { |
michael@0 | 1386 | fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on, code %d</td></tr>\n", value); |
michael@0 | 1387 | } |
michael@0 | 1388 | if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) { |
michael@0 | 1389 | fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on, code %d</td></tr>\n", value); |
michael@0 | 1390 | } |
michael@0 | 1391 | if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) { |
michael@0 | 1392 | fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'>on, code %d</td></tr>\n", value); |
michael@0 | 1393 | } |
michael@0 | 1394 | if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) { |
michael@0 | 1395 | fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value); |
michael@0 | 1396 | } |
michael@0 | 1397 | if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) { |
michael@0 | 1398 | fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='noborder'>on, code %d</td></tr>\n", value); |
michael@0 | 1399 | } |
michael@0 | 1400 | } |
michael@0 | 1401 | attr = (UColAttribute)(attr + 1); |
michael@0 | 1402 | } |
michael@0 | 1403 | |
michael@0 | 1404 | // Get UNIX-style time and display as number and string. |
michael@0 | 1405 | time_t ltime; |
michael@0 | 1406 | time( <ime ); |
michael@0 | 1407 | fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></tr>", ctime(<ime)); |
michael@0 | 1408 | |
michael@0 | 1409 | fprintf(OUTPUT_, "</table>\n"); |
michael@0 | 1410 | |
michael@0 | 1411 | fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n"); |
michael@0 | 1412 | fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>Submit a bug</a></p>\n"); |
michael@0 | 1413 | fprintf(OUTPUT_, "\n<table>\n"); |
michael@0 | 1414 | fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>Q</th><th>Name</th></tr>\n"); |
michael@0 | 1415 | } |
michael@0 | 1416 | |
/**
 * Writes the opening markup of the chart list page: the document head, the
 * page heading and the link to the UCA charts.
 * The centered paragraph opened here is left open on purpose; it is closed by
 * the "</p>" that outputListHTMLFooter writes.
 * @param file stream that receives the markup
 */
void outputListHTMLHeader(FILE *file)
{
    /* One entry per emitted fragment, in output order. */
    static const char *const markup[] = {
        "<html>\n",
        "<head>\n",
        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n",
        "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n",
        "<title>ICU Collation Charts</title>\n",
        "<base target=\"main\">\n",
        "</head>\n",
        "<body bgcolor=#FFFFFF>\n",
        "<h2 align=center>ICU Collation Charts</h2>\n",
        "<p align=center>\n",
        "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>"
    };
    const size_t fragments = sizeof(markup) / sizeof(markup[0]);
    size_t idx;

    for (idx = 0; idx < fragments; idx++) {
        fputs(markup[idx], file);
    }
}
michael@0 | 1435 | |
/**
 * Writes the closing markup of the chart list page: the end of the paragraph
 * followed by the end of the body and of the document.
 * @param file stream that receives the markup
 */
void outputListHTMLFooter(FILE *file)
{
    fputs("</p>\n"
          "</body>\n"
          "</html>\n",
          file);
}
michael@0 | 1447 | |
michael@0 | 1448 | /** |
michael@0 | 1449 | * Gets all scripts and serialize their codepoints into an html file. |
michael@0 | 1450 | */ |
michael@0 | 1451 | void serializeScripts() { |
michael@0 | 1452 | char filename[128]; |
michael@0 | 1453 | int dirlength = 0; |
michael@0 | 1454 | |
michael@0 | 1455 | if (options[4].doesOccur) { |
michael@0 | 1456 | strcpy(filename, options[4].value); |
michael@0 | 1457 | dirlength = appendDirSeparator(filename); |
michael@0 | 1458 | } else { |
michael@0 | 1459 | filename[0] = 0; |
michael@0 | 1460 | } |
michael@0 | 1461 | |
michael@0 | 1462 | const char *locale; |
michael@0 | 1463 | int32_t localelist = 0; |
michael@0 | 1464 | int32_t localesize; |
michael@0 | 1465 | |
michael@0 | 1466 | localesize = ucol_countAvailable(); |
michael@0 | 1467 | locale = ucol_getAvailable(localelist); |
michael@0 | 1468 | |
michael@0 | 1469 | strcat(filename, "list.html"); |
michael@0 | 1470 | FILE *list = fopen(filename, "w"); |
michael@0 | 1471 | filename[dirlength] = 0; |
michael@0 | 1472 | if (list == NULL) { |
michael@0 | 1473 | fprintf(stdout, "Cannot open file: %s\n", filename); |
michael@0 | 1474 | return; |
michael@0 | 1475 | } |
michael@0 | 1476 | |
michael@0 | 1477 | outputListHTMLHeader(list); |
michael@0 | 1478 | fprintf(list, "<blockquote>\n"); |
michael@0 | 1479 | while (TRUE) { |
michael@0 | 1480 | UErrorCode error = U_ZERO_ERROR; |
michael@0 | 1481 | COLLATOR_ = ucol_open(locale, &error); |
michael@0 | 1482 | if (U_FAILURE(error)) { |
michael@0 | 1483 | fprintf(stdout, "Collator creation failed:"); |
michael@0 | 1484 | fprintf(stdout, u_errorName(error)); |
michael@0 | 1485 | break; |
michael@0 | 1486 | } |
michael@0 | 1487 | if ((error != U_USING_FALLBACK_WARNING && // not tailored |
michael@0 | 1488 | error != U_USING_DEFAULT_WARNING) || |
michael@0 | 1489 | checkLocaleForLanguage(locale)) { |
michael@0 | 1490 | fprintf(list, "<a href=%s.html>%s</a> ", locale, locale); |
michael@0 | 1491 | setAttributes(COLLATOR_, &error); |
michael@0 | 1492 | if (U_FAILURE(error)) { |
michael@0 | 1493 | fprintf(stdout, "Collator attribute setting failed:"); |
michael@0 | 1494 | fprintf(stdout, u_errorName(error)); |
michael@0 | 1495 | break; |
michael@0 | 1496 | } |
michael@0 | 1497 | |
michael@0 | 1498 | UScriptCode scriptcode[32]; |
michael@0 | 1499 | uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32, |
michael@0 | 1500 | &error); |
michael@0 | 1501 | if (U_FAILURE(error)) { |
michael@0 | 1502 | fprintf(stdout, "Error getting lcale scripts\n"); |
michael@0 | 1503 | break; |
michael@0 | 1504 | } |
michael@0 | 1505 | |
michael@0 | 1506 | strcat(filename, locale); |
michael@0 | 1507 | strcat(filename, ".html"); |
michael@0 | 1508 | OUTPUT_ = fopen(filename, "w"); |
michael@0 | 1509 | if (OUTPUT_ == NULL) { |
michael@0 | 1510 | fprintf(stdout, "Cannot open file:%s\n", filename); |
michael@0 | 1511 | break; |
michael@0 | 1512 | } |
michael@0 | 1513 | outputHTMLHeader(locale, scriptcode, scriptcount); |
michael@0 | 1514 | fprintf(stdout, "%s\n", locale); |
michael@0 | 1515 | |
michael@0 | 1516 | if(options[12].doesOccur) { |
michael@0 | 1517 | // use whole scripts |
michael@0 | 1518 | serializeScripts(scriptcode, scriptcount); |
michael@0 | 1519 | } else { |
michael@0 | 1520 | // use exemplar chars |
michael@0 | 1521 | serializeScripts(scriptcode, scriptcount, locale); |
michael@0 | 1522 | } |
michael@0 | 1523 | fclose(OUTPUT_); |
michael@0 | 1524 | } |
michael@0 | 1525 | ucol_close(COLLATOR_); |
michael@0 | 1526 | |
michael@0 | 1527 | filename[dirlength] = 0; |
michael@0 | 1528 | localelist ++; |
michael@0 | 1529 | if (localelist == localesize) { |
michael@0 | 1530 | break; |
michael@0 | 1531 | } |
michael@0 | 1532 | locale = ucol_getAvailable(localelist); |
michael@0 | 1533 | } |
michael@0 | 1534 | fprintf(list, "<br><a href=help.html>help</a><br>"); |
michael@0 | 1535 | fprintf(list, "</blockquote>\n"); |
michael@0 | 1536 | outputListHTMLFooter(list); |
michael@0 | 1537 | fclose(list); |
michael@0 | 1538 | } |
michael@0 | 1539 | |
michael@0 | 1540 | /** |
michael@0 | 1541 | * Main -- process command line, read in and pre-process the test file, |
michael@0 | 1542 | * call other functions to do the actual tests. |
michael@0 | 1543 | */ |
michael@0 | 1544 | int main(int argc, char *argv[]) { |
michael@0 | 1545 | |
michael@0 | 1546 | argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), |
michael@0 | 1547 | options); |
michael@0 | 1548 | |
michael@0 | 1549 | // error handling, printing usage message |
michael@0 | 1550 | if (argc < 0) { |
michael@0 | 1551 | fprintf(stdout, "error in command line argument: "); |
michael@0 | 1552 | fprintf(stdout, argv[-argc]); |
michael@0 | 1553 | fprintf(stdout, "\n"); |
michael@0 | 1554 | } |
michael@0 | 1555 | if (argc < 0 || options[0].doesOccur || options[1].doesOccur) { |
michael@0 | 1556 | fprintf(stdout, "Usage: dumpce options...\n" |
michael@0 | 1557 | "--help\n" |
michael@0 | 1558 | " Display this message.\n" |
michael@0 | 1559 | "--locale name|all\n" |
michael@0 | 1560 | " ICU locale to use. Default is en_US\n" |
michael@0 | 1561 | "--serialize\n" |
michael@0 | 1562 | " Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n" |
michael@0 | 1563 | "--destdir dir_name\n" |
michael@0 | 1564 | " Path for outputing the serialized collation elements. Defaults to stdout if no defined\n" |
michael@0 | 1565 | "--sourcedir dir_name\n" |
michael@0 | 1566 | " Path for the input rule file for collation\n" |
michael@0 | 1567 | "--attribute name=value,name=value...\n" |
michael@0 | 1568 | " Pairs of attribute names and values for setting\n" |
michael@0 | 1569 | "--rule filename\n" |
michael@0 | 1570 | " Name of file containing the collation rules.\n" |
michael@0 | 1571 | "--normalizaton mode\n" |
michael@0 | 1572 | " UNormalizationMode mode to be used.\n" |
michael@0 | 1573 | "--scripts\n" |
michael@0 | 1574 | " Codepoints from all scripts are sorted and serialized.\n" |
michael@0 | 1575 | "--reducehan\n" |
michael@0 | 1576 | " Only 200 Han script characters will be displayed with the use of --scripts.\n" |
michael@0 | 1577 | "--wholescripts\n" |
michael@0 | 1578 | " Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n"); |
michael@0 | 1579 | |
michael@0 | 1580 | fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n"); |
michael@0 | 1581 | fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n"); |
michael@0 | 1582 | return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; |
michael@0 | 1583 | } |
michael@0 | 1584 | |
michael@0 | 1585 | OUTPUT_ = stdout; |
michael@0 | 1586 | if (options[6].doesOccur) { |
michael@0 | 1587 | fprintf(stdout, "attributes %s\n", options[6].value); |
michael@0 | 1588 | parseAttributes(); |
michael@0 | 1589 | } |
michael@0 | 1590 | if (options[3].doesOccur) { |
michael@0 | 1591 | serialize(); |
michael@0 | 1592 | } |
michael@0 | 1593 | if (options[9].doesOccur) { |
michael@0 | 1594 | serializeScripts(); |
michael@0 | 1595 | } |
michael@0 | 1596 | return 0; |
michael@0 | 1597 | } |