intl/icu/source/tools/dumpce/dumpce.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/dumpce/dumpce.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1597 @@
     1.4 +/********************************************************************
     1.5 + * COPYRIGHT:
     1.6 + * Copyright (C) 2001-2011 IBM, Inc.   All Rights Reserved.
     1.7 + *
     1.8 + ********************************************************************/
     1.9 +/********************************************************************************
    1.10 +*
    1.11 +* File dumpce.cpp
    1.12 +*
    1.13 +* Modification History:
    1.14 +* Name          Date           Description
    1.15 +* synwee        May 31 2001    Creation
    1.16 +*
    1.17 +*********************************************************************************
    1.18 +*/
    1.19 +
    1.20 +/**
    1.21 +* This program outputs the collation elements used for a requested tailoring.
    1.22 +*
    1.23 +* Usage:
    1.24 +*     dumpce options... please check main function.
    1.25 +*/
    1.26 +#include <unicode/utypes.h>
    1.27 +#include <unicode/ucol.h>
    1.28 +#include <unicode/uloc.h>
    1.29 +#include <unicode/ucoleitr.h>
    1.30 +#include <unicode/uchar.h>
    1.31 +#include <unicode/uscript.h>
    1.32 +#include <unicode/utf16.h>
    1.33 +#include <unicode/putil.h>
    1.34 +#include <unicode/ustring.h>
    1.35 +#include <stdio.h>
    1.36 +#include <stdlib.h>
    1.37 +#include <string.h>
    1.38 +#include <time.h>
    1.39 +#include "ucol_tok.h"
    1.40 +#include "cstring.h"
    1.41 +#include "uoptions.h"
    1.42 +#include "ucol_imp.h"
    1.43 +#include <unicode/ures.h>
    1.44 +#include <unicode/uniset.h>
    1.45 +#include <unicode/usetiter.h>
    1.46 +
    1.47 +/**
    1.48 +* Command line option table; the numbers in the comments are the indices.
    1.49 +* Code in this file tests options[i].doesOccur and reads options[i].value:
    1.50 +* 2 = locale, 3 = serialize, 4 = destdir, 5 = sourcedir, 6 = attribute, 7 = rule file, 11 = verbose.
    1.51 +*/
    1.52 +static UOption options[]={
    1.53 +    /* 00 */ UOPTION_HELP_H, 
    1.54 +    /* 01 */ UOPTION_HELP_QUESTION_MARK,
    1.55 +    /* 02 */ {"locale",        NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},
    1.56 +    /* 03 */ {"serialize",     NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},
    1.57 +	/* 04 */ UOPTION_DESTDIR,
    1.58 +    /* 05 */ UOPTION_SOURCEDIR,
    1.59 +    /* 06 */ {"attribute",     NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},
    1.60 +    /* 07 */ {"rule",          NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},
    1.61 +    /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},
    1.62 +    /* 09 */ {"scripts",       NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},
    1.63 +    /* 10 */ {"reducehan",     NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},
    1.64 +	/* 11 */ UOPTION_VERBOSE,
    1.65 +    /* 12 */ {"wholescripts",      NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0}
    1.66 +};
    1.67 +
    1.68 +/**
    1.69 +* Collator shared by the dump and comparison routines in this program
    1.70 +*/
    1.71 +static UCollator *COLLATOR_;
    1.72 +/**
    1.73 +* Output stream that the dump routines in this program write to
    1.74 +*/
    1.75 +static FILE *OUTPUT_;
    1.76 +// Attribute values to apply, indexed by UColAttribute; UCOL_DEFAULT entries are skipped by setAttributes().
    1.77 +static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = {
    1.78 +    UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, 
    1.79 +    UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,
    1.80 +};
    1.81 +
    1.82 +// Pairs an enum value with its name(s); aliases are separated by '|'.
    1.83 +typedef struct {
    1.84 +    int         value;
    1.85 +    const char *name; // string literal, NULL in the entry that ends a table
    1.86 +} EnumNameValuePair;
    1.87 +
    1.88 +// Names of the collation attributes, used for parsing and for printing.
    1.89 +static const EnumNameValuePair ATTRIBUTE_NAME_[] = {
    1.90 +    {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"},
    1.91 +    {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"},
    1.92 +    {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"},
    1.93 +    {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"},
    1.94 +    {UCOL_NORMALIZATION_MODE, "UCOL_NORMALIZATION_MODE|UCOL_DECOMPOSITION_MODE"},
    1.95 +    {UCOL_STRENGTH, "UCOL_STRENGTH"},
    1.96 +    {UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"},
    1.97 +    {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"},
    1.98 +    {0, NULL}
    1.99 +};
   1.100 +// Names of the attribute values, used for parsing and for printing.
   1.101 +static const EnumNameValuePair ATTRIBUTE_VALUE_[] = {
   1.102 +    {UCOL_PRIMARY, "UCOL_PRIMARY"},
   1.103 +    {UCOL_SECONDARY, "UCOL_SECONDARY"},
   1.104 +    {UCOL_TERTIARY, "UCOL_TERTIARY|UCOL_DEFAULT_STRENGTH"},
   1.105 +    {UCOL_QUATERNARY, "UCOL_QUATERNARY"},
   1.106 +    {UCOL_IDENTICAL, "UCOL_IDENTICAL"},
   1.107 +    {UCOL_OFF, "UCOL_OFF"}, {UCOL_ON, "UCOL_ON"},
   1.108 +    {UCOL_SHIFTED, "UCOL_SHIFTED"},
   1.109 +    {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"},
   1.110 +    {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"},
   1.111 +    {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"},
   1.112 +    {0, NULL}
   1.113 +};
   1.114 +
   1.115 +typedef struct { // one short string handled by compareSortKey() and outputScriptElem()
   1.116 +    UChar ch[32];
   1.117 +    int   count; // length of ch as passed to ucol_strcoll/ucol_getSortKey (original note: number of codepoint)
   1.118 +    UBool tailored; // NOTE(review): not read in the visible part of the file - confirm meaning at its users
   1.119 +} ScriptElement;
   1.120 +
   1.121 +/**
   1.122 +* Writes each UChar of a null-terminated array into a file as " xxxx"
   1.123 +* (lowercase hex). The first unit is always written: "" gives " 0000".
   1.124 +* @param f output stream
   1.125 +* @param c null-terminated UChar array
   1.126 +*/
   1.127 +void serialize(FILE *f, const UChar *c) 
   1.128 +{
   1.129 +    UChar cp = *(c ++);
   1.130 +    fprintf(f, " %04x", cp);
   1.131 +    if (cp == 0) { // terminator consumed: reading on would leave the array
   1.132 +        return;
   1.133 +    }
   1.134 +    for (; *c != 0; c ++) {
   1.135 +        fprintf(f, " %04x", *c);
   1.136 +    }
   1.137 +}
   1.138 +
   1.139 +/**
   1.140 +* Writes the hexadecimal form of a counted, not necessarily null-terminated,
   1.141 +* array of UChars into a file, each unit as " xxxx".
   1.142 +* Nothing is written when l is zero or negative, and the array is never
   1.143 +* read beyond index l - 1.
   1.144 +* @param f output stream
   1.145 +* @param c UChar array
   1.146 +* @param l number of UChars in c to write
   1.147 +*/
   1.148 +void serialize(FILE *f, const UChar *c, int l) 
   1.149 +{
   1.150 +    if (l <= 0) {
   1.151 +        // empty range: c[0] may not be readable
   1.152 +        return;
   1.153 +    }
   1.154 +    const UChar *limit = c + l;
   1.155 +    for (; c < limit; c ++) {
   1.156 +        fprintf(f, " %04x", *c);
   1.157 +    }
   1.158 +}
   1.159 +
   1.160 +/**
   1.161 +* Writes one output line for the text currently set in the iterator:
   1.162 +* the code units, "; ", every collation element as "[pppp, ss, tt] " and
   1.163 +* the sort key bytes as "[zz zz ...]".
   1.164 +* Text, length flag and collator are read from iter->iteratordata_; when
   1.165 +* options[11] (verbose) occurs the code units are echoed to stdout too.
   1.166 +* @param f file output stream
   1.167 +* @param iter collation element iterator whose text has been set
   1.168 +*/
   1.169 +void serialize(FILE *f, UCollationElements *iter) {
   1.170 +    const UChar *codepoint = iter->iteratordata_.string;
   1.171 +    const UBool  haslength = 
   1.172 +        (iter->iteratordata_.flags & UCOL_ITER_HASLEN) != 0;
   1.173 +    // -1 tells ucol_getSortKey that the text is null-terminated
   1.174 +    int          length    = -1;
   1.175 +    // unlikely that sortkeys will be over this size 
   1.176 +    uint8_t      sortkey[64] = { 0 };
   1.177 +    const int    capacity  = (int)sizeof(sortkey);
   1.178 +
   1.179 +    if (haslength) {
   1.180 +        length = (int)(iter->iteratordata_.endp - codepoint);
   1.181 +        serialize(f, codepoint, length);
   1.182 +    }
   1.183 +    else {
   1.184 +        serialize(f, codepoint);
   1.185 +    }
   1.186 +    int sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, 
   1.187 +                                        length, sortkey, capacity);
   1.188 +    if (options[11].doesOccur) {
   1.189 +        // the text need not be terminated, so honour its length here too
   1.190 +        if (haslength) {
   1.191 +            serialize(stdout, codepoint, length);
   1.192 +        }
   1.193 +        else {
   1.194 +            serialize(stdout, codepoint);
   1.195 +        }
   1.196 +        fprintf(stdout, "\n");
   1.197 +    }
   1.198 +
   1.199 +    fprintf(f, "; ");
   1.200 +    UErrorCode error = U_ZERO_ERROR;
   1.201 +    uint32_t   ce    = ucol_next(iter, &error);
   1.202 +    // testing before printing keeps UCOL_NULLORDER itself out of the output
   1.203 +    while (U_SUCCESS(error) && ce != UCOL_NULLORDER) {
   1.204 +        fprintf(f, "[");
   1.205 +        if (UCOL_PRIMARYORDER(ce) != 0) {
   1.206 +            fprintf(f, "%04x", UCOL_PRIMARYORDER(ce));
   1.207 +        }
   1.208 +        fprintf(f, ",");
   1.209 +        if (UCOL_SECONDARYORDER(ce) != 0) {
   1.210 +            fprintf(f, " %02x", UCOL_SECONDARYORDER(ce));
   1.211 +        }
   1.212 +        fprintf(f, ",");
   1.213 +        if (UCOL_TERTIARYORDER(ce) != 0) {
   1.214 +            fprintf(f, " %02x", UCOL_TERTIARYORDER(ce));
   1.215 +        }
   1.216 +        fprintf(f, "] ");
   1.217 +        ce = ucol_next(iter, &error);
   1.218 +    }
   1.219 +    if (U_FAILURE(error)) {
   1.220 +        fprintf(f, "Error retrieving collation elements\n");
   1.221 +        return;
   1.222 +    }
   1.223 +
   1.224 +    int limit = sortkeylength;
   1.225 +    if (sortkeylength > capacity) {
   1.226 +        fprintf(f, "Sortkey exceeds pre-allocated size");
   1.227 +        limit = capacity; // only this much of the buffer was available
   1.228 +    }
   1.229 +
   1.230 +    fprintf(f, "[");
   1.231 +    // stop at the terminating zero, but never read past the stored bytes
   1.232 +    for (int i = 0; i < limit && sortkey[i] != 0; i ++) {
   1.233 +        fprintf(f, i == 0 ? "%02x" : " %02x", sortkey[i]);
   1.234 +    }
   1.235 +    fprintf(f, "]\n");
   1.236 +}
   1.236 +
   1.237 +/**
   1.238 +* Walks the tokens of a collation rule string and writes the collation
   1.239 +* elements of each token's characters to the output stream.
   1.240 +* @param f file output stream
   1.241 +* @param rule rule string; the tokenizer is given rule + rlen + 
   1.242 +*        UCOL_TOK_EXTRA_RULE_SPACE_SIZE as the end of its extra space
   1.243 +* @param rlen rule length
   1.244 +* @param contractiononly if TRUE only tokens longer than one UChar 
   1.245 +*        (contractions) are written, otherwise every token
   1.246 +* @param iter iterator to iterate over collation elements
   1.247 +*/
   1.248 +void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly, 
   1.249 +               UCollationElements *iter) {
   1.250 +    UBool            rstart = TRUE;
   1.251 +    UColTokenParser  src;
   1.252 +    UColOptionSet    opts;
   1.253 +    UParseError      parseError;
   1.254 +    UErrorCode       error  = U_ZERO_ERROR;
   1.255 +
   1.256 +    // Only a handful of parser fields are set by hand below; clear the rest
   1.257 +    // so that the tokenizer never starts from indeterminate values.
   1.258 +    memset(&src, 0, sizeof(src));
   1.259 +    memset(&opts, 0, sizeof(opts));
   1.260 +    src.opts = &opts;
   1.261 +
   1.262 +    src.source       = rule; 
   1.263 +    src.current      = rule;
   1.264 +    src.end          = rule + rlen;
   1.265 +    src.extraCurrent = src.end;
   1.266 +    src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   1.267 +
   1.268 +    while (ucol_tok_parseNextToken(&src, rstart, &parseError, 
   1.269 +                                   &error) != NULL) {
   1.270 +        uint32_t chOffset = src.parsedToken.charsOffset;
   1.271 +        uint32_t chLen    = src.parsedToken.charsLen;
   1.272 +        // contractions handled here
   1.273 +        if (!contractiononly || chLen > 1) {
   1.274 +            ucol_setText(iter, rule + chOffset, chLen, &error);
   1.275 +            if (U_FAILURE(error)) {
   1.276 +                fprintf(stdout, "Error setting text in iterator\n");
   1.277 +                return;
   1.278 +            }
   1.279 +            serialize(f, iter);
   1.280 +        }
   1.281 +        rstart = FALSE;
   1.282 +    }
   1.283 +    if (U_FAILURE(error)) {
   1.284 +        // a NULL token with a failure status is a parse error, not the end
   1.285 +        fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error));
   1.286 +    }
   1.287 +}
   1.288 +
   1.289 +// Prints the table name of value, or the plain number if the table (which
   1.290 +// ends at the entry with a NULL name) has none, followed by suffix.
   1.291 +static void printEnumName(const EnumNameValuePair *table, int value,
   1.292 +                          const char *suffix)
   1.293 +{
   1.294 +    while (table->name != NULL && table->value != value) {
   1.295 +        table ++;
   1.296 +    }
   1.297 +    if (table->name != NULL) {
   1.298 +        fprintf(OUTPUT_, "%s%s", table->name, suffix);
   1.299 +    }
   1.300 +    else {
   1.301 +        fprintf(OUTPUT_, "%d%s", value, suffix);
   1.302 +    }
   1.303 +}
   1.304 +
   1.305 +/**
   1.306 +* Prints "name = value" for every attribute of the argument collator into
   1.307 +* OUTPUT_.
   1.308 +* @param collator collator to query
   1.309 +* @param error status; on failure a message goes to stdout and output stops
   1.310 +*/
   1.311 +void outputAttribute(UCollator *collator, UErrorCode *error) 
   1.312 +{
   1.313 +    for (int attr = UCOL_FRENCH_COLLATION; attr < UCOL_ATTRIBUTE_COUNT; attr ++) {
   1.314 +        printEnumName(ATTRIBUTE_NAME_, attr, " = ");
   1.315 +        int attributeval = ucol_getAttribute(collator, (UColAttribute)attr, error);
   1.316 +        if (U_FAILURE(*error)) {
   1.317 +            fprintf(stdout, "Failure in reading collator attribute\n");
   1.318 +            return;
   1.319 +        }
   1.320 +        printEnumName(ATTRIBUTE_VALUE_, attributeval, "\n");
   1.321 +    }
   1.322 +}
   1.323 +
   1.324 +/**
   1.325 +* Prints the normalization mode of the argument collator into OUTPUT_;
   1.326 +* a mode missing from ATTRIBUTE_VALUE_, or a failed query, prints a number.
   1.327 +* @param collator collator to query
   1.328 +*/
   1.329 +void outputNormalization(UCollator *collator) 
   1.330 +{
   1.331 +    UErrorCode status = U_ZERO_ERROR;
   1.332 +    int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status);
   1.333 +    const EnumNameValuePair *pair = ATTRIBUTE_VALUE_;
   1.334 +    while (pair->name != NULL && pair->value != normmode) { // NULL name = end
   1.335 +        pair ++;
   1.336 +    }
   1.337 +    if (U_FAILURE(status) || pair->name == NULL) {
   1.338 +        fprintf(OUTPUT_, "NORMALIZATION MODE = %d\n", normmode);
   1.339 +        return;
   1.340 +    }
   1.341 +    fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n", pair->name);
   1.342 +}
   1.343 +
   1.344 +/**
   1.345 +* Output the collation elements of COLLATOR_, with a header, into OUTPUT_
   1.346 +* @param locale name printed in the header and on stdout
   1.347 +* @param tailoredonly TRUE: only the tokens of ucol_getRules(); FALSE: all
   1.348 +*        defined code points, then the contractions of the full rules
   1.349 +*/
   1.350 +void serialize(const char *locale, UBool tailoredonly) {
   1.351 +    UErrorCode  error    = U_ZERO_ERROR;
   1.352 +    UChar       str[128] = { 0 };
   1.353 +    int         length   = 0;
   1.354 +
   1.355 +    fprintf(OUTPUT_, "# This file contains the serialized collation elements\n");
   1.356 +    fprintf(OUTPUT_, "# as of the collation version indicated below.\n");
   1.357 +    fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n");
   1.358 +    fprintf(OUTPUT_, "#              where xxxx are codepoints in hexadecimals,\n");
   1.359 +    fprintf(OUTPUT_, "#              yyyyyyyy are the corresponding\n");
   1.360 +    fprintf(OUTPUT_, "#              collation elements in hexadecimals\n");
   1.361 +    fprintf(OUTPUT_, "#              and zz are the sortkey values in hexadecimals\n");
   1.362 +
   1.363 +    fprintf(OUTPUT_, "\n# Collator information\n");
   1.364 +
   1.365 +    fprintf(OUTPUT_, "\nLocale: %s\n", locale);
   1.366 +    fprintf(stdout, "Locale: %s\n", locale);
   1.367 +    UVersionInfo version;
   1.368 +    ucol_getVersion(COLLATOR_, version);
   1.369 +    fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n", 
   1.370 +                      version[0], version[1], version[2], version[3]);
   1.371 +    outputAttribute(COLLATOR_, &error);
   1.372 +    outputNormalization(COLLATOR_);
   1.373 +
   1.374 +    UCollationElements *iter = ucol_openElements(COLLATOR_, str, length, 
   1.375 +                                                 &error);
   1.376 +    if (U_FAILURE(error)) {
   1.377 +        fprintf(stdout, "Error creating iterator\n");
   1.378 +        return;
   1.379 +    }
   1.380 +
   1.381 +    if (!tailoredonly) {
   1.382 +        fprintf(OUTPUT_, "\n# Range of unicode characters\n\n");
   1.383 +        for (UChar32 codepoint = 0; codepoint <= UCHAR_MAX_VALUE; codepoint ++) {
   1.384 +            if (u_isdefined(codepoint)) {
   1.385 +                length = 0;
   1.386 +                UTF16_APPEND_CHAR_UNSAFE(str, length, codepoint);
   1.387 +                str[length] = 0;
   1.388 +                ucol_setText(iter, str, length, &error);
   1.389 +                if (U_FAILURE(error)) {
   1.390 +                    fprintf(stdout, "Error setting text in iterator\n");
   1.391 +                    ucol_closeElements(iter); // do not leak the iterator
   1.392 +                    return;
   1.393 +                }
   1.394 +                serialize(OUTPUT_, iter);
   1.395 +            }
   1.396 +        }
   1.397 +    }
   1.398 +
   1.399 +    // UCOL_TOK_EXTRA_RULE_SPACE_SIZE spare UChars must follow the rules
   1.400 +    UChar    ucarules[0x10000];
   1.401 +    UChar   *rules      = ucarules;
   1.402 +    int32_t  rulelength = 0;
   1.403 +    if (tailoredonly) {
   1.404 +        const UChar *temp = ucol_getRules(COLLATOR_, &rulelength);
   1.405 +        if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
   1.406 +            rules = (UChar *)malloc(sizeof(UChar) * 
   1.407 +                                (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
   1.408 +        }
   1.409 +        if (rules != NULL) {
   1.410 +            memcpy(rules, temp, rulelength * sizeof(UChar));
   1.411 +            rules[rulelength] = 0;
   1.412 +        }
   1.413 +        fprintf(OUTPUT_, "\n# Tailorings\n\n");
   1.414 +    }
   1.415 +    else {        
   1.416 +        rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules, 
   1.417 +                                     0x10000);
   1.418 +        if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
   1.419 +            rules = (UChar *)malloc(sizeof(UChar) * 
   1.420 +                                (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
   1.421 +            if (rules != NULL) {
   1.422 +                rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, 
   1.423 +                                             rules, rulelength);
   1.424 +            }
   1.425 +        }
   1.426 +        fprintf(OUTPUT_, "\n# Contractions\n\n");
   1.427 +    }
   1.428 +    if (rules == NULL) {
   1.429 +        fprintf(stdout, "Out of memory while copying rules\n");
   1.430 +    }
   1.431 +    else {
   1.432 +        // only the dump of the full rules is restricted to contractions
   1.433 +        serialize(OUTPUT_, rules, rulelength, (UBool)!tailoredonly, iter);
   1.434 +        if (rules != ucarules) { free(rules); }
   1.435 +    }
   1.436 +    ucol_closeElements(iter);
   1.437 +}
   1.438 +
   1.439 +/**
   1.440 +* Applies every entry of ATTRIBUTE_ that is not UCOL_DEFAULT to the collator,
   1.441 +* stopping at the first failure.
   1.442 +* @param collator collator to modify
   1.443 +* @param error status, left as set by ucol_setAttribute
   1.444 +*/
   1.445 +void setAttributes(UCollator *collator, UErrorCode *error) 
   1.446 +{
   1.447 +    for (int attr = 0; attr < UCOL_ATTRIBUTE_COUNT; attr ++) {
   1.448 +        const UColAttributeValue wanted = ATTRIBUTE_[attr];
   1.449 +        if (wanted == UCOL_DEFAULT) {
   1.450 +            continue;
   1.451 +        }
   1.452 +        ucol_setAttribute(collator, (UColAttribute)attr, wanted, error);
   1.453 +        if (U_FAILURE(*error)) {
   1.454 +            break;
   1.455 +        }
   1.456 +    }
   1.457 +}
   1.458 +
   1.459 +/**
   1.460 +* Appends U_FILE_SEP_CHAR to a directory path unless it already ends with 
   1.461 +* one. An empty path is left empty, so that it keeps meaning "no directory".
   1.462 +* @param dir path with enough space to append one separator
   1.463 +* @return new directory path length
   1.464 +*/
   1.465 +int appendDirSeparator(char *dir) 
   1.466 +{
   1.467 +    int dirlength = (int)strlen(dir);
   1.468 +    // dir[dirlength - 1] does not exist for an empty path
   1.469 +    if (dirlength > 0 && dir[dirlength - 1] != U_FILE_SEP_CHAR) {
   1.470 +        dir[dirlength ++] = U_FILE_SEP_CHAR;
   1.471 +        dir[dirlength]    = 0;
   1.472 +    }
   1.473 +    return dirlength;
   1.474 +}
   1.475 +
   1.476 +// Applies the requested attributes to COLLATOR_, just opened with status
   1.477 +// error; reports a failure on stdout. Returns TRUE if COLLATOR_ is usable.
   1.478 +static UBool readyCollator(UErrorCode error)
   1.479 +{
   1.480 +    const char *failure = "Collator creation failed:";
   1.481 +    if (U_SUCCESS(error)) {
   1.482 +        failure = "Collator attribute setting failed:";
   1.483 +        setAttributes(COLLATOR_, &error);
   1.484 +    }
   1.485 +    if (U_FAILURE(error)) {
   1.486 +        // the error name is data, so it must not be used as a format string
   1.487 +        fprintf(stdout, "%s%s", failure, u_errorName(error));
   1.488 +        return FALSE;
   1.489 +    }
   1.490 +    return TRUE;
   1.491 +}
   1.492 +
   1.493 +// Opens <first dirlength chars of filename><name><suffix> for writing as
   1.494 +// OUTPUT_. On failure prints a message, keeps OUTPUT_ and returns FALSE.
   1.495 +static UBool openOutput(char *filename, size_t capacity, int dirlength,
   1.496 +                        const char *name, const char *suffix)
   1.497 +{
   1.498 +    filename[dirlength] = 0;
   1.499 +    if (dirlength + strlen(name) + strlen(suffix) >= capacity) {
   1.500 +        fprintf(stdout, "File name too long:%s%s%s\n", filename, name, suffix);
   1.501 +        return FALSE;
   1.502 +    }
   1.503 +    strcat(filename, name);
   1.504 +    strcat(filename, suffix);
   1.505 +    FILE *file = fopen(filename, "w");
   1.506 +    if (file == NULL) {
   1.507 +        fprintf(stdout, "Cannot open file:%s\n", filename);
   1.508 +        return FALSE;
   1.509 +    }
   1.510 +    OUTPUT_ = file;
   1.511 +    return TRUE;
   1.512 +}
   1.513 +
   1.514 +/**
   1.515 +* Output the collation elements into a file.
   1.516 +* Handles the locale option (2; "all" dumps en_US as "UCA" and then every
   1.517 +* available locale) and the rule file option (7). With a destination
   1.518 +* directory (4) each dump gets its own file, otherwise OUTPUT_ is used.
   1.519 +*/
   1.520 +void serialize() {
   1.521 +    char        filename[128] = "";
   1.522 +    int         dirlength     = 0;
   1.523 +    const UBool tofile        = options[4].doesOccur;
   1.524 +    FILE       *fallback      = OUTPUT_; // restored whenever a file is closed
   1.525 +
   1.526 +    if (tofile) {
   1.527 +        // room is needed for the separator and the terminator as well
   1.528 +        if (strlen(options[4].value) + 2 > sizeof(filename)) {
   1.529 +            fprintf(stdout, "Directory name too long:%s\n", options[4].value);
   1.530 +            return;
   1.531 +        }
   1.532 +        strcpy(filename, options[4].value);
   1.533 +        dirlength = appendDirSeparator(filename);
   1.534 +    }
   1.535 +
   1.536 +    if (options[2].doesOccur) {
   1.537 +        const char *locale      = options[2].value;
   1.538 +        int32_t     localeindex = 0;
   1.539 +        if (strcmp(locale, "all") == 0) {
   1.540 +            if (tofile && !openOutput(filename, sizeof(filename), dirlength,
   1.541 +                                      "UCA", ".txt")) {
   1.542 +                return;
   1.543 +            }
   1.544 +            fprintf(stdout, "UCA\n");
   1.545 +            UErrorCode error = U_ZERO_ERROR;
   1.546 +            COLLATOR_ = ucol_open("en_US", &error);
   1.547 +            if (readyCollator(error)) {
   1.548 +                serialize("UCA", FALSE);
   1.549 +            }
   1.550 +            ucol_close(COLLATOR_);
   1.551 +            if (tofile) {
   1.552 +                fclose(OUTPUT_);
   1.553 +                OUTPUT_ = fallback;
   1.554 +            }
   1.555 +            localeindex = ucol_countAvailable() - 1;
   1.556 +            fprintf(stdout, "Number of locales: %d\n", localeindex + 1);
   1.557 +            locale = localeindex >= 0 ? ucol_getAvailable(localeindex) : NULL;
   1.558 +        }
   1.559 +
   1.560 +        while (locale != NULL) {
   1.561 +            UErrorCode error = U_ZERO_ERROR;
   1.562 +            COLLATOR_ = ucol_open(locale, &error);
   1.563 +            if (readyCollator(error)) {
   1.564 +                // a file is opened, and closed, only for a usable collator
   1.565 +                if (tofile && !openOutput(filename, sizeof(filename), 
   1.566 +                                          dirlength, locale, ".txt")) {
   1.567 +                    ucol_close(COLLATOR_);
   1.568 +                    return;
   1.569 +                }
   1.570 +                if (options[3].doesOccur) {
   1.571 +                    serialize(locale, TRUE);
   1.572 +                }
   1.573 +                if (tofile) {
   1.574 +                    fclose(OUTPUT_);
   1.575 +                    OUTPUT_ = fallback;
   1.576 +                }
   1.577 +            }
   1.578 +            ucol_close(COLLATOR_);
   1.579 +            localeindex --;
   1.580 +            locale = localeindex >= 0 ? ucol_getAvailable(localeindex) : NULL;
   1.581 +        }
   1.582 +    }
   1.583 +
   1.584 +    if (options[7].doesOccur) {
   1.585 +        char inputfilename[128] = "";
   1.586 +        // rules are to be used
   1.587 +        if (options[5].doesOccur) {
   1.588 +            if (strlen(options[5].value) + 2 > sizeof(inputfilename)) {
   1.589 +                fprintf(stdout, "Directory name too long:%s\n", options[5].value);
   1.590 +                return;
   1.591 +            }
   1.592 +            strcpy(inputfilename, options[5].value);
   1.593 +            appendDirSeparator(inputfilename);
   1.594 +        }
   1.595 +        if (strlen(inputfilename) + strlen(options[7].value) >= sizeof(inputfilename)) {
   1.596 +            fprintf(stdout, "File name too long:%s\n", options[7].value);
   1.597 +            return;
   1.598 +        }
   1.599 +        strcat(inputfilename, options[7].value);
   1.600 +        FILE *input = fopen(inputfilename, "r");
   1.601 +        if (input == NULL) {
   1.602 +            fprintf(stdout, "Cannot open file:%s\n", inputfilename);
   1.603 +            return;
   1.604 +        }
   1.605 +        char    s[1024];
   1.606 +        UChar   rule[1024];
   1.607 +        int32_t used = 0;
   1.608 +        rule[0] = 0;
   1.609 +        // synwee TODO: make this part dynamic
   1.610 +        while (fgets(s, sizeof(s), input) != NULL) {
   1.611 +            s[strcspn(s, "\n")] = 0; // lines are joined without the newline
   1.612 +            int32_t length = u_unescape(s, rule + used, 1024 - used);
   1.613 +            if (length >= 1024 - used) {
   1.614 +                fprintf(stdout, "Rules too long in file:%s\n", inputfilename);
   1.615 +                fclose(input);
   1.616 +                return;
   1.617 +            }
   1.618 +            used += length;
   1.619 +        }
   1.620 +        fclose(input);
   1.621 +        if (tofile && !openOutput(filename, sizeof(filename), dirlength, 
   1.622 +                                  "Rules", ".txt")) {
   1.623 +            return;
   1.624 +        }
   1.625 +        fprintf(stdout, "Rules\n");
   1.626 +        UErrorCode  error = U_ZERO_ERROR;
   1.627 +        UParseError parseError;
   1.628 +        COLLATOR_ = ucol_openRules(rule, used, UCOL_DEFAULT, 
   1.629 +                                   UCOL_DEFAULT_STRENGTH, &parseError, &error);
   1.630 +        if (readyCollator(error)) {
   1.631 +            serialize("Rule-based", TRUE);
   1.632 +        }
   1.633 +        ucol_close(COLLATOR_);
   1.634 +        if (tofile) {
   1.635 +            fclose(OUTPUT_);
   1.636 +            OUTPUT_ = fallback;
   1.637 +        }
   1.638 +    }
   1.639 +}
   1.640 +
   1.641 +/**
   1.642 +* Parse for enum values.
   1.643 +* Note this only works for positive enum values.
   1.644 +* @param enumarray table of enum values and their names; a name may list
   1.645 +*        several aliases separated by '|', and a NULL name ends the table
   1.646 +* @param str string to be parsed: one of the names, or a decimal number
   1.647 +*        equal to one of the values in the table
   1.648 +* @return corresponding integer enum value or -1 if value is not found.
   1.649 +*/
   1.650 +int parseEnums(const EnumNameValuePair enumarray[], const char *str) 
   1.651 +{
   1.652 +    const size_t length  = strlen(str);
   1.653 +    const int    number  = atoi(str);
   1.654 +    // atoi() yields 0 for non-numbers too, so "0" is told apart by its digit
   1.655 +    const UBool  numeric = (UBool)(number != 0 || str[0] == '0');
   1.656 +    for (int i = 0; enumarray[i].name != NULL; i ++) {
   1.657 +        if (numeric) {
   1.658 +            if (enumarray[i].value == number) {
   1.659 +                return number;
   1.660 +            }
   1.661 +            continue;
   1.662 +        }
   1.663 +        // compare str against every '|' separated alias in full
   1.664 +        const char *alias = enumarray[i].name;
   1.665 +        while (alias != NULL) {
   1.666 +            const char *bar  = strchr(alias, '|');
   1.667 +            size_t      size = bar != NULL ? (size_t)(bar - alias) : strlen(alias);
   1.668 +            if (size == length && strncmp(alias, str, size) == 0) {
   1.669 +                return enumarray[i].value;
   1.670 +            }
   1.671 +            alias = bar != NULL ? bar + 1 : NULL;
   1.672 +        }
   1.673 +    }
   1.674 +    return -1;
   1.675 +}
   1.676 +
   1.677 +// Parses the text in [begin, end) with parseEnums(); -1 if it does not fit str.
   1.678 +static int parseEnumRange(const EnumNameValuePair enumarray[], const char *begin, const char *end)
   1.679 +{
   1.680 +    char   str[32];
   1.681 +    size_t count = end - begin;
   1.682 +    if (count >= sizeof(str)) {
   1.683 +        return -1;
   1.684 +    }
   1.685 +    memcpy(str, begin, count);
   1.686 +    str[count] = 0;
   1.687 +    return parseEnums(enumarray, str);
   1.688 +}
   1.689 +
   1.690 +/**
   1.691 +* Parser for attribute name value pair: reads "name=value,name=value" from
   1.692 +* options[6].value into ATTRIBUTE_ and stops with a message at the first error
   1.693 +*/
   1.694 +void parseAttributes() {
   1.695 +    const char *pname = options[6].value;
   1.696 +    const char *pend  = pname + strlen(pname);
   1.697 +
   1.698 +    while (pname < pend) {
   1.699 +        const char *pvalue = strchr(pname, '=');
   1.700 +        if (pvalue == NULL) {
   1.701 +            fprintf(stdout, "No matching value found for attribute argument %s\n", pname);
   1.702 +            return;
   1.703 +        }
   1.704 +        int name = parseEnumRange(ATTRIBUTE_NAME_, pname, pvalue);
   1.705 +        if (name < 0 || name >= UCOL_ATTRIBUTE_COUNT) {
   1.706 +            fprintf(stdout, "Attribute name not found: %.*s\n", (int)(pvalue - pname), pname);
   1.707 +            return;
   1.708 +        }
   1.709 +        pvalue ++;
   1.710 +        // getting corresponding enum value, which ends at the next ',' or at the end
   1.711 +        const char *comma = strchr(pvalue, ',');
   1.712 +        pname = comma != NULL ? comma : pend;
   1.713 +        int value = parseEnumRange(ATTRIBUTE_VALUE_, pvalue, pname);
   1.714 +        if (value == -1) {
   1.715 +            fprintf(stdout, "Attribute value not found: %.*s\n", (int)(pname - pvalue), pvalue);
   1.716 +            return;
   1.717 +        }
   1.718 +        ATTRIBUTE_[name] = (UColAttributeValue)value;
   1.719 +        pname ++;
   1.720 +    }
   1.721 +}
   1.722 +
   1.723 +/**
   1.724 +* Checks whether the locale id is short enough to be a bare language code
   1.725 +* @param locale locale id to be checked
   1.726 +* @return TRUE if the id is at most two characters long
   1.727 +*/
   1.728 +inline UBool checkLocaleForLanguage(const char *locale)
   1.729 +{
   1.730 +    return locale[0] == 0 || locale[1] == 0 || locale[2] == 0;
   1.731 +}
   1.732 +
   1.733 +/**
   1.734 +* Writes a UChar array into OUTPUT_ in the form "XXXX XXXX " (uppercase hex)
   1.735 +* @param ch array of UChar characters
   1.736 +* @param count number of UChar characters
   1.737 +*/
   1.738 +void outputUChar(UChar ch[], int count)
   1.739 +{
   1.740 +    while (count -- > 0) {
   1.741 +        fprintf(OUTPUT_, "%04X ", *(ch ++));
   1.742 +    }
   1.743 +}
   1.744 +
   1.745 +/**
   1.746 +* Compares the text of two ScriptElements with COLLATOR_, raising the
   1.747 +* strength from primary to tertiary until a difference is found.
   1.748 +* COLLATOR_ keeps the last strength that was tried.
   1.749 +* @param elem1 pointer to the first ScriptElement
   1.750 +* @param elem2 pointer to the second ScriptElement
   1.751 +* @return the ucol_strcoll result for a primary difference, 
   1.752 +*         -2 or 2 for a secondary difference, 
   1.753 +*         -3 or 3 for a tertiary difference, 0 if equal
   1.754 +*/
   1.755 +int compareSortKey(const void *elem1, const void *elem2)
   1.756 +{
   1.757 +    static const UCollationStrength LEVELS[] = {
   1.758 +        UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY
   1.759 +    };
   1.760 +    const ScriptElement *left  = (const ScriptElement *)elem1;
   1.761 +    const ScriptElement *right = (const ScriptElement *)elem2;
   1.762 +
   1.763 +    for (int level = 0; level < 3; level ++) {
   1.764 +        ucol_setStrength(COLLATOR_, LEVELS[level]);
   1.765 +        int result = ucol_strcoll(COLLATOR_, left->ch, left->count, 
   1.766 +                                  right->ch, right->count);
   1.767 +        if (result == 0) {
   1.768 +            // no difference at this strength, try the next one
   1.769 +            continue;
   1.770 +        }
   1.771 +        if (level == 0) {
   1.772 +            // primary differences are reported exactly as returned
   1.773 +            return result;
   1.774 +        }
   1.775 +        // secondary is reported as 2, tertiary as 3, with the sign kept
   1.776 +        if (result < 0) {
   1.777 +            return -(level + 1);
   1.778 +        }
   1.779 +        return level + 1;
   1.780 +    }
   1.781 +    // equal at all three strengths
   1.782 +    return 0;
   1.783 +}
   1.784 +
   1.785 +/**
   1.786 +* Writes one HTML table row describing a script element to OUTPUT_.
   1.787 +* The first cell carries the tertiary sort key in its title attribute and
   1.788 +* shows the NFC form of the element followed by its UTF-16 code units; the
   1.789 +* remaining cells flag the comparison level and list the character names.
   1.790 +* Side effect: the strength of COLLATOR_ is set to UCOL_TERTIARY.
   1.791 +* @param element the element to output
   1.792 +* @param compare the comparison with the previous element: 0 for equal,
   1.793 +*        -1, -2 or -3 for a primary, secondary or tertiary difference
   1.794 +* @param expansion flags TRUE if element has an expansion
   1.795 +*/
   1.796 +void outputScriptElem(ScriptElement &element, int compare, UBool expansion)
   1.797 +{
   1.798 +    // CSS class of the first cell: q, p, s or t for the level (anything but
   1.799 +    // 0, -1 and -2 counts as tertiary), prefixed with 'e' for an expansion.
   1.800 +    const char *level;
   1.801 +    switch (compare) {
   1.802 +    case 0: 
   1.803 +        level = "q";
   1.804 +        break;
   1.805 +    case -1: 
   1.806 +        level = "p";
   1.807 +        break;
   1.808 +    case -2: 
   1.809 +        level = "s";
   1.810 +        break;
   1.811 +    default: 
   1.812 +        level = "t";
   1.813 +    }
   1.814 +    fprintf(OUTPUT_, "<tr><td class='%s%s' title='[", expansion ? "e" : "", level);
   1.815 +
   1.816 +    // ucol_getSortKey() returns the size the key needs. A key that does not
   1.817 +    // fit the stack buffer is regenerated into a heap buffer, and the loop
   1.818 +    // below is bounded by the key length instead of trusting a terminator.
   1.819 +    uint8_t  stackkey[32];
   1.820 +    uint8_t *sortkey = stackkey;
   1.821 +    ucol_setStrength(COLLATOR_, UCOL_TERTIARY);
   1.822 +    int32_t  keylen  = ucol_getSortKey(COLLATOR_, element.ch, element.count,
   1.823 +                                       stackkey, (int32_t)sizeof(stackkey));
   1.824 +    if (keylen > (int32_t)sizeof(stackkey)) {
   1.825 +        sortkey = (uint8_t *)malloc(keylen);
   1.826 +        keylen  = (sortkey == NULL) ? 0 :   // out of memory: show no key
   1.827 +                  ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, keylen);
   1.828 +    }
   1.829 +    // bytes up to the terminating 00 are shown in hex; 01 separates the levels
   1.830 +    for (int32_t k = 0; k < keylen && sortkey[k] != 0; k ++) {
   1.831 +        if (sortkey[k] == 1) {
   1.832 +            fprintf(OUTPUT_, " | ");
   1.833 +        }
   1.834 +        else {
   1.835 +            fprintf(OUTPUT_, "%02x", sortkey[k]);
   1.836 +        }
   1.837 +    }
   1.838 +    if (sortkey != stackkey) {
   1.839 +        free(sortkey);
   1.840 +    }
   1.841 +    fprintf(OUTPUT_, "]'>");
   1.842 +
   1.843 +    UErrorCode error = U_ZERO_ERROR;
   1.844 +    char       utf8[64];
   1.845 +    UChar      nfc[32];
   1.846 +    int32_t    length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc, 
   1.847 +                                        32, &error);
   1.848 +    if (U_FAILURE(error)) {
   1.849 +        fprintf(stdout, "Error normalizing contractions to NFC\n");
   1.850 +    }
   1.851 +    // one byte is held back so that the text can always be NUL-terminated
   1.852 +    u_strToUTF8(utf8, (int32_t)sizeof(utf8) - 1, &length, nfc, length, &error);
   1.853 +    if (U_FAILURE(error)) {
   1.854 +        fprintf(stdout, "Error converting UChar to utf8\n");
   1.855 +        return;
   1.856 +    }
   1.857 +    utf8[length] = 0;
   1.858 +
   1.859 +    fprintf(OUTPUT_, "%s<br>", utf8);
   1.860 +    fprintf(OUTPUT_, "<tt>");
   1.861 +    outputUChar(element.ch, element.count);
   1.862 +
   1.863 +    if (compare == 0) {
   1.864 +        fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>Q</td><td>");
   1.865 +    }
   1.866 +    else if (compare == -1) {
   1.867 +        fprintf(OUTPUT_, "</tt></td><td>P</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>");
   1.868 +    }
   1.869 +    else if (compare == -2) {
   1.870 +        fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>S</td><td>&nbsp;</td><td>&nbsp;</td><td>");
   1.871 +    }
   1.872 +    else if (compare == -3) {
   1.873 +        fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>&nbsp;</td><td>T</td><td>&nbsp;</td><td>");
   1.874 +    }
   1.875 +
   1.876 +    // one line per code point: its Unicode name, bold and starred if tailored
   1.877 +    int i = 0;
   1.878 +    while (i < element.count) {
   1.879 +        char    str[128];
   1.880 +        UChar32 codepoint;
   1.881 +        U16_NEXT(element.ch, i, element.count, codepoint);
   1.882 +        u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128, &error);
   1.883 +        if (U_FAILURE(error)) {
   1.884 +            fprintf(stdout, "Error getting character name\n");
   1.885 +            return;
   1.886 +        }
   1.887 +        if (element.tailored) {
   1.888 +            fprintf(OUTPUT_, "<b>");
   1.889 +        }
   1.890 +        fprintf(OUTPUT_, "%s", str);
   1.891 +        if (element.tailored) {
   1.892 +            fprintf(OUTPUT_, " *</b>");
   1.893 +        }
   1.894 +        if (i < element.count) {
   1.895 +            fprintf(OUTPUT_, "<br>\n");
   1.896 +        }
   1.897 +    }
   1.898 +    fprintf(OUTPUT_, "</td></tr>\n");
   1.899 +}
   1.900 +
   1.901 +/**
   1.902 +* Tests whether a code point belongs to any of the listed scripts.
   1.903 +* For USCRIPT_HAN with command line option 10 present, only U+2E80..U+2EE4
   1.904 +* and U+2A672..U+2A6D6 are accepted rather than the whole script.
   1.905 +* @param script list of script codes
   1.906 +* @param scriptcount number of entries in script
   1.907 +* @param codepoint code point to test
   1.908 +* @return TRUE on a match; FALSE if none or if the script lookup fails
   1.909 +*/
   1.910 +UBool checkInScripts(UScriptCode script[], int scriptcount, 
   1.911 +                     UChar32 codepoint)
   1.912 +{
   1.913 +    UErrorCode status = U_ZERO_ERROR;
   1.914 +    for (int index = 0; index < scriptcount; index ++) {
   1.915 +        // with option 10 the Han script is reduced to two fixed ranges
   1.916 +        const UBool reducedHan = (script[index] == USCRIPT_HAN && options[10].doesOccur);
   1.917 +        const UBool matched = reducedHan
   1.918 +            ? ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) || (codepoint >= 0x2A672 && codepoint <= 0x2A6D6))
   1.919 +            : (uscript_getScript(codepoint, &status) == script[index]);
   1.920 +        if (matched) {
   1.921 +            return TRUE;
   1.922 +        }
   1.923 +        if (U_FAILURE(status)) {
   1.924 +            fprintf(stdout, "Error checking character in scripts\n");
   1.925 +            return FALSE;
   1.926 +        }
   1.927 +    }
   1.928 +    return FALSE;
   1.929 +}
   1.930 +
   1.931 +/**
   1.932 +* Tests whether a script element touches any of the listed scripts.
   1.933 +* @param script list of script codes
   1.934 +* @param scriptcount number of entries in script
   1.935 +* @param scriptelem element whose code points are examined
   1.936 +* @return TRUE as soon as one code point of the element belongs to one of
   1.937 +*         the scripts, FALSE if none does
   1.938 +*/
   1.939 +inline UBool checkInScripts(UScriptCode script[], int scriptcount,
   1.940 +                           ScriptElement scriptelem)
   1.941 +{
   1.942 +    for (int offset = 0; offset < scriptelem.count; ) {
   1.943 +        // U16_NEXT reads one code point and moves offset behind it
   1.944 +        UChar32 current;
   1.945 +        U16_NEXT(scriptelem.ch, offset, scriptelem.count, current);
   1.946 +        if (checkInScripts(script, scriptcount, current)) {
   1.947 +            return TRUE;
   1.948 +        }
   1.949 +    }
   1.950 +    return FALSE;
   1.951 +}
   1.952 +
   1.953 +/**
   1.954 +* Gets the script elements for a locale from its exemplar characters:
   1.955 +* every member of the ExemplarCharacters set and of its upper-cased form
   1.956 +* becomes one element; multi-character set strings become one element each.
   1.957 +* NOTE(review): neither the capacity of scriptelem nor that of each ch
   1.958 +* buffer is checked here - callers must size them generously; confirm.
   1.959 +* @param scriptelem output list
   1.960 +* @param locale locale whose resource bundle is read
   1.961 +* @return number of script elements, or -1 on error (reported on stdout)
   1.962 +* Add by Richard
   1.963 +*/
   1.964 +int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) {
   1.965 +    UErrorCode error = U_ZERO_ERROR;
   1.966 +
   1.967 +    UResourceBundle* ures = ures_open(NULL, locale, &error);
   1.968 +    if (U_FAILURE(error)) {
   1.969 +        fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale);
   1.970 +        return -1;
   1.971 +    }
   1.972 +    int32_t length = 0;
   1.973 +    const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error);
   1.974 +    if (U_FAILURE(error)) {
   1.975 +        fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n");
   1.976 +        ures_close(ures);
   1.977 +        return -1;
   1.978 +    }
   1.979 +
   1.980 +    // Upper-case copy of the exemplar pattern, same capacity as before.
   1.981 +    UChar* upperChars = new UChar[length * 2];
   1.982 +    int32_t destLength = u_strToUpper(upperChars, length * 2, exemplarChars, length, locale, &error);
   1.983 +    if (U_FAILURE(error)) {
   1.984 +        fprintf(stdout, "Error when u_strToUpper() \n");
   1.985 +        delete [] upperChars;
   1.986 +        ures_close(ures);
   1.987 +        return -1;
   1.988 +    }
   1.989 +
   1.990 +    // "[" + exemplars + upper-cased exemplars + "]". Appending with explicit
   1.991 +    // lengths does not depend on upperChars being NUL-terminated.
   1.992 +    UnicodeString pattern((UChar)0x005b);
   1.993 +    pattern.append(exemplarChars, length);
   1.994 +    pattern.append(upperChars, destLength);
   1.995 +    pattern.append((UChar)0x005d);
   1.996 +    // pattern owns copies now; exemplarChars pointed into the bundle
   1.997 +    delete [] upperChars;
   1.998 +    ures_close(ures);
   1.999 +
  1.1000 +    UnicodeSet uniset(pattern, error);
  1.1001 +    if (U_FAILURE(error)) {
  1.1002 +        fprintf(stdout, "Can not open USet \n");
  1.1003 +        return -1;
  1.1004 +    }
  1.1005 +
  1.1006 +    UnicodeSetIterator usetiter(uniset);
  1.1007 +    int32_t count = 0;
  1.1008 +    while (usetiter.next()) {
  1.1009 +        ScriptElement &elem = scriptelem[count];
  1.1010 +        elem.count    = 0;
  1.1011 +        elem.tailored = FALSE;
  1.1012 +        if (usetiter.isString()) {
  1.1013 +            const UnicodeString &strItem = usetiter.getString();
  1.1014 +            // step by code point so a surrogate pair is appended only once
  1.1015 +            for (int32_t i = 0; i < strItem.length(); ) {
  1.1016 +                UChar32 codepoint = strItem.char32At(i);
  1.1017 +                UTF16_APPEND_CHAR_UNSAFE(elem.ch, elem.count, codepoint);
  1.1018 +                i += U16_LENGTH(codepoint);
  1.1019 +            }
  1.1020 +        } else {
  1.1021 +            UChar32 codepoint = usetiter.getCodepoint();
  1.1022 +            UTF16_APPEND_CHAR_UNSAFE(elem.ch, elem.count, codepoint);
  1.1023 +        }
  1.1024 +        count ++;
  1.1025 +    }
  1.1026 +
  1.1027 +    // uniset and usetiter are stack objects and are released on return
  1.1028 +    return count;
  1.1029 +}
  1.1030 +
  1.1031 +/**
  1.1032 +* Gets the script elements and contractions belonging to the script.
  1.1033 +* First every code point from 0 to UCHAR_MAX_VALUE that checkInScripts()
  1.1034 +* accepts is stored as a single-character element, then the full collation
  1.1035 +* rules of COLLATOR_ are scanned for contractions.
  1.1036 +* NOTE(review): chOffset and chLen are never assigned since the call to
  1.1037 +* ucol_tok_parseNextToken() no longer takes them as output parameters, so
  1.1038 +* the contraction branch cannot currently trigger; presumably they should be
  1.1039 +* read from the parser's per-token state - confirm against ucol_tok.h.
  1.1040 +* @param script list
  1.1041 +* @param scriptcount number of scripts
  1.1042 +* @param scriptelem output list; its capacity is not checked here
  1.1043 +* @return number of script elements
  1.1044 +*/
  1.1045 +int getScriptElements(UScriptCode script[], int scriptcount, 
  1.1046 +                      ScriptElement scriptelem[])
  1.1047 +{
  1.1048 +    // single code points first
  1.1049 +    int count = 0;
  1.1050 +    for (UChar32 codepoint = 0; codepoint <= UCHAR_MAX_VALUE; codepoint ++) {
  1.1051 +        if (checkInScripts(script, scriptcount, codepoint)) {
  1.1052 +            scriptelem[count].count = 0;
  1.1053 +            UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, 
  1.1054 +                                     scriptelem[count].count, codepoint);
  1.1055 +            scriptelem[count].tailored = FALSE;
  1.1056 +            count ++;
  1.1057 +        }
  1.1058 +    }
  1.1059 +
  1.1060 +    UErrorCode       error    = U_ZERO_ERROR;
  1.1061 +    uint32_t         chOffset = 0; 
  1.1062 +    uint32_t         chLen    = 0;
  1.1063 +    UBool            rstart   = TRUE;
  1.1064 +    UColTokenParser  src;
  1.1065 +    UColOptionSet    opts;
  1.1066 +    UParseError      parseError;
  1.1067 +
  1.1068 +    // preflight for the rule length, then fetch the rules into a buffer with
  1.1069 +    // the extra space for the token parser behind them
  1.1070 +    int32_t  rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0);
  1.1071 +    src.source       = (UChar *)malloc(sizeof(UChar) * 
  1.1072 +                                (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
  1.1073 +    if (src.source == NULL) {
  1.1074 +        // without the rules only the contractions are missing
  1.1075 +        fprintf(stdout, "Memory error\n");
  1.1076 +        return count;
  1.1077 +    }
  1.1078 +    rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source, 
  1.1079 +                                 rulelength);
  1.1080 +    // NOTE(review): only these fields of src are set and opts is left
  1.1081 +    // uninitialized; confirm this is all the parser reads
  1.1082 +    src.current      = src.source;
  1.1083 +    src.end          = src.source + rulelength;
  1.1084 +    src.extraCurrent = src.end;
  1.1085 +    src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  1.1086 +    src.opts         = &opts;
  1.1087 +
  1.1088 +    // rstart is TRUE for the first token only
  1.1089 +    while (ucol_tok_parseNextToken(&src, rstart, &parseError,
  1.1090 +                                   &error) != NULL) {
  1.1091 +        // contractions handled here
  1.1092 +        if (chLen > 1) {
  1.1093 +            u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen);
  1.1094 +            scriptelem[count].count = chLen;
  1.1095 +            if (checkInScripts(script, scriptcount, scriptelem[count])) {
  1.1096 +                scriptelem[count].tailored     = FALSE;
  1.1097 +                count ++;
  1.1098 +            }
  1.1099 +        }
  1.1100 +        rstart = FALSE;
  1.1101 +    }
  1.1102 +
  1.1103 +    if (U_FAILURE(error)) {
  1.1104 +        fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error));
  1.1105 +    }
  1.1106 +
  1.1107 +    // rule might have been reallocated, so delete this instead
  1.1108 +    free(src.source); 
  1.1109 +    return count;
  1.1110 +}
  1.1111 +
  1.1112 +// Comparator for qsort: orders ScriptElements by the code units of ch (u_strcmp).
  1.1113 +// Side effect: writes a terminating 0 at ch[count] of both elements.
  1.1114 +int compareCodepoints(const void *elem1, const void *elem2)
  1.1115 +{
  1.1116 +    ScriptElement *first  = (ScriptElement *)elem1; // key
  1.1117 +    ScriptElement *second = (ScriptElement *)elem2;
  1.1118 +    first->ch[first->count]   = 0;
  1.1119 +    second->ch[second->count] = 0;
  1.1120 +    return u_strcmp(first->ch, second->ch);
  1.1121 +}
  1.1122 +
  1.1123 +// Tests whether key occurs inside se, either literally or in the NFD form of se.
  1.1124 +// Side effect: both ch buffers get a terminating 0 written at ch[count].
  1.1125 +UBool hasSubNFD(ScriptElement &se, ScriptElement &key)
  1.1126 +{
  1.1127 +    se.ch[se.count]   = 0;
  1.1128 +    key.ch[key.count] = 0;
  1.1129 +
  1.1130 +    // literal match on the code units
  1.1131 +    if (u_strstr(se.ch, key.ch) != NULL) {
  1.1132 +        return TRUE;
  1.1133 +    }
  1.1134 +
  1.1135 +    // check the decomposition 
  1.1136 +    UChar      norm[32];
  1.1137 +    UErrorCode error = U_ZERO_ERROR;
  1.1138 +    int32_t    size  = unorm_normalize(se.ch, se.count, UNORM_NFD, 0, norm, 32, 
  1.1139 +                                       &error);
  1.1140 +    if (U_FAILURE(error)) {
  1.1141 +        // norm holds no usable text on failure, so it must not be searched
  1.1142 +        fprintf(stdout, "Error normalizing\n");
  1.1143 +        return FALSE;
  1.1144 +    }
  1.1145 +    // length-bounded search: norm is not NUL-terminated when it fills the buffer
  1.1146 +    return u_strFindFirst(norm, size, key.ch, key.count) != NULL;
  1.1147 +}
  1.1148 +
  1.1149 +/**
  1.1150 +* Marks tailored elements: an element is flagged when a non-reset token of
  1.1151 +* the collator's tailoring rules that belongs to the scripts occurs in it,
  1.1152 +* literally or in its NFD form (see hasSubNFD()).
  1.1153 +* NOTE(review): strength, chOffset and chLen are never assigned in this
  1.1154 +* function, so with chLen fixed at 0 the marking branch cannot currently
  1.1155 +* trigger; presumably they should come from the parser's per-token state -
  1.1156 +* confirm against ucol_tok.h.
  1.1157 +* @param script list
  1.1158 +* @param scriptcount number of scripts
  1.1159 +* @param scriptelem script element list
  1.1160 +* @param scriptelemlength size of the script element list
  1.1161 +*/
  1.1162 +void markTailored(UScriptCode script[], int scriptcount, 
  1.1163 +                  ScriptElement scriptelem[], int scriptelemlength)
  1.1164 +{
  1.1165 +          int32_t  rulelength = 0;
  1.1166 +    const UChar   *rule = ucol_getRules(COLLATOR_, &rulelength);
  1.1167 +
  1.1168 +          uint32_t         strength = 0;
  1.1169 +          uint32_t         chOffset = 0; 
  1.1170 +          uint32_t         chLen    = 0;
  1.1171 +          UBool            rstart   = TRUE;
  1.1172 +          UColTokenParser  src;
  1.1173 +          UColOptionSet    opts;
  1.1174 +          UParseError      parseError;
  1.1175 +
  1.1176 +    // private, writable copy of the rules followed by the parser's extra space
  1.1177 +    src.opts         = &opts;
  1.1178 +    src.source       = (UChar *)malloc(
  1.1179 +               (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
  1.1180 +    if (src.source == NULL) {
  1.1181 +        fprintf(stdout, "Memory error\n");
  1.1182 +        return;
  1.1183 +    }
  1.1184 +    memcpy(src.source, rule, rulelength * sizeof(UChar));
  1.1185 +    src.current      = src.source;
  1.1186 +    src.end          = (UChar *)src.source + rulelength;
  1.1187 +    src.extraCurrent = src.end;
  1.1188 +    src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  1.1189 +
  1.1190 +    UErrorCode    error = U_ZERO_ERROR;
  1.1191 +    while (ucol_tok_parseNextToken(&src, rstart, &parseError,
  1.1192 +                                   &error) != NULL) {
  1.1193 +        if (chLen >= 1 && strength != UCOL_TOK_RESET) {
  1.1194 +            // skipping the reset characters and non useful stuff.
  1.1195 +            ScriptElement se;
  1.1196 +            u_strncpy(se.ch, src.source + chOffset, chLen);
  1.1197 +            se.count = chLen;
  1.1198 +
  1.1199 +            if (checkInScripts(script, scriptcount, se)) {
  1.1200 +                // linear scan: one token may occur in several elements
  1.1201 +                for (int i = 0; i < scriptelemlength; i ++) {
  1.1202 +                    if (!scriptelem[i].tailored && 
  1.1203 +                        hasSubNFD(scriptelem[i], se)) {
  1.1204 +                        scriptelem[i].tailored = TRUE;
  1.1205 +                    }
  1.1206 +                }
  1.1207 +            }
  1.1208 +        }
  1.1209 +        rstart = FALSE;
  1.1210 +    }
  1.1211 +
  1.1212 +    free(src.source);
  1.1213 +    if (U_FAILURE(error)) {
  1.1214 +        fprintf(stdout, "Error parsing rules\n");
  1.1215 +    }
  1.1216 +}
  1.1217 +
  1.1218 +/**
  1.1219 +* Checks if the collation iterator has more than 1 collation element.
  1.1220 +* Only elements with a non-zero primary weight that are not continuations
  1.1221 +* are counted; iteration errors are reported on stdout.
  1.1222 +* @param coleiter collation element iterator, advanced by this call
  1.1223 +* @return TRUE if collation iterator has more than 1 such element
  1.1224 +*/
  1.1225 +UBool hasExpansions(UCollationElements *coleiter)
  1.1226 +{
  1.1227 +    UErrorCode status   = U_ZERO_ERROR;
  1.1228 +    int        weighted = 0;
  1.1229 +    for (;;) {
  1.1230 +        int32_t ce = ucol_next(coleiter, &status);
  1.1231 +        if (U_FAILURE(status)) {
  1.1232 +            fprintf(stdout, "Error getting next collation element\n");
  1.1233 +        }
  1.1234 +        if (ce == UCOL_NULLORDER) {
  1.1235 +            break;
  1.1236 +        }
  1.1237 +        if (!isContinuation(ce) && (UCOL_PRIMARYORDER(ce) != 0)) {
  1.1238 +            weighted ++;
  1.1239 +            if (weighted == 2) {
  1.1240 +                return TRUE;
  1.1241 +            }
  1.1242 +        }
  1.1243 +    }
  1.1244 +    return FALSE;
  1.1245 +}
  1.1246 +
  1.1247 +/**
  1.1248 +* Closes the table, body and html elements of the page written to OUTPUT_.
  1.1249 +*/
  1.1250 +void outputHTMLFooter()
  1.1251 +{
  1.1252 +    static const char *const closers[] = { "</table>\n", "</body>\n", "</html>\n" };
  1.1253 +    for (int i = 0; i < 3; i ++) {
  1.1254 +        fprintf(OUTPUT_, "%s", closers[i]);
  1.1255 +    }
  1.1256 +}
  1.1257 +
  1.1258 +/**
  1.1259 +* Serializes the script elements into the html page written to OUTPUT_,
  1.1260 +* arranging them into ascending collation order. Once the element buffer
  1.1261 +* has been allocated the page is always closed with the footer.
  1.1262 +* @param script code list
  1.1263 +* @param scriptcount number of scripts
  1.1264 +* @param locale if not NULL, list this locale's exemplar characters instead
  1.1265 +*/
  1.1266 +void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL) 
  1.1267 +{
  1.1268 +    ScriptElement *scriptelem = 
  1.1269 +                     (ScriptElement *)malloc(sizeof(ScriptElement) * 0x20000);
  1.1270 +    if (scriptelem == NULL) {
  1.1271 +        fprintf(stdout, "Memory error\n");
  1.1272 +        return;
  1.1273 +    }
  1.1274 +    int count = locale ? getScriptElementsFromExemplars(scriptelem, locale)
  1.1275 +                       : getScriptElements(script, scriptcount, scriptelem);
  1.1276 +    if (count <= 0) {
  1.1277 +        // error (negative) or empty list: there is no scriptelem[0] to sort or print
  1.1278 +        free(scriptelem);
  1.1279 +        outputHTMLFooter();
  1.1280 +        return;
  1.1281 +    }
  1.1282 +
  1.1283 +    // order by code units, flag the tailored elements, then order by collation
  1.1284 +    qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints);
  1.1285 +    markTailored(script, scriptcount, scriptelem, count);
  1.1286 +    qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey);
  1.1287 +
  1.1288 +    UErrorCode          error    = U_ZERO_ERROR;
  1.1289 +    UCollationElements *coleiter = ucol_openElements(COLLATOR_, scriptelem[0].ch,
  1.1290 +                                                     scriptelem[0].count, &error);
  1.1291 +    if (U_FAILURE(error)) {
  1.1292 +        fprintf(stdout, "Error creating collation element iterator\n");
  1.1293 +        free(scriptelem);
  1.1294 +        outputHTMLFooter();
  1.1295 +        return;
  1.1296 +    }
  1.1297 +
  1.1298 +    // the first element has no predecessor; it is shown as a primary difference
  1.1299 +    outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter));
  1.1300 +    for (int i = 1; i < count; i ++) {
  1.1301 +        ucol_setText(coleiter, scriptelem[i].ch, scriptelem[i].count, &error);
  1.1302 +        if (U_FAILURE(error)) {
  1.1303 +            fprintf(stdout, "Error setting text in collation element iterator\n");
  1.1304 +            break;
  1.1305 +        }
  1.1306 +        outputScriptElem(scriptelem[i], 
  1.1307 +                         compareSortKey(scriptelem + i - 1, scriptelem + i),
  1.1308 +                         hasExpansions(coleiter));
  1.1309 +    }
  1.1310 +    ucol_closeElements(coleiter);
  1.1311 +    free(scriptelem);
  1.1312 +    outputHTMLFooter();
  1.1313 +}
  1.1313 +
  1.1314 +/**
  1.1315 +* Prints the header for the html: page head, summary table and column headings
  1.1316 +* @param locale name
  1.1317 +* @param script list of the scripts shown for the locale
  1.1318 +* @param scriptcount number of scripts
  1.1319 +*/
  1.1320 +void outputHTMLHeader(const char *locale, UScriptCode script[], 
  1.1321 +                      int scriptcount)
  1.1322 +{
  1.1323 +    fprintf(OUTPUT_, "<html>\n");
  1.1324 +    fprintf(OUTPUT_, "<head>\n");
  1.1325 +    fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
  1.1326 +    fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n");
  1.1327 +    fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/css\">\n");
  1.1328 +    fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n");
  1.1329 +    fprintf(OUTPUT_, "<base target=\"main\">\n");
  1.1330 +    fprintf(OUTPUT_, "</head>\n");
  1.1331 +
  1.1332 +    fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n");
  1.1333 +    fprintf(OUTPUT_, "<!--\n");
  1.1334 +    fprintf(OUTPUT_, "This file contains sorted characters in ascending order according to the locale stated\n");
  1.1335 +    fprintf(OUTPUT_, "If the character is in red, it is tailored in the collation rules.\n");
  1.1336 +    fprintf(OUTPUT_, "Background colours have certain meanings:\n");
  1.1337 +    fprintf(OUTPUT_, "White - equals the previous character\n");
  1.1338 +    fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n");
  1.1339 +    fprintf(OUTPUT_, "blue - secondary greater than the previous character\n");
  1.1340 +    fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\n");
  1.1341 +    fprintf(OUTPUT_, "-->\n");   // "--!>" is not a valid HTML comment terminator
  1.1342 +
  1.1343 +    fprintf(OUTPUT_, "<table border=0>\n");
  1.1344 +    UChar      displayname[64];
  1.1345 +    char       utf8displayname[128];
  1.1346 +    UErrorCode error = U_ZERO_ERROR;
  1.1347 +    int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error);
  1.1348 +    int32_t utf8size = 0;
  1.1349 +    u_strToUTF8(utf8displayname, 127, &utf8size, displayname, size, &error);
  1.1350 +    if (U_FAILURE(error)) {
  1.1351 +        utf8size = 0;              // show an empty name rather than garbage
  1.1352 +        error    = U_ZERO_ERROR;   // must not leak into the attribute queries below
  1.1353 +    }
  1.1354 +    utf8displayname[utf8size] = 0; // capacity 127 above leaves room for this
  1.1355 +
  1.1356 +    fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", utf8displayname);
  1.1357 +    fprintf(OUTPUT_, "<tr><th>Script(s)</th>");
  1.1358 +    fprintf(OUTPUT_, "<td class='noborder'>");
  1.1359 +    for (int i = 0; i < scriptcount; i ++) {
  1.1360 +        fprintf(OUTPUT_, "%s", uscript_getName(script[i]));
  1.1361 +        if (i + 1 != scriptcount) {
  1.1362 +            fprintf(OUTPUT_, ", ");
  1.1363 +        }
  1.1364 +    }
  1.1365 +    fprintf(OUTPUT_, "</td></tr>\n");
  1.1366 +    
  1.1367 +    fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/source/data/coll/%s.txt\">%s.txt</a></td></tr>\n", locale, locale);
  1.1368 +    
  1.1369 +    UVersionInfo version;
  1.1370 +    ucol_getVersion(COLLATOR_, version);
  1.1371 +    fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d.%d</td></tr>\n", 
  1.1372 +                      version[0], version[1], version[2], version[3]);
  1.1373 +    
  1.1374 +    UColAttribute attr = UCOL_FRENCH_COLLATION;
  1.1375 +    while (attr < UCOL_ATTRIBUTE_COUNT) {
  1.1376 +        UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error);
  1.1377 +        if (U_FAILURE(error)) {
  1.1378 +            fprintf(stdout, "Error getting attribute\n");
  1.1379 +            return;
  1.1380 +        }
  1.1381 +        if (value != UCOL_DEFAULT) {
  1.1382 +            if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) {
  1.1383 +                fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1.1384 +            }
  1.1385 +            if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) {
  1.1386 +                fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='noborder'>shifted, code%d</td></tr>\n", value);
  1.1387 +            }
  1.1388 +            if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) {
  1.1389 +                fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1.1390 +            }
  1.1391 +            if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) {
  1.1392 +                fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1.1393 +            }
  1.1394 +            if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) {
  1.1395 +                fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1.1396 +            }
  1.1397 +            if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) {
  1.1398 +                fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value);
  1.1399 +            }
  1.1400 +            if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) {
  1.1401 +                fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1.1402 +            }
  1.1403 +        }
  1.1404 +        attr = (UColAttribute)(attr + 1);
  1.1405 +    }
  1.1406 +
  1.1407 +    // Get UNIX-style time and display as number and string.
  1.1408 +    time_t ltime;
  1.1409 +    time( &ltime );
  1.1410 +    fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></tr>", ctime(&ltime));
  1.1411 +     
  1.1412 +    fprintf(OUTPUT_, "</table>\n");
  1.1413 +
  1.1414 +    fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n");
  1.1415 +    fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>Submit a bug</a></p>\n");
  1.1416 +    fprintf(OUTPUT_, "\n<table>\n");
  1.1417 +    fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>Q</th><th>Name</th></tr>\n");
  1.1418 +}
  1.1419 +
  1.1420 +/**
  1.1421 +* Prints the header for index.html, up to and including the UCA charts link
  1.1422 +* @param file output file
  1.1423 +*/
  1.1424 +void outputListHTMLHeader(FILE *file) {
  1.1425 +    static const char *const header =
  1.1426 +        "<html>\n" "<head>\n"
  1.1427 +        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n"
  1.1428 +        "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n"
  1.1429 +        "<title>ICU Collation Charts</title>\n"
  1.1430 +        "<base target=\"main\">\n"
  1.1431 +        "</head>\n"
  1.1432 +        "<body bgcolor=#FFFFFF>\n"
  1.1433 +        "<h2 align=center>ICU Collation Charts</h2>\n"
  1.1434 +        "<p align=center>\n"
  1.1435 +        "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>";
  1.1436 +    fprintf(file, "%s", header);
  1.1437 +}
  1.1438 +
  1.1439 +/**
  1.1440 +* Prints the footer for index.html: closes the paragraph, the body and the document
  1.1441 +* @param file output file
  1.1442 +*/
  1.1443 +void outputListHTMLFooter(FILE *file)
  1.1444 +{
  1.1445 +    fprintf(file,
  1.1446 +            "</p>\n"
  1.1447 +            "</body>\n"
  1.1448 +            "</html>\n");
  1.1449 +}
  1.1450 +
  1.1451 +/**
  1.1452 +* Gets all scripts and serialize their codepoints into an html file.
  1.1453 +*/
  1.1454 +void serializeScripts() {
  1.1455 +    char filename[128];
  1.1456 +    int  dirlength = 0;
  1.1457 +
  1.1458 +    if (options[4].doesOccur) {
  1.1459 +        strcpy(filename, options[4].value);
  1.1460 +        dirlength = appendDirSeparator(filename);
  1.1461 +    } else {
  1.1462 +      filename[0] = 0;
  1.1463 +    }
  1.1464 +
  1.1465 +    const char    *locale;
  1.1466 +          int32_t  localelist = 0;
  1.1467 +          int32_t  localesize;
  1.1468 +        
  1.1469 +    localesize = ucol_countAvailable();
  1.1470 +    locale     = ucol_getAvailable(localelist);
  1.1471 +
  1.1472 +    strcat(filename, "list.html");
  1.1473 +    FILE *list = fopen(filename, "w");
  1.1474 +    filename[dirlength] = 0;
  1.1475 +    if (list == NULL) {
  1.1476 +        fprintf(stdout, "Cannot open file: %s\n", filename);
  1.1477 +        return;
  1.1478 +    }
  1.1479 +
  1.1480 +    outputListHTMLHeader(list);
  1.1481 +    fprintf(list, "<blockquote>\n");
  1.1482 +    while (TRUE) {
  1.1483 +        UErrorCode error = U_ZERO_ERROR;
  1.1484 +        COLLATOR_ = ucol_open(locale, &error);
  1.1485 +        if (U_FAILURE(error)) {
  1.1486 +            fprintf(stdout, "Collator creation failed:");
  1.1487 +            fprintf(stdout, u_errorName(error));
  1.1488 +            break;
  1.1489 +        }
  1.1490 +        if ((error != U_USING_FALLBACK_WARNING && // not tailored
  1.1491 +            error != U_USING_DEFAULT_WARNING) ||
  1.1492 +            checkLocaleForLanguage(locale)) {
  1.1493 +            fprintf(list, "<a href=%s.html>%s</a> ", locale, locale);
  1.1494 +	        setAttributes(COLLATOR_, &error);
  1.1495 +            if (U_FAILURE(error)) {
  1.1496 +               fprintf(stdout, "Collator attribute setting failed:");
  1.1497 +               fprintf(stdout, u_errorName(error));
  1.1498 +               break;
  1.1499 +            }
  1.1500 +
  1.1501 +            UScriptCode scriptcode[32];
  1.1502 +            uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32, 
  1.1503 +                                                   &error);
  1.1504 +            if (U_FAILURE(error)) {
  1.1505 +                fprintf(stdout, "Error getting lcale scripts\n");
  1.1506 +                break;
  1.1507 +            }
  1.1508 +
  1.1509 +            strcat(filename, locale);
  1.1510 +            strcat(filename, ".html");
  1.1511 +            OUTPUT_ = fopen(filename, "w");
  1.1512 +            if (OUTPUT_ == NULL) {
  1.1513 +                fprintf(stdout, "Cannot open file:%s\n", filename);
  1.1514 +                break;
  1.1515 +            }
  1.1516 +            outputHTMLHeader(locale, scriptcode, scriptcount);
  1.1517 +            fprintf(stdout, "%s\n", locale);
  1.1518 +
  1.1519 +            if(options[12].doesOccur) {
  1.1520 +              // use whole scripts
  1.1521 +                serializeScripts(scriptcode, scriptcount);
  1.1522 +            } else {
  1.1523 +              // use exemplar chars
  1.1524 +              serializeScripts(scriptcode, scriptcount, locale);
  1.1525 +            }
  1.1526 +            fclose(OUTPUT_);
  1.1527 +        }
  1.1528 +        ucol_close(COLLATOR_);
  1.1529 +
  1.1530 +        filename[dirlength] = 0;
  1.1531 +        localelist ++;
  1.1532 +        if (localelist == localesize) {
  1.1533 +            break;
  1.1534 +        }
  1.1535 +        locale = ucol_getAvailable(localelist);
  1.1536 +    }
  1.1537 +    fprintf(list, "<br><a href=help.html>help</a><br>");
  1.1538 +    fprintf(list, "</blockquote>\n");
  1.1539 +    outputListHTMLFooter(list);
  1.1540 +    fclose(list);
  1.1541 +}
  1.1542 +
  1.1543 +/** 
  1.1544 +* Main   --  process command line, read in and pre-process the test file,
  1.1545 +*            call other functions to do the actual tests.
  1.1546 +*/
  1.1547 +int main(int argc, char *argv[]) {
  1.1548 +    
  1.1549 +    argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), 
  1.1550 +                       options);
  1.1551 +    
  1.1552 +    // error handling, printing usage message
  1.1553 +    if (argc < 0) {
  1.1554 +        fprintf(stdout, "error in command line argument: ");
  1.1555 +        fprintf(stdout, argv[-argc]);
  1.1556 +        fprintf(stdout, "\n");
  1.1557 +    }
  1.1558 +    if (argc < 0 || options[0].doesOccur || options[1].doesOccur) {
  1.1559 +        fprintf(stdout, "Usage: dumpce options...\n"
  1.1560 +                        "--help\n"
  1.1561 +                        "    Display this message.\n"
  1.1562 +                        "--locale name|all\n"
  1.1563 +                        "    ICU locale to use. Default is en_US\n"
  1.1564 +                        "--serialize\n"
  1.1565 +                        "    Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n"
  1.1566 +                        "--destdir dir_name\n"
  1.1567 +                        "    Path for outputing the serialized collation elements. Defaults to stdout if no defined\n"
  1.1568 +                        "--sourcedir dir_name\n"
  1.1569 +                        "    Path for the input rule file for collation\n"
  1.1570 +                        "--attribute name=value,name=value...\n" 
  1.1571 +                        "    Pairs of attribute names and values for setting\n"
  1.1572 +                        "--rule filename\n" 
  1.1573 +                        "    Name of file containing the collation rules.\n"
  1.1574 +                        "--normalizaton mode\n" 
  1.1575 +                        "    UNormalizationMode mode to be used.\n"
  1.1576 +                        "--scripts\n" 
  1.1577 +                        "    Codepoints from all scripts are sorted and serialized.\n"
  1.1578 +                        "--reducehan\n" 
  1.1579 +                        "    Only 200 Han script characters will be displayed with the use of --scripts.\n"
  1.1580 +                        "--wholescripts\n"
  1.1581 +                        "    Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n");
  1.1582 +
  1.1583 +        fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n");
  1.1584 +        fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n");
  1.1585 +        return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
  1.1586 +    }
  1.1587 +
  1.1588 +    OUTPUT_ = stdout;
  1.1589 +    if (options[6].doesOccur) {
  1.1590 +        fprintf(stdout, "attributes %s\n", options[6].value);
  1.1591 +        parseAttributes();
  1.1592 +    }
  1.1593 +    if (options[3].doesOccur) {
  1.1594 +        serialize();
  1.1595 +    }
  1.1596 +    if (options[9].doesOccur) {
  1.1597 +        serializeScripts();
  1.1598 +    }
  1.1599 +    return 0;
  1.1600 +}

mercurial