intl/icu/source/tools/dumpce/dumpce.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /********************************************************************
     2  * COPYRIGHT:
     3  * Copyright (C) 2001-2011 IBM, Inc.   All Rights Reserved.
     4  *
     5  ********************************************************************/
     6 /********************************************************************************
     7 *
     8 * File dumpce.cpp
     9 *
    10 * Modification History:
    11 * Name          Date           Description
    12 * synwee        May 31 2001    Creation
    13 *
    14 *********************************************************************************
    15 */
    17 /**
    18 * This program outputs the collation elements used for a requested tailoring.
    19 *
    20 * Usage:
    21 *     dumpce options... please check main function.
    22 */
    23 #include <unicode/utypes.h>
    24 #include <unicode/ucol.h>
    25 #include <unicode/uloc.h>
    26 #include <unicode/ucoleitr.h>
    27 #include <unicode/uchar.h>
    28 #include <unicode/uscript.h>
    29 #include <unicode/utf16.h>
    30 #include <unicode/putil.h>
    31 #include <unicode/ustring.h>
    32 #include <stdio.h>
    33 #include <stdlib.h>
    34 #include <string.h>
    35 #include <time.h>
    36 #include "ucol_tok.h"
    37 #include "cstring.h"
    38 #include "uoptions.h"
    39 #include "ucol_imp.h"
    40 #include <unicode/ures.h>
    41 #include <unicode/uniset.h>
    42 #include <unicode/usetiter.h>
    44 /**
    45 * Command line option variables. 
    46 * These global variables are set according to the options specified on the 
    47 * command line by the user.
    48 */
    49 static UOption options[]={
    50     /* 00 */ UOPTION_HELP_H, 
    51     /* 01 */ UOPTION_HELP_QUESTION_MARK,
    52     /* 02 */ {"locale",        NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},
    53     /* 03 */ {"serialize",     NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},
    54 	/* 04 */ UOPTION_DESTDIR,
    55     /* 05 */ UOPTION_SOURCEDIR,
    56     /* 06 */ {"attribute",     NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},
    57     /* 07 */ {"rule",          NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},
    58     /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},
    59     /* 09 */ {"scripts",       NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},
    60     /* 10 */ {"reducehan",     NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},
    61 	/* 11 */ UOPTION_VERBOSE,
    62     /* 12 */ {"wholescripts",      NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0}
    63 };
    65 /**
    66 * Collator used in this program
    67 */
    68 static UCollator *COLLATOR_;
    69 /**
    70 * Output strea, used in this program
    71 */
    72 static FILE *OUTPUT_;
    74 static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = {
    75     UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, 
    76     UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,
    77 };
    /**
    * Associates an integer enum value with its printable name.
    * A name may list several aliases separated by '|'.
    */
    typedef struct {
        int         value;
        // the tables below initialise this from string literals, which must
        // not be reachable through a pointer to non-const char
        const char *name;
    } EnumNameValuePair;
    84 static const EnumNameValuePair ATTRIBUTE_NAME_[] = {
    85     {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"},
    86     {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"}, 
    87     {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"}, 
    88     {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"}, 
    89     {UCOL_NORMALIZATION_MODE, 
    90         "UCOL_NORMALIZATION_MODE|UCOL_DECOMPOSITION_MODE"},
    91     {UCOL_STRENGTH, "UCOL_STRENGTH"},
    92 	{UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"},
    93     {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"},
    94     NULL
    95 };
    97 static const EnumNameValuePair ATTRIBUTE_VALUE_[] = {
    98     {UCOL_PRIMARY, "UCOL_PRIMARY"},
    99     {UCOL_SECONDARY, "UCOL_SECONDARY"},
   100     {UCOL_TERTIARY, "UCOL_TERTIARY|UCOL_DEFAULT_STRENGTH"},
   101     {UCOL_QUATERNARY, "UCOL_QUATERNARY"},
   102     {UCOL_IDENTICAL, "UCOL_IDENTICAL"},
   103     {UCOL_OFF, "UCOL_OFF"},
   104     {UCOL_ON, "UCOL_ON"},
   105     {UCOL_SHIFTED, "UCOL_SHIFTED"},
   106     {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"},
   107     {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"},
   108     {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"},
   109     NULL
   110 };
   112 typedef struct {
   113     UChar ch[32];
   114     int   count; // number of codepoint
   115     UBool tailored;
   116 } ScriptElement;
   118 /**
   119 * Writes the hexadecimal of a null-terminated array of codepoints into a 
   120 * file
   121 * @param f UFILE instance to store
   122 * @param c codepoints array
   123 */
   124 void serialize(FILE *f, const UChar *c) 
   125 {
   126     UChar cp = *(c ++);
   128     fprintf(f, " %04x", cp);
   130     while (*c != 0) {
   131         cp = *(c ++);
   132         fprintf(f, " %04x", cp);
   133     }
   134 }
   136 /**
   137 * Writes the hexadecimal of a non-null-terminated array of codepoints into a 
   138 * file
   139 * @param f UFILE instance to store
   140 * @param c codepoints array
   141 * @param l codepoints array length
   142 */
   143 void serialize(FILE *f, const UChar *c, int l) 
   144 {
   145     int   count = 1;
   146     UChar cp    = *(c ++);
   148     fprintf(f, " %04x", cp);
   150     while (count < l) {
   151         cp = *(c ++);
   152         fprintf(f, " %04x", cp);
   153         count ++;
   154     }
   155 }
   157 /**
   158 * Sets the iterator to the argument string and outputs the collation elements.
   159 * @param f file output stream
   160 * @param iter collation element iterator
   161 */
   162 void serialize(FILE *f, UCollationElements *iter) {
   163     const UChar   *codepoint = iter->iteratordata_.string;
   164     // unlikely that sortkeys will be over this size 
   165     uint8_t  sortkey[64];
   166     uint8_t *psortkey = sortkey;
   167     int      sortkeylength = 0;
   169     if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) {
   170         serialize(f, codepoint, iter->iteratordata_.endp - codepoint);
   171         sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, 
   172                         iter->iteratordata_.endp - codepoint, sortkey, 64);
   173     }
   174     else {
   175         serialize(f, codepoint);
   176         sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, 
   177                                         -1, sortkey, 64);
   178     }
   179     if (options[11].doesOccur) {
   180         serialize(stdout, codepoint);
   181         fprintf(stdout, "\n");
   182     }
   184     fprintf(f, "; ");
   186     UErrorCode error = U_ZERO_ERROR;
   187     uint32_t ce = ucol_next(iter, &error);
   188     if (U_FAILURE(error)) {
   189         fprintf(f, "Error retrieving collation elements\n");
   190         return;
   191     }
   193     while (TRUE) {
   194         fprintf(f, "[");
   195         if (UCOL_PRIMARYORDER(ce) != 0) {
   196             fprintf(f, "%04x", UCOL_PRIMARYORDER(ce));
   197         }
   198         fprintf(f, ",");
   199         if (UCOL_SECONDARYORDER(ce) != 0) {
   200             fprintf(f, " %02x", UCOL_SECONDARYORDER(ce));
   201         }
   202         fprintf(f, ",");
   203         if (UCOL_TERTIARYORDER(ce) != 0) {
   204             fprintf(f, " %02x", UCOL_TERTIARYORDER(ce));
   205         }
   206         fprintf(f, "] ");
   208         ce = ucol_next(iter, &error);
   209         if (ce == UCOL_NULLORDER) {
   210             break;
   211         }
   212         if (U_FAILURE(error)) {
   213             fprintf(stdout, "Error retrieving collation elements");
   214             return;
   215         }
   216     }
   218     if (sortkeylength > 64) {
   219         fprintf(f, "Sortkey exceeds pre-allocated size");
   220     }
   222     fprintf(f, "[");
   223     while (TRUE) {
   224         fprintf(f, "%02x", *psortkey);
   225         psortkey ++;
   226         if ((*psortkey) == 0) {
   227             break;
   228         }
   229         fprintf(f, " ");
   230     }
   231     fprintf(f, "]\n");
   232 }
   234 /**
   235 * Serializes the contraction within the given argument rule
   236 * @param f file output stream
   237 * @param r rule
   238 * @param rlen rule length
   239 * @param contractionsonly flag to indicate if only contractions are to be 
   240 *                         output or all collation elements
   241 * @param iter iterator to iterate over collation elements
   242 */
   243 void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly, 
   244                UCollationElements *iter) {
   245     const UChar           *current  = NULL;
   246           uint32_t         strength = 0;
   247           uint32_t         chOffset = 0; 
   248           uint32_t         chLen    = 0;
   249           uint32_t         exOffset = 0; 
   250           uint32_t         exLen    = 0;
   251           uint32_t         prefixOffset = 0; 
   252           uint32_t         prefixLen    = 0;
   253           uint8_t          specs    = 0;
   254           UBool            rstart   = TRUE;
   255           UColTokenParser  src;
   256           UColOptionSet    opts;
   257           UParseError      parseError;
   258           UErrorCode       error    = U_ZERO_ERROR;
   260     src.opts = &opts;
   262     src.source       = rule; 
   263 	src.current = rule;
   264     src.end          = rule + rlen;
   265     src.extraCurrent = src.end;
   266     src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
   269     while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
   270                                               &error)) != NULL) {
   271       chOffset = src.parsedToken.charsOffset;
   272       chLen = src.parsedToken.charsLen;
   273         // contractions handled here
   274         if (!contractiononly || chLen > 1) {
   275             ucol_setText(iter, rule + chOffset, chLen, &error);
   276             if (U_FAILURE(error)) {
   277                 fprintf(stdout, "Error setting text in iterator\n");
   278                 return;
   279             }
   280             serialize(f, iter);
   281         }
   282         rstart = FALSE;
   283     }
   284 }
   286 /**
   287 * Prints the attribute values in the argument collator into the output stream
   288 * @param collator
   289 */
   290 void outputAttribute(UCollator *collator, UErrorCode *error) 
   291 {
   292     UColAttribute attribute = UCOL_FRENCH_COLLATION;
   293     while (attribute < UCOL_ATTRIBUTE_COUNT) {
   294         int count = 0;
   295         while (TRUE) {
   296             // getting attribute name
   297             if (ATTRIBUTE_NAME_[count].value == attribute) {
   298                 fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name);
   299                 break;
   300             }
   301             count ++;
   302         }
   303         count = 0;
   304         int attributeval = ucol_getAttribute(collator, attribute, error);
   305         if (U_FAILURE(*error)) {
   306             fprintf(stdout, "Failure in reading collator attribute\n");
   307             return;
   308         }
   309         while (TRUE) {
   310             // getting attribute value
   311             if (ATTRIBUTE_VALUE_[count].value == attributeval) {
   312                 fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name);
   313                 break;
   314             }
   315             count ++;
   316         }
   317         attribute = (UColAttribute)(attribute + 1);
   318     }
   319 }
   321 /**
   322 * Prints the normalization mode in the argument collator into the output stream
   323 * @param collator
   324 */
   325 void outputNormalization(UCollator *collator) 
   326 {
   327 	UErrorCode status = U_ZERO_ERROR;
   328     int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status);
   329     int count = 0;
   330     while (TRUE) {
   331         // getting attribute name
   332         if (ATTRIBUTE_VALUE_[count].value == normmode) {
   333             break;
   334         }
   335         count ++;
   336     }
   337     fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n", 
   338             ATTRIBUTE_VALUE_[count].name);
   339 }
   341 /**
   342 * Output the collation element belonging to the locale into a file
   343 * @param locale string
   344 * @param fullrules flag to indicate if only tailored collation elements are to
   345 *        be output or all collation elements
   346 */
   347 void serialize(const char *locale, UBool tailoredonly) {
   348     UErrorCode  error              = U_ZERO_ERROR;
   349     UChar       str[128];
   350     int         strlen = 0;
   352     fprintf(OUTPUT_, "# This file contains the serialized collation elements\n");
   353     fprintf(OUTPUT_, "# as of the collation version indicated below.\n");
   354     fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n");
   355     fprintf(OUTPUT_, "#              where xxxx are codepoints in hexadecimals,\n");
   356     fprintf(OUTPUT_, "#              yyyyyyyy are the corresponding\n");
   357     fprintf(OUTPUT_, "#              collation elements in hexadecimals\n");
   358     fprintf(OUTPUT_, "#              and zz are the sortkey values in hexadecimals\n");
   360     fprintf(OUTPUT_, "\n# Collator information\n");
   362     fprintf(OUTPUT_, "\nLocale: %s\n", locale);
   363     fprintf(stdout, "Locale: %s\n", locale);
   364     UVersionInfo version;
   365     ucol_getVersion(COLLATOR_, version);
   366     fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n", 
   367                       version[0], version[1], version[2], version[3]);
   368     outputAttribute(COLLATOR_, &error);
   369     outputNormalization(COLLATOR_);
   371     UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen, 
   372                                                  &error);
   373     if (U_FAILURE(error)) {
   374         fprintf(stdout, "Error creating iterator\n");
   375         return;
   376     }
   378     if (!tailoredonly) {
   379         fprintf(OUTPUT_, "\n# Range of unicode characters\n\n");
   380         UChar32     codepoint          = 0;
   381         while (codepoint <= UCHAR_MAX_VALUE) { 
   382             if (u_isdefined(codepoint)) {
   383                 strlen = 0;
   384                 UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint);
   385                 str[strlen] = 0;
   386                 ucol_setText(iter, str, strlen, &error);
   387                 if (U_FAILURE(error)) {
   388                     fprintf(stdout, "Error setting text in iterator\n");
   389                     return;
   390                 }
   391                 serialize(OUTPUT_, iter);
   392             }
   393             codepoint ++;
   394         }
   395     }
   397     UChar    ucarules[0x10000];
   398     UChar   *rules;
   399     int32_t  rulelength = 0;
   400     rules      = ucarules;
   402     if (tailoredonly) {
   403               int32_t  rulelength = 0;
   404         const UChar   *temp = ucol_getRules(COLLATOR_, &rulelength);
   405         if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
   406             rules = (UChar *)malloc(sizeof(UChar) * 
   407                                 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
   408         }
   409         memcpy(rules, temp, rulelength * sizeof(UChar));
   410         rules[rulelength] = 0;
   411         fprintf(OUTPUT_, "\n# Tailorings\n\n");
   412         serialize(OUTPUT_, rules, rulelength, FALSE, iter);
   413         if (rules != ucarules) {
   414             free(rules);
   415         }
   416     }
   417     else {        
   418         rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules, 
   419                                      0x10000);
   420         if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {
   421             rules = (UChar *)malloc(sizeof(UChar) * 
   422                                 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
   423             rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules, 
   424                                          rulelength);
   425         }
   426         fprintf(OUTPUT_, "\n# Contractions\n\n");
   427         serialize(OUTPUT_, rules, rulelength, TRUE, iter);
   428         if (rules != ucarules) {
   429             free(rules);
   430         }
   431     }
   433     ucol_closeElements(iter);
   434 }
   436 /**
   437 * Sets the collator with the attribute values
   438 * @param collator
   439 * @param error status
   440 */
   441 void setAttributes(UCollator *collator, UErrorCode *error) 
   442 {
   443     int count = 0;
   444     while (count < UCOL_ATTRIBUTE_COUNT) {
   445         if (ATTRIBUTE_[count] != UCOL_DEFAULT) {
   446             ucol_setAttribute(collator, (UColAttribute)count, 
   447                               ATTRIBUTE_[count], error);
   448             if (U_FAILURE(*error)) {
   449                 return;
   450             }
   451         }
   452         count ++;
   453     }
   454 }
   456 /**
   457 * Appends directory path with an ending seperator if necessary.
   458 * @param path with enough space to append one seperator
   459 * @return new directory path length
   460 */
   461 int appendDirSeparator(char *dir) 
   462 {
   463     int dirlength = strlen(dir);
   464     char dirending = dir[dirlength - 1];
   465     if (dirending != U_FILE_SEP_CHAR) {
   466         dir[dirlength] = U_FILE_SEP_CHAR;
   467         dir[dirlength + 1] = 0;
   468         return dirlength + 1;
   469     }
   470     return dirlength;
   471 }
   473 /**
   474 * Output the collation element into a file
   475 */
   476 void serialize() {
   477     char filename[128];
   478     int  dirlength = 0;
   480     if (options[4].doesOccur) {
   481         strcpy(filename, options[4].value);
   482         dirlength = appendDirSeparator(filename);
   483     }
   485     if (options[2].doesOccur) {
   486         const char    *locale      = (char *)options[2].value;
   487               int32_t  localeindex = 0;
   489         if (strcmp(locale, "all") == 0) {
   490             if (options[4].doesOccur) {
   491                 strcat(filename, "UCA.txt");
   492                 OUTPUT_ = fopen(filename, "w");
   493                 if (OUTPUT_ == NULL) {
   494                     fprintf(stdout, "Cannot open file:%s\n", filename);
   495                     return;
   496                 }
   497             }
   498             fprintf(stdout, "UCA\n");
   499             UErrorCode error = U_ZERO_ERROR;
   500             COLLATOR_ = ucol_open("en_US", &error);
   501             if (U_FAILURE(error)) {
   502                 fprintf(stdout, "Collator creation failed:");
   503                 fprintf(stdout, u_errorName(error));
   504                 goto CLOSEUCA;
   505                 return;
   506             }
   507             setAttributes(COLLATOR_, &error);
   508             if (U_FAILURE(error)) {
   509                 fprintf(stdout, "Collator attribute setting failed:");
   510                 fprintf(stdout, u_errorName(error));
   511                 goto CLOSEUCA;
   512                 return;
   513             }
   515             serialize("UCA", FALSE);
   516 CLOSEUCA :  
   517             if (options[4].doesOccur) {
   518                 filename[dirlength] = 0;
   519                 fclose(OUTPUT_);
   520             }
   521             ucol_close(COLLATOR_);
   522             localeindex = ucol_countAvailable() - 1;
   523             fprintf(stdout, "Number of locales: %d\n", localeindex + 1);
   524             locale      = ucol_getAvailable(localeindex);
   525         }
   527         while (TRUE) {
   528             UErrorCode error = U_ZERO_ERROR;
   529             COLLATOR_ = ucol_open(locale, &error);
   530             if (U_FAILURE(error)) {
   531                 fprintf(stdout, "Collator creation failed:");
   532                 fprintf(stdout, u_errorName(error));
   533                 goto CLOSETAILOR;
   534                 return;
   535             }
   536             setAttributes(COLLATOR_, &error);
   537             if (U_FAILURE(error)) {
   538                 fprintf(stdout, "Collator attribute setting failed:");
   539                 fprintf(stdout, u_errorName(error));
   540                 goto CLOSETAILOR;
   541                 return;
   542             }
   544             if (options[4].doesOccur) {
   545                 strcat(filename, locale);
   546                 strcat(filename, ".txt");
   547                 OUTPUT_ = fopen(filename, "w");
   548                 if (OUTPUT_ == NULL) {
   549                     fprintf(stdout, "Cannot open file:%s\n", filename);
   550                     return;
   551                 }
   552             }
   554             if (options[3].doesOccur) {
   555                 serialize(locale, TRUE);
   556             }
   558             ucol_close(COLLATOR_);
   560 CLOSETAILOR : 
   561             if (options[4].doesOccur) {
   562                 filename[dirlength] = 0;
   563                 fclose(OUTPUT_);
   564             }
   566             localeindex --;
   567             if (localeindex < 0) {
   568                 break;
   569             }
   570             locale = ucol_getAvailable(localeindex);
   571         }
   572     }
   574     if (options[7].doesOccur) {
   575         char inputfilename[128] = "";
   576         // rules are to be used
   577         if (options[5].doesOccur) {
   578             strcpy(inputfilename, options[5].value);
   579             appendDirSeparator(inputfilename);
   580         }
   581         strcat(inputfilename, options[7].value);
   582         FILE *input = fopen(inputfilename, "r");
   583         if (input == NULL) {
   584             fprintf(stdout, "Cannot open file:%s\n", filename);
   585             return;
   586         }
   588         char   s[1024];
   589         UChar  rule[1024];
   590         UChar *prule = rule;
   591         int    size = 1024;
   592         // synwee TODO: make this part dynamic
   593         while (fscanf(input, "%[^\n]s", s) != EOF) {
   594             size -= u_unescape(s, prule, size);
   595             prule = prule + u_strlen(prule);
   596         }
   597         fclose(input);
   599         if (options[4].doesOccur) {
   600             strcat(filename, "Rules.txt");
   601             OUTPUT_ = fopen(filename, "w");
   602             if (OUTPUT_ == NULL) {
   603                 fprintf(stdout, "Cannot open file:%s\n", filename);
   604                 return;
   605             }
   606         }
   608         fprintf(stdout, "Rules\n");
   609         UErrorCode  error = U_ZERO_ERROR;
   610         UParseError parseError;
   611         COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 
   612                                    UCOL_DEFAULT_STRENGTH, &parseError, &error);
   613         if (U_FAILURE(error)) {
   614             fprintf(stdout, "Collator creation failed:");
   615             fprintf(stdout, u_errorName(error));
   616             goto CLOSERULES;
   617             return;
   618         }
   619         setAttributes(COLLATOR_, &error);
   620         if (U_FAILURE(error)) {
   621             fprintf(stdout, "Collator attribute setting failed:");
   622             fprintf(stdout, u_errorName(error));
   623             goto CLOSERULES;
   624             return;
   625         }
   627         serialize("Rule-based", TRUE);
   628         ucol_close(COLLATOR_);
   630 CLOSERULES :
   631         if (options[4].doesOccur) {
   632             filename[dirlength] = 0;
   633             fclose(OUTPUT_);
   634         }
   635     }
   636 }
   638 /**
   639 * Parse for enum values.
   640 * Note this only works for positive enum values.
   641 * @param enumarray array containing names of the enum values in string and 
   642 *        their corresponding value.
   643 *        declared enum value.
   644 * @param str string to be parsed
   645 * @return corresponding integer enum value or -1 if value is not found.
   646 */
   647 int parseEnums(const EnumNameValuePair enumarray[], const char *str) 
   648 {
   649     const char *enumname = enumarray[0].name;
   650     int result = atoi(str);
   651     if (result == 0 && str[0] != '0') {
   652         while (strcmp(enumname, str) != 0) {
   653             // checking for multiple enum names sharing the same values
   654             enumname = strstr(enumname, str);
   655             if (enumname != NULL) {
   656                 int size = strchr(enumname, '|') - enumname;
   657                 if (size < 0) {
   658                     size = strlen(enumname);
   659                 }
   660                 if (size == (int)strlen(str)) {
   661                     return enumarray[result].value;
   662                 }
   663             }
   664             result ++;
   665             if (&(enumarray[result]) == NULL) {
   666                 return -1;
   667             }
   668             enumname = enumarray[result].name;
   669         }
   670     }
   671     return -1;
   672 }
   674 /**
   675 * Parser for attribute name value pair
   676 */
   677 void parseAttributes() {
   678     char str[32];
   679     const char *pname = options[6].value;
   680     const char *pend  = options[6].value + strlen(options[6].value);
   681     const char *pvalue;
   683     while (pname < pend) {
   684         pvalue = strchr(pname, '=');
   685         if (pvalue == NULL) {
   686             fprintf(stdout, 
   687                     "No matching value found for attribute argument %s\n", 
   688                     pname);        
   689             return;
   690         }
   691         int count = pvalue - pname;
   692         strncpy(str, pname, count);
   693         str[count] = 0;
   695         int name = parseEnums(ATTRIBUTE_NAME_, str);
   696         if (name == -1) {
   697             fprintf(stdout, "Attribute name not found: %s\n", str);
   698             return;
   699         }
   701         pvalue ++;
   702         // getting corresponding enum value
   703         pname = strchr(pvalue, ',');
   704         if (pname == NULL) {
   705             pname = pend;
   706         }
   707         count = pname - pvalue;
   708         strncpy(str, pvalue, count);
   709         str[count] = 0;
   710         int value = parseEnums(ATTRIBUTE_VALUE_, str);
   711         if (value == -1) {
   712             fprintf(stdout, "Attribute value not found: %s\n", str);
   713             return;
   714         }
   715         ATTRIBUTE_[name] = (UColAttributeValue)value;
   716         pname ++;
   717     }
   718 }
   720 /**
   721 * Checks if the locale argument is a base language 
   722 * @param locale to be checked
   723 * @return TRUE if it is a base language
   724 */
   725 inline UBool checkLocaleForLanguage(const char *locale)
   726 {
   727     return strlen(locale) <= 2;
   728 }
   730 /**
   731 * Converts a UChar array into its string form "xxxx xxxx"
   732 * @param ch array of UChar characters
   733 * @param count number of UChar characters
   734 */
   735 void outputUChar(UChar ch[], int count)
   736 {
   737     for (int i = 0; i < count; i ++) {
   738         fprintf(OUTPUT_, "%04X ", ch[i]);
   739     }
   740 }
   742 /**
   743 * If it is a primary difference returns -1 or 1.
   744 * If it is a secondary difference returns -2 or 2.
   745 * If it is a tertiary difference returns -3 or 3.
   746 * If equals returns 0.
   747 */
   748 int compareSortKey(const void *elem1, const void *elem2)
   749 {
   750     // compare the 2 script element sort key
   751     UChar     *ch1   = ((ScriptElement *)elem1)->ch;
   752     UChar     *ch2   = ((ScriptElement *)elem2)->ch;
   753     int        size1 = ((ScriptElement *)elem1)->count;
   754     int        size2 = ((ScriptElement *)elem2)->count;
   755     UErrorCode error = U_ZERO_ERROR;
   757     ucol_setStrength(COLLATOR_, UCOL_PRIMARY);
   758     int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
   759     if (result == 0) {
   760         ucol_setStrength(COLLATOR_, UCOL_SECONDARY);
   761         result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
   762         if (result == 0) {
   763             ucol_setStrength(COLLATOR_, UCOL_TERTIARY);
   764             result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);
   765             if (result < 0) {
   766                 return -3;
   767             }
   768             if (result > 0) {
   769                 return 3;
   770             }    
   771         }
   772         if (result < 0) {
   773             return -2;
   774         }
   775         if (result > 0) {
   776             return 2;
   777         }
   778     }
   779     return result;
   780 }
   782 /**
   783 * Output serialized script elements
   784 * @param element the element to output
   785 * @param compare the comparison with the previous element
   786 * @param expansion flags TRUE if element has an expansion
   787 */
   788 void outputScriptElem(ScriptElement &element, int compare, UBool expansion)
   789 {
   790     switch (compare) {
   791     case 0: 
   792         if (expansion) {
   793             fprintf(OUTPUT_, "<tr><td class='eq' title='["); 
   794         }
   795         else {
   796             fprintf(OUTPUT_, "<tr><td class='q' title='["); 
   797         }
   798         break;  
   799     case -1: 
   800         if (expansion) {
   801             fprintf(OUTPUT_, "<tr><td class='ep' title='["); 
   802         }
   803         else {
   804             fprintf(OUTPUT_, "<tr><td class='p' title='["); 
   805         }
   806         break;        
   807     case -2: 
   808         if (expansion) {
   809             fprintf(OUTPUT_, "<tr><td class='es' title='["); 
   810         }
   811         else {
   812             fprintf(OUTPUT_, "<tr><td class='s' title='["); 
   813         }
   814         break;
   815     default: 
   816         if (expansion) {
   817             fprintf(OUTPUT_, "<tr><td class='et' title='["); 
   818         }
   819         else {
   820             fprintf(OUTPUT_, "<tr><td class='t' title='["); 
   821         }
   822     }
   824     uint8_t sortkey[32];
   825     ucol_setStrength(COLLATOR_, UCOL_TERTIARY);
   826     ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32);
   827     int i = 0;
   828     while (sortkey[i] != 0) {
   829         if (sortkey[i] == 1) {
   830             fprintf(OUTPUT_, " | ");
   831         }
   832         else {
   833             fprintf(OUTPUT_, "%02x", sortkey[i]);
   834         }
   836         i ++;
   837     }
   839     fprintf(OUTPUT_, "]'>");
   841     UErrorCode error = U_ZERO_ERROR;
   842     char       utf8[64];
   843     UChar      nfc[32];
   844     int32_t    length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc, 
   845                                         32, &error);
   846     if (U_FAILURE(error)) {
   847         fprintf(stdout, "Error normalizing contractions to NFC\n");
   848     }
   849     u_strToUTF8(utf8, 64, &length, nfc, length, &error);
   850     if (U_FAILURE(error)) {
   851         fprintf(stdout, "Error converting UChar to utf8\n");
   852         return;
   853     }
   855     fprintf(OUTPUT_, "%s<br>", utf8);
   856     fprintf(OUTPUT_, "<tt>");
   857     outputUChar(element.ch, element.count);
   859     if (compare == 0) {
   860         fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>Q</td><td>");
   861     }
   862     else if (compare == -1) {
   863         fprintf(OUTPUT_, "</tt></td><td>P</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>");
   864     }
   865     else if (compare == -2) {
   866         fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>S</td><td>&nbsp;</td><td>&nbsp;</td><td>");
   867     }
   868     else if (compare == -3) {
   869         fprintf(OUTPUT_, "</tt></td><td>&nbsp;</td><td>&nbsp;</td><td>T</td><td>&nbsp;</td><td>");
   870     }
   872     i = 0;
   873     while (i < element.count) {
   874         char    str[128];
   875         UChar32 codepoint;
   876         U16_NEXT(element.ch, i, element.count, codepoint);
   877         int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128, 
   878                                       &error);
   879         if (U_FAILURE(error)) {
   880             fprintf(stdout, "Error getting character name\n");
   881             return;
   882         }
   883         if (element.tailored) {
   884             fprintf(OUTPUT_, "<b>");
   885         }
   886         fprintf(OUTPUT_, "%s", str);
   887         if (element.tailored) {
   888             fprintf(OUTPUT_, " *</b>");
   889         }
   890         if (i < element.count) {
   891             fprintf(OUTPUT_, "<br>\n");
   892         }
   893     }
   895     fprintf(OUTPUT_, "</td></tr>\n");
   896 }
   898 /**
   899 * Checks if codepoint belongs to scripts
   900 * @param script list
   901 * @param scriptcount number of scripts
   902 * @param codepoint to test
   903 * @return TRUE if codepoint belongs to scripts
   904 */
   905 UBool checkInScripts(UScriptCode script[], int scriptcount, 
   906                      UChar32 codepoint)
   907 {
   908     UErrorCode error = U_ZERO_ERROR;
   909     for (int i = 0; i < scriptcount; i ++) {
   910         if (script[i] == USCRIPT_HAN && options[10].doesOccur) { 
   911             if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) ||
   912                 (codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) {
   913                 // reduce han
   914                 return TRUE;
   915             }
   916         }
   917         else if (uscript_getScript(codepoint, &error) == script[i]) {
   918             return TRUE;
   919         }
   920         if (U_FAILURE(error)) {
   921             fprintf(stdout, "Error checking character in scripts\n");
   922             return FALSE;
   923         }
   924     }
   925     return FALSE;
   926 }
   928 /**
   929 * Checks if the set of codepoints belongs to the script
   930 * @param script list
   931 * @param scriptcount number of scripts
   932 * @param scriptelem
   933 * @return TRUE if all codepoints belongs to the script
   934 */
   935 inline UBool checkInScripts(UScriptCode script[], int scriptcount,
   936                            ScriptElement scriptelem)
   937 {
   938     int i = 0;
   939     while (i < scriptelem.count) {
   940         UChar32     codepoint;
   941         U16_NEXT(scriptelem.ch, i, scriptelem.count, codepoint);
   942         UErrorCode  error = U_ZERO_ERROR;
   943         if (checkInScripts(script, scriptcount, codepoint)) {
   944             return TRUE;
   945         }
   946     }
   947     return FALSE;
   948 }
   950 /**
   951 * Gets the script elements and contractions belonging to the script
   952 * @param elems output list
   953 * @param locale locale
   954 * @return number of script elements
   955 * Add by Richard
   956 */
   957 int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) {
   958     UErrorCode error = U_ZERO_ERROR;
   959     UChar32 codepoint = 0;
   961     UResourceBundle* ures = ures_open(NULL, locale, &error);
   962     if (U_FAILURE(error)) {
   963         fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale);
   964         return -1;
   965     }
   966     int32_t length;
   967     const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error);
   969     if (U_FAILURE(error)) {
   970         fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n");
   971         return -1;
   972     }
   974     UChar* upperChars = new UChar[length * 2];
   975     if (upperChars == 0) {
   976         fprintf(stdout, "Memory error\n");
   977         return -1;
   978     }
   980     int32_t destLength = u_strToUpper(upperChars, length * 2, exemplarChars, -1, locale, &error);
   981     if (U_FAILURE(error)) {
   982         fprintf(stdout, "Error when u_strToUpper() \n");
   983         return -1;
   984     }
   986     UChar* pattern = new UChar[length + destLength + 10];
   987     UChar left[2] = {0x005b, 0x0};
   988     UChar right[2] = {0x005d, 0x0};
   989     pattern = u_strcpy(pattern, left);
   990     pattern = u_strcat(pattern, exemplarChars);
   991     pattern = u_strcat(pattern, upperChars);
   992     pattern = u_strcat(pattern, right);
   994     UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error);
   995     if (U_FAILURE(error)) {
   996         fprintf(stdout, "Can not open USet \n");
   997         return -1;
   998     }
  1000     UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset);
  1002     int32_t count = 0;
  1004     while (usetiter -> next()) {
  1005         if (usetiter -> isString()) {
  1006             UnicodeString strItem = usetiter -> getString();
  1008             scriptelem[count].count = 0;
  1009             for (int i = 0; i < strItem.length(); i++) {
  1010                 codepoint = strItem.char32At(i);
  1011                 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint);
  1012                 scriptelem[count].tailored = FALSE;
  1014         } else {
  1015             codepoint = usetiter -> getCodepoint();
  1016             scriptelem[count].count = 0;
  1017             UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].count, codepoint);
  1018             scriptelem[count].tailored = FALSE;
  1021         count++;
  1023     delete []pattern;
  1025     return count;
  1028 /**
  1029 * Gets the script elements and contractions belonging to the script
  1030 * @param script list
  1031 * @param scriptcount number of scripts
  1032 * @param elems output list
  1033 * @return number of script elements
  1034 */
  1035 int getScriptElements(UScriptCode script[], int scriptcount, 
  1036                       ScriptElement scriptelem[])
  1038     UErrorCode error = U_ZERO_ERROR;
  1039     UChar32    codepoint = 0;
  1040     int        count     = 0;
  1041     while (codepoint <= UCHAR_MAX_VALUE) { 
  1042         if (checkInScripts(script, scriptcount, codepoint)) {
  1043             scriptelem[count].count = 0;
  1044             UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, 
  1045                                      scriptelem[count].count, codepoint);
  1046             scriptelem[count].tailored = FALSE;
  1047             count ++;
  1049         if (U_FAILURE(error)) {
  1050             fprintf(stdout, "Error determining codepoint in script\n");
  1051             return -1;
  1053         codepoint ++;
  1056     const UChar           *current  = NULL;
  1057           uint32_t         strength = 0;
  1058           uint32_t         chOffset = 0; 
  1059           uint32_t         chLen    = 0;
  1060           uint32_t         exOffset = 0; 
  1061           uint32_t         exLen    = 0;
  1062           uint32_t         prefixOffset = 0; 
  1063           uint32_t         prefixLen    = 0;
  1064           uint8_t          specs    = 0;
  1065           UBool            rstart   = TRUE;
  1066           UColTokenParser  src;
  1067           UColOptionSet    opts;
  1068           UParseError      parseError;
  1070     int32_t  rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0);
  1071     src.source       = (UChar *)malloc(sizeof(UChar) * 
  1072                                 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));
  1073     rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source, 
  1074                                  rulelength);
  1075     src.current      = src.source;
  1076     src.end          = src.source + rulelength;
  1077     src.extraCurrent = src.end;
  1078     src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  1079     src.opts         = &opts;
  1081 	/*
  1082 	ucol_tok_parseNextToken(&src, &strength, &chOffset, 
  1083                                               &chLen, &exOffset, &exLen,
  1084                                               &prefixOffset, &prefixLen,
  1085                                               &specs, rstart, &parseError,
  1086                                               &error)
  1087     */
  1088     while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
  1089                                               &error)) != NULL) {
  1090         // contractions handled here
  1091         if (chLen > 1) {
  1092             u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen);
  1093             scriptelem[count].count = chLen;
  1094             if (checkInScripts(script, scriptcount, scriptelem[count])) {
  1095                 scriptelem[count].tailored     = FALSE;
  1096                 count ++;
  1099         rstart = FALSE;
  1101     if (U_FAILURE(error)) {
  1102         fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error));
  1104 	// rule might have been reallocated, so delete this instead
  1105     free(src.source); 
  1106     return count;
  1109 int compareCodepoints(const void *elem1, const void *elem2)
  1111     UChar *ch1 = ((ScriptElement *)elem1)->ch; // key
  1112     UChar *ch2 = ((ScriptElement *)elem2)->ch;
  1113     ch1[((ScriptElement *)elem1)->count] = 0;
  1114     ch2[((ScriptElement *)elem2)->count] = 0;
  1116     // compare the 2 codepoints
  1117     return u_strcmp(ch1, ch2);
  1120 UBool hasSubNFD(ScriptElement &se, ScriptElement &key)
  1122     UChar *ch1 = se.ch; 
  1123     UChar *ch2 = key.ch; // key
  1124     ch1[se.count] = 0;
  1125     ch2[key.count] = 0;
  1127     // compare the 2 codepoints
  1128     if (u_strstr(ch1, ch2) != NULL) {
  1129         return TRUE;
  1132     // check the decomposition 
  1133     UChar      norm[32];
  1134     UErrorCode error = U_ZERO_ERROR;
  1135     int        size  = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32, 
  1136                                        &error);    
  1137     if (U_FAILURE(error)) {
  1138         fprintf(stdout, "Error normalizing\n");
  1140     if (u_strstr(norm, ch2) != NULL) {
  1141         return TRUE;
  1143     return FALSE;
  1146 /**
  1147 * Marks tailored elements
  1148 * @param script list
  1149 * @param scriptcount number of scripts
  1150 * @param scriptelem script element list
  1151 * @param scriptelemlength size of the script element list
  1152 */
  1153 void markTailored(UScriptCode script[], int scriptcount, 
  1154                   ScriptElement scriptelem[], int scriptelemlength)
  1156           int32_t  rulelength;
  1157     const UChar   *rule = ucol_getRules(COLLATOR_, &rulelength);
  1159     const UChar           *current  = NULL;
  1160           uint32_t         strength = 0;
  1161           uint32_t         chOffset = 0; 
  1162           uint32_t         chLen    = 0;
  1163           uint32_t         exOffset = 0; 
  1164           uint32_t         exLen    = 0;
  1165           uint32_t         prefixOffset = 0; 
  1166           uint32_t         prefixLen    = 0;
  1167           uint8_t          specs    = 0;
  1168           UBool            rstart   = TRUE;
  1169           UColTokenParser  src;
  1170           UColOptionSet    opts;
  1171           UParseError      parseError;
  1173     src.opts         = &opts;
  1174     src.source       = (UChar *)malloc(
  1175                (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
  1176     memcpy(src.source, rule, rulelength * sizeof(UChar));
  1177 	src.current      = src.source;
  1178     src.end          = (UChar *)src.source + rulelength;
  1179     src.extraCurrent = src.end;
  1180     src.extraEnd     = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  1182     UErrorCode    error = U_ZERO_ERROR;
  1184     while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
  1185                                               &error)) != NULL) {
  1186         if (chLen >= 1 && strength != UCOL_TOK_RESET) {
  1187             // skipping the reset characters and non useful stuff.
  1188             ScriptElement se;
  1189             u_strncpy(se.ch, src.source + chOffset, chLen);
  1190             se.count = chLen;
  1192             if (checkInScripts(script, scriptcount, se)) {
  1193                 /*
  1194                 ScriptElement *tse = (ScriptElement *)bsearch(&se, scriptelem, 
  1195                                                               scriptelemlength, 
  1196                                                          sizeof(ScriptElement), 
  1197                                                          compareCodepoints);
  1198                 */
  1199                 for (int i = 0; i < scriptelemlength; i ++) {
  1200                     if (!scriptelem[i].tailored && 
  1201                         hasSubNFD(scriptelem[i], se)) {
  1202                         scriptelem[i].tailored = TRUE;
  1207         rstart = FALSE;
  1209     free(src.source);
  1210     if (U_FAILURE(error)) {
  1211         fprintf(stdout, "Error parsing rules\n");
  1215 /**
  1216 * Checks if the collation iterator has more than 1 collation element
  1217 * @parem coleiter collation element iterator
  1218 * @return TRUE if collation iterator has more than 1 collation element
  1219 */
  1220 UBool hasExpansions(UCollationElements *coleiter)
  1222     UErrorCode error = U_ZERO_ERROR;
  1223     int32_t    ce    = ucol_next(coleiter, &error);
  1224     int        count = 0;
  1226     if (U_FAILURE(error)) {
  1227         fprintf(stdout, "Error getting next collation element\n");
  1229     while (ce != UCOL_NULLORDER) {
  1230         if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) {
  1231             count ++;
  1232             if (count == 2) {
  1233                 return TRUE;
  1236         ce = ucol_next(coleiter, &error);
  1237         if (U_FAILURE(error)) {
  1238             fprintf(stdout, "Error getting next collation element\n");
  1241     return FALSE;
  1244 /**
  1245 * Prints the footer for index.html
  1246 * @param file output file
  1247 */
  1248 void outputHTMLFooter()
  1250     fprintf(OUTPUT_, "</table>\n");
  1251     fprintf(OUTPUT_, "</body>\n");
  1252     fprintf(OUTPUT_, "</html>\n");
  1255 /**
  1256 * Serialize the codepoints from start to end into an html file.
  1257 * Arranging them into ascending collation order.
  1258 * @param script code list
  1259 * @param scriptcount number of scripts
  1260 */
  1261 //void serializeScripts(UScriptCode script[], int scriptcount) 
  1262 //Richard
  1263 void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL) 
  1265     UErrorCode  error  = U_ZERO_ERROR;
  1267     ScriptElement *scriptelem = 
  1268                      (ScriptElement *)malloc(sizeof(ScriptElement) * 0x20000);
  1269     if (scriptelem == NULL) {
  1270         fprintf(stdout, "Memory error\n");
  1271         return;
  1273     int count = 0;
  1274     if(locale) {
  1275       count = getScriptElementsFromExemplars(scriptelem, locale);
  1276     } else {
  1277       count = getScriptElements(script, scriptcount, scriptelem); 
  1280     // Sort script elements using Quicksort algorithm:
  1281     qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints);
  1282     markTailored(script, scriptcount, scriptelem, count);
  1283     // Sort script elements using Quicksort algorithm:
  1284     qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey);
  1286     UCollationElements* coleiter = ucol_openElements(COLLATOR_, 
  1287                                                      scriptelem[0].ch,
  1288                                                      scriptelem[0].count,
  1289                                                      &error);
  1290     if (U_FAILURE(error)) {
  1291         fprintf(stdout, "Error creating collation element iterator\n");
  1292         return;
  1295     outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter));
  1296     for (int i = 0; i < count - 1; i ++) {
  1297         ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count,
  1298                      &error);
  1299         if (U_FAILURE(error)) {
  1300             fprintf(stdout, "Error setting text in collation element iterator\n");
  1301             return;
  1303         outputScriptElem(scriptelem[i + 1], 
  1304                          compareSortKey(scriptelem + i, scriptelem + i + 1),
  1305                          hasExpansions(coleiter));
  1307     free(scriptelem);
  1308     outputHTMLFooter();
  1311 /**
  1312 * Prints the header for the html
  1313 * @param locale name
  1314 * @param script
  1315 * @param scriptcount number of scripts
  1316 */
  1317 void outputHTMLHeader(const char *locale, UScriptCode script[], 
  1318                       int scriptcount)
  1320     fprintf(OUTPUT_, "<html>\n");
  1321     fprintf(OUTPUT_, "<head>\n");
  1322     fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
  1323     fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n");
  1324     fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/css\">\n");
  1325     fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n");
  1326     fprintf(OUTPUT_, "<base target=\"main\">\n");
  1327     fprintf(OUTPUT_, "</head>\n");
  1329     fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n");
  1330     fprintf(OUTPUT_, "<!--\n");
  1331     fprintf(OUTPUT_, "This file contains sorted characters in ascending order according to the locale stated\n");
  1332     fprintf(OUTPUT_, "If the character is in red, it is tailored in the collation rules.\n");
  1333     fprintf(OUTPUT_, "Background colours have certain meanings:\n");
  1334     fprintf(OUTPUT_, "White - equals the previous character\n");
  1335     fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n");
  1336     fprintf(OUTPUT_, "blue - secondary greater than the previous character\n");
  1337     fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\n");
  1338     fprintf(OUTPUT_, "--!>\n");
  1340     fprintf(OUTPUT_, "<table border=0>\n");
  1341     UChar      displayname[64];
  1342     UErrorCode error = U_ZERO_ERROR;
  1343     int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error);
  1344     char       utf8displayname[128];
  1345     if (U_FAILURE(error)) {
  1346         utf8displayname[0] = 0;
  1348     else {
  1349         int32_t utf8size = 0;
  1350         u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error);
  1353     fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", utf8displayname);
  1354     fprintf(OUTPUT_, "<tr><th>Script(s)</th>");
  1355     fprintf(OUTPUT_, "<td class='noborder'>");
  1356     for (int i = 0; i < scriptcount; i ++) {
  1357         fprintf(OUTPUT_, "%s", uscript_getName(script[i]));
  1358         if (i + 1 != scriptcount) {
  1359             fprintf(OUTPUT_, ", ");
  1362     fprintf(OUTPUT_, "</td></tr>\n");
  1364     fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/source/data/coll/%s.txt\">%s.txt</a></td></tr>\n", locale, locale);
  1366     UVersionInfo version;
  1367     ucol_getVersion(COLLATOR_, version);
  1368     fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d.%d</td></tr>\n", 
  1369                       version[0], version[1], version[2], version[3]);
  1371     UColAttribute attr = UCOL_FRENCH_COLLATION;
  1372     while (attr < UCOL_ATTRIBUTE_COUNT) {
  1373         UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error);
  1374         if (U_FAILURE(error)) {
  1375             fprintf(stdout, "Error getting attribute\n");
  1376             return;
  1378         if (value != UCOL_DEFAULT) {
  1379             if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) {
  1380                 fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1382             if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) {
  1383                 fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='noborder'>shifted, code%d</td></tr>\n", value);
  1385             if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) {
  1386                 fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1388             if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) {
  1389                 fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1391             if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) {
  1392                 fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1394             if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) {
  1395                 fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value);
  1397             if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) {
  1398                 fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='noborder'>on, code %d</td></tr>\n", value);
  1401         attr = (UColAttribute)(attr + 1);
  1404     // Get UNIX-style time and display as number and string.
  1405     time_t ltime;
  1406     time( &ltime );
  1407     fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></tr>", ctime(&ltime));
  1409     fprintf(OUTPUT_, "</table>\n");
  1411     fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n");
  1412     fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>Submit a bug</a></p>\n");
  1413     fprintf(OUTPUT_, "\n<table>\n");
  1414     fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>Q</th><th>Name</th></tr>\n");
  1417 /**
  1418 * Prints the header for index.html
  1419 * @param file output file
  1420 */
  1421 void outputListHTMLHeader(FILE *file)
  1423     fprintf(file, "<html>\n");
  1424     fprintf(file, "<head>\n");
  1425     fprintf(file, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
  1426     fprintf(file, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n");
  1427     fprintf(file, "<title>ICU Collation Charts</title>\n");
  1428     fprintf(file, "<base target=\"main\">\n");
  1429     fprintf(file, "</head>\n");
  1430     fprintf(file, "<body bgcolor=#FFFFFF>\n");
  1431     fprintf(file, "<h2 align=center>ICU Collation Charts</h2>\n");
  1432     fprintf(file, "<p align=center>\n");
  1433     fprintf(file, "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>");
  1436 /**
  1437 * Prints the footer for index.html
  1438 * @param file output file
  1439 */
  1440 void outputListHTMLFooter(FILE *file)
  1442     fprintf(file, "</p>\n");
  1443 	//fprintf(file, "<center><image src=http://oss.software.ibm.com/icu/images/w24.gif></center>\n");
  1444     fprintf(file, "</body>\n");
  1445     fprintf(file, "</html>\n");
  1448 /**
  1449 * Gets all scripts and serialize their codepoints into an html file.
  1450 */
  1451 void serializeScripts() {
  1452     char filename[128];
  1453     int  dirlength = 0;
  1455     if (options[4].doesOccur) {
  1456         strcpy(filename, options[4].value);
  1457         dirlength = appendDirSeparator(filename);
  1458     } else {
  1459       filename[0] = 0;
  1462     const char    *locale;
  1463           int32_t  localelist = 0;
  1464           int32_t  localesize;
  1466     localesize = ucol_countAvailable();
  1467     locale     = ucol_getAvailable(localelist);
  1469     strcat(filename, "list.html");
  1470     FILE *list = fopen(filename, "w");
  1471     filename[dirlength] = 0;
  1472     if (list == NULL) {
  1473         fprintf(stdout, "Cannot open file: %s\n", filename);
  1474         return;
  1477     outputListHTMLHeader(list);
  1478     fprintf(list, "<blockquote>\n");
  1479     while (TRUE) {
  1480         UErrorCode error = U_ZERO_ERROR;
  1481         COLLATOR_ = ucol_open(locale, &error);
  1482         if (U_FAILURE(error)) {
  1483             fprintf(stdout, "Collator creation failed:");
  1484             fprintf(stdout, u_errorName(error));
  1485             break;
  1487         if ((error != U_USING_FALLBACK_WARNING && // not tailored
  1488             error != U_USING_DEFAULT_WARNING) ||
  1489             checkLocaleForLanguage(locale)) {
  1490             fprintf(list, "<a href=%s.html>%s</a> ", locale, locale);
  1491 	        setAttributes(COLLATOR_, &error);
  1492             if (U_FAILURE(error)) {
  1493                fprintf(stdout, "Collator attribute setting failed:");
  1494                fprintf(stdout, u_errorName(error));
  1495                break;
  1498             UScriptCode scriptcode[32];
  1499             uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32, 
  1500                                                    &error);
  1501             if (U_FAILURE(error)) {
  1502                 fprintf(stdout, "Error getting lcale scripts\n");
  1503                 break;
  1506             strcat(filename, locale);
  1507             strcat(filename, ".html");
  1508             OUTPUT_ = fopen(filename, "w");
  1509             if (OUTPUT_ == NULL) {
  1510                 fprintf(stdout, "Cannot open file:%s\n", filename);
  1511                 break;
  1513             outputHTMLHeader(locale, scriptcode, scriptcount);
  1514             fprintf(stdout, "%s\n", locale);
  1516             if(options[12].doesOccur) {
  1517               // use whole scripts
  1518                 serializeScripts(scriptcode, scriptcount);
  1519             } else {
  1520               // use exemplar chars
  1521               serializeScripts(scriptcode, scriptcount, locale);
  1523             fclose(OUTPUT_);
  1525         ucol_close(COLLATOR_);
  1527         filename[dirlength] = 0;
  1528         localelist ++;
  1529         if (localelist == localesize) {
  1530             break;
  1532         locale = ucol_getAvailable(localelist);
  1534     fprintf(list, "<br><a href=help.html>help</a><br>");
  1535     fprintf(list, "</blockquote>\n");
  1536     outputListHTMLFooter(list);
  1537     fclose(list);
  1540 /** 
  1541 * Main   --  process command line, read in and pre-process the test file,
  1542 *            call other functions to do the actual tests.
  1543 */
  1544 int main(int argc, char *argv[]) {
  1546     argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), 
  1547                        options);
  1549     // error handling, printing usage message
  1550     if (argc < 0) {
  1551         fprintf(stdout, "error in command line argument: ");
  1552         fprintf(stdout, argv[-argc]);
  1553         fprintf(stdout, "\n");
  1555     if (argc < 0 || options[0].doesOccur || options[1].doesOccur) {
  1556         fprintf(stdout, "Usage: dumpce options...\n"
  1557                         "--help\n"
  1558                         "    Display this message.\n"
  1559                         "--locale name|all\n"
  1560                         "    ICU locale to use. Default is en_US\n"
  1561                         "--serialize\n"
  1562                         "    Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n"
  1563                         "--destdir dir_name\n"
  1564                         "    Path for outputing the serialized collation elements. Defaults to stdout if no defined\n"
  1565                         "--sourcedir dir_name\n"
  1566                         "    Path for the input rule file for collation\n"
  1567                         "--attribute name=value,name=value...\n" 
  1568                         "    Pairs of attribute names and values for setting\n"
  1569                         "--rule filename\n" 
  1570                         "    Name of file containing the collation rules.\n"
  1571                         "--normalizaton mode\n" 
  1572                         "    UNormalizationMode mode to be used.\n"
  1573                         "--scripts\n" 
  1574                         "    Codepoints from all scripts are sorted and serialized.\n"
  1575                         "--reducehan\n" 
  1576                         "    Only 200 Han script characters will be displayed with the use of --scripts.\n"
  1577                         "--wholescripts\n"
  1578                         "    Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n");
  1580         fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n");
  1581         fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n");
  1582         return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
  1585     OUTPUT_ = stdout;
  1586     if (options[6].doesOccur) {
  1587         fprintf(stdout, "attributes %s\n", options[6].value);
  1588         parseAttributes();
  1590     if (options[3].doesOccur) {
  1591         serialize();
  1593     if (options[9].doesOccur) {
  1594         serializeScripts();
  1596     return 0;

mercurial