intl/icu/source/tools/gennorm2/gennorm2.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2009-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  gennorm2.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2009nov25
    14 *   created by: Markus W. Scherer
    15 *
    16 *   This program reads text files that define Unicode normalization,
    17 *   parses them, and builds a binary data file.
    18 */
    20 #include "unicode/utypes.h"
    21 #include "n2builder.h"
    23 #include <stdio.h>
    24 #include <stdlib.h>
    25 #include <string.h>
    26 #include "unicode/errorcode.h"
    27 #include "unicode/localpointer.h"
    28 #include "unicode/putil.h"
    29 #include "unicode/uchar.h"
    30 #include "unicode/unistr.h"
    31 #include "charstr.h"
    32 #include "normalizer2impl.h"
    33 #include "toolutil.h"
    34 #include "uoptions.h"
    35 #include "uparse.h"
    37 #if UCONFIG_NO_NORMALIZATION
    38 #include "unewdata.h"
    39 #endif
    41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    43 U_NAMESPACE_BEGIN
    45 UBool beVerbose=FALSE, haveCopyright=TRUE;
    47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
    49 #if !UCONFIG_NO_NORMALIZATION
    50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
    51 #endif
    53 /* -------------------------------------------------------------------------- */
    55 enum {
    56     HELP_H,
    57     HELP_QUESTION_MARK,
    58     VERBOSE,
    59     COPYRIGHT,
    60     SOURCEDIR,
    61     OUTPUT_FILENAME,
    62     UNICODE_VERSION,
    63     OPT_FAST
    64 };
    66 static UOption options[]={
    67     UOPTION_HELP_H,
    68     UOPTION_HELP_QUESTION_MARK,
    69     UOPTION_VERBOSE,
    70     UOPTION_COPYRIGHT,
    71     UOPTION_SOURCEDIR,
    72     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
    73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
    75 };
    77 extern "C" int
    78 main(int argc, char* argv[]) {
    79     U_MAIN_INIT_ARGS(argc, argv);
    81     /* preset then read command line options */
    82     options[SOURCEDIR].value="";
    83     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
    85     /* error handling, printing usage message */
    86     if(argc<0) {
    87         fprintf(stderr,
    88             "error in command line argument \"%s\"\n",
    89             argv[-argc]);
    90     }
    91     if(!options[OUTPUT_FILENAME].doesOccur) {
    92         argc=-1;
    93     }
    94     if( argc<2 ||
    95         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
    96     ) {
    97         /*
    98          * Broken into chunks because the C89 standard says the minimum
    99          * required supported string length is 509 bytes.
   100          */
   101         fprintf(stderr,
   102             "Usage: %s [-options] infiles+ -o outputfilename\n"
   103             "\n"
   104             "Reads the infiles with normalization data and\n"
   105             "creates a binary file (outputfilename) with the data.\n"
   106             "\n",
   107             argv[0]);
   108         fprintf(stderr,
   109             "Options:\n"
   110             "\t-h or -? or --help  this usage text\n"
   111             "\t-v or --verbose     verbose output\n"
   112             "\t-c or --copyright   include a copyright notice\n"
   113             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
   114         fprintf(stderr,
   115             "\t-s or --sourcedir   source directory, followed by the path\n"
   116             "\t-o or --output      output filename\n");
   117         fprintf(stderr,
   118             "\t      --fast        optimize the .nrm file for fast normalization,\n"
   119             "\t                    which might increase its size  (Writes fully decomposed\n"
   120             "\t                    regular mappings instead of delta mappings.\n"
   121             "\t                    You should measure the runtime speed to make sure that\n"
   122             "\t                    this is a good trade-off.)\n");
   123         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
   124     }
   126     beVerbose=options[VERBOSE].doesOccur;
   127     haveCopyright=options[COPYRIGHT].doesOccur;
   129     IcuToolErrorCode errorCode("gennorm2/main()");
   131 #if UCONFIG_NO_NORMALIZATION
   133     fprintf(stderr,
   134         "gennorm2 writes a dummy binary data file "
   135         "because UCONFIG_NO_NORMALIZATION is set, \n"
   136         "see icu/source/common/unicode/uconfig.h\n");
   137     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
   138     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
   139     // return U_UNSUPPORTED_ERROR;
   140     return 0;
   142 #else
   144     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
   145     errorCode.assertSuccess();
   147     if(options[UNICODE_VERSION].doesOccur) {
   148         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
   149     }
   151     if(options[OPT_FAST].doesOccur) {
   152         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
   153     }
   155     // prepare the filename beginning with the source dir
   156     CharString filename(options[SOURCEDIR].value, errorCode);
   157     int32_t pathLength=filename.length();
   158     if( pathLength>0 &&
   159         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
   160         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
   161     ) {
   162         filename.append(U_FILE_SEP_CHAR, errorCode);
   163         pathLength=filename.length();
   164     }
   166     for(int i=1; i<argc; ++i) {
   167         printf("gennorm2: processing %s\n", argv[i]);
   168         filename.append(argv[i], errorCode);
   169         LocalStdioFilePointer f(fopen(filename.data(), "r"));
   170         if(f==NULL) {
   171             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
   172             exit(U_FILE_ACCESS_ERROR);
   173         }
   174         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
   175         parseFile(f.getAlias(), *builder);
   176         filename.truncate(pathLength);
   177     }
   179     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
   181     return errorCode.get();
   183 #endif
   184 }
   186 #if !UCONFIG_NO_NORMALIZATION
   188 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
   189     IcuToolErrorCode errorCode("gennorm2/parseFile()");
   190     char line[300];
   191     uint32_t startCP, endCP;
   192     while(NULL!=fgets(line, (int)sizeof(line), f)) {
   193         char *comment=(char *)strchr(line, '#');
   194         if(comment!=NULL) {
   195             *comment=0;
   196         }
   197         u_rtrim(line);
   198         if(line[0]==0) {
   199             continue;  // skip empty and comment-only lines
   200         }
   201         if(line[0]=='*') {
   202             const char *s=u_skipWhitespace(line+1);
   203             if(0==strncmp(s, "Unicode", 7)) {
   204                 s=u_skipWhitespace(s+7);
   205                 builder.setUnicodeVersion(s);
   206             }
   207             continue;  // reserved syntax
   208         }
   209         const char *delimiter;
   210         int32_t rangeLength=
   211             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
   212         if(errorCode.isFailure()) {
   213             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
   214             exit(errorCode.reset());
   215         }
   216         delimiter=u_skipWhitespace(delimiter);
   217         if(*delimiter==':') {
   218             const char *s=u_skipWhitespace(delimiter+1);
   219             char *end;
   220             unsigned long value=strtoul(s, &end, 10);
   221             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
   222                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
   223                 exit(U_PARSE_ERROR);
   224             }
   225             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   226                 builder.setCC(c, (uint8_t)value);
   227             }
   228             continue;
   229         }
   230         if(*delimiter=='-') {
   231             if(*u_skipWhitespace(delimiter+1)!=0) {
   232                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
   233                 exit(U_PARSE_ERROR);
   234             }
   235             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   236                 builder.removeMapping(c);
   237             }
   238             continue;
   239         }
   240         if(*delimiter=='=' || *delimiter=='>') {
   241             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
   242             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
   243             if(errorCode.isFailure()) {
   244                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
   245                 exit(errorCode.reset());
   246             }
   247             UnicodeString mapping(FALSE, uchars, length);
   248             if(*delimiter=='=') {
   249                 if(rangeLength!=1) {
   250                     fprintf(stderr,
   251                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
   252                             line);
   253                     exit(U_PARSE_ERROR);
   254                 }
   255                 builder.setRoundTripMapping((UChar32)startCP, mapping);
   256             } else {
   257                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   258                     builder.setOneWayMapping(c, mapping);
   259                 }
   260             }
   261             continue;
   262         }
   263         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
   264         exit(U_PARSE_ERROR);
   265     }
   266 }
   268 #endif // !UCONFIG_NO_NORMALIZATION
   270 U_NAMESPACE_END
   272 /*
   273  * Hey, Emacs, please set the following:
   274  *
   275  * Local Variables:
   276  * indent-tabs-mode: nil
   277  * End:
   278  *
   279  */

mercurial