intl/icu/source/tools/gennorm2/gennorm2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/gennorm2/gennorm2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,279 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2009-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  gennorm2.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009nov25
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   This program reads text files that define Unicode normalization,
    1.20 +*   parses them, and builds a binary data file.
    1.21 +*/
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +#include "n2builder.h"
    1.25 +
    1.26 +#include <stdio.h>
    1.27 +#include <stdlib.h>
    1.28 +#include <string.h>
    1.29 +#include "unicode/errorcode.h"
    1.30 +#include "unicode/localpointer.h"
    1.31 +#include "unicode/putil.h"
    1.32 +#include "unicode/uchar.h"
    1.33 +#include "unicode/unistr.h"
    1.34 +#include "charstr.h"
    1.35 +#include "normalizer2impl.h"
    1.36 +#include "toolutil.h"
    1.37 +#include "uoptions.h"
    1.38 +#include "uparse.h"
    1.39 +
    1.40 +#if UCONFIG_NO_NORMALIZATION
    1.41 +#include "unewdata.h"
    1.42 +#endif
    1.43 +
    1.44 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))  /* element count of a real array (not a pointer), cast to int32_t */
    1.45 +
    1.46 +U_NAMESPACE_BEGIN
    1.47 +
    1.48 +UBool beVerbose=FALSE, haveCopyright=TRUE;  // both are overwritten in main() from the --verbose and --copyright options
    1.49 +// LocalStdioFilePointer holds the fopen() result in main(); presumably it calls fclose() on destruction -- confirm in unicode/localpointer.h.
    1.50 +U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
    1.51 +// Forward declaration: the definition follows main() and is compiled only when normalization is enabled.
    1.52 +#if !UCONFIG_NO_NORMALIZATION
    1.53 +void parseFile(FILE *f, Normalizer2DataBuilder &builder);
    1.54 +#endif
    1.55 +
    1.56 +/* -------------------------------------------------------------------------- */
    1.57 +
    1.58 +enum {  // indexes into options[]; the order here must match the order of its initializer
    1.59 +    HELP_H,  // help (usage text: -h or -? or --help)
    1.60 +    HELP_QUESTION_MARK,  // help, alternate spelling
    1.61 +    VERBOSE,  // -v or --verbose
    1.62 +    COPYRIGHT,  // -c or --copyright
    1.63 +    SOURCEDIR,  // -s or --sourcedir, followed by the path
    1.64 +    OUTPUT_FILENAME,  // -o or --output; main() prints the usage text when it is missing
    1.65 +    UNICODE_VERSION,  // -u or --unicode, followed by the version
    1.66 +    OPT_FAST  // --fast
    1.67 +};
    1.68 +
    1.69 +static UOption options[]={  // command-line option table; entries are indexed by the enum constants, so the order must match
    1.70 +    UOPTION_HELP_H,
    1.71 +    UOPTION_HELP_QUESTION_MARK,
    1.72 +    UOPTION_VERBOSE,
    1.73 +    UOPTION_COPYRIGHT,
    1.74 +    UOPTION_SOURCEDIR,  // its value is preset to "" in main() before the arguments are parsed
    1.75 +    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),  // main() prints the usage text when this option does not occur
    1.76 +    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),  // value is passed to Normalizer2DataBuilder::setUnicodeVersion()
    1.77 +    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)  // selects OPTIMIZE_FAST; the usage text lists only the long form --fast
    1.78 +};
    1.79 +
    1.80 +// Tool entry point: parses the command line and, unless normalization is configured out,
    1.81 +// feeds each input file to parseFile() and writes the binary data file.
    1.82 +extern "C" int
    1.83 +main(int argc, char* argv[]) {
    1.84 +    U_MAIN_INIT_ARGS(argc, argv);
    1.85 +
    1.86 +    // The source directory defaults to the empty string.
    1.87 +    options[SOURCEDIR].value="";
    1.88 +    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
    1.89 +
    1.90 +    // A negative argc is handled as a parse error; -argc indexes the bad argument.
    1.91 +    if(argc<0) {
    1.92 +        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    1.93 +    }
    1.94 +    // The output filename is mandatory: without it, take the usage path below.
    1.95 +    if(!options[OUTPUT_FILENAME].doesOccur) {
    1.96 +        argc=-1;
    1.97 +    }
    1.98 +    const bool helpRequested=options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur;
    1.99 +    if(helpRequested || argc<2) {
   1.100 +        // Printed in chunks: C89 only requires support for string literals of up to 509 bytes.
   1.101 +        fprintf(stderr,
   1.102 +            "Usage: %s [-options] infiles+ -o outputfilename\n"
   1.103 +            "\n"
   1.104 +            "Reads the infiles with normalization data and\n"
   1.105 +            "creates a binary file (outputfilename) with the data.\n"
   1.106 +            "\n",
   1.107 +            argv[0]);
   1.108 +        fprintf(stderr,
   1.109 +            "Options:\n"
   1.110 +            "\t-h or -? or --help  this usage text\n"
   1.111 +            "\t-v or --verbose     verbose output\n"
   1.112 +            "\t-c or --copyright   include a copyright notice\n"
   1.113 +            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
   1.114 +        fprintf(stderr,
   1.115 +            "\t-s or --sourcedir   source directory, followed by the path\n"
   1.116 +            "\t-o or --output      output filename\n");
   1.117 +        fprintf(stderr,
   1.118 +            "\t      --fast        optimize the .nrm file for fast normalization,\n"
   1.119 +            "\t                    which might increase its size  (Writes fully decomposed\n"
   1.120 +            "\t                    regular mappings instead of delta mappings.\n"
   1.121 +            "\t                    You should measure the runtime speed to make sure that\n"
   1.122 +            "\t                    this is a good trade-off.)\n");
   1.123 +        if(argc<0) {
   1.124 +            return U_ILLEGAL_ARGUMENT_ERROR;
   1.125 +        }
   1.126 +        return U_ZERO_ERROR;
   1.127 +    }
   1.128 +
   1.129 +    haveCopyright=options[COPYRIGHT].doesOccur;
   1.130 +    beVerbose=options[VERBOSE].doesOccur;
   1.131 +
   1.132 +    IcuToolErrorCode errorCode("gennorm2/main()");
   1.133 +
   1.134 +#if UCONFIG_NO_NORMALIZATION
   1.135 +
   1.136 +    // Normalization is configured out: write a dummy data file and report success,
   1.137 +    // because that is the expected outcome here (deliberately not U_UNSUPPORTED_ERROR).
   1.138 +    fprintf(stderr,
   1.139 +        "gennorm2 writes a dummy binary data file "
   1.140 +        "because UCONFIG_NO_NORMALIZATION is set, \n"
   1.141 +        "see icu/source/common/unicode/uconfig.h\n");
   1.142 +    udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
   1.143 +    return 0;
   1.144 +
   1.145 +#else
   1.146 +
   1.147 +    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
   1.148 +    errorCode.assertSuccess();
   1.149 +
   1.150 +    if(options[UNICODE_VERSION].doesOccur) {
   1.151 +        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
   1.152 +    }
   1.153 +    if(options[OPT_FAST].doesOccur) {
   1.154 +        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
   1.155 +    }
   1.156 +
   1.157 +    // Every input path starts with the source directory; a separator is appended
   1.158 +    // when the directory is non-empty and does not already end with one.
   1.159 +    CharString path(options[SOURCEDIR].value, errorCode);
   1.160 +    int32_t dirLength=path.length();
   1.161 +    if(dirLength>0) {
   1.162 +        const char lastChar=path[dirLength-1];
   1.163 +        if(lastChar!=U_FILE_SEP_CHAR && lastChar!=U_FILE_ALT_SEP_CHAR) {
   1.164 +            path.append(U_FILE_SEP_CHAR, errorCode);
   1.165 +            dirLength=path.length();
   1.166 +        }
   1.167 +    }
   1.168 +
   1.169 +    for(int argIndex=1; argIndex<argc; ++argIndex) {
   1.170 +        const char *inputName=argv[argIndex];
   1.171 +        printf("gennorm2: processing %s\n", inputName);
   1.172 +        path.append(inputName, errorCode);
   1.173 +        LocalStdioFilePointer input(fopen(path.data(), "r"));
   1.174 +        if(input==NULL) {
   1.175 +            fprintf(stderr, "gennorm2 error: unable to open %s\n", path.data());
   1.176 +            exit(U_FILE_ACCESS_ERROR);
   1.177 +        }
   1.178 +        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
   1.179 +        parseFile(input.getAlias(), *builder);
   1.180 +        path.truncate(dirLength);  // back to the directory prefix for the next input file
   1.181 +    }
   1.182 +
   1.183 +    builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
   1.184 +    return errorCode.get();
   1.185 +
   1.186 +#endif
   1.187 +}
   1.188 +
   1.189 +#if !UCONFIG_NO_NORMALIZATION
   1.190 +
   1.191 +// Reads normalization data lines from f and applies each one to builder.
   1.192 +// Handles '#' comments, "* Unicode <version>", "range:ccc", "range-" (remove mapping),
   1.193 +// "cp=mapping" (round-trip) and "range>mapping" (one-way); any error calls exit().
   1.194 +void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
   1.195 +    IcuToolErrorCode errorCode("gennorm2/parseFile()");
   1.196 +    char line[300];
   1.197 +    uint32_t startCP, endCP;
   1.198 +    while(NULL!=fgets(line, (int)sizeof(line), f)) {
   1.199 +        line[strcspn(line, "#")]=0;  // cut the line at the first '#', if there is one
   1.200 +        u_rtrim(line);
   1.201 +        if(line[0]==0) {
   1.202 +            continue;  // skip empty and comment-only lines
   1.203 +        }
   1.204 +        if(line[0]=='*') {
   1.205 +            // Reserved syntax: only "* Unicode <version>" is acted upon, other '*' lines are skipped.
   1.206 +            const char *keyword=u_skipWhitespace(line+1);
   1.207 +            if(0==strncmp(keyword, "Unicode", 7)) {
   1.208 +                builder.setUnicodeVersion(u_skipWhitespace(keyword+7));
   1.209 +            }
   1.210 +            continue;
   1.211 +        }
   1.212 +        const char *delimiter;
   1.213 +        int32_t rangeLength=u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
   1.214 +        if(errorCode.isFailure()) {
   1.215 +            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
   1.216 +            exit(errorCode.reset());
   1.217 +        }
   1.218 +        delimiter=u_skipWhitespace(delimiter);
   1.219 +        if(*delimiter==':') {
   1.220 +            const char *s=u_skipWhitespace(delimiter+1);
   1.221 +            char *end;
   1.222 +            unsigned long value=strtoul(s, &end, 10);
   1.223 +            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {  // no digits, trailing text, or ccc of 0xff and above
   1.224 +                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
   1.225 +                exit(U_PARSE_ERROR);
   1.226 +            }
   1.227 +            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   1.228 +                builder.setCC(c, (uint8_t)value);
   1.229 +            }
   1.230 +            continue;
   1.231 +        }
   1.232 +        if(*delimiter=='-') {
   1.233 +            if(*u_skipWhitespace(delimiter+1)!=0) {
   1.234 +                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
   1.235 +                exit(U_PARSE_ERROR);
   1.236 +            }
   1.237 +            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   1.238 +                builder.removeMapping(c);
   1.239 +            }
   1.240 +            continue;
   1.241 +        }
   1.242 +        if(*delimiter!='=' && *delimiter!='>') {
   1.243 +            fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
   1.244 +            exit(U_PARSE_ERROR);
   1.245 +        }
   1.246 +        UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
   1.247 +        int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
   1.248 +        if(errorCode.isFailure()) {
   1.249 +            fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
   1.250 +            exit(errorCode.reset());
   1.251 +        }
   1.252 +        UnicodeString mapping(FALSE, uchars, length);
   1.253 +        if(*delimiter=='>') {
   1.254 +            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
   1.255 +                builder.setOneWayMapping(c, mapping);
   1.256 +            }
   1.257 +        } else if(rangeLength==1) {
   1.258 +            builder.setRoundTripMapping((UChar32)startCP, mapping);
   1.259 +        } else {
   1.260 +            fprintf(stderr, "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", line);
   1.261 +            exit(U_PARSE_ERROR);
   1.262 +        }
   1.263 +    }
   1.264 +    // fgets() returns NULL for end-of-file and for read errors alike; only a read error is fatal.
   1.265 +    if(ferror(f)) {
   1.266 +        fprintf(stderr, "gennorm2 error: reading input file\n");
   1.267 +        exit(U_FILE_ACCESS_ERROR);
   1.268 +    }
   1.269 +}
   1.270 +
   1.271 +#endif // !UCONFIG_NO_NORMALIZATION
   1.272 +
   1.273 +U_NAMESPACE_END
   1.274 +
   1.275 +/*
   1.276 + * Hey, Emacs, please set the following:
   1.277 + *
   1.278 + * Local Variables:
   1.279 + * indent-tabs-mode: nil
   1.280 + * End:
   1.281 + *
   1.282 + */

mercurial