michael@0: /*
michael@0: *******************************************************************************
michael@0: *
michael@0: *   Copyright (C) 2009-2012, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: *
michael@0: *******************************************************************************
michael@0: *   file name:  gennorm2.cpp
michael@0: *   encoding:   US-ASCII
michael@0: *   tab size:   8 (not used)
michael@0: *   indentation:4
michael@0: *
michael@0: *   created on: 2009nov25
michael@0: *   created by: Markus W. Scherer
michael@0: *
michael@0: *   This program reads text files that define Unicode normalization,
michael@0: *   parses them, and builds a binary data file.
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: #include "n2builder.h"
michael@0: 
michael@0: #include <stdio.h>
michael@0: #include <stdlib.h>
michael@0: #include <string.h>
michael@0: #include "unicode/errorcode.h"
michael@0: #include "unicode/localpointer.h"
michael@0: #include "unicode/putil.h"
michael@0: #include "unicode/uchar.h"
michael@0: #include "unicode/unistr.h"
michael@0: #include "charstr.h"
michael@0: #include "normalizer2impl.h"
michael@0: #include "toolutil.h"
michael@0: #include "uoptions.h"
michael@0: #include "uparse.h"
michael@0: 
michael@0: #if UCONFIG_NO_NORMALIZATION
michael@0: #include "unewdata.h"
michael@0: #endif
michael@0: 
michael@0: /* Number of elements of a true array (never a pointer), as an int32_t. The whole expansion is parenthesized so that it behaves as one operand in any expression. */
michael@0: #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: // Tool-wide flags; main() overwrites both from the command line options.
michael@0: UBool beVerbose=FALSE;
michael@0: UBool haveCopyright=TRUE;
michael@0: 
michael@0: // LocalStdioFilePointer: local-pointer type for a stdio FILE, with fclose as its close function.
michael@0: U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
michael@0: 
michael@0: #if !UCONFIG_NO_NORMALIZATION
michael@0: // Forward declaration; main() calls this once per input file.
michael@0: void parseFile(FILE *f, Normalizer2DataBuilder &builder);
michael@0: #endif
michael@0: 
michael@0: /* -------------------------------------------------------------------------- */
michael@0: 
michael@0: enum {  // indexes into options[]; the order here must match the order of the options[] initializer
michael@0:     HELP_H,              // usage text lists this as -h / --help
michael@0:     HELP_QUESTION_MARK,  // usage text lists this as -?
michael@0:     VERBOSE,             // -v or --verbose
michael@0:     COPYRIGHT,           // -c or --copyright
michael@0:     SOURCEDIR,           // -s or --sourcedir; main() presets its value to ""
michael@0:     OUTPUT_FILENAME,     // -o or --output; main() prints the usage text when it is absent
michael@0:     UNICODE_VERSION,     // -u or --unicode
michael@0:     OPT_FAST             // --fast
michael@0: };
michael@0: 
michael@0: static UOption options[]={  // command-line option table passed to u_parseArgs() in main(); entries are addressed through the enum constants HELP_H..OPT_FAST
michael@0:     UOPTION_HELP_H,
michael@0:     UOPTION_HELP_QUESTION_MARK,
michael@0:     UOPTION_VERBOSE,
michael@0:     UOPTION_COPYRIGHT,
michael@0:     UOPTION_SOURCEDIR,
michael@0:     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),   // OUTPUT_FILENAME: name of the binary file to write
michael@0:     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),  // UNICODE_VERSION: version string handed to the builder
michael@0:     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)           // OPT_FAST: the usage text shows only the long form; NOTE(review): '\1' presumably means "no usable short name" - confirm in uoptions.h
michael@0: };
michael@0: 
michael@0: extern "C" int
michael@0: main(int argc, char* argv[]) {
michael@0:     U_MAIN_INIT_ARGS(argc, argv);
michael@0: 
michael@0:     /* preset then read command line options */
michael@0:     options[SOURCEDIR].value="";
michael@0:     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
michael@0: 
michael@0:     /* error handling, printing usage message */
michael@0:     if(argc<0) {
michael@0:         fprintf(stderr,
michael@0:             "error in command line argument \"%s\"\n",
michael@0:             argv[-argc]);
michael@0:     }
michael@0:     if(!options[OUTPUT_FILENAME].doesOccur) {
michael@0:         argc=-1;
michael@0:     }
michael@0:     if( argc<2 ||
michael@0:         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
michael@0:     ) {
michael@0:         /*
michael@0:          * Broken into chunks because the C89 standard says the minimum
michael@0:          * required supported string length is 509 bytes.
michael@0:          */
michael@0:         fprintf(stderr,
michael@0:             "Usage: %s [-options] infiles+ -o outputfilename\n"
michael@0:             "\n"
michael@0:             "Reads the infiles with normalization data and\n"
michael@0:             "creates a binary file (outputfilename) with the data.\n"
michael@0:             "\n",
michael@0:             argv[0]);
michael@0:         fprintf(stderr,
michael@0:             "Options:\n"
michael@0:             "\t-h or -? or --help  this usage text\n"
michael@0:             "\t-v or --verbose     verbose output\n"
michael@0:             "\t-c or --copyright   include a copyright notice\n"
michael@0:             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
michael@0:         fprintf(stderr,
michael@0:             "\t-s or --sourcedir   source directory, followed by the path\n"
michael@0:             "\t-o or --output      output filename\n");
michael@0:         fprintf(stderr,
michael@0:             "\t      --fast        optimize the .nrm file for fast normalization,\n"
michael@0:             "\t                    which might increase its size  (Writes fully decomposed\n"
michael@0:             "\t                    regular mappings instead of delta mappings.\n"
michael@0:             "\t                    You should measure the runtime speed to make sure that\n"
michael@0:             "\t                    this is a good trade-off.)\n");
michael@0:         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
michael@0:     }
michael@0: 
michael@0:     beVerbose=options[VERBOSE].doesOccur;
michael@0:     haveCopyright=options[COPYRIGHT].doesOccur;
michael@0: 
michael@0:     IcuToolErrorCode errorCode("gennorm2/main()");
michael@0: 
michael@0: #if UCONFIG_NO_NORMALIZATION
michael@0: 
michael@0:     fprintf(stderr,
michael@0:         "gennorm2 writes a dummy binary data file "
michael@0:         "because UCONFIG_NO_NORMALIZATION is set, \n"
michael@0:         "see icu/source/common/unicode/uconfig.h\n");
michael@0:     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
michael@0:     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
michael@0:     // return U_UNSUPPORTED_ERROR;
michael@0:     return 0;
michael@0: 
michael@0: #else
michael@0: 
michael@0:     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
michael@0:     errorCode.assertSuccess();
michael@0: 
michael@0:     if(options[UNICODE_VERSION].doesOccur) {
michael@0:         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
michael@0:     }
michael@0: 
michael@0:     if(options[OPT_FAST].doesOccur) {
michael@0:         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
michael@0:     }
michael@0: 
michael@0:     // prepare the filename beginning with the source dir
michael@0:     CharString filename(options[SOURCEDIR].value, errorCode);
michael@0:     int32_t pathLength=filename.length();
michael@0:     if( pathLength>0 &&
michael@0:         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
michael@0:         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
michael@0:     ) {
michael@0:         filename.append(U_FILE_SEP_CHAR, errorCode);
michael@0:         pathLength=filename.length();
michael@0:     }
michael@0: 
michael@0:     for(int i=1; i<argc; ++i) {
michael@0:         printf("gennorm2: processing %s\n", argv[i]);
michael@0:         filename.append(argv[i], errorCode);
michael@0:         LocalStdioFilePointer f(fopen(filename.data(), "r"));
michael@0:         if(f==NULL) {
michael@0:             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
michael@0:             exit(U_FILE_ACCESS_ERROR);
michael@0:         }
michael@0:         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
michael@0:         parseFile(f.getAlias(), *builder);
michael@0:         filename.truncate(pathLength);
michael@0:     }
michael@0: 
michael@0:     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
michael@0: 
michael@0:     return errorCode.get();
michael@0: 
michael@0: #endif
michael@0: }
michael@0: 
michael@0: #if !UCONFIG_NO_NORMALIZATION
michael@0: 
michael@0: void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
michael@0:     IcuToolErrorCode errorCode("gennorm2/parseFile()");
michael@0:     char line[300];
michael@0:     uint32_t startCP, endCP;
michael@0:     while(NULL!=fgets(line, (int)sizeof(line), f)) {
michael@0:         char *comment=(char *)strchr(line, '#');
michael@0:         if(comment!=NULL) {
michael@0:             *comment=0;
michael@0:         }
michael@0:         u_rtrim(line);
michael@0:         if(line[0]==0) {
michael@0:             continue;  // skip empty and comment-only lines
michael@0:         }
michael@0:         if(line[0]=='*') {
michael@0:             const char *s=u_skipWhitespace(line+1);
michael@0:             if(0==strncmp(s, "Unicode", 7)) {
michael@0:                 s=u_skipWhitespace(s+7);
michael@0:                 builder.setUnicodeVersion(s);
michael@0:             }
michael@0:             continue;  // reserved syntax
michael@0:         }
michael@0:         const char *delimiter;
michael@0:         int32_t rangeLength=
michael@0:             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
michael@0:         if(errorCode.isFailure()) {
michael@0:             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
michael@0:             exit(errorCode.reset());
michael@0:         }
michael@0:         delimiter=u_skipWhitespace(delimiter);
michael@0:         if(*delimiter==':') {
michael@0:             const char *s=u_skipWhitespace(delimiter+1);
michael@0:             char *end;
michael@0:             unsigned long value=strtoul(s, &end, 10);
michael@0:             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
michael@0:                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
michael@0:                 exit(U_PARSE_ERROR);
michael@0:             }
michael@0:             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0:                 builder.setCC(c, (uint8_t)value);
michael@0:             }
michael@0:             continue;
michael@0:         }
michael@0:         if(*delimiter=='-') {
michael@0:             if(*u_skipWhitespace(delimiter+1)!=0) {
michael@0:                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
michael@0:                 exit(U_PARSE_ERROR);
michael@0:             }
michael@0:             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0:                 builder.removeMapping(c);
michael@0:             }
michael@0:             continue;
michael@0:         }
michael@0:         if(*delimiter=='=' || *delimiter=='>') {
michael@0:             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
michael@0:             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
michael@0:             if(errorCode.isFailure()) {
michael@0:                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
michael@0:                 exit(errorCode.reset());
michael@0:             }
michael@0:             UnicodeString mapping(FALSE, uchars, length);
michael@0:             if(*delimiter=='=') {
michael@0:                 if(rangeLength!=1) {
michael@0:                     fprintf(stderr,
michael@0:                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
michael@0:                             line);
michael@0:                     exit(U_PARSE_ERROR);
michael@0:                 }
michael@0:                 builder.setRoundTripMapping((UChar32)startCP, mapping);
michael@0:             } else {
michael@0:                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0:                     builder.setOneWayMapping(c, mapping);
michael@0:                 }
michael@0:             }
michael@0:             continue;
michael@0:         }
michael@0:         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
michael@0:         exit(U_PARSE_ERROR);
michael@0:     }
michael@0: }
michael@0: 
michael@0: #endif // !UCONFIG_NO_NORMALIZATION
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: /*
michael@0:  * Hey, Emacs, please set the following:
michael@0:  *
michael@0:  * Local Variables:
michael@0:  * indent-tabs-mode: nil
michael@0:  * End:
michael@0:  *
michael@0:  */