Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2009-2012, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: gennorm2.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2009nov25 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * This program reads text files that define Unicode normalization, |
michael@0 | 17 | * parses them, and builds a binary data file. |
michael@0 | 18 | */ |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/utypes.h" |
michael@0 | 21 | #include "n2builder.h" |
michael@0 | 22 | |
michael@0 | 23 | #include <stdio.h> |
michael@0 | 24 | #include <stdlib.h> |
michael@0 | 25 | #include <string.h> |
michael@0 | 26 | #include "unicode/errorcode.h" |
michael@0 | 27 | #include "unicode/localpointer.h" |
michael@0 | 28 | #include "unicode/putil.h" |
michael@0 | 29 | #include "unicode/uchar.h" |
michael@0 | 30 | #include "unicode/unistr.h" |
michael@0 | 31 | #include "charstr.h" |
michael@0 | 32 | #include "normalizer2impl.h" |
michael@0 | 33 | #include "toolutil.h" |
michael@0 | 34 | #include "uoptions.h" |
michael@0 | 35 | #include "uparse.h" |
michael@0 | 36 | |
michael@0 | 37 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 38 | #include "unewdata.h" |
michael@0 | 39 | #endif |
michael@0 | 40 | |
// Number of elements in an array, cast to int32_t.
// Only meaningful for real arrays: sizeof on a pointer would give a wrong count.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

U_NAMESPACE_BEGIN

// Tool-wide flags. main() overwrites both with the doesOccur state of the
// VERBOSE and COPYRIGHT command line options.
UBool beVerbose=FALSE, haveCopyright=TRUE;

// Declares LocalStdioFilePointer, a local-pointer wrapper for FILE that is
// parameterized with fclose as its close function.
// NOTE(review): presumably fclose() runs when the wrapper is destroyed -
// confirm against unicode/localpointer.h.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);

#if !UCONFIG_NO_NORMALIZATION
// Forward declaration so that main() can call parseFile() before its definition.
// Only available when normalization support is compiled in.
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
michael@0 | 52 | |
michael@0 | 53 | /* -------------------------------------------------------------------------- */ |
michael@0 | 54 | |
// Indexes into options[] below.
// The enumerator order must stay in sync with the order of the table entries,
// because main() looks up options by these indexes.
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    SOURCEDIR,
    OUTPUT_FILENAME,
    UNICODE_VERSION,
    OPT_FAST
};

// Command line option table passed to u_parseArgs() in main().
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    // '\1' is not a printable character; the usage text advertises only the
    // long form --fast. NOTE(review): presumably this is how an option without
    // a short name is declared - confirm against uoptions.h.
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
michael@0 | 76 | |
michael@0 | 77 | extern "C" int |
michael@0 | 78 | main(int argc, char* argv[]) { |
michael@0 | 79 | U_MAIN_INIT_ARGS(argc, argv); |
michael@0 | 80 | |
michael@0 | 81 | /* preset then read command line options */ |
michael@0 | 82 | options[SOURCEDIR].value=""; |
michael@0 | 83 | argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); |
michael@0 | 84 | |
michael@0 | 85 | /* error handling, printing usage message */ |
michael@0 | 86 | if(argc<0) { |
michael@0 | 87 | fprintf(stderr, |
michael@0 | 88 | "error in command line argument \"%s\"\n", |
michael@0 | 89 | argv[-argc]); |
michael@0 | 90 | } |
michael@0 | 91 | if(!options[OUTPUT_FILENAME].doesOccur) { |
michael@0 | 92 | argc=-1; |
michael@0 | 93 | } |
michael@0 | 94 | if( argc<2 || |
michael@0 | 95 | options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur |
michael@0 | 96 | ) { |
michael@0 | 97 | /* |
michael@0 | 98 | * Broken into chunks because the C89 standard says the minimum |
michael@0 | 99 | * required supported string length is 509 bytes. |
michael@0 | 100 | */ |
michael@0 | 101 | fprintf(stderr, |
michael@0 | 102 | "Usage: %s [-options] infiles+ -o outputfilename\n" |
michael@0 | 103 | "\n" |
michael@0 | 104 | "Reads the infiles with normalization data and\n" |
michael@0 | 105 | "creates a binary file (outputfilename) with the data.\n" |
michael@0 | 106 | "\n", |
michael@0 | 107 | argv[0]); |
michael@0 | 108 | fprintf(stderr, |
michael@0 | 109 | "Options:\n" |
michael@0 | 110 | "\t-h or -? or --help this usage text\n" |
michael@0 | 111 | "\t-v or --verbose verbose output\n" |
michael@0 | 112 | "\t-c or --copyright include a copyright notice\n" |
michael@0 | 113 | "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); |
michael@0 | 114 | fprintf(stderr, |
michael@0 | 115 | "\t-s or --sourcedir source directory, followed by the path\n" |
michael@0 | 116 | "\t-o or --output output filename\n"); |
michael@0 | 117 | fprintf(stderr, |
michael@0 | 118 | "\t --fast optimize the .nrm file for fast normalization,\n" |
michael@0 | 119 | "\t which might increase its size (Writes fully decomposed\n" |
michael@0 | 120 | "\t regular mappings instead of delta mappings.\n" |
michael@0 | 121 | "\t You should measure the runtime speed to make sure that\n" |
michael@0 | 122 | "\t this is a good trade-off.)\n"); |
michael@0 | 123 | return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; |
michael@0 | 124 | } |
michael@0 | 125 | |
michael@0 | 126 | beVerbose=options[VERBOSE].doesOccur; |
michael@0 | 127 | haveCopyright=options[COPYRIGHT].doesOccur; |
michael@0 | 128 | |
michael@0 | 129 | IcuToolErrorCode errorCode("gennorm2/main()"); |
michael@0 | 130 | |
michael@0 | 131 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 132 | |
michael@0 | 133 | fprintf(stderr, |
michael@0 | 134 | "gennorm2 writes a dummy binary data file " |
michael@0 | 135 | "because UCONFIG_NO_NORMALIZATION is set, \n" |
michael@0 | 136 | "see icu/source/common/unicode/uconfig.h\n"); |
michael@0 | 137 | udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); |
michael@0 | 138 | // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. |
michael@0 | 139 | // return U_UNSUPPORTED_ERROR; |
michael@0 | 140 | return 0; |
michael@0 | 141 | |
michael@0 | 142 | #else |
michael@0 | 143 | |
michael@0 | 144 | LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode)); |
michael@0 | 145 | errorCode.assertSuccess(); |
michael@0 | 146 | |
michael@0 | 147 | if(options[UNICODE_VERSION].doesOccur) { |
michael@0 | 148 | builder->setUnicodeVersion(options[UNICODE_VERSION].value); |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | if(options[OPT_FAST].doesOccur) { |
michael@0 | 152 | builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); |
michael@0 | 153 | } |
michael@0 | 154 | |
michael@0 | 155 | // prepare the filename beginning with the source dir |
michael@0 | 156 | CharString filename(options[SOURCEDIR].value, errorCode); |
michael@0 | 157 | int32_t pathLength=filename.length(); |
michael@0 | 158 | if( pathLength>0 && |
michael@0 | 159 | filename[pathLength-1]!=U_FILE_SEP_CHAR && |
michael@0 | 160 | filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR |
michael@0 | 161 | ) { |
michael@0 | 162 | filename.append(U_FILE_SEP_CHAR, errorCode); |
michael@0 | 163 | pathLength=filename.length(); |
michael@0 | 164 | } |
michael@0 | 165 | |
michael@0 | 166 | for(int i=1; i<argc; ++i) { |
michael@0 | 167 | printf("gennorm2: processing %s\n", argv[i]); |
michael@0 | 168 | filename.append(argv[i], errorCode); |
michael@0 | 169 | LocalStdioFilePointer f(fopen(filename.data(), "r")); |
michael@0 | 170 | if(f==NULL) { |
michael@0 | 171 | fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); |
michael@0 | 172 | exit(U_FILE_ACCESS_ERROR); |
michael@0 | 173 | } |
michael@0 | 174 | builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); |
michael@0 | 175 | parseFile(f.getAlias(), *builder); |
michael@0 | 176 | filename.truncate(pathLength); |
michael@0 | 177 | } |
michael@0 | 178 | |
michael@0 | 179 | builder->writeBinaryFile(options[OUTPUT_FILENAME].value); |
michael@0 | 180 | |
michael@0 | 181 | return errorCode.get(); |
michael@0 | 182 | |
michael@0 | 183 | #endif |
michael@0 | 184 | } |
michael@0 | 185 | |
michael@0 | 186 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 187 | |
michael@0 | 188 | void parseFile(FILE *f, Normalizer2DataBuilder &builder) { |
michael@0 | 189 | IcuToolErrorCode errorCode("gennorm2/parseFile()"); |
michael@0 | 190 | char line[300]; |
michael@0 | 191 | uint32_t startCP, endCP; |
michael@0 | 192 | while(NULL!=fgets(line, (int)sizeof(line), f)) { |
michael@0 | 193 | char *comment=(char *)strchr(line, '#'); |
michael@0 | 194 | if(comment!=NULL) { |
michael@0 | 195 | *comment=0; |
michael@0 | 196 | } |
michael@0 | 197 | u_rtrim(line); |
michael@0 | 198 | if(line[0]==0) { |
michael@0 | 199 | continue; // skip empty and comment-only lines |
michael@0 | 200 | } |
michael@0 | 201 | if(line[0]=='*') { |
michael@0 | 202 | const char *s=u_skipWhitespace(line+1); |
michael@0 | 203 | if(0==strncmp(s, "Unicode", 7)) { |
michael@0 | 204 | s=u_skipWhitespace(s+7); |
michael@0 | 205 | builder.setUnicodeVersion(s); |
michael@0 | 206 | } |
michael@0 | 207 | continue; // reserved syntax |
michael@0 | 208 | } |
michael@0 | 209 | const char *delimiter; |
michael@0 | 210 | int32_t rangeLength= |
michael@0 | 211 | u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); |
michael@0 | 212 | if(errorCode.isFailure()) { |
michael@0 | 213 | fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); |
michael@0 | 214 | exit(errorCode.reset()); |
michael@0 | 215 | } |
michael@0 | 216 | delimiter=u_skipWhitespace(delimiter); |
michael@0 | 217 | if(*delimiter==':') { |
michael@0 | 218 | const char *s=u_skipWhitespace(delimiter+1); |
michael@0 | 219 | char *end; |
michael@0 | 220 | unsigned long value=strtoul(s, &end, 10); |
michael@0 | 221 | if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { |
michael@0 | 222 | fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); |
michael@0 | 223 | exit(U_PARSE_ERROR); |
michael@0 | 224 | } |
michael@0 | 225 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
michael@0 | 226 | builder.setCC(c, (uint8_t)value); |
michael@0 | 227 | } |
michael@0 | 228 | continue; |
michael@0 | 229 | } |
michael@0 | 230 | if(*delimiter=='-') { |
michael@0 | 231 | if(*u_skipWhitespace(delimiter+1)!=0) { |
michael@0 | 232 | fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); |
michael@0 | 233 | exit(U_PARSE_ERROR); |
michael@0 | 234 | } |
michael@0 | 235 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
michael@0 | 236 | builder.removeMapping(c); |
michael@0 | 237 | } |
michael@0 | 238 | continue; |
michael@0 | 239 | } |
michael@0 | 240 | if(*delimiter=='=' || *delimiter=='>') { |
michael@0 | 241 | UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; |
michael@0 | 242 | int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode); |
michael@0 | 243 | if(errorCode.isFailure()) { |
michael@0 | 244 | fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); |
michael@0 | 245 | exit(errorCode.reset()); |
michael@0 | 246 | } |
michael@0 | 247 | UnicodeString mapping(FALSE, uchars, length); |
michael@0 | 248 | if(*delimiter=='=') { |
michael@0 | 249 | if(rangeLength!=1) { |
michael@0 | 250 | fprintf(stderr, |
michael@0 | 251 | "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", |
michael@0 | 252 | line); |
michael@0 | 253 | exit(U_PARSE_ERROR); |
michael@0 | 254 | } |
michael@0 | 255 | builder.setRoundTripMapping((UChar32)startCP, mapping); |
michael@0 | 256 | } else { |
michael@0 | 257 | for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
michael@0 | 258 | builder.setOneWayMapping(c, mapping); |
michael@0 | 259 | } |
michael@0 | 260 | } |
michael@0 | 261 | continue; |
michael@0 | 262 | } |
michael@0 | 263 | fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); |
michael@0 | 264 | exit(U_PARSE_ERROR); |
michael@0 | 265 | } |
michael@0 | 266 | } |
michael@0 | 267 | |
michael@0 | 268 | #endif // !UCONFIG_NO_NORMALIZATION |
michael@0 | 269 | |
michael@0 | 270 | U_NAMESPACE_END |
michael@0 | 271 | |
michael@0 | 272 | /* |
michael@0 | 273 | * Hey, Emacs, please set the following: |
michael@0 | 274 | * |
michael@0 | 275 | * Local Variables: |
michael@0 | 276 | * indent-tabs-mode: nil |
michael@0 | 277 | * End: |
michael@0 | 278 | * |
michael@0 | 279 | */ |