intl/icu/source/tools/gennorm2/gennorm2.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2009-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: gennorm2.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2009nov25
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * This program reads text files that define Unicode normalization,
michael@0 17 * parses them, and builds a binary data file.
michael@0 18 */
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21 #include "n2builder.h"
michael@0 22
michael@0 23 #include <stdio.h>
michael@0 24 #include <stdlib.h>
michael@0 25 #include <string.h>
michael@0 26 #include "unicode/errorcode.h"
michael@0 27 #include "unicode/localpointer.h"
michael@0 28 #include "unicode/putil.h"
michael@0 29 #include "unicode/uchar.h"
michael@0 30 #include "unicode/unistr.h"
michael@0 31 #include "charstr.h"
michael@0 32 #include "normalizer2impl.h"
michael@0 33 #include "toolutil.h"
michael@0 34 #include "uoptions.h"
michael@0 35 #include "uparse.h"
michael@0 36
michael@0 37 #if UCONFIG_NO_NORMALIZATION
michael@0 38 #include "unewdata.h"
michael@0 39 #endif
michael@0 40
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 42
michael@0 43 U_NAMESPACE_BEGIN
michael@0 44
michael@0 45 UBool beVerbose=FALSE, haveCopyright=TRUE;
michael@0 46
michael@0 47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
michael@0 48
michael@0 49 #if !UCONFIG_NO_NORMALIZATION
michael@0 50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
michael@0 51 #endif
michael@0 52
michael@0 53 /* -------------------------------------------------------------------------- */
michael@0 54
michael@0 55 enum {
michael@0 56 HELP_H,
michael@0 57 HELP_QUESTION_MARK,
michael@0 58 VERBOSE,
michael@0 59 COPYRIGHT,
michael@0 60 SOURCEDIR,
michael@0 61 OUTPUT_FILENAME,
michael@0 62 UNICODE_VERSION,
michael@0 63 OPT_FAST
michael@0 64 };
michael@0 65
michael@0 66 static UOption options[]={
michael@0 67 UOPTION_HELP_H,
michael@0 68 UOPTION_HELP_QUESTION_MARK,
michael@0 69 UOPTION_VERBOSE,
michael@0 70 UOPTION_COPYRIGHT,
michael@0 71 UOPTION_SOURCEDIR,
michael@0 72 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
michael@0 73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
michael@0 74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
michael@0 75 };
michael@0 76
michael@0 77 extern "C" int
michael@0 78 main(int argc, char* argv[]) {
michael@0 79 U_MAIN_INIT_ARGS(argc, argv);
michael@0 80
michael@0 81 /* preset then read command line options */
michael@0 82 options[SOURCEDIR].value="";
michael@0 83 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
michael@0 84
michael@0 85 /* error handling, printing usage message */
michael@0 86 if(argc<0) {
michael@0 87 fprintf(stderr,
michael@0 88 "error in command line argument \"%s\"\n",
michael@0 89 argv[-argc]);
michael@0 90 }
michael@0 91 if(!options[OUTPUT_FILENAME].doesOccur) {
michael@0 92 argc=-1;
michael@0 93 }
michael@0 94 if( argc<2 ||
michael@0 95 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
michael@0 96 ) {
michael@0 97 /*
michael@0 98 * Broken into chunks because the C89 standard says the minimum
michael@0 99 * required supported string length is 509 bytes.
michael@0 100 */
michael@0 101 fprintf(stderr,
michael@0 102 "Usage: %s [-options] infiles+ -o outputfilename\n"
michael@0 103 "\n"
michael@0 104 "Reads the infiles with normalization data and\n"
michael@0 105 "creates a binary file (outputfilename) with the data.\n"
michael@0 106 "\n",
michael@0 107 argv[0]);
michael@0 108 fprintf(stderr,
michael@0 109 "Options:\n"
michael@0 110 "\t-h or -? or --help this usage text\n"
michael@0 111 "\t-v or --verbose verbose output\n"
michael@0 112 "\t-c or --copyright include a copyright notice\n"
michael@0 113 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
michael@0 114 fprintf(stderr,
michael@0 115 "\t-s or --sourcedir source directory, followed by the path\n"
michael@0 116 "\t-o or --output output filename\n");
michael@0 117 fprintf(stderr,
michael@0 118 "\t --fast optimize the .nrm file for fast normalization,\n"
michael@0 119 "\t which might increase its size (Writes fully decomposed\n"
michael@0 120 "\t regular mappings instead of delta mappings.\n"
michael@0 121 "\t You should measure the runtime speed to make sure that\n"
michael@0 122 "\t this is a good trade-off.)\n");
michael@0 123 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
michael@0 124 }
michael@0 125
michael@0 126 beVerbose=options[VERBOSE].doesOccur;
michael@0 127 haveCopyright=options[COPYRIGHT].doesOccur;
michael@0 128
michael@0 129 IcuToolErrorCode errorCode("gennorm2/main()");
michael@0 130
michael@0 131 #if UCONFIG_NO_NORMALIZATION
michael@0 132
michael@0 133 fprintf(stderr,
michael@0 134 "gennorm2 writes a dummy binary data file "
michael@0 135 "because UCONFIG_NO_NORMALIZATION is set, \n"
michael@0 136 "see icu/source/common/unicode/uconfig.h\n");
michael@0 137 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
michael@0 138 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
michael@0 139 // return U_UNSUPPORTED_ERROR;
michael@0 140 return 0;
michael@0 141
michael@0 142 #else
michael@0 143
michael@0 144 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
michael@0 145 errorCode.assertSuccess();
michael@0 146
michael@0 147 if(options[UNICODE_VERSION].doesOccur) {
michael@0 148 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
michael@0 149 }
michael@0 150
michael@0 151 if(options[OPT_FAST].doesOccur) {
michael@0 152 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
michael@0 153 }
michael@0 154
michael@0 155 // prepare the filename beginning with the source dir
michael@0 156 CharString filename(options[SOURCEDIR].value, errorCode);
michael@0 157 int32_t pathLength=filename.length();
michael@0 158 if( pathLength>0 &&
michael@0 159 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
michael@0 160 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
michael@0 161 ) {
michael@0 162 filename.append(U_FILE_SEP_CHAR, errorCode);
michael@0 163 pathLength=filename.length();
michael@0 164 }
michael@0 165
michael@0 166 for(int i=1; i<argc; ++i) {
michael@0 167 printf("gennorm2: processing %s\n", argv[i]);
michael@0 168 filename.append(argv[i], errorCode);
michael@0 169 LocalStdioFilePointer f(fopen(filename.data(), "r"));
michael@0 170 if(f==NULL) {
michael@0 171 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
michael@0 172 exit(U_FILE_ACCESS_ERROR);
michael@0 173 }
michael@0 174 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
michael@0 175 parseFile(f.getAlias(), *builder);
michael@0 176 filename.truncate(pathLength);
michael@0 177 }
michael@0 178
michael@0 179 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
michael@0 180
michael@0 181 return errorCode.get();
michael@0 182
michael@0 183 #endif
michael@0 184 }
michael@0 185
michael@0 186 #if !UCONFIG_NO_NORMALIZATION
michael@0 187
michael@0 188 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
michael@0 189 IcuToolErrorCode errorCode("gennorm2/parseFile()");
michael@0 190 char line[300];
michael@0 191 uint32_t startCP, endCP;
michael@0 192 while(NULL!=fgets(line, (int)sizeof(line), f)) {
michael@0 193 char *comment=(char *)strchr(line, '#');
michael@0 194 if(comment!=NULL) {
michael@0 195 *comment=0;
michael@0 196 }
michael@0 197 u_rtrim(line);
michael@0 198 if(line[0]==0) {
michael@0 199 continue; // skip empty and comment-only lines
michael@0 200 }
michael@0 201 if(line[0]=='*') {
michael@0 202 const char *s=u_skipWhitespace(line+1);
michael@0 203 if(0==strncmp(s, "Unicode", 7)) {
michael@0 204 s=u_skipWhitespace(s+7);
michael@0 205 builder.setUnicodeVersion(s);
michael@0 206 }
michael@0 207 continue; // reserved syntax
michael@0 208 }
michael@0 209 const char *delimiter;
michael@0 210 int32_t rangeLength=
michael@0 211 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
michael@0 212 if(errorCode.isFailure()) {
michael@0 213 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
michael@0 214 exit(errorCode.reset());
michael@0 215 }
michael@0 216 delimiter=u_skipWhitespace(delimiter);
michael@0 217 if(*delimiter==':') {
michael@0 218 const char *s=u_skipWhitespace(delimiter+1);
michael@0 219 char *end;
michael@0 220 unsigned long value=strtoul(s, &end, 10);
michael@0 221 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
michael@0 222 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
michael@0 223 exit(U_PARSE_ERROR);
michael@0 224 }
michael@0 225 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0 226 builder.setCC(c, (uint8_t)value);
michael@0 227 }
michael@0 228 continue;
michael@0 229 }
michael@0 230 if(*delimiter=='-') {
michael@0 231 if(*u_skipWhitespace(delimiter+1)!=0) {
michael@0 232 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
michael@0 233 exit(U_PARSE_ERROR);
michael@0 234 }
michael@0 235 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0 236 builder.removeMapping(c);
michael@0 237 }
michael@0 238 continue;
michael@0 239 }
michael@0 240 if(*delimiter=='=' || *delimiter=='>') {
michael@0 241 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
michael@0 242 int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
michael@0 243 if(errorCode.isFailure()) {
michael@0 244 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
michael@0 245 exit(errorCode.reset());
michael@0 246 }
michael@0 247 UnicodeString mapping(FALSE, uchars, length);
michael@0 248 if(*delimiter=='=') {
michael@0 249 if(rangeLength!=1) {
michael@0 250 fprintf(stderr,
michael@0 251 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
michael@0 252 line);
michael@0 253 exit(U_PARSE_ERROR);
michael@0 254 }
michael@0 255 builder.setRoundTripMapping((UChar32)startCP, mapping);
michael@0 256 } else {
michael@0 257 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
michael@0 258 builder.setOneWayMapping(c, mapping);
michael@0 259 }
michael@0 260 }
michael@0 261 continue;
michael@0 262 }
michael@0 263 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
michael@0 264 exit(U_PARSE_ERROR);
michael@0 265 }
michael@0 266 }
michael@0 267
michael@0 268 #endif // !UCONFIG_NO_NORMALIZATION
michael@0 269
michael@0 270 U_NAMESPACE_END
michael@0 271
michael@0 272 /*
michael@0 273 * Hey, Emacs, please set the following:
michael@0 274 *
michael@0 275 * Local Variables:
michael@0 276 * indent-tabs-mode: nil
michael@0 277 * End:
michael@0 278 *
michael@0 279 */

mercurial