1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/gennorm2/gennorm2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,279 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2009-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: gennorm2.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009nov25 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This program reads text files that define Unicode normalization, 1.20 +* parses them, and builds a binary data file. 1.21 +*/ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 +#include "n2builder.h" 1.25 + 1.26 +#include <stdio.h> 1.27 +#include <stdlib.h> 1.28 +#include <string.h> 1.29 +#include "unicode/errorcode.h" 1.30 +#include "unicode/localpointer.h" 1.31 +#include "unicode/putil.h" 1.32 +#include "unicode/uchar.h" 1.33 +#include "unicode/unistr.h" 1.34 +#include "charstr.h" 1.35 +#include "normalizer2impl.h" 1.36 +#include "toolutil.h" 1.37 +#include "uoptions.h" 1.38 +#include "uparse.h" 1.39 + 1.40 +#if UCONFIG_NO_NORMALIZATION 1.41 +#include "unewdata.h" 1.42 +#endif 1.43 + 1.44 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.45 + 1.46 +U_NAMESPACE_BEGIN 1.47 + 1.48 +UBool beVerbose=FALSE, haveCopyright=TRUE; 1.49 + 1.50 +U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 1.51 + 1.52 +#if !UCONFIG_NO_NORMALIZATION 1.53 +void parseFile(FILE *f, Normalizer2DataBuilder &builder); 1.54 +#endif 1.55 + 1.56 +/* -------------------------------------------------------------------------- */ 1.57 + 1.58 +enum { 1.59 + HELP_H, 1.60 + HELP_QUESTION_MARK, 1.61 + VERBOSE, 1.62 + COPYRIGHT, 1.63 + SOURCEDIR, 1.64 + OUTPUT_FILENAME, 1.65 + UNICODE_VERSION, 1.66 + OPT_FAST 1.67 +}; 1.68 + 1.69 +static UOption options[]={ 1.70 + UOPTION_HELP_H, 1.71 + UOPTION_HELP_QUESTION_MARK, 1.72 + UOPTION_VERBOSE, 1.73 + UOPTION_COPYRIGHT, 1.74 + UOPTION_SOURCEDIR, 1.75 + UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), 1.76 + UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 1.77 + UOPTION_DEF("fast", '\1', UOPT_NO_ARG) 1.78 +}; 1.79 + 1.80 +extern "C" int 1.81 +main(int argc, char* argv[]) { 1.82 + U_MAIN_INIT_ARGS(argc, argv); 1.83 + 1.84 + /* preset then read command line options */ 1.85 + options[SOURCEDIR].value=""; 1.86 + argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); 1.87 + 1.88 + /* error handling, printing usage message */ 1.89 + if(argc<0) { 1.90 + fprintf(stderr, 1.91 + "error in command line argument \"%s\"\n", 1.92 + argv[-argc]); 1.93 + } 1.94 + if(!options[OUTPUT_FILENAME].doesOccur) { 1.95 + argc=-1; 1.96 + } 1.97 + if( argc<2 || 1.98 + options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur 1.99 + ) { 1.100 + /* 1.101 + * Broken into chunks because the C89 standard says the minimum 1.102 + * required supported string length is 509 bytes. 1.103 + */ 1.104 + fprintf(stderr, 1.105 + "Usage: %s [-options] infiles+ -o outputfilename\n" 1.106 + "\n" 1.107 + "Reads the infiles with normalization data and\n" 1.108 + "creates a binary file (outputfilename) with the data.\n" 1.109 + "\n", 1.110 + argv[0]); 1.111 + fprintf(stderr, 1.112 + "Options:\n" 1.113 + "\t-h or -? or --help this usage text\n" 1.114 + "\t-v or --verbose verbose output\n" 1.115 + "\t-c or --copyright include a copyright notice\n" 1.116 + "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); 1.117 + fprintf(stderr, 1.118 + "\t-s or --sourcedir source directory, followed by the path\n" 1.119 + "\t-o or --output output filename\n"); 1.120 + fprintf(stderr, 1.121 + "\t --fast optimize the .nrm file for fast normalization,\n" 1.122 + "\t which might increase its size (Writes fully decomposed\n" 1.123 + "\t regular mappings instead of delta mappings.\n" 1.124 + "\t You should measure the runtime speed to make sure that\n" 1.125 + "\t this is a good trade-off.)\n"); 1.126 + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 1.127 + } 1.128 + 1.129 + beVerbose=options[VERBOSE].doesOccur; 1.130 + haveCopyright=options[COPYRIGHT].doesOccur; 1.131 + 1.132 + IcuToolErrorCode errorCode("gennorm2/main()"); 1.133 + 1.134 +#if UCONFIG_NO_NORMALIZATION 1.135 + 1.136 + fprintf(stderr, 1.137 + "gennorm2 writes a dummy binary data file " 1.138 + "because UCONFIG_NO_NORMALIZATION is set, \n" 1.139 + "see icu/source/common/unicode/uconfig.h\n"); 1.140 + udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); 1.141 + // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. 1.142 + // return U_UNSUPPORTED_ERROR; 1.143 + return 0; 1.144 + 1.145 +#else 1.146 + 1.147 + LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode)); 1.148 + errorCode.assertSuccess(); 1.149 + 1.150 + if(options[UNICODE_VERSION].doesOccur) { 1.151 + builder->setUnicodeVersion(options[UNICODE_VERSION].value); 1.152 + } 1.153 + 1.154 + if(options[OPT_FAST].doesOccur) { 1.155 + builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 1.156 + } 1.157 + 1.158 + // prepare the filename beginning with the source dir 1.159 + CharString filename(options[SOURCEDIR].value, errorCode); 1.160 + int32_t pathLength=filename.length(); 1.161 + if( pathLength>0 && 1.162 + filename[pathLength-1]!=U_FILE_SEP_CHAR && 1.163 + filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR 1.164 + ) { 1.165 + filename.append(U_FILE_SEP_CHAR, errorCode); 1.166 + pathLength=filename.length(); 1.167 + } 1.168 + 1.169 + for(int i=1; i<argc; ++i) { 1.170 + printf("gennorm2: processing %s\n", argv[i]); 1.171 + filename.append(argv[i], errorCode); 1.172 + LocalStdioFilePointer f(fopen(filename.data(), "r")); 1.173 + if(f==NULL) { 1.174 + fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); 1.175 + exit(U_FILE_ACCESS_ERROR); 1.176 + } 1.177 + builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); 1.178 + parseFile(f.getAlias(), *builder); 1.179 + filename.truncate(pathLength); 1.180 + } 1.181 + 1.182 + builder->writeBinaryFile(options[OUTPUT_FILENAME].value); 1.183 + 1.184 + return errorCode.get(); 1.185 + 1.186 +#endif 1.187 +} 1.188 + 1.189 +#if !UCONFIG_NO_NORMALIZATION 1.190 + 1.191 +void parseFile(FILE *f, Normalizer2DataBuilder &builder) { 1.192 + IcuToolErrorCode errorCode("gennorm2/parseFile()"); 1.193 + char line[300]; 1.194 + uint32_t startCP, endCP; 1.195 + while(NULL!=fgets(line, (int)sizeof(line), f)) { 1.196 + char *comment=(char *)strchr(line, '#'); 1.197 + if(comment!=NULL) { 1.198 + *comment=0; 1.199 + } 1.200 + u_rtrim(line); 1.201 + if(line[0]==0) { 1.202 + continue; // skip empty and comment-only lines 1.203 + } 1.204 + if(line[0]=='*') { 1.205 + const char *s=u_skipWhitespace(line+1); 1.206 + if(0==strncmp(s, "Unicode", 7)) { 1.207 + s=u_skipWhitespace(s+7); 1.208 + builder.setUnicodeVersion(s); 1.209 + } 1.210 + continue; // reserved syntax 1.211 + } 1.212 + const char *delimiter; 1.213 + int32_t rangeLength= 1.214 + u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); 1.215 + if(errorCode.isFailure()) { 1.216 + fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); 1.217 + exit(errorCode.reset()); 1.218 + } 1.219 + delimiter=u_skipWhitespace(delimiter); 1.220 + if(*delimiter==':') { 1.221 + const char *s=u_skipWhitespace(delimiter+1); 1.222 + char *end; 1.223 + unsigned long value=strtoul(s, &end, 10); 1.224 + if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { 1.225 + fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); 1.226 + exit(U_PARSE_ERROR); 1.227 + } 1.228 + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 1.229 + builder.setCC(c, (uint8_t)value); 1.230 + } 1.231 + continue; 1.232 + } 1.233 + if(*delimiter=='-') { 1.234 + if(*u_skipWhitespace(delimiter+1)!=0) { 1.235 + fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); 1.236 + exit(U_PARSE_ERROR); 1.237 + } 1.238 + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 1.239 + builder.removeMapping(c); 1.240 + } 1.241 + continue; 1.242 + } 1.243 + if(*delimiter=='=' || *delimiter=='>') { 1.244 + UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; 1.245 + int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode); 1.246 + if(errorCode.isFailure()) { 1.247 + fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); 1.248 + exit(errorCode.reset()); 1.249 + } 1.250 + UnicodeString mapping(FALSE, uchars, length); 1.251 + if(*delimiter=='=') { 1.252 + if(rangeLength!=1) { 1.253 + fprintf(stderr, 1.254 + "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", 1.255 + line); 1.256 + exit(U_PARSE_ERROR); 1.257 + } 1.258 + builder.setRoundTripMapping((UChar32)startCP, mapping); 1.259 + } else { 1.260 + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { 1.261 + builder.setOneWayMapping(c, mapping); 1.262 + } 1.263 + } 1.264 + continue; 1.265 + } 1.266 + fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); 1.267 + exit(U_PARSE_ERROR); 1.268 + } 1.269 +} 1.270 + 1.271 +#endif // !UCONFIG_NO_NORMALIZATION 1.272 + 1.273 +U_NAMESPACE_END 1.274 + 1.275 +/* 1.276 + * Hey, Emacs, please set the following: 1.277 + * 1.278 + * Local Variables: 1.279 + * indent-tabs-mode: nil 1.280 + * End: 1.281 + * 1.282 + */