intl/icu/source/tools/gensprep/gensprep.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2003-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: gensprep.c
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2003-02-06
michael@0 14 * created by: Ram Viswanadha
michael@0 15 *
michael@0 16 * This program reads the Profile.txt files,
michael@0 17 * parses them, and extracts the data for StringPrep profile.
michael@0 18 * It then preprocesses it and writes a binary file for efficient use
michael@0 19 * in various StringPrep conversion processes.
michael@0 20 */
michael@0 21
michael@0 22 #define USPREP_TYPE_NAMES_ARRAY 1
michael@0 23
michael@0 24 #include <stdio.h>
michael@0 25 #include <stdlib.h>
michael@0 26
michael@0 27 #include "cmemory.h"
michael@0 28 #include "cstring.h"
michael@0 29 #include "unewdata.h"
michael@0 30 #include "uoptions.h"
michael@0 31 #include "uparse.h"
michael@0 32 #include "sprpimpl.h"
michael@0 33
michael@0 34 #include "unicode/uclean.h"
michael@0 35 #include "unicode/udata.h"
michael@0 36 #include "unicode/utypes.h"
michael@0 37 #include "unicode/putil.h"
michael@0 38
michael@0 39
michael@0 40 U_CDECL_BEGIN
michael@0 41 #include "gensprep.h"
michael@0 42 U_CDECL_END
michael@0 43
michael@0 44 UBool beVerbose=FALSE, haveCopyright=TRUE;
michael@0 45
michael@0 46 #define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"
michael@0 47
michael@0 48 #define NORMALIZE_DIRECTIVE "normalize"
michael@0 49 #define NORMALIZE_DIRECTIVE_LEN 9
michael@0 50 #define CHECK_BIDI_DIRECTIVE "check-bidi"
michael@0 51 #define CHECK_BIDI_DIRECTIVE_LEN 10
michael@0 52
michael@0 53 /* prototypes --------------------------------------------------------------- */
michael@0 54
michael@0 55 static void
michael@0 56 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode);
michael@0 57
michael@0 58 static void
michael@0 59 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
michael@0 60
michael@0 61
michael@0 62 /* -------------------------------------------------------------------------- */
michael@0 63
michael@0 64 static UOption options[]={ /* command-line option table; main() indexes it with the enum that follows, so entry order and enumerator order must stay in step */
michael@0 65 UOPTION_HELP_H,
michael@0 66 UOPTION_HELP_QUESTION_MARK,
michael@0 67 UOPTION_VERBOSE,
michael@0 68 UOPTION_COPYRIGHT,
michael@0 69 UOPTION_DESTDIR,
michael@0 70 UOPTION_SOURCEDIR,
michael@0 71 UOPTION_ICUDATADIR,
michael@0 72 UOPTION_BUNDLE_NAME,
michael@0 73 { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, /* -n <dir>: main() turns normalization on and reads NormalizationCorrections.txt from <dir> */
michael@0 74 { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, /* -m <dir>: directory main() uses for NormalizationCorrections.txt when -n was not given */
michael@0 75 { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, /* -k: main() adds _SPREP_CHECK_BIDI_ON to the profile options */
michael@0 76 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, /* -u <version>: passed to setUnicodeVersion(); main() prints the help text when it is absent */
michael@0 77 };
michael@0 78
michael@0 79 enum{ /* indices into options[]; the enumerator order mirrors the order of the table entries */
michael@0 80 HELP,
michael@0 81 HELP_QUESTION_MARK,
michael@0 82 VERBOSE,
michael@0 83 COPYRIGHT,
michael@0 84 DESTDIR,
michael@0 85 SOURCEDIR,
michael@0 86 ICUDATADIR,
michael@0 87 BUNDLE_NAME,
michael@0 88 NORMALIZE, /* index of the -n / --normalization entry */
michael@0 89 NORM_CORRECTION_DIR, /* index of the -m / --norm-correction entry */
michael@0 90 CHECK_BIDI, /* index of the -k / --check-bidi entry */
michael@0 91 UNICODE_VERSION /* index of the -u / --unicode entry */
michael@0 92 };
michael@0 93
michael@0 94 static int printHelp(int argc, char* argv[]){
michael@0 95 /*
michael@0 96 * Broken into chucks because the C89 standard says the minimum
michael@0 97 * required supported string length is 509 bytes.
michael@0 98 */
michael@0 99 fprintf(stderr,
michael@0 100 "Usage: %s [-options] [file_name]\n"
michael@0 101 "\n"
michael@0 102 "Read the files specified and\n"
michael@0 103 "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n"
michael@0 104 "\n",
michael@0 105 argv[0]);
michael@0 106 fprintf(stderr,
michael@0 107 "Options:\n"
michael@0 108 "\t-h or -? or --help print this usage text\n"
michael@0 109 "\t-v or --verbose verbose output\n"
michael@0 110 "\t-c or --copyright include a copyright notice\n");
michael@0 111 fprintf(stderr,
michael@0 112 "\t-d or --destdir destination directory, followed by the path\n"
michael@0 113 "\t-s or --sourcedir source directory of ICU data, followed by the path\n"
michael@0 114 "\t-b or --bundle-name generate the ouput data file with the name specified\n"
michael@0 115 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
michael@0 116 "\t followed by path, defaults to %s\n",
michael@0 117 u_getDataDirectory());
michael@0 118 fprintf(stderr,
michael@0 119 "\t-n or --normalize turn on the option for normalization and include mappings\n"
michael@0 120 "\t from NormalizationCorrections.txt from the given path,\n"
michael@0 121 "\t e.g: /test/icu/source/data/unidata\n");
michael@0 122 fprintf(stderr,
michael@0 123 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n"
michael@0 124 "\t when the input file contains a normalization directive.\n"
michael@0 125 "\t unlike -n/--normalize, this option does not force the\n"
michael@0 126 "\t normalization.\n");
michael@0 127 fprintf(stderr,
michael@0 128 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n"
michael@0 129 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n"
michael@0 130 );
michael@0 131 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
michael@0 132 }
michael@0 133
michael@0 134
michael@0 135 extern int
michael@0 136 main(int argc, char* argv[]) {
michael@0 137 #if !UCONFIG_NO_IDNA
michael@0 138 char* filename = NULL;
michael@0 139 #endif
michael@0 140 const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL;
michael@0 141 const char *bundleName=NULL, *inputFileName = NULL;
michael@0 142 char *basename=NULL;
michael@0 143 int32_t sprepOptions = 0;
michael@0 144
michael@0 145 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 146
michael@0 147 U_MAIN_INIT_ARGS(argc, argv);
michael@0 148
michael@0 149 /* preset then read command line options */
michael@0 150 options[DESTDIR].value=u_getDataDirectory();
michael@0 151 options[SOURCEDIR].value="";
michael@0 152 options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */
michael@0 153 options[BUNDLE_NAME].value = DATA_NAME;
michael@0 154 options[NORMALIZE].value = "";
michael@0 155
michael@0 156 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
michael@0 157
michael@0 158 /* error handling, printing usage message */
michael@0 159 if(argc<0) {
michael@0 160 fprintf(stderr,
michael@0 161 "error in command line argument \"%s\"\n",
michael@0 162 argv[-argc]);
michael@0 163 }
michael@0 164 if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
michael@0 165 return printHelp(argc, argv);
michael@0 166
michael@0 167 }
michael@0 168
michael@0 169 /* get the options values */
michael@0 170 beVerbose=options[VERBOSE].doesOccur;
michael@0 171 haveCopyright=options[COPYRIGHT].doesOccur;
michael@0 172 srcDir=options[SOURCEDIR].value;
michael@0 173 destDir=options[DESTDIR].value;
michael@0 174 bundleName = options[BUNDLE_NAME].value;
michael@0 175 if(options[NORMALIZE].doesOccur) {
michael@0 176 icuUniDataDir = options[NORMALIZE].value;
michael@0 177 } else {
michael@0 178 icuUniDataDir = options[NORM_CORRECTION_DIR].value;
michael@0 179 }
michael@0 180
michael@0 181 if(argc<2) {
michael@0 182 /* print the help message */
michael@0 183 return printHelp(argc, argv);
michael@0 184 } else {
michael@0 185 inputFileName = argv[1];
michael@0 186 }
michael@0 187 if(!options[UNICODE_VERSION].doesOccur){
michael@0 188 return printHelp(argc, argv);
michael@0 189 }
michael@0 190 if(options[ICUDATADIR].doesOccur) {
michael@0 191 u_setDataDirectory(options[ICUDATADIR].value);
michael@0 192 }
michael@0 193 #if UCONFIG_NO_IDNA
michael@0 194
michael@0 195 fprintf(stderr,
michael@0 196 "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
michael@0 197 " because UCONFIG_NO_IDNA is set, \n"
michael@0 198 "see icu/source/common/unicode/uconfig.h\n");
michael@0 199 generateData(destDir, bundleName);
michael@0 200
michael@0 201 #else
michael@0 202
michael@0 203 setUnicodeVersion(options[UNICODE_VERSION].value);
michael@0 204 filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + 300); /* hopefully this should be enough */
michael@0 205
michael@0 206 /* prepare the filename beginning with the source dir */
michael@0 207 if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){
michael@0 208 filename[0] = '.';
michael@0 209 filename[1] = U_FILE_SEP_CHAR;
michael@0 210 uprv_strcpy(filename+2,srcDir);
michael@0 211 }else{
michael@0 212 uprv_strcpy(filename, srcDir);
michael@0 213 }
michael@0 214
michael@0 215 basename=filename+uprv_strlen(filename);
michael@0 216 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
michael@0 217 *basename++=U_FILE_SEP_CHAR;
michael@0 218 }
michael@0 219
michael@0 220 /* initialize */
michael@0 221 init();
michael@0 222
michael@0 223 /* process the file */
michael@0 224 uprv_strcpy(basename,inputFileName);
michael@0 225 parseMappings(filename,FALSE, &errorCode);
michael@0 226 if(U_FAILURE(errorCode)) {
michael@0 227 fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode));
michael@0 228 return errorCode;
michael@0 229 }
michael@0 230
michael@0 231 if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */
michael@0 232 /* set up directory for NormalizationCorrections.txt */
michael@0 233 uprv_strcpy(filename,icuUniDataDir);
michael@0 234 basename=filename+uprv_strlen(filename);
michael@0 235 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
michael@0 236 *basename++=U_FILE_SEP_CHAR;
michael@0 237 }
michael@0 238
michael@0 239 *basename++=U_FILE_SEP_CHAR;
michael@0 240 uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME);
michael@0 241
michael@0 242 parseNormalizationCorrections(filename,&errorCode);
michael@0 243 if(U_FAILURE(errorCode)){
michael@0 244 fprintf(stderr,"Could not open file %s for reading \n", filename);
michael@0 245 return errorCode;
michael@0 246 }
michael@0 247 sprepOptions |= _SPREP_NORMALIZATION_ON;
michael@0 248 }
michael@0 249
michael@0 250 if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */
michael@0 251 sprepOptions |= _SPREP_CHECK_BIDI_ON;
michael@0 252 }
michael@0 253
michael@0 254 setOptions(sprepOptions);
michael@0 255
michael@0 256 /* process parsed data */
michael@0 257 if(U_SUCCESS(errorCode)) {
michael@0 258 /* write the data file */
michael@0 259 generateData(destDir, bundleName);
michael@0 260
michael@0 261 cleanUpData();
michael@0 262 }
michael@0 263
michael@0 264 uprv_free(filename);
michael@0 265
michael@0 266 u_cleanup();
michael@0 267
michael@0 268 #endif
michael@0 269
michael@0 270 return errorCode;
michael@0 271 }
michael@0 272
michael@0 273 #if !UCONFIG_NO_IDNA
michael@0 274
michael@0 275 static void U_CALLCONV
michael@0 276 normalizationCorrectionsLineFn(void *context,
michael@0 277 char *fields[][2], int32_t fieldCount,
michael@0 278 UErrorCode *pErrorCode) {
michael@0 279 uint32_t mapping[40];
michael@0 280 char *end, *s;
michael@0 281 uint32_t code;
michael@0 282 int32_t length;
michael@0 283 UVersionInfo version;
michael@0 284 UVersionInfo thisVersion;
michael@0 285
michael@0 286 /* get the character code, field 0 */
michael@0 287 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
michael@0 288 if(U_FAILURE(*pErrorCode)) {
michael@0 289 fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]);
michael@0 290 exit(*pErrorCode);
michael@0 291 }
michael@0 292 /* Original (erroneous) decomposition */
michael@0 293 s = fields[1][0];
michael@0 294
michael@0 295 /* parse the mapping string */
michael@0 296 length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
michael@0 297
michael@0 298 /* ignore corrected decomposition */
michael@0 299
michael@0 300 u_versionFromString(version,fields[3][0] );
michael@0 301 u_versionFromString(thisVersion, "3.2.0");
michael@0 302
michael@0 303
michael@0 304
michael@0 305 if(U_FAILURE(*pErrorCode)) {
michael@0 306 fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
michael@0 307 (long)code, u_errorName(*pErrorCode));
michael@0 308 exit(*pErrorCode);
michael@0 309 }
michael@0 310
michael@0 311 /* store the mapping */
michael@0 312 if( version[0] > thisVersion[0] ||
michael@0 313 ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
michael@0 314 ){
michael@0 315 storeMapping(code,mapping, length, USPREP_MAP, pErrorCode);
michael@0 316 }
michael@0 317 setUnicodeVersionNC(version);
michael@0 318 }
michael@0 319
michael@0 320 static void
michael@0 321 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
michael@0 322 char *fields[4][2];
michael@0 323
michael@0 324 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 325 return;
michael@0 326 }
michael@0 327
michael@0 328 u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);
michael@0 329
michael@0 330 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
michael@0 331
michael@0 332 if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) {
michael@0 333 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
michael@0 334 exit(*pErrorCode);
michael@0 335 }
michael@0 336 }
michael@0 337
michael@0 338 static void U_CALLCONV
michael@0 339 strprepProfileLineFn(void *context,
michael@0 340 char *fields[][2], int32_t fieldCount,
michael@0 341 UErrorCode *pErrorCode) {
michael@0 342 uint32_t mapping[40];
michael@0 343 char *end, *map;
michael@0 344 uint32_t code;
michael@0 345 int32_t length;
michael@0 346 /*UBool* mapWithNorm = (UBool*) context;*/
michael@0 347 const char* typeName;
michael@0 348 uint32_t rangeStart=0,rangeEnd =0;
michael@0 349 const char* filename = (const char*) context;
michael@0 350 const char *s;
michael@0 351
michael@0 352 s = u_skipWhitespace(fields[0][0]);
michael@0 353 if (*s == '@') {
michael@0 354 /* special directive */
michael@0 355 s++;
michael@0 356 length = fields[0][1] - s;
michael@0 357 if (length >= NORMALIZE_DIRECTIVE_LEN
michael@0 358 && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) {
michael@0 359 options[NORMALIZE].doesOccur = TRUE;
michael@0 360 return;
michael@0 361 }
michael@0 362 else if (length >= CHECK_BIDI_DIRECTIVE_LEN
michael@0 363 && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) {
michael@0 364 options[CHECK_BIDI].doesOccur = TRUE;
michael@0 365 return;
michael@0 366 }
michael@0 367 else {
michael@0 368 fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
michael@0 369 }
michael@0 370 }
michael@0 371
michael@0 372 typeName = fields[2][0];
michael@0 373 map = fields[1][0];
michael@0 374
michael@0 375 if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){
michael@0 376
michael@0 377 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
michael@0 378 if(U_FAILURE(*pErrorCode)){
michael@0 379 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
michael@0 380 return;
michael@0 381 }
michael@0 382
michael@0 383 /* store the range */
michael@0 384 storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode);
michael@0 385
michael@0 386 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){
michael@0 387
michael@0 388 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
michael@0 389 if(U_FAILURE(*pErrorCode)){
michael@0 390 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
michael@0 391 return;
michael@0 392 }
michael@0 393
michael@0 394 /* store the range */
michael@0 395 storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode);
michael@0 396
michael@0 397 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){
michael@0 398
michael@0 399 /* get the character code, field 0 */
michael@0 400 code=(uint32_t)uprv_strtoul(s, &end, 16);
michael@0 401 if(end<=s || end!=fields[0][1]) {
michael@0 402 fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
michael@0 403 *pErrorCode=U_PARSE_ERROR;
michael@0 404 exit(U_PARSE_ERROR);
michael@0 405 }
michael@0 406
michael@0 407 /* parse the mapping string */
michael@0 408 length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);
michael@0 409
michael@0 410 /* store the mapping */
michael@0 411 storeMapping(code,mapping, length,USPREP_MAP, pErrorCode);
michael@0 412
michael@0 413 }else{
michael@0 414 *pErrorCode = U_INVALID_FORMAT_ERROR;
michael@0 415 }
michael@0 416
michael@0 417 if(U_FAILURE(*pErrorCode)) {
michael@0 418 fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename,
michael@0 419 fields[0][0],fields[2][0],u_errorName(*pErrorCode));
michael@0 420 exit(*pErrorCode);
michael@0 421 }
michael@0 422
michael@0 423 }
michael@0 424
michael@0 425 static void
michael@0 426 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) {
michael@0 427 char *fields[3][2];
michael@0 428
michael@0 429 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 430 return;
michael@0 431 }
michael@0 432
michael@0 433 u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode);
michael@0 434
michael@0 435 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
michael@0 436
michael@0 437 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
michael@0 438 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
michael@0 439 exit(*pErrorCode);
michael@0 440 }
michael@0 441 }
michael@0 442
michael@0 443
michael@0 444 #endif /* #if !UCONFIG_NO_IDNA */
michael@0 445
michael@0 446 /*
michael@0 447 * Hey, Emacs, please set the following:
michael@0 448 *
michael@0 449 * Local Variables:
michael@0 450 * indent-tabs-mode: nil
michael@0 451 * End:
michael@0 452 *
michael@0 453 */

mercurial