1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/gencfu/gencfu.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,330 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2009-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* 1.10 +* File gencfu.c 1.11 +*/ 1.12 + 1.13 +//-------------------------------------------------------------------- 1.14 +// 1.15 +// Tool for generating Unicode Confusable data files (.cfu files). 1.16 +// .cfu files contain the compiled form of the confusable data 1.17 +// derived from the Unicode Consortium data described in 1.18 +// Unicode UAX 39. 1.19 +// 1.20 +// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu 1.21 +// 1.22 +// options: -v verbose 1.23 +// -? or -h help 1.24 +// 1.25 +// The input rule files are plain text files containing confusable character 1.26 +// definitions in the input format defined by Unicode UAX39 for the files 1.27 +// confusables.txt and confusablesWholeScript.txt. This source (.txt) format 1.28 +// is also accepted directly by ICU spoof detectors. The 1.29 +// files must be encoded in utf-8 format, with or without a BOM. 
1.30 +// 1.31 +//-------------------------------------------------------------------- 1.32 + 1.33 +#include "unicode/utypes.h" 1.34 +#include "unicode/unistr.h" 1.35 +#include "unicode/uclean.h" 1.36 +#include "unicode/udata.h" 1.37 +#include "unicode/putil.h" 1.38 + 1.39 +#include "uoptions.h" 1.40 +#include "unewdata.h" 1.41 +#include "ucmndata.h" 1.42 +#include "uspoof_impl.h" 1.43 +#include "cmemory.h" 1.44 + 1.45 +#include <stdio.h> 1.46 +#include <stdlib.h> 1.47 +#include <string.h> 1.48 + 1.49 +U_NAMESPACE_USE 1.50 + 1.51 +static char *progName; 1.52 +static UOption options[]={ 1.53 + UOPTION_HELP_H, /* 0 */ 1.54 + UOPTION_HELP_QUESTION_MARK, /* 1 */ 1.55 + UOPTION_VERBOSE, /* 2 */ 1.56 + { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 1.57 + { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ 1.58 + { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */ 1.59 + UOPTION_ICUDATADIR, /* 6 */ 1.60 + UOPTION_DESTDIR, /* 7 */ 1.61 + UOPTION_COPYRIGHT, /* 8 */ 1.62 +}; 1.63 + 1.64 +void usageAndDie(int retCode) { 1.65 + printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName); 1.66 + printf("\tRead in Unicode confusable character definitions and write out the binary data\n" 1.67 + "options:\n" 1.68 + "\t-h or -? or --help this usage text\n" 1.69 + "\t-V or --version show a version message\n" 1.70 + "\t-c or --copyright include a copyright notice\n" 1.71 + "\t-v or --verbose turn on verbose output\n" 1.72 + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 1.73 + "\t followed by path, defaults to %s\n" 1.74 + "\t-d or --destdir destination directory, followed by the path\n", 1.75 + u_getDataDirectory()); 1.76 + exit (retCode); 1.77 +} 1.78 + 1.79 + 1.80 +#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO 1.81 + 1.82 +/* dummy UDataInfo cf. 
udata.h */ 1.83 +static UDataInfo dummyDataInfo = { 1.84 + sizeof(UDataInfo), 1.85 + 0, 1.86 + 1.87 + U_IS_BIG_ENDIAN, 1.88 + U_CHARSET_FAMILY, 1.89 + U_SIZEOF_UCHAR, 1.90 + 0, 1.91 + 1.92 + { 0, 0, 0, 0 }, /* dummy dataFormat */ 1.93 + { 0, 0, 0, 0 }, /* dummy formatVersion */ 1.94 + { 0, 0, 0, 0 } /* dummy dataVersion */ 1.95 +}; 1.96 + 1.97 +#else 1.98 + 1.99 +// 1.100 +// Set up the ICU data header, defined in ucmndata.h 1.101 +// 1.102 +DataHeader dh ={ 1.103 + {sizeof(DataHeader), // Struct MappedData 1.104 + 0xda, 1.105 + 0x27}, 1.106 + 1.107 + { // struct UDataInfo 1.108 + sizeof(UDataInfo), // size 1.109 + 0, // reserved 1.110 + U_IS_BIG_ENDIAN, 1.111 + U_CHARSET_FAMILY, 1.112 + U_SIZEOF_UCHAR, 1.113 + 0, // reserved 1.114 + 1.115 + { 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu " 1.116 + { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 1.117 + // from the builder. The values declared 1.118 + // here should never appear in any real data. 1.119 + { 5, 1, 0, 0 } // dataVersion (Unicode version) 1.120 + }}; 1.121 + 1.122 +#endif 1.123 + 1.124 +// Forward declaration for function for reading source files. 1.125 +static const char *readFile(const char *fileName, int32_t *len); 1.126 + 1.127 +//---------------------------------------------------------------------------- 1.128 +// 1.129 +// main for gencfu 1.130 +// 1.131 +//---------------------------------------------------------------------------- 1.132 +int main(int argc, char **argv) { 1.133 + UErrorCode status = U_ZERO_ERROR; 1.134 + const char *confFileName; 1.135 + const char *confWSFileName; 1.136 + const char *outFileName; 1.137 + const char *outDir = NULL; 1.138 + const char *copyright = NULL; 1.139 + 1.140 + // 1.141 + // Pick up and check the command line arguments, 1.142 + // using the standard ICU tool utils option handling. 
1.143 + // 1.144 + U_MAIN_INIT_ARGS(argc, argv); 1.145 + progName = argv[0]; 1.146 + argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 1.147 + if(argc<0) { 1.148 + // Unrecognized option 1.149 + fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 1.150 + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 1.151 + } 1.152 + 1.153 + if(options[0].doesOccur || options[1].doesOccur) { 1.154 + // -? or -h for help. 1.155 + usageAndDie(0); 1.156 + } 1.157 + 1.158 + if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) { 1.159 + fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n"); 1.160 + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 1.161 + } 1.162 + confFileName = options[3].value; 1.163 + confWSFileName = options[4].value; 1.164 + outFileName = options[5].value; 1.165 + 1.166 + if (options[6].doesOccur) { 1.167 + u_setDataDirectory(options[6].value); 1.168 + } 1.169 + 1.170 + status = U_ZERO_ERROR; 1.171 + 1.172 + /* Combine the directory with the file name */ 1.173 + if(options[7].doesOccur) { 1.174 + outDir = options[7].value; 1.175 + } 1.176 + if (options[8].doesOccur) { 1.177 + copyright = U_COPYRIGHT_STRING; 1.178 + } 1.179 + 1.180 +#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO 1.181 + // spoof detection data file parsing is dependent on regular expressions. 1.182 + // TODO: have the tool return an error status. Requires fixing the ICU data build 1.183 + // so that it doesn't abort entirely on that error. 
1.184 + 1.185 + UNewDataMemory *pData; 1.186 + char msg[1024]; 1.187 + 1.188 + /* write message with just the name */ 1.189 + sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 1.190 + fprintf(stderr, "%s\n", msg); 1.191 + 1.192 + /* write the dummy data file */ 1.193 + pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 1.194 + udata_writeBlock(pData, msg, strlen(msg)); 1.195 + udata_finish(pData, &status); 1.196 + return (int)status; 1.197 + 1.198 +#else 1.199 + /* Initialize ICU */ 1.200 + u_init(&status); 1.201 + if (U_FAILURE(status)) { 1.202 + fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 1.203 + argv[0], u_errorName(status)); 1.204 + exit(1); 1.205 + } 1.206 + status = U_ZERO_ERROR; 1.207 + 1.208 + // Read in the confusables source file 1.209 + 1.210 + int32_t confusablesLen = 0; 1.211 + const char *confusables = readFile(confFileName, &confusablesLen); 1.212 + if (confusables == NULL) { 1.213 + printf("gencfu: error reading file \"%s\"\n", confFileName); 1.214 + exit(-1); 1.215 + } 1.216 + 1.217 + int32_t wsConfusablesLen = 0; 1.218 + const char *wsConfsables = readFile(confWSFileName, &wsConfusablesLen); 1.219 + if (wsConfsables == NULL) { 1.220 + printf("gencfu: error reading file \"%s\"\n", confFileName); 1.221 + exit(-1); 1.222 + } 1.223 + 1.224 + // 1.225 + // Create the Spoof Detector from the source confusables files. 1.226 + // This will compile the data. 1.227 + // 1.228 + UParseError parseError; 1.229 + parseError.line = 0; 1.230 + parseError.offset = 0; 1.231 + int32_t errType; 1.232 + USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen, 1.233 + wsConfsables, wsConfusablesLen, 1.234 + &errType, &parseError, &status); 1.235 + if (U_FAILURE(status)) { 1.236 + const char *errFile = 1.237 + (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? 
confWSFileName : confFileName; 1.238 + fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n", 1.239 + u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset); 1.240 + exit(status); 1.241 + }; 1.242 + 1.243 + 1.244 + // 1.245 + // Get the compiled rule data from the USpoofChecker. 1.246 + // 1.247 + uint32_t outDataSize; 1.248 + uint8_t *outData; 1.249 + outDataSize = uspoof_serialize(sc, NULL, 0, &status); 1.250 + if (status != U_BUFFER_OVERFLOW_ERROR) { 1.251 + fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status)); 1.252 + exit(status); 1.253 + } 1.254 + status = U_ZERO_ERROR; 1.255 + outData = new uint8_t[outDataSize]; 1.256 + uspoof_serialize(sc, outData, outDataSize, &status); 1.257 + 1.258 + // Copy the data format version numbers from the spoof data header into the UDataMemory header. 1.259 + 1.260 + uprv_memcpy(dh.info.formatVersion, 1.261 + reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion, 1.262 + sizeof(dh.info.formatVersion)); 1.263 + 1.264 + // 1.265 + // Create the output file 1.266 + // 1.267 + size_t bytesWritten; 1.268 + UNewDataMemory *pData; 1.269 + pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 1.270 + if(U_FAILURE(status)) { 1.271 + fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n", 1.272 + outFileName, u_errorName(status)); 1.273 + exit(status); 1.274 + } 1.275 + 1.276 + 1.277 + // Write the data itself. 
1.278 + udata_writeBlock(pData, outData, outDataSize); 1.279 + // finish up 1.280 + bytesWritten = udata_finish(pData, &status); 1.281 + if(U_FAILURE(status)) { 1.282 + fprintf(stderr, "gencfu: Error %d writing the output file\n", status); 1.283 + exit(status); 1.284 + } 1.285 + 1.286 + if (bytesWritten != outDataSize) { 1.287 + fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName); 1.288 + exit(-1); 1.289 + } 1.290 + 1.291 + uspoof_close(sc); 1.292 + delete [] outData; 1.293 + delete [] confusables; 1.294 + delete [] wsConfsables; 1.295 + u_cleanup(); 1.296 + printf("gencfu: tool completed successfully.\n"); 1.297 + return 0; 1.298 +#endif // UCONFIG_NO_REGULAR_EXPRESSIONS 1.299 +} 1.300 + 1.301 + 1.302 + // 1.303 + // Read in a confusables source file 1.304 + // 1.305 + static const char *readFile(const char *fileName, int32_t *len) { 1.306 + char *result; 1.307 + long fileSize; 1.308 + FILE *file; 1.309 + 1.310 + file = fopen(fileName, "rb"); 1.311 + if( file == 0 ) { 1.312 + return NULL; 1.313 + } 1.314 + fseek(file, 0, SEEK_END); 1.315 + fileSize = ftell(file); 1.316 + fseek(file, 0, SEEK_SET); 1.317 + result = new char[fileSize+10]; 1.318 + if (result==NULL) { 1.319 + fclose(file); 1.320 + return NULL; 1.321 + } 1.322 + 1.323 + long t = fread(result, 1, fileSize, file); 1.324 + if (t != fileSize) { 1.325 + delete [] result; 1.326 + fclose(file); 1.327 + return NULL; 1.328 + } 1.329 + result[fileSize]=0; 1.330 + *len = static_cast<int32_t>(fileSize); 1.331 + fclose(file); 1.332 + return result; 1.333 + }