intl/icu/source/tools/gencfu/gencfu.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2009-2011, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *
     7 * File gencfu.cpp
     8 */
    10 //--------------------------------------------------------------------
    11 //
    12 //   Tool for generating Unicode Confusable data files (.cfu files).
    13 //   .cfu files contain the compiled form of the confusable data
    14 //   derived from the Unicode Consortium data described in
    15 //   Unicode UAX 39.
    16 //
    17 //   Usage:  gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt  -o output-file.cfu
    18 //
    19 //       options:   -v         verbose
    20 //                  -? or -h   help
    21 //
    22 //   The input rule files are plain text files containing confusable character
    23 //    definitions in the input format defined by Unicode UAX39 for the files
    24 //    confusables.txt and confusablesWholeScript.txt.  This source (.txt) format
    25 //    is also accepted directly by ICU spoof detectors.  The
    26 //    files must be encoded in utf-8 format, with or without a BOM.
    27 //
    28 //--------------------------------------------------------------------
    30 #include "unicode/utypes.h"
    31 #include "unicode/unistr.h"
    32 #include "unicode/uclean.h"
    33 #include "unicode/udata.h"
    34 #include "unicode/putil.h"
    36 #include "uoptions.h"
    37 #include "unewdata.h"
    38 #include "ucmndata.h"
    39 #include "uspoof_impl.h"
    40 #include "cmemory.h"
    42 #include <stdio.h>
    43 #include <stdlib.h>
    44 #include <string.h>
    46 U_NAMESPACE_USE
    48 static char *progName;
    49 static UOption options[]={
    50     UOPTION_HELP_H,             /* 0 */
    51     UOPTION_HELP_QUESTION_MARK, /* 1 */
    52     UOPTION_VERBOSE,            /* 2 */
    53     { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
    54     { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */
    55     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */
    56     UOPTION_ICUDATADIR,         /* 6 */
    57     UOPTION_DESTDIR,            /* 7 */
    58     UOPTION_COPYRIGHT,          /* 8 */
    59 };
    61 void usageAndDie(int retCode) {
    62         printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
    63         printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
    64             "options:\n"
    65             "\t-h or -? or --help  this usage text\n"
    66             "\t-V or --version     show a version message\n"
    67             "\t-c or --copyright   include a copyright notice\n"
    68             "\t-v or --verbose     turn on verbose output\n"
    69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    70             "\t                    followed by path, defaults to %s\n"
    71             "\t-d or --destdir     destination directory, followed by the path\n",
    72             u_getDataDirectory());
    73         exit (retCode);
    74 }
    77 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
    79 /* dummy UDataInfo cf. udata.h */
    80 static UDataInfo dummyDataInfo = {
    81     sizeof(UDataInfo),
    82     0,
    84     U_IS_BIG_ENDIAN,
    85     U_CHARSET_FAMILY,
    86     U_SIZEOF_UCHAR,
    87     0,
    89     { 0, 0, 0, 0 },                 /* dummy dataFormat */
    90     { 0, 0, 0, 0 },                 /* dummy formatVersion */
    91     { 0, 0, 0, 0 }                  /* dummy dataVersion */
    92 };
    94 #else
    96 //
    97 //  Set up the ICU data header, defined in ucmndata.h
    98 //
    99 DataHeader dh ={
   100     {sizeof(DataHeader),           // Struct MappedData
   101         0xda,
   102         0x27},
   104     {                               // struct UDataInfo
   105         sizeof(UDataInfo),          //     size
   106         0,                          //     reserved
   107         U_IS_BIG_ENDIAN,
   108         U_CHARSET_FAMILY,
   109         U_SIZEOF_UCHAR,
   110         0,                          //     reserved
   112     { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
   113     { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
   114                                     //      from the  builder.  The  values declared
   115                                     //      here should never appear in any real data.
   116         { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
   117     }};
   119 #endif
   121 // Forward declaration for function for reading source files.
   122 static const char *readFile(const char *fileName, int32_t *len);
   124 //----------------------------------------------------------------------------
   125 //
   126 //  main      for gencfu
   127 //
   128 //----------------------------------------------------------------------------
   129 int  main(int argc, char **argv) {
   130     UErrorCode  status = U_ZERO_ERROR;
   131     const char *confFileName;
   132     const char *confWSFileName;
   133     const char *outFileName;
   134     const char *outDir = NULL;
   135     const char *copyright = NULL;
   137     //
   138     // Pick up and check the command line arguments,
   139     //    using the standard ICU tool utils option handling.
   140     //
   141     U_MAIN_INIT_ARGS(argc, argv);
   142     progName = argv[0];
   143     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
   144     if(argc<0) {
   145         // Unrecognized option
   146         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
   147         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   148     }
   150     if(options[0].doesOccur || options[1].doesOccur) {
   151         //  -? or -h for help.
   152         usageAndDie(0);
   153     }
   155     if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
   156         fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
   157         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   158     }
   159     confFileName   = options[3].value;
   160     confWSFileName = options[4].value;
   161     outFileName    = options[5].value;
   163     if (options[6].doesOccur) {
   164         u_setDataDirectory(options[6].value);
   165     }
   167     status = U_ZERO_ERROR;
   169     /* Combine the directory with the file name */
   170     if(options[7].doesOccur) {
   171         outDir = options[7].value;
   172     }
   173     if (options[8].doesOccur) {
   174         copyright = U_COPYRIGHT_STRING;
   175     }
   177 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
   178     // spoof detection data file parsing is dependent on regular expressions.
   179     // TODO: have the tool return an error status.  Requires fixing the ICU data build
   180     //       so that it doesn't abort entirely on that error.
   182     UNewDataMemory *pData;
   183     char msg[1024];
   185     /* write message with just the name */
   186     sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
   187     fprintf(stderr, "%s\n", msg);
   189     /* write the dummy data file */
   190     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
   191     udata_writeBlock(pData, msg, strlen(msg));
   192     udata_finish(pData, &status);
   193     return (int)status;
   195 #else
   196     /* Initialize ICU */
   197     u_init(&status);
   198     if (U_FAILURE(status)) {
   199         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
   200             argv[0], u_errorName(status));
   201         exit(1);
   202     }
   203     status = U_ZERO_ERROR;
   205     //  Read in the confusables source file
   207     int32_t      confusablesLen = 0;
   208     const char  *confusables = readFile(confFileName, &confusablesLen);
   209     if (confusables == NULL) {
   210         printf("gencfu: error reading file  \"%s\"\n", confFileName);
   211         exit(-1);
   212     }
   214     int32_t     wsConfusablesLen = 0;
   215     const char *wsConfsables =  readFile(confWSFileName, &wsConfusablesLen);
   216     if (wsConfsables == NULL) {
   217         printf("gencfu: error reading file  \"%s\"\n", confFileName);
   218         exit(-1);
   219     }
   221     //
   222     //  Create the Spoof Detector from the source confusables files.
   223     //     This will compile the data.
   224     //
   225     UParseError parseError;
   226     parseError.line = 0;
   227     parseError.offset = 0;
   228     int32_t errType;
   229     USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
   230                                               wsConfsables, wsConfusablesLen,
   231                                               &errType, &parseError, &status);
   232     if (U_FAILURE(status)) {
   233         const char *errFile = 
   234             (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
   235         fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
   236                 u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
   237         exit(status);
   238     };
   241     //
   242     //  Get the compiled rule data from the USpoofChecker.
   243     //
   244     uint32_t        outDataSize;
   245     uint8_t        *outData;
   246     outDataSize = uspoof_serialize(sc, NULL, 0, &status);
   247     if (status != U_BUFFER_OVERFLOW_ERROR) {
   248         fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
   249         exit(status);
   250     }
   251     status = U_ZERO_ERROR;
   252     outData = new uint8_t[outDataSize];
   253     uspoof_serialize(sc, outData, outDataSize, &status);
   255     // Copy the data format version numbers from the spoof data header into the UDataMemory header.
   257     uprv_memcpy(dh.info.formatVersion, 
   258                 reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
   259                 sizeof(dh.info.formatVersion));
   261     //
   262     //  Create the output file
   263     //
   264     size_t bytesWritten;
   265     UNewDataMemory *pData;
   266     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
   267     if(U_FAILURE(status)) {
   268         fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n", 
   269                          outFileName, u_errorName(status));
   270         exit(status);
   271     }
   274     //  Write the data itself.
   275     udata_writeBlock(pData, outData, outDataSize);
   276     // finish up 
   277     bytesWritten = udata_finish(pData, &status);
   278     if(U_FAILURE(status)) {
   279         fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
   280         exit(status);
   281     }
   283     if (bytesWritten != outDataSize) {
   284         fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
   285         exit(-1);
   286     }
   288     uspoof_close(sc);
   289     delete [] outData;
   290     delete [] confusables;
   291     delete [] wsConfsables;
   292     u_cleanup();
   293     printf("gencfu: tool completed successfully.\n");
   294     return 0;
   295 #endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
   296 }
   299  //
   300  //  Read in a confusables source file
   301  //
   302  static const char *readFile(const char *fileName, int32_t *len) {
   303     char       *result;
   304     long        fileSize;
   305     FILE        *file;
   307     file = fopen(fileName, "rb");
   308     if( file == 0 ) {
   309         return NULL;
   310     }
   311     fseek(file, 0, SEEK_END);
   312     fileSize = ftell(file);
   313     fseek(file, 0, SEEK_SET);
   314     result = new char[fileSize+10];
   315     if (result==NULL) {
   316         fclose(file);
   317         return NULL;
   318     }
   320     long t = fread(result, 1, fileSize, file);
   321     if (t != fileSize)  {
   322         delete [] result;
   323         fclose(file);
   324         return NULL;
   325     }
   326     result[fileSize]=0;
   327     *len = static_cast<int32_t>(fileSize);
   328     fclose(file);
   329     return result;
   330  }

mercurial