intl/icu/source/tools/genbrk/genbrk.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/genbrk/genbrk.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,346 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2002-2009, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*
    1.10 +* File genbrk.cpp
    1.11 +*/
    1.12 +
    1.13 +//--------------------------------------------------------------------
    1.14 +//
    1.15 +//   Tool for generating RuleBasedBreakIterator data files (.brk files).
    1.16 +//   .brk files contain the precompiled rules for standard types
    1.17 +//   of iterators - word, line, sentence, etc.
    1.18 +//
    1.19 +//   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
    1.20 +//
    1.21 +//       options:   -v         verbose
    1.22 +//                  -? or -h   help
    1.23 +//
    1.24 +//   The input rule file is a plain text file containing break rules
    1.25 +//    in the input format accepted by RuleBasedBreakIterators.  The
    1.26 +//    file can be encoded as UTF-8, or UTF-16 (either endian), or
    1.27 +//    in the default code page (platform dependent).  UTF-encoded
    1.28 +//    files must include a BOM.
    1.29 +//
    1.30 +//--------------------------------------------------------------------
    1.31 +
    1.32 +#include "unicode/utypes.h"
    1.33 +#include "unicode/ucnv.h"
    1.34 +#include "unicode/unistr.h"
    1.35 +#include "unicode/rbbi.h"
    1.36 +#include "unicode/uclean.h"
    1.37 +#include "unicode/udata.h"
    1.38 +#include "unicode/putil.h"
    1.39 +
    1.40 +#include "uoptions.h"
    1.41 +#include "unewdata.h"
    1.42 +#include "ucmndata.h"
    1.43 +#include "rbbidata.h"
    1.44 +#include "cmemory.h"
    1.45 +
    1.46 +#include <stdio.h>
    1.47 +#include <stdlib.h>
    1.48 +#include <string.h>
    1.49 +
    1.50 +U_NAMESPACE_USE
    1.51 +
    1.52 +static char *progName;                  /* argv[0], shown in the usage text */
    1.53 +static UOption options[] = {
    1.54 +    /* 0: -h */ UOPTION_HELP_H,
    1.55 +    /* 1: -? */ UOPTION_HELP_QUESTION_MARK,
    1.56 +    /* 2: -v */ UOPTION_VERBOSE,
    1.57 +    /* 3: -r */ { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },
    1.58 +    /* 4: -o */ { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },
    1.59 +    /* 5: -i */ UOPTION_ICUDATADIR,
    1.60 +    /* 6: -d */ UOPTION_DESTDIR,
    1.61 +    /* 7: -c */ UOPTION_COPYRIGHT,
    1.62 +};
    1.63 +
    1.64 +// Print the command line help on stdout, then terminate the process with exit status retCode.
    1.65 +void usageAndDie(int retCode) {
    1.66 +        printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n"
    1.67 +            "\tRead in break iteration rules text and write out the binary data\n"
    1.68 +            "options:\n"
    1.69 +            "\t-h or -? or --help  this usage text\n"
    1.70 +            "\t-r or --rules       break rules source file, followed by the file name (required)\n"
    1.71 +            "\t-o or --out         output .brk file, followed by the file name (required)\n"
    1.72 +            "\t-c or --copyright   include a copyright notice\n"
    1.73 +            "\t-v or --verbose     turn on verbose output\n"
    1.74 +            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    1.75 +            "\t                    followed by path, defaults to %s\n"
    1.76 +            "\t-d or --destdir     destination directory, followed by the path\n", progName, u_getDataDirectory());
    1.77 +        exit (retCode);
    1.78 +}
    1.79 +
    1.80 +#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    1.81 +
    1.82 +/* Placeholder UDataInfo (cf. udata.h) for the dummy output file */
    1.83 +static UDataInfo dummyDataInfo = {
    1.84 +    sizeof(UDataInfo),
    1.85 +    0,
    1.86 +
    1.87 +    U_IS_BIG_ENDIAN,
    1.88 +    U_CHARSET_FAMILY,
    1.89 +    U_SIZEOF_UCHAR,
    1.90 +    0,
    1.91 +
    1.92 +    { 0, 0, 0, 0 },                 /* dataFormat: dummy */
    1.93 +    { 0, 0, 0, 0 },                 /* formatVersion: dummy */
    1.94 +    { 0, 0, 0, 0 }                  /* dataVersion: dummy */
    1.95 +};
    1.96 +
    1.97 +#else
    1.98 +
    1.99 +//
   1.100 +//  The ICU data header (type DataHeader, from ucmndata.h) for the .brk output
   1.101 +//
   1.102 +DataHeader dh = {
   1.103 +    // struct MappedData
   1.104 +    { sizeof(DataHeader), 0xda, 0x27 },
   1.105 +    // struct UDataInfo
   1.106 +    {
   1.107 +        sizeof(UDataInfo),              // size
   1.108 +        0,                              // reserved
   1.109 +        U_IS_BIG_ENDIAN,
   1.110 +        U_CHARSET_FAMILY,
   1.111 +        U_SIZEOF_UCHAR,
   1.112 +        0,                              // reserved
   1.113 +
   1.114 +        { 0x42, 0x72, 0x6b, 0x20 },     // dataFormat = "Brk "
   1.115 +        { 0xff, 0, 0, 0 },              // formatVersion: placeholder only.  main() overwrites
   1.116 +                                        //   it with the version taken from the compiled RBBI
   1.117 +                                        //   data, so 0xff should never reach a real file.
   1.118 +        { 4, 1, 0, 0 }                  // dataVersion (Unicode version)
   1.119 +    }
   1.120 +};
   1.121 +
   1.122 +#endif
   1.123 +
   1.124 +//----------------------------------------------------------------------------
   1.125 +//
   1.126 +//  main      for genbrk
   1.127 +//
   1.128 +//    Parses the options, reads the rule file and converts it to UTF-16,
   1.129 +//    compiles the rules and writes the binary result through udata_*.
   1.130 +//    Returns 0 on success; error paths end the process with a non-zero status.
   1.131 +//
   1.132 +//----------------------------------------------------------------------------
   1.133 +int  main(int argc, char **argv) {
   1.134 +    UErrorCode  status = U_ZERO_ERROR;
   1.135 +    const char *ruleFileName;
   1.136 +    const char *outFileName;
   1.137 +    const char *outDir = NULL;
   1.138 +    const char *copyright = NULL;
   1.139 +
   1.140 +    //
   1.141 +    // Pick up and check the command line arguments,
   1.142 +    //    using the standard ICU tool utils option handling.
   1.143 +    //
   1.144 +    U_MAIN_INIT_ARGS(argc, argv);
   1.145 +    progName = argv[0];
   1.146 +    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
   1.147 +    if(argc<0) {
   1.148 +        // Unrecognized option
   1.149 +        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
   1.150 +        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   1.151 +    }
   1.152 +
   1.153 +    if(options[0].doesOccur || options[1].doesOccur) {
   1.154 +        //  -? or -h for help.
   1.155 +        usageAndDie(0);
   1.156 +    }
   1.157 +
   1.158 +    if (!(options[3].doesOccur && options[4].doesOccur)) {
   1.159 +        fprintf(stderr, "rule file and output file must both be specified.\n");
   1.160 +        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   1.161 +    }
   1.162 +    ruleFileName = options[3].value;
   1.163 +    outFileName  = options[4].value;
   1.164 +
   1.165 +    if (options[5].doesOccur) {
   1.166 +        u_setDataDirectory(options[5].value);
   1.167 +    }
   1.168 +
   1.169 +    /* Optional destination directory and copyright notice */
   1.170 +    if(options[6].doesOccur) {
   1.171 +        outDir = options[6].value;
   1.172 +    }
   1.173 +    if (options[7].doesOccur) {
   1.174 +        copyright = U_COPYRIGHT_STRING;
   1.175 +    }
   1.176 +
   1.177 +#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
   1.178 +
   1.179 +    UNewDataMemory *pData;
   1.180 +    char msg[1024];
   1.181 +
   1.182 +    /* write message with just the name; the %.900s precision keeps a long name from overflowing msg */
   1.183 +    sprintf(msg, "genbrk writes dummy %.900s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
   1.184 +    fprintf(stderr, "%s\n", msg);
   1.185 +
   1.186 +    /* write the dummy data file */
   1.187 +    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
   1.188 +    udata_writeBlock(pData, msg, strlen(msg));
   1.189 +    udata_finish(pData, &status);
   1.190 +    return (int)status;
   1.191 +
   1.192 +#else
   1.193 +    /* Initialize ICU */
   1.194 +    u_init(&status);
   1.195 +    if (U_FAILURE(status)) {
   1.196 +        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
   1.197 +            argv[0], u_errorName(status));
   1.198 +        exit(1);
   1.199 +    }
   1.200 +    status = U_ZERO_ERROR;
   1.201 +
   1.202 +    //
   1.203 +    //  Read in the rule source file
   1.204 +    //
   1.205 +    long        result;
   1.206 +    long        ruleFileSize;
   1.207 +    FILE        *file;
   1.208 +    char        *ruleBufferC;
   1.209 +
   1.210 +    file = fopen(ruleFileName, "rb");
   1.211 +    if( file == 0 ) {
   1.212 +        fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
   1.213 +        exit(-1);
   1.214 +    }
   1.215 +    fseek(file, 0, SEEK_END);
   1.216 +    ruleFileSize = ftell(file);
   1.217 +    fseek(file, 0, SEEK_SET);
   1.218 +    // ftell() returns -1 on failure; larger sizes would not fit the int32_t lengths used below.
   1.219 +    if (ruleFileSize < 0 || ruleFileSize > 0x7fffffffL - 10) {
   1.220 +        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
   1.221 +        exit (-1);
   1.222 +    }
   1.223 +    ruleBufferC = new char[ruleFileSize+10];
   1.224 +
   1.225 +    result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
   1.226 +    if (result != ruleFileSize)  {
   1.227 +        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
   1.228 +        exit (-1);
   1.229 +    }
   1.230 +    ruleBufferC[ruleFileSize]=0;
   1.231 +    fclose(file);
   1.232 +
   1.233 +    //
   1.234 +    // Look for a Unicode Signature (BOM) on the rule file
   1.235 +    //
   1.236 +    int32_t        signatureLength;
   1.237 +    const char *   ruleSourceC = ruleBufferC;
   1.238 +    const char*    encoding = ucnv_detectUnicodeSignature(
   1.239 +                           ruleSourceC, ruleFileSize, &signatureLength, &status);
   1.240 +    if (U_FAILURE(status)) {
   1.241 +        fprintf(stderr, "ucnv_detectUnicodeSignature: ICU Error \"%s\"\n", u_errorName(status));
   1.242 +        exit(status);
   1.243 +    }
   1.244 +    if(encoding!=NULL ){
   1.245 +        ruleSourceC  += signatureLength;
   1.246 +        ruleFileSize -= signatureLength;
   1.247 +    }
   1.248 +
   1.249 +    //
   1.250 +    // Open a converter to take the rule file to UTF-16
   1.251 +    //
   1.252 +    UConverter* conv;
   1.253 +    conv = ucnv_open(encoding, &status);
   1.254 +    if (U_FAILURE(status)) {
   1.255 +        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
   1.256 +        exit(status);
   1.257 +    }
   1.258 +
   1.259 +    //
   1.260 +    // Convert the rules to UChar.
   1.261 +    //  Preflight first to determine required buffer size.
   1.262 +    //  Overflow is the expected preflight result; a non-failure status is
   1.263 +    //  tolerated as well so that empty input still reaches the rule compiler.
   1.264 +    //
   1.265 +    uint32_t destCap = ucnv_toUChars(conv, NULL, 0,          // no dest: preflight only
   1.266 +                                     ruleSourceC, ruleFileSize, &status);
   1.267 +    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
   1.268 +        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1.269 +        exit(status);
   1.270 +    }
   1.271 +
   1.272 +    status = U_ZERO_ERROR;
   1.273 +    UChar *ruleSourceU = new UChar[destCap+1];
   1.274 +    ucnv_toUChars(conv, ruleSourceU, destCap+1,
   1.275 +                  ruleSourceC, ruleFileSize, &status);
   1.276 +    if (U_FAILURE(status)) {
   1.277 +        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   1.278 +        exit(status);
   1.279 +    }
   1.280 +    ucnv_close(conv);
   1.281 +
   1.282 +    //
   1.283 +    //  Put the source rules into a UnicodeString
   1.284 +    //
   1.285 +    UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
   1.286 +
   1.287 +    //
   1.288 +    //  Create the break iterator from the rules
   1.289 +    //     This will compile the rules.
   1.290 +    //
   1.291 +    UParseError parseError;
   1.292 +    parseError.line = 0;
   1.293 +    parseError.offset = 0;
   1.294 +    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
   1.295 +    if (bi == NULL && U_SUCCESS(status)) {
   1.296 +        status = U_MEMORY_ALLOCATION_ERROR;   // never dereference a failed allocation below
   1.297 +    }
   1.298 +    if (U_FAILURE(status)) {
   1.299 +        fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
   1.300 +                u_errorName(status), (int)parseError.line, (int)parseError.offset);
   1.301 +        exit(status);
   1.302 +    }
   1.303 +
   1.304 +    //
   1.305 +    //  Get the compiled rule data from the break iterator.
   1.306 +    //
   1.307 +    uint32_t        outDataSize;
   1.308 +    const uint8_t  *outData;
   1.309 +    outData = bi->getBinaryRules(outDataSize);
   1.310 +
   1.311 +    // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
   1.312 +    uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
   1.313 +
   1.314 +    //
   1.315 +    //  Create the output file
   1.316 +    //
   1.317 +    size_t bytesWritten;
   1.318 +    UNewDataMemory *pData;
   1.319 +    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
   1.320 +    if(U_FAILURE(status)) {
   1.321 +        fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 
   1.322 +                         outFileName, u_errorName(status));
   1.323 +        exit(status);
   1.324 +    }
   1.325 +
   1.326 +    //  Write the data itself, then finish up.
   1.327 +    udata_writeBlock(pData, outData, outDataSize);
   1.328 +    bytesWritten = udata_finish(pData, &status);
   1.329 +    if(U_FAILURE(status)) {
   1.330 +        fprintf(stderr, "genbrk: error %d writing the output file\n", status);
   1.331 +        exit(status);
   1.332 +    }
   1.333 +
   1.334 +    if (bytesWritten != outDataSize) {
   1.335 +        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
   1.336 +        exit(-1);
   1.337 +    }
   1.338 +
   1.339 +    delete bi;
   1.340 +    delete[] ruleSourceU;
   1.341 +    delete[] ruleBufferC;
   1.342 +    u_cleanup();
   1.343 +
   1.344 +    printf("genbrk: tool completed successfully.\n");
   1.345 +    return 0;
   1.346 +
   1.347 +#endif /* UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO */
   1.348 +}
   1.349 +

mercurial