intl/icu/source/tools/genbrk/genbrk.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2002-2009, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *
     7 * File genbrk.cpp
     8 */
    10 //--------------------------------------------------------------------
    11 //
    12 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
    13 //   .brk files contain the precompiled rules for standard types
    14 //   of iterators - word, line, sentence, etc.
    15 //
    16 //   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
    17 //
    18 //       options:   -v         verbose
    19 //                  -? or -h   help
    20 //
    21 //   The input rule file is a plain text file containing break rules
    22 //    in the input format accepted by RuleBasedBreakIterators.  The
    23 //    file can be encoded as UTF-8, or UTF-16 (either endian), or
    24 //    in the default code page (platform dependent).  UTF-encoded
    25 //    files must include a BOM.
    26 //
    27 //--------------------------------------------------------------------
    29 #include "unicode/utypes.h"
    30 #include "unicode/ucnv.h"
    31 #include "unicode/unistr.h"
    32 #include "unicode/rbbi.h"
    33 #include "unicode/uclean.h"
    34 #include "unicode/udata.h"
    35 #include "unicode/putil.h"
    37 #include "uoptions.h"
    38 #include "unewdata.h"
    39 #include "ucmndata.h"
    40 #include "rbbidata.h"
    41 #include "cmemory.h"
    43 #include <stdio.h>
    44 #include <stdlib.h>
    45 #include <string.h>
    47 U_NAMESPACE_USE
    49 static char *progName;
    50 static UOption options[]={
    51     UOPTION_HELP_H,             /* 0 */
    52     UOPTION_HELP_QUESTION_MARK, /* 1 */
    53     UOPTION_VERBOSE,            /* 2 */
    54     { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
    55     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */
    56     UOPTION_ICUDATADIR,         /* 5 */
    57     UOPTION_DESTDIR,            /* 6 */
    58     UOPTION_COPYRIGHT,          /* 7 */
    59 };
    61 void usageAndDie(int retCode) {
    62         printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
    63         printf("\tRead in break iteration rules text and write out the binary data\n"
    64             "options:\n"
    65             "\t-h or -? or --help  this usage text\n"
    66             "\t-V or --version     show a version message\n"
    67             "\t-c or --copyright   include a copyright notice\n"
    68             "\t-v or --verbose     turn on verbose output\n"
    69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    70             "\t                    followed by path, defaults to %s\n"
    71             "\t-d or --destdir     destination directory, followed by the path\n",
    72             u_getDataDirectory());
    73         exit (retCode);
    74 }
    77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    79 /* dummy UDataInfo cf. udata.h */
    80 static UDataInfo dummyDataInfo = {
    81     sizeof(UDataInfo),
    82     0,
    84     U_IS_BIG_ENDIAN,
    85     U_CHARSET_FAMILY,
    86     U_SIZEOF_UCHAR,
    87     0,
    89     { 0, 0, 0, 0 },                 /* dummy dataFormat */
    90     { 0, 0, 0, 0 },                 /* dummy formatVersion */
    91     { 0, 0, 0, 0 }                  /* dummy dataVersion */
    92 };
    94 #else
    96 //
    97 //  Set up the ICU data header, defined in ucmndata.h
    98 //
    99 DataHeader dh ={
   100     {sizeof(DataHeader),           // Struct MappedData
   101         0xda,
   102         0x27},
   104     {                               // struct UDataInfo
   105         sizeof(UDataInfo),          //     size
   106         0,                          //     reserved
   107         U_IS_BIG_ENDIAN,
   108         U_CHARSET_FAMILY,
   109         U_SIZEOF_UCHAR,
   110         0,                          //     reserved
   112     { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
   113     { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
   114                                     //      from the RBBI rule builder.  The  values declared
   115                                     //      here should never appear in any real RBBI data.
   116         { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
   117     }};
   119 #endif
   121 //----------------------------------------------------------------------------
   122 //
   123 //  main      for genbrk
   124 //
   125 //----------------------------------------------------------------------------
   126 int  main(int argc, char **argv) {
   127     UErrorCode  status = U_ZERO_ERROR;
   128     const char *ruleFileName;
   129     const char *outFileName;
   130     const char *outDir = NULL;
   131     const char *copyright = NULL;
   133     //
   134     // Pick up and check the command line arguments,
   135     //    using the standard ICU tool utils option handling.
   136     //
   137     U_MAIN_INIT_ARGS(argc, argv);
   138     progName = argv[0];
   139     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
   140     if(argc<0) {
   141         // Unrecognized option
   142         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
   143         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   144     }
   146     if(options[0].doesOccur || options[1].doesOccur) {
   147         //  -? or -h for help.
   148         usageAndDie(0);
   149     }
   151     if (!(options[3].doesOccur && options[4].doesOccur)) {
   152         fprintf(stderr, "rule file and output file must both be specified.\n");
   153         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
   154     }
   155     ruleFileName = options[3].value;
   156     outFileName  = options[4].value;
   158     if (options[5].doesOccur) {
   159         u_setDataDirectory(options[5].value);
   160     }
   162     status = U_ZERO_ERROR;
   164     /* Combine the directory with the file name */
   165     if(options[6].doesOccur) {
   166         outDir = options[6].value;
   167     }
   168     if (options[7].doesOccur) {
   169         copyright = U_COPYRIGHT_STRING;
   170     }
   172 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
   174     UNewDataMemory *pData;
   175     char msg[1024];
   177     /* write message with just the name */
   178     sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
   179     fprintf(stderr, "%s\n", msg);
   181     /* write the dummy data file */
   182     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
   183     udata_writeBlock(pData, msg, strlen(msg));
   184     udata_finish(pData, &status);
   185     return (int)status;
   187 #else
   188     /* Initialize ICU */
   189     u_init(&status);
   190     if (U_FAILURE(status)) {
   191         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
   192             argv[0], u_errorName(status));
   193         exit(1);
   194     }
   195     status = U_ZERO_ERROR;
   197     //
   198     //  Read in the rule source file
   199     //
   200     long        result;
   201     long        ruleFileSize;
   202     FILE        *file;
   203     char        *ruleBufferC;
   205     file = fopen(ruleFileName, "rb");
   206     if( file == 0 ) {
   207         fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
   208         exit(-1);
   209     }
   210     fseek(file, 0, SEEK_END);
   211     ruleFileSize = ftell(file);
   212     fseek(file, 0, SEEK_SET);
   213     ruleBufferC = new char[ruleFileSize+10];
   215     result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
   216     if (result != ruleFileSize)  {
   217         fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
   218         exit (-1);
   219     }
   220     ruleBufferC[ruleFileSize]=0;
   221     fclose(file);
   223     //
   224     // Look for a Unicode Signature (BOM) on the rule file
   225     //
   226     int32_t        signatureLength;
   227     const char *   ruleSourceC = ruleBufferC;
   228     const char*    encoding = ucnv_detectUnicodeSignature(
   229                            ruleSourceC, ruleFileSize, &signatureLength, &status);
   230     if (U_FAILURE(status)) {
   231         exit(status);
   232     }
   233     if(encoding!=NULL ){
   234         ruleSourceC  += signatureLength;
   235         ruleFileSize -= signatureLength;
   236     }
   238     //
   239     // Open a converter to take the rule file to UTF-16
   240     //
   241     UConverter* conv;
   242     conv = ucnv_open(encoding, &status);
   243     if (U_FAILURE(status)) {
   244         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
   245         exit(status);
   246     }
   248     //
   249     // Convert the rules to UChar.
   250     //  Preflight first to determine required buffer size.
   251     //
   252     uint32_t destCap = ucnv_toUChars(conv,
   253                        NULL,           //  dest,
   254                        0,              //  destCapacity,
   255                        ruleSourceC,
   256                        ruleFileSize,
   257                        &status);
   258     if (status != U_BUFFER_OVERFLOW_ERROR) {
   259         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   260         exit(status);
   261     };
   263     status = U_ZERO_ERROR;
   264     UChar *ruleSourceU = new UChar[destCap+1];
   265     ucnv_toUChars(conv,
   266                   ruleSourceU,     //  dest,
   267                   destCap+1,
   268                   ruleSourceC,
   269                   ruleFileSize,
   270                   &status);
   271     if (U_FAILURE(status)) {
   272         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   273         exit(status);
   274     };
   275     ucnv_close(conv);
   278     //
   279     //  Put the source rules into a UnicodeString
   280     //
   281     UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
   283     //
   284     //  Create the break iterator from the rules
   285     //     This will compile the rules.
   286     //
   287     UParseError parseError;
   288     parseError.line = 0;
   289     parseError.offset = 0;
   290     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
   291     if (U_FAILURE(status)) {
   292         fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
   293                 u_errorName(status), (int)parseError.line, (int)parseError.offset);
   294         exit(status);
   295     };
   298     //
   299     //  Get the compiled rule data from the break iterator.
   300     //
   301     uint32_t        outDataSize;
   302     const uint8_t  *outData;
   303     outData = bi->getBinaryRules(outDataSize);
   305     // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
   306     uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
   308     //
   309     //  Create the output file
   310     //
   311     size_t bytesWritten;
   312     UNewDataMemory *pData;
   313     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
   314     if(U_FAILURE(status)) {
   315         fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 
   316                          outFileName, u_errorName(status));
   317         exit(status);
   318     }
   321     //  Write the data itself.
   322     udata_writeBlock(pData, outData, outDataSize);
   323     // finish up 
   324     bytesWritten = udata_finish(pData, &status);
   325     if(U_FAILURE(status)) {
   326         fprintf(stderr, "genbrk: error %d writing the output file\n", status);
   327         exit(status);
   328     }
   330     if (bytesWritten != outDataSize) {
   331         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
   332         exit(-1);
   333     }
   335     delete bi;
   336     delete[] ruleSourceU;
   337     delete[] ruleBufferC;
   338     u_cleanup();
   341     printf("genbrk: tool completed successfully.\n");
   342     return 0;
   344 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   345 }

mercurial