intl/icu/source/common/rbbirb.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbirb.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,318 @@
     1.4 +//
     1.5 +//  file:  rbbirb.cpp
     1.6 +//
     1.7 +//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
     1.8 +//  All Rights Reserved.
     1.9 +//
    1.10 +//  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
    1.11 +//    building (compiling) break rules into the tables required by the runtime
    1.12 +//    RBBI engine.
    1.13 +//
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +
    1.17 +#if !UCONFIG_NO_BREAK_ITERATION
    1.18 +
    1.19 +#include "unicode/brkiter.h"
    1.20 +#include "unicode/rbbi.h"
    1.21 +#include "unicode/ubrk.h"
    1.22 +#include "unicode/unistr.h"
    1.23 +#include "unicode/uniset.h"
    1.24 +#include "unicode/uchar.h"
    1.25 +#include "unicode/uchriter.h"
    1.26 +#include "unicode/parsepos.h"
    1.27 +#include "unicode/parseerr.h"
    1.28 +#include "cmemory.h"
    1.29 +#include "cstring.h"
    1.30 +
    1.31 +#include "rbbirb.h"
    1.32 +#include "rbbinode.h"
    1.33 +
    1.34 +#include "rbbiscan.h"
    1.35 +#include "rbbisetb.h"
    1.36 +#include "rbbitblb.h"
    1.37 +#include "rbbidata.h"
    1.38 +
    1.39 +
    1.40 +U_NAMESPACE_BEGIN
    1.41 +
    1.42 +
    1.43 +//----------------------------------------------------------------------------------------
    1.44 +//
    1.45 +//  Constructor.
    1.46 +//
    1.47 +//----------------------------------------------------------------------------------------
    1.48 +RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
    1.49 +                                       UParseError     *parseErr,
    1.50 +                                       UErrorCode      &status)
    1.51 + : fRules(rules)
    1.52 +{
    1.53 +    fStatus = &status; // status is checked below
    1.54 +    fParseError = parseErr;
    1.55 +    fDebugEnv   = NULL;
    1.56 +#ifdef RBBI_DEBUG
    1.57 +    fDebugEnv   = getenv("U_RBBIDEBUG");
    1.58 +#endif
    1.59 +
    1.60 +
    1.61 +    fForwardTree        = NULL;
    1.62 +    fReverseTree        = NULL;
    1.63 +    fSafeFwdTree        = NULL;
    1.64 +    fSafeRevTree        = NULL;
    1.65 +    fDefaultTree        = &fForwardTree;
    1.66 +    fForwardTables      = NULL;
    1.67 +    fReverseTables      = NULL;
    1.68 +    fSafeFwdTables      = NULL;
    1.69 +    fSafeRevTables      = NULL;
    1.70 +    fRuleStatusVals     = NULL;
    1.71 +    fChainRules         = FALSE;
    1.72 +    fLBCMNoChain        = FALSE;
    1.73 +    fLookAheadHardBreak = FALSE;
    1.74 +    fUSetNodes          = NULL;
    1.75 +    fRuleStatusVals     = NULL;
    1.76 +    fScanner            = NULL;
    1.77 +    fSetBuilder         = NULL;
    1.78 +    if (parseErr) {
    1.79 +        uprv_memset(parseErr, 0, sizeof(UParseError));
    1.80 +    }
    1.81 +
    1.82 +    if (U_FAILURE(status)) {
    1.83 +        return;
    1.84 +    }
    1.85 +
    1.86 +    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
    1.87 +    fRuleStatusVals     = new UVector(status);
    1.88 +    fScanner            = new RBBIRuleScanner(this);
    1.89 +    fSetBuilder         = new RBBISetBuilder(this);
    1.90 +    if (U_FAILURE(status)) {
    1.91 +        return;
    1.92 +    }
    1.93 +    if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
    1.94 +        status = U_MEMORY_ALLOCATION_ERROR;
    1.95 +    }
    1.96 +}
    1.97 +
    1.98 +
    1.99 +
   1.100 +//----------------------------------------------------------------------------------------
   1.101 +//
   1.102 +//  Destructor
   1.103 +//
   1.104 +//----------------------------------------------------------------------------------------
   1.105 +RBBIRuleBuilder::~RBBIRuleBuilder() {
   1.106 +
   1.107 +    int        i;
   1.108 +    for (i=0; ; i++) {
   1.109 +        RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
   1.110 +        if (n==NULL) {
   1.111 +            break;
   1.112 +        }
   1.113 +        delete n;
   1.114 +    }
   1.115 +
   1.116 +    delete fUSetNodes;
   1.117 +    delete fSetBuilder;
   1.118 +    delete fForwardTables;
   1.119 +    delete fReverseTables;
   1.120 +    delete fSafeFwdTables;
   1.121 +    delete fSafeRevTables;
   1.122 +
   1.123 +    delete fForwardTree;
   1.124 +    delete fReverseTree;
   1.125 +    delete fSafeFwdTree;
   1.126 +    delete fSafeRevTree;
   1.127 +    delete fScanner;
   1.128 +    delete fRuleStatusVals;
   1.129 +}
   1.130 +
   1.131 +
   1.132 +
   1.133 +
   1.134 +
   1.135 +//----------------------------------------------------------------------------------------
   1.136 +//
   1.137 +//   flattenData() -  Collect up the compiled RBBI rule data and put it into
   1.138 +//                    the format for saving in ICU data files,
   1.139 +//                    which is also the format needed by the RBBI runtime engine.
   1.140 +//
   1.141 +//----------------------------------------------------------------------------------------
   1.142 +static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
   1.143 +
   1.144 +RBBIDataHeader *RBBIRuleBuilder::flattenData() {
   1.145 +    int32_t    i;
   1.146 +
   1.147 +    if (U_FAILURE(*fStatus)) {
   1.148 +        return NULL;
   1.149 +    }
   1.150 +
   1.151 +    // Remove comments and whitespace from the rules to make it smaller.
   1.152 +    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
   1.153 +
   1.154 +    // Calculate the size of each section in the data.
   1.155 +    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
   1.156 +    //   Sections sizes actually stored in the header are for the actual data
   1.157 +    //     without the padding.
   1.158 +    //
   1.159 +    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
   1.160 +    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
   1.161 +    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
   1.162 +    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
   1.163 +    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
   1.164 +    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
   1.165 +    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
   1.166 +    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
   1.167 +
   1.168 +    int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
   1.169 +                                + safeFwdTableSize + safeRevTableSize 
   1.170 +                                + statusTableSize + trieSize + rulesSize;
   1.171 +
   1.172 +    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
   1.173 +    if (data == NULL) {
   1.174 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.175 +        return NULL;
   1.176 +    }
   1.177 +    uprv_memset(data, 0, totalSize);
   1.178 +
   1.179 +
   1.180 +    data->fMagic            = 0xb1a0;
   1.181 +    data->fFormatVersion[0] = 3;
   1.182 +    data->fFormatVersion[1] = 1;
   1.183 +    data->fFormatVersion[2] = 0;
   1.184 +    data->fFormatVersion[3] = 0;
   1.185 +    data->fLength           = totalSize;
   1.186 +    data->fCatCount         = fSetBuilder->getNumCharCategories();
   1.187 +
   1.188 +    data->fFTable        = headerSize;
   1.189 +    data->fFTableLen     = forwardTableSize;
   1.190 +    data->fRTable        = data->fFTable  + forwardTableSize;
   1.191 +    data->fRTableLen     = reverseTableSize;
   1.192 +    data->fSFTable       = data->fRTable  + reverseTableSize;
   1.193 +    data->fSFTableLen    = safeFwdTableSize;
   1.194 +    data->fSRTable       = data->fSFTable + safeFwdTableSize;
   1.195 +    data->fSRTableLen    = safeRevTableSize;
   1.196 +
   1.197 +    data->fTrie          = data->fSRTable + safeRevTableSize;
   1.198 +    data->fTrieLen       = fSetBuilder->getTrieSize();
   1.199 +    data->fStatusTable   = data->fTrie    + trieSize;
   1.200 +    data->fStatusTableLen= statusTableSize;
   1.201 +    data->fRuleSource    = data->fStatusTable + statusTableSize;
   1.202 +    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
   1.203 +
   1.204 +    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
   1.205 +
   1.206 +    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
   1.207 +    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
   1.208 +    fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
   1.209 +    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
   1.210 +    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
   1.211 +
   1.212 +    int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
   1.213 +    for (i=0; i<fRuleStatusVals->size(); i++) {
   1.214 +        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
   1.215 +    }
   1.216 +
   1.217 +    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
   1.218 +
   1.219 +    return data;
   1.220 +}
   1.221 +
   1.222 +
   1.223 +
   1.224 +
   1.225 +
   1.226 +
   1.227 +//----------------------------------------------------------------------------------------
   1.228 +//
   1.229 +//  createRuleBasedBreakIterator    construct from source rules that are passed in
   1.230 +//                                  in a UnicodeString
   1.231 +//
   1.232 +//----------------------------------------------------------------------------------------
   1.233 +BreakIterator *
   1.234 +RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
   1.235 +                                    UParseError      *parseError,
   1.236 +                                    UErrorCode       &status)
   1.237 +{
   1.238 +    // status checked below
   1.239 +
   1.240 +    //
   1.241 +    // Read the input rules, generate a parse tree, symbol table,
   1.242 +    // and list of all Unicode Sets referenced by the rules.
   1.243 +    //
   1.244 +    RBBIRuleBuilder  builder(rules, parseError, status);
   1.245 +    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
   1.246 +        return NULL;
   1.247 +    }
   1.248 +    builder.fScanner->parse();
   1.249 +
   1.250 +    //
   1.251 +    // UnicodeSet processing.
   1.252 +    //    Munge the Unicode Sets to create a set of character categories.
   1.253 +    //    Generate the mapping tables (TRIE) from input 32-bit characters to
   1.254 +    //    the character categories.
   1.255 +    //
   1.256 +    builder.fSetBuilder->build();
   1.257 +
   1.258 +
   1.259 +    //
   1.260 +    //   Generate the DFA state transition table.
   1.261 +    //
   1.262 +    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
   1.263 +    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
   1.264 +    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
   1.265 +    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
   1.266 +    if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
   1.267 +        builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
   1.268 +    {
   1.269 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.270 +        delete builder.fForwardTables; builder.fForwardTables = NULL;
   1.271 +        delete builder.fReverseTables; builder.fReverseTables = NULL;
   1.272 +        delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
   1.273 +        delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
   1.274 +        return NULL;
   1.275 +    }
   1.276 +
   1.277 +    builder.fForwardTables->build();
   1.278 +    builder.fReverseTables->build();
   1.279 +    builder.fSafeFwdTables->build();
   1.280 +    builder.fSafeRevTables->build();
   1.281 +
   1.282 +#ifdef RBBI_DEBUG
   1.283 +    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
   1.284 +        builder.fForwardTables->printRuleStatusTable();
   1.285 +    }
   1.286 +#endif
   1.287 +
   1.288 +    //
   1.289 +    //   Package up the compiled data into a memory image
   1.290 +    //      in the run-time format.
   1.291 +    //
   1.292 +    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
   1.293 +    if (U_FAILURE(*builder.fStatus)) {
   1.294 +        return NULL;
   1.295 +    }
   1.296 +
   1.297 +
   1.298 +    //
   1.299 +    //  Clean up the compiler related stuff
   1.300 +    //
   1.301 +
   1.302 +
   1.303 +    //
   1.304 +    //  Create a break iterator from the compiled rules.
   1.305 +    //     (Identical to creation from stored pre-compiled rules)
   1.306 +    //
   1.307 +    // status is checked after init in construction.
   1.308 +    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
   1.309 +    if (U_FAILURE(status)) {
   1.310 +        delete This;
   1.311 +        This = NULL;
   1.312 +    } 
   1.313 +    else if(This == NULL) { // test for NULL
   1.314 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.315 +    }
   1.316 +    return This;
   1.317 +}
   1.318 +
   1.319 +U_NAMESPACE_END
   1.320 +
   1.321 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial