1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbirb.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,318 @@ 1.4 +// 1.5 +// file: rbbirb.cpp 1.6 +// 1.7 +// Copyright (C) 2002-2011, International Business Machines Corporation and others. 1.8 +// All Rights Reserved. 1.9 +// 1.10 +// This file contains the RBBIRuleBuilder class implementation. This is the main class for 1.11 +// building (compiling) break rules into the tables required by the runtime 1.12 +// RBBI engine. 1.13 +// 1.14 + 1.15 +#include "unicode/utypes.h" 1.16 + 1.17 +#if !UCONFIG_NO_BREAK_ITERATION 1.18 + 1.19 +#include "unicode/brkiter.h" 1.20 +#include "unicode/rbbi.h" 1.21 +#include "unicode/ubrk.h" 1.22 +#include "unicode/unistr.h" 1.23 +#include "unicode/uniset.h" 1.24 +#include "unicode/uchar.h" 1.25 +#include "unicode/uchriter.h" 1.26 +#include "unicode/parsepos.h" 1.27 +#include "unicode/parseerr.h" 1.28 +#include "cmemory.h" 1.29 +#include "cstring.h" 1.30 + 1.31 +#include "rbbirb.h" 1.32 +#include "rbbinode.h" 1.33 + 1.34 +#include "rbbiscan.h" 1.35 +#include "rbbisetb.h" 1.36 +#include "rbbitblb.h" 1.37 +#include "rbbidata.h" 1.38 + 1.39 + 1.40 +U_NAMESPACE_BEGIN 1.41 + 1.42 + 1.43 +//---------------------------------------------------------------------------------------- 1.44 +// 1.45 +// Constructor. 1.46 +// 1.47 +//---------------------------------------------------------------------------------------- 1.48 +RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 1.49 + UParseError *parseErr, 1.50 + UErrorCode &status) 1.51 + : fRules(rules) 1.52 +{ 1.53 + fStatus = &status; // status is checked below 1.54 + fParseError = parseErr; 1.55 + fDebugEnv = NULL; 1.56 +#ifdef RBBI_DEBUG 1.57 + fDebugEnv = getenv("U_RBBIDEBUG"); 1.58 +#endif 1.59 + 1.60 + 1.61 + fForwardTree = NULL; 1.62 + fReverseTree = NULL; 1.63 + fSafeFwdTree = NULL; 1.64 + fSafeRevTree = NULL; 1.65 + fDefaultTree = &fForwardTree; 1.66 + fForwardTables = NULL; 1.67 + fReverseTables = NULL; 1.68 + fSafeFwdTables = NULL; 1.69 + fSafeRevTables = NULL; 1.70 + fRuleStatusVals = NULL; 1.71 + fChainRules = FALSE; 1.72 + fLBCMNoChain = FALSE; 1.73 + fLookAheadHardBreak = FALSE; 1.74 + fUSetNodes = NULL; 1.75 + fRuleStatusVals = NULL; 1.76 + fScanner = NULL; 1.77 + fSetBuilder = NULL; 1.78 + if (parseErr) { 1.79 + uprv_memset(parseErr, 0, sizeof(UParseError)); 1.80 + } 1.81 + 1.82 + if (U_FAILURE(status)) { 1.83 + return; 1.84 + } 1.85 + 1.86 + fUSetNodes = new UVector(status); // bcos status gets overwritten here 1.87 + fRuleStatusVals = new UVector(status); 1.88 + fScanner = new RBBIRuleScanner(this); 1.89 + fSetBuilder = new RBBISetBuilder(this); 1.90 + if (U_FAILURE(status)) { 1.91 + return; 1.92 + } 1.93 + if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { 1.94 + status = U_MEMORY_ALLOCATION_ERROR; 1.95 + } 1.96 +} 1.97 + 1.98 + 1.99 + 1.100 +//---------------------------------------------------------------------------------------- 1.101 +// 1.102 +// Destructor 1.103 +// 1.104 +//---------------------------------------------------------------------------------------- 1.105 +RBBIRuleBuilder::~RBBIRuleBuilder() { 1.106 + 1.107 + int i; 1.108 + for (i=0; ; i++) { 1.109 + RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); 1.110 + if (n==NULL) { 1.111 + break; 1.112 + } 1.113 + delete n; 1.114 + } 1.115 + 1.116 + delete fUSetNodes; 1.117 + delete fSetBuilder; 1.118 + delete fForwardTables; 1.119 + delete fReverseTables; 1.120 + delete fSafeFwdTables; 1.121 + delete fSafeRevTables; 1.122 + 1.123 + delete fForwardTree; 1.124 + delete fReverseTree; 1.125 + delete fSafeFwdTree; 1.126 + delete fSafeRevTree; 1.127 + delete fScanner; 1.128 + delete fRuleStatusVals; 1.129 +} 1.130 + 1.131 + 1.132 + 1.133 + 1.134 + 1.135 +//---------------------------------------------------------------------------------------- 1.136 +// 1.137 +// flattenData() - Collect up the compiled RBBI rule data and put it into 1.138 +// the format for saving in ICU data files, 1.139 +// which is also the format needed by the RBBI runtime engine. 1.140 +// 1.141 +//---------------------------------------------------------------------------------------- 1.142 +static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 1.143 + 1.144 +RBBIDataHeader *RBBIRuleBuilder::flattenData() { 1.145 + int32_t i; 1.146 + 1.147 + if (U_FAILURE(*fStatus)) { 1.148 + return NULL; 1.149 + } 1.150 + 1.151 + // Remove comments and whitespace from the rules to make it smaller. 1.152 + UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); 1.153 + 1.154 + // Calculate the size of each section in the data. 1.155 + // Sizes here are padded up to a multiple of 8 for better memory alignment. 1.156 + // Sections sizes actually stored in the header are for the actual data 1.157 + // without the padding. 1.158 + // 1.159 + int32_t headerSize = align8(sizeof(RBBIDataHeader)); 1.160 + int32_t forwardTableSize = align8(fForwardTables->getTableSize()); 1.161 + int32_t reverseTableSize = align8(fReverseTables->getTableSize()); 1.162 + int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); 1.163 + int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); 1.164 + int32_t trieSize = align8(fSetBuilder->getTrieSize()); 1.165 + int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 1.166 + int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); 1.167 + 1.168 + int32_t totalSize = headerSize + forwardTableSize + reverseTableSize 1.169 + + safeFwdTableSize + safeRevTableSize 1.170 + + statusTableSize + trieSize + rulesSize; 1.171 + 1.172 + RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); 1.173 + if (data == NULL) { 1.174 + *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.175 + return NULL; 1.176 + } 1.177 + uprv_memset(data, 0, totalSize); 1.178 + 1.179 + 1.180 + data->fMagic = 0xb1a0; 1.181 + data->fFormatVersion[0] = 3; 1.182 + data->fFormatVersion[1] = 1; 1.183 + data->fFormatVersion[2] = 0; 1.184 + data->fFormatVersion[3] = 0; 1.185 + data->fLength = totalSize; 1.186 + data->fCatCount = fSetBuilder->getNumCharCategories(); 1.187 + 1.188 + data->fFTable = headerSize; 1.189 + data->fFTableLen = forwardTableSize; 1.190 + data->fRTable = data->fFTable + forwardTableSize; 1.191 + data->fRTableLen = reverseTableSize; 1.192 + data->fSFTable = data->fRTable + reverseTableSize; 1.193 + data->fSFTableLen = safeFwdTableSize; 1.194 + data->fSRTable = data->fSFTable + safeFwdTableSize; 1.195 + data->fSRTableLen = safeRevTableSize; 1.196 + 1.197 + data->fTrie = data->fSRTable + safeRevTableSize; 1.198 + data->fTrieLen = fSetBuilder->getTrieSize(); 1.199 + data->fStatusTable = data->fTrie + trieSize; 1.200 + data->fStatusTableLen= statusTableSize; 1.201 + data->fRuleSource = data->fStatusTable + statusTableSize; 1.202 + data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); 1.203 + 1.204 + uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 1.205 + 1.206 + fForwardTables->exportTable((uint8_t *)data + data->fFTable); 1.207 + fReverseTables->exportTable((uint8_t *)data + data->fRTable); 1.208 + fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); 1.209 + fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); 1.210 + fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); 1.211 + 1.212 + int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); 1.213 + for (i=0; i<fRuleStatusVals->size(); i++) { 1.214 + ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 1.215 + } 1.216 + 1.217 + strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); 1.218 + 1.219 + return data; 1.220 +} 1.221 + 1.222 + 1.223 + 1.224 + 1.225 + 1.226 + 1.227 +//---------------------------------------------------------------------------------------- 1.228 +// 1.229 +// createRuleBasedBreakIterator construct from source rules that are passed in 1.230 +// in a UnicodeString 1.231 +// 1.232 +//---------------------------------------------------------------------------------------- 1.233 +BreakIterator * 1.234 +RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 1.235 + UParseError *parseError, 1.236 + UErrorCode &status) 1.237 +{ 1.238 + // status checked below 1.239 + 1.240 + // 1.241 + // Read the input rules, generate a parse tree, symbol table, 1.242 + // and list of all Unicode Sets referenced by the rules. 1.243 + // 1.244 + RBBIRuleBuilder builder(rules, parseError, status); 1.245 + if (U_FAILURE(status)) { // status checked here bcos build below doesn't 1.246 + return NULL; 1.247 + } 1.248 + builder.fScanner->parse(); 1.249 + 1.250 + // 1.251 + // UnicodeSet processing. 1.252 + // Munge the Unicode Sets to create a set of character categories. 1.253 + // Generate the mapping tables (TRIE) from input 32-bit characters to 1.254 + // the character categories. 1.255 + // 1.256 + builder.fSetBuilder->build(); 1.257 + 1.258 + 1.259 + // 1.260 + // Generate the DFA state transition table. 1.261 + // 1.262 + builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); 1.263 + builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); 1.264 + builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); 1.265 + builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); 1.266 + if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || 1.267 + builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) 1.268 + { 1.269 + status = U_MEMORY_ALLOCATION_ERROR; 1.270 + delete builder.fForwardTables; builder.fForwardTables = NULL; 1.271 + delete builder.fReverseTables; builder.fReverseTables = NULL; 1.272 + delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; 1.273 + delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; 1.274 + return NULL; 1.275 + } 1.276 + 1.277 + builder.fForwardTables->build(); 1.278 + builder.fReverseTables->build(); 1.279 + builder.fSafeFwdTables->build(); 1.280 + builder.fSafeRevTables->build(); 1.281 + 1.282 +#ifdef RBBI_DEBUG 1.283 + if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { 1.284 + builder.fForwardTables->printRuleStatusTable(); 1.285 + } 1.286 +#endif 1.287 + 1.288 + // 1.289 + // Package up the compiled data into a memory image 1.290 + // in the run-time format. 1.291 + // 1.292 + RBBIDataHeader *data = builder.flattenData(); // returns NULL if error 1.293 + if (U_FAILURE(*builder.fStatus)) { 1.294 + return NULL; 1.295 + } 1.296 + 1.297 + 1.298 + // 1.299 + // Clean up the compiler related stuff 1.300 + // 1.301 + 1.302 + 1.303 + // 1.304 + // Create a break iterator from the compiled rules. 1.305 + // (Identical to creation from stored pre-compiled rules) 1.306 + // 1.307 + // status is checked after init in construction. 1.308 + RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 1.309 + if (U_FAILURE(status)) { 1.310 + delete This; 1.311 + This = NULL; 1.312 + } 1.313 + else if(This == NULL) { // test for NULL 1.314 + status = U_MEMORY_ALLOCATION_ERROR; 1.315 + } 1.316 + return This; 1.317 +} 1.318 + 1.319 +U_NAMESPACE_END 1.320 + 1.321 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */