intl/icu/source/common/rbbirb.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 //
michael@0 2 // file: rbbirb.cpp
michael@0 3 //
michael@0 4 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
michael@0 5 // All Rights Reserved.
michael@0 6 //
michael@0 7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
michael@0 8 // building (compiling) break rules into the tables required by the runtime
michael@0 9 // RBBI engine.
michael@0 10 //
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13
michael@0 14 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 15
michael@0 16 #include "unicode/brkiter.h"
michael@0 17 #include "unicode/rbbi.h"
michael@0 18 #include "unicode/ubrk.h"
michael@0 19 #include "unicode/unistr.h"
michael@0 20 #include "unicode/uniset.h"
michael@0 21 #include "unicode/uchar.h"
michael@0 22 #include "unicode/uchriter.h"
michael@0 23 #include "unicode/parsepos.h"
michael@0 24 #include "unicode/parseerr.h"
michael@0 25 #include "cmemory.h"
michael@0 26 #include "cstring.h"
michael@0 27
michael@0 28 #include "rbbirb.h"
michael@0 29 #include "rbbinode.h"
michael@0 30
michael@0 31 #include "rbbiscan.h"
michael@0 32 #include "rbbisetb.h"
michael@0 33 #include "rbbitblb.h"
michael@0 34 #include "rbbidata.h"
michael@0 35
michael@0 36
michael@0 37 U_NAMESPACE_BEGIN
michael@0 38
michael@0 39
michael@0 40 //----------------------------------------------------------------------------------------
michael@0 41 //
michael@0 42 // Constructor.
michael@0 43 //
michael@0 44 //----------------------------------------------------------------------------------------
michael@0 45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
michael@0 46 UParseError *parseErr,
michael@0 47 UErrorCode &status)
michael@0 48 : fRules(rules)
michael@0 49 {
michael@0 50 fStatus = &status; // status is checked below
michael@0 51 fParseError = parseErr;
michael@0 52 fDebugEnv = NULL;
michael@0 53 #ifdef RBBI_DEBUG
michael@0 54 fDebugEnv = getenv("U_RBBIDEBUG");
michael@0 55 #endif
michael@0 56
michael@0 57
michael@0 58 fForwardTree = NULL;
michael@0 59 fReverseTree = NULL;
michael@0 60 fSafeFwdTree = NULL;
michael@0 61 fSafeRevTree = NULL;
michael@0 62 fDefaultTree = &fForwardTree;
michael@0 63 fForwardTables = NULL;
michael@0 64 fReverseTables = NULL;
michael@0 65 fSafeFwdTables = NULL;
michael@0 66 fSafeRevTables = NULL;
michael@0 67 fRuleStatusVals = NULL;
michael@0 68 fChainRules = FALSE;
michael@0 69 fLBCMNoChain = FALSE;
michael@0 70 fLookAheadHardBreak = FALSE;
michael@0 71 fUSetNodes = NULL;
michael@0 72 fRuleStatusVals = NULL;
michael@0 73 fScanner = NULL;
michael@0 74 fSetBuilder = NULL;
michael@0 75 if (parseErr) {
michael@0 76 uprv_memset(parseErr, 0, sizeof(UParseError));
michael@0 77 }
michael@0 78
michael@0 79 if (U_FAILURE(status)) {
michael@0 80 return;
michael@0 81 }
michael@0 82
michael@0 83 fUSetNodes = new UVector(status); // bcos status gets overwritten here
michael@0 84 fRuleStatusVals = new UVector(status);
michael@0 85 fScanner = new RBBIRuleScanner(this);
michael@0 86 fSetBuilder = new RBBISetBuilder(this);
michael@0 87 if (U_FAILURE(status)) {
michael@0 88 return;
michael@0 89 }
michael@0 90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
michael@0 91 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 92 }
michael@0 93 }
michael@0 94
michael@0 95
michael@0 96
michael@0 97 //----------------------------------------------------------------------------------------
michael@0 98 //
michael@0 99 // Destructor
michael@0 100 //
michael@0 101 //----------------------------------------------------------------------------------------
michael@0 102 RBBIRuleBuilder::~RBBIRuleBuilder() {
michael@0 103
michael@0 104 int i;
michael@0 105 for (i=0; ; i++) {
michael@0 106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
michael@0 107 if (n==NULL) {
michael@0 108 break;
michael@0 109 }
michael@0 110 delete n;
michael@0 111 }
michael@0 112
michael@0 113 delete fUSetNodes;
michael@0 114 delete fSetBuilder;
michael@0 115 delete fForwardTables;
michael@0 116 delete fReverseTables;
michael@0 117 delete fSafeFwdTables;
michael@0 118 delete fSafeRevTables;
michael@0 119
michael@0 120 delete fForwardTree;
michael@0 121 delete fReverseTree;
michael@0 122 delete fSafeFwdTree;
michael@0 123 delete fSafeRevTree;
michael@0 124 delete fScanner;
michael@0 125 delete fRuleStatusVals;
michael@0 126 }
michael@0 127
michael@0 128
michael@0 129
michael@0 130
michael@0 131
michael@0 132 //----------------------------------------------------------------------------------------
michael@0 133 //
michael@0 134 // flattenData() - Collect up the compiled RBBI rule data and put it into
michael@0 135 // the format for saving in ICU data files,
michael@0 136 // which is also the format needed by the RBBI runtime engine.
michael@0 137 //
michael@0 138 //----------------------------------------------------------------------------------------
// Round i up to the next multiple of 8 (values already on an 8 boundary are unchanged).
static int32_t align8(int32_t i) {
    // Step past the boundary, then clear the three low-order bits.
    return (i + 7) & ~7;
}
michael@0 140
michael@0 141 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
michael@0 142 int32_t i;
michael@0 143
michael@0 144 if (U_FAILURE(*fStatus)) {
michael@0 145 return NULL;
michael@0 146 }
michael@0 147
michael@0 148 // Remove comments and whitespace from the rules to make it smaller.
michael@0 149 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
michael@0 150
michael@0 151 // Calculate the size of each section in the data.
michael@0 152 // Sizes here are padded up to a multiple of 8 for better memory alignment.
michael@0 153 // Sections sizes actually stored in the header are for the actual data
michael@0 154 // without the padding.
michael@0 155 //
michael@0 156 int32_t headerSize = align8(sizeof(RBBIDataHeader));
michael@0 157 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
michael@0 158 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
michael@0 159 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
michael@0 160 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
michael@0 161 int32_t trieSize = align8(fSetBuilder->getTrieSize());
michael@0 162 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
michael@0 163 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
michael@0 164
michael@0 165 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
michael@0 166 + safeFwdTableSize + safeRevTableSize
michael@0 167 + statusTableSize + trieSize + rulesSize;
michael@0 168
michael@0 169 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
michael@0 170 if (data == NULL) {
michael@0 171 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 172 return NULL;
michael@0 173 }
michael@0 174 uprv_memset(data, 0, totalSize);
michael@0 175
michael@0 176
michael@0 177 data->fMagic = 0xb1a0;
michael@0 178 data->fFormatVersion[0] = 3;
michael@0 179 data->fFormatVersion[1] = 1;
michael@0 180 data->fFormatVersion[2] = 0;
michael@0 181 data->fFormatVersion[3] = 0;
michael@0 182 data->fLength = totalSize;
michael@0 183 data->fCatCount = fSetBuilder->getNumCharCategories();
michael@0 184
michael@0 185 data->fFTable = headerSize;
michael@0 186 data->fFTableLen = forwardTableSize;
michael@0 187 data->fRTable = data->fFTable + forwardTableSize;
michael@0 188 data->fRTableLen = reverseTableSize;
michael@0 189 data->fSFTable = data->fRTable + reverseTableSize;
michael@0 190 data->fSFTableLen = safeFwdTableSize;
michael@0 191 data->fSRTable = data->fSFTable + safeFwdTableSize;
michael@0 192 data->fSRTableLen = safeRevTableSize;
michael@0 193
michael@0 194 data->fTrie = data->fSRTable + safeRevTableSize;
michael@0 195 data->fTrieLen = fSetBuilder->getTrieSize();
michael@0 196 data->fStatusTable = data->fTrie + trieSize;
michael@0 197 data->fStatusTableLen= statusTableSize;
michael@0 198 data->fRuleSource = data->fStatusTable + statusTableSize;
michael@0 199 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
michael@0 200
michael@0 201 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
michael@0 202
michael@0 203 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
michael@0 204 fReverseTables->exportTable((uint8_t *)data + data->fRTable);
michael@0 205 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
michael@0 206 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
michael@0 207 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
michael@0 208
michael@0 209 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
michael@0 210 for (i=0; i<fRuleStatusVals->size(); i++) {
michael@0 211 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
michael@0 212 }
michael@0 213
michael@0 214 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
michael@0 215
michael@0 216 return data;
michael@0 217 }
michael@0 218
michael@0 219
michael@0 220
michael@0 221
michael@0 222
michael@0 223
michael@0 224 //----------------------------------------------------------------------------------------
michael@0 225 //
michael@0 226 // createRuleBasedBreakIterator construct a break iterator from source rules
michael@0 227 // that are passed in as a UnicodeString
michael@0 228 //
michael@0 229 //----------------------------------------------------------------------------------------
michael@0 230 BreakIterator *
michael@0 231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
michael@0 232 UParseError *parseError,
michael@0 233 UErrorCode &status)
michael@0 234 {
michael@0 235 // status checked below
michael@0 236
michael@0 237 //
michael@0 238 // Read the input rules, generate a parse tree, symbol table,
michael@0 239 // and list of all Unicode Sets referenced by the rules.
michael@0 240 //
michael@0 241 RBBIRuleBuilder builder(rules, parseError, status);
michael@0 242 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
michael@0 243 return NULL;
michael@0 244 }
michael@0 245 builder.fScanner->parse();
michael@0 246
michael@0 247 //
michael@0 248 // UnicodeSet processing.
michael@0 249 // Munge the Unicode Sets to create a set of character categories.
michael@0 250 // Generate the mapping tables (TRIE) from input 32-bit characters to
michael@0 251 // the character categories.
michael@0 252 //
michael@0 253 builder.fSetBuilder->build();
michael@0 254
michael@0 255
michael@0 256 //
michael@0 257 // Generate the DFA state transition table.
michael@0 258 //
michael@0 259 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
michael@0 260 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
michael@0 261 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
michael@0 262 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
michael@0 263 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
michael@0 264 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
michael@0 265 {
michael@0 266 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 267 delete builder.fForwardTables; builder.fForwardTables = NULL;
michael@0 268 delete builder.fReverseTables; builder.fReverseTables = NULL;
michael@0 269 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
michael@0 270 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
michael@0 271 return NULL;
michael@0 272 }
michael@0 273
michael@0 274 builder.fForwardTables->build();
michael@0 275 builder.fReverseTables->build();
michael@0 276 builder.fSafeFwdTables->build();
michael@0 277 builder.fSafeRevTables->build();
michael@0 278
michael@0 279 #ifdef RBBI_DEBUG
michael@0 280 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
michael@0 281 builder.fForwardTables->printRuleStatusTable();
michael@0 282 }
michael@0 283 #endif
michael@0 284
michael@0 285 //
michael@0 286 // Package up the compiled data into a memory image
michael@0 287 // in the run-time format.
michael@0 288 //
michael@0 289 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
michael@0 290 if (U_FAILURE(*builder.fStatus)) {
michael@0 291 return NULL;
michael@0 292 }
michael@0 293
michael@0 294
michael@0 295 //
michael@0 296 // Clean up the compiler related stuff
michael@0 297 //
michael@0 298
michael@0 299
michael@0 300 //
michael@0 301 // Create a break iterator from the compiled rules.
michael@0 302 // (Identical to creation from stored pre-compiled rules)
michael@0 303 //
michael@0 304 // status is checked after init in construction.
michael@0 305 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
michael@0 306 if (U_FAILURE(status)) {
michael@0 307 delete This;
michael@0 308 This = NULL;
michael@0 309 }
michael@0 310 else if(This == NULL) { // test for NULL
michael@0 311 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 312 }
michael@0 313 return This;
michael@0 314 }
michael@0 315
michael@0 316 U_NAMESPACE_END
michael@0 317
michael@0 318 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial