michael@0: // michael@0: // file: rbbirb.cpp michael@0: // michael@0: // Copyright (C) 2002-2011, International Business Machines Corporation and others. michael@0: // All Rights Reserved. michael@0: // michael@0: // This file contains the RBBIRuleBuilder class implementation. This is the main class for michael@0: // building (compiling) break rules into the tables required by the runtime michael@0: // RBBI engine. michael@0: // michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/brkiter.h" michael@0: #include "unicode/rbbi.h" michael@0: #include "unicode/ubrk.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uchriter.h" michael@0: #include "unicode/parsepos.h" michael@0: #include "unicode/parseerr.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: michael@0: #include "rbbirb.h" michael@0: #include "rbbinode.h" michael@0: michael@0: #include "rbbiscan.h" michael@0: #include "rbbisetb.h" michael@0: #include "rbbitblb.h" michael@0: #include "rbbidata.h" michael@0: michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: michael@0: //---------------------------------------------------------------------------------------- michael@0: // michael@0: // Constructor. michael@0: // michael@0: //---------------------------------------------------------------------------------------- michael@0: RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, michael@0: UParseError *parseErr, michael@0: UErrorCode &status) michael@0: : fRules(rules) michael@0: { michael@0: fStatus = &status; // status is checked below michael@0: fParseError = parseErr; michael@0: fDebugEnv = NULL; michael@0: #ifdef RBBI_DEBUG michael@0: fDebugEnv = getenv("U_RBBIDEBUG"); michael@0: #endif michael@0: michael@0: michael@0: fForwardTree = NULL; michael@0: fReverseTree = NULL; michael@0: fSafeFwdTree = NULL; michael@0: fSafeRevTree = NULL; michael@0: fDefaultTree = &fForwardTree; michael@0: fForwardTables = NULL; michael@0: fReverseTables = NULL; michael@0: fSafeFwdTables = NULL; michael@0: fSafeRevTables = NULL; michael@0: fRuleStatusVals = NULL; michael@0: fChainRules = FALSE; michael@0: fLBCMNoChain = FALSE; michael@0: fLookAheadHardBreak = FALSE; michael@0: fUSetNodes = NULL; michael@0: fRuleStatusVals = NULL; michael@0: fScanner = NULL; michael@0: fSetBuilder = NULL; michael@0: if (parseErr) { michael@0: uprv_memset(parseErr, 0, sizeof(UParseError)); michael@0: } michael@0: michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: fUSetNodes = new UVector(status); // bcos status gets overwritten here michael@0: fRuleStatusVals = new UVector(status); michael@0: fScanner = new RBBIRuleScanner(this); michael@0: fSetBuilder = new RBBISetBuilder(this); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //---------------------------------------------------------------------------------------- michael@0: // michael@0: // Destructor michael@0: // michael@0: //---------------------------------------------------------------------------------------- michael@0: RBBIRuleBuilder::~RBBIRuleBuilder() { michael@0: michael@0: int i; michael@0: for (i=0; ; i++) { michael@0: RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); michael@0: if (n==NULL) { michael@0: break; michael@0: } michael@0: delete n; michael@0: } michael@0: michael@0: delete fUSetNodes; michael@0: delete fSetBuilder; michael@0: delete fForwardTables; michael@0: delete fReverseTables; michael@0: delete fSafeFwdTables; michael@0: delete fSafeRevTables; michael@0: michael@0: delete fForwardTree; michael@0: delete fReverseTree; michael@0: delete fSafeFwdTree; michael@0: delete fSafeRevTree; michael@0: delete fScanner; michael@0: delete fRuleStatusVals; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: //---------------------------------------------------------------------------------------- michael@0: // michael@0: // flattenData() - Collect up the compiled RBBI rule data and put it into michael@0: // the format for saving in ICU data files, michael@0: // which is also the format needed by the RBBI runtime engine. michael@0: // michael@0: //---------------------------------------------------------------------------------------- michael@0: static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} michael@0: michael@0: RBBIDataHeader *RBBIRuleBuilder::flattenData() { michael@0: int32_t i; michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return NULL; michael@0: } michael@0: michael@0: // Remove comments and whitespace from the rules to make it smaller. michael@0: UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); michael@0: michael@0: // Calculate the size of each section in the data. michael@0: // Sizes here are padded up to a multiple of 8 for better memory alignment. michael@0: // Sections sizes actually stored in the header are for the actual data michael@0: // without the padding. michael@0: // michael@0: int32_t headerSize = align8(sizeof(RBBIDataHeader)); michael@0: int32_t forwardTableSize = align8(fForwardTables->getTableSize()); michael@0: int32_t reverseTableSize = align8(fReverseTables->getTableSize()); michael@0: int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); michael@0: int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); michael@0: int32_t trieSize = align8(fSetBuilder->getTrieSize()); michael@0: int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); michael@0: int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); michael@0: michael@0: int32_t totalSize = headerSize + forwardTableSize + reverseTableSize michael@0: + safeFwdTableSize + safeRevTableSize michael@0: + statusTableSize + trieSize + rulesSize; michael@0: michael@0: RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); michael@0: if (data == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memset(data, 0, totalSize); michael@0: michael@0: michael@0: data->fMagic = 0xb1a0; michael@0: data->fFormatVersion[0] = 3; michael@0: data->fFormatVersion[1] = 1; michael@0: data->fFormatVersion[2] = 0; michael@0: data->fFormatVersion[3] = 0; michael@0: data->fLength = totalSize; michael@0: data->fCatCount = fSetBuilder->getNumCharCategories(); michael@0: michael@0: data->fFTable = headerSize; michael@0: data->fFTableLen = forwardTableSize; michael@0: data->fRTable = data->fFTable + forwardTableSize; michael@0: data->fRTableLen = reverseTableSize; michael@0: data->fSFTable = data->fRTable + reverseTableSize; michael@0: data->fSFTableLen = safeFwdTableSize; michael@0: data->fSRTable = data->fSFTable + safeFwdTableSize; michael@0: data->fSRTableLen = safeRevTableSize; michael@0: michael@0: data->fTrie = data->fSRTable + safeRevTableSize; michael@0: data->fTrieLen = fSetBuilder->getTrieSize(); michael@0: data->fStatusTable = data->fTrie + trieSize; michael@0: data->fStatusTableLen= statusTableSize; michael@0: data->fRuleSource = data->fStatusTable + statusTableSize; michael@0: data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); michael@0: michael@0: uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); michael@0: michael@0: fForwardTables->exportTable((uint8_t *)data + data->fFTable); michael@0: fReverseTables->exportTable((uint8_t *)data + data->fRTable); michael@0: fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); michael@0: fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); michael@0: fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); michael@0: michael@0: int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); michael@0: for (i=0; isize(); i++) { michael@0: ruleStatusTable[i] = fRuleStatusVals->elementAti(i); michael@0: } michael@0: michael@0: strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); michael@0: michael@0: return data; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: //---------------------------------------------------------------------------------------- michael@0: // michael@0: // createRuleBasedBreakIterator construct from source rules that are passed in michael@0: // in a UnicodeString michael@0: // michael@0: //---------------------------------------------------------------------------------------- michael@0: BreakIterator * michael@0: RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, michael@0: UParseError *parseError, michael@0: UErrorCode &status) michael@0: { michael@0: // status checked below michael@0: michael@0: // michael@0: // Read the input rules, generate a parse tree, symbol table, michael@0: // and list of all Unicode Sets referenced by the rules. michael@0: // michael@0: RBBIRuleBuilder builder(rules, parseError, status); michael@0: if (U_FAILURE(status)) { // status checked here bcos build below doesn't michael@0: return NULL; michael@0: } michael@0: builder.fScanner->parse(); michael@0: michael@0: // michael@0: // UnicodeSet processing. michael@0: // Munge the Unicode Sets to create a set of character categories. michael@0: // Generate the mapping tables (TRIE) from input 32-bit characters to michael@0: // the character categories. michael@0: // michael@0: builder.fSetBuilder->build(); michael@0: michael@0: michael@0: // michael@0: // Generate the DFA state transition table. michael@0: // michael@0: builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); michael@0: builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); michael@0: builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); michael@0: builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); michael@0: if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || michael@0: builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) michael@0: { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: delete builder.fForwardTables; builder.fForwardTables = NULL; michael@0: delete builder.fReverseTables; builder.fReverseTables = NULL; michael@0: delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; michael@0: delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; michael@0: return NULL; michael@0: } michael@0: michael@0: builder.fForwardTables->build(); michael@0: builder.fReverseTables->build(); michael@0: builder.fSafeFwdTables->build(); michael@0: builder.fSafeRevTables->build(); michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { michael@0: builder.fForwardTables->printRuleStatusTable(); michael@0: } michael@0: #endif michael@0: michael@0: // michael@0: // Package up the compiled data into a memory image michael@0: // in the run-time format. michael@0: // michael@0: RBBIDataHeader *data = builder.flattenData(); // returns NULL if error michael@0: if (U_FAILURE(*builder.fStatus)) { michael@0: return NULL; michael@0: } michael@0: michael@0: michael@0: // michael@0: // Clean up the compiler related stuff michael@0: // michael@0: michael@0: michael@0: // michael@0: // Create a break iterator from the compiled rules. michael@0: // (Identical to creation from stored pre-compiled rules) michael@0: // michael@0: // status is checked after init in construction. michael@0: RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); michael@0: if (U_FAILURE(status)) { michael@0: delete This; michael@0: This = NULL; michael@0: } michael@0: else if(This == NULL) { // test for NULL michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return This; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */