michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (c) 2002-2009, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: // michael@0: // rbbitblb.cpp michael@0: // michael@0: michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/unistr.h" michael@0: #include "rbbitblb.h" michael@0: #include "rbbirb.h" michael@0: #include "rbbisetb.h" michael@0: #include "rbbidata.h" michael@0: #include "cstring.h" michael@0: #include "uassert.h" michael@0: #include "cmemory.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) : michael@0: fTree(*rootNode) { michael@0: fRB = rb; michael@0: fStatus = fRB->fStatus; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: fDStates = new UVector(status); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: if (U_FAILURE(status)) { michael@0: *fStatus = status; michael@0: return; michael@0: } michael@0: if (fDStates == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR;; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: RBBITableBuilder::~RBBITableBuilder() { michael@0: int i; michael@0: for (i=0; isize(); i++) { michael@0: delete (RBBIStateDescriptor *)fDStates->elementAt(i); michael@0: } michael@0: delete fDStates; michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // RBBITableBuilder::build - This is the main function for building the DFA state transtion michael@0: // table from the RBBI rules parse tree. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::build() { michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: // If there were no rules, just return. This situation can easily arise michael@0: // for the reverse rules. michael@0: if (fTree==NULL) { michael@0: return; michael@0: } michael@0: michael@0: // michael@0: // Walk through the tree, replacing any references to $variables with a copy of the michael@0: // parse tree for the substition expression. michael@0: // michael@0: fTree = fTree->flattenVariables(); michael@0: #ifdef RBBI_DEBUG michael@0: if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { michael@0: RBBIDebugPuts("Parse tree after flattening variable references."); michael@0: fTree->printTree(TRUE); michael@0: } michael@0: #endif michael@0: michael@0: // michael@0: // If the rules contained any references to {bof} michael@0: // add a {bof} to the michael@0: // tree. Means that all matches must start out with the michael@0: // {bof} fake character. michael@0: // michael@0: if (fRB->fSetBuilder->sawBOF()) { michael@0: RBBINode *bofTop = new RBBINode(RBBINode::opCat); michael@0: RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); michael@0: // Delete and exit if memory allocation failed. michael@0: if (bofTop == NULL || bofLeaf == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: delete bofTop; michael@0: delete bofLeaf; michael@0: return; michael@0: } michael@0: bofTop->fLeftChild = bofLeaf; michael@0: bofTop->fRightChild = fTree; michael@0: bofLeaf->fParent = bofTop; michael@0: bofLeaf->fVal = 2; // Reserved value for {bof}. michael@0: fTree = bofTop; michael@0: } michael@0: michael@0: // michael@0: // Add a unique right-end marker to the expression. michael@0: // Appears as a cat-node, left child being the original tree, michael@0: // right child being the end marker. michael@0: // michael@0: RBBINode *cn = new RBBINode(RBBINode::opCat); michael@0: // Exit if memory allocation failed. michael@0: if (cn == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: cn->fLeftChild = fTree; michael@0: fTree->fParent = cn; michael@0: cn->fRightChild = new RBBINode(RBBINode::endMark); michael@0: // Delete and exit if memory allocation failed. michael@0: if (cn->fRightChild == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: delete cn; michael@0: return; michael@0: } michael@0: cn->fRightChild->fParent = cn; michael@0: fTree = cn; michael@0: michael@0: // michael@0: // Replace all references to UnicodeSets with the tree for the equivalent michael@0: // expression. michael@0: // michael@0: fTree->flattenSets(); michael@0: #ifdef RBBI_DEBUG michael@0: if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { michael@0: RBBIDebugPuts("Parse tree after flattening Unicode Set references."); michael@0: fTree->printTree(TRUE); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: // michael@0: // calculate the functions nullable, firstpos, lastpos and followpos on michael@0: // nodes in the parse tree. michael@0: // See the alogrithm description in Aho. michael@0: // Understanding how this works by looking at the code alone will be michael@0: // nearly impossible. michael@0: // michael@0: calcNullable(fTree); michael@0: calcFirstPos(fTree); michael@0: calcLastPos(fTree); michael@0: calcFollowPos(fTree); michael@0: if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { michael@0: RBBIDebugPuts("\n"); michael@0: printPosSets(fTree); michael@0: } michael@0: michael@0: // michael@0: // For "chained" rules, modify the followPos sets michael@0: // michael@0: if (fRB->fChainRules) { michael@0: calcChainedFollowPos(fTree); michael@0: } michael@0: michael@0: // michael@0: // BOF (start of input) test fixup. michael@0: // michael@0: if (fRB->fSetBuilder->sawBOF()) { michael@0: bofFixup(); michael@0: } michael@0: michael@0: // michael@0: // Build the DFA state transition tables. michael@0: // michael@0: buildStateTable(); michael@0: flagAcceptingStates(); michael@0: flagLookAheadStates(); michael@0: flagTaggedStates(); michael@0: michael@0: // michael@0: // Update the global table of rule status {tag} values michael@0: // The rule builder has a global vector of status values that are common michael@0: // for all tables. Merge the ones from this table into the global set. michael@0: // michael@0: mergeRuleStatusVals(); michael@0: michael@0: if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::calcNullable(RBBINode *n) { michael@0: if (n == NULL) { michael@0: return; michael@0: } michael@0: if (n->fType == RBBINode::setRef || michael@0: n->fType == RBBINode::endMark ) { michael@0: // These are non-empty leaf node types. michael@0: n->fNullable = FALSE; michael@0: return; michael@0: } michael@0: michael@0: if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { michael@0: // Lookahead marker node. It's a leaf, so no recursion on children. michael@0: // It's nullable because it does not match any literal text from the input stream. michael@0: n->fNullable = TRUE; michael@0: return; michael@0: } michael@0: michael@0: michael@0: // The node is not a leaf. michael@0: // Calculate nullable on its children. michael@0: calcNullable(n->fLeftChild); michael@0: calcNullable(n->fRightChild); michael@0: michael@0: // Apply functions from table 3.40 in Aho michael@0: if (n->fType == RBBINode::opOr) { michael@0: n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; michael@0: } michael@0: else if (n->fType == RBBINode::opCat) { michael@0: n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; michael@0: } michael@0: else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { michael@0: n->fNullable = TRUE; michael@0: } michael@0: else { michael@0: n->fNullable = FALSE; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::calcFirstPos(RBBINode *n) { michael@0: if (n == NULL) { michael@0: return; michael@0: } michael@0: if (n->fType == RBBINode::leafChar || michael@0: n->fType == RBBINode::endMark || michael@0: n->fType == RBBINode::lookAhead || michael@0: n->fType == RBBINode::tag) { michael@0: // These are non-empty leaf node types. michael@0: // Note: In order to maintain the sort invariant on the set, michael@0: // this function should only be called on a node whose set is michael@0: // empty to start with. michael@0: n->fFirstPosSet->addElement(n, *fStatus); michael@0: return; michael@0: } michael@0: michael@0: // The node is not a leaf. michael@0: // Calculate firstPos on its children. michael@0: calcFirstPos(n->fLeftChild); michael@0: calcFirstPos(n->fRightChild); michael@0: michael@0: // Apply functions from table 3.40 in Aho michael@0: if (n->fType == RBBINode::opOr) { michael@0: setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); michael@0: setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); michael@0: } michael@0: else if (n->fType == RBBINode::opCat) { michael@0: setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); michael@0: if (n->fLeftChild->fNullable) { michael@0: setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); michael@0: } michael@0: } michael@0: else if (n->fType == RBBINode::opStar || michael@0: n->fType == RBBINode::opQuestion || michael@0: n->fType == RBBINode::opPlus) { michael@0: setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::calcLastPos(RBBINode *n) { michael@0: if (n == NULL) { michael@0: return; michael@0: } michael@0: if (n->fType == RBBINode::leafChar || michael@0: n->fType == RBBINode::endMark || michael@0: n->fType == RBBINode::lookAhead || michael@0: n->fType == RBBINode::tag) { michael@0: // These are non-empty leaf node types. michael@0: // Note: In order to maintain the sort invariant on the set, michael@0: // this function should only be called on a node whose set is michael@0: // empty to start with. michael@0: n->fLastPosSet->addElement(n, *fStatus); michael@0: return; michael@0: } michael@0: michael@0: // The node is not a leaf. michael@0: // Calculate lastPos on its children. michael@0: calcLastPos(n->fLeftChild); michael@0: calcLastPos(n->fRightChild); michael@0: michael@0: // Apply functions from table 3.40 in Aho michael@0: if (n->fType == RBBINode::opOr) { michael@0: setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); michael@0: setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); michael@0: } michael@0: else if (n->fType == RBBINode::opCat) { michael@0: setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); michael@0: if (n->fRightChild->fNullable) { michael@0: setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); michael@0: } michael@0: } michael@0: else if (n->fType == RBBINode::opStar || michael@0: n->fType == RBBINode::opQuestion || michael@0: n->fType == RBBINode::opPlus) { michael@0: setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::calcFollowPos(RBBINode *n) { michael@0: if (n == NULL || michael@0: n->fType == RBBINode::leafChar || michael@0: n->fType == RBBINode::endMark) { michael@0: return; michael@0: } michael@0: michael@0: calcFollowPos(n->fLeftChild); michael@0: calcFollowPos(n->fRightChild); michael@0: michael@0: // Aho rule #1 michael@0: if (n->fType == RBBINode::opCat) { michael@0: RBBINode *i; // is 'i' in Aho's description michael@0: uint32_t ix; michael@0: michael@0: UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; michael@0: michael@0: for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { michael@0: i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); michael@0: setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); michael@0: } michael@0: } michael@0: michael@0: // Aho rule #2 michael@0: if (n->fType == RBBINode::opStar || michael@0: n->fType == RBBINode::opPlus) { michael@0: RBBINode *i; // again, n and i are the names from Aho's description. michael@0: uint32_t ix; michael@0: michael@0: for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { michael@0: i = (RBBINode *)n->fLastPosSet->elementAt(ix); michael@0: setAdd(i->fFollowPos, n->fFirstPosSet); michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // calcChainedFollowPos. Modify the previously calculated followPos sets michael@0: // to implement rule chaining. NOT described by Aho michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { michael@0: michael@0: UVector endMarkerNodes(*fStatus); michael@0: UVector leafNodes(*fStatus); michael@0: int32_t i; michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: // get a list of all endmarker nodes. michael@0: tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); michael@0: michael@0: // get a list all leaf nodes michael@0: tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: // Get all nodes that can be the start a match, which is FirstPosition() michael@0: // of the portion of the tree corresponding to user-written rules. michael@0: // See the tree description in bofFixup(). michael@0: RBBINode *userRuleRoot = tree; michael@0: if (fRB->fSetBuilder->sawBOF()) { michael@0: userRuleRoot = tree->fLeftChild->fRightChild; michael@0: } michael@0: U_ASSERT(userRuleRoot != NULL); michael@0: UVector *matchStartNodes = userRuleRoot->fFirstPosSet; michael@0: michael@0: michael@0: // Iteratate over all leaf nodes, michael@0: // michael@0: int32_t endNodeIx; michael@0: int32_t startNodeIx; michael@0: michael@0: for (endNodeIx=0; endNodeIxfFollowPos->contains(endMarkerNodes.elementAt(i))) { michael@0: endNode = tNode; michael@0: break; michael@0: } michael@0: } michael@0: if (endNode == NULL) { michael@0: // node wasn't an end node. Try again with the next. michael@0: continue; michael@0: } michael@0: michael@0: // We've got a node that can end a match. michael@0: michael@0: // Line Break Specific hack: If this node's val correspond to the $CM char class, michael@0: // don't chain from it. michael@0: // TODO: Add rule syntax for this behavior, get specifics out of here and michael@0: // into the rule file. michael@0: if (fRB->fLBCMNoChain) { michael@0: UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); michael@0: if (c != -1) { michael@0: // c == -1 occurs with sets containing only the {eof} marker string. michael@0: ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); michael@0: if (cLBProp == U_LB_COMBINING_MARK) { michael@0: continue; michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: // Now iterate over the nodes that can start a match, looking for ones michael@0: // with the same char class as our ending node. michael@0: RBBINode *startNode; michael@0: for (startNodeIx = 0; startNodeIxsize(); startNodeIx++) { michael@0: startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); michael@0: if (startNode->fType != RBBINode::leafChar) { michael@0: continue; michael@0: } michael@0: michael@0: if (endNode->fVal == startNode->fVal) { michael@0: // The end val (character class) of one possible match is the michael@0: // same as the start of another. michael@0: michael@0: // Add all nodes from the followPos of the start node to the michael@0: // followPos set of the end node, which will have the effect of michael@0: // letting matches transition from a match state at endNode michael@0: // to the second char of a match starting with startNode. michael@0: setAdd(endNode->fFollowPos, startNode->fFollowPos); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // bofFixup. Fixup for state tables that include {bof} beginning of input testing. michael@0: // Do an swizzle similar to chaining, modifying the followPos set of michael@0: // the bofNode to include the followPos nodes from other {bot} nodes michael@0: // scattered through the tree. michael@0: // michael@0: // This function has much in common with calcChainedFollowPos(). michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::bofFixup() { michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: // The parse tree looks like this ... michael@0: // fTree root ---> michael@0: // / \ . michael@0: // <#end node> michael@0: // / \ . michael@0: // rest michael@0: // of tree michael@0: // michael@0: // We will be adding things to the followPos set of the michael@0: // michael@0: RBBINode *bofNode = fTree->fLeftChild->fLeftChild; michael@0: U_ASSERT(bofNode->fType == RBBINode::leafChar); michael@0: U_ASSERT(bofNode->fVal == 2); michael@0: michael@0: // Get all nodes that can be the start a match of the user-written rules michael@0: // (excluding the fake bofNode) michael@0: // We want the nodes that can start a match in the michael@0: // part labeled "rest of tree" michael@0: // michael@0: UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; michael@0: michael@0: RBBINode *startNode; michael@0: int startNodeIx; michael@0: for (startNodeIx = 0; startNodeIxsize(); startNodeIx++) { michael@0: startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); michael@0: if (startNode->fType != RBBINode::leafChar) { michael@0: continue; michael@0: } michael@0: michael@0: if (startNode->fVal == bofNode->fVal) { michael@0: // We found a leaf node corresponding to a {bof} that was michael@0: // explicitly written into a rule. michael@0: // Add everything from the followPos set of this node to the michael@0: // followPos set of the fake bofNode at the start of the tree. michael@0: // michael@0: setAdd(bofNode->fFollowPos, startNode->fFollowPos); michael@0: } michael@0: } michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // buildStateTable() Determine the set of runtime DFA states and the michael@0: // transition tables for these states, by the algorithm michael@0: // of fig. 3.44 in Aho. michael@0: // michael@0: // Most of the comments are quotes of Aho's psuedo-code. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::buildStateTable() { michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: RBBIStateDescriptor *failState; michael@0: // Set it to NULL to avoid uninitialized warning michael@0: RBBIStateDescriptor *initialState = NULL; michael@0: // michael@0: // Add a dummy state 0 - the stop state. Not from Aho. michael@0: int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; michael@0: failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); michael@0: if (failState == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: failState->fPositions = new UVector(*fStatus); michael@0: if (failState->fPositions == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: fDStates->addElement(failState, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: michael@0: // initially, the only unmarked state in Dstates is firstpos(root), michael@0: // where toot is the root of the syntax tree for (r)#; michael@0: initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); michael@0: if (initialState == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: initialState->fPositions = new UVector(*fStatus); michael@0: if (initialState->fPositions == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: setAdd(initialState->fPositions, fTree->fFirstPosSet); michael@0: fDStates->addElement(initialState, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: michael@0: // while there is an unmarked state T in Dstates do begin michael@0: for (;;) { michael@0: RBBIStateDescriptor *T = NULL; michael@0: int32_t tx; michael@0: for (tx=1; txsize(); tx++) { michael@0: RBBIStateDescriptor *temp; michael@0: temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); michael@0: if (temp->fMarked == FALSE) { michael@0: T = temp; michael@0: break; michael@0: } michael@0: } michael@0: if (T == NULL) { michael@0: break; michael@0: } michael@0: michael@0: // mark T; michael@0: T->fMarked = TRUE; michael@0: michael@0: // for each input symbol a do begin michael@0: int32_t a; michael@0: for (a = 1; a<=lastInputSymbol; a++) { michael@0: // let U be the set of positions that are in followpos(p) michael@0: // for some position p in T michael@0: // such that the symbol at position p is a; michael@0: UVector *U = NULL; michael@0: RBBINode *p; michael@0: int32_t px; michael@0: for (px=0; pxfPositions->size(); px++) { michael@0: p = (RBBINode *)T->fPositions->elementAt(px); michael@0: if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { michael@0: if (U == NULL) { michael@0: U = new UVector(*fStatus); michael@0: if (U == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: } michael@0: setAdd(U, p->fFollowPos); michael@0: } michael@0: } michael@0: michael@0: // if U is not empty and not in DStates then michael@0: int32_t ux = 0; michael@0: UBool UinDstates = FALSE; michael@0: if (U != NULL) { michael@0: U_ASSERT(U->size() > 0); michael@0: int ix; michael@0: for (ix=0; ixsize(); ix++) { michael@0: RBBIStateDescriptor *temp2; michael@0: temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); michael@0: if (setEquals(U, temp2->fPositions)) { michael@0: delete U; michael@0: U = temp2->fPositions; michael@0: ux = ix; michael@0: UinDstates = TRUE; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // Add U as an unmarked state to Dstates michael@0: if (!UinDstates) michael@0: { michael@0: RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); michael@0: if (newState == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (U_FAILURE(*fStatus)) { michael@0: goto ExitBuildSTdeleteall; michael@0: } michael@0: newState->fPositions = U; michael@0: fDStates->addElement(newState, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: ux = fDStates->size()-1; michael@0: } michael@0: michael@0: // Dtran[T, a] := U; michael@0: T->fDtran->setElementAt(ux, a); michael@0: } michael@0: } michael@0: } michael@0: return; michael@0: // delete local pointers only if error occured. michael@0: ExitBuildSTdeleteall: michael@0: delete initialState; michael@0: delete failState; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // flagAcceptingStates Identify accepting states. michael@0: // First get a list of all of the end marker nodes. michael@0: // Then, for each state s, michael@0: // if s contains one of the end marker nodes in its list of tree positions then michael@0: // s is an accepting state. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::flagAcceptingStates() { michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: UVector endMarkerNodes(*fStatus); michael@0: RBBINode *endMarker; michael@0: int32_t i; michael@0: int32_t n; michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: michael@0: for (i=0; isize(); n++) { michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); michael@0: if (sd->fPositions->indexOf(endMarker) >= 0) { michael@0: // Any non-zero value for fAccepting means this is an accepting node. michael@0: // The value is what will be returned to the user as the break status. michael@0: // If no other value was specified, force it to -1. michael@0: michael@0: if (sd->fAccepting==0) { michael@0: // State hasn't been marked as accepting yet. Do it now. michael@0: sd->fAccepting = endMarker->fVal; michael@0: if (sd->fAccepting == 0) { michael@0: sd->fAccepting = -1; michael@0: } michael@0: } michael@0: if (sd->fAccepting==-1 && endMarker->fVal != 0) { michael@0: // Both lookahead and non-lookahead accepting for this state. michael@0: // Favor the look-ahead. Expedient for line break. michael@0: // TODO: need a more elegant resolution for conflicting rules. michael@0: sd->fAccepting = endMarker->fVal; michael@0: } michael@0: // implicit else: michael@0: // if sd->fAccepting already had a value other than 0 or -1, leave it be. michael@0: michael@0: // If the end marker node is from a look-ahead rule, set michael@0: // the fLookAhead field or this state also. michael@0: if (endMarker->fLookAheadEnd) { michael@0: // TODO: don't change value if already set? michael@0: // TODO: allow for more than one active look-ahead rule in engine. michael@0: // Make value here an index to a side array in engine? michael@0: sd->fLookAhead = sd->fAccepting; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // flagLookAheadStates Very similar to flagAcceptingStates, above. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::flagLookAheadStates() { michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: UVector lookAheadNodes(*fStatus); michael@0: RBBINode *lookAheadNode; michael@0: int32_t i; michael@0: int32_t n; michael@0: michael@0: fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: for (i=0; isize(); n++) { michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); michael@0: if (sd->fPositions->indexOf(lookAheadNode) >= 0) { michael@0: sd->fLookAhead = lookAheadNode->fVal; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // flagTaggedStates michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::flagTaggedStates() { michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: UVector tagNodes(*fStatus); michael@0: RBBINode *tagNode; michael@0: int32_t i; michael@0: int32_t n; michael@0: michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: for (i=0; isize(); n++) { // For each state s (row in the state table) michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); michael@0: if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t michael@0: sortedAdd(&sd->fTagVals, tagNode->fVal); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // mergeRuleStatusVals michael@0: // michael@0: // Update the global table of rule status {tag} values michael@0: // The rule builder has a global vector of status values that are common michael@0: // for all tables. Merge the ones from this table into the global set. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::mergeRuleStatusVals() { michael@0: // michael@0: // The basic outline of what happens here is this... michael@0: // michael@0: // for each state in this state table michael@0: // if the status tag list for this state is in the global statuses list michael@0: // record where and michael@0: // continue with the next state michael@0: // else michael@0: // add the tag list for this state to the global list. michael@0: // michael@0: int i; michael@0: int n; michael@0: michael@0: // Pre-set a single tag of {0} into the table. michael@0: // We will need this as a default, for rule sets with no explicit tagging. michael@0: if (fRB->fRuleStatusVals->size() == 0) { michael@0: fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group michael@0: fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero michael@0: } michael@0: michael@0: // For each state michael@0: for (n=0; nsize(); n++) { michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); michael@0: UVector *thisStatesTagValues = sd->fTagVals; michael@0: if (thisStatesTagValues == NULL) { michael@0: // No tag values are explicitly associated with this state. michael@0: // Set the default tag value. michael@0: sd->fTagsIdx = 0; michael@0: continue; michael@0: } michael@0: michael@0: // There are tag(s) associated with this state. michael@0: // fTagsIdx will be the index into the global tag list for this state's tag values. michael@0: // Initial value of -1 flags that we haven't got it set yet. michael@0: sd->fTagsIdx = -1; michael@0: int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list michael@0: int32_t nextTagGroupStart = 0; michael@0: michael@0: // Loop runs once per group of tags in the global list michael@0: while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { michael@0: thisTagGroupStart = nextTagGroupStart; michael@0: nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; michael@0: if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { michael@0: // The number of tags for this state is different from michael@0: // the number of tags in this group from the global list. michael@0: // Continue with the next group from the global list. michael@0: continue; michael@0: } michael@0: // The lengths match, go ahead and compare the actual tag values michael@0: // between this state and the group from the global list. michael@0: for (i=0; isize(); i++) { michael@0: if (thisStatesTagValues->elementAti(i) != michael@0: fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { michael@0: // Mismatch. michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (i == thisStatesTagValues->size()) { michael@0: // We found a set of tag values in the global list that match michael@0: // those for this state. Use them. michael@0: sd->fTagsIdx = thisTagGroupStart; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (sd->fTagsIdx == -1) { michael@0: // No suitable entry in the global tag list already. Add one michael@0: sd->fTagsIdx = fRB->fRuleStatusVals->size(); michael@0: fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); michael@0: for (i=0; isize(); i++) { michael@0: fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // sortedAdd Add a value to a vector of sorted values (ints). michael@0: // Do not replicate entries; if the value is already there, do not michael@0: // add a second one. michael@0: // Lazily create the vector if it does not already exist. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { michael@0: int32_t i; michael@0: michael@0: if (*vector == NULL) { michael@0: *vector = new UVector(*fStatus); michael@0: } michael@0: if (*vector == NULL || U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: UVector *vec = *vector; michael@0: int32_t vSize = vec->size(); michael@0: for (i=0; ielementAti(i); michael@0: if (valAtI == val) { michael@0: // The value is already in the vector. Don't add it again. michael@0: return; michael@0: } michael@0: if (valAtI > val) { michael@0: break; michael@0: } michael@0: } michael@0: vec->insertElementAt(val, i, *fStatus); michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // setAdd Set operation on UVector michael@0: // dest = dest union source michael@0: // Elements may only appear once and must be sorted. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { michael@0: int32_t destOriginalSize = dest->size(); michael@0: int32_t sourceSize = source->size(); michael@0: int32_t di = 0; michael@0: MaybeStackArray destArray, sourceArray; // Handle small cases without malloc michael@0: void **destPtr, **sourcePtr; michael@0: void **destLim, **sourceLim; michael@0: michael@0: if (destOriginalSize > destArray.getCapacity()) { michael@0: if (destArray.resize(destOriginalSize) == NULL) { michael@0: return; michael@0: } michael@0: } michael@0: destPtr = destArray.getAlias(); michael@0: destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? michael@0: michael@0: if (sourceSize > sourceArray.getCapacity()) { michael@0: if (sourceArray.resize(sourceSize) == NULL) { michael@0: return; michael@0: } michael@0: } michael@0: sourcePtr = sourceArray.getAlias(); michael@0: sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? michael@0: michael@0: // Avoid multiple "get element" calls by getting the contents into arrays michael@0: (void) dest->toArray(destPtr); michael@0: (void) source->toArray(sourcePtr); michael@0: michael@0: dest->setSize(sourceSize+destOriginalSize, *fStatus); michael@0: michael@0: while (sourcePtr < sourceLim && destPtr < destLim) { michael@0: if (*destPtr == *sourcePtr) { michael@0: dest->setElementAt(*sourcePtr++, di++); michael@0: destPtr++; michael@0: } michael@0: // This check is required for machines with segmented memory, like i5/OS. michael@0: // Direct pointer comparison is not recommended. michael@0: else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { michael@0: dest->setElementAt(*destPtr++, di++); michael@0: } michael@0: else { /* *sourcePtr < *destPtr */ michael@0: dest->setElementAt(*sourcePtr++, di++); michael@0: } michael@0: } michael@0: michael@0: // At most one of these two cleanup loops will execute michael@0: while (destPtr < destLim) { michael@0: dest->setElementAt(*destPtr++, di++); michael@0: } michael@0: while (sourcePtr < sourceLim) { michael@0: dest->setElementAt(*sourcePtr++, di++); michael@0: } michael@0: michael@0: dest->setSize(di, *fStatus); michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // setEqual Set operation on UVector. michael@0: // Compare for equality. michael@0: // Elements must be sorted. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { michael@0: return a->equals(*b); michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos michael@0: // for each node in the tree. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: #ifdef RBBI_DEBUG michael@0: void RBBITableBuilder::printPosSets(RBBINode *n) { michael@0: if (n==NULL) { michael@0: return; michael@0: } michael@0: n->printNode(); michael@0: RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); michael@0: michael@0: RBBIDebugPrintf(" firstpos: "); michael@0: printSet(n->fFirstPosSet); michael@0: michael@0: RBBIDebugPrintf(" lastpos: "); michael@0: printSet(n->fLastPosSet); michael@0: michael@0: RBBIDebugPrintf(" followpos: "); michael@0: printSet(n->fFollowPos); michael@0: michael@0: printPosSets(n->fLeftChild); michael@0: printPosSets(n->fRightChild); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // getTableSize() Calculate the size of the runtime form of this michael@0: // state transition table. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: int32_t RBBITableBuilder::getTableSize() const { michael@0: int32_t size = 0; michael@0: int32_t numRows; michael@0: int32_t numCols; michael@0: int32_t rowSize; michael@0: michael@0: if (fTree == NULL) { michael@0: return 0; michael@0: } michael@0: michael@0: size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. michael@0: michael@0: numRows = fDStates->size(); michael@0: numCols = fRB->fSetBuilder->getNumCharCategories(); michael@0: michael@0: // Note The declaration of RBBIStateTableRow is for a table of two columns. michael@0: // Therefore we subtract two from numCols when determining michael@0: // how much storage to add to a row for the total columns. michael@0: rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); michael@0: size += numRows * rowSize; michael@0: return size; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // exportTable() export the state transition table in the format required michael@0: // by the runtime engine. getTableSize() bytes of memory michael@0: // must be available at the output address "where". michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RBBITableBuilder::exportTable(void *where) { michael@0: RBBIStateTable *table = (RBBIStateTable *)where; michael@0: uint32_t state; michael@0: int col; michael@0: michael@0: if (U_FAILURE(*fStatus) || fTree == NULL) { michael@0: return; michael@0: } michael@0: michael@0: if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff || michael@0: fDStates->size() > 0x7fff) { michael@0: *fStatus = U_BRK_INTERNAL_ERROR; michael@0: return; michael@0: } michael@0: michael@0: table->fRowLen = sizeof(RBBIStateTableRow) + michael@0: sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); michael@0: table->fNumStates = fDStates->size(); michael@0: table->fFlags = 0; michael@0: if (fRB->fLookAheadHardBreak) { michael@0: table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; michael@0: } michael@0: if (fRB->fSetBuilder->sawBOF()) { michael@0: table->fFlags |= RBBI_BOF_REQUIRED; michael@0: } michael@0: table->fReserved = 0; michael@0: michael@0: for (state=0; statefNumStates; state++) { michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); michael@0: RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); michael@0: U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); michael@0: U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); michael@0: row->fAccepting = (int16_t)sd->fAccepting; michael@0: row->fLookAhead = (int16_t)sd->fLookAhead; michael@0: row->fTagIdx = (int16_t)sd->fTagsIdx; michael@0: for (col=0; colfSetBuilder->getNumCharCategories(); col++) { michael@0: row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col); michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // printSet Debug function. Print the contents of a UVector michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: #ifdef RBBI_DEBUG michael@0: void RBBITableBuilder::printSet(UVector *s) { michael@0: int32_t i; michael@0: for (i=0; isize(); i++) { michael@0: void *v = s->elementAt(i); michael@0: RBBIDebugPrintf("%10p", v); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // printStates Debug Function. Dump the fully constructed state transition table. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: #ifdef RBBI_DEBUG michael@0: void RBBITableBuilder::printStates() { michael@0: int c; // input "character" michael@0: int n; // state number michael@0: michael@0: RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); michael@0: RBBIDebugPrintf(" | Acc LA Tag"); michael@0: for (c=0; cfSetBuilder->getNumCharCategories(); c++) { michael@0: RBBIDebugPrintf(" %2d", c); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: RBBIDebugPrintf(" |---------------"); michael@0: for (c=0; cfSetBuilder->getNumCharCategories(); c++) { michael@0: RBBIDebugPrintf("---"); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: michael@0: for (n=0; nsize(); n++) { michael@0: RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); michael@0: RBBIDebugPrintf(" %3d | " , n); michael@0: RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); michael@0: for (c=0; cfSetBuilder->getNumCharCategories(); c++) { michael@0: RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: RBBIDebugPrintf("\n\n"); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // printRuleStatusTable Debug Function. Dump the common rule status table michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: #ifdef RBBI_DEBUG michael@0: void RBBITableBuilder::printRuleStatusTable() { michael@0: int32_t thisRecord = 0; michael@0: int32_t nextRecord = 0; michael@0: int i; michael@0: UVector *tbl = fRB->fRuleStatusVals; michael@0: michael@0: RBBIDebugPrintf("index | tags \n"); michael@0: RBBIDebugPrintf("-------------------\n"); michael@0: michael@0: while (nextRecord < tbl->size()) { michael@0: thisRecord = nextRecord; michael@0: nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; michael@0: RBBIDebugPrintf("%4d ", thisRecord); michael@0: for (i=thisRecord+1; ielementAti(i)); michael@0: } michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: RBBIDebugPrintf("\n\n"); michael@0: } michael@0: #endif michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // RBBIStateDescriptor Methods. This is a very struct-like class michael@0: // Most access is directly to the fields. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: michael@0: RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { michael@0: fMarked = FALSE; michael@0: fAccepting = 0; michael@0: fLookAhead = 0; michael@0: fTagsIdx = 0; michael@0: fTagVals = NULL; michael@0: fPositions = NULL; michael@0: fDtran = NULL; michael@0: michael@0: fDtran = new UVector(lastInputSymbol+1, *fStatus); michael@0: if (U_FAILURE(*fStatus)) { michael@0: return; michael@0: } michael@0: if (fDtran == NULL) { michael@0: *fStatus = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. michael@0: // It is indexed by input symbols, and will michael@0: // hold the next state number for each michael@0: // symbol. michael@0: } michael@0: michael@0: michael@0: RBBIStateDescriptor::~RBBIStateDescriptor() { michael@0: delete fPositions; michael@0: delete fDtran; michael@0: delete fTagVals; michael@0: fPositions = NULL; michael@0: fDtran = NULL; michael@0: fTagVals = NULL; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */